@@ -8,20 +8,22 @@ def test_every_mdn_tag_is_mapped() -> None:
88 """Regression guard for GH-720: no MDN element may be forgotten."""
99 missing = sorted (MDN_ELEMENTS - HTML_EL_TO_XML_EL .keys ())
1010 assert not missing , f"Tags without conversion rule: { missing } "
11-
11+
1212 # Verify the mapping has reasonable values (no empty strings, etc.)
1313 for html_tag , xml_tag in HTML_EL_TO_XML_EL .items ():
1414 assert xml_tag , f"Empty mapping for '{ html_tag } '"
15- assert isinstance (xml_tag , str ), f"Non-string mapping for '{ html_tag } ': { xml_tag } "
16- assert xml_tag .islower (), f"Non-lowercase mapping for '{ html_tag } ': { xml_tag } "
15+ assert isinstance (xml_tag , str ), \
16+ f"Non-string mapping for '{ html_tag } ': { xml_tag } "
17+ assert xml_tag .islower (), \
18+ f"Non-lowercase mapping for '{ html_tag } ': { xml_tag } "
1719
1820
1921def test_explicit_mappings_preserved () -> None :
2022 """Verify that explicit conversions are preserved correctly."""
2123 # Test some key explicit mappings
2224 expected_mappings = {
2325 "h1" : "head" ,
24- "h2" : "head" ,
26+ "h2" : "head" ,
2527 "h3" : "head" ,
2628 "ul" : "list" ,
2729 "ol" : "list" ,
@@ -36,21 +38,222 @@ def test_explicit_mappings_preserved() -> None:
3638 "p" : "p" ,
3739 "div" : "div" ,
3840 }
39-
41+
4042 for html_tag , expected_xml_tag in expected_mappings .items ():
4143 assert HTML_EL_TO_XML_EL [html_tag ] == expected_xml_tag , \
42- f"Expected { html_tag } -> { expected_xml_tag } , got { HTML_EL_TO_XML_EL [html_tag ]} "
44+ f"Expected { html_tag } -> { expected_xml_tag } , " \
45+ f"got { HTML_EL_TO_XML_EL [html_tag ]} "
4346
4447
def test_identity_mappings_for_unspecified_elements() -> None:
    """Verify that elements without explicit mapping get identity mapping.

    Every tag listed below must be present in ``HTML_EL_TO_XML_EL`` and
    must map to itself (tag -> tag).
    """
    # Elements that should have identity mappings (tag -> tag)
    identity_elements = {
        "article", "section", "aside", "nav", "main", "header",
        "footer", "plaintext", "content", "image", "menuitem",
        "shadow", "selectedcontent",
    }

    # Iterate in sorted order so that the first reported failure is the
    # same on every run (set iteration order depends on string hashing).
    for element in sorted(identity_elements):
        assert element in HTML_EL_TO_XML_EL, (
            f"Element '{element}' missing from mapping"
        )
        mapped = HTML_EL_TO_XML_EL[element]
        assert mapped == element, (
            f"Expected identity mapping for '{element}', got '{mapped}'"
        )
63+
64+
def test_lesser_known_elements_preservation() -> None:
    """Check that lesser-known HTML elements survive ``convert_tags``.

    A snippet mixing rarely used, legacy and recent elements is parsed
    with lxml and run through ``convert_tags`` with formatting enabled.
    The serialized result must still contain the listed tags, and the
    text of every element must remain reachable via ``text_content()``.
    """
    from lxml import etree, html
    from trafilatura.htmlprocessing import convert_tags
    from trafilatura.core import Extractor

    # HTML snippet with lesser-known and legacy elements
    test_html = """<html><body>
<article>
<ruby>漢<rt>kan</rt>字<rt>ji</rt></ruby>
<p>The <abbr title="HTML">HTML</abbr> spec includes
<dfn>semantic elements</dfn> for meaning.</p>
<p>Event: <data value="2025-01-15T14:30:00">2:30 PM</data></p>
<p>Please <mark>remember this</mark> information.</p>
<p>Arabic: <bdi>مرحبا</bdi> means hello.</p>
<blockquote>Quote text <cite>Author</cite></blockquote>

<!-- Legacy elements -->
<center>Centered text</center>
<nobr>Non-breaking text</nobr>
<big>Bigger text</big>

<!-- Modern elements -->
<search>Search content</search>
<fencedframe src="example.html">Fallback</fencedframe>
<progress value="70" max="100">70%</progress>
<meter value="6" max="10">6/10</meter>

<template id="tmpl">Template content</template>

<details>
<summary>Expandable</summary>
<p>Hidden content</p>
</details>
</article>
</body></html>"""

    # Only the formatting option is switched on for this test
    options = Extractor()
    options.formatting = True

    # Parse, convert and serialize the tree for substring checks
    converted_doc = convert_tags(html.fromstring(test_html), options)
    result_html = etree.tostring(converted_doc, encoding='unicode')

    # (opening marker, closing marker) for every tag that must be kept.
    # Tags carrying attributes in the snippet use an open-ended marker.
    tag_markers = (
        # Lesser-known elements
        ('<ruby>', '</ruby>'),
        ('<rt>', '</rt>'),
        ('<abbr', '</abbr>'),
        ('<dfn>', '</dfn>'),
        ('<data', '</data>'),
        ('<mark>', '</mark>'),
        ('<bdi>', '</bdi>'),
        ('<cite>', '</cite>'),
        # Legacy elements
        ('<center>', '</center>'),
        ('<nobr>', '</nobr>'),
        ('<big>', '</big>'),
        # Modern elements
        ('<search>', '</search>'),
        ('<fencedframe', '</fencedframe>'),
        ('<progress', '</progress>'),
        ('<meter', '</meter>'),
        ('<template', '</template>'),
    )
    for opening, closing in tag_markers:
        assert opening in result_html and closing in result_html

    # Text of each element must still be accessible after conversion
    text_content = converted_doc.text_content()
    expected_fragments = (
        '漢', '字',            # Ruby characters
        'kan', 'ji',           # Ruby text
        'HTML',                # Abbreviation text
        'semantic elements',   # Definition text
        '2:30 PM',             # Data element text
        'remember this',       # Mark element text
        'مرحبا',               # Bidirectional text
        'Quote text',          # Blockquote text
        'Author',              # Citation text
        'Centered text',       # Legacy center text
        'Non-breaking text',   # Legacy nobr text
        'Bigger text',         # Legacy big text
        'Search content',      # Search element text
        'Fallback',            # Fencedframe fallback
        '70%',                 # Progress text
        '6/10',                # Meter text
        'Template content',    # Template text
        'Expandable',          # Summary text
        'Hidden content',      # Details content
    )
    for fragment in expected_fragments:
        assert fragment in text_content
158+
159+
def test_comprehensive_tag_conversion_before_after() -> None:
    """Check that ``search``, ``ruby`` and ``nobr`` pass through unchanged.

    The three tags must still appear in the output of ``convert_tags``
    run with a default ``Extractor``, and ``HTML_EL_TO_XML_EL`` must map
    each of them to itself.
    """
    from lxml import etree, html
    from trafilatura.htmlprocessing import HTML_EL_TO_XML_EL, convert_tags
    from trafilatura.core import Extractor

    # Minimal document containing only the three elements under test
    simple_html = ('<body><search>Search</search><ruby>Ruby</ruby>'
                   '<nobr>NoBreak</nobr></body>')

    # Parse, convert with default options, then serialize
    tree = html.fromstring(simple_html)
    converted_doc = convert_tags(tree, Extractor())
    converted_html = etree.tostring(converted_doc, encoding='unicode')

    # (tag name, serialized opening tag) for each element under test
    checked_tags = (
        ('search', '<search>'),
        ('ruby', '<ruby>'),
        ('nobr', '<nobr>'),
    )

    # The elements are preserved (not stripped) in the converted output
    for _, opening in checked_tags:
        assert opening in converted_html

    # The mapping table holds an identity entry for each element
    for tag_name, _ in checked_tags:
        assert HTML_EL_TO_XML_EL[tag_name] == tag_name
190+
191+
def test_table_elements_excluded_from_conversion() -> None:
    """Ensure table elements are not converted to avoid conflicts.

    A small table is run through ``convert_tags`` with a default
    ``Extractor``; the serialized output must still contain the original
    ``table``/``tr``/``td``/``th`` tags and no ``row`` or ``cell``
    element at all.
    """
    from lxml import html, etree
    from trafilatura.htmlprocessing import convert_tags
    from trafilatura.core import Extractor

    # HTML with table elements that should NOT be converted
    table_html = '''<body>
<table>
<tr><td>Cell 1</td><th>Header 1</th></tr>
<tr><td>Cell 2</td><th>Header 2</th></tr>
</table>
</body>'''

    # Parse HTML
    doc = html.fromstring(table_html)

    # Apply tag conversion
    options = Extractor()
    converted_doc = convert_tags(doc, options)
    result_html = etree.tostring(converted_doc, encoding='unicode')

    # Verify that table elements are NOT converted (remain as-is)
    for opening, closing in (
        ('<table>', '</table>'),
        ('<tr>', '</tr>'),
        ('<td>', '</td>'),
        ('<th>', '</th>'),
    ):
        assert opening in result_html and closing in result_html

    # Verify they did NOT get converted to their XML equivalents.
    # Match on the tag-name prefix rather than the bare tag so that a
    # converted element carrying attributes (e.g. ``<cell role="...">``)
    # cannot slip past the check.
    assert '<row' not in result_html   # tr should NOT be converted to row
    assert '<cell' not in result_html  # td/th should NOT be converted
224+
225+
def test_conversions_consistency() -> None:
    """Ensure all CONVERSIONS keys are excluded to maintain consistency.

    Every key of ``CONVERSIONS`` must also be a member of
    ``_EXCLUDED_TAGS``; any key that is not is reported in the failure
    message.
    """
    from trafilatura.htmlprocessing import (
        CONVERSIONS, _EXCLUDED_TAGS  # test-hook
    )

    # All CONVERSIONS keys must be in _EXCLUDED_TAGS to prevent conflicts.
    # Sorted so the failure message is identical from run to run.
    missing_exclusions = sorted(set(CONVERSIONS.keys()) - _EXCLUDED_TAGS)

    assert not missing_exclusions, (
        f"CONVERSIONS keys not in _EXCLUDED_TAGS: {missing_exclusions}. "
        "This will cause conflicts in convert_tags processing."
    )
239+
240+
def test_unsafe_tags_are_cleaned() -> None:
    """Verify potentially unsafe HTML elements are handled.

    Any tag from the unsafe list below that is also in
    ``_ALL_TAGS_TO_CONVERT`` must be listed in ``MANUALLY_CLEANED``;
    offenders are reported in the failure message.
    """
    from trafilatura.htmlprocessing import (
        _ALL_TAGS_TO_CONVERT  # test-hook
    )
    from trafilatura.settings import MANUALLY_CLEANED

    # Tags that could pose security risks if preserved unchecked
    unsafe_tags = {'embed', 'object', 'svg', 'math', 'canvas', 'script',
                   'iframe', 'frame', 'frameset', 'applet'}

    preserved_unsafe = unsafe_tags & _ALL_TAGS_TO_CONVERT
    # Sorted so the failure message is identical from run to run
    risky_tags = sorted(preserved_unsafe - set(MANUALLY_CLEANED))

    assert not risky_tags, (
        "Potentially unsafe tags are preserved but not in "
        f"MANUALLY_CLEANED: {risky_tags}. Consider adding them to "
        "MANUALLY_CLEANED or removing from MDN_ELEMENTS."
    )
0 commit comments