Skip to content

Commit a4c6730

Browse files
committed
Ensure complete MDN HTML element coverage by mapping previously unhandled HTML elements to XML counterparts (#720)
1 parent 80c8588 commit a4c6730

File tree

3 files changed

+303
-57
lines changed

3 files changed

+303
-57
lines changed

tests/test_html_elements.py

Lines changed: 214 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -8,20 +8,22 @@ def test_every_mdn_tag_is_mapped() -> None:
88
"""Regression guard for GH-720: no MDN element may be forgotten."""
99
missing = sorted(MDN_ELEMENTS - HTML_EL_TO_XML_EL.keys())
1010
assert not missing, f"Tags without conversion rule: {missing}"
11-
11+
1212
# Verify the mapping has reasonable values (no empty strings, etc.)
1313
for html_tag, xml_tag in HTML_EL_TO_XML_EL.items():
1414
assert xml_tag, f"Empty mapping for '{html_tag}'"
15-
assert isinstance(xml_tag, str), f"Non-string mapping for '{html_tag}': {xml_tag}"
16-
assert xml_tag.islower(), f"Non-lowercase mapping for '{html_tag}': {xml_tag}"
15+
assert isinstance(xml_tag, str), \
16+
f"Non-string mapping for '{html_tag}': {xml_tag}"
17+
assert xml_tag.islower(), \
18+
f"Non-lowercase mapping for '{html_tag}': {xml_tag}"
1719

1820

1921
def test_explicit_mappings_preserved() -> None:
2022
"""Verify that explicit conversions are preserved correctly."""
2123
# Test some key explicit mappings
2224
expected_mappings = {
2325
"h1": "head",
24-
"h2": "head",
26+
"h2": "head",
2527
"h3": "head",
2628
"ul": "list",
2729
"ol": "list",
@@ -36,21 +38,222 @@ def test_explicit_mappings_preserved() -> None:
3638
"p": "p",
3739
"div": "div",
3840
}
39-
41+
4042
for html_tag, expected_xml_tag in expected_mappings.items():
4143
assert HTML_EL_TO_XML_EL[html_tag] == expected_xml_tag, \
42-
f"Expected {html_tag} -> {expected_xml_tag}, got {HTML_EL_TO_XML_EL[html_tag]}"
44+
f"Expected {html_tag} -> {expected_xml_tag}, " \
45+
f"got {HTML_EL_TO_XML_EL[html_tag]}"
4346

4447

4548
def test_identity_mappings_for_unspecified_elements() -> None:
4649
"""Verify that elements without explicit mapping get identity mapping."""
4750
# Elements that should have identity mappings (tag -> tag)
4851
identity_elements = {
49-
"article", "section", "aside", "nav", "main", "header", "footer",
50-
"plaintext", "content", "image", "menuitem", "shadow", "selectedcontent"
52+
"article", "section", "aside", "nav", "main", "header",
53+
"footer", "plaintext", "content", "image", "menuitem",
54+
"shadow", "selectedcontent"
5155
}
52-
56+
5357
for element in identity_elements:
54-
assert element in HTML_EL_TO_XML_EL, f"Element '{element}' missing from mapping"
58+
assert element in HTML_EL_TO_XML_EL, \
59+
f"Element '{element}' missing from mapping"
5560
assert HTML_EL_TO_XML_EL[element] == element, \
56-
f"Expected identity mapping for '{element}', got '{HTML_EL_TO_XML_EL[element]}'"
61+
f"Expected identity mapping for '{element}', " \
62+
f"got '{HTML_EL_TO_XML_EL[element]}'"
63+
64+
65+
def test_lesser_known_elements_preservation() -> None:
66+
"""Test lesser-known HTML elements are preserved during processing."""
67+
from lxml import html, etree
68+
from trafilatura.htmlprocessing import convert_tags
69+
from trafilatura.core import Extractor
70+
71+
# HTML snippet with lesser-known and legacy elements
72+
test_html = """<html><body>
73+
<article>
74+
<ruby>漢<rt>kan</rt>字<rt>ji</rt></ruby>
75+
<p>The <abbr title="HTML">HTML</abbr> spec includes
76+
<dfn>semantic elements</dfn> for meaning.</p>
77+
<p>Event: <data value="2025-01-15T14:30:00">2:30 PM</data></p>
78+
<p>Please <mark>remember this</mark> information.</p>
79+
<p>Arabic: <bdi>مرحبا</bdi> means hello.</p>
80+
<blockquote>Quote text <cite>Author</cite></blockquote>
81+
82+
<!-- Legacy elements -->
83+
<center>Centered text</center>
84+
<nobr>Non-breaking text</nobr>
85+
<big>Bigger text</big>
86+
87+
<!-- Modern elements -->
88+
<search>Search content</search>
89+
<fencedframe src="example.html">Fallback</fencedframe>
90+
<progress value="70" max="100">70%</progress>
91+
<meter value="6" max="10">6/10</meter>
92+
93+
<template id="tmpl">Template content</template>
94+
95+
<details>
96+
<summary>Expandable</summary>
97+
<p>Hidden content</p>
98+
</details>
99+
</article>
100+
</body></html>"""
101+
102+
# Parse the HTML
103+
doc = html.fromstring(test_html)
104+
105+
# Apply tag conversion with minimal necessary options
106+
options = Extractor()
107+
options.formatting = True # Only formatting is needed for this test
108+
109+
# Before the patch, many elements would be stripped or ignored.
110+
# With the patch, they're preserved due to MDN element mapping.
111+
converted_doc = convert_tags(doc, options)
112+
result_html = etree.tostring(converted_doc, encoding='unicode')
113+
114+
# Verify specific lesser-known elements are preserved
115+
# These demonstrate elements that would have been lost before the patch
116+
assert '<ruby>' in result_html and '</ruby>' in result_html
117+
assert '<rt>' in result_html and '</rt>' in result_html
118+
assert '<abbr' in result_html and '</abbr>' in result_html
119+
assert '<dfn>' in result_html and '</dfn>' in result_html
120+
assert '<data' in result_html and '</data>' in result_html
121+
assert '<mark>' in result_html and '</mark>' in result_html
122+
assert '<bdi>' in result_html and '</bdi>' in result_html
123+
assert '<cite>' in result_html and '</cite>' in result_html
124+
125+
# Legacy elements that are now preserved
126+
assert '<center>' in result_html and '</center>' in result_html
127+
assert '<nobr>' in result_html and '</nobr>' in result_html
128+
assert '<big>' in result_html and '</big>' in result_html
129+
130+
# Modern elements that are now preserved
131+
assert '<search>' in result_html and '</search>' in result_html
132+
assert '<fencedframe' in result_html and '</fencedframe>' in result_html
133+
assert '<progress' in result_html and '</progress>' in result_html
134+
assert '<meter' in result_html and '</meter>' in result_html
135+
assert '<template' in result_html and '</template>' in result_html
136+
137+
# Verify text content is still accessible
138+
text_content = converted_doc.text_content()
139+
assert '漢' in text_content and '字' in text_content # Ruby characters
140+
assert 'kan' in text_content and 'ji' in text_content # Ruby text
141+
assert 'HTML' in text_content # Abbreviation text
142+
assert 'semantic elements' in text_content # Definition text
143+
assert '2:30 PM' in text_content # Data element text
144+
assert 'remember this' in text_content # Mark element text
145+
assert 'مرحبا' in text_content # Bidirectional text
146+
assert 'Quote text' in text_content # Blockquote text
147+
assert 'Author' in text_content # Citation text
148+
assert 'Centered text' in text_content # Legacy center text
149+
assert 'Non-breaking text' in text_content # Legacy nobr text
150+
assert 'Bigger text' in text_content # Legacy big text
151+
assert 'Search content' in text_content # Search element text
152+
assert 'Fallback' in text_content # Fencedframe fallback
153+
assert '70%' in text_content # Progress text
154+
assert '6/10' in text_content # Meter text
155+
assert 'Template content' in text_content # Template text
156+
assert 'Expandable' in text_content # Summary text
157+
assert 'Hidden content' in text_content # Details content
158+
159+
160+
def test_comprehensive_tag_conversion_before_after() -> None:
161+
"""Demonstrate before/after behavior of comprehensive tag conversion."""
162+
from lxml import html, etree
163+
from trafilatura.htmlprocessing import HTML_EL_TO_XML_EL, convert_tags
164+
from trafilatura.core import Extractor
165+
166+
# Simple HTML with elements that weren't handled before the patch
167+
simple_html = ('<body><search>Search</search><ruby>Ruby</ruby>'
168+
'<nobr>NoBreak</nobr></body>')
169+
170+
# Parse HTML
171+
doc = html.fromstring(simple_html)
172+
173+
# Apply tag conversion using the public API
174+
options = Extractor()
175+
converted_doc = convert_tags(doc, options)
176+
converted_html = etree.tostring(converted_doc, encoding='unicode')
177+
178+
# Verify that:
179+
# 1. Elements are preserved (not stripped)
180+
# 2. Identity mappings work (element stays the same)
181+
# 3. All elements from MDN list have mappings
182+
assert '<search>' in converted_html # Should be preserved
183+
assert '<ruby>' in converted_html # Should be preserved
184+
assert '<nobr>' in converted_html # Should be preserved
185+
186+
# Verify elements have proper mappings
187+
assert HTML_EL_TO_XML_EL['search'] == 'search' # Identity mapping
188+
assert HTML_EL_TO_XML_EL['ruby'] == 'ruby' # Identity mapping
189+
assert HTML_EL_TO_XML_EL['nobr'] == 'nobr' # Identity mapping
190+
191+
192+
def test_table_elements_excluded_from_conversion() -> None:
193+
"""Ensure table elements are not converted to avoid conflicts."""
194+
from lxml import html, etree
195+
from trafilatura.htmlprocessing import convert_tags
196+
from trafilatura.core import Extractor
197+
198+
# HTML with table elements that should NOT be converted
199+
table_html = '''<body>
200+
<table>
201+
<tr><td>Cell 1</td><th>Header 1</th></tr>
202+
<tr><td>Cell 2</td><th>Header 2</th></tr>
203+
</table>
204+
</body>'''
205+
206+
# Parse HTML
207+
doc = html.fromstring(table_html)
208+
209+
# Apply tag conversion
210+
options = Extractor()
211+
converted_doc = convert_tags(doc, options)
212+
result_html = etree.tostring(converted_doc, encoding='unicode')
213+
214+
# Verify that table elements are NOT converted (remain as-is)
215+
# This prevents conflicts with main_extractor's table processing logic
216+
assert '<table>' in result_html and '</table>' in result_html
217+
assert '<tr>' in result_html and '</tr>' in result_html
218+
assert '<td>' in result_html and '</td>' in result_html
219+
assert '<th>' in result_html and '</th>' in result_html
220+
221+
# Verify they did NOT get converted to their XML equivalents
222+
assert '<row>' not in result_html # tr should NOT be converted to row
223+
assert '<cell>' not in result_html # td/th should NOT be converted
224+
225+
226+
def test_conversions_consistency() -> None:
227+
"""Ensure all CONVERSIONS keys are excluded to maintain consistency."""
228+
from trafilatura.htmlprocessing import (
229+
CONVERSIONS, _EXCLUDED_TAGS # test-hook
230+
)
231+
232+
# All CONVERSIONS keys must be in _EXCLUDED_TAGS to prevent conflicts
233+
conversions_keys = set(CONVERSIONS.keys())
234+
missing_exclusions = conversions_keys - _EXCLUDED_TAGS
235+
236+
assert not missing_exclusions, \
237+
f"CONVERSIONS keys not in _EXCLUDED_TAGS: {missing_exclusions}. " \
238+
f"This will cause conflicts in convert_tags processing."
239+
240+
241+
def test_unsafe_tags_are_cleaned() -> None:
242+
"""Verify potentially unsafe HTML elements are handled."""
243+
from trafilatura.htmlprocessing import (
244+
_ALL_TAGS_TO_CONVERT # test-hook
245+
)
246+
from trafilatura.settings import MANUALLY_CLEANED
247+
248+
# Tags that could pose security risks if preserved unchecked
249+
unsafe_tags = {'embed', 'object', 'svg', 'math', 'canvas', 'script',
250+
'iframe', 'frame', 'frameset', 'applet'}
251+
252+
preserved_unsafe = unsafe_tags & _ALL_TAGS_TO_CONVERT
253+
manually_cleaned = set(MANUALLY_CLEANED)
254+
risky_tags = preserved_unsafe - manually_cleaned
255+
256+
assert not risky_tags, \
257+
f"Potentially unsafe tags are preserved but not in " \
258+
f"MANUALLY_CLEANED: {risky_tags}. Consider adding them to " \
259+
f"MANUALLY_CLEANED or removing from MDN_ELEMENTS."

trafilatura/html_elements_reference.py

Lines changed: 20 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -14,58 +14,58 @@
1414
MDN_ELEMENTS: Set[str] = {
1515
# ——— Document root ———
1616
"html",
17-
17+
1818
# ——— Document metadata ———
1919
"base", "head", "link", "meta", "style", "title",
20-
20+
2121
# ——— Sectioning root ———
2222
"body",
23-
23+
2424
# ——— Content sectioning ———
2525
"address", "article", "aside", "footer", "header", "h1", "h2", "h3", "h4",
2626
"h5", "h6", "hgroup", "main", "nav", "section", "search",
27-
27+
2828
# ——— Text content ———
2929
"blockquote", "dd", "div", "dl", "dt", "figcaption", "figure", "hr", "li",
3030
"menu", "ol", "p", "pre", "ul",
31-
31+
3232
# ——— Inline text semantics ———
3333
"a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn", "em",
34-
"i", "kbd", "mark", "q", "rp", "rt", "ruby", "s", "samp", "small", "span",
34+
"i", "kbd", "mark", "q", "rp", "rt", "ruby", "s", "samp", "small", "span",
3535
"strong", "sub", "sup", "time", "u", "var", "wbr",
36-
36+
3737
# ——— Image & multimedia ———
3838
"area", "audio", "img", "map", "track", "video",
39-
39+
4040
# ——— Embedded content ———
4141
"embed", "fencedframe", "iframe", "object", "picture", "source",
42-
42+
4343
# ——— SVG and MathML ———
4444
"svg", "math",
45-
45+
4646
# ——— Scripting ———
4747
"canvas", "noscript", "script",
48-
48+
4949
# ——— Demarcating edits ———
5050
"del", "ins",
51-
51+
5252
# ——— Table content ———
5353
"caption", "col", "colgroup", "table", "tbody", "td", "tfoot",
5454
"th", "thead", "tr",
55-
55+
5656
# ——— Forms ———
5757
"button", "datalist", "fieldset", "form", "input", "label", "legend",
58-
"meter", "optgroup", "option", "output", "progress", "select",
58+
"meter", "optgroup", "option", "output", "progress", "select",
5959
"selectedcontent", "textarea",
60-
60+
6161
# ——— Interactive elements ———
6262
"details", "dialog", "summary",
63-
63+
6464
# ——— Web Components ———
6565
"slot", "template",
66-
66+
6767
# ——— Obsolete/deprecated (included for completeness) ———
68-
"acronym", "big", "center", "content", "dir", "font", "frame", "frameset",
69-
"image", "marquee", "menuitem", "nobr", "noembed", "noframes", "param",
68+
"acronym", "big", "center", "content", "dir", "font", "frame", "frameset",
69+
"image", "marquee", "menuitem", "nobr", "noembed", "noframes", "param",
7070
"plaintext", "rb", "rtc", "shadow", "strike", "tt", "xmp",
71-
}
71+
}

0 commit comments

Comments
 (0)