From 98c8a987e8c9884584cee5dc7ed8697d255db220 Mon Sep 17 00:00:00 2001 From: flan Date: Mon, 1 Oct 2018 01:47:27 +0200 Subject: [PATCH 1/3] Extract links from HTML5 media tags Port of wget commit 6a2d67b5836a6f1b9c989968a5392ff3511bc1f9. --- wpull/scraper/html.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wpull/scraper/html.py b/wpull/scraper/html.py index a1672515..efff9a04 100644 --- a/wpull/scraper/html.py +++ b/wpull/scraper/html.py @@ -264,6 +264,7 @@ class ElementWalker(object): LINK_ATTRIBUTES = frozenset([ 'action', 'archive', 'background', 'cite', 'classid', 'codebase', 'data', 'href', 'longdesc', 'profile', 'src', + 'poster', 'usemap', 'dynsrc', 'lowsrc', ]) @@ -293,6 +294,9 @@ class ElementWalker(object): 'table': {'background': ATTR_INLINE}, 'td': {'background': ATTR_INLINE}, 'th': {'background': ATTR_INLINE}, + 'video': {'src': ATTR_INLINE, 'poster': ATTR_INLINE}, + 'audio': {'src': ATTR_INLINE, 'poster': ATTR_INLINE}, + 'source': {'src': ATTR_INLINE}, } '''Mapping of element tag names to attributes containing links.''' DYNAMIC_ATTRIBUTES = ('onkey', 'oncli', 'onmou') From bade6e9aff0b5249697d5fc967c046263f7cc6d2 Mon Sep 17 00:00:00 2001 From: flan Date: Wed, 10 Oct 2018 23:47:20 +0200 Subject: [PATCH 2/3] Add tests for 98c8a987e8c9884584cee5dc7ed8697d255db220 --- wpull/scraper/html_test.py | 4 ++++ wpull/testing/samples/many_urls.html | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/wpull/scraper/html_test.py b/wpull/scraper/html_test.py index 175a8349..0d0f8468 100644 --- a/wpull/scraper/html_test.py +++ b/wpull/scraper/html_test.py @@ -65,7 +65,11 @@ def test_html_scraper_links(self): 'http://example.com/applet/applet_src.class', 'http://example.com/bgsound.mid', 'http://example.com/audio_src.wav', + 'http://example.com/audio_poster.jpeg', 'http://example.net/source_src.wav', + 'http://example.com/video_src.webm', + 'http://example.com/video_poster.jpeg', + 'http://example.net/source_src.webm', 'http://example.com/embed_src.mov', 'http://example.com/fig_src.png', 'http://example.com/frame_src.html', diff --git a/wpull/testing/samples/many_urls.html b/wpull/testing/samples/many_urls.html index 08a9a624..666becd7 100644 --- a/wpull/testing/samples/many_urls.html +++ b/wpull/testing/samples/many_urls.html @@ -41,9 +41,12 @@ -