Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ yarn build
Create a Python virtual environment and install required packages:

```
python3.8 -m venv venv
python3.6 -m venv venv
source venv/bin/activate
pip install -r requirements/base.txt
```
Expand Down Expand Up @@ -248,7 +248,7 @@ under the `/sample/src` subdirectory.
To regenerate these files, first serve the sample website locally:

```
python -m http.server -d ./sample/src
cd ./sample/src && python -m http.server
```

This starts the sample website running at http://localhost:8000.
Expand Down
84 changes: 84 additions & 0 deletions crawler/management/commands/crawl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os
import os.path

import djclick as click
from wpull.application.builder import Builder
from wpull.application.options import AppArgumentParser

from crawler import wpull_plugin


@click.command()
@click.argument("start_url")
@click.argument("db_filename", type=click.Path())
@click.option(
    "--max-pages", type=int, help="Maximum number of pages to crawl", default=0
)
@click.option("--depth", type=int, help="Maximum crawl depth", default=0)
@click.option(
    "--recreate",
    is_flag=True,
    show_default=True,
    default=False,
    help="Overwrite SQLite database if it already exists",
)
@click.option("--resume", is_flag=True)
def command(start_url, db_filename, max_pages, depth, recreate, resume):
    """Crawl a website to a SQLite database."""
    # NOTE: the docstring above is what click shows as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Arguments:
    #   start_url    URL handed to wpull as the crawl starting point.
    #   db_filename  Path of the SQLite database passed to the wpull plugin.
    #   max_pages    Page limit forwarded to the plugin via --plugin-args.
    #   depth        Value forwarded to wpull's --level option.
    #   recreate     Delete an existing db_filename before crawling.
    #   resume       Keep an existing db_filename and wpull progress file.
    #
    # Returns the exit status reported by the wpull application.
    # Raises click.ClickException if db_filename already exists and neither
    # --recreate nor --resume was given.
    if os.path.exists(db_filename):
        if not recreate and not resume:
            raise click.ClickException(
                f"File {db_filename} already exists, "
                "use --recreate to recreate "
                "or --resume to resume a previous crawl."
            )

        # NOTE(review): passing both --recreate and --resume deletes the
        # crawl database but keeps the wpull progress file; confirm whether
        # that combination should be rejected.
        if recreate:
            os.remove(db_filename)

    wpull_progress_filename = f"{db_filename}.wpull.db"
    click.echo(
        f"Storing crawl progress in {wpull_progress_filename}, use --resume to resume."
    )

    # A fresh crawl must not inherit progress from an earlier run.
    # (os.path has no remove(); the file is deleted with os.remove.)
    if not resume and os.path.exists(wpull_progress_filename):
        os.remove(wpull_progress_filename)

    arg_parser = AppArgumentParser()
    args = arg_parser.parse_args(
        [
            start_url,
            "--quiet",
            "--recursive",
            "--delete-after",
            "--no-robots",
            "--wait=0.5",
            "--random-wait",
            "--dns-timeout=5",
            "--connect-timeout=5",
            "--read-timeout=30",
            "--session-timeout=30",
            "--span-hosts",
            "--link-extractors=html",
            "--follow-tags=a",
            "--user-agent=CFPB website indexer",
            "--no-check-certificate",
            f"--level={depth}",
            f"--plugin-script={wpull_plugin.__file__}",
            f"--plugin-args={db_filename},{max_pages}",
            f"--database={wpull_progress_filename}",
        ]
    )
    builder = Builder(args)
    app = builder.build()

    # This is required due to the use of async code in wpull. Unfortunately
    # wpull hooks aren't called in a way that allows us to wrap Django database
    # calls with sync_to_async. This is only safe because we only download one
    # URL at a time.
    # https://docs.djangoproject.com/en/3.2/topics/async/#async-safety
    os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

    exit_status = app.run_sync()
    click.echo(f"done, exiting with status {exit_status}")
    return exit_status
121 changes: 120 additions & 1 deletion crawler/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
import lxml.etree
import lxml.html.soupparser
import re
from urllib import parse

from django.db import models
from django.utils import timezone

from modelcluster.models import ClusterableModel
from modelcluster.fields import ParentalManyToManyField
Expand Down Expand Up @@ -35,6 +41,105 @@ class Page(Request, ClusterableModel):
components = ParentalManyToManyField(Component, related_name="pages")
links = ParentalManyToManyField(Link, related_name="links")

def __str__(self):
    """Return the page's URL as its string representation."""
    return self.url

# Captures tokens starting with "o-", "m-" or "a-" that directly follow
# either `class="` or a whitespace character; from_html() runs this over the
# serialized body HTML to collect component class names.
HTML_COMPONENT_SEARCH = re.compile(r"(?:(?:class=\")|\s)((?:o|m|a)-[\w\-]*)")
# Applied with .match() to a link's URL path, so it only matches paths that
# begin with "/external-site/".
HTML_EXTERNAL_SITE = re.compile("/external-site/")
# One or more whitespace characters; used to collapse runs of whitespace in
# the extracted body text into single spaces.
HTML_WHITESPACE = re.compile(r"\s+")

@classmethod
def from_html(
    cls,
    url,
    html,
    internal_link_host,
):
    """Build a Page from a URL and the HTML retrieved from it.

    Args:
        url: URL the HTML was retrieved from; stored on the page as-is.
        html: Raw HTML document; parsed with lxml and stored as-is.
        internal_link_host: Host name treated as "this site" when deciding
            whether an absolute /external-site/ link is one of our own
            wrapper links.

    Returns:
        A new Page instance (this method does not call save()), or None
        if the document has no <title> element. If the document has no
        <body>, the page is returned with text=None and without links or
        components assigned.
    """
    try:
        tree = lxml.html.fromstring(html)
    except lxml.etree.ParserError:
        # https://bugs.launchpad.net/lxml/+bug/1949271
        tree = lxml.html.soupparser.fromstring(html)

    title_tag = tree.find(".//title")
    if title_tag is None:
        return None

    # An empty <title></title> has text None; treat it the same way as a
    # whitespace-only title instead of raising AttributeError.
    title = (title_tag.text or "").strip()
    language = tree.find(".").get("lang")

    body = cls._get_cleaned_body_from_tree(tree)

    if body is not None:
        text = cls.HTML_WHITESPACE.sub(" ", body.text_content()).strip()
    else:
        text = None

    page = Page(
        timestamp=timezone.now(),
        url=url,
        title=title,
        language=language,
        html=html,
        text=text,
    )

    if body is None:
        return page

    # De-duplicate *after* unwrapping so that a wrapped external link and a
    # direct link to the same target produce a single Link.
    hrefs = {
        cls._unwrap_external_link(href, internal_link_host)
        for element, attribute, href, _pos in body.iterlinks()
        if element.tag == "a" and attribute == "href"
    }

    page.links = [Link(href=href) for href in sorted(hrefs)]

    body_html = lxml.etree.tostring(body, encoding="unicode")

    class_names = set(cls.HTML_COMPONENT_SEARCH.findall(body_html))
    page.components = [
        Component(class_name=class_name) for class_name in sorted(class_names)
    ]

    return page

@classmethod
def _unwrap_external_link(cls, href, internal_link_host):
    """Return the ext_url target of an /external-site/ link, else href."""
    try:
        parsed_href = parse.urlparse(href)
    except ValueError:
        # Malformed hrefs (e.g. a broken IPv6 literal) are kept verbatim
        # rather than aborting the whole page.
        return href

    if not cls.HTML_EXTERNAL_SITE.match(parsed_href.path):
        return href

    # An absolute link to some other host's /external-site/ path is not
    # one of our wrapper links.
    if parsed_href.netloc and internal_link_host != parsed_href.netloc:
        return href

    ext_url = parse.parse_qs(parsed_href.query).get("ext_url")
    return ext_url[0] if ext_url else href

@staticmethod
def _get_cleaned_body_from_tree(tree):
    """Return the tree's <body> with unwanted elements dropped.

    Elements matching the header, footer and skip-nav classes, as well as
    img, script and style tags, are removed from the body in place.
    Returns None when the tree has no direct <body> child.
    """
    body = tree.find("./body")
    if body is None:
        return body

    unwanted_selectors = (
        ".o-header",
        ".o-footer",
        ".skip-nav",
        "img",
        "script",
        "style",
    )

    for selector in unwanted_selectors:
        for unwanted in body.cssselect(selector):
            unwanted.drop_tree()

    return body


class ErrorBase(Request):
status_code = models.PositiveIntegerField(db_index=True)
Expand All @@ -43,10 +148,24 @@ class ErrorBase(Request):
class Meta(Request.Meta):
abstract = True

def __str__(self):
    """Return "<url>[ (from <referrer>)] <status_code>"."""
    # The referrer segment is only included when a referrer is set.
    origin = f" (from {self.referrer})" if self.referrer else ""
    return self.url + origin + f" {self.status_code}"


class Error(ErrorBase):
    # Concrete ErrorBase model; its string form is the base representation
    # followed by " !".
    def __str__(self):
        return f"{super().__str__()} !"


class Redirect(ErrorBase):
    # Indexed text column holding the redirect's location value.
    location = models.TextField(db_index=True)

    def __str__(self):
        # Base representation followed by an arrow and the location.
        return f"{super().__str__()} -> {self.location}"
Empty file added crawler/tests/__init__.py
Empty file.
110 changes: 110 additions & 0 deletions crawler/tests/test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from operator import attrgetter
from unittest.mock import patch

import lxml.etree

from django.test import SimpleTestCase

from crawler.models import Error, Page, Redirect


class PageTests(SimpleTestCase):
    # Exercises Page.from_html() on in-memory HTML documents.

    def test_from_html_no_title_returns_none(self):
        # A document without a <title> element yields no Page at all.
        self.assertIsNone(
            Page.from_html(
                "https://example.com/",
                "<html><head></head><body>This page has no title.</body></html>",
                "example.com",
            )
        )

    def check_from_html(self):
        # Shared assertions, reused by the tests below so that the same
        # expectations are checked with the default parser and with the
        # fallback parser.
        html = """
<html lang="en">
<head><title>Test page</title></head>
<body>
<script>Ignore me!</script>
<div class="m-links">Links</div>
<div><a href="/page/">A regular link on the same domain.</a></div>
<div class="a-external-link">
<a href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2F">
An external link pointing to another domain
</a>
<a href="/external-site/">
An external link missing its target
</a>
<a href="https://example.org/external-site/">
A link on another domain that also uses /external-site/
</a>
</div>
</body>
</html>
""".strip()

        page = Page.from_html("https://example.com/", html, "example.com")
        self.assertEqual(str(page), "https://example.com/")
        self.assertEqual(page.title, "Test page")
        self.assertEqual(page.language, "en")
        self.assertEqual(page.html, html)
        # The <script> content is dropped and whitespace is collapsed.
        self.assertEqual(
            page.text,
            (
                "Links "
                "A regular link on the same domain. "
                "An external link pointing to another domain "
                "An external link missing its target "
                "A link on another domain that also uses /external-site/"
            ),
        )
        self.assertCountEqual(
            page.components.values_list("class_name", flat=True),
            ["a-external-link", "m-links"],
        )
        # Only the same-host wrapper link that carries an ext_url is
        # unwrapped; the other /external-site/ links are kept verbatim.
        self.assertCountEqual(
            page.links.values_list("href", flat=True),
            [
                "/external-site/",
                "/page/",
                "https://example.org/",
                "https://example.org/external-site/",
            ],
        )

    def test_from_html(self):
        self.check_from_html()

    def test_from_html_etree_fallback_parser(self):
        # Force the primary parser to fail so that from_html() has to use
        # its soupparser fallback.
        with patch(
            "lxml.html.fromstring",
            side_effect=lxml.etree.ParserError("testing parser error"),
        ):
            self.check_from_html()

    def test_from_html_no_body(self):
        # Without a <body>, the page is still created but has no text.
        html = '<html lang="en"><head><title>Test page with no body</head></html>'
        page = Page.from_html("https://example.com/", html, "example.com")
        self.assertEqual(str(page), "https://example.com/")
        self.assertEqual(page.title, "Test page with no body")
        self.assertEqual(page.language, "en")
        self.assertEqual(page.html, html)
        self.assertIsNone(page.text)


class ErrorTests(SimpleTestCase):
    # Checks the string representations of the Error and Redirect models.

    def test_error_str(self):
        # Without a referrer: URL, status code, then the " !" suffix.
        error = Error(url="/not-found/", status_code=404)
        self.assertEqual(str(error), "/not-found/ 404 !")

    def test_error_str_with_referrer(self):
        # With a referrer: the "(from ...)" segment sits between the URL
        # and the status code, followed by the redirect location.
        redirect = Redirect(
            url="/redirect/",
            referrer="/source/",
            status_code=301,
            location="/destination/",
        )
        expected = "/redirect/ (from /source/) 301 -> /destination/"
        self.assertEqual(str(redirect), expected)
Loading