lxml_html_clean-0.4.2/.github/workflows/main.yml

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

name: Run Tox tests

jobs:
  tox_test:
    name: Tox test
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Run Tox tests
        id: test
        uses: fedora-python/tox-github-action@main
        with:
          tox_env: ${{ matrix.tox_env }}
          dnf_install: gcc libxml2-devel libxslt-devel
    strategy:
      matrix:
        tox_env:
          - py36
          - py38
          - py39
          - py310
          - py311
          - py312
          - py313
          - mypy
    # Use GitHub's Linux Docker host
    runs-on: ubuntu-latest

lxml_html_clean-0.4.2/.gitignore

__pycache__
*.pyc
*.pyo
.tox
dist/
docs/_build
build/
lxml_html_clean.egg-info/

lxml_html_clean-0.4.2/.readthedocs.yaml

# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.12"

sphinx:
  configuration: docs/conf.py

python:
  install:
    - requirements: docs/requirements.txt

lxml_html_clean-0.4.2/CHANGES.rst

=========================
lxml_html_clean changelog
=========================

Unreleased
==========

0.4.2 (2025-04-09)
==================

Bugs fixed
----------

* `lxml_html_clean` now correctly handles HTML input as bytes as it did
  before the 0.2.0 release.

0.4.1 (2024-11-15)
==================

Bugs fixed
----------

* Removed superfluous debug prints.

0.4.0 (2024-11-12)
==================

Bugs fixed
----------

* The ``Cleaner()`` now scans for hidden JavaScript code embedded within CSS
  comments. In certain contexts, such as within ``<svg>`` or ``<math>`` tags,
  ``<style>`` content is parsed differently, and JavaScript hidden inside a
  CSS comment could otherwise survive cleaning.
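The 0.4.0 entry above is easy to see in action. The following is a minimal
sketch, not code from this package's documentation; the expected output is an
assumption based on the test suite later in this archive, which expects
suspicious style content to be replaced with ``/* deleted */``::

    import lxml.html
    from lxml_html_clean import clean_html

    # JavaScript smuggled into a CSS comment inside an SVG <style> block.
    fragment = lxml.html.fragment_fromstring(
        '<svg><style>/*<img src onerror=alert(1)>*/</style></svg>')

    # The cleaner replaces the whole suspicious style content.
    print(lxml.html.tostring(clean_html(fragment)))
    # assumed output: b'<svg><style>/* deleted */</style></svg>'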
lxml_html_clean-0.4.2/docs/index.rst

.. sourcecode:: pycon

    >>> html = '''\
    ... <html>
    ...  <head>
    ...    <script type="text/javascript" src="evil-site"></script>
    ...    <link rel="alternate" type="text/rss" src="evil-rss">
    ...    <style>
    ...      body {background-image: url(javascript:do_evil)};
    ...      div {color: expression(evil)};
    ...    </style>
    ...  </head>
    ...  <body onload="evil_function()">
    ...    <!-- I am interpreted for EVIL! -->
    ...    <a href="javascript:evil_function()">a link</a>
    ...    <a href="#" onclick="evil_function()">another link</a>
    ...    <p onclick="evil_function()">a paragraph</p>
    ...    <div style="display: none">secret EVIL!</div>
    ...    <object> of EVIL! </object>
    ...    <iframe src="evil-site"></iframe>
    ...    <form action="evil-site">
    ...      Password: <input type="password" name="password">
    ...    </form>
    ...    <blink>annoying EVIL!</blink>
    ...    <a href="evil-site">spam spam SPAM!</a>
    ...    <image src="evil!">
    ...  </body>
    ... </html>'''

To remove all superfluous content from this unparsed document, use the
``clean_html`` function:

.. sourcecode:: pycon

    >>> from lxml_html_clean import clean_html
    >>> print(clean_html(html))
    <div>
      <style>/* deleted */</style>
      <body>
        <a href="">a link</a>
        <a href="#">another link</a>
        <p>a paragraph</p>
        <div>secret EVIL!</div>
        of EVIL!
        Password:
        annoying EVIL!
        <a href="evil-site">spam spam SPAM!</a>
        <img src="evil!">
      </body>
    </div>
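``clean_html`` accepts an already-parsed tree as well as markup text, in which
case it returns a cleaned element rather than a string. A minimal sketch (the
input document here is illustrative)::

    import lxml.html
    from lxml_html_clean import clean_html

    root = lxml.html.document_fromstring(
        '<html><body onload="evil_function()"><p>a paragraph</p></body></html>')
    cleaned = clean_html(root)          # returns an lxml element, not text
    print(lxml.html.tostring(cleaned))  # the onload handler is gone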
The ``Cleaner`` class supports several keyword arguments to control exactly
which content is removed:

.. sourcecode:: pycon

    >>> from lxml_html_clean import Cleaner

    >>> cleaner = Cleaner(page_structure=False, links=False)
    >>> print(cleaner.clean_html(html))
    <html>
      <head>
        <link rel="alternate" type="text/rss" src="evil-rss">
        <style>/* deleted */</style>
      </head>
      <body>
        <a href="">a link</a>
        <a href="#">another link</a>
        <p>a paragraph</p>
        <div>secret EVIL!</div>
        of EVIL!
        Password:
        annoying EVIL!
        <a href="evil-site">spam spam SPAM!</a>
        <img src="evil!">
      </body>
    </html>

    >>> cleaner = Cleaner(style=True, links=True, add_nofollow=True,
    ...                   page_structure=False, safe_attrs_only=False)
    >>> print(cleaner.clean_html(html))
    <html>
      <head></head>
      <body>
        <a href="">a link</a>
        <a href="#">another link</a>
        <p>a paragraph</p>
        <div>secret EVIL!</div>
        of EVIL!
        Password:
        annoying EVIL!
        <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
        <img src="evil!">
      </body>
    </html>

You can also whitelist some otherwise dangerous content with
``Cleaner(host_whitelist=['www.youtube.com'])``, which would allow embedded
media from YouTube, while still filtering out embedded media from other sites.

See the docstring of ``Cleaner`` for the details of what can be cleaned.
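A minimal sketch of the host whitelisting described above (the embed URL is
made up; note that ``host_whitelist`` must be a list or tuple of hostnames,
not a bare string -- the test suite below checks that a plain string is
rejected)::

    from lxml_html_clean import Cleaner

    cleaner = Cleaner(host_whitelist=['www.youtube.com'])
    html = '<div><iframe src="https://www.youtube.com/embed/abc123"></iframe></div>'
    print(cleaner.clean_html(html))  # the whitelisted iframe survives cleaning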
autolink
--------

In addition to cleaning up malicious HTML, ``lxml_html_clean`` contains
functions to do other things to your HTML.  This includes autolinking::

    autolink(doc, ...)
    autolink_html(html, ...)

This finds anything that looks like a link (e.g., ``http://example.com``) in
the *text* of an HTML document, and turns it into an anchor.  It avoids making
bad links.  Links in the elements ``<textarea>``, ``<pre>``, ``<code>``,
anything in the class ``nolink``, and text that is already part of a link, are
not autolinked:

.. sourcecode:: pycon

    >>> from lxml_html_clean import autolink_html
    >>> print(autolink_html('''
    ...    <div>A link in <textarea>http://foo.com</textarea></div>'''))
    <div>A link in <textarea>http://foo.com</textarea></div>
    >>> print(autolink_html('''
    ...    <div>A link in http://bar.com</div>'''))
    <div>A link in <a href="http://bar.com">http://bar.com</a></div>
    >>> print(autolink_html('''
    ...    <div>A link in http://foo.com or
    ...    http://bar.com</div>'''))
    <div>A link in <a href="http://foo.com">http://foo.com</a> or
    <a href="http://bar.com">http://bar.com</a></div>
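``autolink`` modifies a parsed document in place, while ``autolink_html``
wraps the parsing and serialisation steps around it, as in the examples above.
A minimal sketch::

    import lxml.html
    from lxml_html_clean import autolink

    doc = lxml.html.fromstring('<div>See http://example.com for details.</div>')
    autolink(doc)                  # rewrites the tree in place
    print(lxml.html.tostring(doc))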
There's also a word wrapping function that should probably be run after
autolink::

    >>> from lxml_html_clean import word_break_html
    >>> def pascii(s):
    ...     print(s.encode('ascii', 'xmlcharrefreplace').decode('ascii'))
    >>> pascii(word_break_html(u'''
    ...    <div>Hey you
    ...    12345678901234567890123456789012345678901234567890</div>'''))
    <div>Hey you
    1234567890123456789012345678901234567890&#8203;1234567890</div>

Not everything is broken:

.. sourcecode:: pycon

    >>> pascii(word_break_html('''
    ...    <pre>Hey you
    ...    12345678901234567890123456789012345678901234567890</pre>'''))
    <pre>Hey you
    12345678901234567890123456789012345678901234567890</pre>
    >>> pascii(word_break_html('''
    ...    <a href="12345678901234567890123456789012345678901234567890">text</a>'''))
    <a href="12345678901234567890123456789012345678901234567890">text</a>
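``word_break_html`` forwards its extra arguments to ``word_break``, whose
``max_width`` defaults to 40 characters. A minimal sketch with a deliberately
small width (the input string is illustrative)::

    from lxml_html_clean import word_break_html

    # Insert a zero-width space (&#8203;) into any word longer than 10 chars.
    print(word_break_html('<div>abcdefghijklmnopqrstuvwxyz</div>', max_width=10))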
lxml_html_clean-0.4.2/tests/test_clean.py

import base64
import gzip
import io
import unittest
import warnings

import lxml.html
from lxml_html_clean import AmbiguousURLWarning, Cleaner, clean_html, LXMLHTMLCleanWarning

from .utils import peak_memory_usage


class CleanerTest(unittest.TestCase):
    def test_allow_tags(self):
        html = """
            <html>
            <head>
            </head>
            <body>
            <p>some text</p>
            <table>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            </table>
            <img>
            </body>
            </html>
            """

        html_root = lxml.html.document_fromstring(html)
        cleaner = Cleaner(
            remove_unknown_tags=False,
            allow_tags=['table', 'tr', 'td'])
        result = cleaner.clean_html(html_root)

        self.assertEqual(12-5+1, len(list(result.iter())))

    def test_allow_and_remove(self):
        with self.assertRaises(ValueError):
            Cleaner(allow_tags=['a'], remove_unknown_tags=True)

    def test_remove_unknown_tags(self):
        html = """<div><bogus>lettuce, tomato, veggie patty</bogus></div>"""
        clean_html = """<div>lettuce, tomato, veggie patty</div>"""
        cleaner = Cleaner(remove_unknown_tags=True)
        result = cleaner.clean_html(html)
        self.assertEqual(
            result,
            clean_html,
            msg="Unknown tags not removed. Got: %s" % result,
        )
    def test_safe_attrs_included(self):
        html = """<p><span style="color: #00ffff;">Cyan</span></p>"""

        safe_attrs = set(lxml.html.defs.safe_attrs)
        safe_attrs.add('style')

        cleaner = Cleaner(
            safe_attrs_only=True,
            safe_attrs=safe_attrs)
        result = cleaner.clean_html(html)

        self.assertEqual(html, result)

    def test_safe_attrs_excluded(self):
        html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
        expected = """<p><span>Cyan</span></p>"""

        safe_attrs = set()

        cleaner = Cleaner(
            safe_attrs_only=True,
            safe_attrs=safe_attrs)
        result = cleaner.clean_html(html)

        self.assertEqual(expected, result)

    def test_clean_invalid_root_tag(self):
        # only testing that cleaning with invalid root tags works at all
        s = lxml.html.fromstring('parent <invalid tag>child</another>')
        self.assertEqual('parent child', clean_html(s).text_content())

        s = lxml.html.fromstring('<invalid tag>child</another>')
        self.assertEqual('child', clean_html(s).text_content())
    def test_clean_with_comments(self):
        html = """<p><span style="color: #00ffff;">Cyan</span><!-- comment --></p>"""
        s = lxml.html.fragment_fromstring(html)
        self.assertEqual(
            b'<p><span>Cyan</span></p>',
            lxml.html.tostring(clean_html(s)))
        self.assertEqual(
            '<p><span>Cyan</span></p>',
            clean_html(html))

        cleaner = Cleaner(comments=False)
        result = cleaner.clean_html(s)
        self.assertEqual(
            b'<p><span>Cyan</span><!-- comment --></p>',
            lxml.html.tostring(result))
        self.assertEqual(
            '<p><span>Cyan</span><!-- comment --></p>',
            cleaner.clean_html(html))

    def test_sneaky_noscript_in_style(self):
        # This gets parsed as <noscript> -> <style>"..." and must not pass
        # the raw markup hidden in the style text through into the output.
        html = '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>"></style></noscript>'
        s = lxml.html.fragment_fromstring(html)
        self.assertEqual(
            b'<noscript><style>/* deleted */</style></noscript>',
            lxml.html.tostring(clean_html(s)))

    def test_sneaky_js_in_math_style(self):
        # This gets parsed as <math> -> <style>"..."</style>
        # thus passing any tag/script/whatever content through into the output.
        html = '<math><style><img src=x onerror=alert(1)></style></math>'
        s = lxml.html.fragment_fromstring(html)
        self.assertEqual(
            b'<math><style>/* deleted */</style></math>',
            lxml.html.tostring(clean_html(s)))

    def test_sneaky_js_in_style_comment_math_svg(self):
        for tag in "svg", "math":
            html = f'<{tag}><style>/*<img src onerror=alert(1)>*/</style></{tag}>'
            s = lxml.html.fragment_fromstring(html)
            expected = f'<{tag}><style>/* deleted */</style></{tag}>'.encode()

            self.assertEqual(
                expected,
                lxml.html.tostring(clean_html(s)))

    def test_sneaky_js_in_style_comment_noscript(self):
        html = '<noscript><style>/*<img src onerror=alert(1)>*/</style></noscript>'
        s = lxml.html.fragment_fromstring(html)
        self.assertEqual(
            b'<noscript><style>/* deleted */</style></noscript>',
            lxml.html.tostring(clean_html(s)))

    def test_sneaky_import_in_style(self):
        # Prevent "@@importimport" -> "@import" replacement etc.
        style_codes = [
            "@@importimport(extstyle.css)",
            "@ @ import import(extstyle.css)",
            "@ @ importimport(extstyle.css)",
            "@@ import import(extstyle.css)",
            "@ @import import(extstyle.css)",
            "@@importimport()",
            "@@importimport() ()",
            "@/* ... */import()",
            "@im/* ... */port()",
            "@ @import/* ... */import()",
            "@ /* ... */ import()",
        ]
        for style_code in style_codes:
            html = '<style>%s</style>' % style_code
            s = lxml.html.fragment_fromstring(html)
            cleaned = lxml.html.tostring(clean_html(s))
            self.assertEqual(
                b'<style>/* deleted */</style>',
                cleaned,
                "%s -> %s" % (style_code, cleaned))

    def test_sneaky_schemes_in_style(self):
        style_codes = [
            "javasjavascript:cript:",
            "javascriptjavascript::",
            "javascriptjavascript:: :",
            "vbjavascript:cript:",
        ]
        for style_code in style_codes:
            html = '<style>%s</style>' % style_code
            s = lxml.html.fragment_fromstring(html)
            cleaned = lxml.html.tostring(clean_html(s))
            self.assertEqual(
                b'<style>/* deleted */</style>',
                cleaned,
                "%s -> %s" % (style_code, cleaned))

    def test_sneaky_urls_in_style(self):
        style_codes = [
            "url(data:image/svg+xml;base64,...)",
            "url(javasjavascript:cript:)",
            "url(javasjavascript:cript: ::)",
            "url(vbjavascript:cript:)",
            "url(vbjavascript:cript: :)",
        ]
        for style_code in style_codes:
            html = '<style>%s</style>' % style_code
            s = lxml.html.fragment_fromstring(html)
            cleaned = lxml.html.tostring(clean_html(s))
            self.assertEqual(
                b'<style>/* deleted */</style>',
                cleaned,
                "%s -> %s" % (style_code, cleaned))

    def test_svg_data_links(self):
        # Remove SVG images with potentially insecure content.
        svg = b'<svg onload="alert(123)" />'
        gzout = io.BytesIO()
        f = gzip.GzipFile(fileobj=gzout, mode='wb')
        f.write(svg)
        f.close()
        svgz = gzout.getvalue()
        svg_b64 = base64.b64encode(svg).decode('ASCII')
        svgz_b64 = base64.b64encode(svgz).decode('ASCII')
        urls = [
            "data:image/svg+xml;base64," + svg_b64,
            "data:image/svg+xml-compressed;base64," + svgz_b64,
        ]
        for url in urls:
            html = '<img src="%s">' % url
            s = lxml.html.fragment_fromstring(html)
            cleaned = lxml.html.tostring(clean_html(s))
            self.assertEqual(
                b'<img src="">',
                cleaned,
                "%s -> %s" % (url, cleaned))

    def test_image_data_links(self):
        data = b'123'
        data_b64 = base64.b64encode(data).decode('ASCII')
        urls = [
            "data:image/jpeg;base64," + data_b64,
            "data:image/apng;base64," + data_b64,
            "data:image/png;base64," + data_b64,
            "data:image/gif;base64," + data_b64,
            "data:image/webp;base64," + data_b64,
            "data:image/bmp;base64," + data_b64,
            "data:image/tiff;base64," + data_b64,
            "data:image/x-icon;base64," + data_b64,
        ]
        for url in urls:
            html = '<img src="%s">' % url
            s = lxml.html.fragment_fromstring(html)
            cleaned = lxml.html.tostring(clean_html(s))
            self.assertEqual(
                html.encode("UTF-8"),
                cleaned,
                "%s -> %s" % (url, cleaned))

    def test_image_data_links_in_style(self):
        data = b'123'
        data_b64 = base64.b64encode(data).decode('ASCII')
        urls = [
            "data:image/jpeg;base64," + data_b64,
            "data:image/apng;base64," + data_b64,
            "data:image/png;base64," + data_b64,
            "data:image/gif;base64," + data_b64,
            "data:image/webp;base64," + data_b64,
            "data:image/bmp;base64," + data_b64,
            "data:image/tiff;base64," + data_b64,
            "data:image/x-icon;base64," + data_b64,
        ]
        for url in urls:
            html = '<style> url(%s) </style>' % url
            s = lxml.html.fragment_fromstring(html)
            cleaned = lxml.html.tostring(clean_html(s))
            self.assertEqual(
                html.encode("UTF-8"),
                cleaned,
                "%s -> %s" % (url, cleaned))

    def test_image_data_links_in_inline_style(self):
        safe_attrs = set(lxml.html.defs.safe_attrs)
        safe_attrs.add('style')

        cleaner = Cleaner(
            safe_attrs_only=True,
            safe_attrs=safe_attrs)

        data = b'123'
        data_b64 = base64.b64encode(data).decode('ASCII')
        url = "url(data:image/jpeg;base64,%s)" % data_b64
        styles = [
            "background: %s" % url,
            "background: %s; background-image: %s" % (url, url),
        ]
        for style in styles:
            html = '<div style="%s"></div>' % style
            s = lxml.html.fragment_fromstring(html)
            cleaned = lxml.html.tostring(cleaner.clean_html(s))
            self.assertEqual(
                html.encode("UTF-8"),
                cleaned,
                "%s -> %s" % (style, cleaned))
    def test_formaction_attribute_in_button_input(self):
        # The formaction attribute overrides the form's action and should be
        # treated as a malicious link attribute
        html = ('<form action="/submit"><input type="submit" formaction="javascript:alert(1)">'
                '<button type="submit" formaction="javascript:alert(1)"></button></form>')
        expected = ('<form action="/submit"><input type="submit" formaction="">'
                    '<button type="submit" formaction=""></button></form>')
        cleaner = Cleaner(
            forms=False,
            safe_attrs_only=False,
        )
        self.assertEqual(
            expected,
            cleaner.clean_html(html))

    def test_host_whitelist_slash_type_confusion(self):
        # Regression test: Accidentally passing a string when a 1-tuple was intended
        # creates a host_whitelist of the empty string; a malformed triple-slash
        # URL has an "empty host" according to urlsplit, and `"" in ""` passes.
        # So, don't allow user to accidentally pass a string for host_whitelist.
        html = '<div><iframe src="https:///evil.com/page"></iframe></div>'
        expected = '<div></div>'
        cleaner = Cleaner(frames=False, host_whitelist=["example.com"])
        self.assertEqual(expected, cleaner.clean_html(html))

    def test_host_whitelist_invalid(self):
        html = '<div><iframe src="https://example.com/page"></iframe></div>'
        with self.assertRaises(TypeError):
            Cleaner(frames=False, host_whitelist="example.com").clean_html(html)

    def test_possibly_invalid_url_with_whitelist(self):
        cleaner = Cleaner(host_whitelist=["google.com"])
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            result = cleaner.clean_html(r"<iframe src='http://google.com\example.com/page'></iframe>")

        self.assertGreaterEqual(len(w), 1)
        self.assertIs(w[-1].category, AmbiguousURLWarning)
        self.assertTrue(issubclass(w[-1].category, LXMLHTMLCleanWarning))
        self.assertIn("impossible to parse the hostname", str(w[-1].message))
        self.assertNotIn("google.com", result)
        self.assertNotIn("example.com", result)

    def test_possibly_invalid_url_without_whitelist(self):
        cleaner = Cleaner()
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            result = cleaner.clean_html(r"<iframe src='http://google.com\example.com/page'></iframe>")

        self.assertEqual(len(w), 0)
        self.assertNotIn("google.com", result)
        self.assertNotIn("example.com", result)
lxml_html_clean-0.4.2/tests/test_clean.txt

>>> import re
>>> from lxml.html import fromstring, tostring
>>> from lxml_html_clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest

>>> doc = '''<html>
... <head>
...   <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
...   <script type="text/javascript" src="evil-site"></script>
...   <link rel="alternate" type="text/rss" src="evil-rss">
...   <link rel="stylesheet" type="text/css" href="evil-css">
...   <style>
...     body {background-image: url(javascript:do_evil)};
...     div {color: expression(evil)};
...   </style>
... </head>
... <body onload="evil_function()">
...   <!-- I am interpreted for EVIL! -->
...   <a href="javascript:evil_function()">a link</a>
...   <a href="java\x01script:evil_function()">a control char link</a>
...   <a href="data:text/html;base64,dGVzdA==">data</a>
...   <a href="#" onclick="evil_function()">another link</a>
...   <p onclick="evil_function()">a paragraph</p>
...   <div style="display: none">secret EVIL!</div>
...   <object> of EVIL! </object>
...   <iframe src="evil-site"></iframe>
...   <form action="evil-site">
...     Password: <input type="password" name="password">
...   </form>
...   <a href="evil-site">spam spam SPAM!</a>
...   <noscript><style>Author</style></noscript>
...   <span>Text</span>
...   <image src="evil!">
... </body>
... </html>'''

>>> print(re.sub('[\x00-\x07\x0E]', '', doc))
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  <script type="text/javascript" src="evil-site"></script>
  <link rel="alternate" type="text/rss" src="evil-rss">
  <link rel="stylesheet" type="text/css" href="evil-css">
  <style>
    body {background-image: url(javascript:do_evil)};
    div {color: expression(evil)};
  </style>
</head>
<body onload="evil_function()">
  <!-- I am interpreted for EVIL! -->
  <a href="javascript:evil_function()">a link</a>
  <a href="javascript:evil_function()">a control char link</a>
  <a href="data:text/html;base64,dGVzdA==">data</a>
  <a href="#" onclick="evil_function()">another link</a>
  <p onclick="evil_function()">a paragraph</p>
  <div style="display: none">secret EVIL!</div>
  <object> of EVIL! </object>
  <iframe src="evil-site"></iframe>
  <form action="evil-site">
    Password: <input type="password" name="password">
  </form>
  <a href="evil-site">spam spam SPAM!</a>
  <noscript><style>Author</style></noscript>
  <span>Text</span>
  <image src="evil!">
</body>
</html>

>>> print(tostring(fromstring(doc)).decode("utf-8"))
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  <script type="text/javascript" src="evil-site"></script>
  <link rel="alternate" type="text/rss" src="evil-rss">
  <link rel="stylesheet" type="text/css" href="evil-css">
  <style>
    body {background-image: url(javascript:do_evil)};
    div {color: expression(evil)};
  </style>
</head>
<body onload="evil_function()">
  <!-- I am interpreted for EVIL! -->
  <a href="javascript:evil_function()">a link</a>
  <a href="java&#1;script:evil_function()">a control char link</a>
  <a href="data:text/html;base64,dGVzdA==">data</a>
  <a href="#" onclick="evil_function()">another link</a>
  <p onclick="evil_function()">a paragraph</p>
  <div style="display: none">secret EVIL!</div>
  <object> of EVIL! </object>
  <iframe src="evil-site"></iframe>
  <form action="evil-site">
    Password: <input type="password" name="password">
  </form>
  <a href="evil-site">spam spam SPAM!</a>
  <noscript><style>Author</style></noscript>
  <span>Text</span>
  <img src="evil!">
</body>
</html>

>>> print(Cleaner(page_structure=False, comments=False).clean_html(doc))
<html>
<head>
  <style>/* deleted */</style>
</head>
<body>
  <!-- I am interpreted for EVIL! -->
  <a href="">a link</a>
  <a href="">a control char link</a>
  <a href="">data</a>
  <a href="#">another link</a>
  <p>a paragraph</p>
  <div>secret EVIL!</div>
  of EVIL!
  Password:
  <a href="evil-site">spam spam SPAM!</a>
  <noscript><style>/* deleted */</style></noscript>
  <span>Text</span>
  <img src="evil!">
</body>
</html>

>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
  <style>/* deleted */</style>
</head>
<body>
  <a href="">a link</a>
  <a href="">a control char link</a>
  <a href="">data</a>
  <a href="#">another link</a>
  <p>a paragraph</p>
  <div style="display: none">secret EVIL!</div>
  of EVIL!
  Password:
  <a href="evil-site">spam spam SPAM!</a>
  <noscript><style>/* deleted */</style></noscript>
  <span>Text</span>
  <img src="evil!">
</body>
</html>

>>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
</head>
<body>
  <a href="">a link</a>
  <a href="">a control char link</a>
  <a href="">data</a>
  <a href="#">another link</a>
  <p>a paragraph</p>
  <div>secret EVIL!</div>
  of EVIL!
  Password:
  <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
  <noscript>Author</noscript>
  <span>Text</span>
  <img src="evil!">
</body>
</html>

>>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
</head>
<body>
  <a href="">a link</a>
  <a href="">a control char link</a>
  <a href="">data</a>
  <a href="#">another link</a>
  <p>a paragraph</p>
  <div style="display: none">secret EVIL!</div>
  of EVIL!
  Password:
  <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
  <noscript>Author</noscript>
  <span>Text</span>
  <img src="evil!">
</body>
</html>

>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
<html>
<head>
  <link rel="alternate" type="text/rss" src="evil-rss">
  <link rel="stylesheet" type="text/css" href="evil-css">
  <style>/* deleted */</style>
</head>
<body>
  <a href="">a link</a>
  <a href="">a control char link</a>
  <a href="">data</a>
  <a href="#">another link</a>
  <p>a paragraph</p>
  <div>secret EVIL!</div>
  of EVIL!
  Password:
  <a href="evil-site">spam spam SPAM!</a>
  <noscript><style>/* deleted */</style></noscript>
  <span>Text</span>
  <img src="evil!">
</body>
</html>
lxml_html_clean-0.4.2/tests/test_clean_embed.txt

THIS FAILS IN libxml2 2.6.29 AND 2.6.30 !!

>>> from lxml.html import fromstring, tostring
>>> from lxml_html_clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest

>>> def tostring(el): # work-around for Py3 'bytes' type
...     from lxml.html import tostring
...     s = tostring(el)
...     if not isinstance(s, str):
...         s = s.decode('UTF-8')
...     return s

>>> doc_embed = '''<div>
... <object width="425" height="344">
... <param name="movie" value="http://www.youtube.com/v/VIDEO_ID&hl=en&fs=1">
... <param name="allowFullScreen" value="true">
... <embed src="http://www.youtube.com/v/VIDEO_ID&hl=en&fs=1" type="application/x-shockwave-flash" allowfullscreen="true" width="425" height="344"></embed>
... </object>
... </div>'''

>>> print(tostring(fromstring(doc_embed)))
<div>
<object width="425" height="344">
<param name="movie" value="http://www.youtube.com/v/VIDEO_ID&amp;hl=en&amp;fs=1">
<param name="allowFullScreen" value="true">
<embed src="http://www.youtube.com/v/VIDEO_ID&amp;hl=en&amp;fs=1" type="application/x-shockwave-flash" allowfullscreen="true" width="425" height="344">
</object>
</div>

>>> print(Cleaner().clean_html(doc_embed))
<div>
</div>

>>> print(Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed))
<div>
<embed src="http://www.youtube.com/v/VIDEO_ID&amp;hl=en&amp;fs=1" type="application/x-shockwave-flash" allowfullscreen="true" width="425" height="344">
</div>

>>> print(Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed))
<div>
<object width="425" height="344">
<param name="movie" value="http://www.youtube.com/v/VIDEO_ID&amp;hl=en&amp;fs=1">
<param name="allowFullScreen" value="true">
<embed src="http://www.youtube.com/v/VIDEO_ID&amp;hl=en&amp;fs=1" type="application/x-shockwave-flash" allowfullscreen="true" width="425" height="344">
</object>
</div>
lxml_html_clean-0.4.2/tests/utils.py

import unittest


def peak_memory_usage(func, *args, **kwargs):
    """
    Monitor the memory usage of a function and return the peak memory used,
    in MiB.
    """
    try:
        from memory_profiler import memory_usage  # type: ignore
    except ImportError:
        raise unittest.SkipTest("memory-profiler is not available")

    try:
        mem_usage = memory_usage((func, args, kwargs), interval=0.1, timeout=None)
    except MemoryError:
        return float("inf")

    peak_memory = max(mem_usage) - min(mem_usage)
    return peak_memory

lxml_html_clean-0.4.2/tox.ini

[tox]
envlist = py36,py38,py39,py310,py311,py312,py313,mypy
skipsdist = True

[testenv]
commands =
    python -m unittest -v tests.test_clean
    python -m doctest tests/test_clean_embed.txt tests/test_clean.txt tests/test_autolink.txt
deps =
    lxml
    memory_profiler

[testenv:mypy]
commands =
    mypy {posargs:}
    mypy {posargs:} tests/test_clean.py
deps =
    mypy
    types-lxml >= 2023.3.28
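A hypothetical use of the ``peak_memory_usage`` helper from tests/utils.py
above: measuring roughly how much extra memory cleaning a large document
needs (the document size here is illustrative)::

    from lxml_html_clean import clean_html
    from tests.utils import peak_memory_usage

    big_doc = '<div>' + '<p>spam</p>' * 100000 + '</div>'
    mib = peak_memory_usage(clean_html, big_doc)
    print('peak memory while cleaning: about %.1f MiB' % mib)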