Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
8.4
]
composer: [basic]
timeout-minutes: 10
timeout-minutes: 30
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -55,7 +55,8 @@ jobs:
- name: Install dependencies
run: |
if [[ "${{ matrix.php }}" == "8.3" ]]; then
composer require phpstan/phpstan --no-update
composer config allow-plugins.infection/extension-installer true
composer require phpstan/phpstan:^2.1 infection/infection:^0.32.7 --dev --no-update --no-interaction
fi;

if [[ "${{ matrix.composer }}" == "lowest" ]]; then
Expand All @@ -78,6 +79,12 @@ jobs:
run: |
php vendor/bin/phpstan analyse -c phpstan.neon --no-progress

- name: Run infection with phpstan integration
if: ${{ matrix.php == '8.3' }}
run: |
mkdir -p build/logs
XDEBUG_MODE=coverage php vendor/bin/infection run --configuration=infection.json5

- name: Upload coverage results to Coveralls
continue-on-error: true
env:
Expand Down
21 changes: 21 additions & 0 deletions infection.json5
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Infection (mutation testing) configuration. The CI workflow passes this
// file explicitly via `--configuration=infection.json5`.
{
"$schema": "vendor/infection/infection/resources/schema.json",
// Only code under src/ is listed as a mutation source.
"source": {
"directories": [
"src"
]
},
// Directory holding the PHPUnit configuration (the repository root here).
"phpUnit": {
"configDir": "."
},
// Directory holding the PHPStan configuration; the CI step runs PHPStan with
// phpstan.neon from the repository root as well.
"phpStan": {
"configDir": "."
},
"staticAnalysisTool": "phpstan",
// NOTE(review): "max" presumably lets Infection pick the thread count from
// the available cores - confirm against the Infection documentation.
"threads": "max",
// NOTE(review): with both thresholds at 0 the run presumably never fails on
// the mutation score; raise them once a baseline is known.
"minMsi": 0,
"minCoveredMsi": 0,
// The CI step creates build/logs before invoking Infection.
"logs": {
"text": "build/logs/infection.log"
}
}
136 changes: 83 additions & 53 deletions src/voku/helper/HtmlDomParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -1174,59 +1174,77 @@
* formatting newlines into the wrapper's children when saving the full
* document).
*
* On PHP < 8.0, older libxml injects a trailing "\n" after raw-text
* elements (script, style) when they are the root of a fresh document.
* For those elements we fall back to serializing from the original
* document and strip only the single trailing "\n". For all other
* element types the fresh-document approach is used to avoid libxml
* injecting formatting newlines inside block-level content. Text and
* other non-element nodes are always serialized from the owner document
* without any trailing-newline stripping (they carry no injected newline).
* On PHP < 8.0, DOMElement instances are serialized through
* serializeElementNodeForPhpLt8() so older libxml cannot inject formatting
* newlines when saveHTML($node) is used on detached block-level elements.
* Text and other non-element nodes still use the fresh-document approach
* directly because they do not need the extra wrapper stripping.
*
* @param \DOMNode $node
*/
private function serializeNode(\DOMNode $node): string
{
    // On PHP < 8.0 every element goes through the whole-document helper so
    // that saveHTML($node) cannot inject formatting newlines.
    if (\PHP_VERSION_ID < 80000 && $node instanceof \DOMElement) {
        return $this->serializeElementNodeForPhpLt8($node);
    }

    // Everything else is copied into a fresh, unformatted document and
    // serialized from there.
    $document = new \DOMDocument('1.0', $this->getEncoding());
    $document->preserveWhiteSpace = true;
    $document->formatOutput = false;

    $importedNode = $document->importNode($node, true);
    // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
    if (!$importedNode instanceof \DOMNode) {
        return '';
    }

    $document->appendChild($importedNode);

    $content = $document->saveHTML($importedNode);

    // saveHTML() signals failure with false; callers always get a string.
    return $content === false ? '' : $content;
}

/**
 * Serialize a single element on PHP < 8.0 without libxml's formatting
 * newlines.
 *
 * saveHTML($node) injects newlines for detached block-level elements on
 * those versions, so the element is copied into a temporary document, the
 * whole document is dumped, and the synthetic wrappers plus one trailing
 * "\n" are removed afterwards.
 *
 * @param \DOMElement $node element to serialize
 *
 * @return string the element markup, or '' when import or serialization fails
 */
private function serializeElementNodeForPhpLt8(\DOMElement $node): string
{
    $temporaryDocument = new \DOMDocument('1.0', $this->getEncoding());
    $temporaryDocument->formatOutput = false;
    $temporaryDocument->preserveWhiteSpace = true;

    $clonedElement = $temporaryDocument->importNode($node, true);
    // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
    if (!($clonedElement instanceof \DOMElement)) {
        return '';
    }

    $temporaryDocument->appendChild($clonedElement);

    $markup = $temporaryDocument->saveHTML();
    if ($markup === false) {
        return '';
    }

    $rootTagName = \strtolower($clonedElement->tagName);
    $markup = $this->stripLibxmlDocumentWrappers($markup, $rootTagName);

    // Remove exactly one trailing "\n"; any further newlines are kept.
    return \substr($markup, -1) === "\n"
        ? \substr($markup, 0, -1)
        : $markup;
}
Comment on lines +1212 to +1246

/**
* Serialize the single element that was imported via the node-backed
* constructor, for PHP < 8.0.
Expand All @@ -1251,33 +1269,45 @@
return '';
}

// Strip the DOCTYPE declaration that libxml always prepends.
$full = (string) \preg_replace('/<!DOCTYPE[^>]+>/i', '', $full);
$full = \trim($full);

$documentElement = $this->document->documentElement;
$tagName = $documentElement instanceof \DOMElement
? \strtolower($documentElement->tagName)
: '';

// Strip the <html>...</html> wrapper added by libxml when the root
// element is not the HTML element itself.
$full = $this->stripLibxmlDocumentWrappers($full, $tagName, true);

return $full;
}

/**
 * Strip the synthetic wrappers libxml adds when serializing a whole
 * document around a non-root HTML element on PHP < 8.
 *
 * The DOCTYPE is always removed. The <html> wrapper is removed unless the
 * serialized root is <html> itself, and the <body> wrapper is removed unless
 * the root is <html> or <body>.
 *
 * @param string $content serialized document markup
 * @param string $tagName lower-cased tag name of the serialized root element
 * @param bool   $trim    when true, surrounding whitespace is trimmed after
 *                        every stripping step
 *
 * @return string the markup without the synthetic wrappers
 */
private function stripLibxmlDocumentWrappers(string $content, string $tagName, bool $trim = false): string
{
    $content = (string) \preg_replace('/^<!DOCTYPE[^>]+>\s*/i', '', $content);
    if ($trim) {
        $content = \trim($content);
    }

    if ($tagName === 'html') {
        return $content;
    }

    $content = (string) \preg_replace('/^<html[^>]*>/i', '', $content);
    $content = (string) \preg_replace('/<\/html>\s*$/i', '', $content);
    if ($trim) {
        $content = \trim($content);
    }

    if ($tagName === 'body') {
        return $content;
    }

    // Remove the empty <body> libxml may add for <head> roots *before*
    // unwrapping: once the trailing </body> has been stripped the pair no
    // longer matches and a dangling "<body>" would be left behind.
    $content = \str_replace('<body></body>', '', $content);
    $content = (string) \preg_replace('/^<body[^>]*>/i', '', $content);
    $content = (string) \preg_replace('/<\/body>\s*$/i', '', $content);
    if ($trim) {
        $content = \trim($content);
    }

    return $content;
}

/**
Expand Down
141 changes: 141 additions & 0 deletions tests/HtmlSerializationRegressionTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,115 @@
*/
final class HtmlSerializationRegressionTest extends \PHPUnit\Framework\TestCase
{
/**
 * Whole-document round-trip cases: each entry is [input HTML, expected html()].
 *
 * Declared static so the provider works on PHPUnit versions that no longer
 * accept non-static data providers as well as on older ones.
 *
 * @return array<string, array{string, string}>
 */
public static function provideEdgeCaseDocumentHtml(): array
{
    return [
        'custom non-html tag' => [
            '<custom-tag data-x="1"><span>A</span></custom-tag>',
            '<custom-tag data-x="1"><span>A</span></custom-tag>',
        ],
        'invalid html is normalized' => [
            '<div><span>alpha</div>',
            '<div><span>alpha</span></div>',
        ],
        'html5 implicit paragraph closing' => [
            '<p>one<p>two',
            '<p>one</p><p>two</p>',
        ],
        'chained paragraph roots' => [
            '<p>one</p><p>two</p><p>three</p>',
            '<p>one</p><p>two</p><p>three</p>',
        ],
    ];
}

/**
 * Node-backed parser cases. Each entry is
 * [input HTML, selector, match index, expected html(), expected innerHtml()].
 *
 * Declared static so the provider works on PHPUnit versions that no longer
 * accept non-static data providers as well as on older ones.
 *
 * @return array<string, array{string, string, int, string, string}>
 */
public static function provideNodeBackedEdgeCases(): array
{
    return [
        'custom non-html tag' => [
            '<custom-tag data-x="1"><span>A</span></custom-tag>',
            'custom-tag',
            0,
            '<custom-tag data-x="1"><span>A</span></custom-tag>',
            '<span>A</span>',
        ],
        'only p tag root' => [
            '<p>alpha</p>',
            'p',
            0,
            '<p>alpha</p>',
            'alpha',
        ],
        'only div tag root' => [
            '<div>alpha</div>',
            'div',
            0,
            '<div>alpha</div>',
            'alpha',
        ],
        'invalid html normalized div' => [
            '<div><span>alpha</div>',
            'div',
            0,
            '<div><span>alpha</span></div>',
            '<span>alpha</span>',
        ],
        'html5 implicit paragraph closing first p' => [
            '<p>one<p>two',
            'p',
            0,
            '<p>one</p>',
            'one',
        ],
        'html5 implicit paragraph closing second p' => [
            '<p>one<p>two',
            'p',
            1,
            '<p>two</p>',
            'two',
        ],
        'chained paragraph middle root' => [
            '<p>one</p><p>two</p><p>three</p>',
            'p',
            1,
            '<p>two</p>',
            'two',
        ],
    ];
}

/**
 * Parsing a document and calling html() must yield the expected markup for
 * every edge case supplied by the provider.
 *
 * @dataProvider provideEdgeCaseDocumentHtml
 */
public function testDocumentHtmlRoundTripsSerializationEdgeCases(string $html, string $expectedHtml)
{
    $parsedDocument = HtmlDomParser::str_get_html($html);
    $actualHtml = $parsedDocument->html();

    static::assertSame($expectedHtml, $actualHtml);
}

/**
 * A parser constructed from an existing node must serialize that node
 * (html()) and its children (innerHtml()) exactly as expected.
 *
 * @dataProvider provideNodeBackedEdgeCases
 */
public function testNodeBackedHtmlHandlesSerializationEdgeCases(
    string $html,
    string $selector,
    int $index,
    string $expectedHtml,
    string $expectedInnerHtml
) {
    // Keep the source document in a local so it stays alive while the
    // node-backed parser is in use.
    $sourceDocument = HtmlDomParser::str_get_html($html);
    $matchedNode = $sourceDocument->find($selector, $index)->getNode();

    $nodeBackedParser = new HtmlDomParser($matchedNode);

    static::assertSame($expectedHtml, $nodeBackedParser->html());
    static::assertSame($expectedInnerHtml, $nodeBackedParser->innerHtml());
}

public function testHtmlDomParserConstructedFromExistingNodePreservesNestedMarkupWithoutInjectedNewlines()
{
$html = '<div class="mydiv"><div class="mydiv-item">A1</div><div class="mydiv-item"><span>B1</span><span>B2</span></div></div>';
Expand Down Expand Up @@ -72,6 +181,38 @@
static::assertSame('before<span>middle</span><strong>after</strong>', $parser->innerHtml());
}

/**
 * serializeElementNodeForPhpLt8() must return each element without a
 * trailing newline, so concatenated fragments stay on one line.
 */
public function testSerializeElementNodeDoesNotAppendTrailingNewline()
{
    if (\PHP_VERSION_ID >= 80000) {
        static::markTestSkipped('serializeElementNodeForPhpLt8() is only used on PHP < 8.0.');
    }

    $document = HtmlDomParser::str_get_html(
        '<div><span>one</span><br><p>two</p><template id="card"><section><h2>Title</h2><p>Body</p></section></template></div>'
    );

    $serializeElementNodeForPhpLt8 = new \ReflectionMethod(HtmlDomParser::class, 'serializeElementNodeForPhpLt8');
    if (\PHP_VERSION_ID < 80100) {
        // This version check is only for Reflection behavior: private method
        // access still needs setAccessible() when PHP_VERSION_ID < 80100
        // (PHP 8.0 and earlier).
        $serializeElementNodeForPhpLt8->setAccessible(true);
    }

    $spanHtml = $serializeElementNodeForPhpLt8->invoke($document, $document->getElementByTagName('span')->getNode());
    $brHtml = $serializeElementNodeForPhpLt8->invoke($document, $document->getElementByTagName('br')->getNode());
    $pHtml = $serializeElementNodeForPhpLt8->invoke($document, $document->getElementByTagName('p')->getNode());

    static::assertSame(
        '<span>one</span><br><p>two</p>',
        $spanHtml . $brHtml . $pHtml
    );
    static::assertSame(
        '<template id="card"><section><h2>Title</h2><p>Body</p></section></template>',
        $serializeElementNodeForPhpLt8->invoke($document, $document->findOne('template')->getNode())
    );
}
Comment on lines +184 to +214

public function testNodeBackedTextNodeHtmlPreservesTextVerbatim()
{
$document = HtmlDomParser::str_get_html('<div>before<span>middle</span>after</div>');
Expand Down
Loading