Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
8.4
]
composer: [basic]
timeout-minutes: 10
timeout-minutes: 30
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -55,7 +55,8 @@ jobs:
- name: Install dependencies
run: |
if [[ "${{ matrix.php }}" == "8.3" ]]; then
composer require phpstan/phpstan --no-update
composer config allow-plugins.infection/extension-installer true
composer require phpstan/phpstan:^2.1 infection/infection:^0.32.7 --dev --no-update --no-interaction
fi;

if [[ "${{ matrix.composer }}" == "lowest" ]]; then
Expand All @@ -78,6 +79,12 @@ jobs:
run: |
php vendor/bin/phpstan analyse -c phpstan.neon --no-progress

- name: Run infection with phpstan integration
if: ${{ matrix.php == '8.3' }}
run: |
mkdir -p build/logs
XDEBUG_MODE=coverage php vendor/bin/infection run --configuration=infection.json5

- name: Upload coverage results to Coveralls
continue-on-error: true
env:
Expand Down
21 changes: 21 additions & 0 deletions infection.json5
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"$schema": "vendor/infection/infection/resources/schema.json",
"source": {
"directories": [
"src"
]
},
"phpUnit": {
"configDir": "."
},
"phpStan": {
"configDir": "."
},
"staticAnalysisTool": "phpstan",
"threads": "max",
"minMsi": 0,
"minCoveredMsi": 0,
"logs": {
"text": "build/logs/infection.log"
}
}
91 changes: 62 additions & 29 deletions src/voku/helper/HtmlDomParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -1185,48 +1185,81 @@
*
* @param \DOMNode $node
*/
private function serializeNode(\DOMNode $node): string

Check warning on line 1188 in src/voku/helper/HtmlDomParser.php

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

This method has 4 returns, which is more than the 3 allowed.

See more on https://sonarcloud.io/project/issues?id=voku_simple_html_dom&issues=AZ3oyWOHXiTxNghnB6iM&open=AZ3oyWOHXiTxNghnB6iM&pullRequest=145
{
// For script/style on PHP < 8.0 use ownerDocument to avoid fresh-doc
// libxml injecting "\n" inside raw-text content.
$useOwnerDoc = \PHP_VERSION_ID < 80000
&& $node instanceof \DOMElement
&& \in_array(\strtolower($node->tagName), ['script', 'style'], true);

if (!$useOwnerDoc) {
$document = new \DOMDocument('1.0', $this->getEncoding());
$document->preserveWhiteSpace = true;
$document->formatOutput = false;

$importedNode = $document->importNode($node, true);
// @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
if (!$importedNode instanceof \DOMNode) {
return '';
}
if (\PHP_VERSION_ID < 80000 && $node instanceof \DOMElement) {
return $this->serializeElementNodeForPhpLt8($node);
}
Comment on lines 1185 to +1189

$document->appendChild($importedNode);
$document = new \DOMDocument('1.0', $this->getEncoding());
$document->preserveWhiteSpace = true;
$document->formatOutput = false;

$content = $document->saveHTML($importedNode);
} else {
// PHP < 8.0 script/style: serialize from original document and
// strip only the trailing "\n" that older libxml appends after
// raw-text elements.
$ownerDoc = $node->ownerDocument;
$content = $ownerDoc !== null ? $ownerDoc->saveHTML($node) : false;
// Older libxml appends exactly one synthetic trailing "\n" here;
// preserve any real user-provided trailing newlines in the content.
if ($content !== false && \substr($content, -1) === "\n") {
$content = \substr($content, 0, -1);
}
$importedNode = $document->importNode($node, true);
// @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
if (!$importedNode instanceof \DOMNode) {
return '';
}

$document->appendChild($importedNode);

$content = $document->saveHTML($importedNode);

if ($content === false) {
return '';
}

return $content;
}

/**
* On PHP < 8.0, saveHTML($node) injects formatting newlines for detached
* block-level elements, so serialize a temporary whole document instead.
*
* @param \DOMElement $node
*
* @return string
*/
private function serializeElementNodeForPhpLt8(\DOMElement $node): string
{
$document = new \DOMDocument('1.0', $this->getEncoding());
$document->preserveWhiteSpace = true;
$document->formatOutput = false;

$importedNode = $document->importNode($node, true);
// @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
if (!$importedNode instanceof \DOMElement) {
return '';
}

$document->appendChild($importedNode);

$content = $document->saveHTML();
if ($content === false) {
return '';
}

$content = (string) \preg_replace('/^<!DOCTYPE[^>]+>\s*/i', '', $content);

$tagName = \strtolower($importedNode->tagName);
if ($tagName !== 'html') {
$content = (string) \preg_replace('/^<html[^>]*>/i', '', $content);
$content = (string) \preg_replace('/<\/html>\s*$/i', '', $content);

if ($tagName !== 'body') {
$content = (string) \preg_replace('/^<body[^>]*>/i', '', $content);
$content = (string) \preg_replace('/<\/body>\s*$/i', '', $content);
$content = \str_replace('<body></body>', '', $content);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The str_replace for the empty <body> tag should be performed before the regex-based stripping of <body> and </body> tags.

When serializing a <head> node on PHP < 8, libxml often appends an empty <body></body> at the end of the document. If preg_replace('/<\/body>\s*$/i', '', $content) runs first, it strips only the closing </body> tag and leaves a dangling <body> behind: for example, <head>…</head><body></body> becomes <head>…</head><body>. The anchored /^<body[^>]*>/i pattern cannot remove that leftover tag because the content starts with <head>, and str_replace('<body></body>', ...) no longer finds its literal, so the stray <body> stays in the output. Moving the literal replacement to the top of this block ensures the empty body fragment is removed before the individual tags are stripped.

                $content = \str_replace('<body></body>', '', $content);
                $content = (string) \preg_replace('/^<body[^>]*>/i', '', $content);
                $content = (string) \preg_replace('/<\/body>\s*$/i', '', $content);

}
}

if (\substr($content, -1) === "\n") {
$content = \substr($content, 0, -1);
}

return $content;
}
Comment on lines +1212 to +1246

/**
* Serialize the single element that was imported via the node-backed
* constructor, for PHP < 8.0.
Expand Down
28 changes: 28 additions & 0 deletions tests/HtmlSerializationRegressionTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,34 @@
static::assertSame('before<span>middle</span><strong>after</strong>', $parser->innerHtml());
}

public function testSerializeElementNodeDoesNotAppendTrailingNewline()
{
$document = HtmlDomParser::str_get_html(
'<div><span>one</span><br><p>two</p><template id="card"><section><h2>Title</h2><p>Body</p></section></template></div>'
);

$serializeElementNodeForPhpLt8 = new \ReflectionMethod(HtmlDomParser::class, 'serializeElementNodeForPhpLt8');
if (\PHP_VERSION_ID < 80100) {
// This version check is only for Reflection behavior: private method
// access still needs setAccessible() when PHP_VERSION_ID < 80100
// (PHP 8.0 and earlier).
$serializeElementNodeForPhpLt8->setAccessible(true);

Check warning on line 86 in tests/HtmlSerializationRegressionTest.php

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Make sure that this accessibility update is safe here.

See more on https://sonarcloud.io/project/issues?id=voku_simple_html_dom&issues=AZ3o4j-Iq4uQ7nkCH6aQ&open=AZ3o4j-Iq4uQ7nkCH6aQ&pullRequest=145
}

$spanHtml = $serializeElementNodeForPhpLt8->invoke($document, $document->getElementByTagName('span')->getNode());
$brHtml = $serializeElementNodeForPhpLt8->invoke($document, $document->getElementByTagName('br')->getNode());
$pHtml = $serializeElementNodeForPhpLt8->invoke($document, $document->getElementByTagName('p')->getNode());

static::assertSame(
'<span>one</span><br><p>two</p>',
$spanHtml . $brHtml . $pHtml
);
static::assertSame(
'<template id="card"><section><h2>Title</h2><p>Body</p></section></template>',
$serializeElementNodeForPhpLt8->invoke($document, $document->findOne('template')->getNode())
);
}
Comment on lines +184 to +214

public function testNodeBackedTextNodeHtmlPreservesTextVerbatim()
{
$document = HtmlDomParser::str_get_html('<div>before<span>middle</span>after</div>');
Expand Down
Loading