diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6646957..a8a5a21 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: 8.4 ] composer: [basic] - timeout-minutes: 10 + timeout-minutes: 30 steps: - name: Checkout code uses: actions/checkout@v4 @@ -55,7 +55,8 @@ jobs: - name: Install dependencies run: | if [[ "${{ matrix.php }}" == "8.3" ]]; then - composer require phpstan/phpstan --no-update + composer config allow-plugins.infection/extension-installer true + composer require phpstan/phpstan:^2.1 infection/infection:^0.32.7 --dev --no-update --no-interaction fi; if [[ "${{ matrix.composer }}" == "lowest" ]]; then @@ -78,6 +79,12 @@ jobs: run: | php vendor/bin/phpstan analyse -c phpstan.neon --no-progress + - name: Run infection with phpstan integration + if: ${{ matrix.php == '8.3' }} + run: | + mkdir -p build/logs + XDEBUG_MODE=coverage php vendor/bin/infection run --configuration=infection.json5 + - name: Upload coverage results to Coveralls continue-on-error: true env: diff --git a/infection.json5 b/infection.json5 new file mode 100644 index 0000000..91e7d2b --- /dev/null +++ b/infection.json5 @@ -0,0 +1,21 @@ +{ + "$schema": "vendor/infection/infection/resources/schema.json", + "source": { + "directories": [ + "src" + ] + }, + "phpUnit": { + "configDir": "." + }, + "phpStan": { + "configDir": "." + }, + "staticAnalysisTool": "phpstan", + "threads": "max", + "minMsi": 0, + "minCoveredMsi": 0, + "logs": { + "text": "build/logs/infection.log" + } +} diff --git a/src/voku/helper/HtmlDomParser.php b/src/voku/helper/HtmlDomParser.php index e89ddf4..a2b4f04 100644 --- a/src/voku/helper/HtmlDomParser.php +++ b/src/voku/helper/HtmlDomParser.php @@ -1174,52 +1174,34 @@ private function markSyntheticParagraphWrapper(): void * formatting newlines into the wrapper's children when saving the full * document). * - * On PHP < 8.0, older libxml injects a trailing "\n" after raw-text - * elements (script, style) when they are the root of a fresh document. - * For those elements we fall back to serializing from the original - * document and strip only the single trailing "\n". For all other - * element types the fresh-document approach is used to avoid libxml - * injecting formatting newlines inside block-level content. Text and - * other non-element nodes are always serialized from the owner document - * without any trailing-newline stripping (they carry no injected newline). + * On PHP < 8.0, DOMElement instances are serialized through + * serializeElementNodeForPhpLt8() so older libxml cannot inject formatting + * newlines when saveHTML($node) is used on detached block-level elements. + * Text and other non-element nodes still use the fresh-document approach + * directly because they do not need the extra wrapper stripping. * * @param \DOMNode $node */ private function serializeNode(\DOMNode $node): string { - // For script/style on PHP < 8.0 use ownerDocument to avoid fresh-doc - // libxml injecting "\n" inside raw-text content. - $useOwnerDoc = \PHP_VERSION_ID < 80000 - && $node instanceof \DOMElement - && \in_array(\strtolower($node->tagName), ['script', 'style'], true); - - if (!$useOwnerDoc) { - $document = new \DOMDocument('1.0', $this->getEncoding()); - $document->preserveWhiteSpace = true; - $document->formatOutput = false; - - $importedNode = $document->importNode($node, true); - // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here) - if (!$importedNode instanceof \DOMNode) { - return ''; - } + if (\PHP_VERSION_ID < 80000 && $node instanceof \DOMElement) { + return $this->serializeElementNodeForPhpLt8($node); + } - $document->appendChild($importedNode); + $document = new \DOMDocument('1.0', $this->getEncoding()); + $document->preserveWhiteSpace = true; + $document->formatOutput = false; - $content = $document->saveHTML($importedNode); - } else { - // PHP < 8.0 script/style: serialize from original document and - // strip only the trailing "\n" that older libxml appends after - // raw-text elements. - $ownerDoc = $node->ownerDocument; - $content = $ownerDoc !== null ? $ownerDoc->saveHTML($node) : false; - // Older libxml appends exactly one synthetic trailing "\n" here; - // preserve any real user-provided trailing newlines in the content. - if ($content !== false && \substr($content, -1) === "\n") { - $content = \substr($content, 0, -1); - } + $importedNode = $document->importNode($node, true); + // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here) + if (!$importedNode instanceof \DOMNode) { + return ''; } + $document->appendChild($importedNode); + + $content = $document->saveHTML($importedNode); + if ($content === false) { return ''; } @@ -1227,6 +1209,42 @@ private function serializeNode(\DOMNode $node): string return $content; } + /** + * On PHP < 8.0, saveHTML($node) injects formatting newlines for detached + * block-level elements, so serialize a temporary whole document instead. + * + * @param \DOMElement $node + * + * @return string + */ + private function serializeElementNodeForPhpLt8(\DOMElement $node): string + { + $document = new \DOMDocument('1.0', $this->getEncoding()); + $document->preserveWhiteSpace = true; + $document->formatOutput = false; + + $importedNode = $document->importNode($node, true); + // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here) + if (!$importedNode instanceof \DOMElement) { + return ''; + } + + $document->appendChild($importedNode); + + $content = $document->saveHTML(); + if ($content === false) { + return ''; + } + + $content = $this->stripLibxmlDocumentWrappers($content, \strtolower($importedNode->tagName)); + + if (\substr($content, -1) === "\n") { + $content = \substr($content, 0, -1); + } + + return $content; + } + /** * Serialize the single element that was imported via the node-backed * constructor, for PHP < 8.0. @@ -1251,33 +1269,45 @@ private function serializeCreatedFromNodeForPhpLt8(): string return ''; } - // Strip the DOCTYPE declaration that libxml always prepends. - $full = (string) \preg_replace('/]+>/i', '', $full); - $full = \trim($full); - $documentElement = $this->document->documentElement; $tagName = $documentElement instanceof \DOMElement ? \strtolower($documentElement->tagName) : ''; - // Strip the ... wrapper added by libxml when the root - // element is not the HTML element itself. + $full = $this->stripLibxmlDocumentWrappers($full, $tagName, true); + + return $full; + } + + /** + * Strip the synthetic wrappers libxml adds when serializing a whole + * document around a non-root HTML element on PHP < 8. + */ + private function stripLibxmlDocumentWrappers(string $content, string $tagName, bool $trim = false): string + { + $content = (string) \preg_replace('/^]+>\s*/i', '', $content); + if ($trim) { + $content = \trim($content); + } + if ($tagName !== 'html') { - $full = (string) \preg_replace('/^]*>/i', '', $full); - $full = (string) \preg_replace('/<\/html>$/i', '', $full); - $full = \trim($full); + $content = (string) \preg_replace('/^]*>/i', '', $content); + $content = (string) \preg_replace('/<\/html>\s*$/i', '', $content); + if ($trim) { + $content = \trim($content); + } - // Strip the
... wrapper added for non-body elements. if ($tagName !== 'body') { - $full = (string) \preg_replace('/^]*>/i', '', $full); - $full = (string) \preg_replace('/<\/body>$/i', '', $full); - // Remove a trailing empty libxml may add for roots. - $full = \str_replace('', '', $full); - $full = \trim($full); + $content = (string) \preg_replace('/^]*>/i', '', $content); + $content = (string) \preg_replace('/<\/body>\s*$/i', '', $content); + $content = \str_replace('', '', $content); + if ($trim) { + $content = \trim($content); + } } } - return $full; + return $content; } /** diff --git a/tests/HtmlSerializationRegressionTest.php b/tests/HtmlSerializationRegressionTest.php index 6d16843..e8b0627 100644 --- a/tests/HtmlSerializationRegressionTest.php +++ b/tests/HtmlSerializationRegressionTest.php @@ -7,6 +7,115 @@ */ final class HtmlSerializationRegressionTest extends \PHPUnit\Framework\TestCase { + /** + * @return arrayone
two', + '
one
two
', + ], + 'chained paragraph roots' => [ + 'one
two
three
', + 'one
two
three
', + ], + ]; + } + + /** + * @return arrayalpha
', + 'p', + 0, + 'alpha
', + 'alpha', + ], + 'only div tag root' => [ + 'one
two', + 'p', + 0, + '
one
', + 'one', + ], + 'html5 implicit paragraph closing second p' => [ + 'one
two', + 'p', + 1, + '
two
', + 'two', + ], + 'chained paragraph middle root' => [ + 'one
two
three
', + 'p', + 1, + 'two
', + 'two', + ], + ]; + } + + /** + * @dataProvider provideEdgeCaseDocumentHtml + */ + public function testDocumentHtmlRoundTripsSerializationEdgeCases(string $html, string $expectedHtml) + { + static::assertSame($expectedHtml, HtmlDomParser::str_get_html($html)->html()); + } + + /** + * @dataProvider provideNodeBackedEdgeCases + */ + public function testNodeBackedHtmlHandlesSerializationEdgeCases( + string $html, + string $selector, + int $index, + string $expectedHtml, + string $expectedInnerHtml + ) { + $document = HtmlDomParser::str_get_html($html); + $element = $document->find($selector, $index); + $parser = new HtmlDomParser($element->getNode()); + + static::assertSame($expectedHtml, $parser->html()); + static::assertSame($expectedInnerHtml, $parser->innerHtml()); + } + public function testHtmlDomParserConstructedFromExistingNodePreservesNestedMarkupWithoutInjectedNewlines() { $html = 'two
Body
two
', + $spanHtml . $brHtml . $pHtml + ); + static::assertSame( + 'Body