registerNodeClass('DOMElement', '\IvoPetkov\HTML5DOMElement');
}
/**
* Load HTML from a string.
*
* The source is trimmed and pre-processed as text, handed to the parent loader, and the resulting DOM is
* cleaned up afterwards. Libxml internal error handling is switched on for the duration of the call when it
* was off, and switched back off once parsing is done. On success the document encoding is set to utf-8 and
* the instance is flagged as loaded.
*
* NOTE(review): several string literals and regular expressions in this method look like they have lost
* their HTML markup (empty patterns, empty search strings, an unbalanced quote in the DOCTYPE check).
* Verify against the upstream file before changing any logic here.
*
* @param string $source The HTML code.
* @param int $options Additional Libxml parameters. The LIBXML_HTML_NOIMPLIED, LIBXML_HTML_NODEFDTD and self::ALLOW_DUPLICATE_IDS bits are inspected by this method.
* @return boolean TRUE on success or FALSE on failure.
* @throws \Exception When two elements share the same ID value and ALLOW_DUPLICATE_IDS is not passed.
*/
public function loadHTML($source, $options = 0)
{
// Enables libxml errors handling
// The current setting is remembered so that it can be switched back after parsing.
$internalErrorsOptionValue = libxml_use_internal_errors();
if ($internalErrorsOptionValue === false) {
libxml_use_internal_errors(true);
}
$source = trim($source);
// Add CDATA around script tags content
$matches = null;
preg_match_all('/', '-html5-dom-document-internal-cdata]]>', $source); // Add CDATA before the end tag
$source = str_replace('', '', $source); // Clean empty script tags
$matches = null;
preg_match_all('/\/s', $source, $matches);
if (isset($matches[0])) {
$matches[0] = array_unique($matches[0]);
foreach ($matches[0] as $match) {
if (strpos($match, '') !== false) { // check if contains
$source = str_replace($match, str_replace('', '<-html5-dom-document-internal-cdata-endtagfix/', $match), $source);
}
}
}
// Decode the option bits that control implied html/body tags, the default doctype and the duplicate ID check.
$autoAddHtmlAndBodyTags = !defined('LIBXML_HTML_NOIMPLIED') || ($options & LIBXML_HTML_NOIMPLIED) === 0;
$autoAddDoctype = !defined('LIBXML_HTML_NODEFDTD') || ($options & LIBXML_HTML_NODEFDTD) === 0;
$allowDuplicateIDs = ($options & self::ALLOW_DUPLICATE_IDS) !== 0;
// Add body tag if missing
if ($autoAddHtmlAndBodyTags && $source !== '' && preg_match('/\/', $source) === 0 && preg_match('/\/', $source) === 0 && preg_match('/\/', $source) === 0 && preg_match('/\/', $source) === 0) {
$source = '' . $source . '';
}
// Add DOCTYPE if missing
if ($autoAddDoctype && strtoupper(substr($source, 0, 9)) !== '\n" . $source;
}
// Adds temporary head tag
$charsetTag = '';
$matches = [];
preg_match('/\/', $source, $matches);
$removeHeadTag = false;
$removeHtmlTag = false;
if (isset($matches[0])) { // has head tag
$insertPosition = strpos($source, $matches[0]) + strlen($matches[0]);
$source = substr($source, 0, $insertPosition) . $charsetTag . substr($source, $insertPosition);
} else {
$matches = [];
preg_match('/\/', $source, $matches);
if (isset($matches[0])) { // has html tag
$source = str_replace($matches[0], $matches[0] . '' . $charsetTag . '', $source);
} else {
$source = '' . $charsetTag . '' . $source;
$removeHtmlTag = true;
}
$removeHeadTag = true;
}
// Preserve html entities
$source = preg_replace('/&([a-zA-Z]*);/', 'html5-dom-document-internal-entity1-$1-end', $source);
$source = preg_replace('/([0-9]*);/', 'html5-dom-document-internal-entity2-$1-end', $source);
$result = parent::loadHTML('' . $source, $options);
if ($internalErrorsOptionValue === false) {
libxml_use_internal_errors(false);
}
if ($result === false) {
return false;
}
$this->encoding = 'utf-8';
foreach ($this->childNodes as $item) {
if ($item->nodeType === XML_PI_NODE) {
$this->removeChild($item);
break;
}
}
/** @var HTML5DOMElement|null */
$metaTagElement = $this->getElementsByTagName('meta')->item(0);
if ($metaTagElement !== null) {
if ($metaTagElement->getAttribute('data-html5-dom-document-internal-attribute') === 'charset-meta') {
$headElement = $metaTagElement->parentNode;
$htmlElement = $headElement->parentNode;
$metaTagElement->parentNode->removeChild($metaTagElement);
if ($removeHeadTag && $headElement !== null && $headElement->parentNode !== null && ($headElement->firstChild === null || ($headElement->childNodes->length === 1 && $headElement->firstChild instanceof \DOMText))) {
$headElement->parentNode->removeChild($headElement);
}
if ($removeHtmlTag && $htmlElement !== null && $htmlElement->parentNode !== null && $htmlElement->firstChild === null) {
$htmlElement->parentNode->removeChild($htmlElement);
}
}
}
if (!$allowDuplicateIDs) {
$matches = [];
preg_match_all('/\sid[\s]*=[\s]*(["\'])(.*?)\1/', $source, $matches);
if (!empty($matches[2]) && max(array_count_values($matches[2])) > 1) {
$elementIDs = [];
$walkChildren = function ($element) use (&$walkChildren, &$elementIDs) {
foreach ($element->childNodes as $child) {
if ($child instanceof \DOMElement) {
if ($child->attributes->length > 0) { // Performance optimization
$id = $child->getAttribute('id');
if ($id !== '') {
if (isset($elementIDs[$id])) {
throw new \Exception('A DOM node with an ID value "' . $id . '" already exists! Pass the HTML5DOMDocument::ALLOW_DUPLICATE_IDS option to disable this check.');
} else {
$elementIDs[$id] = true;
}
}
}
$walkChildren($child);
}
}
};
$walkChildren($this);
}
}
$this->loaded = true;
return true;
}
/**
 * Load HTML from a file.
 *
 * The file is read completely into memory and then passed to loadHTML().
 *
 * @param string $filename The path to the HTML file.
 * @param int $options Additional Libxml parameters.
 * @return boolean TRUE on success or FALSE on failure (including when the file cannot be read).
 */
public function loadHTMLFile($filename, $options = 0)
{
    $source = file_get_contents($filename);
    if ($source === false) {
        // Reading failed. Without this guard the boolean would be trimmed to an empty
        // string by loadHTML() and an empty document would be reported as a success.
        return false;
    }
    return $this->loadHTML($source, $options);
}
/**
 * Makes sure the document has an html element.
 *
 * A prototype element is kept in the shared objects cache; a clone of it is appended to the document.
 *
 * @return boolean TRUE when an html element was appended, FALSE when the document already had one.
 */
private function addHtmlElementIfMissing(): bool
{
    // Nothing to do when an html element is already present.
    if ($this->getElementsByTagName('html')->length !== 0) {
        return false;
    }
    $cacheKey = 'htmlelement';
    if (!isset(self::$newObjectsCache[$cacheKey])) {
        self::$newObjectsCache[$cacheKey] = new \DOMElement('html');
    }
    $newHtmlElement = clone (self::$newObjectsCache[$cacheKey]);
    $this->appendChild($newHtmlElement);
    return true;
}
/**
 * Makes sure the document has a head element.
 *
 * The new head becomes the first child of the first html element. The html element is
 * looked up but not created here.
 *
 * @return boolean TRUE when a head element was inserted, FALSE when the document already had one.
 */
private function addHeadElementIfMissing(): bool
{
    // Nothing to do when a head element is already present.
    if ($this->getElementsByTagName('head')->length !== 0) {
        return false;
    }
    $rootElement = $this->getElementsByTagName('html')->item(0);
    $cacheKey = 'headelement';
    if (!isset(self::$newObjectsCache[$cacheKey])) {
        self::$newObjectsCache[$cacheKey] = new \DOMElement('head');
    }
    $newHeadElement = clone (self::$newObjectsCache[$cacheKey]);
    $firstExistingChild = $rootElement->firstChild;
    if ($firstExistingChild !== null) {
        // Place the head in front of whatever the html element already contains.
        $rootElement->insertBefore($newHeadElement, $firstExistingChild);
    } else {
        $rootElement->appendChild($newHeadElement);
    }
    return true;
}
/**
 * Makes sure the document has a body element.
 *
 * The new body is appended as the last child of the first html element. The html element is
 * looked up but not created here.
 *
 * @return boolean TRUE when a body element was appended, FALSE when the document already had one.
 */
private function addBodyElementIfMissing(): bool
{
    // Nothing to do when a body element is already present.
    if ($this->getElementsByTagName('body')->length !== 0) {
        return false;
    }
    $cacheKey = 'bodyelement';
    if (!isset(self::$newObjectsCache[$cacheKey])) {
        self::$newObjectsCache[$cacheKey] = new \DOMElement('body');
    }
    $rootElement = $this->getElementsByTagName('html')->item(0);
    $rootElement->appendChild(clone (self::$newObjectsCache[$cacheKey]));
    return true;
}
/**
* Dumps the internal document into a string using HTML formatting.
*
* With a node argument (other than a DOMDocument) the node is cloned into a temporary HTML5DOMDocument,
* that document is serialized, and the wrapper markup is cut away with fixed substr offsets.
* Without a node the whole document is serialized through the parent implementation after a temporary
* marker meta tag has been placed first in the head; internal placeholder strings are then replaced or removed.
*
* NOTE(review): several string literals below (the loadHTML arguments and most entries of $codeToRemove)
* look like they have lost their HTML markup. The substr offsets depend on those literals, so verify
* against the upstream file before changing them.
*
* @param \DOMNode $node Optional parameter to output a subset of the document.
* @return string The document (or node) HTML code as string.
*/
public function saveHTML(\DOMNode $node = null): string
{
// A DOMDocument passed as the node is handled like a call without a node.
$nodeMode = $node !== null;
if ($nodeMode && $node instanceof \DOMDocument) {
$nodeMode = false;
}
if ($nodeMode) {
// Serialize the single node through a clone of a cached empty document.
if (!isset(self::$newObjectsCache['html5domdocument'])) {
self::$newObjectsCache['html5domdocument'] = new HTML5DOMDocument();
}
$tempDomDocument = clone (self::$newObjectsCache['html5domdocument']);
if ($node->nodeName === 'html') {
$tempDomDocument->loadHTML('');
$tempDomDocument->appendChild($tempDomDocument->importNode(clone ($node), true));
$html = $tempDomDocument->saveHTML();
$html = substr($html, 16); // remove the DOCTYPE + the new line after
} elseif ($node->nodeName === 'head' || $node->nodeName === 'body') {
$tempDomDocument->loadHTML("\n");
$tempDomDocument->childNodes[1]->appendChild($tempDomDocument->importNode(clone ($node), true));
$html = $tempDomDocument->saveHTML();
$html = substr($html, 22, -7); // remove the DOCTYPE + the new line after + html tag
} else {
// Walk up the ancestors (at most 1000 levels) until a body or a head is reached.
$isInHead = false;
$parentNode = $node;
for ($i = 0; $i < 1000; $i++) {
$parentNode = $parentNode->parentNode;
if ($parentNode === null) {
break;
}
if ($parentNode->nodeName === 'body') {
break;
} elseif ($parentNode->nodeName === 'head') {
$isInHead = true;
break;
}
}
$tempDomDocument->loadHTML("\n" . ($isInHead ? '' : '') . '');
$tempDomDocument->childNodes[1]->childNodes[0]->appendChild($tempDomDocument->importNode(clone ($node), true));
$html = $tempDomDocument->saveHTML();
$html = substr($html, 28, -14); // remove the DOCTYPE + the new line + html + body or head tags
}
$html = trim($html);
} else {
// Whole document mode: a head element is needed to hold the temporary marker meta tag.
$removeHtmlElement = false;
$removeHeadElement = false;
$headElement = $this->getElementsByTagName('head')->item(0);
if ($headElement === null) {
if ($this->addHtmlElementIfMissing()) {
$removeHtmlElement = true;
}
if ($this->addHeadElementIfMissing()) {
$removeHeadElement = true;
}
$headElement = $this->getElementsByTagName('head')->item(0);
}
// Temporary marker meta tag declaring a utf-8 content type; it is taken out of the DOM again below.
$meta = $this->createElement('meta');
$meta->setAttribute('data-html5-dom-document-internal-attribute', 'charset-meta');
$meta->setAttribute('http-equiv', 'content-type');
$meta->setAttribute('content', 'text/html; charset=utf-8');
if ($headElement->firstChild !== null) {
$headElement->insertBefore($meta, $headElement->firstChild);
} else {
$headElement->appendChild($meta);
}
$html = parent::saveHTML();
$html = rtrim($html, "\n");
// Undo the temporary DOM changes: drop the head added above (with the meta inside), otherwise only the meta.
// NOTE(review): an html element added above is not removed from the DOM here; only the output string is cleaned.
if ($removeHeadElement) {
$headElement->parentNode->removeChild($headElement);
} else {
$meta->parentNode->removeChild($meta);
}
// Replace the internal entity placeholders in the output.
if (strpos($html, 'html5-dom-document-internal-entity') !== false) {
$html = preg_replace('/html5-dom-document-internal-entity1-(.*?)-end/', '&$1;', $html);
$html = preg_replace('/html5-dom-document-internal-entity2-(.*?)-end/', '$1;', $html);
}
// Strings that are deleted from the serialized output.
$codeToRemove = [
'html5-dom-document-internal-content',
'',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '-html5-dom-document-internal-cdata-endtagfix'
];
if ($removeHeadElement) {
$codeToRemove[] = '';
}
if ($removeHtmlElement) {
$codeToRemove[] = '';
}
$html = str_replace($codeToRemove, '', $html);
}
return $html;
}
/**
 * Dumps the internal document into a file using HTML formatting.
 *
 * The target must already be writable according to is_writable(); otherwise nothing is written.
 * Success is judged by the byte count reported by file_put_contents() rather than by a later
 * filesize() call, because filesize() results are cached by PHP and can be stale after a write.
 *
 * @param string $filename The path to the saved HTML document.
 * @return int|false the number of bytes written or FALSE if an error occurred.
 */
#[\ReturnTypeWillChange] // Return type "int|false" is invalid in older supported versions.
public function saveHTMLFile($filename)
{
    if (!is_writable($filename)) {
        return false;
    }
    $result = $this->saveHTML();
    // file_put_contents() returns false on failure, which never equals the expected length.
    $bytesWritten = file_put_contents($filename, $result);
    if ($bytesWritten === strlen($result)) {
        return $bytesWritten;
    }
    return false;
}
/**
 * Finds the first element in the document that matches a CSS selector.
 *
 * The lookup is delegated to internalQuerySelector().
 *
 * @param string $selector A CSS query selector. Available values: *, tagname, tagname#id, #id, tagname.classname, .classname, tagname.classname.classname2, .classname.classname2, tagname[attribute-selector], [attribute-selector], "div, p", div p, div > p, div + p and p ~ ul.
 * @return HTML5DOMElement|null The result DOMElement or null if not found.
 * @throws \InvalidArgumentException
 */
public function querySelector(string $selector)
{
    $firstMatch = $this->internalQuerySelector($selector);
    return $firstMatch;
}
/**
 * Finds every element in the document that matches a CSS selector.
 *
 * The lookup is delegated to internalQuerySelectorAll().
 *
 * @param string $selector A CSS query selector. Available values: *, tagname, tagname#id, #id, tagname.classname, .classname, tagname.classname.classname2, .classname.classname2, tagname[attribute-selector], [attribute-selector], "div, p", div p, div > p, div + p and p ~ ul.
 * @return HTML5DOMNodeList Returns a list of DOMElements matching the criteria.
 * @throws \InvalidArgumentException
 */
public function querySelectorAll(string $selector)
{
    $allMatches = $this->internalQuerySelectorAll($selector);
    return $allMatches;
}
/**
 * Builds a placeholder element that insertHTML can later replace with the inserted body content.
 *
 * When nothing has been loaded into the document yet, an empty source is loaded first.
 *
 * @param string $name The name of the insert target.
 * @return HTML5DOMElement A new DOMElement that must be set in the place where the new body will be inserted.
 */
public function createInsertTarget(string $name)
{
    if (!$this->loaded) {
        $this->loadHTML('');
    }
    $insertTarget = $this->createElement('html5-dom-document-insert-target');
    // The name is what insertHTML / insertHTMLMulti use to find this placeholder.
    $insertTarget->setAttribute('name', $name);
    return $insertTarget;
}
/**
 * Inserts one HTML document into the current document by delegating to insertHTMLMulti().
 * The elements from the head and the body will be moved to their proper locations.
 *
 * @param string $source The HTML code to be inserted.
 * @param string $target Body target position. Available values: afterBodyBegin, beforeBodyEnd or insertTarget name.
 */
public function insertHTML(string $source, string $target = 'beforeBodyEnd')
{
    // Wrap the single request in the list format expected by insertHTMLMulti().
    $singleRequest = ['source' => $source, 'target' => $target];
    $this->insertHTMLMulti([$singleRequest]);
}
/**
 * Inserts multiple HTML documents into the current document. The elements from the head and the body will be moved to their proper locations.
 *
 * Each source is parsed into a temporary document (duplicate IDs allowed). Attributes of its html, head and
 * body elements are copied onto the matching elements of this document, head children are appended to the
 * head, and body children are placed according to the target: at the start of the body, at the end of the
 * body, or in place of a named insert target. A named insert target is removed once it has been used; a
 * later source naming the same target is treated like a source naming an unknown target.
 *
 * @param array $sources An array containing the source of the document to be inserted in the following format: [ ['source'=>'', 'target'=>''], ['source'=>'', 'target'=>''], ... ]
 * @throws \Exception When an entry has no source key.
 */
public function insertHTMLMulti(array $sources)
{
    if (!$this->loaded) {
        $this->loadHTML('');
    }
    if (!isset(self::$newObjectsCache['html5domdocument'])) {
        self::$newObjectsCache['html5domdocument'] = new HTML5DOMDocument();
    }

    // Copies every attribute of one element onto another (existing values are overwritten).
    $copyAttributes = function ($sourceNode, $targetNode) {
        foreach ($sourceNode->attributes as $attributeName => $attribute) {
            $targetNode->setAttribute($attributeName, $attribute->value);
        }
    };

    // The html, head and body elements of this document are looked up lazily and reused for all sources.
    $currentDomHTMLElement = null;
    $currentDomHeadElement = null;
    $currentDomBodyElement = null;

    // Map of insert target name => placeholder element, built on first use.
    $insertTargetsList = null;
    $prepareInsertTargetsList = function () use (&$insertTargetsList) {
        if ($insertTargetsList === null) {
            $insertTargetsList = [];
            $targetElements = $this->getElementsByTagName('html5-dom-document-insert-target');
            foreach ($targetElements as $targetElement) {
                $insertTargetsList[$targetElement->getAttribute('name')] = $targetElement;
            }
        }
    };

    foreach ($sources as $sourceData) {
        if (!isset($sourceData['source'])) {
            throw new \Exception('Missing source key');
        }
        $source = $sourceData['source'];
        $target = $sourceData['target'] ?? 'beforeBodyEnd';

        $domDocument = clone (self::$newObjectsCache['html5domdocument']);
        $domDocument->loadHTML($source, self::ALLOW_DUPLICATE_IDS);

        $htmlElement = $domDocument->getElementsByTagName('html')->item(0);
        if ($htmlElement !== null) {
            if ($htmlElement->attributes->length > 0) {
                if ($currentDomHTMLElement === null) {
                    $currentDomHTMLElement = $this->getElementsByTagName('html')->item(0);
                    if ($currentDomHTMLElement === null) {
                        $this->addHtmlElementIfMissing();
                        $currentDomHTMLElement = $this->getElementsByTagName('html')->item(0);
                    }
                }
                $copyAttributes($htmlElement, $currentDomHTMLElement);
            }
        }

        $headElement = $domDocument->getElementsByTagName('head')->item(0);
        if ($headElement !== null) {
            if ($currentDomHeadElement === null) {
                $currentDomHeadElement = $this->getElementsByTagName('head')->item(0);
                if ($currentDomHeadElement === null) {
                    $this->addHtmlElementIfMissing();
                    $this->addHeadElementIfMissing();
                    $currentDomHeadElement = $this->getElementsByTagName('head')->item(0);
                }
            }
            foreach ($headElement->childNodes as $headElementChild) {
                $newNode = $this->importNode($headElementChild, true);
                // importNode() reports failure with false, so test for a real node.
                if ($newNode instanceof \DOMNode) {
                    $currentDomHeadElement->appendChild($newNode);
                }
            }
            if ($headElement->attributes->length > 0) {
                $copyAttributes($headElement, $currentDomHeadElement);
            }
        }

        $bodyElement = $domDocument->getElementsByTagName('body')->item(0);
        if ($bodyElement !== null) {
            if ($currentDomBodyElement === null) {
                $currentDomBodyElement = $this->getElementsByTagName('body')->item(0);
                if ($currentDomBodyElement === null) {
                    $this->addHtmlElementIfMissing();
                    $this->addBodyElementIfMissing();
                    $currentDomBodyElement = $this->getElementsByTagName('body')->item(0);
                }
            }
            $bodyElementChildren = $bodyElement->childNodes;
            if ($target === 'afterBodyBegin') {
                // Insert back to front so the children keep their original order at the start of the body.
                $bodyElementChildrenCount = $bodyElementChildren->length;
                for ($i = $bodyElementChildrenCount - 1; $i >= 0; $i--) {
                    $newNode = $this->importNode($bodyElementChildren->item($i), true);
                    if ($newNode instanceof \DOMNode) {
                        if ($currentDomBodyElement->firstChild === null) {
                            $currentDomBodyElement->appendChild($newNode);
                        } else {
                            $currentDomBodyElement->insertBefore($newNode, $currentDomBodyElement->firstChild);
                        }
                    }
                }
            } elseif ($target === 'beforeBodyEnd') {
                foreach ($bodyElementChildren as $bodyElementChild) {
                    $newNode = $this->importNode($bodyElementChild, true);
                    if ($newNode instanceof \DOMNode) {
                        $currentDomBodyElement->appendChild($newNode);
                    }
                }
            } else {
                $prepareInsertTargetsList();
                if (isset($insertTargetsList[$target])) {
                    $targetElement = $insertTargetsList[$target];
                    $targetElementParent = $targetElement->parentNode;
                    foreach ($bodyElementChildren as $bodyElementChild) {
                        $newNode = $this->importNode($bodyElementChild, true);
                        if ($newNode instanceof \DOMNode) {
                            $targetElementParent->insertBefore($newNode, $targetElement);
                        }
                    }
                    $targetElementParent->removeChild($targetElement);
                    // The placeholder is detached now; forget it so that a later source naming the
                    // same target does not dereference its missing parent.
                    unset($insertTargetsList[$target]);
                }
            }
            if ($bodyElement->attributes->length > 0) {
                $copyAttributes($bodyElement, $currentDomBodyElement);
            }
        } else { // clear the insert target when there is no body element
            $prepareInsertTargetsList();
            if (isset($insertTargetsList[$target])) {
                $targetElement = $insertTargetsList[$target];
                $targetElement->parentNode->removeChild($targetElement);
                // Same as above: a removed placeholder must not be reused.
                unset($insertTargetsList[$target]);
            }
        }
    }
}
/**
 * Applies the modifications specified to the DOM document.
 *
 * Head merging runs first, then the per-head fixes (titles, duplicate meta tags, head ordering),
 * and body merging runs last.
 *
 * @param int $modifications The modifications to apply. Available values:
 * - HTML5DOMDocument::FIX_MULTIPLE_TITLES - removes all but the last title elements.
 * - HTML5DOMDocument::FIX_DUPLICATE_METATAGS - removes all but the last metatags with matching name or property attributes.
 * - HTML5DOMDocument::FIX_MULTIPLE_HEADS - merges multiple head elements.
 * - HTML5DOMDocument::FIX_MULTIPLE_BODIES - merges multiple body elements.
 * - HTML5DOMDocument::OPTIMIZE_HEAD - moves charset metatag and title elements first.
 */
public function modify($modifications = 0)
{
    $fixMultipleTitles = ($modifications & self::FIX_MULTIPLE_TITLES) !== 0;
    $fixDuplicateMetatags = ($modifications & self::FIX_DUPLICATE_METATAGS) !== 0;
    $fixMultipleHeads = ($modifications & self::FIX_MULTIPLE_HEADS) !== 0;
    $fixMultipleBodies = ($modifications & self::FIX_MULTIPLE_BODIES) !== 0;
    $optimizeHead = ($modifications & self::OPTIMIZE_HEAD) !== 0;

    /** @var \DOMNodeList */
    $headElements = $this->getElementsByTagName('head');
    if ($fixMultipleHeads) { // Merges multiple head elements.
        if ($headElements->length > 1) {
            $firstHeadElement = $headElements->item(0);
            // The list is live: removing the second head shrinks it until only the first is left.
            while ($headElements->length > 1) {
                $nextHeadElement = $headElements->item(1);
                $nextHeadElementChildren = $nextHeadElement->childNodes;
                $nextHeadElementChildrenCount = $nextHeadElementChildren->length;
                for ($i = 0; $i < $nextHeadElementChildrenCount; $i++) {
                    // Always take item 0: each append moves the child out of the live child list.
                    $firstHeadElement->appendChild($nextHeadElementChildren->item(0));
                }
                $nextHeadElement->parentNode->removeChild($nextHeadElement);
            }
            $headElements = [$firstHeadElement];
        }
    }

    foreach ($headElements as $headElement) {
        if ($fixMultipleTitles) { // Remove all title elements except the last one.
            $titleTags = $headElement->getElementsByTagName('title');
            // The list is live, so indexes shift after every removal. Removing item 0 until a
            // single title is left keeps exactly the last one and never reads past the end.
            while ($titleTags->length > 1) {
                $node = $titleTags->item(0);
                $node->parentNode->removeChild($node);
            }
        }

        if ($fixDuplicateMetatags) { // Remove all meta tags that has matching name or property attributes.
            $metaTags = $headElement->getElementsByTagName('meta');
            if ($metaTags->length > 0) {
                // Snapshot the tags with a grouping key before anything is removed.
                $list = [];
                $idsList = [];
                foreach ($metaTags as $metaTag) {
                    $id = $metaTag->getAttribute('name');
                    if ($id !== '') {
                        $id = 'name:' . $id;
                    } else {
                        $id = $metaTag->getAttribute('property');
                        if ($id !== '') {
                            $id = 'property:' . $id;
                        } else {
                            $id = $metaTag->getAttribute('charset');
                            if ($id !== '') {
                                $id = 'charset';
                            }
                        }
                    }
                    if (!isset($idsList[$id])) {
                        $idsList[$id] = 0;
                    }
                    $idsList[$id]++;
                    $list[] = [$metaTag, $id];
                }
                foreach ($idsList as $id => $count) {
                    // Tags without any of the three attributes share the empty key and are left alone.
                    if ($count > 1 && $id !== '') {
                        foreach ($list as $i => $item) {
                            if ($item[1] === $id) {
                                $node = $item[0];
                                $node->parentNode->removeChild($node);
                                unset($list[$i]);
                                $count--;
                            }
                            if ($count === 1) {
                                break;
                            }
                        }
                    }
                }
            }
        }

        if ($optimizeHead) { // Moves charset metatag and title elements first.
            $titleElement = $headElement->getElementsByTagName('title')->item(0);
            $hasTitleElement = false;
            if ($titleElement !== null) {
                if ($titleElement->previousSibling !== null) {
                    $headElement->insertBefore($titleElement, $headElement->firstChild);
                }
                // A title that already was the first child must count too, otherwise the
                // meta tags below would be placed in front of it.
                $hasTitleElement = $headElement->firstChild === $titleElement;
            }
            $metaTags = $headElement->getElementsByTagName('meta');
            $metaTagsLength = $metaTags->length;
            if ($metaTagsLength > 0) {
                $charsetMetaTag = null;
                // Snapshot the meta tags first; the live list would reorder while nodes are moved.
                $nodesToMove = [];
                for ($i = $metaTagsLength - 1; $i >= 0; $i--) {
                    $nodesToMove[$i] = $metaTags->item($i);
                }
                // Move from last to first so the tags keep their relative order.
                for ($i = $metaTagsLength - 1; $i >= 0; $i--) {
                    $nodeToMove = $nodesToMove[$i];
                    if ($charsetMetaTag === null && $nodeToMove->getAttribute('charset') !== '') {
                        $charsetMetaTag = $nodeToMove;
                    }
                    $referenceNode = $headElement->childNodes->item($hasTitleElement ? 1 : 0);
                    if ($nodeToMove !== $referenceNode) {
                        $headElement->insertBefore($nodeToMove, $referenceNode);
                    }
                }
                if ($charsetMetaTag !== null && $charsetMetaTag->previousSibling !== null) {
                    $headElement->insertBefore($charsetMetaTag, $headElement->firstChild);
                }
            }
        }
    }

    if ($fixMultipleBodies) { // Merges multiple body elements.
        $bodyElements = $this->getElementsByTagName('body');
        if ($bodyElements->length > 1) {
            $firstBodyElement = $bodyElements->item(0);
            // Same live list technique as for the head elements above.
            while ($bodyElements->length > 1) {
                $nextBodyElement = $bodyElements->item(1);
                $nextBodyElementChildren = $nextBodyElement->childNodes;
                $nextBodyElementChildrenCount = $nextBodyElementChildren->length;
                for ($i = 0; $i < $nextBodyElementChildrenCount; $i++) {
                    $firstBodyElement->appendChild($nextBodyElementChildren->item(0));
                }
                $nextBodyElement->parentNode->removeChild($nextBodyElement);
            }
        }
    }
}
}