factory = new HTMLPurifier_TokenFactory();
/**
 * Tokenizes an HTML fragment by parsing it with DOMDocument and walking
 * the resulting tree.
 *
 * @param string $html
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return HTMLPurifier_Token[]
 */
public function tokenizeHTML($html, $config, $context)
{
    $html = $this->normalize($html, $config, $context);

    // attempt to armor stray angled brackets that cannot possibly
    // form tags and thus are probably being used as emoticons
    if ($config->get('Core.AggressivelyFixLt')) {
        $char = '[^a-z!\/]';
        // A comment runs from "<!--" up to "-->" or, when it is never
        // closed, up to the end of the input (\z).
        $comment = "/<!--(.*?)(-->|\z)/is";
        // Comment bodies are handed to the armor callback before the
        // bracket pass and to the undo callback afterwards (both are
        // defined elsewhere in this class).
        $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
        // Escaping one "<" can expose another one directly behind it
        // (e.g. "<<3"), so repeat until the string stops changing.
        do {
            $old = $html;
            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
        } while ($html !== $old);
        $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
    }

    // preprocess html, essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    // Each libxml flag is only used when the running PHP build defines it.
    $options = 0;
    if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {
        $options |= LIBXML_PARSEHUGE;
    }
    if ($config->get('Core.RemoveBlanks') && defined('LIBXML_NOBLANKS')) {
        $options |= LIBXML_NOBLANKS;
    }

    // Silence libxml's parse warnings for the duration of the load only;
    // the previous handler is restored right after.
    set_error_handler(array($this, 'muteErrorHandler'));
    // loadHTML() fails on PHP 5.3 when second parameter is given
    if ($options) {
        $doc->loadHTML($html, $options);
    } else {
        $doc->loadHTML($html);
    }
    restore_error_handler();

    // The fragment is looked up as the first <div> inside <html><body>;
    // wrapHTML() is expected to have produced that skeleton.
    $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
        getElementsByTagName('body')->item(0); // <body>

    $div = $body->getElementsByTagName('div')->item(0); // <div>
    $tokens = array();
    $this->tokenizeDOM($div, $tokens, $config);
    // If the div has a sibling, that means we tripped across
    // a premature </div> tag. So remove the div we parsed,
    // and then tokenize the rest of body. We can't tokenize
    // the sibling directly as we'll lose the tags in that case.
    if ($div->nextSibling) {
        $body->removeChild($div);
        $this->tokenizeDOM($body, $tokens, $config);
    }
    return $tokens;
}
/**
 * Iterative function that tokenizes a node, putting it into an accumulator.
 * To iterate is human, to recurse divine - L. Peter Deutsch
 *
 * The tree is walked depth-first without recursion: every depth level owns
 * a queue of pending nodes, and a parallel per-level list remembers which
 * nodes still need an end token once their subtree has been emitted.
 *
 * @param DOMNode $node DOMNode to be tokenized.
 * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
 * @param HTMLPurifier_Config $config Passed through to createStartNode().
 */
protected function tokenizeDOM($node, &$tokens, $config)
{
    $level = 0;
    $nodes = array($level => new HTMLPurifier_Queue(array($node)));
    $closingNodes = array();
    do {
        while (!$nodes[$level]->isEmpty()) {
            $node = $nodes[$level]->shift(); // FIFO
            // Level 0 holds only the starting node, whose own start and
            // end tokens are not collected.
            $collect = $level > 0;
            $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
            if ($needEndingTag) {
                $closingNodes[$level][] = $node;
            }
            if ($node->childNodes && $node->childNodes->length) {
                // Descend: the children get a fresh queue one level down,
                // which the enclosing while loop drains before anything
                // else on the parent's level is looked at.
                $level++;
                $nodes[$level] = new HTMLPurifier_Queue();
                foreach ($node->childNodes as $childNode) {
                    $nodes[$level]->push($childNode);
                }
            }
        }
        // The current level is exhausted: step back up and close whatever
        // is still open on the level we return to.
        $level--;
        if ($level && isset($closingNodes[$level])) {
            while ($node = array_pop($closingNodes[$level])) {
                $this->createEndNode($node, $tokens);
            }
        }
    } while ($level > 0);
}
/**
 * Portably retrieve the tag name of a node; deals with older versions
 * of libxml like 2.7.6
 *
 * The first of tagName, nodeName and localName that is set (and not null)
 * is returned.
 *
 * @param DOMNode $node
 * @return string|null null when none of the three properties is set
 */
protected function getTagName($node)
{
    // Order matters: earlier properties take precedence over later ones.
    foreach (array('tagName', 'nodeName', 'localName') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }
    return null;
}
/**
 * Portably retrieve the data of a node; deals with older versions
 * of libxml like 2.7.6
 *
 * The first of data, nodeValue and textContent that is set (and not null)
 * is returned.
 *
 * @param DOMNode $node
 * @return string|null null when none of the three properties is set
 */
protected function getData($node)
{
    // Order matters: earlier properties take precedence over later ones.
    foreach (array('data', 'nodeValue', 'textContent') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }
    return null;
}
* @param DOMNode $node DOMNode to be tokenized.
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
* @param bool $collect Says whether or start and close are collected, set to
* false at first recursion because it's the implicit DIV
* tag you're dealing with.
* @return bool if the token needs an endtoken
* @todo data and tagName properties don't seem to exist in DOMNode?
protected function createStartNode($node, &$tokens, $collect, $config)
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
// those should have been preprocessed
if ($node->nodeType === XML_TEXT_NODE) {
$data = $this->getData($node); // Handle variable data property
if ($data !== null) {
$tokens[] = $this->factory->createText($data);
return false;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo libxml's special treatment of