factory = new HTMLPurifier_TokenFactory();
}
/**
* @param string $html
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return HTMLPurifier_Token[]
*/
public function tokenizeHTML($html, $config, $context)
{
$html = $this->normalize($html, $config, $context);
// attempt to armor stray angled brackets that cannot possibly
// form tags and thus are probably being used as emoticons
if ($config->get('Core.AggressivelyFixLt')) {
$char = '[^a-z!\/]';
$comment = "/|\z)/is";
$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
do {
$old = $html;
$html = preg_replace("/<($char)/i", '<\\1', $html);
} while ($html !== $old);
$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
}
// preprocess html, essential for UTF-8
$html = $this->wrapHTML($html, $config, $context);
$doc = new DOMDocument();
$doc->encoding = 'UTF-8'; // theoretically, the above has this covered
set_error_handler(array($this, 'muteErrorHandler'));
$doc->loadHTML($html);
restore_error_handler();
$body = $doc->getElementsByTagName('html')->item(0)-> //
getElementsByTagName('body')->item(0); //
$div = $body->getElementsByTagName('div')->item(0); //
$tokens = array();
$this->tokenizeDOM($div, $tokens);
// If the div has a sibling, that means we tripped across
// a premature
tag. So remove the div we parsed,
// and then tokenize the rest of body. We can't tokenize
// the sibling directly as we'll lose the tags in that case.
if ($div->nextSibling) {
$body->removeChild($div);
$this->tokenizeDOM($body, $tokens);
}
return $tokens;
}
/**
* Iterative function that tokenizes a node, putting it into an accumulator.
* To iterate is human, to recurse divine - L. Peter Deutsch
* @param DOMNode $node DOMNode to be tokenized.
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
* @return HTMLPurifier_Token of node appended to previously passed tokens.
*/
protected function tokenizeDOM($node, &$tokens)
{
$level = 0;
$nodes = array($level => new HTMLPurifier_Queue(array($node)));
$closingNodes = array();
do {
while (!$nodes[$level]->isEmpty()) {
$node = $nodes[$level]->shift(); // FIFO
$collect = $level > 0 ? true : false;
$needEndingTag = $this->createStartNode($node, $tokens, $collect);
if ($needEndingTag) {
$closingNodes[$level][] = $node;
}
if ($node->childNodes && $node->childNodes->length) {
$level++;
$nodes[$level] = new HTMLPurifier_Queue();
foreach ($node->childNodes as $childNode) {
$nodes[$level]->push($childNode);
}
}
}
$level--;
if ($level && isset($closingNodes[$level])) {
while ($node = array_pop($closingNodes[$level])) {
$this->createEndNode($node, $tokens);
}
}
} while ($level > 0);
}
/**
* @param DOMNode $node DOMNode to be tokenized.
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
* @param bool $collect Says whether or start and close are collected, set to
* false at first recursion because it's the implicit DIV
* tag you're dealing with.
* @return bool if the token needs an endtoken
* @todo data and tagName properties don't seem to exist in DOMNode?
*/
protected function createStartNode($node, &$tokens, $collect)
{
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
// those should have been preprocessed
if ($node->nodeType === XML_TEXT_NODE) {
$tokens[] = $this->factory->createText($node->data);
return false;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo libxml's special treatment of