factory = new HTMLPurifier_TokenFactory();
}
/**
 * Tokenizes an HTML fragment by parsing it with DOMDocument and walking
 * the resulting tree.
 *
 * The fragment is first passed through normalize(), then wrapped in a
 * complete document whose body holds a single DIV containing the fragment.
 * Only that DIV is handed to tokenizeDOM(), so the wrapper markup itself
 * is not part of the traversal's starting point.
 *
 * @param $html    HTML fragment to tokenize.
 * @param $config  Configuration object, forwarded to normalize().
 * @param $context Context object, forwarded (by reference) to normalize().
 * @return Array-list of tokens accumulated by tokenizeDOM().
 */
public function tokenizeHTML($html, $config, &$context) {
    $html = $this->normalize($html, $config, $context);

    // preprocess html, essential for UTF-8: the META element declares the
    // charset to the parser, and the DIV gives the traversal below a known
    // container that holds exactly the caller's fragment
    $html =
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'.
        '<html><head>'.
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'.
        '</head><body><div>'.$html.'</div></body></html>';

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    // loadHTML() raises PHP warnings on malformed markup; hand them to
    // muteErrorHandler for the duration of the parse only
    set_error_handler(array($this, 'muteErrorHandler'));
    $doc->loadHTML($html);
    restore_error_handler();

    $tokens = array();
    $this->tokenizeDOM(
        $doc->getElementsByTagName('html')->item(0)-> // <html>
              getElementsByTagName('body')->item(0)-> //   <body>
              getElementsByTagName('div')->item(0)    //     <div> (the wrapper added above)
        , $tokens);
    return $tokens;
}
/**
* Recursive function that tokenizes a node, putting it into an accumulator.
*
* @param $node DOMNode to be tokenized.
* @param $tokens Array-list of already tokenized tokens.
* @param $collect Says whether or not the start and close tags are collected; set to
* false at first recursion because it's the implicit DIV
* tag you're dealing with.
* @returns Tokens of node appended to previously passed tokens.
*/
protected function tokenizeDOM($node, &$tokens, $collect = false) {
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
// those should have been preprocessed
if ($node->nodeType === XML_TEXT_NODE) {
$tokens[] = $this->factory->createText($node->data);
return;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo DOM's special treatment of