factory = new HTMLPurifier_TokenFactory(); }

    /**
     * Tokenizes an HTML fragment by parsing it with PHP's DOM extension.
     *
     * The input is first passed through $this->normalize(), then wrapped in
     * a complete document (doctype, UTF-8 meta declaration, body and a
     * single DIV) so that DOMDocument::loadHTML() reads it as UTF-8 and the
     * fragment ends up under one known container element. That container
     * DIV is then handed to tokenizeDOM(), which fills the token list.
     *
     * @param $html    String HTML fragment to tokenize.
     * @param $config  Configuration object, forwarded to normalize().
     * @param $context Context object, forwarded (by reference) to normalize().
     * @returns Array-list of tokens collected by tokenizeDOM().
     */
    public function tokenizeHTML($html, $config, &$context) {

        $html = $this->normalize($html, $config, $context);

        // preprocess html, essential for UTF-8: the meta declaration tells
        // the parser which charset to use, and the DIV is the implicit
        // container that the lookup chain below retrieves.
        // NOTE(review): the wrapper markup was missing from the string
        // literals in this copy of the file and has been restored; confirm
        // the doctype against the project's canonical source.
        $html =
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'.
            '<html><head>'.
            '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'.
            '</head><body><div>'.$html.'</div></body></html>';

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // Route parser diagnostics to muteErrorHandler for the duration of
        // the load only, then put the previous handler back.
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $tokens = array();
        $this->tokenizeDOM(
            $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0)-> //   <body>
                  getElementsByTagName('div')->item(0)    //     <div>
            , $tokens);
        return $tokens;
    }

 /** * Recursive function that tokenizes a node, putting it into an accumulator. * * @param $node DOMNode to be tokenized. * @param $tokens Array-list of already tokenized tokens. * @param $collect Says whether or start and close are collected, set to * false at first recursion because it's the implicit DIV * tag you're dealing with. * @returns Tokens of node appended to previously passed tokens. */ protected function tokenizeDOM($node, &$tokens, $collect = false) { // intercept non element nodes. WE MUST catch all of them, // but we're not getting the character reference nodes because // those should have been preprocessed if ($node->nodeType === XML_TEXT_NODE) { $tokens[] = $this->factory->createText($node->data); return; } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { // undo DOM's special treatment of