factory = new HTMLPurifier_TokenFactory();
}
/**
 * Tokenizes an HTML fragment by parsing it with DOMDocument and walking
 * the resulting tree.
 *
 * @param $string  HTML fragment to tokenize.
 * @param $config  Configuration, forwarded unchanged to normalize().
 * @param $context Context (by reference), forwarded unchanged to normalize().
 * @returns Array-list of tokens produced by tokenizeDOM(); empty if the
 *          wrapper element cannot be located after parsing.
 */
public function tokenizeHTML($string, $config, &$context) {
    $string = $this->normalize($string, $config, $context);

    // Preprocess string, essential for UTF-8: embed the fragment in a
    // complete document whose meta tag declares the charset, and wrap it
    // in a DIV so there is a single known container to start walking from.
    $string =
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">' .
        '<html><head>' .
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />' .
        '</head><body><div>' . $string . '</div></body></html>';

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // technically does nothing, but whatever
    @$doc->loadHTML($string); // mute all errors, handle it transparently

    $tokens = array();

    // Descend html -> body -> div. item(0) yields null when nothing
    // matched, so bail out with no tokens instead of calling a method
    // on null.
    $container = $doc;
    foreach (array('html', 'body', 'div') as $wrapper_tag) {
        $container = $container->getElementsByTagName($wrapper_tag)->item(0);
        if ($container === null) {
            return $tokens;
        }
    }

    // $collect is left at its default (false) so the wrapper DIV itself
    // produces no tokens.
    $this->tokenizeDOM($container, $tokens);
    return $tokens;
}
/**
 * Recursive function that tokenizes a node, appending the result to an
 * accumulator.
 *
 * Text and CDATA nodes become text tokens, comment nodes become comment
 * tokens, and element nodes become either an empty token (no children)
 * or a start token, the tokens of all children, and an end token.
 * Any other node type is skipped.
 *
 * @param $node    DOMNode to be tokenized.
 * @param $tokens  Array-list of already tokenized tokens; passed by
 *                 reference and appended to in place.
 * @param $collect Says whether the start/end (or empty) tokens for $node
 *                 itself are collected. False at the first recursion
 *                 because that node is the implicit DIV wrapper; always
 *                 true for descendants.
 * @returns Nothing; tokens of the node are appended to $tokens.
 */
protected function tokenizeDOM($node, &$tokens, $collect = false) {
    // Intercept non-element nodes first. Character reference nodes are
    // not handled here because those should have been preprocessed.
    if ($node->nodeType === XML_TEXT_NODE ||
        $node->nodeType === XML_CDATA_SECTION_NODE) {
        $tokens[] = $this->factory->createText($node->data);
        return;
    }
    if ($node->nodeType === XML_COMMENT_NODE) {
        $tokens[] = $this->factory->createComment($node->data);
        return;
    }
    if ($node->nodeType !== XML_ELEMENT_NODE) {
        // Anything else (e.g. a processing instruction) has no tagName,
        // so it cannot be turned into an element token; skip it.
        return;
    }

    $attr = $node->hasAttributes() ?
        $this->transformAttrToAssoc($node->attributes) :
        array();

    // We still have to make sure that the element actually IS empty
    if (!$node->childNodes->length) {
        if ($collect) {
            $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
        }
        return;
    }

    if ($collect) { // don't wrap on first iteration
        $tokens[] = $this->factory->createStart($node->tagName, $attr);
    }
    // Use a distinct loop variable: reusing $node here would overwrite
    // the parameter and lose the tag name needed for the end token.
    foreach ($node->childNodes as $child) {
        // remember, it's an accumulator. Otherwise, we'd have
        // to use array_merge
        $this->tokenizeDOM($child, $tokens, true);
    }
    if ($collect) {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }
}
/**
 * Flattens a DOMNamedNodeMap of DOMAttr objects into a plain
 * name => value associative array.
 *
 * @param $node_map DOMNamedNodeMap of DOMAttr objects.
 * @returns Associative array mapping each attribute name to its value;
 *          empty array when the map has no entries.
 */
protected function transformAttrToAssoc($node_map) {
    $assoc = array();
    // Relies on the map exposing ->length and being iterable with foreach.
    if ($node_map->length !== 0) {
        foreach ($node_map as $attribute) {
            $assoc[$attribute->name] = $attribute->value;
        }
    }
    return $assoc;
}
}
?>