2006-07-22 12:53:04 +00:00
|
|
|
<?php
|
|
|
|
|
2006-07-23 03:43:53 +00:00
|
|
|
/**
|
|
|
|
* Forgivingly lexes HTML (SGML-style) markup into tokens.
|
|
|
|
*
|
2006-08-01 00:29:38 +00:00
|
|
|
* A lexer parses a string of SGML-style markup and converts it into
|
2006-07-23 18:56:00 +00:00
|
|
|
* corresponding tokens. It doesn't check for well-formedness, although its
|
2006-07-23 03:43:53 +00:00
|
|
|
* internal mechanism may make this automatic (such as the case of
|
|
|
|
* HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
|
|
|
|
* from.
|
|
|
|
*
|
2006-08-01 00:29:38 +00:00
|
|
|
* A lexer is HTML-oriented: it might work with XML, but it's not
|
2006-07-23 03:43:53 +00:00
|
|
|
* recommended, as we adhere to a subset of the specification for optimization
|
|
|
|
* reasons.
|
|
|
|
*
|
2006-07-23 23:04:34 +00:00
|
|
|
* This class should not be directly instantiated, but you may use create() to
|
2006-08-01 00:29:38 +00:00
|
|
|
* retrieve a default copy of the lexer. Being a supertype, this class
|
|
|
|
* does not actually define any implementation, but offers commonly used
|
|
|
|
* convenience functions for subclasses.
|
2006-07-23 03:43:53 +00:00
|
|
|
*
|
2006-07-23 23:04:34 +00:00
|
|
|
* @note The unit tests will instantiate this class for testing purposes, as
|
|
|
|
* many of the utility functions require a class to be instantiated.
|
|
|
|
* Be careful when porting this class to PHP 5.
|
|
|
|
*
|
|
|
|
* @par
|
|
|
|
*
|
2006-07-23 03:43:53 +00:00
|
|
|
* @note
|
|
|
|
* We use tokens rather than create a DOM representation because DOM would:
|
|
|
|
*
|
2006-07-23 23:04:34 +00:00
|
|
|
* @par
|
2006-07-23 03:43:53 +00:00
|
|
|
* -# Require more processing power to create,
|
|
|
|
* -# Require recursion to iterate,
|
|
|
|
* -# Must be compatible with PHP 5's DOM (otherwise duplication),
|
|
|
|
* -# Has the entire document structure (html and body not needed), and
|
|
|
|
* -# Has unknown readability improvement.
|
|
|
|
*
|
2006-07-23 23:04:34 +00:00
|
|
|
* @par
|
2006-07-23 03:43:53 +00:00
|
|
|
* What the last item means is that the functions for manipulating tokens are
|
|
|
|
* already fairly compact, and when well-commented, more abstraction may not
|
|
|
|
* be needed.
|
|
|
|
*
|
|
|
|
* @see HTMLPurifier_Token
|
|
|
|
*/
|
2006-07-22 12:53:04 +00:00
|
|
|
class HTMLPurifier_Lexer
{

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves a concrete lexer, acting as a prototype factory.
     *
     * The lexer is chosen as follows:
     *  - If %Core.LexerImpl (or the deprecated prototype argument) is an
     *    object, that object is returned unchanged.
     *  - If it is null, the implementation is auto-detected: DirectLex is
     *    used when line numbers are needed (%Core.MaintainLineNumbers is
     *    true, or it is null while %Core.CollectErrors is enabled);
     *    otherwise DOMLex is used on PHP 5+ when DOMDocument exists, and
     *    DirectLex as the fallback.
     *  - If it is one of the strings 'DOMLex', 'DirectLex' or 'PH5P', the
     *    matching class is instantiated.
     *  - Anything else raises an E_USER_ERROR.
     *
     * @note The behavior of this method has changed: rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       Passing a prototype still works but raises an E_USER_WARNING.
     *
     * @note In PHP4, it is possible to call this factory method from
     *       subclasses, such usage is not recommended and not
     *       forwards-compatible.
     *
     * @param $config HTMLPurifier_Config object, or (deprecated) a
     *                prototype lexer / lexer type name / null
     * @return Concrete lexer.
     */
    public static function create($config)
    {
        $has_config = ($config instanceof HTMLPurifier_Config);

        if ($has_config) {
            $lexer = $config->get('Core', 'LexerImpl');
        } else {
            $lexer = $config;
            trigger_error("Passing a prototype to
              HTMLPurifier_Lexer::create() is deprecated, please instead
              use %Core.LexerImpl", E_USER_WARNING);
        }

        // a ready-made lexer instance is handed back untouched
        if (is_object($lexer)) {
            return $lexer;
        }

        if (is_null($lexer)) {
            // auto-detection algorithm

            // once PHP DOM implements native line numbers, or we
            // hack out something using XSLT, remove this stipulation
            $needs_line_numbers = false;
            if ($has_config) {
                // only a real configuration object can be queried; on the
                // deprecated prototype path there is nothing to ask, so we
                // go straight to the PHP capability check below
                $line_numbers = $config->get('Core', 'MaintainLineNumbers');
                $needs_line_numbers =
                    $line_numbers === true ||
                    ($line_numbers === null && $config->get('Core', 'CollectErrors'));
            }

            if ($needs_line_numbers) {
                $lexer = 'DirectLex';
            } elseif (
                version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
                class_exists('DOMDocument')                // check for DOM support
            ) {
                $lexer = 'DOMLex';
            } else {
                $lexer = 'DirectLex';
            }
        }

        // instantiate recognized string names
        switch ($lexer) {
            case 'DOMLex':
                return new HTMLPurifier_Lexer_DOMLex();
            case 'DirectLex':
                return new HTMLPurifier_Lexer_DirectLex();
            case 'PH5P':
                // experimental Lexer that must be manually included
                return new HTMLPurifier_Lexer_PH5P();
            default:
                trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
        }
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Entity parser used by parseData() and normalize(); assigned in the
     * constructor. Declared explicitly so that it is not created as a
     * dynamic property, and kept public because an undeclared property
     * would have been public as well.
     */
    public $_entity_parser;

    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     */
    protected $_special_entity2str =
            array(
                    '&quot;' => '"',
                    '&amp;'  => '&',
                    '&lt;'   => '<',
                    '&gt;'   => '>',
                    '&#39;'  => "'",
                    '&#039;' => "'",
                    '&#x27;' => "'"
            );

    /**
     * Parses special entities into the proper characters.
     *
     * This method translates escaped versions of the special characters
     * into the correct ones. The common spellings are handled by a plain
     * table lookup; the entity parser is only consulted when ampersands
     * remain that the table did not account for.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param $string String character data to be parsed.
     * @return Parsed character data.
     */
    public function parseData($string)
    {
        // following functions require at least one character
        if ($string === '') return '';

        // subtracts amps that cannot possibly be escaped: those followed
        // by a space and one sitting at the very end of the string
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        if (!$num_amp) return $string; // abort if no entities

        // each &amp; leaves a literal ampersand behind after translation,
        // so remember how many of those to expect
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        // no more candidate ampersands than decoded &amp; entities: done
        if ($num_amp_2 <= $num_esc_amp) return $string;

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; it is meant to
     * be overridden.
     *
     * @param $string String HTML.
     * @param $config Configuration object (unused here).
     * @param $context Context object (unused here).
     * @return HTMLPurifier_Token array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     *
     * @param $string HTML string to process.
     * @return HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     *
     * @param $string HTML string to process.
     * @return HTML with comment-wrapped CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @param $matches PCRE matches array, with index 0 the entire match
     *                 and 1 the inside of the CDATA section.
     * @return Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     *
     * @param $html String HTML to normalize.
     * @param $config Configuration object; %Core.ConvertDocumentToFragment
     *                and %HTML.Trusted are read from it.
     * @param $context Context object (unused here).
     * @return Normalized HTML string.
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // extract body from document if applicable
        if ($config->get('Core', 'ConvertDocumentToFragment')) {
            $html = $this->extractBody($html);
        }

        // normalize newlines to \n
        $html = str_replace("\r\n", "\n", $html);
        $html = str_replace("\r", "\n", $html);

        if ($config->get('HTML', 'Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its first <body> element, or the input unchanged when no non-empty
     * <body>...</body> pair is found.
     *
     * @param $html String HTML fragment or document.
     * @return String body contents, or the original string.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        } else {
            return $html;
        }
    }

}
|
|
|
|
|