<?php
/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts it into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize(); assigned in the
     * constructor.
     *
     * Declared explicitly so that the constructor no longer creates a dynamic
     * property (deprecated as of PHP 8.2). It is public only to keep the
     * visibility the implicit property had; treat it as internal.
     */
    public $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * The lexer is chosen from %Core.LexerImpl: an object is returned as-is,
     * a recognized name ('DOMLex', 'DirectLex', 'PH5P') is instantiated, and
     * null triggers auto-detection. Auto-detection picks DirectLex when line
     * numbers are needed (%Core.MaintainLineNumbers or %Core.CollectErrors),
     * otherwise DOMLex when a usable DOM extension is present, otherwise
     * DirectLex.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @param $config Instance of HTMLPurifier_Config. Passing anything else
     *                is deprecated: the value is used as the lexer
     *                prototype/name and an E_USER_WARNING is raised.
     * @return Concrete lexer.
     * @throws HTMLPurifier_Exception If the lexer name is not recognized, no
     *         lexer could be instantiated, or line-number tracking is required
     *         but the selected lexer does not support it.
     */
    public static function create($config)
    {
        if ($config instanceof HTMLPurifier_Config) {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        } else {
            $lexer = $config;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
            // There is no configuration object to consult on this path, so
            // nothing can have requested line-number tracking. (Calling
            // ->get() on the prototype here used to be a fatal error.)
            $needs_tracking = false;
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {

            if (is_null($lexer)) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    $lexer = 'DirectLex';
                } elseif (
                    class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
        }

        return $inst;
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Creates the entity parser used by the convenience methods.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;'  => '&',
            '&lt;'   => '<',
            '&gt;'   => '>',
            '&#39;'  => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param $string String character data to be parsed.
     * @return Parsed character data.
     */
    public function parseData($string)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped: an ampersand
        // followed by a space, or one that is the very last character
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string; // abort if no entities
        }

        // every &amp; leaves a literal ampersand behind after translation,
        // so remember how many of those are expected
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // all remaining candidate ampersands are accounted for by &amp;
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; subclasses are
     * expected to provide the real implementation.
     *
     * @param $string String HTML.
     * @param $config Configuration object (unused here).
     * @param $context Context object (unused here).
     * @return HTMLPurifier_Token array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     *
     * @param $string HTML string to process.
     * @return HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     *
     * @param $string HTML string to process.
     * @return HTML with comment-wrapped CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @warning This is only meant to be used as the preg_replace_callback()
     *          target of those methods; calling it directly is not
     *          recommended.
     * @param $matches PCRE matches array, with index 0 the entire match
     *                 and 1 the inside of the CDATA section.
     * @return Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     *
     * @param $html HTML string to normalize.
     * @param $config Configuration object; HTML.Trusted and
     *                Core.ConvertDocumentToFragment are read from it.
     * @param $context Context object (unused here).
     * @return Normalized HTML string.
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        $html = str_replace("\r\n", "\n", $html);
        $html = str_replace("\r", "\n", $html);

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $html = $this->extractBody($html);
        }

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * between the <body> tags, or the input unchanged when no body element
     * is found.
     *
     * @param $html HTML string, either a fragment or a whole document.
     * @return Contents of the body element, or $html itself.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        } else {
            return $html;
        }
    }

}
// vim: et sw=4 sts=4