diff --git a/NEWS b/NEWS index 503429fc..fd2da597 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,11 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier 1.7.0, unknown release date +1.6.1, unknown release date +! DirectLex now preserves text in which a < bracket is followed by + a non-alphanumeric character. This means that certain emoticons + are now preserved. + 1.6.0, released 2007-04-01 ! Support for most common deprecated attributes via transformations: + bgcolor in td, th, tr and table diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 65d95a7c..57d116a4 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -110,6 +110,23 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer continue; } + // Check leading character is alnum, if not, we may + // have accidentally grabbed an emoticon. Translate into + // text and go our merry way + if (!ctype_alnum($segment[0])) { + $array[] = new + HTMLPurifier_Token_Text( + '<' . + $this->parseData( + $segment + ) . + '>' + ); + $cursor = $position_next_gt + 1; + $inside_tag = false; + continue; + } + // Check if it is explicitly self closing, if so, remove // trailing slash. Remember, we could have a tag like <br>
, so // any later token processing scripts must convert improperly diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 3e627dab..c0da5b2a 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -281,6 +281,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase $input[18] = '
'; $expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) ); + // test emoticon protection + $input[19] = '<b>Whoa! >.< That\'s not good >.></b>'; + $expect[19] = array( + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('Whoa! >.'), + new HTMLPurifier_Token_Text('< That\'s not good >'), + new HTMLPurifier_Token_Text('.>'), + new HTMLPurifier_Token_End('b'), + ); + $sax_expect[19] = false; // SAX drops the < character + $dom_expect[19] = false; // DOM drops the entire pseudo-tag + $default_config = HTMLPurifier_Config::createDefault(); $default_context = new HTMLPurifier_Context(); foreach($input as $i => $discard) {