diff --git a/NEWS b/NEWS
index 503429fc..fd2da597 100644
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,11 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
1.7.0, unknown release date
+1.6.1, unknown release date
+! DirectLex now preserves text in which a < bracket is followed by
+ a non-alphanumeric character. This means that certain emoticons
+ are now preserved.
+
1.6.0, released 2007-04-01
! Support for most common deprecated attributes via transformations:
+ bgcolor in td, th, tr and table
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index 65d95a7c..57d116a4 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -110,6 +110,23 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
continue;
}
+ // Check leading character is alnum, if not, we may
+ // have accidentally grabbed an emoticon. Translate into
+ // text and go our merry way
+ if (!ctype_alnum($segment[0])) {
+ $array[] = new
+ HTMLPurifier_Token_Text(
+ '<' .
+ $this->parseData(
+ $segment
+ ) .
+ '>'
+ );
+ $cursor = $position_next_gt + 1;
+ $inside_tag = false;
+ continue;
+ }
+
// Check if it is explicitly self closing, if so, remove
// trailing slash. Remember, we could have a tag like <br>, so
// any later token processing scripts must convert improperly
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 3e627dab..c0da5b2a 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -281,6 +281,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$input[18] = '<br test="x &lt; 6" />';
$expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
+ // test emoticon protection
+ $input[19] = '<b>Whoa! >.< That\'s not good >.></b>';
+ $expect[19] = array(
+ new HTMLPurifier_Token_Start('b'),
+ new HTMLPurifier_Token_Text('Whoa! >.'),
+ new HTMLPurifier_Token_Text('< That\'s not good >'),
+ new HTMLPurifier_Token_Text('.>'),
+ new HTMLPurifier_Token_End('b'),
+ );
+ $sax_expect[19] = false; // SAX drops the < character
+ $dom_expect[19] = false; // DOM drops the entire pseudo-tag
+
$default_config = HTMLPurifier_Config::createDefault();
$default_context = new HTMLPurifier_Context();
foreach($input as $i => $discard) {