mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
Begin getting parsing of character data into shape, not done yet.
git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@60 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
dadfa87acc
commit
3427c6c079
@ -15,6 +15,51 @@ TODO:
|
||||
class HTML_Lexer
|
||||
{
|
||||
|
||||
// does this version of PHP support utf8 as entity function charset?
|
||||
var $_entity_utf8;
|
||||
|
||||
// Constructor (PHP 5+ name): records whether the running PHP version can
// decode entities directly to UTF-8. parseData() reads this flag to choose
// between the native decoder and the PHP 4 fallback.
//
// Declared before the class-named constructor on purpose: PHP 5 raises an
// E_STRICT notice when __construct() is declared after an old-style one.
function __construct() {
    $this->_entity_utf8 = version_compare(PHP_VERSION, '5', '>=');
}

// PHP 4 style constructor. PHP 4 only recognises a constructor named after
// the class, while PHP 8 no longer treats such a method as a constructor at
// all, so it simply delegates to __construct(). Keeping it also preserves
// any explicit $lexer->HTML_Lexer() calls.
function HTML_Lexer() {
    $this->__construct();
}
|
||||
|
||||
// this is QUITE a knotty problem
|
||||
//
|
||||
// The main trouble is that, even while assuming UTF-8 is what we're
|
||||
// using, we've got to deal with HTML entities (like &mdash;)
|
||||
// Not even sure if the PHP 5 decoding function does that. Plus,
|
||||
// SimpleTest doesn't use UTF-8!
|
||||
//
|
||||
// However, we MUST parse everything possible, because once you get
|
||||
// to the HTML generator, it will escape everything possible (although
|
||||
// that may not be correct, and we should be using htmlspecialchars() ).
|
||||
//
|
||||
// Nevertheless, strictly XML speaking, we cannot assume any character
|
||||
// entities are defined except the htmlspecialchars() ones, so leaving
|
||||
// the entities inside HERE is not acceptable. (plus, htmlspecialchars
|
||||
// might convert them anyway). So EVERYTHING must get parsed.
|
||||
//
|
||||
// We may need to roll our own character entity lookup table. It's only
|
||||
// about 250 entries; fortunately, the decimal/hex ones map cleanly to UTF-8.
|
||||
// Decodes the HTML character references in $string and returns UTF-8 text.
//
// $string  character data; assumed to already be UTF-8 encoded.
// returns  the string with every resolvable reference replaced by its UTF-8
//          character. References that cannot be resolved are left exactly
//          as they were written.
//
// No support for PHP older than 4.3.
function parseData($string) {
    if ($this->_entity_utf8) {
        // PHP 5+: the native decoder can emit UTF-8 directly. The error
        // suppression is carried over unchanged from the previous code.
        return @html_entity_decode($string, ENT_QUOTES, 'UTF-8');
    }

    // PHP 4: html_entity_decode() cannot target UTF-8, so every reference is
    // resolved here, in ONE pass. A single pass matters: text produced by
    // decoding (e.g. "&amp;#65;" -> "&#65;") must not be decoded again.
    // A callback is used instead of the /e modifier, which evaluates the
    // replacement as PHP code and is not available in current PHP.
    return preg_replace_callback(
        '/&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));/',
        array($this, '_decodeEntityMatch'),
        $string
    );
}

// preg_replace_callback() handler for parseData(): converts one matched
// character reference to UTF-8, or returns the match untouched if it is
// unknown or names an invalid code point.
//
// $matches  [0] whole reference, [1] decimal digits, [2] hex digits,
//           [3] entity name. Trailing groups that did not take part in the
//           match are absent, which is what the isset() checks rely on.
function _decodeEntityMatch($matches) {
    static $named = null;

    if (isset($matches[3])) {
        if ($named === null) {
            // The table maps "character => &name;"; flip it for lookup.
            // NOTE: on PHP 4 this table only covers the ISO-8859-1 entities,
            // so names outside it (e.g. &mdash;) stay undecoded.
            $named = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES)
            );
        }
        if (!isset($named[$matches[0]])) {
            return $matches[0];
        }
        $char = $named[$matches[0]];
        // A one-byte entry is taken to be ISO-8859-1, whose byte values equal
        // the Unicode code points. Longer entries are presumably already
        // multi-byte text and are passed through as-is.
        if (strlen($char) === 1) {
            return $this->_codePointToUtf8(ord($char));
        }
        return $char;
    }

    // Numeric reference: group 2 is only present for the hexadecimal form.
    $code = isset($matches[2]) ? hexdec($matches[2]) : (int) $matches[1];
    $utf8 = $this->_codePointToUtf8($code);
    return ($utf8 === false) ? $matches[0] : $utf8;
}

// Encodes a Unicode code point as a UTF-8 byte sequence.
//
// Returns false for values that are not encodable characters: NUL, the
// UTF-16 surrogate range, and anything above U+10FFFF.
function _codePointToUtf8($code) {
    if ($code < 1 || $code > 0x10FFFF) {
        return false;
    }
    if ($code >= 0xD800 && $code <= 0xDFFF) {
        return false;
    }
    // hexdec() may hand us a float; the range is now known to fit an int.
    $code = (int) $code;

    if ($code < 0x80) {
        return chr($code);
    }
    if ($code < 0x800) {
        return chr(0xC0 | ($code >> 6))
             . chr(0x80 | ($code & 0x3F));
    }
    if ($code < 0x10000) {
        return chr(0xE0 | ($code >> 12))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }
    return chr(0xF0 | ($code >> 18))
         . chr(0x80 | (($code >> 12) & 0x3F))
         . chr(0x80 | (($code >> 6) & 0x3F))
         . chr(0x80 | ($code & 0x3F));
}
|
||||
|
||||
function nextQuote($string, $offset = 0) {
|
||||
$quotes = array('"', "'");
|
||||
return $this->next($string, $quotes, $offset);
|
||||
@ -80,7 +125,8 @@ class HTML_Lexer
|
||||
html_entity_decode(
|
||||
substr(
|
||||
$string, $cursor, $position_next_lt - $cursor
|
||||
)
|
||||
),
|
||||
ENT_QUOTES
|
||||
)
|
||||
);
|
||||
$cursor = $position_next_lt + 1;
|
||||
@ -96,7 +142,8 @@ class HTML_Lexer
|
||||
html_entity_decode(
|
||||
substr(
|
||||
$string, $cursor
|
||||
)
|
||||
),
|
||||
ENT_QUOTES
|
||||
)
|
||||
);
|
||||
break;
|
||||
@ -175,7 +222,8 @@ class HTML_Lexer
|
||||
MF_Text(
|
||||
'<' .
|
||||
html_entity_decode(
|
||||
substr($string, $cursor)
|
||||
substr($string, $cursor),
|
||||
ENT_QUOTES
|
||||
)
|
||||
);
|
||||
break;
|
||||
@ -273,7 +321,7 @@ class HTML_Lexer
|
||||
$value = substr($string, $position_next_quote + 1,
|
||||
$position_end_quote - $position_next_quote - 1);
|
||||
if ($key) {
|
||||
$array[$key] = html_entity_decode($value);
|
||||
$array[$key] = html_entity_decode($value, ENT_QUOTES);
|
||||
}
|
||||
$cursor = $position_end_quote + 1;
|
||||
} else {
|
||||
|
@ -25,6 +25,16 @@ class Test_HTML_Lexer extends UnitTestCase
|
||||
$this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as"));
|
||||
}
|
||||
|
||||
// Checks that parseData() turns character references into literal text.
// Inputs are written as entity references; expected values are the raw
// characters (non-ASCII ones spelled as UTF-8 byte escapes so the test does
// not depend on this file's own encoding).
function test_parseData() {
    $HP =& $this->HTML_Lexer;

    // plain text passes through untouched
    $this->assertIdentical('asdf', $HP->parseData('asdf'));

    // markup-significant characters, named and numeric forms
    $this->assertIdentical('&', $HP->parseData('&amp;'));
    $this->assertIdentical('"', $HP->parseData('&quot;'));
    $this->assertIdentical("'", $HP->parseData('&#039;'));

    // decimal and hexadecimal numeric references
    $this->assertIdentical('-', $HP->parseData('&#45;'));
    $this->assertIdentical('-', $HP->parseData('&#x2D;'));

    // references outside ASCII must come back as UTF-8
    $this->assertIdentical("\xC3\xA9", $HP->parseData('&eacute;'));
    $this->assertIdentical("\xE2\x80\x94", $HP->parseData('&#8212;'));
}
|
||||
|
||||
function test_tokenizeHTML() {
|
||||
|
||||
$input = array();
|
||||
@ -114,8 +124,8 @@ class Test_HTML_Lexer extends UnitTestCase
|
||||
,new MF_Text('b')
|
||||
,new MF_Text('>')
|
||||
);
|
||||
// however, we may want to change both styles
|
||||
// into parsed: '<b>'. SAX has an option for this
|
||||
// note that SAX can clump text nodes together. We won't be
|
||||
// too picky though
|
||||
|
||||
// [INVALID]
|
||||
$input[10] = '<a "=>';
|
||||
@ -123,6 +133,16 @@ class Test_HTML_Lexer extends UnitTestCase
|
||||
new MF_StartTag('a', array('"' => ''))
|
||||
);
|
||||
|
||||
// [INVALID] [RECOVERABLE]
|
||||
$input[11] = '"';
|
||||
$expect[11] = array( new MF_Text('"') );
|
||||
|
||||
// compare with this valid one:
|
||||
$input[12] = '"';
|
||||
$expect[12] = array( new MF_Text('"') );
|
||||
$sax_expect[12] = false;
|
||||
// SAX chokes on this? We do have entity parsing on, so it should work!
|
||||
|
||||
foreach($input as $i => $discard) {
|
||||
$result = $this->HTML_Lexer->tokenizeHTML($input[$i]);
|
||||
$this->assertEqual($expect[$i], $result);
|
||||
|
Loading…
Reference in New Issue
Block a user