mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-23 00:41:52 +00:00
Begin getting parsing of character data into shape, not done yet.
git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@60 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
dadfa87acc
commit
3427c6c079
@ -15,6 +15,51 @@ TODO:
|
|||||||
class HTML_Lexer
|
class HTML_Lexer
|
||||||
{
|
{
|
||||||
|
|
||||||
|
// does this version of PHP support utf8 as entity function charset?
|
||||||
|
var $_entity_utf8;
|
||||||
|
|
||||||
|
// Constructor: records once whether the running PHP is version 5 or
// later, which parseData() uses to decide if html_entity_decode() can be
// asked for UTF-8 output directly.
function HTML_Lexer() {
    // version_compare() without an operator returns -1, 0 or 1
    $is_php5_or_later = version_compare(PHP_VERSION, '5') >= 0;
    $this->_entity_utf8 = $is_php5_or_later;
}
|
||||||
|
|
||||||
|
// this is QUITE a knotty problem
|
||||||
|
//
|
||||||
|
// The main trouble is that, even while assuming UTF-8 is what we're
|
||||||
|
// using, we've got to deal with HTML entities (like &mdash;)
|
||||||
|
// Not even sure if the PHP 5 decoding function does that. Plus,
|
||||||
|
// SimpleTest doesn't use UTF-8!
|
||||||
|
//
|
||||||
|
// However, we MUST parse everything possible, because once you get
|
||||||
|
// to the HTML generator, it will escape everything possible (although
|
||||||
|
// that may not be correct, and we should be using htmlspecialchars() ).
|
||||||
|
//
|
||||||
|
// Nevertheless, strictly XML speaking, we cannot assume any character
|
||||||
|
// entities are defined except the htmlspecialchars() ones, so leaving
|
||||||
|
// the entities inside HERE is not acceptable. (plus, htmlspecialchars
|
||||||
|
// might convert them anyway). So EVERYTHING must get parsed.
|
||||||
|
//
|
||||||
|
// We may need to roll our own character entity lookup table. It's only
|
||||||
|
// about 250; fortunately, the decimal/hex ones map cleanly to UTF-8.
|
||||||
|
/**
 * Decodes HTML character references found in a chunk of character data.
 *
 * The input is treated as UTF-8 and every reference that gets decoded is
 * emitted as a UTF-8 byte sequence.
 *
 * @param string $string Character data that may contain entities.
 * @return string The data with recognised references replaced.
 */
function parseData($string) {
    // we may want to let the user do a different char encoding,
    // although there is NO REASON why they shouldn't be able
    // to convert it to UTF-8 before they pass it to us

    // no support for less than PHP 4.3
    if ($this->_entity_utf8) {
        // PHP 5+, UTF-8 is nicely supported
        // NOTE(review): the @ presumably hides charset warnings on some
        // builds -- confirm whether it is still needed.
        return @html_entity_decode($string, ENT_QUOTES, 'UTF-8');
    }

    // PHP 4, do compat stuff. Named, decimal and hexadecimal references
    // are all resolved in ONE pass so that already-decoded text is never
    // decoded a second time (e.g. "&amp;#65;" must become "&#65;", not "A").
    return preg_replace_callback(
        '/&(?:#x([0-9a-f]+)|#(\d+)|([a-z][a-z0-9]*));/i',
        array($this, '_decodeEntity'),
        $string
    );
}

/**
 * preg_replace_callback() handler: turns one matched reference into UTF-8.
 *
 * @param array $matches [0] whole reference, [1] hex digits,
 *                       [2] decimal digits, [3] entity name. Trailing
 *                       groups that did not take part are absent.
 * @return string UTF-8 replacement, or the reference unchanged when it is
 *                unknown or names an invalid code point.
 */
function _decodeEntity($matches) {
    $entity = $matches[0];

    if (isset($matches[3])) {
        // named entity: let PHP look it up in its Latin-1 table; a hit
        // comes back as exactly one byte whose value is the code point
        $decoded = html_entity_decode($entity, ENT_QUOTES, 'ISO-8859-1');
        if (strlen($decoded) !== 1) {
            return $entity; // not in the table, leave it alone
        }
        return $this->_codepointToUtf8(ord($decoded));
    }

    if (isset($matches[2])) {
        $code = (int) $matches[2];      // decimal: &#8212;
    } else {
        $code = hexdec($matches[1]);    // hexadecimal: &#x2014;
    }

    // reject NUL, surrogates and anything beyond the Unicode range
    if ($code < 1 || $code > 0x10FFFF
        || ($code >= 0xD800 && $code <= 0xDFFF)) {
        return $entity;
    }
    return $this->_codepointToUtf8((int) $code);
}

/**
 * Encodes a single Unicode code point as a UTF-8 byte string.
 *
 * @param int $code Code point; callers pass values in 1..0x10FFFF.
 * @return string One to four bytes.
 */
function _codepointToUtf8($code) {
    if ($code < 0x80) {
        return chr($code);
    }
    if ($code < 0x800) {
        return chr(0xC0 | ($code >> 6))
             . chr(0x80 | ($code & 0x3F));
    }
    if ($code < 0x10000) {
        return chr(0xE0 | ($code >> 12))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }
    return chr(0xF0 | ($code >> 18))
         . chr(0x80 | (($code >> 12) & 0x3F))
         . chr(0x80 | (($code >> 6) & 0x3F))
         . chr(0x80 | ($code & 0x3F));
}
|
||||||
|
|
||||||
function nextQuote($string, $offset = 0) {
|
function nextQuote($string, $offset = 0) {
|
||||||
$quotes = array('"', "'");
|
$quotes = array('"', "'");
|
||||||
return $this->next($string, $quotes, $offset);
|
return $this->next($string, $quotes, $offset);
|
||||||
@ -80,7 +125,8 @@ class HTML_Lexer
|
|||||||
html_entity_decode(
|
html_entity_decode(
|
||||||
substr(
|
substr(
|
||||||
$string, $cursor, $position_next_lt - $cursor
|
$string, $cursor, $position_next_lt - $cursor
|
||||||
)
|
),
|
||||||
|
ENT_QUOTES
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
$cursor = $position_next_lt + 1;
|
$cursor = $position_next_lt + 1;
|
||||||
@ -96,7 +142,8 @@ class HTML_Lexer
|
|||||||
html_entity_decode(
|
html_entity_decode(
|
||||||
substr(
|
substr(
|
||||||
$string, $cursor
|
$string, $cursor
|
||||||
)
|
),
|
||||||
|
ENT_QUOTES
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
@ -175,7 +222,8 @@ class HTML_Lexer
|
|||||||
MF_Text(
|
MF_Text(
|
||||||
'<' .
|
'<' .
|
||||||
html_entity_decode(
|
html_entity_decode(
|
||||||
substr($string, $cursor)
|
substr($string, $cursor),
|
||||||
|
ENT_QUOTES
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
@ -273,7 +321,7 @@ class HTML_Lexer
|
|||||||
$value = substr($string, $position_next_quote + 1,
|
$value = substr($string, $position_next_quote + 1,
|
||||||
$position_end_quote - $position_next_quote - 1);
|
$position_end_quote - $position_next_quote - 1);
|
||||||
if ($key) {
|
if ($key) {
|
||||||
$array[$key] = html_entity_decode($value);
|
$array[$key] = html_entity_decode($value, ENT_QUOTES);
|
||||||
}
|
}
|
||||||
$cursor = $position_end_quote + 1;
|
$cursor = $position_end_quote + 1;
|
||||||
} else {
|
} else {
|
||||||
|
@ -25,6 +25,16 @@ class Test_HTML_Lexer extends UnitTestCase
|
|||||||
$this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as"));
|
$this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that parseData() leaves plain text alone and decodes named,
// decimal and hexadecimal character references.
// NOTE(review): the entity spellings below were reconstructed from the
// expected values -- confirm them against the upstream test file.
function test_parseData() {
    $HP =& $this->HTML_Lexer;
    // plain text passes through untouched
    $this->assertIdentical('asdf', $HP->parseData('asdf'));
    // named entities
    $this->assertIdentical('&', $HP->parseData('&amp;'));
    $this->assertIdentical('"', $HP->parseData('&quot;'));
    // decimal and hexadecimal numeric references
    $this->assertIdentical("'", $HP->parseData('&#039;'));
    $this->assertIdentical('-', $HP->parseData('&#x2D;'));
    // UTF-8 needed!!!
}
|
||||||
|
|
||||||
function test_tokenizeHTML() {
|
function test_tokenizeHTML() {
|
||||||
|
|
||||||
$input = array();
|
$input = array();
|
||||||
@ -114,8 +124,8 @@ class Test_HTML_Lexer extends UnitTestCase
|
|||||||
,new MF_Text('b')
|
,new MF_Text('b')
|
||||||
,new MF_Text('>')
|
,new MF_Text('>')
|
||||||
);
|
);
|
||||||
// however, we may want to change both styles
|
// note that SAX can clump text nodes together. We won't be
|
||||||
// into parsed: '<b>'. SAX has an option for this
|
// too picky though
|
||||||
|
|
||||||
// [INVALID]
|
// [INVALID]
|
||||||
$input[10] = '<a "=>';
|
$input[10] = '<a "=>';
|
||||||
@ -123,6 +133,16 @@ class Test_HTML_Lexer extends UnitTestCase
|
|||||||
new MF_StartTag('a', array('"' => ''))
|
new MF_StartTag('a', array('"' => ''))
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// [INVALID] [RECOVERABLE]
|
||||||
|
$input[11] = '"';
|
||||||
|
$expect[11] = array( new MF_Text('"') );
|
||||||
|
|
||||||
|
// compare with this valid one:
|
||||||
|
$input[12] = '"';
|
||||||
|
$expect[12] = array( new MF_Text('"') );
|
||||||
|
$sax_expect[12] = false;
|
||||||
|
// SAX chokes on this? We do have entity parsing on, so it should work!
|
||||||
|
|
||||||
foreach($input as $i => $discard) {
|
foreach($input as $i => $discard) {
|
||||||
$result = $this->HTML_Lexer->tokenizeHTML($input[$i]);
|
$result = $this->HTML_Lexer->tokenizeHTML($input[$i]);
|
||||||
$this->assertEqual($expect[$i], $result);
|
$this->assertEqual($expect[$i], $result);
|
||||||
|
Loading…
Reference in New Issue
Block a user