Fix DOMLex bug where DOM's loadHTML() does not default to UTF-8 for HTML docs, by declaring the charset in a <meta> tag.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@252 48356398-32a2-884e-a903-53898d9a118a
2024-11-09 23:28:42 +00:00 · 2006-08-14 13:27:18 +00:00 · 2006-08-14 13:27:18 +00:00 · 299236f695
commit 299236f695
parent ebf0da9b78
3 changed files with 45 additions and 4 deletions
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@ -27,18 +27,23 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
    public function tokenizeHTML($string) {
        $doc = new DOMDocument();
-        
+        $doc->encoding = 'UTF-8'; // technically does nothing, but comprehensive
        // preprocess string
        $string = '<html><body><div>'.$string.'</div></body></html>';
        // replace and escape the CDATA sections, since parsing under HTML
        // mode won't get 'em.
        $string = $this->escapeCDATA($string);
        // preprocess string, essential for UTF-8
        $string =
        '<html><head>'.
        '<meta http-equiv="Content-Type" content="text/html;'.
            ' charset=utf-8" />'.
        '</head><body><div>'.$string.'</div></body></html>';
        @$doc->loadHTML($string); // mute all errors, handle it transparently
        return $this->tokenizeDOM(
            $doc->childNodes->item(1)-> // html
-                  childNodes->item(0)-> // body
+                  childNodes->item(1)-> // body
                  childNodes->item(0)   // div
            );
    }
--- a/smoketests/utf8.php
+++ b/smoketests/utf8.php
@ -0,0 +1,31 @@
 <!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html>
 <head>
 <title>HTMLPurifier UTF-8 Smoketest</title>
 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
 </head>
 <body>
 <h1>HTMLPurifier UTF-8 Smoketest</h1>
 <?php
 set_include_path('../library' . PATH_SEPARATOR . get_include_path());
 require_once 'HTMLPurifier.php';
 $purifier = new HTMLPurifier();
 $string = '
 <ul>
    <li><b>Chinese</b> - 太極拳</li>
    <li><b>Russian</b> - ЊЎЖ</li>
    <li><b>Arabic</b> - لمنس</li>
 </ul>
 ';
 ?>
 <h2>Raw</h2>
 <?php echo $string; ?>
 <h2>Purified</h2>
 <?php echo $purifier->purify($string); ?>
 </body>
 </html>
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -216,6 +216,11 @@ class HTMLPurifier_LexerTest extends UnitTestCase
            );
        $sax_expect[16] = false; // PEARSax doesn't support it!
        // test that UTF-8 is preserved
        $char_hearts = $this->_entity_lookup->table['hearts'];
        $input[17] = $char_hearts;
        $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
        foreach($input as $i => $discard) {
            $result = $this->DirectLex->tokenizeHTML($input[$i]);
            $this->assertEqual($expect[$i], $result, 'DirectLexTest '.$i.': %s');