From f145f64bf482e10f91410f2ea5fbd0652278fa24 Mon Sep 17 00:00:00 2001 From: mpyw Date: Sat, 4 Mar 2017 15:38:01 +0900 Subject: [PATCH 1/2] Fix #122: correct surrogate pair range --- library/HTMLPurifier/Encoder.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php index fef9b589..054986ea 100644 --- a/library/HTMLPurifier/Encoder.php +++ b/library/HTMLPurifier/Encoder.php @@ -255,7 +255,7 @@ class HTMLPurifier_Encoder // 7F-9F is not strictly prohibited by XML, // but it is non-SGML, and thus we don't allow it (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) || - (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4) + (0xE000 <= $mUcs4 && 0x10FFFF >= $mUcs4) ) ) { $out .= $char; From d16e73e63e0e8c10765d91e1a4fe9a8aca2e36a6 Mon Sep 17 00:00:00 2001 From: mpyw Date: Sat, 4 Mar 2017 15:40:44 +0900 Subject: [PATCH 2/2] Add test for #122 --- tests/HTMLPurifier/EncoderTest.php | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php index 819d4b11..c43e4240 100644 --- a/tests/HTMLPurifier/EncoderTest.php +++ b/tests/HTMLPurifier/EncoderTest.php @@ -23,6 +23,7 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness $this->assertCleanUTF8('Normal string.'); $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters"); $this->assertCleanUTF8("null byte: \0", 'null byte: '); + $this->assertCleanUTF8("あ(い)う(え)お\0", "あ(い)う(え)お"); // test for issue #122 $this->assertCleanUTF8("\1\2\3\4\5\6\7", ''); $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML