From 6b643ede02eca7331d2b03124c676ff14a372173 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 26 Dec 2011 15:34:42 +0800 Subject: [PATCH] Implement %HTML.AllowedComments and %HTML.AllowedCommentsRegexp Signed-off-by: Edward Z. Yang --- NEWS | 2 ++ configdoc/usage.xml | 16 +++++++-- library/HTMLPurifier/ConfigSchema/schema.ser | Bin 14224 -> 14435 bytes .../schema/HTML.AllowedComments.txt | 10 ++++++ .../schema/HTML.AllowedCommentsRegexp.txt | 15 +++++++++ .../Strategy/RemoveForeignElements.php | 31 ++++++++++++++---- .../Strategy/RemoveForeignElementsTest.php | 10 ++++++ .../RemoveForeignElements_ErrorsTest.php | 4 +-- 8 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt create mode 100644 library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt diff --git a/NEWS b/NEWS index b90be793..e86b35b4 100644 --- a/NEWS +++ b/NEWS @@ -18,6 +18,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ! Added %HTML.TargetBlank, which adds target="blank" to all outgoing links. ! Properly handle sub-lists directly nested inside of lists in a standards compliant way, by moving them into the preceding
  • +! Added %HTML.AllowedComments and %HTML.AllowedCommentsRegexp for + limited allowed comments in untrusted situations. - Color keywords are now case insensitive. Thanks Yzmir Ramirez for reporting. - Explicitly initialize anonModule variable to null. diff --git a/configdoc/usage.xml b/configdoc/usage.xml index 5c69264d..17df6357 100644 --- a/configdoc/usage.xml +++ b/configdoc/usage.xml @@ -14,7 +14,7 @@ 348 - 47 + 50 @@ -478,14 +478,24 @@ 19 - + + + 24 + + + 25 + + + 28 + + - 26 + 29 diff --git a/library/HTMLPurifier/ConfigSchema/schema.ser b/library/HTMLPurifier/ConfigSchema/schema.ser index e900bac9cc6ab573f5367f0bd0650b659612128f..aedf3d2b47a4e8e58a88f325bf8bba72fde1472c 100644 GIT binary patch delta 203 zcmbP`|F~d+Iiu0!Y9-l?p($Jj&iT2ysd**EO4f;123FOz#a2dUR!SZrzCL=6IXU^| zsVOiOL8HV2&a|?yCS*NGm9F9DD19>y E074f=MF0Q* delta 58 zcmaD{Fd=_}Iitbm)q;&&lN}7WHrL2RvQ17B^gtEsVq-L(yjoCpvx@Rnp3M#h${YY) C_Y)=n diff --git a/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt b/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt new file mode 100644 index 00000000..fe7406f2 --- /dev/null +++ b/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt @@ -0,0 +1,10 @@ +HTML.AllowedComments +TYPE: lookup +VERSION: 4.3.1 +DEFAULT: array() +--DESCRIPTION-- +A whitelist which indicates what explicit comment bodies should be +allowed, modulo leading and trailing whitespace. See also %HTML.AllowedCommentsRegexp +(these directives are union'ed together, so a comment is considered +valid if any directive deems it valid.) +--# vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt b/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt new file mode 100644 index 00000000..edc4bec2 --- /dev/null +++ b/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt @@ -0,0 +1,15 @@ +HTML.AllowedCommentsRegexp +TYPE: string/null +VERSION: 4.3.1 +DEFAULT: NULL +--DESCRIPTION-- +A regexp, which if it matches the body of a comment, indicates that +it should be allowed. Trailing and leading spaces are removed prior +to running this regular expression. +Warning: Make sure you specify +correct anchor metacharacters ^regex$, otherwise you may accept +comments that you did not mean to! In particular, the regex /foo|bar/ +is probably not sufficiently strict, since it also allows foobar. +See also %HTML.AllowedComments (these directives are union'ed together, +so a comment is considered valid if any directive deems it valid.) +--# vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Strategy/RemoveForeignElements.php b/library/HTMLPurifier/Strategy/RemoveForeignElements.php index cf3a33e4..bccaf14d 100644 --- a/library/HTMLPurifier/Strategy/RemoveForeignElements.php +++ b/library/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -21,6 +21,9 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy // currently only used to determine if comments should be kept $trusted = $config->get('HTML.Trusted'); + $comment_lookup = $config->get('HTML.AllowedComments'); + $comment_regexp = $config->get('HTML.AllowedCommentsRegexp'); + $check_comments = $comment_lookup !== array() || $comment_regexp !== null; $remove_script_contents = $config->get('Core.RemoveScriptContents'); $hidden_elements = $config->get('Core.HiddenElements'); @@ -128,23 +131,37 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy if ($textify_comments !== false) { $data = $token->data; $token = new HTMLPurifier_Token_Text($data); - } elseif ($trusted) { - // keep, but perform comment cleaning + } elseif ($trusted || $check_comments) { + // always cleanup comments + $trailing_hyphen = false; if ($e) { // perform check whether or not there's a trailing hyphen if (substr($token->data, -1) == '-') { - $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed'); + $trailing_hyphen = true; } } $token->data = rtrim($token->data, '-'); $found_double_hyphen = false; while (strpos($token->data, '--') !== false) { - if ($e && !$found_double_hyphen) { - $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed'); - } - $found_double_hyphen = true; // prevent double-erroring + $found_double_hyphen = true; $token->data = str_replace('--', '-', $token->data); } + if ($trusted || !empty($comment_lookup[trim($token->data)]) || ($comment_regexp !== NULL && preg_match($comment_regexp, trim($token->data)))) { + // OK good + if ($e) { + if ($trailing_hyphen) { + $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed'); + } + if ($found_double_hyphen) { + $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed'); + } + } + } else { + if ($e) { + $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed'); + } + continue; + } } else { // strip comments if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed'); diff --git a/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php b/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php index 793bd938..b3ca1646 100644 --- a/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php +++ b/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php @@ -100,6 +100,16 @@ alert(<b>bold</b>); $this->assertResult('', ''); } + function testPreserveCommentsWithLookup() { + $this->config->set('HTML.AllowedComments', array('allowed')); + $this->assertResult('', ''); + } + + function testPreserveCommentsWithRegexp() { + $this->config->set('HTML.AllowedCommentsRegexp', '/^allowed[1-9]$/'); + $this->assertResult('', ''); + } + } // vim: et sw=4 sts=4 diff --git a/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php b/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php index 98eb0b4e..4b4e3107 100644 --- a/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php +++ b/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php @@ -48,14 +48,14 @@ class HTMLPurifier_Strategy_RemoveForeignElements_ErrorsTest extends HTMLPurifie function testTrailingHyphenInCommentRemoved() { $this->config->set('HTML.Trusted', true); $this->expectErrorCollection(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed'); - $this->expectContext('CurrentToken', new HTMLPurifier_Token_Comment(' test --', 1)); + $this->expectContext('CurrentToken', new HTMLPurifier_Token_Comment(' test ', 1)); $this->invoke(''); } function testDoubleHyphenInCommentRemoved() { $this->config->set('HTML.Trusted', true); $this->expectErrorCollection(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed'); - $this->expectContext('CurrentToken', new HTMLPurifier_Token_Comment(' test --- test -- test ', 1)); + $this->expectContext('CurrentToken', new HTMLPurifier_Token_Comment(' test - test - test ', 1)); $this->invoke(''); }