From 8a2371040571c292ee41b4794882b093845a163f Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sat, 5 Aug 2006 01:50:13 +0000 Subject: [PATCH] Implement lang and xml:lang. Fixed a bunch of bugs too. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@162 48356398-32a2-884e-a903-53898d9a118a --- library/HTMLPurifier/AttrDef.php | 2 +- library/HTMLPurifier/AttrDef/Lang.php | 73 ++++++++++++++++ library/HTMLPurifier/AttrTransform.php | 14 ++++ library/HTMLPurifier/AttrTransform/Lang.php | 31 +++++++ library/HTMLPurifier/Definition.php | 24 +++++- .../Strategy/ValidateAttributes.php | 7 +- library/HTMLPurifier/Token.php | 19 ++++- tests/HTMLPurifier/AttrDef/LangTest.php | 83 +++++++++++++++++++ tests/HTMLPurifier/AttrTransform/LangTest.php | 60 ++++++++++++++ .../Strategy/ValidateAttributesTest.php | 4 + tests/index.php | 2 + 11 files changed, 312 insertions(+), 7 deletions(-) create mode 100644 library/HTMLPurifier/AttrDef/Lang.php create mode 100644 library/HTMLPurifier/AttrTransform.php create mode 100644 library/HTMLPurifier/AttrTransform/Lang.php create mode 100644 tests/HTMLPurifier/AttrDef/LangTest.php create mode 100644 tests/HTMLPurifier/AttrTransform/LangTest.php diff --git a/library/HTMLPurifier/AttrDef.php b/library/HTMLPurifier/AttrDef.php index 754d995a..c1cdc7fa 100644 --- a/library/HTMLPurifier/AttrDef.php +++ b/library/HTMLPurifier/AttrDef.php @@ -5,7 +5,7 @@ class HTMLPurifier_AttrDef { function HTMLPurifier_AttrDef() {} - function validate() { + function validate($string, $config = null) { trigger_error('Cannot call abstract function', E_USER_ERROR); } diff --git a/library/HTMLPurifier/AttrDef/Lang.php b/library/HTMLPurifier/AttrDef/Lang.php new file mode 100644 index 00000000..a18b6b5c --- /dev/null +++ b/library/HTMLPurifier/AttrDef/Lang.php @@ -0,0 +1,73 @@ + 8 || !ctype_alnum($subtags[1])) { + return $new_string; + } + if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]); + + $new_string .= '-' . 
$subtags[1]; + if ($num_subtags == 2) return $new_string; + + // process all other subtags, index 2 and up + for ($i = 2; $i < $num_subtags; $i++) { + $length = strlen($subtags[$i]); + if ($length == 0 || $length > 8 || !ctype_alnum($subtags[$i])) { + return $new_string; + } + if (!ctype_lower($subtags[$i])) { + $subtags[$i] = strtolower($subtags[$i]); + } + $new_string .= '-' . $subtags[$i]; + } + + return $new_string; + + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/AttrTransform.php b/library/HTMLPurifier/AttrTransform.php new file mode 100644 index 00000000..8df5d3d2 --- /dev/null +++ b/library/HTMLPurifier/AttrTransform.php @@ -0,0 +1,14 @@ + \ No newline at end of file diff --git a/library/HTMLPurifier/AttrTransform/Lang.php b/library/HTMLPurifier/AttrTransform/Lang.php new file mode 100644 index 00000000..fc0b72ba --- /dev/null +++ b/library/HTMLPurifier/AttrTransform/Lang.php @@ -0,0 +1,31 @@ +attributes['lang']) ? + $token->attributes['lang'] : false; + $xml_lang = isset($token->attributes['xml:lang']) ? + $token->attributes['xml:lang'] : false; + + if ($lang === false && $xml_lang === false) return $token; + + $new_token = $token->copy(); + + if ($lang !== false && $xml_lang === false) { + $new_token->attributes['xml:lang'] = $lang; + } elseif ($xml_lang !== false) { + $new_token->attributes['lang'] = $xml_lang; + } + + return $new_token; + + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/Definition.php b/library/HTMLPurifier/Definition.php index 6af372ec..2ed84ab7 100644 --- a/library/HTMLPurifier/Definition.php +++ b/library/HTMLPurifier/Definition.php @@ -45,6 +45,9 @@ class HTMLPurifier_Definition // used solely by HTMLPurifier_Strategy_RemoveForeignElements var $info_tag_transform = array(); + // used solely by HTMLPurifier_Strategy_ValidateAttributes + var $info_attr_transform = array(); + // WARNING! 
Prototype is not passed by reference, so in order to get // a copy of the real one, you'll have to destroy your copy and // use instance() to get it. @@ -238,11 +241,22 @@ class HTMLPurifier_Definition // which manually override these in their local definitions $this->info_global_attr = array( // core attrs - 'id' => new HTMLPurifier_AttrDef_ID(), + 'id' => new HTMLPurifier_AttrDef_ID(), 'class' => new HTMLPurifier_AttrDef_Class(), 'title' => new HTMLPurifier_AttrDef_Text(), // i18n - 'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false), + 'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false), + 'lang' => new HTMLPurifier_AttrDef_Lang(), + 'xml:lang' => new HTMLPurifier_AttrDef_Lang(), + ); + + // required attribute stipulation handled in attribute transformation + $this->info['bdo']->attr = array(); + + $this->info['br']->attr = array( + 'dir' => false, + 'lang' => false, + 'xml:lang' => false, ); ////////////////////////////////////////////////////////////////////// @@ -275,9 +289,11 @@ class HTMLPurifier_Definition // UNIMP : info[]->attr_transform : attribute transformations in elements ////////////////////////////////////////////////////////////////////// - // UNIMP : info_attr_transform : global attribute transform (for xml:lang) + // info_attr_transform : global attribute transformation that is + // unconditionally called. 
Good for transformations that have complex + // start conditions - // this might have bad implications for performance + $this->info_attr_transform[] = new HTMLPurifier_AttrTransform_Lang(); } diff --git a/library/HTMLPurifier/Strategy/ValidateAttributes.php b/library/HTMLPurifier/Strategy/ValidateAttributes.php index fa382cc3..8793f802 100644 --- a/library/HTMLPurifier/Strategy/ValidateAttributes.php +++ b/library/HTMLPurifier/Strategy/ValidateAttributes.php @@ -26,11 +26,16 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy $d_defs = $this->definition->info_global_attr; foreach ($tokens as $key => $token) { - if ($token->type !== 'start' && $token->type !== 'end') continue; + if ($token->type !== 'start' && $token->type !== 'empty') continue; // DEFINITION CALL $defs = $this->definition->info[$token->name]->attr; + // DEFINITION CALL + foreach ($this->definition->info_attr_transform as $transformer) { + $token = $transformer->transform($token); + } + $attr = $token->attributes; $changed = false; foreach ($attr as $attr_key => $value) { diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php index 87e37f73..ed46621b 100644 --- a/library/HTMLPurifier/Token.php +++ b/library/HTMLPurifier/Token.php @@ -59,7 +59,9 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract if (!isset($attributes[$new_key])) { $attributes[$new_key] = $attributes[$key]; } - unset($attributes[$key]); + if ($new_key !== $key) { + unset($attributes[$key]); + } } } $this->attributes = $attributes; @@ -72,6 +74,9 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag { var $type = 'start'; + function copy() { + return new HTMLPurifier_Token_Start($this->name, $this->attributes); + } } /** @@ -80,6 +85,9 @@ class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag { var $type = 'empty'; + 
function copy() { + return new HTMLPurifier_Token_Empty($this->name, $this->attributes); + } } /** @@ -92,6 +100,9 @@ class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag { var $type = 'end'; + function copy() { + return new HTMLPurifier_Token_End($this->name); + } } /** @@ -120,6 +131,9 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token $this->data = $data; $this->is_whitespace = ctype_space($data); } + function copy() { + return new HTMLPurifier_Token_Text($this->data); + } } @@ -138,6 +152,9 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token function HTMLPurifier_Token_Comment($data) { $this->data = $data; } + function copy() { + return new HTMLPurifier_Token_Comment($this->data); + } } ?> \ No newline at end of file diff --git a/tests/HTMLPurifier/AttrDef/LangTest.php b/tests/HTMLPurifier/AttrDef/LangTest.php new file mode 100644 index 00000000..216a7a88 --- /dev/null +++ b/tests/HTMLPurifier/AttrDef/LangTest.php @@ -0,0 +1,83 @@ +def = new HTMLPurifier_AttrDef_Lang(); + + // basic good uses + $this->assertDef('en'); + $this->assertDef('en-us'); + + $this->assertDef(' en ', 'en'); // trim + $this->assertDef('EN', 'en'); // case insensitivity + + $this->assertDef('fr en', false); // multiple languages + $this->assertDef('%', false); // bad character + + // test overlong language according to syntax + $this->assertDef('thisistoolongsoitgetscut', false); + + // primary subtag rules + // I'm somewhat hesitant to allow x and i as primary language codes, + // because they usually are never used in real life. However, + // theoretically speaking, having them alone is permissible, so + // I'll be lenient. No XML parser is going to complain anyway. 
+ $this->assertDef('x'); + $this->assertDef('i'); + // real world use-cases + $this->assertDef('x-klingon'); + $this->assertDef('i-mingo'); + // because the RFC only defines two and three letter primary codes, + // anything with a length of four or greater is invalid, despite + // the syntax stipulation of 1 to 8 characters. Because the RFC + // specifically states that this reservation is in order to allow + // for future versions to expand, the adoption of a new RFC will + // require these test cases to be rewritten, even if backwards- + // compatibility is largely retained (i.e. this is not forwards + // compatible) + $this->assertDef('four', false); + // for similar reasons, disallow any other one character language + $this->assertDef('f', false); + + // second subtag rules + // one letter subtags prohibited until revision. This is, however, + // less volatile than the restrictions on the primary subtags. + // Also note that this test-case tests fix-behavior: chop + // off subtags until you get a valid language code. + $this->assertDef('en-a', 'en'); + // 2-8 chars are permitted, but have special meaning that cannot + // be checked without maintaining country code lookup tables (for + // two characters) or special registration tables (for all above). + $this->assertDef('en-uk', true); + + // further subtag rules: only syntactic constraints + $this->assertDef('en-us-edison'); + $this->assertDef('en-us-toolonghaha', 'en-us'); + $this->assertDef('en-us-a-silly-long-one'); + + // rfc 3066 stipulates that if a three letter and a two letter code + // are available, the two letter one MUST be used. Without a language + // code lookup table, we cannot implement this functionality. + + // although the HTML protocol, technically speaking, allows you to + // omit language tags, this implicitly means that the parent element's + // language is the one applicable, which, in some cases, is incorrect. 
+ // Thus, we allow und, only slightly defying the RFC's SHOULD NOT + // designation. + $this->assertDef('und'); + + // because attributes only allow one language, mul is allowed, complying + // with the RFC's SHOULD NOT designation. + $this->assertDef('mul'); + + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/AttrTransform/LangTest.php b/tests/HTMLPurifier/AttrTransform/LangTest.php new file mode 100644 index 00000000..fa9273c9 --- /dev/null +++ b/tests/HTMLPurifier/AttrTransform/LangTest.php @@ -0,0 +1,60 @@ + 'en')); + $expect[1] = new HTMLPurifier_Token_Start('span', + array('lang' => 'en', + 'xml:lang' => 'en')); + + // empty tags must work too, also test attribute preservation + $inputs[2] = new HTMLPurifier_Token_Empty('img', + array('src' => 'seine.png', + 'lang' => 'fr')); + $expect[2] = new HTMLPurifier_Token_Empty('img', + array('src' => 'seine.png', + 'lang' => 'fr', + 'xml:lang' => 'fr')); + + // copy xml:lang to lang + $inputs[3] = new HTMLPurifier_Token_Start('span', + array('xml:lang' => 'en')); + $expect[3] = new HTMLPurifier_Token_Start('span', + array('lang' => 'en', + 'xml:lang' => 'en')); + + // both set, override lang with xml:lang + $inputs[4] = new HTMLPurifier_Token_Start('span', + array('lang' => 'fr', + 'xml:lang' => 'de')); + $expect[4] = new HTMLPurifier_Token_Start('span', + array('lang' => 'de', + 'xml:lang' => 'de')); + + foreach ($inputs as $i => $input) { + $result = $transform->transform($input); + $this->assertEqual($expect[$i], $result, "Test $i: %s"); + } + + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php b/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php index dbf7aea2..078cde7a 100644 --- a/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php +++ b/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php @@ -59,6 +59,10 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends $inputs[10] = 'PHP'; $expect[10] = $inputs[10]; + // 
test lang (NEEDS CORRECTION!) + $inputs[11] = 'La soupe.'; + $expect[11] = 'La soupe.'; + $this->assertStrategyWorks($strategy, $inputs, $expect, $config); } diff --git a/tests/index.php b/tests/index.php index 190c3b91..5ff54b6c 100644 --- a/tests/index.php +++ b/tests/index.php @@ -42,8 +42,10 @@ $test->addTestFile('HTMLPurifier/AttrDef/EnumTest.php'); $test->addTestFile('HTMLPurifier/AttrDef/IDTest.php'); $test->addTestFile('HTMLPurifier/AttrDef/ClassTest.php'); $test->addTestFile('HTMLPurifier/AttrDef/TextTest.php'); +$test->addTestFile('HTMLPurifier/AttrDef/LangTest.php'); $test->addTestFile('HTMLPurifier/IDAccumulatorTest.php'); $test->addTestFile('HTMLPurifier/TagTransformTest.php'); +$test->addTestFile('HTMLPurifier/AttrTransform/LangTest.php'); if (SimpleReporter::inCli()) $reporter = new TextReporter(); else $reporter = new HTMLReporter();