0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-11-10 07:38:41 +00:00

Implement lang and xml:lang. Fixed a bunch of bugs too.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@162 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-08-05 01:50:13 +00:00
parent 1945ddca5c
commit 8a23710405
11 changed files with 312 additions and 7 deletions

View File

@ -5,7 +5,7 @@ class HTMLPurifier_AttrDef
{ {
function HTMLPurifier_AttrDef() {} function HTMLPurifier_AttrDef() {}
function validate() { function validate($string, $config = null) {
trigger_error('Cannot call abstract function', E_USER_ERROR); trigger_error('Cannot call abstract function', E_USER_ERROR);
} }

View File

@ -0,0 +1,73 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
// built according to RFC 3066, which obsoleted RFC 1766
/**
 * Validates the value of a lang / xml:lang attribute as a language tag
 * made of '-' separated subtags.
 *
 * Rather than rejecting a tag outright when a later subtag is malformed,
 * the validator keeps the longest valid prefix (e.g. 'en-a' becomes 'en').
 * Only a bad primary subtag makes the whole value invalid.
 */
class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
{

    /**
     * Checks and normalizes a language tag.
     *
     * @param string $string Raw attribute value.
     * @param mixed  $config Unused here; accepted so the signature stays
     *                       compatible with HTMLPurifier_AttrDef::validate().
     * @return string|false  The lowercased tag, truncated before the first
     *                       invalid subtag, or false when the primary
     *                       subtag is not acceptable.
     */
    function validate($string, $config = null) {

        $string = trim($string);
        if (!$string) return false;

        $subtags = explode('-', $string);
        $num_subtags = count($subtags);

        if ($num_subtags == 0) return false; // sanity check

        // primary subtag: either the single letters 'x' / 'i', or a
        // two to three letter alphabetic code
        $length = strlen($subtags[0]);
        switch ($length) {
            case 0:
                return false;
            case 1:
                if (! ($subtags[0] == 'x' || $subtags[0] == 'i') ) {
                    return false;
                }
                break;
            case 2:
            case 3:
                if (! ctype_alpha($subtags[0]) ) {
                    return false;
                }
                // lowercasing an already lowercase string is a no-op, so
                // no ctype_lower() pre-check is needed
                $subtags[0] = strtolower($subtags[0]);
                break;
            default:
                return false;
        }

        $new_string = $subtags[0];

        // remaining subtags: alphanumeric, at most 8 characters. The second
        // subtag (index 1) additionally needs at least 2 characters, later
        // ones at least 1. Stop at the first offender and return what has
        // been accepted so far.
        for ($i = 1; $i < $num_subtags; $i++) {
            $subtag = $subtags[$i];
            $length = strlen($subtag);
            $min_length = ($i == 1) ? 2 : 1;
            if ($length < $min_length || $length > 8 || !ctype_alnum($subtag)) {
                return $new_string;
            }
            $new_string .= '-' . strtolower($subtag);
        }

        return $new_string;

    }

}
?>

View File

@ -0,0 +1,14 @@
<?php
// AttrTransform = Attribute Transformation, when handling one attribute
// isn't enough
/**
 * Base class for attribute transformations. Meant to be subclassed:
 * calling transform() on this class itself raises an error.
 */
class HTMLPurifier_AttrTransform
{

    /**
     * Constructor; does nothing.
     */
    function HTMLPurifier_AttrTransform()
    {
    }

    /**
     * Placeholder for the transformation subclasses provide.
     *
     * @param mixed $token  Token to transform (not used by this stub).
     * @param mixed $config Optional configuration (not used by this stub).
     */
    function transform($token, $config = null)
    {
        $message = 'Cannot call abstract function';
        trigger_error($message, E_USER_ERROR);
    }

}
?>

View File

@ -0,0 +1,31 @@
<?php
require_once 'HTMLPurifier/AttrTransform.php';
/**
 * Keeps the lang and xml:lang attributes of a token in sync.
 *
 * If only one of the two is present its value is copied to the other;
 * if both are present, xml:lang overrides lang.
 */
class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
{

    /**
     * Mirrors lang / xml:lang on a copy of the token.
     *
     * @param object $token  Token exposing an attributes array and copy().
     * @param mixed  $config Unused here; accepted so the signature stays
     *                       compatible with HTMLPurifier_AttrTransform::transform().
     * @return object The original token when neither attribute is set,
     *                otherwise a modified copy.
     */
    function transform($token, $config = null) {

        $lang = isset($token->attributes['lang']) ?
            $token->attributes['lang'] : false;
        $xml_lang = isset($token->attributes['xml:lang']) ?
            $token->attributes['xml:lang'] : false;

        // strict comparison on both sides: an attribute that is present
        // but empty (or '0') must still be mirrored, not mistaken for absent
        if ($lang === false && $xml_lang === false) return $token;

        $new_token = $token->copy();

        if ($xml_lang !== false) {
            // xml:lang takes precedence whenever it is present
            $new_token->attributes['lang'] = $xml_lang;
        } else {
            // only lang is present (guaranteed by the guard above)
            $new_token->attributes['xml:lang'] = $lang;
        }

        return $new_token;

    }

}
?>

View File

@ -45,6 +45,9 @@ class HTMLPurifier_Definition
// used solely by HTMLPurifier_Strategy_RemoveForeignElements // used solely by HTMLPurifier_Strategy_RemoveForeignElements
var $info_tag_transform = array(); var $info_tag_transform = array();
// used solely by HTMLPurifier_Strategy_ValidateAttributes
var $info_attr_transform = array();
// WARNING! Prototype is not passed by reference, so in order to get // WARNING! Prototype is not passed by reference, so in order to get
// a copy of the real one, you'll have to destroy your copy and // a copy of the real one, you'll have to destroy your copy and
// use instance() to get it. // use instance() to get it.
@ -243,6 +246,17 @@ class HTMLPurifier_Definition
'title' => new HTMLPurifier_AttrDef_Text(), 'title' => new HTMLPurifier_AttrDef_Text(),
// i18n // i18n
'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false), 'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false),
'lang' => new HTMLPurifier_AttrDef_Lang(),
'xml:lang' => new HTMLPurifier_AttrDef_Lang(),
);
// required attribute stipulation handled in attribute transformation
$this->info['bdo']->attr = array();
$this->info['br']->attr = array(
'dir' => false,
'lang' => false,
'xml:lang' => false,
); );
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
@ -275,9 +289,11 @@ class HTMLPurifier_Definition
// UNIMP : info[]->attr_transform : attribute transformations in elements // UNIMP : info[]->attr_transform : attribute transformations in elements
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// UNIMP : info_attr_transform : global attribute transform (for xml:lang) // info_attr_transform : global attribute transformation that is
// unconditionally called. Good for transformations that have complex
// start conditions
// this might have bad implications for performance $this->info_attr_transform[] = new HTMLPurifier_AttrTransform_Lang();
} }

View File

@ -26,11 +26,16 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
$d_defs = $this->definition->info_global_attr; $d_defs = $this->definition->info_global_attr;
foreach ($tokens as $key => $token) { foreach ($tokens as $key => $token) {
if ($token->type !== 'start' && $token->type !== 'end') continue; if ($token->type !== 'start' && $token->type !== 'empty') continue;
// DEFINITION CALL // DEFINITION CALL
$defs = $this->definition->info[$token->name]->attr; $defs = $this->definition->info[$token->name]->attr;
// DEFINITION CALL
foreach ($this->definition->info_attr_transform as $transformer) {
$token = $transformer->transform($token);
}
$attr = $token->attributes; $attr = $token->attributes;
$changed = false; $changed = false;
foreach ($attr as $attr_key => $value) { foreach ($attr as $attr_key => $value) {

View File

@ -59,9 +59,11 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
if (!isset($attributes[$new_key])) { if (!isset($attributes[$new_key])) {
$attributes[$new_key] = $attributes[$key]; $attributes[$new_key] = $attributes[$key];
} }
if ($new_key !== $key) {
unset($attributes[$key]); unset($attributes[$key]);
} }
} }
}
$this->attributes = $attributes; $this->attributes = $attributes;
} }
} }
@ -72,6 +74,9 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{ {
var $type = 'start'; var $type = 'start';
function copy() {
return new HTMLPurifier_Token_Start($this->name, $this->attributes);
}
} }
/** /**
@ -80,6 +85,9 @@ class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{ {
var $type = 'empty'; var $type = 'empty';
function copy() {
return new HTMLPurifier_Token_Empty($this->name, $this->attributes);
}
} }
/** /**
@ -92,6 +100,9 @@ class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{ {
var $type = 'end'; var $type = 'end';
function copy() {
return new HTMLPurifier_Token_End($this->name);
}
} }
/** /**
@ -120,6 +131,9 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
$this->data = $data; $this->data = $data;
$this->is_whitespace = ctype_space($data); $this->is_whitespace = ctype_space($data);
} }
function copy() {
return new HTMLPurifier_Token_Text($this->data);
}
} }
@ -138,6 +152,9 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
function HTMLPurifier_Token_Comment($data) { function HTMLPurifier_Token_Comment($data) {
$this->data = $data; $this->data = $data;
} }
function copy() {
return new HTMLPurifier_Token_Comment($this->data);
}
} }
?> ?>

View File

@ -0,0 +1,83 @@
<?php
require_once 'HTMLPurifier/AttrDefHarness.php';
require_once 'HTMLPurifier/AttrDef/Lang.php';
/**
 * Exercises HTMLPurifier_AttrDef_Lang through the assertDef() helper
 * inherited from HTMLPurifier_AttrDefHarness.
 */
class HTMLPurifier_AttrDef_LangTest extends HTMLPurifier_AttrDefHarness
{

    /**
     * Runs every language-tag case against a fresh validator.
     *
     * assertDef() is called either with just the input or with the input
     * and an expected result; the one-argument form presumably means
     * "accepted unchanged" (defined by the harness, not visible here).
     */
    function test() {

        $this->def = new HTMLPurifier_AttrDef_Lang();

        // --- basic usage -------------------------------------------------

        // plain, well-formed tags
        foreach (array('en', 'en-us') as $tag) {
            $this->assertDef($tag);
        }

        // input => expected result
        $basic_cases = array(
            ' en '  => 'en',   // surrounding whitespace is trimmed
            'EN'    => 'en',   // case insensitivity
            'fr en' => false,  // multiple languages
            '%'     => false,  // bad character
            // overlong language according to syntax
            'thisistoolongsoitgetscut' => false,
        );
        foreach ($basic_cases as $input => $expected) {
            $this->assertDef($input, $expected);
        }

        // --- primary subtag rules ----------------------------------------

        // x and i on their own are rarely seen in real life, but having
        // them alone is theoretically permissible, so the validator is
        // lenient; no XML parser is going to complain anyway. The last
        // two entries are the real-world forms.
        foreach (array('x', 'i', 'x-klingon', 'i-mingo') as $tag) {
            $this->assertDef($tag);
        }

        // The RFC only defines two and three letter primary codes, so a
        // length of four or more is invalid despite the syntax allowing
        // 1 to 8 characters. The RFC reserves the rest for future
        // expansion, which means adopting a newer RFC will require these
        // cases to be rewritten (i.e. this is not forwards compatible).
        $this->assertDef('four', false);

        // for the same reason, no other one-character primary subtag
        $this->assertDef('f', false);

        // --- second subtag rules -----------------------------------------

        // One-letter second subtags are prohibited until revision; this is
        // less volatile than the primary subtag restrictions. This case
        // also covers the fix-up behavior: subtags are chopped off until a
        // valid language code remains.
        $this->assertDef('en-a', 'en');

        // 2-8 characters are permitted. Their special meaning cannot be
        // checked without country code lookup tables (two characters) or
        // special registration tables (anything longer).
        $this->assertDef('en-uk', true);

        // --- further subtags: syntactic constraints only -----------------

        $this->assertDef('en-us-edison');
        $this->assertDef('en-us-toolonghaha', 'en-us');
        $this->assertDef('en-us-a-silly-long-one');

        // RFC 3066 stipulates that when both a three letter and a two
        // letter code exist, the two letter one MUST be used. That cannot
        // be implemented without a language code lookup table.

        // HTML technically allows omitting language tags, but that
        // implicitly applies the parent element's language, which is
        // sometimes incorrect. So 'und' is allowed, only slightly defying
        // the RFC's SHOULD NOT designation.
        $this->assertDef('und');

        // an attribute can only carry one language, so 'mul' is allowed
        // too (the RFC's SHOULD NOT designation is noted, not enforced)
        $this->assertDef('mul');

    }

}
?>

View File

@ -0,0 +1,60 @@
<?php
require_once 'HTMLPurifier/Token.php';
require_once 'HTMLPurifier/AttrTransform/Lang.php';
/**
 * Checks that HTMLPurifier_AttrTransform_Lang mirrors lang and xml:lang
 * between each other on start and empty tokens.
 */
class HTMLPurifier_AttrTransform_LangTest extends UnitTestCase
{

    /**
     * Builds input/expected token pairs, then runs each input through the
     * transform and compares the result with its expected token.
     */
    function test() {

        $lang_transform = new HTMLPurifier_AttrTransform_Lang();

        // every entry is array(input token, expected token); the numeric
        // index shows up in the assertion message
        $cases = array();

        // 0: a token without any language attribute comes back as-is
        $plain = new HTMLPurifier_Token_Start('b');
        $cases[0] = array($plain, $plain);

        // 1: lang is copied into xml:lang
        $cases[1] = array(
            new HTMLPurifier_Token_Start('span', array('lang' => 'en')),
            new HTMLPurifier_Token_Start('span', array(
                'lang'     => 'en',
                'xml:lang' => 'en',
            )),
        );

        // 2: empty tags are handled too, and unrelated attributes survive
        $cases[2] = array(
            new HTMLPurifier_Token_Empty('img', array(
                'src'  => 'seine.png',
                'lang' => 'fr',
            )),
            new HTMLPurifier_Token_Empty('img', array(
                'src'      => 'seine.png',
                'lang'     => 'fr',
                'xml:lang' => 'fr',
            )),
        );

        // 3: xml:lang is copied into lang
        $cases[3] = array(
            new HTMLPurifier_Token_Start('span', array('xml:lang' => 'en')),
            new HTMLPurifier_Token_Start('span', array(
                'lang'     => 'en',
                'xml:lang' => 'en',
            )),
        );

        // 4: with both present, xml:lang overrides lang
        $cases[4] = array(
            new HTMLPurifier_Token_Start('span', array(
                'lang'     => 'fr',
                'xml:lang' => 'de',
            )),
            new HTMLPurifier_Token_Start('span', array(
                'lang'     => 'de',
                'xml:lang' => 'de',
            )),
        );

        foreach ($cases as $i => $case) {
            $result = $lang_transform->transform($case[0]);
            $this->assertEqual($case[1], $result, "Test $i: %s");
        }

    }

}
?>

View File

@ -59,6 +59,10 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
$inputs[10] = '<acronym title="PHP: Hypertext Preprocessor">PHP</acronym>'; $inputs[10] = '<acronym title="PHP: Hypertext Preprocessor">PHP</acronym>';
$expect[10] = $inputs[10]; $expect[10] = $inputs[10];
// test lang (NEEDS CORRECTION!)
$inputs[11] = '<span lang="fr">La soupe.</span>';
$expect[11] = '<span lang="fr" xml:lang="fr">La soupe.</span>';
$this->assertStrategyWorks($strategy, $inputs, $expect, $config); $this->assertStrategyWorks($strategy, $inputs, $expect, $config);
} }

View File

@ -42,8 +42,10 @@ $test->addTestFile('HTMLPurifier/AttrDef/EnumTest.php');
$test->addTestFile('HTMLPurifier/AttrDef/IDTest.php'); $test->addTestFile('HTMLPurifier/AttrDef/IDTest.php');
$test->addTestFile('HTMLPurifier/AttrDef/ClassTest.php'); $test->addTestFile('HTMLPurifier/AttrDef/ClassTest.php');
$test->addTestFile('HTMLPurifier/AttrDef/TextTest.php'); $test->addTestFile('HTMLPurifier/AttrDef/TextTest.php');
$test->addTestFile('HTMLPurifier/AttrDef/LangTest.php');
$test->addTestFile('HTMLPurifier/IDAccumulatorTest.php'); $test->addTestFile('HTMLPurifier/IDAccumulatorTest.php');
$test->addTestFile('HTMLPurifier/TagTransformTest.php'); $test->addTestFile('HTMLPurifier/TagTransformTest.php');
$test->addTestFile('HTMLPurifier/AttrTransform/LangTest.php');
if (SimpleReporter::inCli()) $reporter = new TextReporter(); if (SimpleReporter::inCli()) $reporter = new TextReporter();
else $reporter = new HTMLReporter(); else $reporter = new HTMLReporter();