mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
Implement lang and xml:lang. Fixed a bunch of bugs too.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@162 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
1945ddca5c
commit
8a23710405
@ -5,7 +5,7 @@ class HTMLPurifier_AttrDef
|
||||
{
|
||||
function HTMLPurifier_AttrDef() {}
|
||||
|
||||
function validate() {
|
||||
function validate($string, $config = null) {
|
||||
trigger_error('Cannot call abstract function', E_USER_ERROR);
|
||||
}
|
||||
|
||||
|
73
library/HTMLPurifier/AttrDef/Lang.php
Normal file
73
library/HTMLPurifier/AttrDef/Lang.php
Normal file
@ -0,0 +1,73 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
 * Validates the value of a lang / xml:lang attribute.
 *
 * Built according to RFC 3066, which obsoleted RFC 1766. Only the syntax of
 * the tag is checked; no language or country lookup tables are consulted.
 */
class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
{

    /**
     * Validates and normalizes a language tag.
     *
     * The value is trimmed and split on '-'. The primary subtag decides
     * whether the whole value is accepted; each following subtag is either
     * appended (lowercased) or, if it is malformed, causes the tag to be
     * truncated right before it so that the valid prefix is returned.
     *
     * @param string $string Raw attribute value.
     * @param mixed  $config Not used here; accepted so the signature matches
     *                       HTMLPurifier_AttrDef::validate().
     * @return string|false The normalized tag, or false when the primary
     *                      subtag is invalid or the value is empty.
     */
    function validate($string, $config = null) {

        $string = trim($string);
        if (!$string) return false;

        // explode() always yields at least one element, so $subtags[0] exists
        $subtags = explode('-', $string);
        $num_subtags = count($subtags);

        // process primary subtag : $subtags[0]
        switch (strlen($subtags[0])) {
            case 1:
                // only the private-use 'x' and the 'i' prefix may be
                // a single character (compared case-sensitively)
                if ($subtags[0] !== 'x' && $subtags[0] !== 'i') {
                    return false;
                }
                break;
            case 2:
            case 3:
                if (!ctype_alpha($subtags[0])) {
                    return false;
                }
                $subtags[0] = strtolower($subtags[0]);
                break;
            default:
                // empty, or four and more characters
                return false;
        }

        $new_string = $subtags[0];

        // process the remaining subtags, index 1 and up
        for ($i = 1; $i < $num_subtags; $i++) {
            $length = strlen($subtags[$i]);
            // the second subtag (index 1) needs at least two characters,
            // later ones at least one; none may exceed eight
            $min_length = ($i == 1) ? 2 : 1;
            if (
                $length < $min_length ||
                $length > 8 ||
                !ctype_alnum($subtags[$i])
            ) {
                // chop off this subtag and everything after it
                return $new_string;
            }
            $new_string .= '-' . strtolower($subtags[$i]);
        }

        return $new_string;

    }

}
|
||||
|
||||
?>
|
14
library/HTMLPurifier/AttrTransform.php
Normal file
14
library/HTMLPurifier/AttrTransform.php
Normal file
@ -0,0 +1,14 @@
|
||||
<?php
|
||||
|
||||
/**
 * AttrTransform = Attribute Transformation. Base class for transformations
 * that are needed when handling one attribute at a time isn't enough.
 */
class HTMLPurifier_AttrTransform
{

    /**
     * Empty name-based constructor.
     */
    function HTMLPurifier_AttrTransform()
    {
    }

    /**
     * Placeholder for the transformation; this base implementation only
     * raises an E_USER_ERROR.
     *
     * @param mixed $token  Token to transform (ignored here).
     * @param mixed $config Optional configuration (ignored here).
     */
    function transform($token, $config = null)
    {
        $message = 'Cannot call abstract function';
        trigger_error($message, E_USER_ERROR);
    }

}
|
||||
|
||||
?>
|
31
library/HTMLPurifier/AttrTransform/Lang.php
Normal file
31
library/HTMLPurifier/AttrTransform/Lang.php
Normal file
@ -0,0 +1,31 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrTransform.php';
|
||||
|
||||
/**
 * Keeps the lang and xml:lang attributes of a token in sync.
 */
class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
{

    /**
     * Mirrors lang into xml:lang or vice versa.
     *
     * - neither attribute set: the token is returned untouched;
     * - only lang set: its value is copied to xml:lang;
     * - xml:lang set (with or without lang): its value overwrites lang.
     *
     * @param object $token  Token whose attributes array is inspected; it
     *                       must provide copy().
     * @param mixed  $config Not used here; accepted so the signature matches
     *                       HTMLPurifier_AttrTransform::transform().
     * @return object The original token when nothing had to change,
     *                otherwise a modified copy.
     */
    function transform($token, $config = null) {

        $lang = isset($token->attributes['lang']) ?
            $token->attributes['lang'] : false;
        $xml_lang = isset($token->attributes['xml:lang']) ?
            $token->attributes['xml:lang'] : false;

        // strict comparison on both sides: a present-but-falsy value such
        // as '' or '0' still counts as "set" and must be mirrored
        if ($lang === false && $xml_lang === false) return $token;

        $new_token = $token->copy();

        if ($xml_lang !== false) {
            // xml:lang takes precedence over lang
            $new_token->attributes['lang'] = $xml_lang;
        } else {
            // only lang was given
            $new_token->attributes['xml:lang'] = $lang;
        }

        return $new_token;

    }

}
|
||||
|
||||
?>
|
@ -45,6 +45,9 @@ class HTMLPurifier_Definition
|
||||
// used solely by HTMLPurifier_Strategy_RemoveForeignElements
|
||||
var $info_tag_transform = array();
|
||||
|
||||
// used solely by HTMLPurifier_Strategy_ValidateAttributes
|
||||
var $info_attr_transform = array();
|
||||
|
||||
// WARNING! Prototype is not passed by reference, so in order to get
|
||||
// a copy of the real one, you'll have to destroy your copy and
|
||||
// use instance() to get it.
|
||||
@ -238,11 +241,22 @@ class HTMLPurifier_Definition
|
||||
// which manually override these in their local definitions
|
||||
$this->info_global_attr = array(
|
||||
// core attrs
|
||||
'id' => new HTMLPurifier_AttrDef_ID(),
|
||||
'id' => new HTMLPurifier_AttrDef_ID(),
|
||||
'class' => new HTMLPurifier_AttrDef_Class(),
|
||||
'title' => new HTMLPurifier_AttrDef_Text(),
|
||||
// i18n
|
||||
'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false),
|
||||
'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false),
|
||||
'lang' => new HTMLPurifier_AttrDef_Lang(),
|
||||
'xml:lang' => new HTMLPurifier_AttrDef_Lang(),
|
||||
);
|
||||
|
||||
// required attribute stipulation handled in attribute transformation
|
||||
$this->info['bdo']->attr = array();
|
||||
|
||||
$this->info['br']->attr = array(
|
||||
'dir' => false,
|
||||
'lang' => false,
|
||||
'xml:lang' => false,
|
||||
);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
@ -275,9 +289,11 @@ class HTMLPurifier_Definition
|
||||
// UNIMP : info[]->attr_transform : attribute transformations in elements
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// UNIMP : info_attr_transform : global attribute transform (for xml:lang)
|
||||
// info_attr_transform : global attribute transformation that is
|
||||
// unconditionally called. Good for transformations that have complex
|
||||
// start conditions
|
||||
|
||||
// this might have bad implications for performance
|
||||
$this->info_attr_transform[] = new HTMLPurifier_AttrTransform_Lang();
|
||||
|
||||
}
|
||||
|
||||
|
@ -26,11 +26,16 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
|
||||
$d_defs = $this->definition->info_global_attr;
|
||||
|
||||
foreach ($tokens as $key => $token) {
|
||||
if ($token->type !== 'start' && $token->type !== 'end') continue;
|
||||
if ($token->type !== 'start' && $token->type !== 'empty') continue;
|
||||
|
||||
// DEFINITION CALL
|
||||
$defs = $this->definition->info[$token->name]->attr;
|
||||
|
||||
// DEFINITION CALL
|
||||
foreach ($this->definition->info_attr_transform as $transformer) {
|
||||
$token = $transformer->transform($token);
|
||||
}
|
||||
|
||||
$attr = $token->attributes;
|
||||
$changed = false;
|
||||
foreach ($attr as $attr_key => $value) {
|
||||
|
@ -59,7 +59,9 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
|
||||
if (!isset($attributes[$new_key])) {
|
||||
$attributes[$new_key] = $attributes[$key];
|
||||
}
|
||||
unset($attributes[$key]);
|
||||
if ($new_key !== $key) {
|
||||
unset($attributes[$key]);
|
||||
}
|
||||
}
|
||||
}
|
||||
$this->attributes = $attributes;
|
||||
@ -72,6 +74,9 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
|
||||
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
    var $type = 'start';

    /**
     * Creates a new start token from this token's name and attributes.
     */
    function copy()
    {
        $duplicate = new HTMLPurifier_Token_Start($this->name, $this->attributes);
        return $duplicate;
    }
}
|
||||
|
||||
/**
|
||||
@ -80,6 +85,9 @@ class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
|
||||
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
    var $type = 'empty';

    /**
     * Creates a new empty token from this token's name and attributes.
     */
    function copy()
    {
        $duplicate = new HTMLPurifier_Token_Empty($this->name, $this->attributes);
        return $duplicate;
    }
}
|
||||
|
||||
/**
|
||||
@ -92,6 +100,9 @@ class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
|
||||
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
    var $type = 'end';

    /**
     * Creates a new end token from this token's name; no attributes are
     * passed along.
     */
    function copy()
    {
        $duplicate = new HTMLPurifier_Token_End($this->name);
        return $duplicate;
    }
}
|
||||
|
||||
/**
|
||||
@ -120,6 +131,9 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
|
||||
$this->data = $data;
|
||||
$this->is_whitespace = ctype_space($data);
|
||||
}
|
||||
function copy() {
|
||||
return new HTMLPurifier_Token_Text($this->data);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -138,6 +152,9 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
|
||||
function HTMLPurifier_Token_Comment($data) {
|
||||
$this->data = $data;
|
||||
}
|
||||
function copy() {
|
||||
return new HTMLPurifier_Token_Comment($this->data);
|
||||
}
|
||||
}
|
||||
|
||||
?>
|
83
tests/HTMLPurifier/AttrDef/LangTest.php
Normal file
83
tests/HTMLPurifier/AttrDef/LangTest.php
Normal file
@ -0,0 +1,83 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDefHarness.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Lang.php';
|
||||
|
||||
/**
 * Test case for HTMLPurifier_AttrDef_Lang.
 */
class HTMLPurifier_AttrDef_LangTest extends HTMLPurifier_AttrDefHarness
{

    /**
     * Runs well-formed, fixable and malformed language tags through the
     * definition.
     */
    function test()
    {
        $this->def = new HTMLPurifier_AttrDef_Lang();

        // ---- basic good uses ----
        $this->assertDef('en');
        $this->assertDef('en-us');

        // ---- normalization ----
        $this->assertDef(' en ', 'en'); // surrounding whitespace is trimmed
        $this->assertDef('EN', 'en');   // tags are case-insensitive

        // ---- plain rejections ----
        $this->assertDef('fr en', false); // multiple languages
        $this->assertDef('%', false);     // bad character

        // a language that is overlong according to the syntax
        $this->assertDef('thisistoolongsoitgetscut', false);

        // ---- primary subtag rules ----
        // x and i are rarely, if ever, used as primary language codes in
        // real life. Having them alone is theoretically permissible though,
        // so we are lenient. No XML parser is going to complain anyway.
        $this->assertDef('x');
        $this->assertDef('i');
        // real world use-cases
        $this->assertDef('x-klingon');
        $this->assertDef('i-mingo');
        // The RFC only defines two and three letter primary codes, so a
        // length of four or more is invalid despite the syntax allowing
        // 1 to 8 characters. The RFC reserves that room for future
        // versions; adopting a new RFC will therefore require rewriting
        // these test cases even if backwards-compatibility is largely
        // retained (i.e. this is not forwards compatible).
        $this->assertDef('four', false);
        // for similar reasons, any other one character language is out
        $this->assertDef('f', false);

        // ---- second subtag rules ----
        // One letter subtags are prohibited until revision; this is less
        // volatile than the restrictions on the primary subtag. This case
        // also covers the fix-behavior: subtags are chopped off until a
        // valid language code remains.
        $this->assertDef('en-a', 'en');
        // 2-8 chars are permitted, but carry special meaning that cannot
        // be checked without country code lookup tables (two characters)
        // or special registration tables (anything longer).
        $this->assertDef('en-uk', true);

        // ---- further subtags: only syntactic constraints ----
        $this->assertDef('en-us-edison');
        $this->assertDef('en-us-toolonghaha', 'en-us');
        $this->assertDef('en-us-a-silly-long-one');

        // RFC 3066 stipulates that when both a three letter and a two
        // letter code are available, the two letter one MUST be used.
        // Without a language code lookup table this cannot be implemented.

        // HTML technically lets you omit language tags, which implicitly
        // makes the parent element's language the applicable one; in some
        // cases that is incorrect. Thus und is allowed, only slightly
        // defying the RFC's SHOULD NOT designation.
        $this->assertDef('und');

        // Because attributes only allow one language, mul is allowed as
        // well (see the RFC's SHOULD NOT designation).
        $this->assertDef('mul');
    }

}
|
||||
|
||||
?>
|
60
tests/HTMLPurifier/AttrTransform/LangTest.php
Normal file
60
tests/HTMLPurifier/AttrTransform/LangTest.php
Normal file
@ -0,0 +1,60 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/Token.php';
|
||||
require_once 'HTMLPurifier/AttrTransform/Lang.php';
|
||||
|
||||
/**
 * Test case for HTMLPurifier_AttrTransform_Lang.
 */
class HTMLPurifier_AttrTransform_LangTest extends UnitTestCase
{

    /**
     * Feeds a table of tokens through the transform and compares each
     * result with the expected token.
     */
    function test()
    {
        $transform = new HTMLPurifier_AttrTransform_Lang();

        // an element without lang information must be left alone
        $untouched = new HTMLPurifier_Token_Start('b');

        // each entry: array(input token, expected token)
        $cases = array(
            // 0: leave non-lang'ed elements alone
            0 => array($untouched, $untouched),

            // 1: copy lang to xml:lang
            1 => array(
                new HTMLPurifier_Token_Start('span',
                    array('lang' => 'en')),
                new HTMLPurifier_Token_Start('span',
                    array('lang' => 'en',
                          'xml:lang' => 'en')),
            ),

            // 2: empty tags must work too, also test attribute preservation
            2 => array(
                new HTMLPurifier_Token_Empty('img',
                    array('src' => 'seine.png',
                          'lang' => 'fr')),
                new HTMLPurifier_Token_Empty('img',
                    array('src' => 'seine.png',
                          'lang' => 'fr',
                          'xml:lang' => 'fr')),
            ),

            // 3: copy xml:lang to lang
            3 => array(
                new HTMLPurifier_Token_Start('span',
                    array('xml:lang' => 'en')),
                new HTMLPurifier_Token_Start('span',
                    array('lang' => 'en',
                          'xml:lang' => 'en')),
            ),

            // 4: both set, override lang with xml:lang
            4 => array(
                new HTMLPurifier_Token_Start('span',
                    array('lang' => 'fr',
                          'xml:lang' => 'de')),
                new HTMLPurifier_Token_Start('span',
                    array('lang' => 'de',
                          'xml:lang' => 'de')),
            ),
        );

        foreach ($cases as $i => $case) {
            $input    = $case[0];
            $expected = $case[1];
            $result   = $transform->transform($input);
            $this->assertEqual($expected, $result, "Test $i: %s");
        }
    }

}
|
||||
|
||||
?>
|
@ -59,6 +59,10 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
|
||||
$inputs[10] = '<acronym title="PHP: Hypertext Preprocessor">PHP</acronym>';
|
||||
$expect[10] = $inputs[10];
|
||||
|
||||
// test lang (NEEDS CORRECTION!)
|
||||
$inputs[11] = '<span lang="fr">La soupe.</span>';
|
||||
$expect[11] = '<span lang="fr" xml:lang="fr">La soupe.</span>';
|
||||
|
||||
$this->assertStrategyWorks($strategy, $inputs, $expect, $config);
|
||||
|
||||
}
|
||||
|
@ -42,8 +42,10 @@ $test->addTestFile('HTMLPurifier/AttrDef/EnumTest.php');
|
||||
$test->addTestFile('HTMLPurifier/AttrDef/IDTest.php');
|
||||
$test->addTestFile('HTMLPurifier/AttrDef/ClassTest.php');
|
||||
$test->addTestFile('HTMLPurifier/AttrDef/TextTest.php');
|
||||
$test->addTestFile('HTMLPurifier/AttrDef/LangTest.php');
|
||||
$test->addTestFile('HTMLPurifier/IDAccumulatorTest.php');
|
||||
$test->addTestFile('HTMLPurifier/TagTransformTest.php');
|
||||
$test->addTestFile('HTMLPurifier/AttrTransform/LangTest.php');
|
||||
|
||||
if (SimpleReporter::inCli()) $reporter = new TextReporter();
|
||||
else $reporter = new HTMLReporter();
|
||||
|
Loading…
Reference in New Issue
Block a user