diff --git a/INSTALL b/INSTALL index 0adf3748..677c04aa 100644 --- a/INSTALL +++ b/INSTALL @@ -26,6 +26,10 @@ These optional extensions can enhance the capabilities of HTML Purifier: * bcmath : Used for unit conversion and imagecrash protection * tidy : Used for pretty-printing HTML +These optional libraries can enhance the capabilities of HTML Purifier: + + * CSSTidy : Clean CSS stylesheets using %Core.ExtractStyleBlocks + * Net_IDNA2 (PEAR) : IRI support using %Core.EnableIDNA --------------------------------------------------------------------------- 2. Reconnaissance diff --git a/NEWS b/NEWS index 0669a313..25eae785 100644 --- a/NEWS +++ b/NEWS @@ -25,6 +25,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier %HTML.SafeIframe and %URI.SafeIframeRegexp. Thanks Bradley M. Froehle for submitting an initial version of the patch. ! The Forms module now works properly for transitional doctypes. +! Added support for internationalized domain names. You need the PEAR + Net_IDNA2 module to be in your path; if it is installed, ensure the + class can be loaded and then set %Core.EnableIDNA to true. - Color keywords are now case insensitive. Thanks Yzmir Ramirez for reporting. - Explicitly initialize anonModule variable to null. diff --git a/configdoc/usage.xml b/configdoc/usage.xml index 8e4a7d3c..e53be30d 100644 --- a/configdoc/usage.xml +++ b/configdoc/usage.xml @@ -347,6 +347,11 @@ 30 + + + 67 + + 13 diff --git a/library/HTMLPurifier/AttrDef/URI/Host.php b/library/HTMLPurifier/AttrDef/URI/Host.php index feca469d..125decb2 100644 --- a/library/HTMLPurifier/AttrDef/URI/Host.php +++ b/library/HTMLPurifier/AttrDef/URI/Host.php @@ -44,9 +44,8 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef // A regular domain name. - // This breaks I18N domain names, but we don't have proper IRI support, - // so force users to insert Punycode. If there's complaining we'll - // try to fix things into an international friendly form. 
+ // This doesn't match I18N domain names, but we don't have proper IRI support, + // so force users to insert Punycode. // The productions describing this are: $a = '[a-z]'; // alpha @@ -57,10 +56,44 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef // toplabel = alpha | alpha *( alphanum | "-" ) alphanum $toplabel = "$a($and*$an)?"; // hostname = *( domainlabel "." ) toplabel [ "." ] - $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string); - if (!$match) return false; + if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) { + return $string; + } - return $string; + // If we have Net_IDNA2 support, we can support IRIs by + // punycoding them. (This is the most portable thing to do, + // since otherwise we have to assume browsers support IDNs natively.) + + if ($config->get('Core.EnableIDNA')) { + $idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true)); + // we need to encode each period-delimited label separately + $parts = explode('.', $string); + try { + $new_parts = array(); + foreach ($parts as $part) { + $encodable = false; + for ($i = 0, $c = strlen($part); $i < $c; $i++) { + if (ord($part[$i]) > 0x7a) { + $encodable = true; + break; + } + } + if (!$encodable) { + $new_parts[] = $part; + } else { + $new_parts[] = $idna->encode($part); + } + } + $string = implode('.', $new_parts); + if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) { + return $string; + } + } catch (Exception $e) { + // XXX error reporting + } + } + + return false; } } diff --git a/library/HTMLPurifier/ConfigSchema/schema.ser b/library/HTMLPurifier/ConfigSchema/schema.ser index bc9227ec..b106bcf7 100644 Binary files a/library/HTMLPurifier/ConfigSchema/schema.ser and b/library/HTMLPurifier/ConfigSchema/schema.ser differ diff --git a/library/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt b/library/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt new file mode 100644 index 00000000..ce243c35 --- /dev/null +++ 
b/library/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt @@ -0,0 +1,9 @@ +Core.EnableIDNA +TYPE: bool +DEFAULT: false +VERSION: 4.4.0 +--DESCRIPTION-- +Allows international domain names in URLs. This configuration option +requires the PEAR Net_IDNA2 module to be installed. It operates by +punycoding any internationalized host names for maximum portability. +--# vim: et sw=4 sts=4 diff --git a/maintenance/generate-standalone.php b/maintenance/generate-standalone.php index 9fe5354a..e67f0712 100755 --- a/maintenance/generate-standalone.php +++ b/maintenance/generate-standalone.php @@ -145,7 +145,6 @@ make_dir_standalone('HTMLPurifier/Filter'); make_dir_standalone('HTMLPurifier/Printer'); make_file_standalone('HTMLPurifier/Printer.php'); make_file_standalone('HTMLPurifier/Lexer/PH5P.php'); -make_file_standalone('HTMLPurifier/Lexer/PEARSax3.php'); echo ' done!' . PHP_EOL; diff --git a/test-settings.sample.php b/test-settings.sample.php index ad7789cb..886b9748 100644 --- a/test-settings.sample.php +++ b/test-settings.sample.php @@ -69,4 +69,8 @@ $phpv = false; // to true (or, if it's not in the include path, to its install directory). $GLOBALS['HTMLPurifierTest']['PEAR'] = false; +// If PEAR is enabled, what PEAR tests should be run? 
(Note: you will +// need to ensure these libraries are installed) +$GLOBALS['HTMLPurifierTest']['Net_IDNA2'] = true; + // vim: et sw=4 sts=4 diff --git a/tests/HTMLPurifier/AttrDef/URI/HostTest.php b/tests/HTMLPurifier/AttrDef/URI/HostTest.php index cf7beacc..b5827718 100644 --- a/tests/HTMLPurifier/AttrDef/URI/HostTest.php +++ b/tests/HTMLPurifier/AttrDef/URI/HostTest.php @@ -35,6 +35,17 @@ class HTMLPurifier_AttrDef_URI_HostTest extends HTMLPurifier_AttrDefHarness $this->assertDef('f1.top'); $this->assertDef('f-.top', false); + $this->assertDef("\xE4\xB8\xAD\xE6\x96\x87.com.cn", false); + + } + + function testIDNA() { + if (!$GLOBALS['HTMLPurifierTest']['Net_IDNA2']) { + return false; + } + $this->config->set('Core.EnableIDNA', true); + $this->assertDef("\xE4\xB8\xAD\xE6\x96\x87.com.cn", "xn--fiq228c.com.cn"); + $this->assertDef("\xe2\x80\x85.com", false); // rejected } }