diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php new file mode 100644 index 00000000..ffda0e71 --- /dev/null +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -0,0 +1,93 @@ +parseCDATA($uri); + + // while it would be nice to use parse_url(), that's specifically + // for HTTP and thus won't work for our generic URI parsing + + // according to the RFC... (but this cuts corners, i.e. non-validating) + $regexp = '!^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?!'; + // 12 3 4 5 6 7 8 9 + $matches = array(); + + $result = preg_match($regexp, $uri, $matches); + + if (!$result) return ''; // wow, that's very strange + + // separate out parts + $scheme = !empty($matches[1]) ? $matches[2] : null; + $authority = !empty($matches[3]) ? $matches[4] : null; + $path = $matches[5]; // always present + $query = !empty($matches[6]) ? $matches[7] : null; + $fragment = !empty($matches[8]) ? $matches[9] : null; + + // okay, no need to validate the scheme since we do that when we + // retrieve the specific scheme object from the registry + $scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme); + $registry = HTMLPurifier_URISchemeRegistry::instance(); + $scheme_obj = $registry->getScheme($scheme); + + if (!$scheme_obj) return ''; // invalid scheme, clean it out + + if ($authority !== null) { + // validate authority + $matches = array(); + + $HEXDIG = '[A-Fa-f0-9]'; + $unreserved = 'A-Za-z0-9-._~'; + $sub_delims = '!$&\'()'; + $h16 = "{$HEXDIG}{1,4}"; + $dec_octet = '(?:25[0-5]|2[0-4]\d|1\d\d|1\d|[0-9])'; + $IPv4address = "$dec_octet.$dec_octet.$dec_octet.$dec_octet"; + $ls32 = "(?:$h16:$h16|$IPv4address)"; + $IPvFuture = "v$HEXDIG+\.[:$unreserved$sub_delims]+"; + $IPv6Address = "(?:". + "(?:$h16:){6}$ls32" . + "|::(?:$h16:){5}$ls32" . + "|(?:$h16)?::(?:$h16:){4}$ls32" . + "|(?:(?:$h16:){1}$h16)?::(?:$h16:){3}$ls32" . + "|(?:(?:$h16:){2}$h16)?::(?:$h16:){2}$ls32" . + "|(?:(?:$h16:){3}$h16)?::(?:$h16:){1}$ls32" . 
+ "|(?:(?:$h16:){4}$h16)?::$ls32" . + "|(?:(?:$h16:){5}$h16)?::$h16" . + "|(?:(?:$h16:){6}$h16)?::" . + ")"; + $IP_literal = "\[(?:$IPvFuture|$IPv6Address)\]"; + $regexp = "/^(([^@]+)@)?(\[$IP_literal\]|[^:]*)(:(\d*))?/"; + + preg_match($regexp, $authority, $matches); + $userinfo = !empty($matches[1]) ? $matches[2] : null; + $host = !empty($matches[3]) ? $matches[3] : null; + $port = !empty($matches[4]) ? $matches[5] : null; + + if ($port !== null) { + if (!ctype_digit($port) || $port < 1 || $port > 65535) { + $port = null; + } + } + $authority = + ($userinfo === null ? '' : ($userinfo . '@')) . + $host . + ($port === null ? '' : (':' . $port)); + } + + list($authority, $path, $query, $fragment) = + $scheme_obj->validateComponents($authority, $path, $query, $fragment); + + + + } +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/URIScheme.php b/library/HTMLPurifier/URIScheme.php new file mode 100644 index 00000000..8b3ec2c1 --- /dev/null +++ b/library/HTMLPurifier/URIScheme.php @@ -0,0 +1,12 @@ + \ No newline at end of file diff --git a/library/HTMLPurifier/URISchemeRegistry.php b/library/HTMLPurifier/URISchemeRegistry.php new file mode 100644 index 00000000..438561be --- /dev/null +++ b/library/HTMLPurifier/URISchemeRegistry.php @@ -0,0 +1,23 @@ + \ No newline at end of file diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php new file mode 100644 index 00000000..32b397fc --- /dev/null +++ b/tests/HTMLPurifier/AttrDef/URITest.php @@ -0,0 +1,183 @@ + $value) { + + // $fake_registry isn't the real mock, because due to PHP 4 weirdness + // I cannot set a default value to function parameters that are passed + // by reference. So we use the value instance() returns. 
+ $fake_registry = new HTMLPurifier_URISchemeRegistryMock($this); + $registry = HTMLPurifier_URISchemeRegistry::instance($fake_registry); + + // now, let's add a pseudo-scheme to the registry + $scheme = new HTMLPurifier_URISchemeMock($this); + + // here are the schemes we will support with overloaded mocks + $registry->setReturnValue('getScheme', $scheme, array('http')); + $registry->setReturnValue('getScheme', $scheme, array('mailto')); + + // default return value is false (meaning no scheme defined: reject) + $registry->setReturnValue('getScheme', false, array('*')); + + if (!isset($return_components[$i])) { + $return_components[$i] = $components[$i]; + } + if (!isset($expect_uri[$i])) { + $expect_uri[$i] = $value; + } + if ($components[$i] === false) { + $scheme->expectNever('validateComponents'); + } else { + $scheme->setReturnValue( + 'validateComponents', $return_components[$i], $components[$i]); + $scheme->expectOnce('validateComponents', $components[$i]); + } + $result = $def->validate($value); + $scheme->tally(); + //$this->assertIdentical($expect_uri[$i], $result); + + } + + } + +} + +?> \ No newline at end of file diff --git a/tests/index.php b/tests/index.php index 7c28cdb4..61f5e7f7 100644 --- a/tests/index.php +++ b/tests/index.php @@ -62,7 +62,7 @@ $test_files[] = 'AttrDef/LangTest.php'; $test_files[] = 'AttrDef/PixelsTest.php'; $test_files[] = 'AttrDef/LengthTest.php'; $test_files[] = 'AttrDef/NumberSpanTest.php'; -//$test_files[] = 'AttrDef/URITest.php'; +$test_files[] = 'AttrDef/URITest.php'; $test_files[] = 'IDAccumulatorTest.php'; $test_files[] = 'TagTransformTest.php'; $test_files[] = 'AttrTransform/LangTest.php';