diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index ffda0e71..ab8c9746 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -16,12 +16,41 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef // while it would be nice to use parse_url(), that's specifically // for HTTP and thus won't work for our generic URI parsing + // define regexps + + $HEXDIG = '[A-Fa-f0-9]'; + $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with [] + $sub_delims = '!$&\'()'; // needs [] + $pct_encoded = "%$HEXDIG$HEXDIG"; + $h16 = "{$HEXDIG}{1,4}"; + $dec_octet = '(?:25[0-5]|2[0-4]\d|1\d\d|1\d|[0-9])'; + $IPv4address = "$dec_octet.$dec_octet.$dec_octet.$dec_octet"; + $ls32 = "(?:$h16:$h16|$IPv4address)"; + $IPvFuture = "v$HEXDIG+\.[:$unreserved$sub_delims]+"; + $IPv6Address = "(?:". + "(?:$h16:){6}$ls32" . + "|::(?:$h16:){5}$ls32" . + "|(?:$h16)?::(?:$h16:){4}$ls32" . + "|(?:(?:$h16:){1}$h16)?::(?:$h16:){3}$ls32" . + "|(?:(?:$h16:){2}$h16)?::(?:$h16:){2}$ls32" . + "|(?:(?:$h16:){3}$h16)?::(?:$h16:){1}$ls32" . + "|(?:(?:$h16:){4}$h16)?::$ls32" . + "|(?:(?:$h16:){5}$h16)?::$h16" . + "|(?:(?:$h16:){6}$h16)?::" . + ")"; + $IP_literal = "\[(?:$IPvFuture|$IPv6Address)\]"; + + // the important regexps, they collide with other names, prefix with r_ + + $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*"; + $r_authority = "/^(($r_userinfo)@)?(\[$IP_literal\]|[^:]*)(:(\d*))?/"; + // according to the RFC... (but this cuts corners, i.e. 
non-validating) - $regexp = '!^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?!'; + $r_URI = '!^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?!'; // 12 3 4 5 6 7 8 9 $matches = array(); - $result = preg_match($regexp, $uri, $matches); + $result = preg_match($r_URI, $uri, $matches); if (!$result) return ''; // wow, that's very strange @@ -44,38 +73,19 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef // validate authority $matches = array(); - $HEXDIG = '[A-Fa-f0-9]'; - $unreserved = 'A-Za-z0-9-._~'; - $sub_delims = '!$&\'()'; - $h16 = "{$HEXDIG}{1,4}"; - $dec_octet = '(?:25[0-5]|2[0-4]\d|1\d\d|1\d|[0-9])'; - $IPv4address = "$dec_octet.$dec_octet.$dec_octet.$dec_octet"; - $ls32 = "(?:$h16:$h16|$IPv4address)"; - $IPvFuture = "v$HEXDIG+\.[:$unreserved$sub_delims]+"; - $IPv6Address = "(?:". - "(?:$h16:){6}$ls32" . - "|::(?:$h16:){5}$ls32" . - "|(?:$h16)?::(?:$h16:){4}$ls32" . - "|(?:(?:$h16:){1}$h16)?::(?:$h16:){3}$ls32" . - "|(?:(?:$h16:){2}$h16)?::(?:$h16:){2}$ls32" . - "|(?:(?:$h16:){3}$h16)?::(?:$h16:){1}$ls32" . - "|(?:(?:$h16:){4}$h16)?::$ls32" . - "|(?:(?:$h16:){5}$h16)?::$h16" . - "|(?:(?:$h16:){6}$h16)?::" . - ")"; - $IP_literal = "\[(?:$IPvFuture|$IPv6Address)\]"; - $regexp = "/^(([^@]+)@)?(\[$IP_literal\]|[^:]*)(:(\d*))?/"; - - preg_match($regexp, $authority, $matches); + // IPv6 is broken + preg_match($r_authority, $authority, $matches); $userinfo = !empty($matches[1]) ? $matches[2] : null; $host = !empty($matches[3]) ? $matches[3] : null; $port = !empty($matches[4]) ? $matches[5] : null; + // validate port if ($port !== null) { if (!ctype_digit($port) || $port < 1 || $port > 65535) { $port = null; } } + $authority = ($userinfo === null ? '' : ($userinfo . '@')) . $host .