mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-02-03 10:30:01 +00:00
df3a3bab6e
2.1. The main parts of URLs — Scheme names consist of a sequence of characters. The lower case letters "a"--"z", digits, and the characters plus ("+"), period ("."), and hyphen ("-") are allowed. For resiliency, programs interpreting URLs should treat upper case letters as equivalent to lower case in scheme names (e.g., allow "HTTP" as well as "http").
71 lines
2.2 KiB
PHP
71 lines
2.2 KiB
PHP
<?php
|
|
|
|
/**
 * Parses a URI into the components and fragment identifier as specified
 * by RFC 3986.
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    /**
     * Creates the parser together with the percent-encoder it uses to
     * normalize every URI before splitting it.
     */
    public function __construct() {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI. This representation has
     *         not been validated yet and may not conform to RFC. Boolean
     *         false is returned instead if the regular expression engine
     *         reports no match (in practice only on a PCRE failure, since
     *         every part of the pattern is optional).
     */
    public function parse($uri) {

        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        $r_URI = '!'.
            '(([a-zA-Z0-9\.\+\-]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'.        // 4. Authority
            '([^?#"<>]*)'.              // 5. Path
            '(\?([^#"<>]*))?'.          // 7. Query
            '(#([^"<>]*))?'.            // 8. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // separate out parts
        // The odd-numbered groups include their delimiter (":", "//", "?",
        // "#"), so a non-empty group means the component is present even
        // when its value is the empty string. Trailing groups that did not
        // participate are absent from $matches, which empty() tolerates.
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
            // Compare against '' explicitly: empty() treats the string "0"
            // as empty, which would silently drop the host of "http://0/".
            $host       = (isset($matches[3]) && $matches[3] !== '') ? $matches[3] : '';
            // Group 4 includes the ":" delimiter; an empty digit run casts to 0.
            $port       = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
|
|
|
|
// vim: et sw=4 sts=4
|