mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-24 14:21:52 +00:00
e76f4b45d0
Basically, browsers don't parse what should be valid URIs correctly, so we have to go through some backbends to accommodate them. Specifically, for browseable URIs, the following URIs have unintended behavior: ///example.com, http:/example.com, and http:///example.com. Furthermore, if the path begins with //, modifying these URLs must be done with care, because if you remove the host-name component, the parse tree changes. I've modified the engine to follow correct URI semantics as much as possible while outputting browser-compatible code, and to invalidate the URI in cases where we can't deal with it. There has been a refactoring of URIScheme so that this important check is always performed, introducing a new member variable allow_empty_host, which is true on the data, file, mailto and news schemes. This also fixes bypass bugs on URI.Munge. Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
69 lines
2.3 KiB
PHP
69 lines
2.3 KiB
PHP
<?php
|
|
|
|
/**
 * Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
 *
 * Validation is attempted in this order: empty host, bracketed IPv6 literal,
 * IPv4 address, and finally an ASCII DNS hostname. The IPv4 and IPv6 checks
 * are delegated to dedicated sub-validators created in the constructor.
 */
class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
{

    /**
     * Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
     */
    protected $ipv4;

    /**
     * Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
     */
    protected $ipv6;

    /**
     * Creates the IPv4 and IPv6 sub-validators used by validate().
     */
    public function __construct() {
        $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
        $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
    }

    /**
     * Validates the host component of a URI.
     *
     * @param string $string  Host to validate.
     * @param mixed  $config  Passed through unchanged to the IP sub-validators.
     * @param mixed  $context Passed through unchanged to the IP sub-validators.
     * @return string|false   The empty string for an empty host; the value
     *                        returned by the IPv6 sub-validator wrapped in
     *                        square brackets; the value returned by the IPv4
     *                        sub-validator; the input itself when it is a
     *                        well-formed hostname; or false when the host
     *                        is invalid.
     */
    public function validate($string, $config, $context) {
        $length = strlen($string);

        // empty hostname is OK; it's usually semantically equivalent:
        // the default host as defined by a URI scheme is used:
        //
        //      If the URI scheme defines a default for host, then that
        //      default applies when the host subcomponent is undefined
        //      or when the registered name is empty (zero length).
        if ($string === '') return '';

        if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
            // IPv6 literal: validate the part between the brackets and
            // re-wrap whatever the sub-validator returns.
            $ip = substr($string, 1, $length - 2);
            $valid = $this->ipv6->validate($ip, $config, $context);
            if ($valid === false) return false;
            return '['. $valid . ']';
        }

        // need to do checks on unusual encodings too
        $ipv4 = $this->ipv4->validate($string, $config, $context);
        if ($ipv4 !== false) return $ipv4;

        // A regular domain name.

        // This breaks I18N domain names, but we don't have proper IRI support,
        // so force users to insert Punycode. If there's complaining we'll
        // try to fix things into an international friendly form.

        // The productions describing this are:
        $a   = '[a-z]';     // alpha
        $an  = '[a-z0-9]';  // alphanum
        $and = '[a-z0-9-]'; // alphanum | "-"
        // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
        $domainlabel = "$an($and*$an)?";
        // toplabel    = alpha | alpha *( alphanum | "-" ) alphanum
        $toplabel    = "$a($and*$an)?";
        // hostname    = *( domainlabel "." ) toplabel [ "." ]
        //
        // The D modifier (PCRE_DOLLAR_ENDONLY) makes "$" match only at the
        // very end of the subject. Without it, "$" also matches just before
        // a trailing newline, so "example.com\n" would be accepted and
        // returned verbatim with the newline still attached.
        $pattern = '/^(?:' . $domainlabel . '\.)*' . $toplabel . '\.?$/iD';
        if (!preg_match($pattern, $string)) return false;

        return $string;
    }

}
|
|
|
|
// vim: et sw=4 sts=4
|