2006-08-17 01:05:35 +00:00
|
|
|
<?php
|
|
|
|
|
2006-08-20 21:47:15 +00:00
|
|
|
/**
 * Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
 *
 * Accepted forms are: the empty string, a bracketed IPv6 literal, an IPv4
 * address, or a registered name made of letter/digit/hyphen labels.
 * Internationalized names are converted to Punycode when a converter is
 * available and are then held to the same grammar as plain ASCII names.
 */
class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
{

    /**
     * IPv4 sub-validator.
     * @type HTMLPurifier_AttrDef_URI_IPv4
     */
    protected $ipv4;

    /**
     * IPv6 sub-validator.
     * @type HTMLPurifier_AttrDef_URI_IPv6
     */
    protected $ipv6;

    /**
     * Creates the IPv4 and IPv6 sub-validators that validate() delegates to.
     */
    public function __construct()
    {
        $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
        $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
    }

    /**
     * Validates (and, for internationalized names, normalizes) a URI host.
     *
     * @param string $string Host component to validate.
     * @param HTMLPurifier_Config $config Read for Core.AllowHostnameUnderscore
     *        and Core.EnableIDNA.
     * @param HTMLPurifier_Context $context Passed through to the sub-validators.
     * @return bool|string The validated host (possibly rewritten to Punycode),
     *         or false if the host is not acceptable.
     */
    public function validate($string, $config, $context)
    {
        // empty hostname is OK; it's usually semantically equivalent:
        // the default host as defined by a URI scheme is used:
        //
        //      If the URI scheme defines a default for host, then that
        //      default applies when the host subcomponent is undefined
        //      or when the registered name is empty (zero length).
        if ($string === '') {
            return '';
        }

        $length = strlen($string);
        if ($length > 1 && $string[0] === '[' && $string[$length - 1] === ']') {
            // IPv6 literal: validate what is between the brackets and put
            // the brackets back around the sub-validator's result.
            $ip = substr($string, 1, $length - 2);
            $valid = $this->ipv6->validate($ip, $config, $context);
            if ($valid === false) {
                return false;
            }
            return '[' . $valid . ']';
        }

        // need to do checks on unusual encodings too
        $ipv4 = $this->ipv4->validate($string, $config, $context);
        if ($ipv4 !== false) {
            return $ipv4;
        }

        // A regular domain name.

        // This doesn't match I18N domain names, but we don't have proper IRI
        // support, so force users to insert Punycode.

        // There is not a good sense in which underscores should be
        // allowed, since it's technically not! (And if you go as
        // far to allow everything as specified by the DNS spec...
        // well, that's literally everything, modulo some space limits
        // for the components and the overall name (which, by the way,
        // we are NOT checking!). So we (arbitrarily) decide this:
        // let's allow underscores wherever we would have allowed
        // hyphens, if they are enabled. This is a pretty good match
        // for browser behavior, for example, a large number of browsers
        // cannot handle foo_.example.com, but foo_bar.example.com is
        // fairly well supported.
        $underscore = $config->get('Core.AllowHostnameUnderscore') ? '_' : '';

        // The productions describing this are:
        $a   = '[a-z]';     // alpha
        $an  = '[a-z0-9]';  // alphanum
        $and = "[a-z0-9-$underscore]"; // alphanum | "-"
        // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
        $domainlabel = "$an($and*$an)?";
        // toplabel    = alpha | alpha *( alphanum | "-" ) alphanum
        $toplabel = "$a($and*$an)?";
        // hostname    = *( domainlabel "." ) toplabel [ "." ]
        // The D modifier makes "$" match only at the very end of the subject;
        // without it a host followed by a trailing newline would validate.
        $hostname = "/^($domainlabel\.)*$toplabel\.?$/iD";

        if (preg_match($hostname, $string)) {
            return $string;
        }

        // Not a plain ASCII hostname: try to punycode it. Whatever the
        // converter produces is NOT trusted; it must still satisfy the
        // hostname grammar above before it is returned.
        if (function_exists('idn_to_ascii')) {
            // PHP 5.3 and later support this functionality natively.
            // Prefer UTS #46 processing when the intl build offers it; the
            // IDNA 2003 default variant is deprecated as of PHP 7.2.
            if (defined('IDNA_NONTRANSITIONAL_TO_ASCII') && defined('INTL_IDNA_VARIANT_UTS46')) {
                $ascii = idn_to_ascii($string, IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46);
            } else {
                $ascii = idn_to_ascii($string);
            }
            if ($ascii === false) {
                return false;
            }
            $string = $ascii;
        } elseif ($config->get('Core.EnableIDNA') && class_exists('Net_IDNA2')) {
            // If we have Net_IDNA2 support, we can support IRIs by
            // punycoding them. Punycode is the most portable output, since
            // it does not rely on the consumer understanding non-ASCII
            // host names.
            $idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true));
            // we need to encode each period separately
            $parts = explode('.', $string);
            try {
                $new_parts = array();
                foreach ($parts as $part) {
                    // A label is only sent to the encoder if it contains a
                    // byte above 'z' (0x7a); other labels are kept verbatim.
                    $encodable = false;
                    for ($i = 0, $c = strlen($part); $i < $c; $i++) {
                        if (ord($part[$i]) > 0x7a) {
                            $encodable = true;
                            break;
                        }
                    }
                    if (!$encodable) {
                        $new_parts[] = $part;
                    } else {
                        $new_parts[] = $idna->encode($part);
                    }
                }
                $string = implode('.', $new_parts);
            } catch (Exception $e) {
                // XXX error reporting
                // An encoder failure means the host cannot be represented.
                return false;
            }
        } else {
            // No converter available, so the name cannot be normalized.
            return false;
        }

        // Try again with the punycoded form.
        if (preg_match($hostname, $string)) {
            return $string;
        }
        return false;
    }
}
|
|
|
|
|
2008-12-06 04:24:59 -05:00
|
|
|
// vim: et sw=4 sts=4
|