From 45161b4fb1ea3f9148d02d068d9b2f504a632788 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 23 Mar 2016 22:42:18 -0700 Subject: [PATCH] Accept leading digits in hostnames as per RFC 1123. Signed-off-by: Edward Z. Yang --- NEWS | 2 ++ library/HTMLPurifier/AttrDef/URI/Host.php | 16 +++++++++++----- tests/HTMLPurifier/AttrDef/URI/HostTest.php | 6 ++++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index d563c186..90c07839 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier truncate. Thanks Brandon Farber for reporting. - Linkify regex is smarter, based off of Gruber's regex. - IDNA supported natively on PHP 5.3 and later. +- Non all-numeric top-level names (e.g., foo.1f, 1f) are now + allowed. 4.7.0, released 2015-08-04 # opacity is now considered a "tricky" CSS property rather than a diff --git a/library/HTMLPurifier/AttrDef/URI/Host.php b/library/HTMLPurifier/AttrDef/URI/Host.php index 371f228a..151f7aff 100644 --- a/library/HTMLPurifier/AttrDef/URI/Host.php +++ b/library/HTMLPurifier/AttrDef/URI/Host.php @@ -76,17 +76,23 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef // fairly well supported. $underscore = $config->get('Core.AllowHostnameUnderscore') ? '_' : ''; + // Based off of RFC 1738, but amended so that + // as per RFC 3696, the top label need only not be all numeric. // The productions describing this are: $a = '[a-z]'; // alpha $an = '[a-z0-9]'; // alphanum $and = "[a-z0-9-$underscore]"; // alphanum | "-" // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum - $domainlabel = "$an($and*$an)?"; - // toplabel = alpha | alpha *( alphanum | "-" ) alphanum - $toplabel = "$a($and*$an)?"; + $domainlabel = "$an(?:$and*$an)?"; + // AMENDED as per RFC 3696 + // toplabel = alphanum | alphanum *( alphanum | "-" ) alphanum + // side condition: not all numeric + $toplabel = "$an(?:$and*$an)?"; // hostname = *( domainlabel "." ) toplabel [ "." ] - if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) { - return $string; + if (preg_match("/^(?:$domainlabel\.)*($toplabel)\.?$/i", $string, $matches)) { + if (!ctype_digit($matches[1])) { + return $string; + } } // PHP 5.3 and later support this functionality natively diff --git a/tests/HTMLPurifier/AttrDef/URI/HostTest.php b/tests/HTMLPurifier/AttrDef/URI/HostTest.php index b645aa4c..1f20a748 100644 --- a/tests/HTMLPurifier/AttrDef/URI/HostTest.php +++ b/tests/HTMLPurifier/AttrDef/URI/HostTest.php @@ -19,12 +19,13 @@ class HTMLPurifier_AttrDef_URI_HostTest extends HTMLPurifier_AttrDefHarness $this->assertDef('sub.test.'); $this->assertDef('.test', false); $this->assertDef('ff'); - $this->assertDef('1f', false); + $this->assertDef('1f'); // per RFC 1123 + // See also http://serverfault.com/questions/638260/is-it-valid-for-a-hostname-to-start-with-a-digit $this->assertDef('-f', false); $this->assertDef('f1'); $this->assertDef('f-', false); $this->assertDef('sub.ff'); - $this->assertDef('sub.1f', false); + $this->assertDef('sub.1f'); // per RFC 1123 $this->assertDef('sub.-f', false); $this->assertDef('sub.f1'); $this->assertDef('sub.f-', false); @@ -35,6 +36,7 @@ class HTMLPurifier_AttrDef_URI_HostTest extends HTMLPurifier_AttrDefHarness $this->assertDef('f1.top'); $this->assertDef('f1_f2.ex.top', false); $this->assertDef('f-.top', false); + $this->assertDef('1a'); $this->assertDef("\xE4\xB8\xAD\xE6\x96\x87.com.cn", false);