From 92aabf2b230312a5ddb4d719bf4f47e0ccd2b9af Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 2 Mar 2016 02:05:54 -0800 Subject: [PATCH] Fix #76, linkify includes dots at end of URL. Signed-off-by: Edward Z. Yang --- NEWS | 2 ++ library/HTMLPurifier/Injector/Linkify.php | 11 ++++++++--- tests/HTMLPurifier/Injector/LinkifyTest.php | 8 ++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index b8f09cfa..d563c186 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ! %CSS.AllowDuplicates permits duplicate CSS properties. - alt truncation could result in malformed UTF-8 sequence. Don't truncate. Thanks Brandon Farber for reporting. +- Linkify regex is smarter, based off of Gruber's regex. +- IDNA supported natively on PHP 5.3 and later. 4.7.0, released 2015-08-04 # opacity is now considered a "tricky" CSS property rather than a diff --git a/library/HTMLPurifier/Injector/Linkify.php b/library/HTMLPurifier/Injector/Linkify.php index 069708c2..74f83eaa 100644 --- a/library/HTMLPurifier/Injector/Linkify.php +++ b/library/HTMLPurifier/Injector/Linkify.php @@ -31,9 +31,14 @@ class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector return; } - // there is/are URL(s). Let's split the string: - // Note: this regex is extremely permissive - $bits = preg_split('#((?:https?|ftp)://[^\s\'",<>()]+)#Su', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE); + // there is/are URL(s). Let's split the string. + // We use this regex: + // https://gist.github.com/gruber/249502 + // but with @cscott's backtracking fix and also + // the Unicode characters un-Unicodified. + $bits = preg_split( + '/\\b((?:[a-z][\\w\\-]+:(?:\\/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}\\/)(?:[^\\s()<>]|\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\))+(?:\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'".,<>?\x{00ab}\x{00bb}\x{201c}\x{201d}\x{2018}\x{2019}]))/iu', + $token->data, -1, PREG_SPLIT_DELIM_CAPTURE); $token = array(); diff --git a/tests/HTMLPurifier/Injector/LinkifyTest.php b/tests/HTMLPurifier/Injector/LinkifyTest.php index 1954db6b..8eeac449 100644 --- a/tests/HTMLPurifier/Injector/LinkifyTest.php +++ b/tests/HTMLPurifier/Injector/LinkifyTest.php @@ -52,6 +52,14 @@ class HTMLPurifier_Injector_LinkifyTest extends HTMLPurifier_InjectorHarness $this->assertResult('http://example.com'); } + public function testRegexIsSmart() + { + $this->assertResult('http://example.com/foo.', 'http://example.com/foo.'); + $this->assertResult('“http://example.com/foo”', '“http://example.com/foo”'); + $this->assertResult('“http://example.com”', '“http://example.com”'); + $this->assertResult('(http://example.com/f(o)o)', '(http://example.com/f(o)o)'); + } + } // vim: et sw=4 sts=4