From 97125ed18b55a78825803cacb99abedad4ed49c6 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 7 Mar 2010 21:14:44 -0500 Subject: [PATCH] Implement data URI scheme. Signed-off-by: Edward Z. Yang --- NEWS | 2 + TODO | 4 - library/HTMLPurifier.includes.php | 1 + library/HTMLPurifier.safe-includes.php | 1 + .../schema/URI.AllowedSchemes.txt | 2 + library/HTMLPurifier/URIScheme/data.php | 93 +++++++++++++++++++ smoketests/dataScheme.php | 37 ++++++++ tests/HTMLPurifier/URISchemeTest.php | 33 +++++++ 8 files changed, 169 insertions(+), 4 deletions(-) create mode 100644 library/HTMLPurifier/URIScheme/data.php create mode 100644 smoketests/dataScheme.php diff --git a/NEWS b/NEWS index 7fabdf03..0863cf29 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier 4.1.0, unknown release date ! Support proprietary height attribute on table element ! Support YouTube slideshows that contain /cp/ in their URL. +! Support for data: URI scheme; not enabled by default, add it using + %URI.AllowedSchemes 4.0.0, released 2009-07-07 # APIs for ConfigSchema subsystem have substantially changed. See diff --git a/TODO b/TODO index 20de77ac..a99d2d95 100644 --- a/TODO +++ b/TODO @@ -13,12 +13,8 @@ afraid to cast your vote for the next feature to be implemented! 
Standing patches: - - Incorporate data: support as implemented here: - http://htmlpurifier.org/phorum/read.php?3,3491,3548 - Incorporate download and resize support as implemented here: http://htmlpurifier.org/phorum/read.php?3,2795,3628 - - Incorporate remove tags that don't do anything (no attributes): - http://htmlpurifier.org/phorum/read.php?5,2507 Things to do as soon as possible: diff --git a/library/HTMLPurifier.includes.php b/library/HTMLPurifier.includes.php index df63d41b..292637ea 100644 --- a/library/HTMLPurifier.includes.php +++ b/library/HTMLPurifier.includes.php @@ -199,6 +199,7 @@ require 'HTMLPurifier/URIFilter/DisableExternalResources.php'; require 'HTMLPurifier/URIFilter/HostBlacklist.php'; require 'HTMLPurifier/URIFilter/MakeAbsolute.php'; require 'HTMLPurifier/URIFilter/Munge.php'; +require 'HTMLPurifier/URIScheme/data.php'; require 'HTMLPurifier/URIScheme/ftp.php'; require 'HTMLPurifier/URIScheme/http.php'; require 'HTMLPurifier/URIScheme/https.php'; diff --git a/library/HTMLPurifier.safe-includes.php b/library/HTMLPurifier.safe-includes.php index d540ab1a..6402de04 100644 --- a/library/HTMLPurifier.safe-includes.php +++ b/library/HTMLPurifier.safe-includes.php @@ -193,6 +193,7 @@ require_once $__dir . '/HTMLPurifier/URIFilter/DisableExternalResources.php'; require_once $__dir . '/HTMLPurifier/URIFilter/HostBlacklist.php'; require_once $__dir . '/HTMLPurifier/URIFilter/MakeAbsolute.php'; require_once $__dir . '/HTMLPurifier/URIFilter/Munge.php'; +require_once $__dir . '/HTMLPurifier/URIScheme/data.php'; require_once $__dir . '/HTMLPurifier/URIScheme/ftp.php'; require_once $__dir . '/HTMLPurifier/URIScheme/http.php'; require_once $__dir . 
'/HTMLPurifier/URIScheme/https.php';
diff --git a/library/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt b/library/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt
index 98fdfe92..ae3a913f 100644
--- a/library/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt
+++ b/library/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt
@@ -12,4 +12,6 @@ array (
 --DESCRIPTION--
 Whitelist that defines the schemes that a URI is allowed to have. This
 prevents XSS attacks from using pseudo-schemes like javascript or mocha.
+There is also support for the data URI scheme, but it is not
+enabled by default.
 --# vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/URIScheme/data.php b/library/HTMLPurifier/URIScheme/data.php
new file mode 100644
index 00000000..b7f1989c
--- /dev/null
+++ b/library/HTMLPurifier/URIScheme/data.php
@@ -0,0 +1,129 @@
+ true,
+        'image/gif' => true,
+        'image/png' => true,
+    );
+
+    /**
+     * Validates a data: URI and rewrites it into canonical base64 form.
+     *
+     * $uri->path is parsed as "[type][;charset=...][;base64],payload". A
+     * declared type outside $allowed_types is rejected outright. Otherwise
+     * the payload is decoded, written to a scratch file and sniffed with
+     * exif_imagetype() (or getimagesize() when exif is unavailable); the
+     * URI is accepted only if the sniffed MIME type is in $allowed_types,
+     * and the sniffed type replaces a declared type that disagrees.
+     *
+     * @param $uri     URI object, modified in place: on success path is
+     *                 rewritten and userinfo, host, port, query and
+     *                 fragment are set to null.
+     * @param $config  Not read by this method.
+     * @param $context Not read by this method.
+     * @return True if the URI was accepted and rewritten, false otherwise.
+     */
+    public function validate(&$uri, $config, $context) {
+        $result = explode(',', $uri->path, 2);
+        $is_base64 = false;
+        $charset = null;
+        $content_type = null;
+        if (count($result) == 2) {
+            list($metadata, $data) = $result;
+            // do some legwork on the metadata
+            $metas = explode(';', $metadata);
+            while(!empty($metas)) {
+                $cur = array_shift($metas);
+                if ($cur == 'base64') {
+                    $is_base64 = true;
+                    break;
+                }
+                if (substr($cur, 0, 8) == 'charset=') {
+                    // doesn't match if there are arbitrary spaces, but
+                    // whatever dude
+                    if ($charset !== null) continue; // garbage
+                    $charset = substr($cur, 8); // not used
+                } else {
+                    if ($content_type !== null) continue; // garbage
+                    $content_type = $cur;
+                }
+            }
+        } else {
+            $data = $result[0];
+        }
+        if ($content_type !== null && empty($this->allowed_types[$content_type])) {
+            return false;
+        }
+        if ($charset !== null) {
+            // error; we don't allow plaintext stuff
+            $charset = null;
+        }
+        $data = rawurldecode($data);
+        if ($is_base64) {
+            $raw_data = base64_decode($data);
+        } else {
+            $raw_data = $data;
+        }
+        // XXX probably want to refactor this into a general mechanism
+        // for filtering arbitrary content types
+        $have_exif = function_exists('exif_imagetype');
+        if (!$have_exif && !function_exists('getimagesize')) {
+            trigger_error("could not find exif_imagetype or getimagesize functions", E_USER_ERROR);
+            // a custom error handler may return; never fall through unsniffed
+            return false;
+        }
+        // sys_get_temp_dir() honours the platform's temp location; keep the
+        // old "/tmp" default where it is unavailable
+        $tmp_dir = function_exists('sys_get_temp_dir') ? sys_get_temp_dir() : "/tmp";
+        $file = tempnam($tmp_dir, "");
+        if ($file === false) return false;
+        if (file_put_contents($file, $raw_data) === false) {
+            unlink($file);
+            return false;
+        }
+        // both sniffers raise notices/warnings on truncated or non-image
+        // data; that is an expected outcome here, so mute them
+        set_error_handler(array($this, 'muteErrorHandler'));
+        if ($have_exif) {
+            $image_code = exif_imagetype($file);
+        } else {
+            $info = getimagesize($file);
+            $image_code = $info ? $info[2] : false;
+        }
+        restore_error_handler();
+        // the scratch file is only needed for sniffing; never leave it behind
+        unlink($file);
+        if ($image_code === false) return false;
+        $real_content_type = image_type_to_mime_type($image_code);
+        if ($real_content_type != $content_type) {
+            // we're nice guys; if the content type is something else we
+            // support, change it over
+            if (empty($this->allowed_types[$real_content_type])) return false;
+            $content_type = $real_content_type;
+        }
+        // ok, it's kosher, rewrite what we need
+        $uri->userinfo = null;
+        $uri->host = null;
+        $uri->port = null;
+        $uri->fragment = null;
+        $uri->query = null;
+        $uri->path = "$content_type;base64," . base64_encode($raw_data);
+        return true;
+    }
+
+    /**
+     * No-op error handler installed while sniffing the image type so that
+     * notices and warnings about undecodable data are discarded.
+     */
+    public function muteErrorHandler($errno, $errstr) {}
+
+}
+
diff --git a/smoketests/dataScheme.php b/smoketests/dataScheme.php
new file mode 100644
index 00000000..7c0bbdc3
--- /dev/null
+++ b/smoketests/dataScheme.php
@@ -0,0 +1,37 @@
+';
+?>
+
+
+ HTML Purifier data Scheme Smoketest
+
+
+
+

HTML Purifier data Scheme Smoketest

+'; + +$purifier = new HTMLPurifier(array('URI.AllowedSchemes' => 'data')); + +?> +
purify($string); +?>
+ + + +pngBase64 = + 'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAABGdBTUEAALGP'. + 'C/xhBQAAAAlwSFlzAAALEwAACxMBAJqcGAAAAAd0SU1FB9YGARc5KB0XV+IA'. + 'AAAddEVYdENvbW1lbnQAQ3JlYXRlZCB3aXRoIFRoZSBHSU1Q72QlbgAAAF1J'. + 'REFUGNO9zL0NglAAxPEfdLTs4BZM4DIO4C7OwQg2JoQ9LE1exdlYvBBeZ7jq'. + 'ch9//q1uH4TLzw4d6+ErXMMcXuHWxId3KOETnnXXV6MJpcq2MLaI97CER3N0'. + 'vr4MkhoXe0rZigAAAABJRU5ErkJggg=='; + } + protected function assertValidation($uri, $expect_uri = true) { $this->prepareURI($uri, $expect_uri); + $this->config->set('URI.AllowedSchemes', array($uri->scheme)); // convenience hack: the scheme should be explicitly specified $scheme = $uri->getSchemeObj($this->config, $this->context); $result = $scheme->validate($uri, $this->config, $this->context); @@ -132,6 +145,26 @@ class HTMLPurifier_URISchemeTest extends HTMLPurifier_URIHarness ); } + function test_data_png() { + $this->assertValidation( + 'data:image/png;base64,'.$this->pngBase64 + ); + } + + function test_data_malformed() { + $this->assertValidation( + '', + false + ); + } + + function test_data_implicit() { + $this->assertValidation( + 'data:base64,'.$this->pngBase64, + 'data:image/png;base64,'.$this->pngBase64 + ); + } + } // vim: et sw=4 sts=4