From 700d5bcbfc834d807633e33967b3145e9d35bee2 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Fri, 27 Jun 2008 16:09:14 -0400 Subject: [PATCH] Implement %AutoFormat.RemoveEmpty, end to start ref, and injector rewind. Injector rewind: Injectors can now use the method rewind() in order to move the input index backwards, so that they can reprocess tokens (other injectors are not affected by a rewind). This functionality was necessary to implement nested node removals in %AutoFormat.RemoveEmpty. End to start ref: To facilitate rewinding, HTMLPurifier_Token_End now maintains a reference called $start to the starting token for their node. %AutoFormat.RemoveEmpty removes empty nodes. Lots of people have requested it, so here is a partially effective implementation. Because it is implemented as an Injector, it's not possible for it to handle newly introduced empty nodes by later validators, specifically auto-closing and child validation. The Injector is only meant to be used on HTML-ish languages. Signed-off-by: Edward Z. Yang --- NEWS | 4 ++ library/HTMLPurifier.includes.php | 1 + library/HTMLPurifier.safe-includes.php | 1 + library/HTMLPurifier/ConfigSchema/schema.ser | 2 +- .../schema/AutoFormat.RemoveEmpty.txt | 44 ++++++++++++ library/HTMLPurifier/Injector.php | 24 +++++++ library/HTMLPurifier/Injector/RemoveEmpty.php | 40 +++++++++++ .../HTMLPurifier/Strategy/MakeWellFormed.php | 70 +++++++++++++------ library/HTMLPurifier/Token/End.php | 6 +- .../HTMLPurifier/Injector/RemoveEmptyTest.php | 58 +++++++++++++++ .../HTMLPurifier/Injector/SafeObjectTest.php | 1 + .../Strategy/MakeWellFormed_InjectorTest.php | 25 ++++++- 12 files changed, 252 insertions(+), 24 deletions(-) create mode 100644 library/HTMLPurifier/ConfigSchema/schema/AutoFormat.RemoveEmpty.txt create mode 100644 library/HTMLPurifier/Injector/RemoveEmpty.php create mode 100644 tests/HTMLPurifier/Injector/RemoveEmptyTest.php diff --git a/NEWS b/NEWS index 683ed7b3..e0b086c7 100644 --- a/NEWS +++ b/NEWS @@ -18,8 +18,12 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ! Proper support for name attribute. It is now allowed and equivalent to the id attribute in a and img tags, and is only converted to id when %HTML.TidyLevel is heavy (for all doctypes). +! %AutoFormat.RemoveEmpty to remove some empty tags from documents. Please don't + use on hand-written HTML. . Strategy_MakeWellFormed now operates in-place, saving memory and allowing for more interesting filter-backtracking +. New HTMLPurifier_Injector->rewind() functionality, allows injectors to rewind + index to reprocess tokens. 3.1.1, released 2008-06-19 # %URI.Munge now, by default, does not munge resources (for example, ) diff --git a/library/HTMLPurifier.includes.php b/library/HTMLPurifier.includes.php index 35c70049..fbeebf93 100644 --- a/library/HTMLPurifier.includes.php +++ b/library/HTMLPurifier.includes.php @@ -165,6 +165,7 @@ require 'HTMLPurifier/HTMLModule/Tidy/XHTML.php'; require 'HTMLPurifier/Injector/AutoParagraph.php'; require 'HTMLPurifier/Injector/Linkify.php'; require 'HTMLPurifier/Injector/PurifierLinkify.php'; +require 'HTMLPurifier/Injector/RemoveEmpty.php'; require 'HTMLPurifier/Injector/SafeObject.php'; require 'HTMLPurifier/Lexer/DOMLex.php'; require 'HTMLPurifier/Lexer/DirectLex.php'; diff --git a/library/HTMLPurifier.safe-includes.php b/library/HTMLPurifier.safe-includes.php index 631dcce3..0cf8d791 100644 --- a/library/HTMLPurifier.safe-includes.php +++ b/library/HTMLPurifier.safe-includes.php @@ -159,6 +159,7 @@ require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy/XHTML.php'; require_once $__dir . '/HTMLPurifier/Injector/AutoParagraph.php'; require_once $__dir . '/HTMLPurifier/Injector/Linkify.php'; require_once $__dir . '/HTMLPurifier/Injector/PurifierLinkify.php'; +require_once $__dir . '/HTMLPurifier/Injector/RemoveEmpty.php'; require_once $__dir . '/HTMLPurifier/Injector/SafeObject.php'; require_once $__dir . '/HTMLPurifier/Lexer/DOMLex.php'; require_once $__dir . '/HTMLPurifier/Lexer/DirectLex.php'; diff --git a/library/HTMLPurifier/ConfigSchema/schema.ser b/library/HTMLPurifier/ConfigSchema/schema.ser index 25903e8a..79f5a022 100644 --- a/library/HTMLPurifier/ConfigSchema/schema.ser +++ b/library/HTMLPurifier/ConfigSchema/schema.ser @@ -1 +1 @@ -O:25:"HTMLPurifier_ConfigSchema":2:{s:8:"defaults";a:12:{s:4:"Attr";a:11:{s:19:"AllowedFrameTargets";a:0:{}s:10:"AllowedRel";a:0:{}s:10:"AllowedRev";a:0:{}s:19:"DefaultInvalidImage";s:0:"";s:22:"DefaultInvalidImageAlt";s:13:"Invalid image";s:14:"DefaultTextDir";s:3:"ltr";s:8:"EnableID";b:0;s:11:"IDBlacklist";a:0:{}s:17:"IDBlacklistRegexp";N;s:8:"IDPrefix";s:0:"";s:13:"IDPrefixLocal";s:0:"";}s:10:"AutoFormat";a:4:{s:13:"AutoParagraph";b:0;s:6:"Custom";a:0:{}s:7:"Linkify";b:0;s:15:"PurifierLinkify";b:0;}s:15:"AutoFormatParam";a:1:{s:21:"PurifierLinkifyDocURL";s:3:"#%s";}s:3:"CSS";a:6:{s:14:"AllowImportant";b:0;s:11:"AllowTricky";b:0;s:17:"AllowedProperties";N;s:13:"DefinitionRev";i:1;s:12:"MaxImgLength";s:6:"1200px";s:11:"Proprietary";b:0;}s:5:"Cache";a:2:{s:14:"DefinitionImpl";s:10:"Serializer";s:14:"SerializerPath";N;}s:4:"Core";a:15:{s:17:"AggressivelyFixLt";b:0;s:13:"CollectErrors";b:0;s:13:"ColorKeywords";a:17:{s:6:"maroon";s:7:"#800000";s:3:"red";s:7:"#FF0000";s:6:"orange";s:7:"#FFA500";s:6:"yellow";s:7:"#FFFF00";s:5:"olive";s:7:"#808000";s:6:"purple";s:7:"#800080";s:7:"fuchsia";s:7:"#FF00FF";s:5:"white";s:7:"#FFFFFF";s:4:"lime";s:7:"#00FF00";s:5:"green";s:7:"#008000";s:4:"navy";s:7:"#000080";s:4:"blue";s:7:"#0000FF";s:4:"aqua";s:7:"#00FFFF";s:4:"teal";s:7:"#008080";s:5:"black";s:7:"#000000";s:6:"silver";s:7:"#C0C0C0";s:4:"gray";s:7:"#808080";}s:25:"ConvertDocumentToFragment";b:1;s:31:"DirectLexLineNumberSyncInterval";i:0;s:8:"Encoding";s:5:"utf-8";s:21:"EscapeInvalidChildren";b:0;s:17:"EscapeInvalidTags";b:0;s:24:"EscapeNonASCIICharacters";b:0;s:14:"HiddenElements";a:2:{s:6:"script";b:1;s:5:"style";b:1;}s:8:"Language";s:2:"en";s:9:"LexerImpl";N;s:19:"MaintainLineNumbers";N;s:16:"RemoveInvalidImg";b:1;s:20:"RemoveScriptContents";N;}s:6:"Filter";a:3:{s:6:"Custom";a:0:{}s:18:"ExtractStyleBlocks";b:0;s:7:"YouTube";b:0;}s:11:"FilterParam";a:3:{s:26:"ExtractStyleBlocksEscaping";b:1;s:23:"ExtractStyleBlocksScope";N;s:26:"ExtractStyleBlocksTidyImpl";N;}s:4:"HTML";a:23:{s:7:"Allowed";N;s:17:"AllowedAttributes";N;s:15:"AllowedElements";N;s:14:"AllowedModules";N;s:12:"BlockWrapper";s:1:"p";s:11:"CoreModules";a:7:{s:9:"Structure";b:1;s:4:"Text";b:1;s:9:"Hypertext";b:1;s:4:"List";b:1;s:22:"NonXMLCommonAttributes";b:1;s:19:"XMLCommonAttributes";b:1;s:16:"CommonAttributes";b:1;}s:13:"CustomDoctype";N;s:12:"DefinitionID";N;s:13:"DefinitionRev";i:1;s:7:"Doctype";N;s:19:"ForbiddenAttributes";a:0:{}s:17:"ForbiddenElements";a:0:{}s:12:"MaxImgLength";i:1200;s:6:"Parent";s:3:"div";s:11:"Proprietary";b:0;s:9:"SafeEmbed";b:0;s:10:"SafeObject";b:0;s:6:"Strict";b:0;s:7:"TidyAdd";a:0:{}s:9:"TidyLevel";s:6:"medium";s:10:"TidyRemove";a:0:{}s:7:"Trusted";b:0;s:5:"XHTML";b:1;}s:6:"Output";a:4:{s:21:"CommentScriptContents";b:1;s:7:"Newline";N;s:8:"SortAttr";b:0;s:10:"TidyFormat";b:0;}s:4:"Test";a:1:{s:12:"ForceNoIconv";b:0;}s:3:"URI";a:16:{s:14:"AllowedSchemes";a:6:{s:4:"http";b:1;s:5:"https";b:1;s:6:"mailto";b:1;s:3:"ftp";b:1;s:4:"nntp";b:1;s:4:"news";b:1;}s:4:"Base";N;s:13:"DefaultScheme";s:4:"http";s:12:"DefinitionID";N;s:13:"DefinitionRev";i:1;s:7:"Disable";b:0;s:15:"DisableExternal";b:0;s:24:"DisableExternalResources";b:0;s:16:"DisableResources";b:0;s:4:"Host";N;s:13:"HostBlacklist";a:0:{}s:12:"MakeAbsolute";b:0;s:5:"Munge";N;s:14:"MungeResources";b:0;s:14:"MungeSecretKey";N;s:22:"OverrideAllowedSchemes";b:1;}}s:4:"info";a:12:{s:4:"Attr";a:12:{s:19:"AllowedFrameTargets";i:8;s:10:"AllowedRel";i:8;s:10:"AllowedRev";i:8;s:19:"DefaultInvalidImage";i:1;s:22:"DefaultInvalidImageAlt";i:1;s:14:"DefaultTextDir";O:8:"stdClass":2:{s:4:"type";i:1;s:7:"allowed";a:2:{s:3:"ltr";b:1;s:3:"rtl";b:1;}}s:8:"EnableID";i:7;s:11:"IDBlacklist";i:9;s:17:"IDBlacklistRegexp";i:-1;s:8:"IDPrefix";i:1;s:13:"IDPrefixLocal";i:1;s:10:"DisableURI";O:8:"stdClass":3:{s:9:"namespace";s:3:"URI";s:4:"name";s:7:"Disable";s:7:"isAlias";b:1;}}s:10:"AutoFormat";a:4:{s:13:"AutoParagraph";i:7;s:6:"Custom";i:9;s:7:"Linkify";i:7;s:15:"PurifierLinkify";i:7;}s:15:"AutoFormatParam";a:1:{s:21:"PurifierLinkifyDocURL";i:1;}s:3:"CSS";a:6:{s:14:"AllowImportant";i:7;s:11:"AllowTricky";i:7;s:17:"AllowedProperties";i:-8;s:13:"DefinitionRev";i:5;s:12:"MaxImgLength";i:-1;s:11:"Proprietary";i:7;}s:5:"Cache";a:2:{s:14:"DefinitionImpl";i:-1;s:14:"SerializerPath";i:-1;}s:4:"Core";a:20:{s:15:"DefinitionCache";O:8:"stdClass":3:{s:9:"namespace";s:5:"Cache";s:4:"name";s:14:"DefinitionImpl";s:7:"isAlias";b:1;}s:17:"AggressivelyFixLt";i:7;s:13:"CollectErrors";i:7;s:13:"ColorKeywords";i:10;s:25:"ConvertDocumentToFragment";i:7;s:19:"AcceptFullDocuments";O:8:"stdClass":3:{s:9:"namespace";s:4:"Core";s:4:"name";s:25:"ConvertDocumentToFragment";s:7:"isAlias";b:1;}s:31:"DirectLexLineNumberSyncInterval";i:5;s:8:"Encoding";i:2;s:21:"EscapeInvalidChildren";i:7;s:17:"EscapeInvalidTags";i:7;s:24:"EscapeNonASCIICharacters";i:7;s:14:"HiddenElements";i:8;s:8:"Language";i:1;s:9:"LexerImpl";i:-11;s:19:"MaintainLineNumbers";i:-7;s:16:"RemoveInvalidImg";i:7;s:20:"RemoveScriptContents";i:-7;s:5:"XHTML";O:8:"stdClass":3:{s:9:"namespace";s:4:"HTML";s:4:"name";s:5:"XHTML";s:7:"isAlias";b:1;}s:21:"CommentScriptContents";O:8:"stdClass":3:{s:9:"namespace";s:6:"Output";s:4:"name";s:21:"CommentScriptContents";s:7:"isAlias";b:1;}s:10:"TidyFormat";O:8:"stdClass":3:{s:9:"namespace";s:6:"Output";s:4:"name";s:10:"TidyFormat";s:7:"isAlias";b:1;}}s:6:"Filter";a:5:{s:6:"Custom";i:9;s:18:"ExtractStyleBlocks";i:7;s:7:"YouTube";i:7;s:26:"ExtractStyleBlocksEscaping";O:8:"stdClass":3:{s:9:"namespace";s:11:"FilterParam";s:4:"name";s:26:"ExtractStyleBlocksEscaping";s:7:"isAlias";b:1;}s:23:"ExtractStyleBlocksScope";O:8:"stdClass":3:{s:9:"namespace";s:11:"FilterParam";s:4:"name";s:23:"ExtractStyleBlocksScope";s:7:"isAlias";b:1;}}s:11:"FilterParam";a:3:{s:26:"ExtractStyleBlocksEscaping";i:7;s:23:"ExtractStyleBlocksScope";i:-1;s:26:"ExtractStyleBlocksTidyImpl";i:-11;}s:4:"HTML";a:24:{s:12:"EnableAttrID";O:8:"stdClass":3:{s:9:"namespace";s:4:"Attr";s:4:"name";s:8:"EnableID";s:7:"isAlias";b:1;}s:7:"Allowed";i:-4;s:17:"AllowedAttributes";i:-8;s:15:"AllowedElements";i:-8;s:14:"AllowedModules";i:-8;s:12:"BlockWrapper";i:1;s:11:"CoreModules";i:8;s:13:"CustomDoctype";i:-1;s:12:"DefinitionID";i:-1;s:13:"DefinitionRev";i:5;s:7:"Doctype";O:8:"stdClass":3:{s:4:"type";i:1;s:10:"allow_null";b:1;s:7:"allowed";a:5:{s:22:"HTML 4.01 Transitional";b:1;s:16:"HTML 4.01 Strict";b:1;s:22:"XHTML 1.0 Transitional";b:1;s:16:"XHTML 1.0 Strict";b:1;s:9:"XHTML 1.1";b:1;}}s:19:"ForbiddenAttributes";i:8;s:17:"ForbiddenElements";i:8;s:12:"MaxImgLength";i:-5;s:6:"Parent";i:1;s:11:"Proprietary";i:7;s:9:"SafeEmbed";i:7;s:10:"SafeObject";i:7;s:6:"Strict";i:7;s:7:"TidyAdd";i:8;s:9:"TidyLevel";O:8:"stdClass":2:{s:4:"type";i:1;s:7:"allowed";a:4:{s:4:"none";b:1;s:5:"light";b:1;s:6:"medium";b:1;s:5:"heavy";b:1;}}s:10:"TidyRemove";i:8;s:7:"Trusted";i:7;s:5:"XHTML";i:7;}s:6:"Output";a:4:{s:21:"CommentScriptContents";i:7;s:7:"Newline";i:-1;s:8:"SortAttr";i:7;s:10:"TidyFormat";i:7;}s:4:"Test";a:1:{s:12:"ForceNoIconv";i:7;}s:3:"URI";a:16:{s:14:"AllowedSchemes";i:8;s:4:"Base";i:-1;s:13:"DefaultScheme";i:1;s:12:"DefinitionID";i:-1;s:13:"DefinitionRev";i:5;s:7:"Disable";i:7;s:15:"DisableExternal";i:7;s:24:"DisableExternalResources";i:7;s:16:"DisableResources";i:7;s:4:"Host";i:-1;s:13:"HostBlacklist";i:9;s:12:"MakeAbsolute";i:7;s:5:"Munge";i:-1;s:14:"MungeResources";i:7;s:14:"MungeSecretKey";i:-1;s:22:"OverrideAllowedSchemes";i:7;}}} \ No newline at end of file +O:25:"HTMLPurifier_ConfigSchema":2:{s:8:"defaults";a:12:{s:4:"Attr";a:11:{s:19:"AllowedFrameTargets";a:0:{}s:10:"AllowedRel";a:0:{}s:10:"AllowedRev";a:0:{}s:19:"DefaultInvalidImage";s:0:"";s:22:"DefaultInvalidImageAlt";s:13:"Invalid image";s:14:"DefaultTextDir";s:3:"ltr";s:8:"EnableID";b:0;s:11:"IDBlacklist";a:0:{}s:17:"IDBlacklistRegexp";N;s:8:"IDPrefix";s:0:"";s:13:"IDPrefixLocal";s:0:"";}s:10:"AutoFormat";a:5:{s:13:"AutoParagraph";b:0;s:6:"Custom";a:0:{}s:7:"Linkify";b:0;s:15:"PurifierLinkify";b:0;s:11:"RemoveEmpty";b:0;}s:15:"AutoFormatParam";a:1:{s:21:"PurifierLinkifyDocURL";s:3:"#%s";}s:3:"CSS";a:6:{s:14:"AllowImportant";b:0;s:11:"AllowTricky";b:0;s:17:"AllowedProperties";N;s:13:"DefinitionRev";i:1;s:12:"MaxImgLength";s:6:"1200px";s:11:"Proprietary";b:0;}s:5:"Cache";a:2:{s:14:"DefinitionImpl";s:10:"Serializer";s:14:"SerializerPath";N;}s:4:"Core";a:15:{s:17:"AggressivelyFixLt";b:0;s:13:"CollectErrors";b:0;s:13:"ColorKeywords";a:17:{s:6:"maroon";s:7:"#800000";s:3:"red";s:7:"#FF0000";s:6:"orange";s:7:"#FFA500";s:6:"yellow";s:7:"#FFFF00";s:5:"olive";s:7:"#808000";s:6:"purple";s:7:"#800080";s:7:"fuchsia";s:7:"#FF00FF";s:5:"white";s:7:"#FFFFFF";s:4:"lime";s:7:"#00FF00";s:5:"green";s:7:"#008000";s:4:"navy";s:7:"#000080";s:4:"blue";s:7:"#0000FF";s:4:"aqua";s:7:"#00FFFF";s:4:"teal";s:7:"#008080";s:5:"black";s:7:"#000000";s:6:"silver";s:7:"#C0C0C0";s:4:"gray";s:7:"#808080";}s:25:"ConvertDocumentToFragment";b:1;s:31:"DirectLexLineNumberSyncInterval";i:0;s:8:"Encoding";s:5:"utf-8";s:21:"EscapeInvalidChildren";b:0;s:17:"EscapeInvalidTags";b:0;s:24:"EscapeNonASCIICharacters";b:0;s:14:"HiddenElements";a:2:{s:6:"script";b:1;s:5:"style";b:1;}s:8:"Language";s:2:"en";s:9:"LexerImpl";N;s:19:"MaintainLineNumbers";N;s:16:"RemoveInvalidImg";b:1;s:20:"RemoveScriptContents";N;}s:6:"Filter";a:3:{s:6:"Custom";a:0:{}s:18:"ExtractStyleBlocks";b:0;s:7:"YouTube";b:0;}s:11:"FilterParam";a:3:{s:26:"ExtractStyleBlocksEscaping";b:1;s:23:"ExtractStyleBlocksScope";N;s:26:"ExtractStyleBlocksTidyImpl";N;}s:4:"HTML";a:23:{s:7:"Allowed";N;s:17:"AllowedAttributes";N;s:15:"AllowedElements";N;s:14:"AllowedModules";N;s:12:"BlockWrapper";s:1:"p";s:11:"CoreModules";a:7:{s:9:"Structure";b:1;s:4:"Text";b:1;s:9:"Hypertext";b:1;s:4:"List";b:1;s:22:"NonXMLCommonAttributes";b:1;s:19:"XMLCommonAttributes";b:1;s:16:"CommonAttributes";b:1;}s:13:"CustomDoctype";N;s:12:"DefinitionID";N;s:13:"DefinitionRev";i:1;s:7:"Doctype";N;s:19:"ForbiddenAttributes";a:0:{}s:17:"ForbiddenElements";a:0:{}s:12:"MaxImgLength";i:1200;s:6:"Parent";s:3:"div";s:11:"Proprietary";b:0;s:9:"SafeEmbed";b:0;s:10:"SafeObject";b:0;s:6:"Strict";b:0;s:7:"TidyAdd";a:0:{}s:9:"TidyLevel";s:6:"medium";s:10:"TidyRemove";a:0:{}s:7:"Trusted";b:0;s:5:"XHTML";b:1;}s:6:"Output";a:4:{s:21:"CommentScriptContents";b:1;s:7:"Newline";N;s:8:"SortAttr";b:0;s:10:"TidyFormat";b:0;}s:4:"Test";a:1:{s:12:"ForceNoIconv";b:0;}s:3:"URI";a:16:{s:14:"AllowedSchemes";a:6:{s:4:"http";b:1;s:5:"https";b:1;s:6:"mailto";b:1;s:3:"ftp";b:1;s:4:"nntp";b:1;s:4:"news";b:1;}s:4:"Base";N;s:13:"DefaultScheme";s:4:"http";s:12:"DefinitionID";N;s:13:"DefinitionRev";i:1;s:7:"Disable";b:0;s:15:"DisableExternal";b:0;s:24:"DisableExternalResources";b:0;s:16:"DisableResources";b:0;s:4:"Host";N;s:13:"HostBlacklist";a:0:{}s:12:"MakeAbsolute";b:0;s:5:"Munge";N;s:14:"MungeResources";b:0;s:14:"MungeSecretKey";N;s:22:"OverrideAllowedSchemes";b:1;}}s:4:"info";a:12:{s:4:"Attr";a:12:{s:19:"AllowedFrameTargets";i:8;s:10:"AllowedRel";i:8;s:10:"AllowedRev";i:8;s:19:"DefaultInvalidImage";i:1;s:22:"DefaultInvalidImageAlt";i:1;s:14:"DefaultTextDir";O:8:"stdClass":2:{s:4:"type";i:1;s:7:"allowed";a:2:{s:3:"ltr";b:1;s:3:"rtl";b:1;}}s:8:"EnableID";i:7;s:11:"IDBlacklist";i:9;s:17:"IDBlacklistRegexp";i:-1;s:8:"IDPrefix";i:1;s:13:"IDPrefixLocal";i:1;s:10:"DisableURI";O:8:"stdClass":3:{s:9:"namespace";s:3:"URI";s:4:"name";s:7:"Disable";s:7:"isAlias";b:1;}}s:10:"AutoFormat";a:5:{s:13:"AutoParagraph";i:7;s:6:"Custom";i:9;s:7:"Linkify";i:7;s:15:"PurifierLinkify";i:7;s:11:"RemoveEmpty";i:7;}s:15:"AutoFormatParam";a:1:{s:21:"PurifierLinkifyDocURL";i:1;}s:3:"CSS";a:6:{s:14:"AllowImportant";i:7;s:11:"AllowTricky";i:7;s:17:"AllowedProperties";i:-8;s:13:"DefinitionRev";i:5;s:12:"MaxImgLength";i:-1;s:11:"Proprietary";i:7;}s:5:"Cache";a:2:{s:14:"DefinitionImpl";i:-1;s:14:"SerializerPath";i:-1;}s:4:"Core";a:20:{s:15:"DefinitionCache";O:8:"stdClass":3:{s:9:"namespace";s:5:"Cache";s:4:"name";s:14:"DefinitionImpl";s:7:"isAlias";b:1;}s:17:"AggressivelyFixLt";i:7;s:13:"CollectErrors";i:7;s:13:"ColorKeywords";i:10;s:25:"ConvertDocumentToFragment";i:7;s:19:"AcceptFullDocuments";O:8:"stdClass":3:{s:9:"namespace";s:4:"Core";s:4:"name";s:25:"ConvertDocumentToFragment";s:7:"isAlias";b:1;}s:31:"DirectLexLineNumberSyncInterval";i:5;s:8:"Encoding";i:2;s:21:"EscapeInvalidChildren";i:7;s:17:"EscapeInvalidTags";i:7;s:24:"EscapeNonASCIICharacters";i:7;s:14:"HiddenElements";i:8;s:8:"Language";i:1;s:9:"LexerImpl";i:-11;s:19:"MaintainLineNumbers";i:-7;s:16:"RemoveInvalidImg";i:7;s:20:"RemoveScriptContents";i:-7;s:5:"XHTML";O:8:"stdClass":3:{s:9:"namespace";s:4:"HTML";s:4:"name";s:5:"XHTML";s:7:"isAlias";b:1;}s:21:"CommentScriptContents";O:8:"stdClass":3:{s:9:"namespace";s:6:"Output";s:4:"name";s:21:"CommentScriptContents";s:7:"isAlias";b:1;}s:10:"TidyFormat";O:8:"stdClass":3:{s:9:"namespace";s:6:"Output";s:4:"name";s:10:"TidyFormat";s:7:"isAlias";b:1;}}s:6:"Filter";a:5:{s:6:"Custom";i:9;s:18:"ExtractStyleBlocks";i:7;s:7:"YouTube";i:7;s:26:"ExtractStyleBlocksEscaping";O:8:"stdClass":3:{s:9:"namespace";s:11:"FilterParam";s:4:"name";s:26:"ExtractStyleBlocksEscaping";s:7:"isAlias";b:1;}s:23:"ExtractStyleBlocksScope";O:8:"stdClass":3:{s:9:"namespace";s:11:"FilterParam";s:4:"name";s:23:"ExtractStyleBlocksScope";s:7:"isAlias";b:1;}}s:11:"FilterParam";a:3:{s:26:"ExtractStyleBlocksEscaping";i:7;s:23:"ExtractStyleBlocksScope";i:-1;s:26:"ExtractStyleBlocksTidyImpl";i:-11;}s:4:"HTML";a:24:{s:12:"EnableAttrID";O:8:"stdClass":3:{s:9:"namespace";s:4:"Attr";s:4:"name";s:8:"EnableID";s:7:"isAlias";b:1;}s:7:"Allowed";i:-4;s:17:"AllowedAttributes";i:-8;s:15:"AllowedElements";i:-8;s:14:"AllowedModules";i:-8;s:12:"BlockWrapper";i:1;s:11:"CoreModules";i:8;s:13:"CustomDoctype";i:-1;s:12:"DefinitionID";i:-1;s:13:"DefinitionRev";i:5;s:7:"Doctype";O:8:"stdClass":3:{s:4:"type";i:1;s:10:"allow_null";b:1;s:7:"allowed";a:5:{s:22:"HTML 4.01 Transitional";b:1;s:16:"HTML 4.01 Strict";b:1;s:22:"XHTML 1.0 Transitional";b:1;s:16:"XHTML 1.0 Strict";b:1;s:9:"XHTML 1.1";b:1;}}s:19:"ForbiddenAttributes";i:8;s:17:"ForbiddenElements";i:8;s:12:"MaxImgLength";i:-5;s:6:"Parent";i:1;s:11:"Proprietary";i:7;s:9:"SafeEmbed";i:7;s:10:"SafeObject";i:7;s:6:"Strict";i:7;s:7:"TidyAdd";i:8;s:9:"TidyLevel";O:8:"stdClass":2:{s:4:"type";i:1;s:7:"allowed";a:4:{s:4:"none";b:1;s:5:"light";b:1;s:6:"medium";b:1;s:5:"heavy";b:1;}}s:10:"TidyRemove";i:8;s:7:"Trusted";i:7;s:5:"XHTML";i:7;}s:6:"Output";a:4:{s:21:"CommentScriptContents";i:7;s:7:"Newline";i:-1;s:8:"SortAttr";i:7;s:10:"TidyFormat";i:7;}s:4:"Test";a:1:{s:12:"ForceNoIconv";i:7;}s:3:"URI";a:16:{s:14:"AllowedSchemes";i:8;s:4:"Base";i:-1;s:13:"DefaultScheme";i:1;s:12:"DefinitionID";i:-1;s:13:"DefinitionRev";i:5;s:7:"Disable";i:7;s:15:"DisableExternal";i:7;s:24:"DisableExternalResources";i:7;s:16:"DisableResources";i:7;s:4:"Host";i:-1;s:13:"HostBlacklist";i:9;s:12:"MakeAbsolute";i:7;s:5:"Munge";i:-1;s:14:"MungeResources";i:7;s:14:"MungeSecretKey";i:-1;s:22:"OverrideAllowedSchemes";i:7;}}} \ No newline at end of file diff --git a/library/HTMLPurifier/ConfigSchema/schema/AutoFormat.RemoveEmpty.txt b/library/HTMLPurifier/ConfigSchema/schema/AutoFormat.RemoveEmpty.txt new file mode 100644 index 00000000..83f9eae5 --- /dev/null +++ b/library/HTMLPurifier/ConfigSchema/schema/AutoFormat.RemoveEmpty.txt @@ -0,0 +1,44 @@ +AutoFormat.RemoveEmpty +TYPE: bool +VERSION: 3.1.2 +DEFAULT: false +--DESCRIPTION-- +

+ When enabled, HTML Purifier will attempt to remove empty elements that + contribute no semantic information to the document. The following types + of nodes will be removed: +

+ +

+ Please be very careful when using this functionality; while it may not + seem that empty elements contain useful information, they can alter the + layout of a document given appropriate styling. This directive is most + useful when you are processing machine-generated HTML, please avoid using + it on regular user HTML. +

+

+ Elements that contain only whitespace will be treated as empty. Non-breaking + spaces, however, do not count as whitespace. +

+

+ This algorithm is not perfect; you may still notice some empty tags, + particularly if a node had elements, but those elements were later removed + because they were not permitted in that context, or tags that, after + being auto-closed by another tag, where empty. This is for safety reasons + to prevent clever code from breaking validation. The general rule of thumb: + if a tag looked empty on the way end, it will get removed; if HTML Purifier + made it empty, it will stay. +

diff --git a/library/HTMLPurifier/Injector.php b/library/HTMLPurifier/Injector.php index 1947c340..ee10934a 100644 --- a/library/HTMLPurifier/Injector.php +++ b/library/HTMLPurifier/Injector.php @@ -54,6 +54,30 @@ abstract class HTMLPurifier_Injector */ public $needed = array(); + /** + * Index of inputTokens to rewind to. + */ + protected $rewind = false; + + /** + * Rewind to a spot to re-perform processing. This is useful if you + * deleted a node, and now need to see if this change affected any + * earlier nodes. Rewinding does not affect other injectors, and can + * result in infinite loops if not used carefully. + */ + public function rewind($index) { + $this->rewind = $index; + } + + /** + * Retrieves rewind, and then unsets it. + */ + public function getRewind() { + $r = $this->rewind; + $this->rewind = false; + return $r; + } + /** * Prepares the injector by giving it the config and context objects: * this allows references to important variables to be made within diff --git a/library/HTMLPurifier/Injector/RemoveEmpty.php b/library/HTMLPurifier/Injector/RemoveEmpty.php new file mode 100644 index 00000000..53bb0f00 --- /dev/null +++ b/library/HTMLPurifier/Injector/RemoveEmpty.php @@ -0,0 +1,40 @@ +config = $config; + $this->context = $context; + $this->attrValidator = new HTMLPurifier_AttrValidator(); + } + + public function handleElement(&$token) { + if (!$token instanceof HTMLPurifier_Token_Start) return; + $next = false; + for ($i = $this->inputIndex + 1, $c = count($this->inputTokens); $i < $c; $i++) { + $next = $this->inputTokens[$i]; + if ($next instanceof HTMLPurifier_Token_Text && $next->is_whitespace) continue; + break; + } + if (!$next || ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name)) { + if ($token->name == 'colgroup') return; + $this->attrValidator->validateToken($token, $this->config, $this->context); + $token->armor['ValidateAttributes'] = true; + if (isset($token->attr['id']) || isset($token->attr['name'])) return; + $token = $i - $this->inputIndex + 1; + for ($b = $this->inputIndex - 1; $b > 0; $b--) { + $prev = $this->inputTokens[$b]; + if ($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace) continue; + break; + } + // This is safe because we removed the token that triggered this. + $this->rewind($b - 1); + return; + } + } + +} diff --git a/library/HTMLPurifier/Strategy/MakeWellFormed.php b/library/HTMLPurifier/Strategy/MakeWellFormed.php index b0001f0b..7601b857 100644 --- a/library/HTMLPurifier/Strategy/MakeWellFormed.php +++ b/library/HTMLPurifier/Strategy/MakeWellFormed.php @@ -100,7 +100,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // injector handler code; duplicated for performance reasons foreach ($this->injectors as $i => $injector) { if (!$injector->skip) $injector->handleText($token); - if (is_array($token)) { + if (is_array($token) || is_int($token)) { $this->currentInjector = $i; break; } @@ -144,7 +144,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy if ($e) $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent); // insert parent end tag before this tag; // end tag isn't processed, but this tag is processed again - $this->insertBefore(new HTMLPurifier_Token_End($parent->name)); + $new_token = new HTMLPurifier_Token_End($parent->name); + $new_token->start = $parent; + $this->insertBefore($new_token); continue; } @@ -157,7 +159,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy if ($ok) { foreach ($this->injectors as $i => $injector) { if (!$injector->skip) $injector->handleElement($token); - if (is_array($token)) { + if (is_array($token) || is_int($token)) { $this->currentInjector = $i; break; } @@ -189,6 +191,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // first, check for the simplest case: everything closes neatly $current_parent = array_pop($this->currentNesting); if ($current_parent->name == $token->name) { + $token->start = $current_parent; foreach ($this->injectors as $i => $injector) { $injector->notifyEnd($token); } @@ -236,6 +239,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]); } $new_token = new HTMLPurifier_Token_End($skipped_tags[$i]->name); + $new_token->start = $skipped_tags[$i]; $this->insertAfter($new_token); //printTokens($tokens, $this->inputIndex); //var_dump($this->currentNesting); @@ -261,6 +265,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy } // instead of splice, since we know this is the end $tokens[] = $new_token = new HTMLPurifier_Token_End($this->currentNesting[$i]->name); + $new_token->start = $this->currentNesting[$i]; foreach ($this->injectors as $injector) { $injector->notifyEnd($new_token); } @@ -313,34 +318,59 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy * If $token is false, the current token is deleted. */ protected function processToken($token, $config, $context) { - if (is_array($token)) { + if (is_array($token) || is_int($token)) { // the original token was overloaded by an injector, time // to some fancy acrobatics - - // $this->inputIndex is decremented so that the entire set gets - // re-processed - array_splice($this->inputTokens, $this->inputIndex--, 1, $token); - - // adjust the injector skips based on the array substitution + if (is_array($token)) { + array_splice($this->inputTokens, $this->inputIndex, 1, $token); + } else { + array_splice($this->inputTokens, $this->inputIndex, $token, array()); + } if ($this->injectors) { - $offset = count($token); - for ($i = 0; $i <= $this->currentInjector; $i++) { - // because of the skip back, we need to add one more - // for uninitialized injectors. I'm not exactly - // sure why this is the case, but I think it has to - // do with the fact that we're decrementing skips - // before re-checking text - if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++; - $this->injectors[$i]->skip += $offset; + $rewind = $this->injectors[$this->currentInjector]->getRewind(); + if ($rewind < 0) $rewind = 0; + if ($rewind !== false) { + $offset = $this->inputIndex - $rewind; + if ($this->injectors) { + foreach ($this->injectors as $i => $injector) { + if ($i == $this->currentInjector) { + $injector->skip = 0; + } else { + $injector->skip += $offset; + } + } + } + for ($this->inputIndex--; $this->inputIndex >= $rewind; $this->inputIndex--) { + $prev = $this->inputTokens[$this->inputIndex]; + if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->currentNesting); + elseif ($prev instanceof HTMLPurifier_Token_End) $this->currentNesting[] = $prev->start; + } + $this->inputIndex++; + } else { + // adjust the injector skips based on the array substitution + $offset = is_array($token) ? count($token) : 0; + for ($i = 0; $i <= $this->currentInjector; $i++) { + // because of the skip back, we need to add one more + // for uninitialized injectors. I'm not exactly + // sure why this is the case, but I think it has to + // do with the fact that we're decrementing skips + // before re-checking text + if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++; + $this->injectors[$i]->skip += $offset; + } } } + // ensure that we reprocess these tokens with the other injectors + --$this->inputIndex; + } elseif ($token) { // regular case $this->swap($token); if ($token instanceof HTMLPurifier_Token_Start) { $this->currentNesting[] = $token; } elseif ($token instanceof HTMLPurifier_Token_End) { - array_pop($this->currentNesting); // not actually used + // not actually used + $token->start = array_pop($this->currentNesting); } } else { $this->remove(); diff --git a/library/HTMLPurifier/Token/End.php b/library/HTMLPurifier/Token/End.php index fbf45e81..13cba031 100644 --- a/library/HTMLPurifier/Token/End.php +++ b/library/HTMLPurifier/Token/End.php @@ -9,5 +9,9 @@ */ class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag { - + /** + * Token that started this node. Added by MakeWellFormed. Please + * do not edit this! + */ + public $start; } diff --git a/tests/HTMLPurifier/Injector/RemoveEmptyTest.php b/tests/HTMLPurifier/Injector/RemoveEmptyTest.php new file mode 100644 index 00000000..4726a5df --- /dev/null +++ b/tests/HTMLPurifier/Injector/RemoveEmptyTest.php @@ -0,0 +1,58 @@ +config->set('AutoFormat', 'RemoveEmpty', true); + } + + function testPreserve() { + $this->assertResult('asdf'); + } + + function testRemove() { + $this->assertResult('', ''); + } + + function testRemoveWithSpace() { + $this->assertResult(' ', ''); + } + + function testRemoveWithAttr() { + $this->assertResult('', ''); + } + + function testRemoveIdAndName() { + $this->assertResult('', ''); + } + + function testPreserveColgroup() { + $this->assertResult(''); + } + + function testPreserveId() { + $this->config->set('Attr', 'EnableID', true); + $this->assertResult(''); + } + + function testPreserveName() { + $this->config->set('Attr', 'EnableID', true); + $this->assertResult(''); + } + + function testRemoveNested() { + $this->assertResult('', ''); + } + + function testRemoveNested2() { + $this->assertResult('', ''); + } + + function testRemoveNested3() { + $this->assertResult(' ', ''); + } + +} + diff --git a/tests/HTMLPurifier/Injector/SafeObjectTest.php b/tests/HTMLPurifier/Injector/SafeObjectTest.php index 276ef92b..010f0707 100644 --- a/tests/HTMLPurifier/Injector/SafeObjectTest.php +++ b/tests/HTMLPurifier/Injector/SafeObjectTest.php @@ -10,6 +10,7 @@ class HTMLPurifier_Injector_SafeObjectTest extends HTMLPurifier_InjectorHarness function setup() { parent::setup(); + // there is no AutoFormat.SafeObject directive $this->config->set('AutoFormat', 'Custom', array(new HTMLPurifier_Injector_SafeObject())); $this->config->set('HTML', 'Trusted', true); } diff --git a/tests/HTMLPurifier/Strategy/MakeWellFormed_InjectorTest.php b/tests/HTMLPurifier/Strategy/MakeWellFormed_InjectorTest.php index 039d00a6..a8a9c58c 100644 --- a/tests/HTMLPurifier/Strategy/MakeWellFormed_InjectorTest.php +++ b/tests/HTMLPurifier/Strategy/MakeWellFormed_InjectorTest.php @@ -8,14 +8,19 @@ class HTMLPurifier_Strategy_MakeWellFormed_InjectorTest extends HTMLPurifier_Str $this->obj = new HTMLPurifier_Strategy_MakeWellFormed(); $this->config->set('AutoFormat', 'AutoParagraph', true); $this->config->set('AutoFormat', 'Linkify', true); + $this->config->set('AutoFormat', 'RemoveEmpty', true); generate_mock_once('HTMLPurifier_Injector'); } function testEndNotification() { $mock = new HTMLPurifier_InjectorMock(); $mock->skip = false; - $mock->expectAt(0, 'notifyEnd', array(new HTMLPurifier_Token_End('b'))); - $mock->expectAt(1, 'notifyEnd', array(new HTMLPurifier_Token_End('i'))); + $b = new HTMLPurifier_Token_End('b'); + $b->start = new HTMLPurifier_Token_Start('b'); + $mock->expectAt(0, 'notifyEnd', array($b)); + $i = new HTMLPurifier_Token_End('i'); + $i->start = new HTMLPurifier_Token_Start('i'); + $mock->expectAt(1, 'notifyEnd', array($i)); $mock->expectCallCount('notifyEnd', 2); $this->config->set('AutoFormat', 'AutoParagraph', false); $this->config->set('AutoFormat', 'Linkify', false); @@ -92,4 +97,20 @@ class HTMLPurifier_Strategy_MakeWellFormed_InjectorTest extends HTMLPurifier_Str ); } + function testEmptyAndParagraph() { + // This is a fairly degenerate case, but it demonstrates that + // the two don't error out together, at least. + $this->assertResult( + "

asdf\n\nasdf

\n\n

", + "

asdf

asdf

" + ); + } + + function testRewindAndParagraph() { + $this->assertResult( + "bar\n\n

\n\n

\n\nfoo", + "

bar

foo

" + ); + } + }