From d48f9b6b21e824451b9375402c66fb5679d33987 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 12 Nov 2006 19:26:49 +0000 Subject: [PATCH] [1.2.0] - Update TODO . Add another possible plaintext formatter . Reference config-ideas.txt for URI options - Update code-quality.txt, removing issues that have been addressed and updating time for post-beta - Update config-ideas.txt . Added more possible URI directives . Removed silly language control directive - Improved documentation on Class, CSS and Host git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@524 48356398-32a2-884e-a903-53898d9a118a --- NEWS | 4 ++++ TODO | 4 +++- docs/code-quality.txt | 18 ++++++----------- docs/config-ideas.txt | 28 ++++++++++++++------------ library/HTMLPurifier/AttrDef/CSS.php | 1 + library/HTMLPurifier/AttrDef/Class.php | 5 +++-- library/HTMLPurifier/AttrDef/Host.php | 4 +++- 7 files changed, 35 insertions(+), 29 deletions(-) diff --git a/NEWS b/NEWS index c6cf6f74..9e127eee 100644 --- a/NEWS +++ b/NEWS @@ -19,7 +19,11 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier + TODO added request Phalanger + TODO added request Native compression + TODO added request Remove redundant tags + + TODO added possible plaintext formatter for HTML Purifier documentation + Updated ConfigDoc TODO + + Updated code-quality.txt, removing issues that have been resolved + + Improved inline comments in AttrDef/Class.php, AttrDef/CSS.php + and AttrDef/Host.php . Switched to purify()-wide Context object registry . Refactored unit tests to minimize duplication . 
XSS attack sheet updated diff --git a/TODO b/TODO index bd8e3046..b24b6292 100644 --- a/TODO +++ b/TODO @@ -3,7 +3,7 @@ TODO List 1.2 release - Make URI validation routines tighter (especially mailto) - - More extensive URI filtering schemes + - More extensive URI filtering schemes (see URI in config-ideas.txt) - Allow for background-image and list-style-image (see above) - Distinguish between different types of URIs, for instance, a mailto URI in IMG SRC is nonsensical @@ -29,6 +29,8 @@ TODO List shouldn't be paragraphed, such as lists and tables). - Linkify URLs - Smileys + - Linkification for HTML Purifier docs: notably configuration and + class names 3.0 release - Extended HTML capabilities based on namespacing and tag transforms diff --git a/docs/code-quality.txt b/docs/code-quality.txt index 5b54b699..8fd8e4d3 100644 --- a/docs/code-quality.txt +++ b/docs/code-quality.txt @@ -4,11 +4,8 @@ Code Quality Issues Okay, face it. Programmers can get lazy, cut corners, or make mistakes. They also can do quick prototypes, and then forget to rewrite them later. Well, while I can't list mistakes in here, I can list prototype-like segments -of code that should be aggressively refactored after the beta is released. -This does not list optimization issues, that needs to be done after intense -profiling. - -Here we go: +of code that should be aggressively refactored. This does not list +optimization issues, that needs to be done after intense profiling. 
AttrDef Class - doesn't support Unicode characters (fringe); uses regular @@ -16,12 +13,10 @@ AttrDef Lang - code duplication; premature optimization; doesn't consult official lists (fringe) Length - easily mistaken for CSSLength - URI - multiple regular expressions; needs host validation routines factored - out for mailto scheme; missing validation for query; fragment and path, - no percent-encode fixing + URI - multiple regular expressions; missing validation for query, + fragment and path CSS - parser doesn't accept advanced CSS (fringe) - Number - constructor interface is inconsistent with Integer -AttrTransform - doesn't accept AttrContext + Number - constructor interface inconsistent with Integer Config - "load configuration" hooks missing, rich set* accessors missing ConfigSchema - redefinition is a mess Strategy @@ -31,8 +26,7 @@ Strategy might be efficient). RemoveForeignElements - should be run in parallel with MakeWellFormed URIScheme - needs to have callable generic checks - ftp - missing typecode check - mailto - doesn't validate emails + mailto - doesn't validate emails, doesn't validate querystring news - doesn't validate opaque path nntp - doesn't constrain path EOL diff --git a/docs/config-ideas.txt b/docs/config-ideas.txt index 3dfa2b31..79e09b00 100644 --- a/docs/config-ideas.txt +++ b/docs/config-ideas.txt @@ -17,24 +17,17 @@ time. Note the naming convention: %Namespace.Directive %Attr.ClassBlacklist. When it's Whitelist, only allow those in %Attr.ClassWhitelist. -%Attr.LangAlphaOnly - designate whether or not to allow numerals in language - code subtags - * RFC 1766, the current standard referenced by XML, does not permit - numbers, but, - * RFC 3066, the superseding best practice standard since January 2001, - permits them. - We allow numbers by default, but you generally never see them - at all, which makes this a little more sane. - %Attr.MaxWidth, %Attr.MaxHeight - caps for width and height related checks. 
- (a hack in Pixels for an image crashing attack could be replaced by this) + (the hack in Pixels for an image crashing attack could be replaced by this) -%URI.Munge - will munge all URIs to a different URI, which should redirect +%URI.Munge - will munge all external URIs to a different URI, which redirects the user to the applicable page. A urlencoded version of the URI will replace any instances of %s in the string. One possible string is 'http://www.google.com/url?q=%s'. Useful for preventing - pagerank from being sent to other sites + pagerank from being sent to other sites, but can also be used to + redirect to a splash page notifying user that they are leaving your + website. %URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the spread of ill-gotten pagerank @@ -49,7 +42,16 @@ time. Note the naming convention: %Namespace.Directive 'DenyAll' or 'AllowAll' (default) %URI.DisableIPHosts - URIs that have IP addresses for hosts are disallowed. - Be sure to also grab unusual encodings (dword, hex and octal) + Be sure to also grab unusual encodings (dword, hex and octal), which may + currently be caught by regular DNS +%URI.DisableAbsoluteDNS - Remove extra dots after host names that trigger + absolute DNS. While this is actually the preferred method according to + the RFC, most people opt to use a relative domain name relative to . (root). +%URI.DisableIDN - Disallow raw internationalized domain names. Punycode + will still be permitted. + +%URI.ConvertUnusualIPHosts - transform dword/hex/octal IP addresses to the + regular form %URI.DisableExternalResources - disallow resource links (i.e.
URIs that result in immediate requests, such as src in IMG) to external websites diff --git a/library/HTMLPurifier/AttrDef/CSS.php b/library/HTMLPurifier/AttrDef/CSS.php index 1ba0e219..404c7000 100644 --- a/library/HTMLPurifier/AttrDef/CSS.php +++ b/library/HTMLPurifier/AttrDef/CSS.php @@ -43,6 +43,7 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef $propvalues[$property] = $result; } + // procedure does not write the new CSS simultaneously, so it's // slightly inefficient, but it's the only way of getting rid of // duplicates. Perhaps config to optimize it, but not now. diff --git a/library/HTMLPurifier/AttrDef/Class.php b/library/HTMLPurifier/AttrDef/Class.php index 551eb332..5f86823a 100644 --- a/library/HTMLPurifier/AttrDef/Class.php +++ b/library/HTMLPurifier/AttrDef/Class.php @@ -24,13 +24,14 @@ class HTMLPurifier_AttrDef_Class extends HTMLPurifier_AttrDef // and plus it would complicate optimization efforts (you never // see that anyway). $matches = array(); - $pattern = '/(?:(?<=\s)|\A)'. + $pattern = '/(?:(?<=\s)|\A)'. // look behind for space or string start '((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)'. - '(?:(?=\s)|\z)/'; + '(?:(?=\s)|\z)/'; // look ahead for space or string end preg_match_all($pattern, $string, $matches); if (empty($matches[1])) return false; + // reconstruct class string $new_string = ''; foreach ($matches[1] as $class_names) { $new_string .= $class_names . ' '; diff --git a/library/HTMLPurifier/AttrDef/Host.php b/library/HTMLPurifier/AttrDef/Host.php index 47bce063..94350765 100644 --- a/library/HTMLPurifier/AttrDef/Host.php +++ b/library/HTMLPurifier/AttrDef/Host.php @@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef/IPv4.php'; require_once 'HTMLPurifier/AttrDef/IPv6.php'; /** - * Validates a host according to the IPv4, IPv6 and DNS specifications. + * Validates a host according to the IPv4, IPv6 and DNS (future) specifications. 
*/ class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef { @@ -35,6 +35,8 @@ class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef if ($valid === false) return false; return '['. $valid . ']'; } + + // need to do checks on unusual encodings too $ipv4 = $this->ipv4->validate($string, $config, $context); if ($ipv4 !== false) return $ipv4;