1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-04 13:18:00 +02:00

Fix two bugs with caching of customized raw definitions.

The first bug is that we will repeatedly write out the result
of a customized raw definition to the filesystem, even when a cache
entry already exists.

The second bug is that caching these definitions doesn't actually
work (the cache entry is written but never used.)  A new API
for retrieving raw definitions permits the user to take advantage
of caching.

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
This commit is contained in:
Edward Z. Yang
2010-12-30 23:51:53 +00:00
parent 6dcc37cb55
commit f3d050c517
11 changed files with 375 additions and 91 deletions

View File

@@ -76,7 +76,8 @@ class HTMLPurifier_Config
/**
* Set to false if you do not want line and file numbers in errors
* (useful when unit testing)
* (useful when unit testing). This will also compress some errors
* and exceptions.
*/
public $chatty = true;
@@ -318,26 +319,64 @@ class HTMLPurifier_Config
* Retrieves object reference to the HTML definition.
* @param $raw Return a copy that has not been setup yet. Must be
* called before it's been setup, otherwise won't work.
* @param $optimized If true, this method may return null, to
* indicate that a cached version of the modified
* definition object is available and no further edits
* are necessary. Consider using
* maybeGetRawHTMLDefinition, which is more explicitly
* named, instead.
*/
public function getHTMLDefinition($raw = false) {
return $this->getDefinition('HTML', $raw);
public function getHTMLDefinition($raw = false, $optimized = false) {
return $this->getDefinition('HTML', $raw, $optimized);
}
/**
* Retrieves object reference to the CSS definition
* @param $raw Return a copy that has not been setup yet. Must be
* called before it's been setup, otherwise won't work.
* @param $optimized If true, this method may return null, to
* indicate that a cached version of the modified
* definition object is available and no further edits
* are necessary. Consider using
* maybeGetRawCSSDefinition, which is more explicitly
* named, instead.
*/
public function getCSSDefinition($raw = false) {
return $this->getDefinition('CSS', $raw);
public function getCSSDefinition($raw = false, $optimized = false) {
return $this->getDefinition('CSS', $raw, $optimized);
}
/**
* Retrieves object reference to the URI definition
* @param $raw Return a copy that has not been setup yet. Must be
* called before it's been setup, otherwise won't work.
* @param $optimized If true, this method may return null, to
* indicate that a cached version of the modified
* definition object is available and no further edits
* are necessary. Consider using
* maybeGetRawURIDefinition, which is more explicitly
* named, instead.
*/
public function getURIDefinition($raw = false, $optimized = false) {
return $this->getDefinition('URI', $raw, $optimized);
}
/**
* Retrieves a definition
* @param $type Type of definition: HTML, CSS, etc
* @param $raw Whether or not definition should be returned raw
* @param $optimized Only has an effect when $raw is true. Whether
* or not to return null if the result is already present in
* the cache. This is off by default for backwards
* compatibility reasons, but you need to do things this
* way in order to ensure that caching is done properly.
* Check out enduser-customize.html for more details.
* We probably won't ever change this default, as much as the
* maybe semantics is the "right thing to do."
*/
public function getDefinition($type, $raw = false) {
public function getDefinition($type, $raw = false, $optimized = false) {
if ($optimized && !$raw) {
throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
}
if (!$this->finalized) $this->autoFinalize();
// temporarily suspend locks, so we can handle recursive definition calls
$lock = $this->lock;
@@ -346,52 +385,137 @@ class HTMLPurifier_Config
$cache = $factory->create($type, $this);
$this->lock = $lock;
if (!$raw) {
// see if we can quickly supply a definition
// full definition
// ---------------
// check if definition is in memory
if (!empty($this->definitions[$type])) {
if (!$this->definitions[$type]->setup) {
$this->definitions[$type]->setup($this);
$cache->set($this->definitions[$type], $this);
$def = $this->definitions[$type];
// check if the definition is setup
if ($def->setup) {
return $def;
} else {
$def->setup($this);
if ($def->optimized) $cache->add($def, $this);
return $def;
}
return $this->definitions[$type];
}
// memory check missed, try cache
$this->definitions[$type] = $cache->get($this);
if ($this->definitions[$type]) {
// definition in cache, return it
return $this->definitions[$type];
// check if definition is in cache
$def = $cache->get($this);
if ($def) {
// definition in cache, save to memory and return it
$this->definitions[$type] = $def;
return $def;
}
} elseif (
!empty($this->definitions[$type]) &&
!$this->definitions[$type]->setup
) {
// raw requested, raw in memory, quick return
return $this->definitions[$type];
// initialize it
$def = $this->initDefinition($type);
// set it up
$this->lock = $type;
$def->setup($this);
$this->lock = null;
// save in cache
$cache->add($def, $this);
// return it
return $def;
} else {
// raw definition
// --------------
// check preconditions
$def = null;
if ($optimized) {
if (is_null($this->get($type . '.DefinitionID'))) {
// fatally error out if definition ID not set
throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
}
}
if (!empty($this->definitions[$type])) {
$def = $this->definitions[$type];
if ($def->setup && !$optimized) {
$extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : "";
throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra);
}
if ($def->optimized === null) {
$extra = $this->chatty ? " (try flushing your cache)" : "";
throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra);
}
if ($def->optimized !== $optimized) {
$msg = $optimized ? "optimized" : "unoptimized";
$extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : "";
throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra);
}
}
// check if definition was in memory
if ($def) {
if ($def->setup) {
// invariant: $optimized === true (checked above)
return null;
} else {
return $def;
}
}
// if optimized, check if definition was in cache
// (because we do the memory check first, this formulation
// is prone to cache slamming, but I think
// guaranteeing that either /all/ of the raw
// setup code or /none/ of it is run is more important.)
if ($optimized) {
// This code path only gets run once; once we put
// something in $definitions (which is guaranteed by the
// trailing code), we always short-circuit above.
$def = $cache->get($this);
if ($def) {
// save the full definition for later, but don't
// return it yet
$this->definitions[$type] = $def;
return null;
}
}
// check invariants for creation
if (!$optimized) {
if (!is_null($this->get($type . '.DefinitionID'))) {
if ($this->chatty) {
$this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING);
} else {
$this->triggerError("Useless DefinitionID declaration", E_USER_WARNING);
}
}
}
// initialize it
$def = $this->initDefinition($type);
$def->optimized = $optimized;
return $def;
}
throw new HTMLPurifier_Exception("The impossible happened!");
}
private function initDefinition($type) {
// quick checks failed, let's create the object
if ($type == 'HTML') {
$this->definitions[$type] = new HTMLPurifier_HTMLDefinition();
$def = new HTMLPurifier_HTMLDefinition();
} elseif ($type == 'CSS') {
$this->definitions[$type] = new HTMLPurifier_CSSDefinition();
$def = new HTMLPurifier_CSSDefinition();
} elseif ($type == 'URI') {
$this->definitions[$type] = new HTMLPurifier_URIDefinition();
$def = new HTMLPurifier_URIDefinition();
} else {
throw new HTMLPurifier_Exception("Definition of $type type not supported");
}
// quick abort if raw
if ($raw) {
if (is_null($this->get($type . '.DefinitionID'))) {
// fatally error out if definition ID not set
throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
}
return $this->definitions[$type];
}
// set it up
$this->lock = $type;
$this->definitions[$type]->setup($this);
$this->lock = null;
// save in cache
$cache->set($this->definitions[$type], $this);
return $this->definitions[$type];
$this->definitions[$type] = $def;
return $def;
}
public function maybeGetRawDefinition($name) {
return $this->getDefinition($name, true, true);
}
public function maybeGetRawHTMLDefinition() {
return $this->getDefinition('HTML', true, true);
}
public function maybeGetRawCSSDefinition() {
return $this->getDefinition('CSS', true, true);
}
public function maybeGetRawURIDefinition() {
return $this->getDefinition('URI', true, true);
}
/**
@@ -549,17 +673,22 @@ class HTMLPurifier_Config
/**
* Produces a nicely formatted error message by supplying the
* stack frame information from two levels up and OUTSIDE of
* HTMLPurifier_Config.
* stack frame information OUTSIDE of HTMLPurifier_Config.
*/
protected function triggerError($msg, $no) {
// determine previous stack frame
$backtrace = debug_backtrace();
if ($this->chatty && isset($backtrace[1])) {
$frame = $backtrace[1];
$extra = " on line {$frame['line']} in file {$frame['file']}";
} else {
$extra = '';
$extra = '';
if ($this->chatty) {
$trace = debug_backtrace();
// zip(tail(trace), trace) -- but PHP is not Haskell har har
for ($i = 0, $c = count($trace); $i < $c - 1; $i++) {
if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') {
continue;
}
$frame = $trace[$i];
$extra = " invoked on line {$frame['line']} in file {$frame['file']}";
break;
}
}
trigger_error($msg . $extra, $no);
}

View File

@@ -1,6 +1,6 @@
HTML.Nofollow
TYPE: bool
VERSION: 4.2.1
VERSION: 4.3.0
DEFAULT: FALSE
--DESCRIPTION--
If enabled, nofollow rel attributes are added to all outgoing links.

View File

@@ -12,6 +12,17 @@ abstract class HTMLPurifier_Definition
*/
public $setup = false;
/**
* If true, write out the final definition object to the cache after
* setup. This will be true only if all invocations to get a raw
* definition object are also optimized. This does not cause file
* system thrashing because on subsequent calls the cached object
* is used and any writes to the raw definition object are short
* circuited. See enduser-customize.html for the high-level
* picture.
*/
public $optimized = null;
/**
* What type of definition is it?
*/