From d1187ed331b457f8f90ead5e97857e28d61783cf Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Tue, 29 May 2007 18:19:42 +0000 Subject: [PATCH] [1.7.0] Add versioning to serializer cache - Make some AttrDef member-variables lazy-loading to save serialization space, clean up others - Refactor get*Definition() methods git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1116 48356398-32a2-884e-a903-53898d9a118a --- TODO | 3 +- library/HTMLPurifier/AttrDef/URI/IPv4.php | 16 ++- library/HTMLPurifier/AttrDef/URI/IPv6.php | 2 + library/HTMLPurifier/Config.php | 113 +++++++++--------- library/HTMLPurifier/DefinitionCache.php | 26 +++- .../DefinitionCache/Serializer.php | 14 ++- library/HTMLPurifier/ElementDef.php | 4 + library/HTMLPurifier/HTMLDefinition.php | 6 + .../DefinitionCache/SerializerTest.php | 36 +++++- tests/HTMLPurifier/DefinitionCacheHarness.php | 4 +- tests/HTMLPurifier/DefinitionCacheTest.php | 16 +++ 11 files changed, 169 insertions(+), 71 deletions(-) diff --git a/TODO b/TODO index c56f6637..fbc18bf9 100644 --- a/TODO +++ b/TODO @@ -13,8 +13,7 @@ TODO List - Reorganize configuration directives - Set up anonymous module management by HTMLDefinition (Advanced API) - Get all AttrTypes into string form - # Clean up HTMLDefinition caching, need easy cache invalidation, - versioning of caches, etc. + # Clean up HTMLDefinition caching, need easy cache invalidation. - Parse TinyMCE-style whitelist into our %HTML.Allow* whitelists 1.8 release [Refactor, refactor!] diff --git a/library/HTMLPurifier/AttrDef/URI/IPv4.php b/library/HTMLPurifier/AttrDef/URI/IPv4.php index 0730bbc8..b9aa98ae 100644 --- a/library/HTMLPurifier/AttrDef/URI/IPv4.php +++ b/library/HTMLPurifier/AttrDef/URI/IPv4.php @@ -15,13 +15,10 @@ class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef */ var $ip4; - function HTMLPurifier_AttrDef_URI_IPv4() { - $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255 - $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})"; - } - function validate($aIP, $config, &$context) { + if (!$this->ip4) $this->_loadRegex(); + if (preg_match('#^' . $this->ip4 . '$#s', $aIP)) { return $aIP; @@ -31,6 +28,15 @@ class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef } + /** + * Lazy load function to prevent regex from being stuffed in + * cache. + */ + function _loadRegex() { + $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255 + $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})"; + } + } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/AttrDef/URI/IPv6.php b/library/HTMLPurifier/AttrDef/URI/IPv6.php index 73f085e5..8de8f68f 100644 --- a/library/HTMLPurifier/AttrDef/URI/IPv6.php +++ b/library/HTMLPurifier/AttrDef/URI/IPv6.php @@ -13,6 +13,8 @@ class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4 function validate($aIP, $config, &$context) { + if (!$this->ip4) $this->_loadRegex(); + $original = $aIP; $hex = '[0-9a-fA-F]'; diff --git a/library/HTMLPurifier/Config.php b/library/HTMLPurifier/Config.php index 4a90aa2d..ab2e98d4 100644 --- a/library/HTMLPurifier/Config.php +++ b/library/HTMLPurifier/Config.php @@ -43,6 +43,13 @@ class HTMLPurifier_Config */ var $version = '1.6.1'; + /** + * Integer key users can use to indicate they have manually + * overridden some internal behavior and would like the + * cache to invalidate itself. + */ + var $revision = 1; + /** * Two-level associative array of configuration directives */ @@ -54,14 +61,9 @@ class HTMLPurifier_Config var $def; /** - * Cached instance of HTMLPurifier_HTMLDefinition + * Indexed array of definitions */ - var $html_definition; - - /** - * Cached instance of HTMLPurifier_CSSDefinition - */ - var $css_definition; + var $definitions; /** * Bool indicator whether or not config is finalized @@ -205,10 +207,8 @@ class HTMLPurifier_Config // reset definitions if the directives they depend on changed // this is a very costly process, so it's discouraged // with finalization - if ($namespace == 'HTML') { - $this->html_definition = null; - } elseif ($namespace == 'CSS') { - $this->css_definition = null; + if ($namespace == 'HTML' || $namespace == 'CSS') { + $this->definitions[$namespace] = null; } } @@ -218,60 +218,61 @@ class HTMLPurifier_Config * called before it's been setup, otherwise won't work. */ function &getHTMLDefinition($raw = false) { - if (!$this->finalized && $this->autoFinalize) $this->finalize(); - $cache = HTMLPurifier_DefinitionCache::create('HTML', $this); - if($this->checkDefinition($this->html_definition, $cache, $raw)) { - return $this->html_definition; - } - return $this->createDefinition( - $this->html_definition, - $cache, - $raw, - new HTMLPurifier_HTMLDefinition() - ); + return $this->getDefinition('HTML', $raw); } /** * Retrieves reference to the CSS definition */ function &getCSSDefinition($raw = false) { + return $this->getDefinition('CSS', $raw); + } + + /** + * Retrieves a definition + * @param $type Type of definition: HTML, CSS, etc + * @param $raw Whether or not definition should be returned raw + */ + function &getDefinition($type, $raw = false) { if (!$this->finalized && $this->autoFinalize) $this->finalize(); - $cache = HTMLPurifier_DefinitionCache::create('CSS', $this); - if($this->checkDefinition($this->css_definition, $cache, $raw)) { - return $this->css_definition; + $cache = HTMLPurifier_DefinitionCache::create($type, $this); + if (!$raw) { + // see if we can quickly supply a definition + if (!empty($this->definitions[$type])) { + if (!$this->definitions[$type]->setup) { + $this->definitions[$type]->setup($this); + } + return $this->definitions[$type]; + } + // memory check missed, try cache + $this->definitions[$type] = $cache->get($this); + if ($this->definitions[$type]) { + // definition in cache, return it + return $this->definitions[$type]; + } + } elseif ( + !empty($this->definitions[$type]) && + !$this->definitions[$type]->setup + ) { + // raw requested, raw in memory, quick return + return $this->definitions[$type]; } - return $this->createDefinition( - $this->css_definition, - $cache, - $raw, - new HTMLPurifier_CSSDefinition() - ); - } - - /** - * Checks the variable and cache for an easy-access definition, - * sets def to variable and returns true if available - */ - function checkDefinition(&$var, $cache, $raw) { - if ($raw) return false; - if (!empty($var)) { - if (!$var->setup) $var->setup($this); - return true; + // quick checks failed, let's create the object + if ($type == 'HTML') { + $this->definitions[$type] = new HTMLPurifier_HTMLDefinition(); + } elseif ($type == 'CSS') { + $this->definitions[$type] = new HTMLPurifier_CSSDefinition(); + } else { + trigger_error("Definition of $type type not supported"); + return false; } - $var = $cache->get($this); - return (bool) $var; - } - - /** - * Generates a new definition, possibly returning it raw, returns - * reference to variable. - */ - function &createDefinition(&$var, $cache, $raw, $obj) { - $var = $obj; - if ($raw) return $var; - $var->setup($this); - $cache->set($var, $this); - return $var; + // quick abort if raw + if ($raw) return $this->definitions[$type]; + // set it up + $this->definitions[$type]->setup($this); + // save in cache + $cache->set($this->definitions[$type], $this); + return $this->definitions[$type]; } /** diff --git a/library/HTMLPurifier/DefinitionCache.php b/library/HTMLPurifier/DefinitionCache.php index cb4d87fc..b2212256 100644 --- a/library/HTMLPurifier/DefinitionCache.php +++ b/library/HTMLPurifier/DefinitionCache.php @@ -33,7 +33,23 @@ class HTMLPurifier_DefinitionCache * @param Instance of HTMLPurifier_Config */ function generateKey($config) { - return md5(serialize($config->getBatch($this->type))); + $version = $config->version; + $revision = $config->revision; + return $version . '-' . $revision . '-' . md5(serialize($config->getBatch($this->type))); + } + + /** + * Tests whether or not a key is old with respect to the configuration's + * version and revision number. + * @param $key Key to test + * @param $config Instance of HTMLPurifier_Config to test against + */ + function isOld($key, $config) { + list($version, $revision, $hash) = explode('-', $key, 3); + $compare = version_compare($version, $config->version); + if ($compare > 0) return false; + if ($compare == 0 && $revision >= $config->revision) return false; + return true; } /** @@ -99,10 +115,16 @@ class HTMLPurifier_DefinitionCache /** * Clears all objects from cache */ - function flush($config) { + function flush() { trigger_error('Cannot call abstract method', E_USER_ERROR); } + /** + * Clears all expired (older version or revision) objects from cache + */ + function cleanup($config) { + trigger_error('Cannot call abstract method', E_USER_ERROR); + } } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/DefinitionCache/Serializer.php b/library/HTMLPurifier/DefinitionCache/Serializer.php index 75d18acc..309fd5b2 100644 --- a/library/HTMLPurifier/DefinitionCache/Serializer.php +++ b/library/HTMLPurifier/DefinitionCache/Serializer.php @@ -44,13 +44,21 @@ class HTMLPurifier_DefinitionCache_Serializer extends while (false !== ($filename = readdir($dh))) { if (empty($filename)) continue; if ($filename[0] === '.') continue; - // optimization: md5 + .ser will always be 36 char long - // needs to be changed if we change the identifier - if (strlen($filename) !== 36) continue; unlink($dir . '/' . $filename); } } + function cleanup($config) { + $dir = $this->generateDirectoryPath(); + $dh = opendir($dir); + while (false !== ($filename = readdir($dh))) { + if (empty($filename)) continue; + if ($filename[0] === '.') continue; + $key = substr($filename, 0, strlen($filename) - 4); + if ($this->isOld($key, $config)) unlink($dir . '/' . $filename); + } + } + /** * Generates the file path to the serial file corresponding to * the configuration and definition name diff --git a/library/HTMLPurifier/ElementDef.php b/library/HTMLPurifier/ElementDef.php index e1be0494..1811f521 100644 --- a/library/HTMLPurifier/ElementDef.php +++ b/library/HTMLPurifier/ElementDef.php @@ -51,6 +51,8 @@ class HTMLPurifier_ElementDef * Abstract string representation of internal ChildDef rules. See * HTMLPurifier_ContentSets for how this is parsed and then transformed * into an HTMLPurifier_ChildDef. + * @warning This is a temporary variable that is not available after + * being processed by HTMLDefinition * @public */ var $content_model; @@ -59,6 +61,8 @@ class HTMLPurifier_ElementDef * Value of $child->type, used to determine which ChildDef to use, * used in combination with $content_model. * @warning This must be lowercase + * @warning This is a temporary variable that is not available after + * being processed by HTMLDefinition * @public */ var $content_model_type; diff --git a/library/HTMLPurifier/HTMLDefinition.php b/library/HTMLPurifier/HTMLDefinition.php index a3505877..4d27771e 100644 --- a/library/HTMLPurifier/HTMLDefinition.php +++ b/library/HTMLPurifier/HTMLDefinition.php @@ -169,6 +169,12 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition $this->processModules($config); $this->setupConfigStuff($config); unset($this->manager); + + // cleanup some of the element definitions + foreach ($this->info as $k => $v) { + unset($this->info[$k]->content_model); + unset($this->info[$k]->content_model_type); + } } /** diff --git a/tests/HTMLPurifier/DefinitionCache/SerializerTest.php b/tests/HTMLPurifier/DefinitionCache/SerializerTest.php index d7da42e0..2da20ef1 100644 --- a/tests/HTMLPurifier/DefinitionCache/SerializerTest.php +++ b/tests/HTMLPurifier/DefinitionCache/SerializerTest.php @@ -53,7 +53,12 @@ class HTMLPurifier_DefinitionCache_SerializerTest extends HTMLPurifier_Definitio $cache = new HTMLPurifier_DefinitionCache_Serializer('Test'); $config_array = array('Foo' => 'Bar'); - $config_md5 = md5(serialize($config_array)); + + $config = $this->generateConfigMock($config_array); + $config->version = '1.0.0'; + $config->revision = 2; + + $config_md5 = '1.0.0-' . $config->revision . '-' . md5(serialize($config_array)); $file = realpath( $rel_file = dirname(__FILE__) . @@ -62,7 +67,6 @@ class HTMLPurifier_DefinitionCache_SerializerTest extends HTMLPurifier_Definitio ); if($file && file_exists($file)) unlink($file); // prevent previous failures from causing problems - $config = $this->generateConfigMock($config_array); $this->assertIdentical($config_md5, $cache->generateKey($config)); $def_original = $this->generateDefinition(); @@ -150,6 +154,34 @@ class HTMLPurifier_DefinitionCache_SerializerTest extends HTMLPurifier_Definitio } + function testCleanup() { + + $cache = new HTMLPurifier_DefinitionCache_Serializer('Test'); + + // in order of age, oldest first + // note that configurations are all identical, but version/revision + // are different + + $config1 = $this->generateConfigMock(); + $config1->version = '0.9.0'; + $config1->revision = 574; + $def1 = $this->generateDefinition(array('info' => 1)); + + $config2 = $this->generateConfigMock(); + $config2->version = '1.0.0beta'; + $config2->revision = 1; + $def2 = $this->generateDefinition(array('info' => 3)); + + $cache->set($def1, $config1); + $cache->cleanup($config1); + $this->assertEqual($def1, $cache->get($config1)); // no change + + $cache->cleanup($config2); + $this->assertFalse($cache->get($config1)); + $this->assertFalse($cache->get($config2)); + + } + /** * Asserts that a file exists, ignoring the stat cache */ diff --git a/tests/HTMLPurifier/DefinitionCacheHarness.php b/tests/HTMLPurifier/DefinitionCacheHarness.php index 9747b615..d43df6bc 100644 --- a/tests/HTMLPurifier/DefinitionCacheHarness.php +++ b/tests/HTMLPurifier/DefinitionCacheHarness.php @@ -8,10 +8,12 @@ class HTMLPurifier_DefinitionCacheHarness extends UnitTestCase * to a getBatch() call * @param $values Values to return when getBatch is invoked */ - function generateConfigMock($values) { + function generateConfigMock($values = array()) { generate_mock_once('HTMLPurifier_Config'); $config = new HTMLPurifier_ConfigMock($this); $config->setReturnValue('getBatch', $values, array('Test')); + $config->version = '1.0.0'; + $config->revision = 1; return $config; } diff --git a/tests/HTMLPurifier/DefinitionCacheTest.php b/tests/HTMLPurifier/DefinitionCacheTest.php index 0d1f512e..b117a282 100644 --- a/tests/HTMLPurifier/DefinitionCacheTest.php +++ b/tests/HTMLPurifier/DefinitionCacheTest.php @@ -9,6 +9,22 @@ class HTMLPurifier_DefinitionCacheTest extends UnitTestCase $cache = HTMLPurifier_DefinitionCache::create('Test', $config); $this->assertEqual($cache, new HTMLPurifier_DefinitionCache_Serializer('Test')); } + + function test_isOld() { + $cache = new HTMLPurifier_DefinitionCache('Test'); // non-functional + + $config = HTMLPurifier_Config::createDefault(); + $config->version = '1.0.0'; + $config->revision = 10; + + $this->assertIdentical($cache->isOld('1.0.0-10-hashstuffhere', $config), false); + $this->assertIdentical($cache->isOld('1.5.0-1-hashstuffhere', $config), false); + + $this->assertIdentical($cache->isOld('0.9.0-1-hashstuffhere', $config), true); + $this->assertIdentical($cache->isOld('1.0.0-1-hashstuffhere', $config), true); + $this->assertIdentical($cache->isOld('1.0.0beta-11-hashstuffhere', $config), true); + } + } ?> \ No newline at end of file