mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-07-31 03:10:09 +02:00
Implement EntityLookup and put in the Lexer. Some behavior was migrated, since it looks like it will have to be used in all Lexers, not just DirectLex (which is the only one that uses it).
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@105 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
25
library/HTMLPurifier/EntityLookup.php
Normal file
25
library/HTMLPurifier/EntityLookup.php
Normal file
@@ -0,0 +1,25 @@
|
||||
<?php
|
||||
|
||||
/**
 * Lookup table for named character entities, loaded from a serialized
 * PHP array stored on disk.
 */
class HTMLPurifier_EntityLookup {
    
    /**
     * Lookup table as unserialized from the data file. Always an array;
     * empty when the file could not be read or did not contain an array.
     * NOTE(review): callers index this by entity name (without the leading
     * '&' and trailing ';'); the encoding of the values depends on the data
     * file, which is not visible here -- confirm.
     */
    var $table;
    
    /**
     * Loads the lookup table.
     * @param $file Path to a file containing a serialized array. When false
     *              (the default), EntityLookup/data.txt next to this file
     *              is used.
     */
    function HTMLPurifier_EntityLookup($file = false) {
        if (!$file) {
            $file = dirname(__FILE__) . '/EntityLookup/data.txt';
        }
        // file_get_contents() and unserialize() already report their own
        // diagnostics on failure; here we only guarantee that $table ends
        // up as an array instead of false, so it can be indexed safely.
        $contents = file_get_contents($file);
        $table = ($contents === false) ? false : unserialize($contents);
        $this->table = is_array($table) ? $table : array();
    }
    
    /**
     * Returns the shared lookup object built from the default data file,
     * creating it on the first call.
     * @returns HTMLPurifier_EntityLookup object.
     */
    function instance() {
        // no references, since PHP doesn't copy unless modified
        static $instance = null;
        if (!$instance) {
            $instance = new HTMLPurifier_EntityLookup();
        }
        return $instance;
    }
    
}
|
||||
|
||||
?>
|
1
library/HTMLPurifier/EntityLookup/data.txt
Normal file
1
library/HTMLPurifier/EntityLookup/data.txt
Normal file
File diff suppressed because one or more lines are too long
@@ -85,6 +85,68 @@ class HTMLPurifier_Lexer
|
||||
return $lexer;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Callback regex string for parsing entities.
 *
 * Capture groups: 1. hex digits of a "&#x...;" reference, 2. decimal digits
 * of a "&#...;" reference (leading zeros dropped), 3. name of a "&name;"
 * reference. The '#' belongs to the numeric alternatives only; requiring it
 * in front of every alternative would keep named entities from matching.
 * The trailing semicolon is optional.
 * @protected
 */
var $_substituteEntitiesRegex =
    '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z][A-Za-z0-9]*));?/';

/**
 * Entity lookup object, fetched on first use by nonSpecialEntityCallback().
 * @protected
 */
var $_entity_lookup;

/**
 * Substitutes non-special entities with their parsed equivalents. Doing
 * this every time a piece of character data is parsed would be wasteful,
 * so it is meant to be run once, before everything else.
 *
 * @protected
 * @param $string String to have non-special entities parsed.
 * @returns Parsed string.
 */
function substituteNonSpecialEntities($string) {
    // it will try to detect missing semicolons, but don't rely on it
    return preg_replace_callback(
        $this->_substituteEntitiesRegex,
        // Bind to the current object: the callback reads $this, and the
        // method is defined on this class rather than on one subclass.
        array($this, 'nonSpecialEntityCallback'),
        $string);
}

/**
 * Callback function for substituteNonSpecialEntities() that does the work.
 *
 * @warning Though this is public in order to let the callback happen,
 *          calling it directly is not recommended.
 * @param $matches PCRE-style matches array, with 0 the entire match, and
 *                 either index 1, 2 or 3 set with a hex value, dec value,
 *                 or string (respectively).
 * @returns Replacement string; the original entity text when the entity is
 *          special, unknown, or cannot be represented as a single byte.
 */
function nonSpecialEntityCallback($matches) {
    // replaces all but big five
    $entity = $matches[0];
    
    if ($entity[1] === '#') {
        // numeric reference
        $is_hex = isset($matches[1]) && $matches[1] !== '';
        $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
        // chr() produces exactly one byte and wraps anything above 255,
        // which would turn e.g. &#8212; into an unrelated control
        // character. Leave such references untouched instead.
        // NOTE(review): emitting them in the document encoding needs an
        // encoder that is not available here -- confirm desired behavior.
        if ($int > 255) return $entity;
        // NOTE(review): $_special_dec2str is not declared in the visible
        // part of this class; presumably it lists the code points of the
        // special entities -- confirm.
        if (isset($this->_special_dec2str[$int])) return $entity;
        return chr($int);
    }
    
    // named reference
    $name = $matches[3];
    // NOTE(review): $_special_ent2dec is likewise declared elsewhere.
    if (isset($this->_special_ent2dec[$name])) return $entity;
    if (!$this->_entity_lookup) {
        require_once 'HTMLPurifier/EntityLookup.php';
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
    }
    if (isset($this->_entity_lookup->table[$name])) {
        return $this->_entity_lookup->table[$name];
    }
    // unknown name: keep the text exactly as it was written
    return $entity;
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -11,7 +11,6 @@ require_once 'HTMLPurifier/Lexer.php';
|
||||
* pales in comparison to HTMLPurifier_Lexer_DOMLex. It will support UTF-8
|
||||
* completely eventually.
|
||||
*
|
||||
* @todo Implement non-special string entity conversion.
|
||||
* @todo Reread XML spec and document differences.
|
||||
* @todo Add support for CDATA sections.
|
||||
* @todo Determine correct behavior in outputting comment data. (preserve dashes?)
|
||||
@@ -99,56 +98,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
''' => "'",
|
||||
);
|
||||
|
||||
/**
 * Callback regex string for parsing entities.
 *
 * Capture groups: 1. hex digits of a "&#x...;" reference, 2. decimal digits
 * of a "&#...;" reference (leading zeros dropped), 3. name of a "&name;"
 * reference. The '#' belongs to the numeric alternatives only; requiring it
 * in front of every alternative would keep named entities from matching.
 * The trailing semicolon is optional.
 * @protected
 */
var $_substituteEntitiesRegex =
    '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z][A-Za-z0-9]*));?/';

/**
 * Substitutes non-special entities with their parsed equivalents.
 *
 * @protected
 * @param $string String to have non-special entities parsed.
 * @returns Parsed string.
 */
function substituteNonSpecialEntities($string) {
    // it will try to detect missing semicolons, but don't rely on it
    return preg_replace_callback(
        $this->_substituteEntitiesRegex,
        // Bind to the current object, since the callback reads $this.
        array($this, 'nonSpecialEntityCallback'),
        $string);
}

/**
 * Callback function for substituteNonSpecialEntities() that does the work.
 *
 * @warning Though this is public in order to let the callback happen,
 *          calling it directly is not recommended.
 * @param $matches PCRE-style matches array, with 0 the entire match, and
 *                 either index 1, 2 or 3 set with a hex value, dec value,
 *                 or string (respectively).
 * @returns Replacement string; the original entity text when the entity is
 *          special, named, or cannot be represented as a single byte.
 * @todo Implement string translations
 */
function nonSpecialEntityCallback($matches) {
    // replaces all but big five
    $entity = $matches[0];
    
    if ($entity[1] === '#') {
        // numeric reference
        $is_hex = isset($matches[1]) && $matches[1] !== '';
        $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
        // chr() produces exactly one byte and wraps anything above 255;
        // leave such references untouched rather than emit a wrong byte.
        // NOTE(review): confirm how these should be encoded.
        if ($int > 255) return $entity;
        // NOTE(review): $_special_dec2str is declared outside the visible
        // lines; presumably the code points of the special entities.
        if (isset($this->_special_dec2str[$int])) return $entity;
        return chr($int);
    }
    
    // Named reference. Translation is not implemented yet (see @todo), so
    // the entity is passed through unchanged whether or not it is one of
    // the special ones; returning '' here would delete text from the input.
    return $entity;
}
|
||||
|
||||
/**
|
||||
* Substitutes only special entities with their parsed equivalents.
|
||||
*
|
||||
|
Reference in New Issue
Block a user