1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-17 14:08:15 +01:00
- Documentation updated
- API docs now exclude more files that are not classes
- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
- (internal) Refactored parseData() to general Lexer class

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@466 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-09-27 02:09:54 +00:00
parent d9bb97cc26
commit 37def0104b
8 changed files with 99 additions and 79 deletions

View File

@ -4,7 +4,7 @@
# Project related configuration options # Project related configuration options
#--------------------------------------------------------------------------- #---------------------------------------------------------------------------
PROJECT_NAME = HTML Purifier PROJECT_NAME = HTML Purifier
PROJECT_NUMBER = 1.0.0 PROJECT_NUMBER = 1.1.1
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen" OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
CREATE_SUBDIRS = NO CREATE_SUBDIRS = NO
OUTPUT_LANGUAGE = English OUTPUT_LANGUAGE = English
@ -89,9 +89,12 @@ EXCLUDE =
EXCLUDE_SYMLINKS = NO EXCLUDE_SYMLINKS = NO
EXCLUDE_PATTERNS = */tests/* \ EXCLUDE_PATTERNS = */tests/* \
*/benchmarks/* \ */benchmarks/* \
*/docs/phpdoc/* \ */docs/* \
*/docs/doxygen/* \ */test-settings.php \
*/test-settings.php */configdoc/* \
*/test-settings.php \
*/maintenance/* \
*/smoketests/*
EXAMPLE_PATH = EXAMPLE_PATH =
EXAMPLE_PATTERNS = * EXAMPLE_PATTERNS = *
EXAMPLE_RECURSIVE = NO EXAMPLE_RECURSIVE = NO

3
NEWS
View File

@ -6,7 +6,10 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
1.1.2, unknown projected release date 1.1.2, unknown projected release date
(bugfix release, may be merged with 1.2.0 if new features precede major bugs) (bugfix release, may be merged with 1.2.0 if new features precede major bugs)
- Documentation updated
- API docs now exclude more files that are not classes
- Line endings standardized throughout project - Line endings standardized throughout project
- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
1.1.1, released 2006-09-24 1.1.1, released 2006-09-24
- Various documentation updates - Various documentation updates

View File

@ -3,7 +3,7 @@
/*! /*!
* @mainpage * @mainpage
* *
* HTMLPurifier is an HTML filter that will take an arbitrary snippet of * HTML Purifier is an HTML filter that will take an arbitrary snippet of
* HTML and rigorously test, validate and filter it into a version that * HTML and rigorously test, validate and filter it into a version that
* is safe for output onto webpages. It achieves this by: * is safe for output onto webpages. It achieves this by:
* *
@ -22,7 +22,7 @@
*/ */
/* /*
HTMLPurifier - Standards Compliant HTML Filtering HTML Purifier - Standards Compliant HTML Filtering
Copyright (C) 2006 Edward Z. Yang Copyright (C) 2006 Edward Z. Yang
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or

View File

@ -60,6 +60,60 @@ class HTMLPurifier_Lexer
$this->_entity_parser = new HTMLPurifier_EntityParser(); $this->_entity_parser = new HTMLPurifier_EntityParser();
} }
/**
* Most common entity to raw value conversion table for special entities.
* @protected
*/
var $_special_entity2str =
    array(
        // Keys are the escaped entity text found in the input; values are
        // the raw characters that strtr() substitutes for them.
        '&quot;' => '"',
        '&amp;' => '&',
        '&lt;' => '<',
        '&gt;' => '>',
        // The apostrophe has three accepted spellings: decimal,
        // zero-padded decimal and hexadecimal.
        '&#39;' => "'",
        '&#039;' => "'",
        '&#x27;' => "'"
    );
/**
* Parses special entities into the proper characters.
*
* This function translates escaped versions of the special characters
* into the corresponding raw characters.
*
* @warning
* You should be able to treat the output of this function as
* completely parsed, but that's only because all other entities should
* have been handled previously in substituteNonSpecialEntities()
*
* @param $string String character data to be parsed.
* @returns Parsed character data.
*/
function parseData($string) {
    // The checks below look at the final character, so an empty string
    // is returned immediately.
    if ($string === '') {
        return '';
    }

    // Count the ampersands that could begin an entity: discount every
    // "& " and a trailing "&", since neither can be escaped.
    $candidates = substr_count($string, '&') - substr_count($string, '& ');
    if ($string[strlen($string) - 1] === '&') {
        $candidates -= 1;
    }
    if (!$candidates) {
        // no entities at all, nothing to translate
        return $string;
    }

    // Remember how many escaped ampersands exist before the table-driven
    // replacement turns each of them into a bare "&".
    $escaped_amps = substr_count($string, '&amp;');
    $string = strtr($string, $this->_special_entity2str);

    // Same count as above, repeated inline on the translated text
    // (duplicated on purpose for speed).
    $remaining = substr_count($string, '&') - substr_count($string, '& ');
    if ($string[strlen($string) - 1] === '&') {
        $remaining -= 1;
    }
    if ($remaining <= $escaped_amps) {
        return $string;
    }

    // More ampersands remain than the escaped ones account for, so some
    // less common entities are present: hand over to the entity parser.
    return $this->_entity_parser->substituteSpecialEntities($string);
}
var $_encoder; var $_encoder;
/** /**

View File

@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php';
* completely eventually. * completely eventually.
* *
* @todo Reread XML spec and document differences. * @todo Reread XML spec and document differences.
* @todo Add support for CDATA sections. *
* @todo Determine correct behavior in outputting comment data. (preserve dashes?) * @todo Determine correct behavior in transforming comment data. (preserve dashes?)
* @todo Optimize main function tokenizeHTML().
* @todo Less than sign (<) being prohibited (even as entity) in attr-values?
*/ */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{ {
/**
* Most common entity to raw value conversion table for special entities.
* @protected
*/
var $_special_entity2str =
array(
// Keys are the escaped entity text; values are the raw characters that
// parseData() substitutes for them via strtr().
'&quot;' => '"',
'&amp;' => '&',
'&lt;' => '<',
'&gt;' => '>',
// Three spellings of the apostrophe: decimal, zero-padded decimal, hex.
'&#39;' => "'",
'&#039;' => "'",
'&#x27;' => "'"
);
/**
* Parses special entities into the proper characters.
*
* This function translates escaped versions of the special characters
* into the corresponding raw characters.
*
* @warning
* You should be able to treat the output of this function as
* completely parsed, but that's only because all other entities should
* have been handled previously in substituteNonSpecialEntities()
*
* @param $string String character data to be parsed.
* @returns Parsed character data.
*/
function parseData($string) {
    // The trailing-character checks below index $string[strlen($string)-1],
    // which is offset -1 for an empty string, so return early in that case.
    if ($string === '') return '';

    // subtracts amps that cannot possibly be escaped: those followed by a
    // space and one sitting at the very end of the string
    $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
        ($string[strlen($string)-1] === '&' ? 1 : 0);

    if (!$num_amp) return $string; // abort if no entities

    // counted before translation, because strtr() turns each &amp; into
    // a bare ampersand
    $num_esc_amp = substr_count($string, '&amp;');
    $string = strtr($string, $this->_special_entity2str);

    // code duplication for sake of optimization, see above
    $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
        ($string[strlen($string)-1] === '&' ? 1 : 0);

    // every remaining ampersand is accounted for by a translated &amp;
    if ($num_amp_2 <= $num_esc_amp) return $string;

    // hmm... now we have some uncommon entities. Use the callback.
    $string = $this->_entity_parser->substituteSpecialEntities($string);
    return $string;
}
/** /**
* Whitespace characters for str(c)spn. * Whitespace characters for str(c)spn.
* @protected * @protected

View File

@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php';
* whatever it does for poorly formed HTML is up to it. * whatever it does for poorly formed HTML is up to it.
* *
* @todo Generalize so that XML_HTMLSax is also supported. * @todo Generalize so that XML_HTMLSax is also supported.
*
* @warning The parser's own entity resolution inside attributes is broken; openHandler() resolves attribute entities with parseData() instead.
*/ */
class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
$parser->set_element_handler('openHandler','closeHandler'); $parser->set_element_handler('openHandler','closeHandler');
$parser->set_data_handler('dataHandler'); $parser->set_data_handler('dataHandler');
$parser->set_escape_handler('escapeHandler'); $parser->set_escape_handler('escapeHandler');
// doesn't seem to work correctly for attributes
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1); $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
$parser->parse($string); $parser->parse($string);
@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
* Open tag event handler, interface is defined by PEAR package. * Open tag event handler, interface is defined by PEAR package.
*/ */
function openHandler(&$parser, $name, $attrs, $closed) { function openHandler(&$parser, $name, $attrs, $closed) {
// entities are not resolved in attrs
foreach ($attrs as $key => $attr) {
$attrs[$key] = $this->parseData($attr);
}
if ($closed) { if ($closed) {
$this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs); $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
} else { } else {

View File

@ -11,24 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
$this->DirectLex = new HTMLPurifier_Lexer_DirectLex(); $this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
} }
function test_parseData() {
    $lexer =& $this->DirectLex;

    // Each row is array(expected output, raw input); rows are checked in
    // the order listed.
    $cases = array(
        array('asdf', 'asdf'),
        array('&', '&amp;'),
        array('"', '&quot;'),
        array("'", '&#039;'),
        array("'", '&#39;'),
        array('&&&', '&amp;&amp;&amp;'),
        array('&&', '&amp;&'), // [INVALID]
        array('Procter & Gamble', 'Procter & Gamble'), // [INVALID]
        // This is not special, thus not converted. Test of fault
        // tolerance; realistically speaking, this should never happen.
        array('&#x2D;', '&#x2D;')
    );

    foreach ($cases as $case) {
        $this->assertIdentical($case[0], $lexer->parseData($case[1]));
    }
}
// internals testing // internals testing
function test_parseAttributeString() { function test_parseAttributeString() {

View File

@ -38,6 +38,25 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$this->assertIdentical($extract, $result); $this->assertIdentical($extract, $result);
} }
function test_parseData() {
    $HP =& $this->Lexer;

    // empty input is returned as-is by the early guard
    $this->assertIdentical('', $HP->parseData(''));

    // text without entities passes through untouched
    $this->assertIdentical('asdf', $HP->parseData('asdf'));

    // every entry of the special entity table
    $this->assertIdentical('&', $HP->parseData('&amp;'));
    $this->assertIdentical('"', $HP->parseData('&quot;'));
    $this->assertIdentical('<', $HP->parseData('&lt;'));
    $this->assertIdentical('>', $HP->parseData('&gt;'));
    $this->assertIdentical("'", $HP->parseData('&#039;'));
    $this->assertIdentical("'", $HP->parseData('&#39;'));
    $this->assertIdentical("'", $HP->parseData('&#x27;'));

    // repeated and stray ampersands
    $this->assertIdentical('&&&', $HP->parseData('&amp;&amp;&amp;'));
    $this->assertIdentical('&&', $HP->parseData('&amp;&')); // [INVALID]
    $this->assertIdentical('Procter & Gamble',
        $HP->parseData('Procter & Gamble')); // [INVALID]

    // This is not special, thus not converted. Test of fault tolerance,
    // realistically speaking, this should never happen
    $this->assertIdentical('&#x2D;', $HP->parseData('&#x2D;'));
}
function test_extractBody() { function test_extractBody() {
$this->assertExtractBody('<b>Bold</b>'); $this->assertExtractBody('<b>Bold</b>');
$this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>'); $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
@ -249,13 +268,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase
,new HTMLPurifier_Token_Text('Link') ,new HTMLPurifier_Token_Text('Link')
,new HTMLPurifier_Token_End('a') ,new HTMLPurifier_Token_End('a')
); );
$sax_expect[16] = false; // PEARSax doesn't support it!
// test that UTF-8 is preserved // test that UTF-8 is preserved
$char_hearts = $this->_entity_lookup->table['hearts']; $char_hearts = $this->_entity_lookup->table['hearts'];
$input[17] = $char_hearts; $input[17] = $char_hearts;
$expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) ); $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
// test weird characters in attributes
$input[18] = '<br test="x &lt; 6" />';
$expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
$default_config = HTMLPurifier_Config::createDefault(); $default_config = HTMLPurifier_Config::createDefault();
foreach($input as $i => $discard) { foreach($input as $i => $discard) {
if (!isset($config[$i])) $config[$i] = $default_config; if (!isset($config[$i])) $config[$i] = $default_config;