mirror of
				https://github.com/ezyang/htmlpurifier.git
				synced 2025-10-26 18:06:43 +01:00 
			
		
		
		
	- Fix debugger so that tokens can be printed without an index - Fix some broken PEAR unit tests git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1383 48356398-32a2-884e-a903-53898d9a118a
		
			
				
	
	
		
			528 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			528 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| 
 | |
| require_once 'HTMLPurifier/Lexer/DirectLex.php';
 | |
| 
 | |
/**
 * Test case for HTMLPurifier_Lexer and the concrete lexers: exercises
 * HTMLPurifier_Lexer::create(), parseData(), extractBody() and runs
 * tokenizeHTML() against every lexer implementation that is available.
 *
 * NOTE(review): entity-bearing string literals (&amp;, &quot;, &#39;, ...)
 * were restored from an HTML-decoded copy of this file; confirm them against
 * the upstream repository.
 */
class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
{
    
    /**
     * True once HTMLPurifier/Lexer/PEARSax3.php has been loaded by the
     * constructor, i.e. the PEARSax3 lexer may be instantiated.
     */
    var $_has_pear = false;
    
    /**
     * Loads the optional lexer implementations that the global test
     * configuration ($GLOBALS['HTMLPurifierTest']) enables.
     */
    function HTMLPurifier_LexerTest() {
        parent::HTMLPurifier_Harness();
        // E_STRICT = 2048, int used for PHP4 compat: this check disables
        // PEAR if PHP 5 strict mode is on, since the class is not strict safe
        if (
            $GLOBALS['HTMLPurifierTest']['PEAR'] &&
            ((error_reporting() & 2048) != 2048) // ought to be a better way
        ) {
            require_once 'HTMLPurifier/Lexer/PEARSax3.php';
            $this->_has_pear = true;
        }
        if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
            require_once 'HTMLPurifier/Lexer/PH5P.php';
        }
    }
    
    // HTMLPurifier_Lexer::create() --------------------------------------------
    
    function test_create() {
        $this->config->set('Core', 'MaintainLineNumbers', true);
        $lexer = HTMLPurifier_Lexer::create($this->config);
        $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
    }
    
    // HTMLPurifier_Lexer->parseData() -----------------------------------------
    
    /**
     * Asserts that parseData($input) is identical to $expect.
     * @param $input  String handed to HTMLPurifier_Lexer->parseData()
     * @param $expect Expected string; true means "same as $input"
     */
    function assertParseData($input, $expect = true) {
        if ($expect === true) $expect = $input;
        $lexer = new HTMLPurifier_Lexer();
        $this->assertIdentical($expect, $lexer->parseData($input));
    }
    
    function test_parseData_plainText() {
        $this->assertParseData('asdf');
    }
    
    function test_parseData_ampersandEntity() {
        $this->assertParseData('&amp;', '&');
    }
    
    function test_parseData_quotEntity() {
        $this->assertParseData('&quot;', '"');
    }
    
    function test_parseData_aposNumericEntity() {
        $this->assertParseData('&#039;', "'");
    }
    
    function test_parseData_aposCompactNumericEntity() {
        $this->assertParseData('&#39;', "'");
    }
    
    function test_parseData_adjacentAmpersandEntities() {
        $this->assertParseData('&amp;&amp;&amp;', '&&&');
    }
    
    function test_parseData_trailingUnescapedAmpersand() {
        $this->assertParseData('&amp;&', '&&');
    }
    
    function test_parseData_internalUnescapedAmpersand() {
        $this->assertParseData('Procter & Gamble');
    }
    
    function test_parseData_improperEntityFaultToleranceTest() {
        // NOTE(review): literal reconstructed as '&#x2D;' -- confirm upstream
        $this->assertParseData('&#x2D;');
    }
    
    // HTMLPurifier_Lexer->extractBody() ---------------------------------------
    
    /**
     * Asserts that extractBody($text) is identical to $extract.
     * @param $text    Document string handed to HTMLPurifier_Lexer->extractBody()
     * @param $extract Expected result; true means "same as $text"
     */
    function assertExtractBody($text, $extract = true) {
        $lexer = new HTMLPurifier_Lexer();
        $result = $lexer->extractBody($text);
        if ($extract === true) $extract = $text;
        $this->assertIdentical($extract, $result);
    }
    
    function test_extractBody_noBodyTags() {
        $this->assertExtractBody('<b>Bold</b>');
    }
    
    function test_extractBody_lowercaseBodyTags() {
        $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
    }
    
    function test_extractBody_uppercaseBodyTags() {
        $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
    }
    
    function test_extractBody_realisticUseCase() {
        $this->assertExtractBody(
'<?xml version="1.0"
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
   <head>
      <title>xyz</title>
   </head>
   <body>
      <form method="post" action="whatever1">
         <div>
            <input type="text" name="username" />
            <input type="text" name="password" />
            <input type="submit" />
         </div>
      </form>
   </body>
</html>',
    '
      <form method="post" action="whatever1">
         <div>
            <input type="text" name="username" />
            <input type="text" name="password" />
            <input type="submit" />
         </div>
      </form>
   ');
    }
    
    function test_extractBody_bodyWithAttributes() {
        $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
    }
    
    function test_extractBody_preserveUnclosedBody() {
        $this->assertExtractBody('<body>asdf'); // not closed, don't accept
    }
    
    // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
    
    /**
     * Tokenizes $input with every available lexer and asserts the result.
     * @param $input      HTML string to tokenize
     * @param $expect     Token array expected from lexers without an override
     * @param $alt_expect Per-lexer overrides keyed by lexer name ('DirectLex',
     *                    'PEARSax3', 'DOMLex', 'PH5P'); a value of false skips
     *                    the assertion for that lexer
     */
    function assertTokenization($input, $expect, $alt_expect = array()) {
        $lexers = array();
        $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
        if ($this->_has_pear) {
            $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
        }
        if (version_compare(PHP_VERSION, "5", ">=") && class_exists('DOMDocument')) {
            $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
            // PH5P.php is only loaded by the constructor when the test
            // configuration enables it, so do not assume the class exists
            if (class_exists('HTMLPurifier_Lexer_PH5P')) {
                $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
            }
        }
        foreach ($lexers as $name => $lexer) {
            $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
            if (isset($alt_expect[$name])) {
                // false marks a lexer whose output is not checked for this input
                if ($alt_expect[$name] === false) continue;
                $t_expect = $alt_expect[$name];
            } else {
                $t_expect = $expect;
            }
            $this->assertIdentical($result, $t_expect, "$name: %s");
            if ($t_expect != $result) {
                // dump the offending tokens to ease debugging
                printTokens($result);
                //var_dump($result);
            }
        }
    }
    
    function test_tokenizeHTML_emptyInput() {
        $this->assertTokenization('', array());
    }
    
    function test_tokenizeHTML_plainText() {
        $this->assertTokenization(
            'This is regular text.',
            array(
                new HTMLPurifier_Token_Text('This is regular text.')
            )
        );
    }
    
    function test_tokenizeHTML_textAndTags() {
        $this->assertTokenization(
            'This is <b>bold</b> text',
            array(
                new HTMLPurifier_Token_Text('This is '),
                new HTMLPurifier_Token_Start('b', array()),
                new HTMLPurifier_Token_Text('bold'),
                new HTMLPurifier_Token_End('b'),
                new HTMLPurifier_Token_Text(' text'),
            )
        );
    }
    
    function test_tokenizeHTML_normalizeCase() {
        $this->assertTokenization(
            '<DIV>Totally rad dude. <b>asdf</b></div>',
            array(
                new HTMLPurifier_Token_Start('DIV', array()),
                new HTMLPurifier_Token_Text('Totally rad dude. '),
                new HTMLPurifier_Token_Start('b', array()),
                new HTMLPurifier_Token_Text('asdf'),
                new HTMLPurifier_Token_End('b'),
                new HTMLPurifier_Token_End('div'),
            )
        );
    }
    
    function test_tokenizeHTML_notWellFormed() {
        $this->assertTokenization(
            '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
            array(
                new HTMLPurifier_Token_Start('asdf'),
                new HTMLPurifier_Token_End('asdf'),
                new HTMLPurifier_Token_Start('d'),
                new HTMLPurifier_Token_End('d'),
                new HTMLPurifier_Token_Start('poOloka'),
                new HTMLPurifier_Token_Start('poolasdf'),
                new HTMLPurifier_Token_Start('ds'),
                new HTMLPurifier_Token_End('asdf'),
                new HTMLPurifier_Token_End('ASDF'),
            ),
            array(
                'DOMLex' => $alt = array(
                    new HTMLPurifier_Token_Empty('asdf'),
                    new HTMLPurifier_Token_Empty('d'),
                    new HTMLPurifier_Token_Start('pooloka'),
                    new HTMLPurifier_Token_Start('poolasdf'),
                    new HTMLPurifier_Token_Empty('ds'),
                    new HTMLPurifier_Token_End('poolasdf'),
                    new HTMLPurifier_Token_End('pooloka'),
                ),
                'PH5P' => $alt,
            )
        );
    }
    
    function test_tokenizeHTML_whitespaceInTag() {
        $this->assertTokenization(
            '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
            array(
                new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
                new HTMLPurifier_Token_Text('Link to '),
                new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
                new HTMLPurifier_Token_Text('foobar'),
                new HTMLPurifier_Token_End('b'),
                new HTMLPurifier_Token_End('a'),
            )
        );
    }
    
    function test_tokenizeHTML_emptyTag() {
        $this->assertTokenization(
            '<br />',
            array( new HTMLPurifier_Token_Empty('br') )
        );
    }
    
    function test_tokenizeHTML_comment() {
        $this->assertTokenization(
            '<!-- Comment -->',
            array( new HTMLPurifier_Token_Comment(' Comment ') ),
            array(
                'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- Comment --') ),
            )
        );
    }
    
    function test_tokenizeHTML_malformedComment() {
        $this->assertTokenization(
            '<!-- not so well formed --->',
            array( new HTMLPurifier_Token_Comment(' not so well formed -') ),
            array(
                'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- not so well formed ---') ),
            )
        );
    }
    
    function test_tokenizeHTML_unterminatedTag() {
        $this->assertTokenization(
            '<a href=""',
            array( new HTMLPurifier_Token_Text('<a href=""') ),
            array(
                // I like our behavior better, but it's non-standard
                'DOMLex'   => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
                'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
                'PH5P' => false, // total barfing, grabs scaffolding too
            )
        );
    }
    
    function test_tokenizeHTML_specialEntities() {
        $this->assertTokenization(
            '&lt;b&gt;',
            array(
                new HTMLPurifier_Token_Text('<b>')
            ),
            array(
                // some parsers will separate entities out
                'PEARSax3' => $split = array(
                    new HTMLPurifier_Token_Text('<'),
                    new HTMLPurifier_Token_Text('b'),
                    new HTMLPurifier_Token_Text('>'),
                ),
                'PH5P' => $split,
            )
        );
    }
    
    function test_tokenizeHTML_earlyQuote() {
        $this->assertTokenization(
            '<a "=>',
            array( new HTMLPurifier_Token_Empty('a') ),
            array(
                // we barf on this input
                'DirectLex' => $tokens = array(
                    new HTMLPurifier_Token_Start('a', array('"' => ''))
                ),
                'PEARSax3' => $tokens,
                'PH5P' => array(
                    new HTMLPurifier_Token_Empty('a', array('"' => ''))
                ),
            )
        );
    }
    
    function test_tokenizeHTML_unescapedQuote() {
        $this->assertTokenization(
            '"',
            array( new HTMLPurifier_Token_Text('"') )
        );
    }
    
    function test_tokenizeHTML_escapedQuote() {
        $this->assertTokenization(
            '&quot;',
            array( new HTMLPurifier_Token_Text('"') ),
            array(
                'PEARSax3' => false, // PEAR barfs on this
            )
        );
    }
    
    function test_tokenizeHTML_cdata() {
        $this->assertTokenization(
            '<![CDATA[You <b>can&#39;t</b> get me!]]>',
            array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
            array(
                // PEAR splits up all of the CDATA
                'PEARSax3' => $split = array(
                    new HTMLPurifier_Token_Text('You '),
                    new HTMLPurifier_Token_Text('<'),
                    new HTMLPurifier_Token_Text('b'),
                    new HTMLPurifier_Token_Text('>'),
                    new HTMLPurifier_Token_Text('can'),
                    new HTMLPurifier_Token_Text('&'),
                    new HTMLPurifier_Token_Text('#39;t'),
                    new HTMLPurifier_Token_Text('<'),
                    new HTMLPurifier_Token_Text('/b'),
                    new HTMLPurifier_Token_Text('>'),
                    new HTMLPurifier_Token_Text(' get me!'),
                ),
                'PH5P' => $split,
            )
        );
    }
    
    function test_tokenizeHTML_characterEntity() {
        $this->assertTokenization(
            '&theta;',
            array( new HTMLPurifier_Token_Text("\xCE\xB8") )
        );
    }
    
    function test_tokenizeHTML_characterEntityInCDATA() {
        $this->assertTokenization(
            '<![CDATA[&rarr;]]>',
            array( new HTMLPurifier_Token_Text("&rarr;") ),
            array(
                'PEARSax3' => $split = array(
                    new HTMLPurifier_Token_Text('&'),
                    new HTMLPurifier_Token_Text('rarr;'),
                ),
                'PH5P' => $split,
            )
        );
    }
    
    function test_tokenizeHTML_entityInAttribute() {
        $this->assertTokenization(
            '<a href="index.php?title=foo&amp;id=bar">Link</a>',
            array(
                new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
                new HTMLPurifier_Token_Text('Link'),
                new HTMLPurifier_Token_End('a'),
            )
        );
    }
    
    function test_tokenizeHTML_preserveUTF8() {
        $this->assertTokenization(
            "\xCE\xB8",
            array( new HTMLPurifier_Token_Text("\xCE\xB8") )
        );
    }
    
    function test_tokenizeHTML_specialEntityInAttribute() {
        $this->assertTokenization(
            '<br test="x &lt; 6" />',
            array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
        );
    }
    
    function test_tokenizeHTML_emoticonProtection() {
        $this->config->set('Core', 'AggressivelyFixLt', true);
        $this->assertTokenization(
            '<b>Whoa! <3 That\'s not good >.></b>',
            array(
                new HTMLPurifier_Token_Start('b'),
                new HTMLPurifier_Token_Text('Whoa! '),
                new HTMLPurifier_Token_Text('<3 That\'s not good >'),
                new HTMLPurifier_Token_Text('.>'),
                new HTMLPurifier_Token_End('b')
            ),
            array(
                // text is absorbed together
                'DOMLex' => array(
                    new HTMLPurifier_Token_Start('b'),
                    new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
                    new HTMLPurifier_Token_End('b'),
                ),
                'PEARSax3' => false, // totally mangled
                'PH5P' => array( // interesting grouping
                    new HTMLPurifier_Token_Start('b'),
                    new HTMLPurifier_Token_Text('Whoa! '),
                    new HTMLPurifier_Token_Text('<'),
                    new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
                    new HTMLPurifier_Token_End('b'),
                ),
            )
        );
    }
    
    function test_tokenizeHTML_commentWithFunkyChars() {
        $this->assertTokenization(
            '<!-- This >< comment --><br />',
            array(
                new HTMLPurifier_Token_Comment(' This >< comment '),
                new HTMLPurifier_Token_Empty('br'),
            ),
            array(
                'PEARSax3' => false,
            )
        );
    }
    
    function test_tokenizeHTML_unterminatedComment() {
        $this->assertTokenization(
            '<!-- This >< comment',
            array( new HTMLPurifier_Token_Comment(' This >< comment') ),
            array(
                'DOMLex'   => false,
                'PEARSax3' => false,
                'PH5P'     => false,
            )
        );
    }
    
    function test_tokenizeHTML_scriptCDATAContents() {
        $this->config->set('HTML', 'Trusted', true);
        $this->assertTokenization(
            'Foo: <script>alert("<foo>");</script>',
            array(
                new HTMLPurifier_Token_Text('Foo: '),
                new HTMLPurifier_Token_Start('script'),
                new HTMLPurifier_Token_Text('alert("<foo>");'),
                new HTMLPurifier_Token_End('script'),
            ),
            array(
                'PEARSax3' => false,
                // PH5P, for some reason, bubbles the script to <head>
                'PH5P' => false,
            )
        );
    }
    
    function test_tokenizeHTML_entitiesInComment() {
        $this->config->set('Core', 'AggressivelyFixLt', true);
        // NOTE(review): entity position reconstructed -- confirm upstream
        $this->assertTokenization(
            '<!-- This comment < &lt; & -->',
            array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ),
            array(
                'PEARSax3' => false
            )
        );
    }
    
    function test_tokenizeHTML_attributeWithSpecialCharacters() {
        $this->assertTokenization(
            '<a href="><>">',
            array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
            array(
                'DirectLex' => array(
                    new HTMLPurifier_Token_Start('a', array('href' => '')),
                    new HTMLPurifier_Token_Text('<">'),
                ),
                'PEARSax3' => false,
            )
        );
    }
    
    function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
        $this->assertTokenization(
            '<param name="src" value="http://example.com/video.wmv" />',
            array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
        );
    }
    
    // Template for new tokenizeHTML() test cases:
    /*
    
    function test_tokenizeHTML_() {
        $this->assertTokenization(
            ,
            array(
                
            )
        );
    }
    */
    
}
 | |
| 
 |