diff --git a/HTML_Lexer.php b/HTML_Lexer.php index d69629d4..ebcd0a33 100644 --- a/HTML_Lexer.php +++ b/HTML_Lexer.php @@ -9,7 +9,7 @@ TODO: */ -class MarkupLexer +class HTML_Lexer { function nextQuote($string, $offset = 0) { @@ -98,7 +98,10 @@ class MarkupLexer continue; } - // Check if it is self closing, if so, remove trailing slash + // Check if it is explicitly self closing, if so, remove + // trailing slash. Remember, we could have a tag like <br>
, so + // any later token processing scripts must convert improperly + // classified EmptyTags from StartTags. $is_self_closing = (strpos($segment,'/') === strlen($segment) - 1); if ($is_self_closing) { $segment = substr($segment, 0, strlen($segment) - 1); @@ -189,4 +192,45 @@ class MarkupLexer } +// uses the PEAR class XML_HTMLSax3 to parse XML +class HTML_Lexer_Sax extends HTML_Lexer +{ + + var $tokens = array(); + + function tokenizeHTML($html) { + $this->tokens = array(); + $parser=& new XML_HTMLSax3(); + $parser->set_object($this); + $parser->set_element_handler('openHandler','closeHandler'); + $parser->set_data_handler('dataHandler'); + $parser->set_escape_handler('escapeHandler'); + $parser->parse($html); + return $this->tokens; + } + + function openHandler(&$parser, $name, $attrs) { + $this->tokens[] = new MF_StartTag($name, $attrs); + return true; + } + + function closeHandler(&$parser, $name) { + $this->tokens[] = new MF_EndTag($name); + return true; + } + + function dataHandler(&$parser, $data) { + $this->tokens[] = new MF_Text($data); + return true; + } + + function escapeHandler(&$parser, $data) { + if (strpos($data, '-') === 0) { + $this->tokens[] = new MF_Comment($data); + } + return true; + } + +} + ?> \ No newline at end of file diff --git a/tests/HTML_Lexer.php b/tests/HTML_Lexer.php index a4d9c35e..8a3563f3 100644 --- a/tests/HTML_Lexer.php +++ b/tests/HTML_Lexer.php @@ -1,16 +1,18 @@ MarkupLexer =& new MarkupLexer(); + $this->HTML_Lexer =& new HTML_Lexer(); + $this->HTML_Lexer_Sax =& new HTML_Lexer_Sax(); } function test_nextWhiteSpace() { - $HP =& $this->MarkupLexer; + $HP =& $this->HTML_Lexer; $this->assertIdentical(false, $HP->nextWhiteSpace('asdf')); $this->assertIdentical(0, $HP->nextWhiteSpace(' asdf')); $this->assertIdentical(0, $HP->nextWhiteSpace("\nasdf")); @@ -90,9 +92,13 @@ class TestCase_MarkupLexer extends UnitTestCase $size = count($input); for($i = 0; $i < $size; $i++) { - $result = $this->MarkupLexer->tokenizeHTML($input[$i]); + 
$result = $this->HTML_Lexer->tokenizeHTML($input[$i]); $this->assertEqual($expect[$i], $result); paintIf($result, $expect[$i] != $result); + + // since I didn't write the parser, I can't define its behavior + // however, make sure that the class runs without any errors + $exp_result = $this->HTML_Lexer_Sax->tokenizeHTML($input[$i]); } } @@ -116,7 +122,7 @@ class TestCase_MarkupLexer extends UnitTestCase $size = count($input); for($i = 0; $i < $size; $i++) { - $result = $this->MarkupLexer->tokenizeAttributeString($input[$i]); + $result = $this->HTML_Lexer->tokenizeAttributeString($input[$i]); $this->assertEqual($expect[$i], $result); paintIf($result, $expect[$i] != $result); }