diff --git a/HTML_Lexer.php b/HTML_Lexer.php
index d69629d4..ebcd0a33 100644
--- a/HTML_Lexer.php
+++ b/HTML_Lexer.php
@@ -9,7 +9,7 @@ TODO:
*/
-class MarkupLexer
+class HTML_Lexer
{
function nextQuote($string, $offset = 0) {
@@ -98,7 +98,10 @@ class MarkupLexer
continue;
}
- // Check if it is self closing, if so, remove trailing slash
+ // Check if it is explicitly self closing, if so, remove
+ // trailing slash. Remember, we could have a tag like <br>, so
+ // any later token processing scripts must reclassify such
+ // improperly classified StartTags as EmptyTags.
$is_self_closing = (strpos($segment,'/') === strlen($segment) - 1);
if ($is_self_closing) {
$segment = substr($segment, 0, strlen($segment) - 1);
@@ -189,4 +192,45 @@ class MarkupLexer
}
+// Lexer variant that hands the markup to the PEAR class XML_HTMLSax3 and
+// records every callback the parser fires as a token object. Only four
+// token classes are created here: MF_StartTag, MF_EndTag, MF_Text and
+// MF_Comment.
+class HTML_Lexer_Sax extends HTML_Lexer
+{
+
+ // Tokens collected by the handler callbacks during the current
+ // tokenizeHTML() call, appended in the order the callbacks run.
+ var $tokens = array();
+
+ // Tokenizes $html by running it through a fresh XML_HTMLSax3 parser.
+ // $html is passed unchanged to the parser's parse() method.
+ // Returns the array of token objects gathered by the handlers below.
+ function tokenizeHTML($html) {
+ // discard anything left over from a previous call
+ $this->tokens = array();
+ $parser=& new XML_HTMLSax3();
+ // register this object and the names of its handler methods
+ $parser->set_object($this);
+ $parser->set_element_handler('openHandler','closeHandler');
+ $parser->set_data_handler('dataHandler');
+ $parser->set_escape_handler('escapeHandler');
+ $parser->parse($html);
+ return $this->tokens;
+ }
+
+ // Element-open callback: appends an MF_StartTag built from $name and
+ // $attrs. Always returns true.
+ // NOTE(review): no empty-tag token is ever created in this class, so
+ // presumably self-closing elements also arrive here - confirm.
+ function openHandler(&$parser, $name, $attrs) {
+ $this->tokens[] = new MF_StartTag($name, $attrs);
+ return true;
+ }
+
+ // Element-close callback: appends an MF_EndTag built from $name.
+ // Always returns true.
+ function closeHandler(&$parser, $name) {
+ $this->tokens[] = new MF_EndTag($name);
+ return true;
+ }
+
+ // Data callback: appends an MF_Text wrapping $data as received.
+ // Always returns true.
+ function dataHandler(&$parser, $data) {
+ $this->tokens[] = new MF_Text($data);
+ return true;
+ }
+
+ // Escape callback: appends an MF_Comment only when $data begins with
+ // '-'; any other escape is dropped without producing a token.
+ // Always returns true.
+ // NOTE(review): presumably $data is the text inside a "<!...>"
+ // construct, so a leading '-' marks a comment and the dashes stay in
+ // the token - confirm against the XML_HTMLSax3 documentation.
+ function escapeHandler(&$parser, $data) {
+ if (strpos($data, '-') === 0) {
+ $this->tokens[] = new MF_Comment($data);
+ }
+ return true;
+ }
+
+}
+
?>
\ No newline at end of file
diff --git a/tests/HTML_Lexer.php b/tests/HTML_Lexer.php
index a4d9c35e..8a3563f3 100644
--- a/tests/HTML_Lexer.php
+++ b/tests/HTML_Lexer.php
@@ -1,16 +1,18 @@
MarkupLexer =& new MarkupLexer();
+ $this->HTML_Lexer =& new HTML_Lexer();
+ $this->HTML_Lexer_Sax =& new HTML_Lexer_Sax();
}
function test_nextWhiteSpace() {
- $HP =& $this->MarkupLexer;
+ $HP =& $this->HTML_Lexer;
$this->assertIdentical(false, $HP->nextWhiteSpace('asdf'));
$this->assertIdentical(0, $HP->nextWhiteSpace(' asdf'));
$this->assertIdentical(0, $HP->nextWhiteSpace("\nasdf"));
@@ -90,9 +92,13 @@ class TestCase_MarkupLexer extends UnitTestCase
$size = count($input);
for($i = 0; $i < $size; $i++) {
- $result = $this->MarkupLexer->tokenizeHTML($input[$i]);
+ $result = $this->HTML_Lexer->tokenizeHTML($input[$i]);
$this->assertEqual($expect[$i], $result);
paintIf($result, $expect[$i] != $result);
+
+ // Since I didn't write the parser, I can't define its behavior.
+ // However, make sure that the class runs without any errors.
+ $exp_result = $this->HTML_Lexer_Sax->tokenizeHTML($input[$i]);
}
}
@@ -116,7 +122,7 @@ class TestCase_MarkupLexer extends UnitTestCase
$size = count($input);
for($i = 0; $i < $size; $i++) {
- $result = $this->MarkupLexer->tokenizeAttributeString($input[$i]);
+ $result = $this->HTML_Lexer->tokenizeAttributeString($input[$i]);
$this->assertEqual($expect[$i], $result);
paintIf($result, $expect[$i] != $result);
}