diff --git a/Lexer/PEARSax3.php b/Lexer/PEARSax3.php new file mode 100644 index 00000000..0232d9a3 --- /dev/null +++ b/Lexer/PEARSax3.php @@ -0,0 +1,399 @@ +_entity_utf8 = version_compare(PHP_VERSION, '5', '>='); + } + + // this is QUITE a knotty problem + // + // The main trouble is that, even while assuming UTF-8 is what we're + // using, we've got to deal with HTML entities (like —) + // Not even sure if the PHP 5 decoding function does that. Plus, + // SimpleTest doesn't use UTF-8! + // + // However, we MUST parse everything possible, because once you get + // to the HTML generator, it will escape everything possible (although + // that may not be correct, and we should be using htmlspecialchars() ). + // + // Nevertheless, strictly XML speaking, we cannot assume any character + // entities are defined except the htmlspecialchars() ones, so leaving + // the entities inside HERE is not acceptable. (plus, htmlspecialchars + // might convert them anyway). So EVERYTHING must get parsed. + // + // We may need to roll our own character entity lookup table. It's only + // about 250, fortunantely, the decimal/hex ones map cleanly to UTF-8. + function parseData($string) { + // we may want to let the user do a different char encoding, + // although there is NO REASON why they shouldn't be able + // to convert it to UTF-8 before they pass it to us + + // no support for less than PHP 4.3 + if ($this->_entity_utf8) { + // PHP 5+, UTF-8 is nicely supported + return @html_entity_decode($string, ENT_QUOTES, 'UTF-8'); + } else { + // PHP 4, do compat stuff + $string = html_entity_decode($string, ENT_QUOTES, 'ISO-8859-1'); + // get the numeric UTF-8 stuff + $string = preg_replace('/&#(\d+);/me', "chr(\\1)", $string); + $string = preg_replace('/&#x([a-f0-9]+);/mei',"chr(0x\\1)",$string); + // get the stringy UTF-8 stuff + return $string; + } + } + + function nextQuote($string, $offset = 0) { + $quotes = array('"', "'"); + return $this->next($string, $quotes, $offset); + } + + function nextWhiteSpace($string, $offset = 0) { + $spaces = array(chr(0x20), chr(0x9), chr(0xD), chr(0xA)); + return $this->next($string, $spaces, $offset); + } + + function next($haystack, $needles, $offset = 0) { + if (is_string($needles)) { + $string_needles = $needles; + $needles = array(); + $size = strlen($string_needles); + for ($i = 0; $i < $size; $i++) { + $needles[] = $string_needles{$i}; + } + } + $positions = array(); + foreach ($needles as $needle) { + $position = strpos($haystack, $needle, $offset); + if ($position !== false) { + $positions[] = $position; + } + } + return empty($positions) ? false : min($positions); + } + + function tokenizeHTML($string) { + + // some quick checking (if empty, return empty) + $string = (string) $string; + if ($string == '') return array(); + + $cursor = 0; // our location in the text + $inside_tag = false; // whether or not we're parsing the inside of a tag + $array = array(); // result array + + // infinite loop protection + // has to be pretty big, since html docs can be big + // we're allow two hundred thousand tags... more than enough? + $loops = 0; + + while(true) { + + // infinite loop protection + if (++$loops > 200000) return array(); + + $position_next_lt = strpos($string, '<', $cursor); + $position_next_gt = strpos($string, '>', $cursor); + + // triggers on "asdf" but not "asdf " + if ($position_next_lt === $cursor) { + $inside_tag = true; + $cursor++; + } + + if (!$inside_tag && $position_next_lt !== false) { + // We are not inside tag and there still is another tag to parse + $array[] = new + HTMLPurifier_Token_Text( + html_entity_decode( + substr( + $string, $cursor, $position_next_lt - $cursor + ), + ENT_QUOTES + ) + ); + $cursor = $position_next_lt + 1; + $inside_tag = true; + continue; + } elseif (!$inside_tag) { + // We are not inside tag but there are no more tags + // If we're already at the end, break + if ($cursor === strlen($string)) break; + // Create Text of rest of string + $array[] = new + HTMLPurifier_Token_Text( + html_entity_decode( + substr( + $string, $cursor + ), + ENT_QUOTES + ) + ); + break; + } elseif ($inside_tag && $position_next_gt !== false) { + // We are in tag and it is well formed + // Grab the internals of the tag + $segment = substr($string, $cursor, $position_next_gt-$cursor); + + // Check if it's a comment + if ( + substr($segment,0,3) == '!--' && + substr($segment,strlen($segment)-2,2) == '--' + ) { + $array[] = new + HTMLPurifier_Token_Comment( + substr( + $segment, 3, strlen($segment) - 5 + ) + ); + $inside_tag = false; + $cursor = $position_next_gt + 1; + continue; + } + + // Check if it's an end tag + $is_end_tag = (strpos($segment,'/') === 0); + if ($is_end_tag) { + $type = substr($segment, 1); + $array[] = new HTMLPurifier_Token_End($type); + $inside_tag = false; + $cursor = $position_next_gt + 1; + continue; + } + + // Check if it is explicitly self closing, if so, remove + // trailing slash. Remember, we could have a tag like
, so + // any later token processing scripts must convert improperly + // classified EmptyTags from StartTags. + $is_self_closing= (strpos($segment,'/') === strlen($segment)-1); + if ($is_self_closing) { + $segment = substr($segment, 0, strlen($segment) - 1); + } + + // Check if there are any attributes + $position_first_space = $this->nextWhiteSpace($segment); + if ($position_first_space === false) { + if ($is_self_closing) { + $array[] = new HTMLPurifier_Token_Empty($segment); + } else { + $array[] = new HTMLPurifier_Token_Start($segment); + } + $inside_tag = false; + $cursor = $position_next_gt + 1; + continue; + } + + // Grab out all the data + $type = substr($segment, 0, $position_first_space); + $attribute_string = + trim( + substr( + $segment, $position_first_space + ) + ); + $attributes = $this->tokenizeAttributeString($attribute_string); + if ($is_self_closing) { + $array[] = new HTMLPurifier_Token_Empty($type, $attributes); + } else { + $array[] = new HTMLPurifier_Token_Start($type, $attributes); + } + $cursor = $position_next_gt + 1; + $inside_tag = false; + continue; + } else { + $array[] = new + HTMLPurifier_Token_Text( + '<' . + html_entity_decode( + substr($string, $cursor), + ENT_QUOTES + ) + ); + break; + } + break; + } + return $array; + } + + function tokenizeAttributeString($string) { + $string = (string) $string; + if ($string == '') return array(); + $array = array(); + $cursor = 0; + $in_value = false; + $i = 0; + $size = strlen($string); + + // if we have unquoted attributes, the parser expects a terminating + // space, so let's guarantee that there's always a terminating space. + $string .= ' '; + + // infinite loop protection + $loops = 0; + + while(true) { + + // infinite loop protection + // if we've looped 1000 times, abort. Nothing good can come of this + if (++$loops > 1000) return array(); + + if ($cursor >= $size) { + break; + } + $position_next_space = $this->nextWhiteSpace($string, $cursor); + //scroll to the last whitespace before text + while ($position_next_space === $cursor) { + $cursor++; + $position_next_space = $this->nextWhiteSpace($string, $cursor); + } + $position_next_equal = strpos($string, '=', $cursor); + if ($position_next_equal !== false && + ($position_next_equal < $position_next_space || + $position_next_space === false)) { + //attr="asdf" + // grab the key + $key = trim( + substr( + $string, $cursor, $position_next_equal - $cursor + ) + ); + + // set cursor right after the equal sign + $cursor = $position_next_equal + 1; + + // consume all spaces after the equal sign + $position_next_space = $this->nextWhiteSpace($string, $cursor); + while ($position_next_space === $cursor) { + $cursor++; + $position_next_space=$this->nextWhiteSpace($string,$cursor); + } + + // if we've hit the end, assign the key an empty value and abort + if ($cursor >= $size) { + $array[$key] = ''; + break; + } + + // find the next quote + $position_next_quote = $this->nextQuote($string, $cursor); + + // if the quote is not where the cursor is, we're dealing + // with an unquoted attribute + if ($position_next_quote !== $cursor) { + if ($key) { + $array[$key] = trim(substr($string, $cursor, + $position_next_space - $cursor)); + } + $cursor = $position_next_space + 1; + continue; + } + + // otherwise, regular attribute + $quote = $string{$position_next_quote}; + $position_end_quote = strpos( + $string, $quote, $position_next_quote + 1 + ); + + // check if the ending quote is missing + if ($position_end_quote === false) { + // it is, assign it to the end of the string + $position_end_quote = $size; + } + + $value = substr($string, $position_next_quote + 1, + $position_end_quote - $position_next_quote - 1); + if ($key) { + $array[$key] = html_entity_decode($value, ENT_QUOTES); + } + $cursor = $position_end_quote + 1; + } else { + //boolattr + if ($position_next_space === false) { + $position_next_space = $size; + } + $key = substr($string, $cursor, $position_next_space - $cursor); + if ($key) { + $array[$key] = $key; + } + $cursor = $position_next_space + 1; + } + } + return $array; + } + +} + +// uses the PEAR class XML_HTMLSax3 to parse XML +// only shares the tokenizeHTML() function +class HTMLPurifier_Lexer_Sax extends HTMLPurifier_Lexer +{ + + var $tokens = array(); + + function tokenizeHTML($html) { + $this->tokens = array(); + $parser=& new XML_HTMLSax3(); + $parser->set_object($this); + $parser->set_element_handler('openHandler','closeHandler'); + $parser->set_data_handler('dataHandler'); + $parser->set_escape_handler('escapeHandler'); + $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1); + $parser->parse($html); + return $this->tokens; + } + + function openHandler(&$parser, $name, $attrs, $closed) { + if ($closed) { + $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs); + } else { + $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs); + } + return true; + } + + function closeHandler(&$parser, $name) { + // HTMLSax3 seems to always send empty tags an extra close tag + // check and ignore if you see it: + // [TESTME] to make sure it doesn't overreach + if ($this->tokens[count($this->tokens)-1]->type == 'empty') { + return true; + } + $this->tokens[] = new HTMLPurifier_Token_End($name); + return true; + } + + function dataHandler(&$parser, $data) { + $this->tokens[] = new HTMLPurifier_Token_Text($data); + return true; + } + + function escapeHandler(&$parser, $data) { + if (strpos($data, '-') === 0) { + $this->tokens[] = new HTMLPurifier_Token_Comment($data); + } + return true; + } + +} + +?> \ No newline at end of file