svn:eol-style = native

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@97 48356398-32a2-884e-a903-53898d9a118a
2025-10-14 21:54:24 +02:00 · 2006-07-23 00:11:03 +00:00
parent 39c16f5cfd
commit 14f481bcf6
21 changed files with 3219 additions and 3219 deletions
--- a/library/HTMLPurifier/AttrDef.php
+++ b/library/HTMLPurifier/AttrDef.php
@@ -1,11 +1,11 @@
-<?php
-
-class HTMLPurifier_AttrDef
-{
-    var $def;
-    function HTMLPurifier_AttrDef($def) {
-        $this->def = $def;
-    }
-}
-
+<?php
+
+class HTMLPurifier_AttrDef
+{
+    var $def;
+    function HTMLPurifier_AttrDef($def) {
+        $this->def = $def;
+    }
+}
+
 ?>
--- a/library/HTMLPurifier/ChildDef.php
+++ b/library/HTMLPurifier/ChildDef.php
@@ -1,169 +1,169 @@
-<?php
-
-// HTMLPurifier_ChildDef and inheritance have three types of output:
-// true = leave nodes as is
-// false = delete parent node and all children
-// array(...) = replace children nodes with these
-
-// this is the hardest one to implement. We'll use fancy regexp tricks
-// right now, we only expect it to return TRUE or FALSE (it won't attempt
-// to fix the tree)
-
-// we may end up writing custom code for each HTML case
-// in order to make it self correcting
-class HTMLPurifier_ChildDef
-{
-    var $type = 'custom';
-    var $dtd_regex;
-    var $_pcre_regex;
-    function HTMLPurifier_ChildDef($dtd_regex) {
-        $this->dtd_regex = $dtd_regex;
-        $this->_compileRegex();
-    }
-    function _compileRegex() {
-        $raw = str_replace(' ', '', $this->dtd_regex);
-        if ($raw{0} != '(') {
-            $raw = "($raw)";
-        }
-        $reg = str_replace(',', ',?', $raw);
-        $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg);
-        $this->_pcre_regex = $reg;
-    }
-    function validateChildren($tokens_of_children) {
-        $list_of_children = '';
-        $nesting = 0; // depth into the nest
-        foreach ($tokens_of_children as $token) {
-            if (!empty($token->is_whitespace)) continue;
-            
-            $is_child = ($nesting == 0); // direct
-            
-            if ($token->type == 'start') {
-                $nesting++;
-            } elseif ($token->type == 'end') {
-                $nesting--;
-            }
-            
-            if ($is_child) {
-                $list_of_children .= $token->name . ',';
-            }
-        }
-        $list_of_children = rtrim($list_of_children, ',');
-        
-        $okay =
-            preg_match(
-                '/^'.$this->_pcre_regex.'$/',
-                $list_of_children
-            );
-        
-        return (bool) $okay;
-    }
-}
-class HTMLPurifier_ChildDef_Simple extends HTMLPurifier_ChildDef
-{
-    var $elements = array();
-    function HTMLPurifier_ChildDef_Simple($elements) {
-        if (is_string($elements)) {
-            $elements = str_replace(' ', '', $elements);
-            $elements = explode('|', $elements);
-        }
-        $elements = array_flip($elements);
-        foreach ($elements as $i => $x) $elements[$i] = true;
-        $this->elements = $elements;
-        $this->gen = new HTMLPurifier_Generator();
-    }
-    function validateChildren() {
-        trigger_error('Cannot call abstract function!', E_USER_ERROR);
-    }
-}
-class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef_Simple
-{
-    var $type = 'required';
-    function validateChildren($tokens_of_children) {
-        // if there are no tokens, delete parent node
-        if (empty($tokens_of_children)) return false;
-        
-        // the new set of children
-        $result = array();
-        
-        // current depth into the nest
-        $nesting = 0;
-        
-        // whether or not we're deleting a node
-        $is_deleting = false;
-        
-        // whether or not parsed character data is allowed
-        // this controls whether or not we silently drop a tag
-        // or generate escaped HTML from it
-        $pcdata_allowed = isset($this->elements['#PCDATA']);
-        
-        // a little sanity check to make sure it's not ALL whitespace
-        $all_whitespace = true;
-        
-        foreach ($tokens_of_children as $token) {
-            if (!empty($token->is_whitespace)) {
-                $result[] = $token;
-                continue;
-            }
-            $all_whitespace = false; // phew, we're not talking about whitespace
-            
-            $is_child = ($nesting == 0);
-            
-            if ($token->type == 'start') {
-                $nesting++;
-            } elseif ($token->type == 'end') {
-                $nesting--;
-            }
-            
-            if ($is_child) {
-                $is_deleting = false;
-                if (!isset($this->elements[$token->name])) {
-                    $is_deleting = true;
-                    if ($pcdata_allowed) {
-                        $result[] = new HTMLPurifier_Token_Text(
-                            $this->gen->generateFromToken($token)
-                        );
-                    }
-                    continue;
-                }
-            }
-            if (!$is_deleting) {
-                $result[] = $token;
-            } elseif ($pcdata_allowed) {
-                $result[] =
-                    new HTMLPurifier_Token_Text(
-                        $this->gen->generateFromToken( $token )
-                    );
-            } else {
-                // drop silently
-            }
-        }
-        if (empty($result)) return false;
-        if ($all_whitespace) return false;
-        if ($tokens_of_children == $result) return true;
-        return $result;
-    }
-}
-
-// only altered behavior is that it returns an empty array
-// instead of a false (to delete the node)
-class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
-{
-    var $type = 'optional';
-    function validateChildren($tokens_of_children) {
-        $result = parent::validateChildren($tokens_of_children);
-        if ($result === false) return array();
-        return $result;
-    }
-}
-
-// placeholder
-class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
-{
-    var $type = 'empty';
-    function HTMLPurifier_ChildDef_Empty() {}
-    function validateChildren() {
-        return false;
-    }
-}
-
+<?php
+
+// HTMLPurifier_ChildDef and inheritance have three types of output:
+// true = leave nodes as is
+// false = delete parent node and all children
+// array(...) = replace children nodes with these
+
+// this is the hardest one to implement. We'll use fancy regexp tricks
+// right now, we only expect it to return TRUE or FALSE (it won't attempt
+// to fix the tree)
+
+// we may end up writing custom code for each HTML case
+// in order to make it self correcting
+class HTMLPurifier_ChildDef
+{
+    var $type = 'custom';
+    var $dtd_regex;
+    var $_pcre_regex;
+    function HTMLPurifier_ChildDef($dtd_regex) {
+        $this->dtd_regex = $dtd_regex;
+        $this->_compileRegex();
+    }
+    function _compileRegex() {
+        $raw = str_replace(' ', '', $this->dtd_regex);
+        if ($raw{0} != '(') {
+            $raw = "($raw)";
+        }
+        $reg = str_replace(',', ',?', $raw);
+        $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg);
+        $this->_pcre_regex = $reg;
+    }
+    function validateChildren($tokens_of_children) {
+        $list_of_children = '';
+        $nesting = 0; // depth into the nest
+        foreach ($tokens_of_children as $token) {
+            if (!empty($token->is_whitespace)) continue;
+            
+            $is_child = ($nesting == 0); // direct
+            
+            if ($token->type == 'start') {
+                $nesting++;
+            } elseif ($token->type == 'end') {
+                $nesting--;
+            }
+            
+            if ($is_child) {
+                $list_of_children .= $token->name . ',';
+            }
+        }
+        $list_of_children = rtrim($list_of_children, ',');
+        
+        $okay =
+            preg_match(
+                '/^'.$this->_pcre_regex.'$/',
+                $list_of_children
+            );
+        
+        return (bool) $okay;
+    }
+}
+class HTMLPurifier_ChildDef_Simple extends HTMLPurifier_ChildDef
+{
+    var $elements = array();
+    function HTMLPurifier_ChildDef_Simple($elements) {
+        if (is_string($elements)) {
+            $elements = str_replace(' ', '', $elements);
+            $elements = explode('|', $elements);
+        }
+        $elements = array_flip($elements);
+        foreach ($elements as $i => $x) $elements[$i] = true;
+        $this->elements = $elements;
+        $this->gen = new HTMLPurifier_Generator();
+    }
+    function validateChildren() {
+        trigger_error('Cannot call abstract function!', E_USER_ERROR);
+    }
+}
+class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef_Simple
+{
+    var $type = 'required';
+    function validateChildren($tokens_of_children) {
+        // if there are no tokens, delete parent node
+        if (empty($tokens_of_children)) return false;
+        
+        // the new set of children
+        $result = array();
+        
+        // current depth into the nest
+        $nesting = 0;
+        
+        // whether or not we're deleting a node
+        $is_deleting = false;
+        
+        // whether or not parsed character data is allowed
+        // this controls whether or not we silently drop a tag
+        // or generate escaped HTML from it
+        $pcdata_allowed = isset($this->elements['#PCDATA']);
+        
+        // a little sanity check to make sure it's not ALL whitespace
+        $all_whitespace = true;
+        
+        foreach ($tokens_of_children as $token) {
+            if (!empty($token->is_whitespace)) {
+                $result[] = $token;
+                continue;
+            }
+            $all_whitespace = false; // phew, we're not talking about whitespace
+            
+            $is_child = ($nesting == 0);
+            
+            if ($token->type == 'start') {
+                $nesting++;
+            } elseif ($token->type == 'end') {
+                $nesting--;
+            }
+            
+            if ($is_child) {
+                $is_deleting = false;
+                if (!isset($this->elements[$token->name])) {
+                    $is_deleting = true;
+                    if ($pcdata_allowed) {
+                        $result[] = new HTMLPurifier_Token_Text(
+                            $this->gen->generateFromToken($token)
+                        );
+                    }
+                    continue;
+                }
+            }
+            if (!$is_deleting) {
+                $result[] = $token;
+            } elseif ($pcdata_allowed) {
+                $result[] =
+                    new HTMLPurifier_Token_Text(
+                        $this->gen->generateFromToken( $token )
+                    );
+            } else {
+                // drop silently
+            }
+        }
+        if (empty($result)) return false;
+        if ($all_whitespace) return false;
+        if ($tokens_of_children == $result) return true;
+        return $result;
+    }
+}
+
+// only altered behavior is that it returns an empty array
+// instead of a false (to delete the node)
+class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
+{
+    var $type = 'optional';
+    function validateChildren($tokens_of_children) {
+        $result = parent::validateChildren($tokens_of_children);
+        if ($result === false) return array();
+        return $result;
+    }
+}
+
+// placeholder
+class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
+{
+    var $type = 'empty';
+    function HTMLPurifier_ChildDef_Empty() {}
+    function validateChildren() {
+        return false;
+    }
+}
+
 ?>
--- a/library/HTMLPurifier/Definition.php
+++ b/library/HTMLPurifier/Definition.php
@@ -1,445 +1,445 @@
-<?php
-
-require_once 'HTMLPurifier/AttrDef.php';
-require_once 'HTMLPurifier/ChildDef.php';
-require_once 'HTMLPurifier/Generator.php';
-require_once 'HTMLPurifier/Token.php';
-
-class HTMLPurifier_Definition
-{
-    
-    var $generator;
-    var $info = array();
-    var $info_closes_p = array(
-        // these are all block elements: blocks aren't allowed in P
-        'address'       => true,
-        'blockquote'    => true,
-        'dd'            => true,
-        'dir'           => true,
-        'div'           => true, 
-        'dl'            => true,
-        'dt'            => true,
-        'h1'            => true,
-        'h2'            => true,
-        'h3'            => true,
-        'h4'            => true, 
-        'h5'            => true,
-        'h6'            => true,
-        'hr'            => true,
-        'ol'            => true,
-        'p'             => true,
-        'pre'           => true, 
-        'table'         => true,
-        'ul'            => true
-        );
-    
-    function HTMLPurifier_Definition() {
-        $this->generator = new HTMLPurifier_Generator();
-    }
-    
-    function loadData() {
-        // emulates the structure of the DTD
-        
-        // entities: prefixed with e_ and _ replaces .
-        // we don't use an array because that complicates interpolation
-        // strings are used instead of arrays because if you use arrays,
-        // you have to do some hideous manipulation with array_merge()
-        
-        // these are condensed, remember, with bad stuff taken out
-        
-        // transforms: font, menu, dir, center
-        
-        // DON'T MONKEY AROUND THIS unless you know what you are doing
-        // and also know the assumptions the code makes about what this
-        // contains for optimization purposes (see fixNesting)
-        
-        $e_special_extra = 'img';
-        $e_special_basic = 'br | span | bdo';
-        $e_special = "$e_special_basic | $e_special_extra";
-        $e_fontstyle_extra = 'big | small';
-        $e_fontstyle_basic = 'tt | i | b | u | s | strike';
-        $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra";
-        $e_phrase_extra = 'sub | sup';
-        $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'.
-          ' | cite | abbr | acronym';
-        $e_phrase = "$e_phrase_basic | $e_phrase_extra";
-        $e_inline_forms = ''; // humor the dtd
-        $e_misc_inline = 'ins | del';
-        $e_misc = "$e_misc_inline";
-        $e_inline = "a | $e_special | $e_fontstyle | $e_phrase".
-          " | $e_inline_forms";
-        // note the casing
-        $e_Inline = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_inline".
-          " | $e_misc_inline");
-        $e_heading = 'h1|h2|h3|h4|h5|h6';
-        $e_lists = 'ul | ol | dl';
-        $e_blocktext = 'pre | hr | blockquote | address';
-        $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table";
-        $e_Flow = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_block".
-          " | $e_inline | $e_misc");
-        $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_special".
-          " | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline");
-        $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a".
-          " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
-          " | $e_inline_forms | $e_misc_inline");
-        $e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused
-        $e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused
-        
-        $this->info['ins'] =
-        $this->info['del'] = 
-        $this->info['blockquote'] =
-        $this->info['dd']  =
-        $this->info['li']  =
-        $this->info['div'] = new HTMLPurifier_ElementDef($e_Flow);
-        
-        $this->info['em']  =
-        $this->info['strong'] =
-        $this->info['dfn']  =
-        $this->info['code'] =
-        $this->info['samp'] =
-        $this->info['kbd']  =
-        $this->info['var']  =
-        $this->info['code'] =
-        $this->info['samp'] =
-        $this->info['kbd']  =
-        $this->info['var']  =
-        $this->info['cite'] =
-        $this->info['abbr'] =
-        $this->info['acronym'] =
-        $this->info['q']    =
-        $this->info['sub']  =
-        $this->info['tt']   =
-        $this->info['sup']  =
-        $this->info['i']    =
-        $this->info['b']    =
-        $this->info['big']  =
-        $this->info['small'] =
-        $this->info['u']    =
-        $this->info['s']    =
-        $this->info['strike'] =
-        $this->info['bdo']  =
-        $this->info['span'] =
-        $this->info['dt']   =
-        $this->info['p']    = 
-        $this->info['h1']   = 
-        $this->info['h2']   = 
-        $this->info['h3']   = 
-        $this->info['h4']   = 
-        $this->info['h5']   = 
-        $this->info['h6']   = new HTMLPurifier_ElementDef($e_Inline);
-        
-        $this->info['ol']   =
-        $this->info['ul']   =
-          new HTMLPurifier_ElementDef(
-            new HTMLPurifier_ChildDef_Required('li')
-          );
-        
-        $this->info['dl']   =
-          new HTMLPurifier_ElementDef(
-            new HTMLPurifier_ChildDef_Required('dt|dd')
-          );
-        $this->info['address'] =
-          new HTMLPurifier_ElementDef(
-            new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
-              " | $e_misc_inline")
-          );
-        
-        $this->info['img']  =
-        $this->info['br']   =
-        $this->info['hr']   = new HTMLPurifier_ElementDef(new HTMLPurifier_ChildDef_Empty());
-        
-        $this->info['pre']  = new HTMLPurifier_ElementDef($e_pre_content);
-        
-        $this->info['a']    = new HTMLPurifier_ElementDef($e_a_content);
-        
-    }
-    
-    function purifyTokens($tokens) {
-        if (empty($this->info)) $this->loadData();
-        $tokens = $this->removeForeignElements($tokens);
-        $tokens = $this->makeWellFormed($tokens);
-        $tokens = $this->fixNesting($tokens);
-        $tokens = $this->validateAttributes($tokens);
-        return $tokens;
-    }
-    
-    function removeForeignElements($tokens) {
-        if (empty($this->info)) $this->loadData();
-        $result = array();
-        foreach($tokens as $token) {
-            if (!empty( $token->is_tag )) {
-                if (!isset($this->info[$token->name])) {
-                    // invalid tag, generate HTML and insert in
-                    $token = new HTMLPurifier_Token_Text(
-                        $this->generator->generateFromToken($token)
-                    );
-                }
-            } elseif ($token->type == 'comment') {
-                // strip comments
-                continue;
-            } elseif ($token->type == 'text') {
-            } else {
-                continue;
-            }
-            $result[] = $token;
-        }
-        return $result;
-    }
-    
-    function makeWellFormed($tokens) {
-        if (empty($this->info)) $this->loadData();
-        $result = array();
-        $current_nesting = array();
-        foreach ($tokens as $token) {
-            if (empty( $token->is_tag )) {
-                $result[] = $token;
-                continue;
-            }
-            $info = $this->info[$token->name]; // assumption but valid
-            
-            // test if it claims to be a start tag but is empty
-            if ($info->child_def->type == 'empty' &&
-                $token->type == 'start' ) {
-                
-                $result[] = new HTMLPurifier_Token_Empty($token->name,
-                                                         $token->attributes);
-                continue;
-            }
-            
-            // test if it claims to be empty but really is a start tag
-            if ($info->child_def->type != 'empty' &&
-                $token->type == 'empty' ) {
-                
-                $result[] = new HTMLPurifier_Token_Start($token->name,
-                                                         $token->attributes);
-                $result[] = new HTMLPurifier_Token_End($token->name);
-                
-                continue;
-            }
-            
-            // automatically insert empty tags
-            if ($token->type == 'empty') {
-                $result[] = $token;
-                continue;
-            }
-            
-            // we give start tags precedence, so automatically accept unless...
-            // it's one of those special cases
-            if ($token->type == 'start') {
-                
-                // if there's a parent, check for special case
-                if (!empty($current_nesting)) {
-                    $current_parent = array_pop($current_nesting);
-                    
-                    // check if we're closing a P tag
-                    if ($current_parent->name == 'p' &&
-                        isset($this->info_closes_p[$token->name])
-                        ) {
-                        $result[] = new HTMLPurifier_Token_End('p');
-                        $result[] = $token;
-                        $current_nesting[] = $token;
-                        continue;
-                    }
-                    
-                    // check if we're closing a LI tag
-                    if ($current_parent->name == 'li' &&
-                        $token->name == 'li'
-                        ) {
-                        $result[] = new HTMLPurifier_Token_End('li');
-                        $result[] = $token;
-                        $current_nesting[] = $token;
-                        continue;
-                    }
-                    
-                    // this is more TIDY stuff
-                    // we should also get some TABLE related code
-                    // mismatched h#
-                    
-                    $current_nesting[] = $current_parent; // undo the pop
-                }
-                
-                $result[] = $token;
-                $current_nesting[] = $token;
-                continue;
-            }
-            
-            // sanity check
-            if ($token->type != 'end') continue;
-            
-            // okay, we're dealing with a closing tag
-            
-            // make sure that we have something open
-            if (empty($current_nesting)) {
-                $result[] = new HTMLPurifier_Token_Text(
-                    $this->generator->generateFromToken($token)
-                );
-                continue;
-            }
-            
-            // first, check for the simplest case: everything closes neatly
-            
-            // current_nesting is modified
-            $current_parent = array_pop($current_nesting);
-            if ($current_parent->name == $token->name) {
-                $result[] = $token;
-                continue;
-            }
-            
-            // undo the array_pop
-            $current_nesting[] = $current_parent;
-            
-            // okay, so we're trying to close the wrong tag
-            
-            // scroll back the entire nest, trying to find our tag
-            // feature could be to specify how far you'd like to go
-            $size = count($current_nesting);
-            // -2 because -1 is the last element, but we already checked that
-            $skipped_tags = false;
-            for ($i = $size - 2; $i >= 0; $i--) {
-                if ($current_nesting[$i]->name == $token->name) {
-                    // current nesting is modified
-                    $skipped_tags = array_splice($current_nesting, $i);
-                    break;
-                }
-            }
-            
-            // we still didn't find the tag, so translate to text
-            if ($skipped_tags === false) {
-                $result[] = new HTMLPurifier_Token_Text(
-                    $this->generator->generateFromToken($token)
-                );
-                continue;
-            }
-            
-            // okay, we found it, close all the skipped tags
-            // note that skipped tags contains the element we need closed
-            $size = count($skipped_tags);
-            for ($i = $size - 1; $i >= 0; $i--) {
-                $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
-            }
-            
-            // done!
-            
-        }
-        
-        // we're at the end now, fix all still unclosed tags
-        
-        if (!empty($current_nesting)) {
-            $size = count($current_nesting);
-            for ($i = $size - 1; $i >= 0; $i--) {
-                $result[] =
-                    new HTMLPurifier_Token_End($current_nesting[$i]->name);
-            }
-        }
-        
-        return $result;
-    }
-    
-    function fixNesting($tokens) {
-        if (empty($this->info)) $this->loadData();
-        
-        // insert implicit "parent" node, will be removed at end
-        array_unshift($tokens, new HTMLPurifier_Token_Start('div'));
-        $tokens[] = new HTMLPurifier_Token_End('div');
-        
-        for ($i = 0, $size = count($tokens) ; $i < $size; ) {
-            
-            $child_tokens = array();
-            
-            // scroll to the end of this node, and report number
-            for ($j = $i, $depth = 0; ; $j++) {
-                if ($tokens[$j]->type == 'start') {
-                    $depth++;
-                    // skip token assignment on first iteration
-                    if ($depth == 1) continue;
-                } elseif ($tokens[$j]->type == 'end') {
-                    $depth--;
-                    // skip token assignment on last iteration
-                    if ($depth == 0) break;
-                }
-                $child_tokens[] = $tokens[$j];
-            }
-            
-            // $i is index of start token
-            // $j is index of end token
-            
-            // have DTD child def validate children
-            $element_def = $this->info[$tokens[$i]->name];
-            $result = $element_def->child_def->validateChildren($child_tokens);
-            
-            // process result
-            if ($result === true) {
-                
-                // leave the nodes as is
-                
-            } elseif($result === false) {
-                
-                // WARNING WARNING WARNING!!!
-                // While for the original DTD, there will never be
-                // cascading removal, more complex ones may have such
-                // a problem.
-                
-                // If you modify the info array such that an element
-                // that requires children may contain a child that requires
-                // children, you need to also scroll back and re-check that
-                // elements parent node
-                
-                $length = $j - $i + 1;
-                
-                // remove entire node
-                array_splice($tokens, $i, $length);
-                
-                // change size
-                $size -= $length;
-                
-                // ensure that we scroll to the next node
-                $i--;
-                
-            } else {
-                
-                $length = $j - $i - 1;
-                
-                // replace node with $result
-                array_splice($tokens, $i + 1, $length, $result);
-                
-                // change size
-                $size -= $length;
-                $size += count($result);
-                
-            }
-            
-            // scroll to next node
-            $i++;
-            while ($i < $size and $tokens[$i]->type != 'start') $i++;
-            
-        }
-        
-        // remove implicit divs
-        array_shift($tokens);
-        array_pop($tokens);
-        
-        return $tokens;
-        
-    }
-    
-    function validateAttributes($tokens) {
-        if (empty($this->info)) $this->loadData();
-        
-    }
-    
-}
-
-class HTMLPurifier_ElementDef
-{
-    
-    var $child_def;
-    var $attr_def = array();
-    
-    function HTMLPurifier_ElementDef($child_def, $attr_def = array()) {
-        $this->child_def = $child_def;
-        $this->attr_def  = $attr_def;
-    }
-    
-}
-
+<?php
+
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/ChildDef.php';
+require_once 'HTMLPurifier/Generator.php';
+require_once 'HTMLPurifier/Token.php';
+
+class HTMLPurifier_Definition
+{
+    
+    var $generator;
+    var $info = array();
+    var $info_closes_p = array(
+        // these are all block elements: blocks aren't allowed in P
+        'address'       => true,
+        'blockquote'    => true,
+        'dd'            => true,
+        'dir'           => true,
+        'div'           => true, 
+        'dl'            => true,
+        'dt'            => true,
+        'h1'            => true,
+        'h2'            => true,
+        'h3'            => true,
+        'h4'            => true, 
+        'h5'            => true,
+        'h6'            => true,
+        'hr'            => true,
+        'ol'            => true,
+        'p'             => true,
+        'pre'           => true, 
+        'table'         => true,
+        'ul'            => true
+        );
+    
+    function HTMLPurifier_Definition() {
+        $this->generator = new HTMLPurifier_Generator();
+    }
+    
+    function loadData() {
+        // emulates the structure of the DTD
+        
+        // entities: prefixed with e_ and _ replaces .
+        // we don't use an array because that complicates interpolation
+        // strings are used instead of arrays because if you use arrays,
+        // you have to do some hideous manipulation with array_merge()
+        
+        // these are condensed, remember, with bad stuff taken out
+        
+        // transforms: font, menu, dir, center
+        
+        // DON'T MONKEY AROUND THIS unless you know what you are doing
+        // and also know the assumptions the code makes about what this
+        // contains for optimization purposes (see fixNesting)
+        
+        $e_special_extra = 'img';
+        $e_special_basic = 'br | span | bdo';
+        $e_special = "$e_special_basic | $e_special_extra";
+        $e_fontstyle_extra = 'big | small';
+        $e_fontstyle_basic = 'tt | i | b | u | s | strike';
+        $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra";
+        $e_phrase_extra = 'sub | sup';
+        $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'.
+          ' | cite | abbr | acronym';
+        $e_phrase = "$e_phrase_basic | $e_phrase_extra";
+        $e_inline_forms = ''; // humor the dtd
+        $e_misc_inline = 'ins | del';
+        $e_misc = "$e_misc_inline";
+        $e_inline = "a | $e_special | $e_fontstyle | $e_phrase".
+          " | $e_inline_forms";
+        // note the casing
+        $e_Inline = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_inline".
+          " | $e_misc_inline");
+        $e_heading = 'h1|h2|h3|h4|h5|h6';
+        $e_lists = 'ul | ol | dl';
+        $e_blocktext = 'pre | hr | blockquote | address';
+        $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table";
+        $e_Flow = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_block".
+          " | $e_inline | $e_misc");
+        $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_special".
+          " | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline");
+        $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a".
+          " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
+          " | $e_inline_forms | $e_misc_inline");
+        $e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused
+        $e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused
+        
+        $this->info['ins'] =
+        $this->info['del'] = 
+        $this->info['blockquote'] =
+        $this->info['dd']  =
+        $this->info['li']  =
+        $this->info['div'] = new HTMLPurifier_ElementDef($e_Flow);
+        
+        $this->info['em']  =
+        $this->info['strong'] =
+        $this->info['dfn']  =
+        $this->info['code'] =
+        $this->info['samp'] =
+        $this->info['kbd']  =
+        $this->info['var']  =
+        $this->info['code'] =
+        $this->info['samp'] =
+        $this->info['kbd']  =
+        $this->info['var']  =
+        $this->info['cite'] =
+        $this->info['abbr'] =
+        $this->info['acronym'] =
+        $this->info['q']    =
+        $this->info['sub']  =
+        $this->info['tt']   =
+        $this->info['sup']  =
+        $this->info['i']    =
+        $this->info['b']    =
+        $this->info['big']  =
+        $this->info['small'] =
+        $this->info['u']    =
+        $this->info['s']    =
+        $this->info['strike'] =
+        $this->info['bdo']  =
+        $this->info['span'] =
+        $this->info['dt']   =
+        $this->info['p']    = 
+        $this->info['h1']   = 
+        $this->info['h2']   = 
+        $this->info['h3']   = 
+        $this->info['h4']   = 
+        $this->info['h5']   = 
+        $this->info['h6']   = new HTMLPurifier_ElementDef($e_Inline);
+        
+        $this->info['ol']   =
+        $this->info['ul']   =
+          new HTMLPurifier_ElementDef(
+            new HTMLPurifier_ChildDef_Required('li')
+          );
+        
+        $this->info['dl']   =
+          new HTMLPurifier_ElementDef(
+            new HTMLPurifier_ChildDef_Required('dt|dd')
+          );
+        $this->info['address'] =
+          new HTMLPurifier_ElementDef(
+            new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
+              " | $e_misc_inline")
+          );
+        
+        $this->info['img']  =
+        $this->info['br']   =
+        $this->info['hr']   = new HTMLPurifier_ElementDef(new HTMLPurifier_ChildDef_Empty());
+        
+        $this->info['pre']  = new HTMLPurifier_ElementDef($e_pre_content);
+        
+        $this->info['a']    = new HTMLPurifier_ElementDef($e_a_content);
+        
+    }
+    
+    // Runs a token list through the purification stages and returns the
+    // value produced by the last one. The stages are applied innermost
+    // first: removeForeignElements, makeWellFormed, fixNesting and finally
+    // validateAttributes.
+    function purifyTokens($tokens) {
+        // the element definitions must exist before any stage runs
+        if (empty($this->info)) {
+            $this->loadData();
+        }
+        return $this->validateAttributes(
+            $this->fixNesting(
+                $this->makeWellFormed(
+                    $this->removeForeignElements($tokens)
+                )
+            )
+        );
+    }
+    
+    // Filters a token list against the known element definitions and
+    // returns the survivors as a new array:
+    //  - a tag whose name has no entry in $this->info is replaced by a text
+    //    token holding the generator's rendering of that tag
+    //  - text tokens are kept as they are
+    //  - comments and every other kind of non-tag token are dropped
+    function removeForeignElements($tokens) {
+        if (empty($this->info)) $this->loadData();
+        $kept = array();
+        foreach ($tokens as $token) {
+            if (!empty( $token->is_tag )) {
+                if (!isset($this->info[$token->name])) {
+                    // invalid tag, keep its markup but only as text
+                    $markup = $this->generator->generateFromToken($token);
+                    $token = new HTMLPurifier_Token_Text($markup);
+                }
+                $kept[] = $token;
+                continue;
+            }
+            // strip comments
+            if ($token->type == 'comment') continue;
+            // whatever is left must be text, otherwise it is discarded too
+            if ($token->type != 'text') continue;
+            $kept[] = $token;
+        }
+        return $kept;
+    }
+    
+    // Balances the start and end tags of a token list and returns the
+    // balanced list as a new array.
+    //
+    //  - non-tag tokens are copied through untouched
+    //  - a start token for an element whose child definition is of type
+    //    'empty' becomes an empty token; an empty token for any other
+    //    element becomes a start/end pair
+    //  - a start tag listed in $this->info_closes_p closes a P that is the
+    //    innermost open element, and an LI closes an innermost open LI
+    //  - an end tag with no matching open element is turned into text
+    //  - an end tag matching an outer element also closes everything that
+    //    was opened inside that element, innermost first
+    //  - elements still open at the end of input are closed
+    //
+    // Every tag name is looked up in $this->info without an isset() check,
+    // so unknown elements must already have been removed.
+    function makeWellFormed($tokens) {
+        if (empty($this->info)) $this->loadData();
+        $result = array();
+        // stack of the start tokens that are currently open, innermost last
+        $current_nesting = array();
+        foreach ($tokens as $token) {
+            if (empty( $token->is_tag )) {
+                $result[] = $token;
+                continue;
+            }
+            $info = $this->info[$token->name]; // assumption but valid
+            
+            // test if it claims to be a start tag but is empty
+            if ($info->child_def->type == 'empty' &&
+                $token->type == 'start' ) {
+                
+                $result[] = new HTMLPurifier_Token_Empty($token->name,
+                                                         $token->attributes);
+                continue;
+            }
+            
+            // test if it claims to be empty but really is a start tag
+            if ($info->child_def->type != 'empty' &&
+                $token->type == 'empty' ) {
+                
+                $result[] = new HTMLPurifier_Token_Start($token->name,
+                                                         $token->attributes);
+                $result[] = new HTMLPurifier_Token_End($token->name);
+                
+                continue;
+            }
+            
+            // automatically insert empty tags
+            if ($token->type == 'empty') {
+                $result[] = $token;
+                continue;
+            }
+            
+            // we give start tags precedence, so automatically accept unless...
+            // it's one of those special cases
+            if ($token->type == 'start') {
+                
+                // if there's a parent, check for special case
+                if (!empty($current_nesting)) {
+                    // popped only to inspect it; pushed back below unless
+                    // one of the special cases closes it
+                    $current_parent = array_pop($current_nesting);
+                    
+                    // check if we're closing a P tag
+                    if ($current_parent->name == 'p' &&
+                        isset($this->info_closes_p[$token->name])
+                        ) {
+                        $result[] = new HTMLPurifier_Token_End('p');
+                        $result[] = $token;
+                        $current_nesting[] = $token;
+                        continue;
+                    }
+                    
+                    // check if we're closing a LI tag
+                    if ($current_parent->name == 'li' &&
+                        $token->name == 'li'
+                        ) {
+                        $result[] = new HTMLPurifier_Token_End('li');
+                        $result[] = $token;
+                        $current_nesting[] = $token;
+                        continue;
+                    }
+                    
+                    // this is more TIDY stuff
+                    // we should also get some TABLE related code
+                    // mismatched h#
+                    
+                    $current_nesting[] = $current_parent; // undo the pop
+                }
+                
+                $result[] = $token;
+                $current_nesting[] = $token;
+                continue;
+            }
+            
+            // sanity check
+            if ($token->type != 'end') continue;
+            
+            // okay, we're dealing with a closing tag
+            
+            // make sure that we have something open
+            if (empty($current_nesting)) {
+                $result[] = new HTMLPurifier_Token_Text(
+                    $this->generator->generateFromToken($token)
+                );
+                continue;
+            }
+            
+            // first, check for the simplest case: everything closes neatly
+            
+            // current_nesting is modified
+            $current_parent = array_pop($current_nesting);
+            if ($current_parent->name == $token->name) {
+                $result[] = $token;
+                continue;
+            }
+            
+            // undo the array_pop
+            $current_nesting[] = $current_parent;
+            
+            // okay, so we're trying to close the wrong tag
+            
+            // scroll back the entire nest, trying to find our tag
+            // feature could be to specify how far you'd like to go
+            $size = count($current_nesting);
+            // -2 because -1 is the last element, but we already checked that
+            // stays false when no open element matches, otherwise becomes
+            // the array of open tags removed from the stack
+            $skipped_tags = false;
+            for ($i = $size - 2; $i >= 0; $i--) {
+                if ($current_nesting[$i]->name == $token->name) {
+                    // current nesting is modified
+                    $skipped_tags = array_splice($current_nesting, $i);
+                    break;
+                }
+            }
+            
+            // we still didn't find the tag, so translate to text
+            if ($skipped_tags === false) {
+                $result[] = new HTMLPurifier_Token_Text(
+                    $this->generator->generateFromToken($token)
+                );
+                continue;
+            }
+            
+            // okay, we found it, close all the skipped tags
+            // note that skipped tags contains the element we need closed
+            $size = count($skipped_tags);
+            for ($i = $size - 1; $i >= 0; $i--) {
+                $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
+            }
+            
+            // done!
+            
+        }
+        
+        // we're at the end now, fix all still unclosed tags
+        
+        if (!empty($current_nesting)) {
+            $size = count($current_nesting);
+            for ($i = $size - 1; $i >= 0; $i--) {
+                $result[] =
+                    new HTMLPurifier_Token_End($current_nesting[$i]->name);
+            }
+        }
+        
+        return $result;
+    }
+    
+    // Checks the contents of every element against that element's child
+    // definition and returns the corrected token list.
+    //
+    // The list is temporarily wrapped in an implicit DIV. For each start
+    // token, everything between it and its matching end token is handed to
+    // the element's child_def->validateChildren(), whose answer is one of:
+    //   true   - leave the node as it is
+    //   false  - remove the node, including its start and end tokens
+    //   array  - replace the node's contents with the returned tokens
+    //
+    // The scan for the matching end token has no bounds check, so the input
+    // has to be well formed (balanced start/end tags).
+    function fixNesting($tokens) {
+        if (empty($this->info)) $this->loadData();
+        
+        // insert implicit "parent" node, will be removed at end
+        array_unshift($tokens, new HTMLPurifier_Token_Start('div'));
+        $tokens[] = new HTMLPurifier_Token_End('div');
+        
+        // $i always points at a start token; $size is kept in step with the
+        // splices below instead of being recounted
+        for ($i = 0, $size = count($tokens) ; $i < $size; ) {
+            
+            $child_tokens = array();
+            
+            // scroll to the end of this node, and report number
+            for ($j = $i, $depth = 0; ; $j++) {
+                if ($tokens[$j]->type == 'start') {
+                    $depth++;
+                    // skip token assignment on first iteration
+                    if ($depth == 1) continue;
+                } elseif ($tokens[$j]->type == 'end') {
+                    $depth--;
+                    // skip token assignment on last iteration
+                    if ($depth == 0) break;
+                }
+                $child_tokens[] = $tokens[$j];
+            }
+            
+            // $i is index of start token
+            // $j is index of end token
+            
+            // have DTD child def validate children
+            $element_def = $this->info[$tokens[$i]->name];
+            $result = $element_def->child_def->validateChildren($child_tokens);
+            
+            // process result
+            if ($result === true) {
+                
+                // leave the nodes as is
+                
+            } elseif($result === false) {
+                
+                // WARNING WARNING WARNING!!!
+                // While for the original DTD, there will never be
+                // cascading removal, more complex ones may have such
+                // a problem.
+                
+                // If you modify the info array such that an element
+                // that requires children may contain a child that requires
+                // children, you need to also scroll back and re-check that
+                // elements parent node
+                
+                // start token, contents and end token
+                $length = $j - $i + 1;
+                
+                // remove entire node
+                array_splice($tokens, $i, $length);
+                
+                // change size
+                $size -= $length;
+                
+                // ensure that we scroll to the next node
+                // (cancels the $i++ below, so the token that moved into
+                // position $i is examined next)
+                $i--;
+                
+            } else {
+                
+                // contents only, the start and end tokens stay
+                $length = $j - $i - 1;
+                
+                // replace node with $result
+                array_splice($tokens, $i + 1, $length, $result);
+                
+                // change size
+                $size -= $length;
+                $size += count($result);
+                
+            }
+            
+            // scroll to next node
+            $i++;
+            while ($i < $size and $tokens[$i]->type != 'start') $i++;
+            
+        }
+        
+        // remove implicit divs
+        array_shift($tokens);
+        array_pop($tokens);
+        
+        return $tokens;
+        
+    }
+    
+    // Attribute validation stage, called last by purifyTokens().
+    //
+    // The validation itself is not implemented yet, so the token list is
+    // handed back unchanged. The explicit return matters: purifyTokens()
+    // assigns this method's return value to its token list and returns it,
+    // so a missing return makes the whole pipeline yield NULL.
+    function validateAttributes($tokens) {
+        if (empty($this->info)) $this->loadData();
+        
+        // TODO: check the attributes of each tag token against the attr_def
+        // of its element definition
+        return $tokens;
+    }
+    
+}
+
+// Pairs the child definition of an element with its attribute definitions.
+class HTMLPurifier_ElementDef
+{
+    
+    // child definition object; its type property and validateChildren()
+    // method are what the purification stages use
+    var $child_def;
+    
+    // attribute definitions, an empty array unless the constructor is
+    // given something else
+    var $attr_def = array();
+    
+    // PHP 4 style constructor, stores both arguments as they are
+    function HTMLPurifier_ElementDef($child_def, $attr_def = array()) {
+        $this->attr_def  = $attr_def;
+        $this->child_def = $child_def;
+    }
+    
+}
+
 ?>
--- a/library/HTMLPurifier/Generator.php
+++ b/library/HTMLPurifier/Generator.php
@@ -1,45 +1,45 @@
-<?php
-
-class HTMLPurifier_Generator
-{
-    
-    function generateFromTokens($tokens) {
-        $html = '';
-        foreach ($tokens as $token) {
-            $html .= $this->generateFromToken($token);
-        }
-        return $html;
-    }
-    
-    function generateFromToken($token) {
-        if ($token->type == 'start') {
-            $attr = $this->generateAttributes($token->attributes);
-            return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
-            
-        } elseif ($token->type == 'end') {
-            return '</' . $token->name . '>';
-            
-        } elseif ($token->type == 'empty') {
-            $attr = $this->generateAttributes($token->attributes);
-             return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
-            
-        } elseif ($token->type == 'text') {
-            return htmlentities($token->data, ENT_COMPAT, 'UTF-8');
-            
-        } else {
-            return '';
-            
-        }
-    }
-    
-    function generateAttributes($assoc_array_of_attributes) {
-        $html = '';
-        foreach ($assoc_array_of_attributes as $key => $value) {
-            $html .= $key.'="'.htmlentities($value, ENT_COMPAT, 'UTF-8').'" ';
-        }
-        return rtrim($html);
-    }
-    
-}
-
+<?php
+
+// Turns token objects back into an HTML string.
+class HTMLPurifier_Generator
+{
+    
+    // Returns the concatenated markup of every token in $tokens, in order.
+    function generateFromTokens($tokens) {
+        $pieces = array();
+        foreach ($tokens as $token) {
+            $pieces[] = $this->generateFromToken($token);
+        }
+        return implode('', $pieces);
+    }
+    
+    // Returns the markup for one token, chosen by $token->type:
+    //   start - <name attributes>
+    //   end   - </name>
+    //   empty - <name attributes />
+    //   text  - the token's data with HTML entities escaped
+    // Any other type produces an empty string.
+    function generateFromToken($token) {
+        switch ($token->type) {
+            case 'start':
+                $attr = $this->generateAttributes($token->attributes);
+                // the separating space is only wanted if there are attributes
+                if ($attr) $attr = ' ' . $attr;
+                return '<' . $token->name . $attr . '>';
+            
+            case 'end':
+                return '</' . $token->name . '>';
+            
+            case 'empty':
+                $attr = $this->generateAttributes($token->attributes);
+                if ($attr) $attr = ' ' . $attr;
+                return '<' . $token->name . $attr . ' />';
+            
+            case 'text':
+                return htmlentities($token->data, ENT_COMPAT, 'UTF-8');
+            
+            default:
+                return '';
+        }
+    }
+    
+    // Renders an associative array as space separated key="value" pairs.
+    // Values are entity-escaped, keys are written out as given. An empty
+    // array gives an empty string.
+    function generateAttributes($assoc_array_of_attributes) {
+        $pairs = array();
+        foreach ($assoc_array_of_attributes as $key => $value) {
+            $escaped = htmlentities($value, ENT_COMPAT, 'UTF-8');
+            $pairs[] = $key . '="' . $escaped . '"';
+        }
+        return implode(' ', $pairs);
+    }
+    
+}
+
 ?>
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -1,354 +1,354 @@
-<?php
-
-/*
-
-TODO:
- * Reread the XML spec and make sure I got everything right
- * Add support for CDATA sections
- * Have comments output with the leading and trailing --s
- * Optimize and benchmark
- * Check MF_Text behavior: shouldn't the info in there be raw (entities parsed?)
-
-*/
-
-require_once 'HTMLPurifier/Lexer.php';
-
-class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
-{
-    
-    // does this version of PHP support utf8 as entity function charset?
-    var $_entity_utf8;
-    
-    function HTMLPurifier_Lexer() {
-        $this->_entity_utf8 = version_compare(PHP_VERSION, '5', '>=');
-    }
-    
-    // this is QUITE a knotty problem
-    // 
-    // The main trouble is that, even while assuming UTF-8 is what we're
-    // using, we've got to deal with HTML entities (like &mdash;)
-    // Not even sure if the PHP 5 decoding function does that. Plus,
-    // SimpleTest doesn't use UTF-8!
-    // 
-    // However, we MUST parse everything possible, because once you get
-    // to the HTML generator, it will escape everything possible (although
-    // that may not be correct, and we should be using htmlspecialchars() ).
-    // 
-    // Nevertheless, strictly XML speaking, we cannot assume any character
-    // entities are defined except the htmlspecialchars() ones, so leaving
-    // the entities inside HERE is not acceptable. (plus, htmlspecialchars
-    // might convert them anyway). So EVERYTHING must get parsed.
-    // 
-    // We may need to roll our own character entity lookup table. It's only
-    // about 250, fortunantely, the decimal/hex ones map cleanly to UTF-8.
-    function parseData($string) {
-        // we may want to let the user do a different char encoding,
-        // although there is NO REASON why they shouldn't be able
-        // to convert it to UTF-8 before they pass it to us
-        
-        // no support for less than PHP 4.3
-        if ($this->_entity_utf8) {
-            // PHP 5+, UTF-8 is nicely supported
-            return @html_entity_decode($string, ENT_QUOTES, 'UTF-8');
-        } else {
-            // PHP 4, do compat stuff
-            $string = html_entity_decode($string, ENT_QUOTES, 'ISO-8859-1');
-            // get the numeric UTF-8 stuff
-            $string = preg_replace('/&#(\d+);/me', "chr(\\1)", $string);
-            $string = preg_replace('/&#x([a-f0-9]+);/mei',"chr(0x\\1)",$string);
-            // get the stringy UTF-8 stuff
-            return $string;
-        }
-    }
-    
-    function nextQuote($string, $offset = 0) {
-        $next = strcspn($string, '"\'', $offset) + $offset;
-        return strlen($string) == $next ? false : $next;
-    }
-    
-    function nextWhiteSpace($string, $offset = 0) {
-        $next = strcspn($string, "\x20\x09\x0D\x0A", $offset) + $offset;
-        return strlen($string) == $next ? false : $next;
-    }
-    
-    function tokenizeHTML($string) {
-        
-        // some quick checking (if empty, return empty)
-        $string = @ (string) $string;
-        if ($string == '') return array();
-        
-        $cursor = 0; // our location in the text
-        $inside_tag = false; // whether or not we're parsing the inside of a tag
-        $array = array(); // result array
-        
-        // infinite loop protection
-        // has to be pretty big, since html docs can be big
-        // we're allow two hundred thousand tags... more than enough?
-        $loops = 0;
-        
-        while(true) {
-            
-            // infinite loop protection
-            if (++$loops > 200000) return array();
-            
-            $position_next_lt = strpos($string, '<', $cursor);
-            $position_next_gt = strpos($string, '>', $cursor);
-            
-            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
-            if ($position_next_lt === $cursor) {
-                $inside_tag = true;
-                $cursor++;
-            }
-            
-            if (!$inside_tag && $position_next_lt !== false) {
-                // We are not inside tag and there still is another tag to parse
-                $array[] = new
-                    HTMLPurifier_Token_Text(
-                        html_entity_decode(
-                            substr(
-                                $string, $cursor, $position_next_lt - $cursor
-                            ),
-                            ENT_QUOTES
-                        )
-                    );
-                $cursor  = $position_next_lt + 1;
-                $inside_tag = true;
-                continue;
-            } elseif (!$inside_tag) {
-                // We are not inside tag but there are no more tags
-                // If we're already at the end, break
-                if ($cursor === strlen($string)) break;
-                // Create Text of rest of string
-                $array[] = new
-                    HTMLPurifier_Token_Text(
-                        html_entity_decode(
-                            substr(
-                                $string, $cursor
-                            ),
-                            ENT_QUOTES
-                        )
-                    );
-                break;
-            } elseif ($inside_tag && $position_next_gt !== false) {
-                // We are in tag and it is well formed
-                // Grab the internals of the tag
-                $segment = substr($string, $cursor, $position_next_gt-$cursor);
-                
-                // Check if it's a comment
-                if (
-                    substr($segment,0,3) == '!--' &&
-                    substr($segment,strlen($segment)-2,2) == '--'
-                ) {
-                    $array[] = new
-                        HTMLPurifier_Token_Comment(
-                            substr(
-                                $segment, 3, strlen($segment) - 5
-                            )
-                        );
-                    $inside_tag = false;
-                    $cursor = $position_next_gt + 1;
-                    continue;
-                }
-                
-                // Check if it's an end tag
-                $is_end_tag = (strpos($segment,'/') === 0);
-                if ($is_end_tag) {
-                    $type = substr($segment, 1);
-                    $array[] = new HTMLPurifier_Token_End($type);
-                    $inside_tag = false;
-                    $cursor = $position_next_gt + 1;
-                    continue;
-                }
-                
-                // Check if it is explicitly self closing, if so, remove
-                // trailing slash. Remember, we could have a tag like <br>, so
-                // any later token processing scripts must convert improperly
-                // classified EmptyTags from StartTags.
-                $is_self_closing= (strpos($segment,'/') === strlen($segment)-1);
-                if ($is_self_closing) {
-                    $segment = substr($segment, 0, strlen($segment) - 1);
-                }
-                
-                // Check if there are any attributes
-                $position_first_space = $this->nextWhiteSpace($segment);
-                if ($position_first_space === false) {
-                    if ($is_self_closing) {
-                        $array[] = new HTMLPurifier_Token_Empty($segment);
-                    } else {
-                        $array[] = new HTMLPurifier_Token_Start($segment);
-                    }
-                    $inside_tag = false;
-                    $cursor = $position_next_gt + 1;
-                    continue;
-                }
-                
-                // Grab out all the data
-                $type = substr($segment, 0, $position_first_space);
-                $attribute_string =
-                    trim(
-                        substr(
-                            $segment, $position_first_space
-                        )
-                    );
-                if ($attribute_string) {
-                    $attributes = $this->tokenizeAttributeString(
-                                        $attribute_string
-                                  );
-                } else {
-                    $attributes = array();
-                }
-                
-                if ($is_self_closing) {
-                    $array[] = new HTMLPurifier_Token_Empty($type, $attributes);
-                } else {
-                    $array[] = new HTMLPurifier_Token_Start($type, $attributes);
-                }
-                $cursor = $position_next_gt + 1;
-                $inside_tag = false;
-                continue;
-            } else {
-                $array[] = new
-                    HTMLPurifier_Token_Text(
-                        '<' .
-                        html_entity_decode(
-                            substr($string, $cursor),
-                            ENT_QUOTES
-                        )
-                    );
-                break;
-            }
-            break;
-        }
-        return $array;
-    }
-    
-    function tokenizeAttributeString($string) {
-        $string = (string) $string; // quick typecast
-        
-        if ($string == '') return array(); // no attributes
-        
-        // let's see if we can abort as quickly as possible
-        // one equal sign, no spaces => one attribute
-        $num_equal = substr_count($string, '=');
-        $has_space = strpos($string, ' ');
-        if ($num_equal === 0 && !$has_space) {
-            // bool attribute
-            return array($string => $string);
-        } elseif ($num_equal === 1 && !$has_space) {
-            // only one attribute
-            list($key, $quoted_value) = explode('=', $string);
-            $quoted_value = trim($quoted_value);
-            if (!$key) return array();
-            if (!$quoted_value) return array($key => '');
-            $first_char = @$quoted_value[0];
-            $last_char  = @$quoted_value[strlen($quoted_value)-1];
-            
-            $same_quote = ($first_char == $last_char);
-            $open_quote = ($first_char == '"' || $first_char == "'");
-            
-            if ( $same_quote && $open_quote) {
-                // well behaved
-                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
-            } else {
-                // not well behaved
-                if ($open_quote) {
-                    $value = substr($quoted_value, 1);
-                } else {
-                    $value = $quoted_value;
-                }
-            }
-            return array($key => $value);
-        }
-        
-        // setup loop environment
-        $array  = array(); // return assoc array of attributes
-        $cursor = 0; // current position in string (moves forward)
-        $size   = strlen($string); // size of the string (stays the same)
-        
-        // if we have unquoted attributes, the parser expects a terminating
-        // space, so let's guarantee that there's always a terminating space.
-        $string .= ' ';
-        
-        // infinite loop protection
-        $loops = 0;
-        
-        while(true) {
-            
-            // infinite loop protection
-            if (++$loops > 1000) return array();
-            
-            if ($cursor >= $size) {
-                break;
-            }
-            
-            $cursor += ($value = strspn($string, "\x20\x09\x0D\x0A", $cursor));
-            
-            $position_next_space = $this->nextWhiteSpace($string, $cursor);
-            $position_next_equal = strpos($string, '=', $cursor);
-            
-            // grab the key
-            
-            $key_begin = $cursor; //we're currently at the start of the key
-            
-            // scroll past all characters that are the key (not whitespace or =)
-            $cursor += strcspn($string, "\x20\x09\x0D\x0A=", $cursor);
-            
-            $key_end = $cursor; // now at the end of the key
-            
-            $key = substr($string, $key_begin, $key_end - $key_begin);
-            
-            if (!$key) continue; // empty key
-            
-            // scroll past all whitespace
-            $cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
-            
-            if ($cursor >= $size) {
-                $array[$key] = $key;
-                break;
-            }
-            
-            // if the next character is an equal sign, we've got a regular
-            // pair, otherwise, it's a bool attribute
-            $first_char = @$string[$cursor];
-            
-            if ($first_char == '=') {
-                // key="value"
-                
-                $cursor++;
-                $cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
-                
-                // we might be in front of a quote right now
-                
-                $char = @$string[$cursor];
-                
-                if ($char == '"' || $char == "'") {
-                    // it's quoted, end bound is $char
-                    $cursor++;
-                    $value_begin = $cursor;
-                    $cursor = strpos($string, $char, $cursor);
-                    $value_end = $cursor;
-                } else {
-                    // it's not quoted, end bound is whitespace
-                    $value_begin = $cursor;
-                    $cursor += strcspn($string, "\x20\x09\x0D\x0A", $cursor);
-                    $value_end = $cursor;
-                }
-                
-                $value = substr($string, $value_begin, $value_end - $value_begin);
-                $array[$key] = $value;
-                $cursor++;
-                
-            } else {
-                // boolattr
-                if ($key !== '') {
-                    $array[$key] = $key;
-                }
-                
-            }
-        }
-        return $array;
-    }
-    
-}
-
+<?php
+
+/*
+
+TODO:
+ * Reread the XML spec and make sure I got everything right
+ * Add support for CDATA sections
+ * Have comments output with the leading and trailing --s
+ * Optimize and benchmark
+ * Check MF_Text behavior: shouldn't the info in there be raw (entities parsed?)
+
+*/
+
+require_once 'HTMLPurifier/Lexer.php';
+
+class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
+{
+    
+    // does this version of PHP support utf8 as entity function charset?
+    var $_entity_utf8;
+    
+    // Records whether the running PHP is version 5 or newer; parseData()
+    // reads the flag to pick its decoding branch.
+    // NOTE(review): the method is named after the parent class rather than
+    // HTMLPurifier_Lexer_DirectLex, so whether it ever runs as a constructor
+    // depends on how HTMLPurifier_Lexer is declared - confirm.
+    function HTMLPurifier_Lexer() {
+        $is_php5_or_newer = (version_compare(PHP_VERSION, '5') >= 0);
+        $this->_entity_utf8 = $is_php5_or_newer;
+    }
+    
+    // this is QUITE a knotty problem
+    // 
+    // The main trouble is that, even while assuming UTF-8 is what we're
+    // using, we've got to deal with HTML entities (like &mdash;)
+    // Not even sure if the PHP 5 decoding function does that. Plus,
+    // SimpleTest doesn't use UTF-8!
+    // 
+    // However, we MUST parse everything possible, because once you get
+    // to the HTML generator, it will escape everything possible (although
+    // that may not be correct, and we should be using htmlspecialchars() ).
+    // 
+    // Nevertheless, strictly XML speaking, we cannot assume any character
+    // entities are defined except the htmlspecialchars() ones, so leaving
+    // the entities inside HERE is not acceptable. (plus, htmlspecialchars
+    // might convert them anyway). So EVERYTHING must get parsed.
+    // 
+    // We may need to roll our own character entity lookup table. It's only
+    // about 250; fortunately, the decimal/hex ones map cleanly to UTF-8.
+    // Decodes the HTML entities in $string and returns the decoded string.
+    // On PHP 5 and newer this is a single html_entity_decode() call with the
+    // UTF-8 charset; older versions decode as ISO-8859-1 and then expand
+    // numeric entities with chr().
+    function parseData($string) {
+        // we may want to let the user do a different char encoding,
+        // although there is NO REASON why they shouldn't be able
+        // to convert it to UTF-8 before they pass it to us
+        
+        // The flag is set by HTMLPurifier_Lexer(). If that method was never
+        // invoked the flag is still NULL, which would silently select the
+        // PHP 4 branch below even on PHP 5+, so detect the version on demand.
+        if ($this->_entity_utf8 === null) {
+            $this->_entity_utf8 = version_compare(PHP_VERSION, '5', '>=');
+        }
+        
+        // no support for less than PHP 4.3
+        if ($this->_entity_utf8) {
+            // PHP 5+, UTF-8 is nicely supported
+            return @html_entity_decode($string, ENT_QUOTES, 'UTF-8');
+        } else {
+            // PHP 4, do compat stuff
+            $string = html_entity_decode($string, ENT_QUOTES, 'ISO-8859-1');
+            // get the numeric UTF-8 stuff
+            // NOTE(review): chr() returns a single byte, so numeric entities
+            // above 255 do not come out as UTF-8 sequences here
+            $string = preg_replace('/&#(\d+);/me', "chr(\\1)", $string);
+            $string = preg_replace('/&#x([a-f0-9]+);/mei',"chr(0x\\1)",$string);
+            // get the stringy UTF-8 stuff
+            return $string;
+        }
+    }
+    
+    // Returns the position of the first single or double quote found in
+    // $string at or after $offset, or false when there is none.
+    function nextQuote($string, $offset = 0) {
+        $position = $offset + strcspn($string, '"\'', $offset);
+        if ($position == strlen($string)) {
+            // scanned to the end without meeting a quote
+            return false;
+        }
+        return $position;
+    }
+    
+    // Returns the position of the first space, tab, CR or LF found in
+    // $string at or after $offset, or false when there is none.
+    function nextWhiteSpace($string, $offset = 0) {
+        $position = $offset + strcspn($string, "\x20\x09\x0D\x0A", $offset);
+        if ($position == strlen($string)) {
+            // scanned to the end without meeting whitespace
+            return false;
+        }
+        return $position;
+    }
+    
+    // Tokenizes an HTML string into a flat array of HTMLPurifier_Token_Text,
+    // _Comment, _Start, _Empty and _End objects, in document order.
+    //
+    // - Text between tags is entity-decoded through parseData().
+    // - "<!-- ... -->" becomes a Comment token holding the text between the
+    //   delimiters; the comment may itself contain '>' characters.
+    // - A tag whose last character is '/' becomes an Empty token; any other
+    //   opening tag becomes a Start token (even <br>), so later processing
+    //   must reclassify those.
+    // - A '<' that is never followed by a '>' is emitted as literal text.
+    //
+    // Returns array() for empty input, and also when the loop guard trips.
+    function tokenizeHTML($string) {
+        
+        // some quick checking (if empty, return empty)
+        $string = @ (string) $string;
+        if ($string == '') return array();
+        
+        $cursor = 0; // our location in the text
+        $inside_tag = false; // whether or not we're parsing the inside of a tag
+        $array = array(); // result array
+        
+        // infinite loop protection
+        // has to be pretty big, since html docs can be big
+        // we allow two hundred thousand iterations... more than enough?
+        $loops = 0;
+        
+        while(true) {
+            
+            // infinite loop protection
+            if (++$loops > 200000) return array();
+            
+            $position_next_lt = strpos($string, '<', $cursor);
+            $position_next_gt = strpos($string, '>', $cursor);
+            
+            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
+            if ($position_next_lt === $cursor) {
+                $inside_tag = true;
+                $cursor++;
+            }
+            
+            if (!$inside_tag && $position_next_lt !== false) {
+                // We are not inside tag and there still is another tag to parse
+                $array[] = new HTMLPurifier_Token_Text(
+                    $this->parseData(
+                        substr($string, $cursor, $position_next_lt - $cursor)
+                    )
+                );
+                $cursor = $position_next_lt + 1;
+                $inside_tag = true;
+                continue;
+            }
+            
+            if (!$inside_tag) {
+                // We are not inside tag but there are no more tags
+                // If we're already at the end, break
+                if ($cursor === strlen($string)) break;
+                // Create Text of rest of string
+                $array[] = new HTMLPurifier_Token_Text(
+                    $this->parseData(substr($string, $cursor))
+                );
+                break;
+            }
+            
+            if ($position_next_gt === false) {
+                // We are in a tag that is never closed: give the '<' that
+                // was consumed back and treat everything as text
+                $array[] = new HTMLPurifier_Token_Text(
+                    '<' . $this->parseData(substr($string, $cursor))
+                );
+                break;
+            }
+            
+            // From here on: we are in a tag and it is well formed
+            
+            // Check if it's a comment. The end is located by searching for
+            // "-->" rather than the first '>', so a '>' inside the comment
+            // does not cut it short. The search starts right after the '!'
+            // so the degenerate forms "<!-->" and "<!--->" still close.
+            if (substr($string, $cursor, 3) == '!--') {
+                $position_comment_end = strpos($string, '-->', $cursor + 1);
+                if ($position_comment_end !== false) {
+                    $comment_begin  = $cursor + 3;
+                    $comment_length = $position_comment_end - $comment_begin;
+                    $array[] = new HTMLPurifier_Token_Comment(
+                        $comment_length > 0 ?
+                            substr($string, $comment_begin, $comment_length) :
+                            ''
+                    );
+                    $inside_tag = false;
+                    $cursor = $position_comment_end + 3;
+                    continue;
+                }
+                // no terminator: fall through and treat it like any tag
+            }
+            
+            // Grab the internals of the tag
+            $segment = substr($string, $cursor, $position_next_gt - $cursor);
+            
+            // Check if it's an end tag
+            if (strpos($segment, '/') === 0) {
+                $array[] = new HTMLPurifier_Token_End(substr($segment, 1));
+                $inside_tag = false;
+                $cursor = $position_next_gt + 1;
+                continue;
+            }
+            
+            // Check if it is explicitly self closing, if so, remove
+            // trailing slash. Remember, we could have a tag like <br>, so
+            // any later token processing scripts must convert improperly
+            // classified EmptyTags from StartTags.
+            // Only the LAST character counts: an earlier slash (say, inside
+            // src="a/b.png") must not hide the closing one.
+            $is_self_closing = (substr($segment, -1) == '/');
+            if ($is_self_closing) {
+                $segment = substr($segment, 0, strlen($segment) - 1);
+            }
+            
+            // Check if there are any attributes
+            $position_first_space = $this->nextWhiteSpace($segment);
+            if ($position_first_space === false) {
+                if ($is_self_closing) {
+                    $array[] = new HTMLPurifier_Token_Empty($segment);
+                } else {
+                    $array[] = new HTMLPurifier_Token_Start($segment);
+                }
+                $inside_tag = false;
+                $cursor = $position_next_gt + 1;
+                continue;
+            }
+            
+            // Grab out all the data
+            $type = substr($segment, 0, $position_first_space);
+            $attribute_string = trim(substr($segment, $position_first_space));
+            if ($attribute_string) {
+                $attributes = $this->tokenizeAttributeString(
+                                    $attribute_string
+                              );
+            } else {
+                $attributes = array();
+            }
+            
+            if ($is_self_closing) {
+                $array[] = new HTMLPurifier_Token_Empty($type, $attributes);
+            } else {
+                $array[] = new HTMLPurifier_Token_Start($type, $attributes);
+            }
+            $cursor = $position_next_gt + 1;
+            $inside_tag = false;
+        }
+        return $array;
+    }
+    
+    // Parses the attribute part of a tag (everything after the tag name)
+    // into an associative array of attribute name => value.
+    //
+    // - key="value", key='value' and key=value are all accepted, with
+    //   optional whitespace around the equal sign.
+    // - A name without a value (boolean attribute) maps to itself.
+    // - A value whose opening quote is never closed runs to the end of
+    //   the string.
+    // - A stray '=' with no name in front of it is skipped.
+    //
+    // Returns array() for an empty string, and also when the loop guard
+    // trips after 1000 iterations.
+    function tokenizeAttributeString($string) {
+        $string = (string) $string; // quick typecast
+        
+        if ($string == '') return array(); // no attributes
+        
+        // the characters treated as whitespace throughout
+        $whitespace = "\x20\x09\x0D\x0A";
+        
+        // let's see if we can abort as quickly as possible
+        // one equal sign, no whitespace => one attribute
+        $num_equal = substr_count($string, '=');
+        $has_space = (strcspn($string, $whitespace) < strlen($string));
+        if ($num_equal === 0 && !$has_space) {
+            // bool attribute
+            return array($string => $string);
+        } elseif ($num_equal === 1 && !$has_space) {
+            // only one attribute
+            list($key, $quoted_value) = explode('=', $string);
+            $quoted_value = trim($quoted_value);
+            // compare against '' explicitly: "0" is a legitimate value
+            // (border=0) but is falsy in PHP
+            if ($key === '') return array();
+            if ($quoted_value === '') return array($key => '');
+            $first_char = $quoted_value[0];
+            $last_char  = $quoted_value[strlen($quoted_value)-1];
+            
+            $same_quote = ($first_char == $last_char);
+            $open_quote = ($first_char == '"' || $first_char == "'");
+            
+            if ($same_quote && $open_quote) {
+                // well behaved; the cast turns substr()'s "nothing left"
+                // result into '' when the value is a lone quote character
+                $value = (string) substr(
+                    $quoted_value, 1, strlen($quoted_value) - 2
+                );
+            } elseif ($open_quote) {
+                // not well behaved: opening quote without its partner
+                $value = substr($quoted_value, 1);
+            } else {
+                // unquoted
+                $value = $quoted_value;
+            }
+            return array($key => $value);
+        }
+        
+        // setup loop environment
+        $array  = array(); // return assoc array of attributes
+        $cursor = 0; // current position in string (moves forward)
+        $size   = strlen($string); // size of the string (stays the same)
+        
+        // if we have unquoted attributes, the parser expects a terminating
+        // space, so let's guarantee that there's always a terminating space.
+        $string .= ' ';
+        
+        // infinite loop protection
+        $loops = 0;
+        
+        while(true) {
+            
+            // infinite loop protection
+            if (++$loops > 1000) return array();
+            
+            if ($cursor >= $size) {
+                break;
+            }
+            
+            // scroll past leading whitespace
+            $cursor += strspn($string, $whitespace, $cursor);
+            
+            // grab the key
+            
+            $key_begin = $cursor; //we're currently at the start of the key
+            
+            // scroll past all characters that are the key (not whitespace or =)
+            $cursor += strcspn($string, $whitespace . '=', $cursor);
+            
+            $key = substr($string, $key_begin, $cursor - $key_begin);
+            
+            if ($key === '') {
+                // empty key: either we ran off the end (the check at the
+                // top of the loop ends things) or we are sitting on a stray
+                // '=', which has to be stepped over to make progress
+                if ($cursor < $size) $cursor++;
+                continue;
+            }
+            
+            // scroll past all whitespace
+            $cursor += strspn($string, $whitespace, $cursor);
+            
+            if ($cursor >= $size) {
+                $array[$key] = $key;
+                break;
+            }
+            
+            // if the next character is an equal sign, we've got a regular
+            // pair, otherwise, it's a bool attribute
+            if ($string[$cursor] != '=') {
+                // boolattr; the cursor already sits on the next key
+                $array[$key] = $key;
+                continue;
+            }
+            
+            // key="value"
+            
+            $cursor++;
+            $cursor += strspn($string, $whitespace, $cursor);
+            
+            if ($cursor >= $size) {
+                // "key=" at the very end: empty value
+                $array[$key] = '';
+                break;
+            }
+            
+            // we might be in front of a quote right now
+            
+            $char = $string[$cursor];
+            
+            if ($char == '"' || $char == "'") {
+                // it's quoted, end bound is $char
+                $cursor++;
+                $value_begin = $cursor;
+                $value_end = strpos($string, $char, $cursor);
+                if ($value_end === false) {
+                    // closing quote is missing: take the rest of the
+                    // original string (without the space appended above)
+                    $value_end = $size;
+                }
+            } else {
+                // it's not quoted, end bound is whitespace
+                $value_begin = $cursor;
+                $value_end = $cursor + strcspn($string, $whitespace, $cursor);
+            }
+            
+            $array[$key] = (string) substr(
+                $string, $value_begin, $value_end - $value_begin
+            );
+            
+            // step over the closing quote / terminating whitespace
+            $cursor = $value_end + 1;
+        }
+        return $array;
+    }
+    
+}
+
 ?>
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -1,58 +1,58 @@
-<?php
-
-require_once 'XML/HTMLSax3.php'; // PEAR
-require_once 'HTMLPurifier/Lexer.php';
-
-// uses the PEAR class XML_HTMLSax3 to parse XML
-class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
-{
-    
-    var $tokens;
-    
-    function tokenizeHTML($html) {
-        $this->tokens = array();
-        $parser=& new XML_HTMLSax3();
-        $parser->set_object($this);
-        $parser->set_element_handler('openHandler','closeHandler');
-        $parser->set_data_handler('dataHandler');
-        $parser->set_escape_handler('escapeHandler');
-        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
-        $parser->parse($html);
-        return $this->tokens;
-    }
-    
-    function openHandler(&$parser, $name, $attrs, $closed) {
-        if ($closed) {
-            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
-        } else {
-            $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
-        }
-        return true;
-    }
-    
-    function closeHandler(&$parser, $name) {
-        // HTMLSax3 seems to always send empty tags an extra close tag
-        // check and ignore if you see it:
-        // [TESTME] to make sure it doesn't overreach
-        if ($this->tokens[count($this->tokens)-1]->type == 'empty') {
-            return true;
-        }
-        $this->tokens[] = new HTMLPurifier_Token_End($name);
-        return true;
-    }
-    
-    function dataHandler(&$parser, $data) {
-        $this->tokens[] = new HTMLPurifier_Token_Text($data);
-        return true;
-    }
-    
-    function escapeHandler(&$parser, $data) {
-        if (strpos($data, '-') === 0) {
-            $this->tokens[] = new HTMLPurifier_Token_Comment($data);
-        }
-        return true;
-    }
-    
-}
-
+<?php
+
+require_once 'XML/HTMLSax3.php'; // PEAR
+require_once 'HTMLPurifier/Lexer.php';
+
+// Lexer that hands the parsing to the PEAR class XML_HTMLSax3 and turns
+// the SAX callbacks it receives into HTMLPurifier tokens.
+class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
+{
+    
+    // tokens collected by the handlers during the current tokenizeHTML() run
+    var $tokens;
+    
+    // lowercased name of the empty tag reported by the most recent
+    // callback, or false; lets closeHandler() drop exactly one matching
+    // close event instead of every close that follows an empty tag
+    var $_pending_empty = false;
+    
+    // Parses $html with XML_HTMLSax3 and returns the resulting token array.
+    function tokenizeHTML($html) {
+        $this->tokens = array();
+        $this->_pending_empty = false;
+        // NOTE(review): "=& new" is PHP 4 syntax kept from the original,
+        // presumably so the parser object is not copied -- confirm before
+        // modernizing.
+        $parser=& new XML_HTMLSax3();
+        $parser->set_object($this);
+        $parser->set_element_handler('openHandler','closeHandler');
+        $parser->set_data_handler('dataHandler');
+        $parser->set_escape_handler('escapeHandler');
+        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
+        $parser->parse($html);
+        return $this->tokens;
+    }
+    
+    // Element-open callback: records an Empty token when $closed is set,
+    // a Start token otherwise.
+    function openHandler(&$parser, $name, $attrs, $closed) {
+        if ($closed) {
+            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
+            $this->_pending_empty = strtolower($name);
+        } else {
+            $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
+            $this->_pending_empty = false;
+        }
+        return true;
+    }
+    
+    // Element-close callback: records an End token, except for the extra
+    // close event that directly follows an empty tag.
+    function closeHandler(&$parser, $name) {
+        // HTMLSax3 seems to always send empty tags an extra close tag
+        // check and ignore if you see it. Only the close that arrives
+        // immediately after the empty tag AND carries the same name is
+        // dropped, so "<b><br /></b>" still yields its </b>, and a close
+        // tag at the very start of the input no longer indexes an empty
+        // token list.
+        $pending = $this->_pending_empty;
+        $this->_pending_empty = false;
+        if ($pending !== false && $pending === strtolower($name)) {
+            return true;
+        }
+        $this->tokens[] = new HTMLPurifier_Token_End($name);
+        return true;
+    }
+    
+    // Character-data callback: records a Text token.
+    function dataHandler(&$parser, $data) {
+        $this->_pending_empty = false;
+        $this->tokens[] = new HTMLPurifier_Token_Text($data);
+        return true;
+    }
+    
+    // Escape callback: records a Comment token when the escaped data
+    // starts with a dash; every other escape is ignored.
+    function escapeHandler(&$parser, $data) {
+        $this->_pending_empty = false;
+        if (strpos($data, '-') === 0) {
+            $this->tokens[] = new HTMLPurifier_Token_Comment($data);
+        }
+        return true;
+    }
+    
+}
+
 ?>
--- a/library/HTMLPurifier/Token.php
+++ b/library/HTMLPurifier/Token.php
@@ -1,60 +1,60 @@
-<?php
-
-// all objects here are immutable
-
-class HTMLPurifier_Token {} // abstract
-
-class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
-{
-    var $is_tag = true;
-    var $name;
-    var $attributes = array();
-    function HTMLPurifier_Token_Tag($name, $attributes = array()) {
-        $this->name = ctype_lower($name) ? $name : strtolower($name);
-        $this->attributes = $attributes;
-    }
-}
-
-// start CONCRETE ones
-
-class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
-{
-    var $type = 'start';
-}
-
-class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
-{
-    var $type = 'empty';
-}
-
-// accepts attributes even though it really can't, for optimization reasons
-class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
-{
-    var $type = 'end';
-}
-
-class HTMLPurifier_Token_Text extends HTMLPurifier_Token
-{
-    var $name = '#PCDATA';
-    var $type = 'text';
-    var $data;
-    var $is_whitespace = false;
-    function HTMLPurifier_Token_Text($data) {
-        $this->data = $data;
-        if (ctype_space($data)) $this->is_whitespace = true;
-    }
-    function append($text) {
-        return new HTMLPurifier_Token_Text($this->data . $text->data);
-    }
-}
-
-class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
-{
-    var $data;
-    var $type = 'comment';
-    function HTMLPurifier_Token_Comment($data) {
-        $this->data = $data;
-    }
-}
-
+<?php
+
+// all objects here are immutable
+
+class HTMLPurifier_Token {} // abstract; common ancestor of every token class in this file
+
+// Shared parent of the start, empty and end tag tokens: keeps the
+// lowercased tag name and the attribute array it was constructed with.
+class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
+{
+    var $is_tag = true;
+    var $name;
+    var $attributes = array();
+    
+    // $name is stored lowercased, $attributes exactly as passed in
+    function HTMLPurifier_Token_Tag($name, $attributes = array()) {
+        if (!ctype_lower($name)) {
+            // strtolower() is only called when the name is not already
+            // made up purely of lowercase letters
+            $name = strtolower($name);
+        }
+        $this->name = $name;
+        $this->attributes = $attributes;
+    }
+}
+
+// start CONCRETE ones
+
+class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag // token for an opening tag
+{
+    var $type = 'start'; // string tag identifying this kind of token
+}
+
+class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag // token the lexers create for a self-closing tag
+{
+    var $type = 'empty'; // HTMLPurifier_Lexer_PEARSax3::closeHandler() tests for this value
+}
+
+// token for a closing tag; accepts attributes (through the inherited
+// constructor) even though it really can't have any, for optimization reasons
+class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
+{
+    var $type = 'end'; // string tag identifying this kind of token
+}
+
+// Token holding a run of character data.
+class HTMLPurifier_Token_Text extends HTMLPurifier_Token
+{
+    var $name = '#PCDATA';
+    var $type = 'text';
+    var $data;
+    var $is_whitespace = false;
+    
+    // stores the text and flags tokens for which ctype_space() holds
+    function HTMLPurifier_Token_Text($data) {
+        $this->data = $data;
+        $this->is_whitespace = ctype_space($data);
+    }
+    
+    // returns a NEW text token holding this token's data followed by the
+    // data of $text; neither operand is modified
+    function append($text) {
+        $combined = $this->data . $text->data;
+        return new HTMLPurifier_Token_Text($combined);
+    }
+}
+
+class HTMLPurifier_Token_Comment extends HTMLPurifier_Token // token for an HTML comment
+{
+    var $data; // comment text exactly as passed to the constructor
+    var $type = 'comment'; // string tag identifying this kind of token
+    function HTMLPurifier_Token_Comment($data) { // PHP 4 style constructor
+        $this->data = $data;
+    }
+}
+
 ?>