mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-05 13:47:24 +02:00
[2.0.1] Officially add experimental auto-paragraphing and linkification functionality. Rename %Core.DefinitionCache to %Cache.DefinitionImpl. Have AutoParagraph handle even more edge cases. Fix MakeWellFormed bug.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1223 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
@@ -67,6 +67,7 @@ class HTMLPurifier_ConfigSchema {
|
||||
$this->defineNamespace('URI', 'Features regarding Uniform Resource Identifiers.');
|
||||
$this->defineNamespace('HTML', 'Configuration regarding allowed HTML.');
|
||||
$this->defineNamespace('CSS', 'Configuration regarding allowed CSS.');
|
||||
$this->defineNamespace('AutoFormat', 'Configuration regarding auto-formatting functionality such as auto-paragraphing or linkification.');
|
||||
$this->defineNamespace('Output', 'Configuration relating to the generation of (X)HTML.');
|
||||
$this->defineNamespace('Cache', 'Configuration for DefinitionCache and related subclasses.');
|
||||
$this->defineNamespace('Test', 'Developer testing configuration for our unit tests.');
|
||||
|
@@ -3,7 +3,7 @@
|
||||
require_once 'HTMLPurifier/DefinitionCache.php';
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'DefinitionCache', 'Serializer', 'string/null', '
|
||||
'Cache', 'DefinitionImpl', 'Serializer', 'string/null', '
|
||||
This directive defines which method to use when caching definitions,
|
||||
the complex data-type that makes HTML Purifier tick. Set to null
|
||||
to disable caching (not recommended, as you will see a definite
|
||||
@@ -11,7 +11,12 @@ performance degradation). This directive has been available since 2.0.0.
|
||||
');
|
||||
|
||||
HTMLPurifier_ConfigSchema::defineAllowedValues(
|
||||
'Core', 'DefinitionCache', array('Serializer')
|
||||
'Cache', 'DefinitionImpl', array('Serializer')
|
||||
);
|
||||
|
||||
HTMLPurifier_ConfigSchema::defineAlias(
|
||||
'Core', 'DefinitionCache',
|
||||
'Cache', 'DefinitionImpl'
|
||||
);
|
||||
|
||||
|
||||
@@ -54,7 +59,7 @@ class HTMLPurifier_DefinitionCacheFactory
|
||||
function &create($type, $config) {
|
||||
// only one implementation as of right now; $config will
|
||||
// be used to determine implementation
|
||||
$method = $config->get('Core', 'DefinitionCache');
|
||||
$method = $config->get('Cache', 'DefinitionImpl');
|
||||
if ($method === null) {
|
||||
$null = new HTMLPurifier_DefinitionCache_Null($type);
|
||||
return $null;
|
||||
|
@@ -2,6 +2,24 @@
|
||||
|
||||
require_once 'HTMLPurifier/Injector.php';
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'AutoFormat', 'AutoParagraph', false, 'bool', '
|
||||
<p>
|
||||
This directive turns on auto-paragraphing, where double newlines are
|
||||
converted in to paragraphs whenever possible. Auto-paragraphing
|
||||
applies when:
|
||||
</p>
|
||||
<ul>
|
||||
<li>There are inline elements or text in the root node</li>
|
||||
<li>There are inline elements or text with double newlines or
|
||||
block elements in nodes that allow paragraph tags</li>
|
||||
<li>There are double newlines in paragraph tags</li>
|
||||
</ul>
|
||||
<p>
|
||||
This directive has been available since 2.0.1.
|
||||
</p>
|
||||
');
|
||||
|
||||
/**
|
||||
* Injector that auto paragraphs text in the root node based on
|
||||
* double-spacing.
|
||||
@@ -11,27 +29,94 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
|
||||
|
||||
function handleText(&$token) {
|
||||
$text = $token->data;
|
||||
// $token is the focus: if processing is needed, it gets
|
||||
// turned into an array of tokens that will replace the
|
||||
// original token
|
||||
if (empty($this->currentNesting)) {
|
||||
if (!$this->allowsElement('p')) return;
|
||||
// we're in root node, and the root node allows paragraphs
|
||||
// start a paragraph since we just hit some text
|
||||
// case 1: we're in root node (and it allows paragraphs)
|
||||
$token = array(new HTMLPurifier_Token_Start('p'));
|
||||
$this->_splitText($text, $token);
|
||||
} elseif ($this->currentNesting[count($this->currentNesting)-1]->name == 'p') {
|
||||
// we're not in root node but we're in a paragraph, so don't
|
||||
// add a paragraph start tag but still perform processing
|
||||
// case 2: we're in a paragraph
|
||||
$token = array();
|
||||
$this->_splitText($text, $token);
|
||||
} elseif ($this->allowsElement('p')) {
|
||||
// case 3: we're in an element that allows paragraphs
|
||||
if (strpos($text, PHP_EOL . PHP_EOL) !== false) {
|
||||
// case 3.1: this text node has a double-newline
|
||||
$token = array(new HTMLPurifier_Token_Start('p'));
|
||||
$this->_splitText($text, $token);
|
||||
} else {
|
||||
$ok = false;
|
||||
// test if up-coming tokens are either block or have
|
||||
// a double newline in them
|
||||
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
|
||||
if ($this->inputTokens[$i]->type == 'start'){
|
||||
if (!$this->_isInline($this->inputTokens[$i])) {
|
||||
$ok = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if ($this->inputTokens[$i]->type == 'end') break;
|
||||
if ($this->inputTokens[$i]->type == 'text') {
|
||||
if (strpos($this->inputTokens[$i]->data, PHP_EOL . PHP_EOL) !== false) {
|
||||
$ok = true;
|
||||
}
|
||||
if (!$this->inputTokens[$i]->is_whitespace) break;
|
||||
}
|
||||
}
|
||||
if ($ok) {
|
||||
// case 3.2: this text node is next to another node
|
||||
// that will start a paragraph
|
||||
$token = array(new HTMLPurifier_Token_Start('p'), $token);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
function handleStart(&$token) {
|
||||
// check if we're inside a tag already, if so, don't add
|
||||
// paragraph tags
|
||||
if (!empty($this->currentNesting)) return;
|
||||
// check if we're inside a tag already
|
||||
if (!empty($this->currentNesting)) {
|
||||
if ($this->allowsElement('p')) {
|
||||
// special case: we're in an element that allows paragraphs
|
||||
|
||||
// this token is already a paragraph, abort
|
||||
if ($token->name == 'p') return;
|
||||
|
||||
// check if this token is adjacent to the parent
|
||||
if ($this->inputTokens[$this->inputIndex - 1]->type != 'start') {
|
||||
// not adjacent, we can abort early
|
||||
// add lead paragraph tag if our token is inline
|
||||
if ($this->_isInline($token)) {
|
||||
$token = array(new HTMLPurifier_Token_Start('p'), $token);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// this token is the first child of the element that allows
|
||||
// paragraph. We have to peek ahead and see whether or not
|
||||
// there is anything inside that suggests that a paragraph
|
||||
// will be needed
|
||||
$ok = false;
|
||||
// maintain a mini-nesting counter, this lets us bail out
|
||||
// early if possible
|
||||
$j = 2; // current nesting, is two due to parent and this start
|
||||
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
|
||||
if ($this->inputTokens[$i]->type == 'start') $j++;
|
||||
if ($this->inputTokens[$i]->type == 'end') $j--;
|
||||
if ($this->inputTokens[$i]->type == 'text') {
|
||||
if (strpos($this->inputTokens[$i]->data, PHP_EOL . PHP_EOL) !== false) {
|
||||
$ok = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ($j <= 0) break;
|
||||
}
|
||||
if ($ok) {
|
||||
$token = array(new HTMLPurifier_Token_Start('p'), $token);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// check if the start tag counts as a "block" element
|
||||
if (!$this->_isInline($token)) return;
|
||||
@@ -56,55 +141,79 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
|
||||
|
||||
// remove empty paragraphs
|
||||
$paragraphs = array();
|
||||
$needs_start = false;
|
||||
$first = true;
|
||||
foreach ($raw_paragraphs as $par) {
|
||||
if (trim($par) !== '') $paragraphs[] = $par;
|
||||
if (trim($par) !== '') {
|
||||
$paragraphs[] = $par;
|
||||
} elseif (empty($result) && $first) {
|
||||
// The empty result indicates that the AutoParagraph
|
||||
// injector did not add any start paragraph tokens.
|
||||
// The fact that the first paragraph is empty indicates
|
||||
// that there was a double-newline at the start of the
|
||||
// data.
|
||||
// Combined together, this means that we are in a paragraph,
|
||||
// and the newline means we should start a new one.
|
||||
$result[] = new HTMLPurifier_Token_End('p');
|
||||
// However, the start token should only be added if
|
||||
// there is more processing to be done (i.e. there are
|
||||
// real paragraphs in here). If there are none, the
|
||||
// next start paragraph tag will be handled by the
|
||||
// next run of the injector
|
||||
$needs_start = true;
|
||||
}
|
||||
$first = false;
|
||||
}
|
||||
|
||||
// check if there are no "real" paragraphs to be processed
|
||||
if (empty($paragraphs) && count($raw_paragraphs) > 1) {
|
||||
$result[] = new HTMLPurifier_Token_End('p');
|
||||
if (empty($paragraphs)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// add a start tag if an end tag was added while processing
|
||||
// the raw paragraphs (that happens if there's a leading double
|
||||
// newline)
|
||||
if ($needs_start) $result[] = new HTMLPurifier_Token_Start('p');
|
||||
|
||||
// append the paragraphs onto the result
|
||||
foreach ($paragraphs as $par) {
|
||||
$result[] = new HTMLPurifier_Token_Text($par);
|
||||
$result[] = new HTMLPurifier_Token_End('p');
|
||||
$result[] = new HTMLPurifier_Token_Start('p');
|
||||
}
|
||||
array_pop($result); // remove trailing start token
|
||||
|
||||
// remove trailing start token, if one is needed, it will
|
||||
// be handled the next time this injector is called
|
||||
array_pop($result);
|
||||
|
||||
// check the outside to determine whether or not the
|
||||
// end paragraph tag should be removed. It should be removed
|
||||
// unless the next non-whitespace token is a paragraph
|
||||
// or a block element.
|
||||
|
||||
$remove_paragraph_end = true;
|
||||
// Start of the checks one after the current token's index
|
||||
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
|
||||
if ($this->inputTokens[$i]->type == 'start' || $this->inputTokens[$i]->type == 'empty') {
|
||||
$remove_paragraph_end = $this->_isInline($this->inputTokens[$i]);
|
||||
break;
|
||||
}
|
||||
// check if we can abort early (whitespace means we carry on!)
|
||||
if ($this->inputTokens[$i]->type == 'text' && !$this->inputTokens[$i]->is_whitespace) break;
|
||||
// end tags will automatically be handled by MakeWellFormed,
|
||||
// so we don't have to worry about them
|
||||
if ($this->inputTokens[$i]->type == 'end') break;
|
||||
}
|
||||
|
||||
// check the outside to determine whether or not the
|
||||
// end paragraph tag should be removed
|
||||
if ($this->_removeParagraphEnd()) {
|
||||
if ($remove_paragraph_end) {
|
||||
array_pop($result);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns boolean whether or not to remove the paragraph end tag
|
||||
* that was automatically added. The paragraph end tag should be
|
||||
* removed unless the next token is a paragraph or block element.
|
||||
* @private
|
||||
*/
|
||||
function _removeParagraphEnd() {
|
||||
$tokens =& $this->inputTokens;
|
||||
$remove_paragraph_end = true;
|
||||
// Start of the checks one after the current token's index
|
||||
for ($i = $this->inputIndex + 1; isset($tokens[$i]); $i++) {
|
||||
if ($tokens[$i]->type == 'start' || $tokens[$i]->type == 'empty') {
|
||||
$remove_paragraph_end = $this->_isInline($tokens[$i]);
|
||||
break;
|
||||
}
|
||||
// check if we can abort early (whitespace means we carry-on!)
|
||||
if ($tokens[$i]->type == 'text' && !$tokens[$i]->is_whitespace) break;
|
||||
if ($tokens[$i]->type == 'end') break; // nonsensical
|
||||
}
|
||||
return $remove_paragraph_end;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if passed token is inline (and, ergo, allowed in
|
||||
* paragraph tags)
|
||||
|
@@ -2,6 +2,14 @@
|
||||
|
||||
require_once 'HTMLPurifier/Injector.php';
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'AutoFormat', 'Linkify', false, 'bool', '
|
||||
<p>
|
||||
This directive turns on linkification, auto-linking http, ftp and
|
||||
https URLs. This directive has been available since 2.0.1.
|
||||
</p>
|
||||
');
|
||||
|
||||
/**
|
||||
* Injector that converts http, https and ftp text URLs to actual links.
|
||||
*/
|
||||
|
@@ -7,26 +7,6 @@ require_once 'HTMLPurifier/Generator.php';
|
||||
require_once 'HTMLPurifier/Injector/AutoParagraph.php';
|
||||
require_once 'HTMLPurifier/Injector/Linkify.php';
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'AutoParagraph', false, 'bool', '
|
||||
<p>
|
||||
This directive will cause HTML Purifier to automatically paragraph text
|
||||
in the document fragment root based on two newlines and block tags.
|
||||
This directive has been available since 2.0.1.
|
||||
</p>
|
||||
'
|
||||
);
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'AutoLinkify', false, 'bool', '
|
||||
<p>
|
||||
This directive will cause HTML Purifier to automatically linkify
|
||||
text that looks like URLs. This directive has been available since
|
||||
2.0.1.
|
||||
</p>
|
||||
'
|
||||
);
|
||||
|
||||
/**
|
||||
* Takes tokens and makes them well-formed (balances end tags, etc.)
|
||||
*/
|
||||
@@ -70,11 +50,11 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
|
||||
// we need a generic way of adding injectors, and also its own
|
||||
// configuration namespace
|
||||
if ($config->get('Core', 'AutoParagraph')) {
|
||||
if ($config->get('AutoFormat', 'AutoParagraph')) {
|
||||
$this->injectors[] = new HTMLPurifier_Injector_AutoParagraph();
|
||||
}
|
||||
|
||||
if ($config->get('Core', 'AutoLinkify')) {
|
||||
if ($config->get('AutoFormat', 'Linkify')) {
|
||||
$this->injectors[] = new HTMLPurifier_Injector_Linkify();
|
||||
}
|
||||
|
||||
@@ -163,7 +143,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
|
||||
// injector handler code; duplicated for performance reasons
|
||||
foreach ($this->injectors as $i => $x) {
|
||||
if (!$x->skip[$i]) $x->handleStart($token, $config, $context);
|
||||
if (!$x->skip) $x->handleStart($token, $config, $context);
|
||||
if (is_array($token)) {
|
||||
$this->currentInjector = $i;
|
||||
break;
|
||||
|
Reference in New Issue
Block a user