diff --git a/library/HTMLPurifier/AttrDef.php b/library/HTMLPurifier/AttrDef.php
index 754d995a..c1cdc7fa 100644
--- a/library/HTMLPurifier/AttrDef.php
+++ b/library/HTMLPurifier/AttrDef.php
@@ -5,7 +5,7 @@ class HTMLPurifier_AttrDef
{
function HTMLPurifier_AttrDef() {}
- function validate() {
+ function validate($string, $config = null) { // abstract placeholder: the base body only raises E_USER_ERROR
trigger_error('Cannot call abstract function', E_USER_ERROR);
}
diff --git a/library/HTMLPurifier/AttrDef/Lang.php b/library/HTMLPurifier/AttrDef/Lang.php
new file mode 100644
index 00000000..a18b6b5c
--- /dev/null
+++ b/library/HTMLPurifier/AttrDef/Lang.php
@@ -0,0 +1,73 @@
+ 8 || !ctype_alnum($subtags[1])) {
+ return $new_string;
+ }
+ if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]); // lowercase the second subtag
+
+ $new_string .= '-' . $subtags[1];
+ if ($num_subtags == 2) return $new_string;
+
+ // process all other subtags, index 2 and up
+ for ($i = 2; $i < $num_subtags; $i++) {
+ $length = strlen($subtags[$i]);
+ if ($length == 0 || $length > 8 || !ctype_alnum($subtags[$i])) {
+ return $new_string; // stop at the first invalid subtag, keeping what was accepted so far
+ }
+ if (!ctype_lower($subtags[$i])) {
+ $subtags[$i] = strtolower($subtags[$i]); // lowercase before appending
+ }
+ $new_string .= '-' . $subtags[$i];
+ }
+
+ return $new_string;
+
+ }
+
+}
+
+?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/AttrTransform.php b/library/HTMLPurifier/AttrTransform.php
new file mode 100644
index 00000000..8df5d3d2
--- /dev/null
+++ b/library/HTMLPurifier/AttrTransform.php
@@ -0,0 +1,14 @@
+
\ No newline at end of file
diff --git a/library/HTMLPurifier/AttrTransform/Lang.php b/library/HTMLPurifier/AttrTransform/Lang.php
new file mode 100644
index 00000000..fc0b72ba
--- /dev/null
+++ b/library/HTMLPurifier/AttrTransform/Lang.php
@@ -0,0 +1,31 @@
+attributes['lang']) ?
+ $token->attributes['lang'] : false;
+ $xml_lang = isset($token->attributes['xml:lang']) ?
+ $token->attributes['xml:lang'] : false;
+
+ if ($lang === false && $xml_lang === false) return $token; // neither attribute is set: nothing to mirror
+
+ $new_token = $token->copy(); // modify a copy rather than the incoming token
+
+ if ($lang !== false && $xml_lang === false) {
+ $new_token->attributes['xml:lang'] = $lang; // only lang is set: mirror it into xml:lang
+ } elseif ($xml_lang !== false) {
+ $new_token->attributes['lang'] = $xml_lang; // xml:lang is set: it overrides or fills in lang
+ }
+
+ return $new_token;
+
+ }
+
+}
+
+?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/Definition.php b/library/HTMLPurifier/Definition.php
index 6af372ec..2ed84ab7 100644
--- a/library/HTMLPurifier/Definition.php
+++ b/library/HTMLPurifier/Definition.php
@@ -45,6 +45,9 @@ class HTMLPurifier_Definition
// used solely by HTMLPurifier_Strategy_RemoveForeignElements
var $info_tag_transform = array();
+ // used solely by HTMLPurifier_Strategy_ValidateAttributes
+ var $info_attr_transform = array();
+
// WARNING! Prototype is not passed by reference, so in order to get
// a copy of the real one, you'll have to destroy your copy and
// use instance() to get it.
@@ -238,11 +241,22 @@ class HTMLPurifier_Definition
// which manually override these in their local definitions
$this->info_global_attr = array(
// core attrs
- 'id' => new HTMLPurifier_AttrDef_ID(),
+ 'id' => new HTMLPurifier_AttrDef_ID(),
'class' => new HTMLPurifier_AttrDef_Class(),
'title' => new HTMLPurifier_AttrDef_Text(),
// i18n
- 'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false),
+ 'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false),
+ 'lang' => new HTMLPurifier_AttrDef_Lang(),
+ 'xml:lang' => new HTMLPurifier_AttrDef_Lang(),
+ );
+
+ // required attribute stipulation handled in attribute transformation
+ $this->info['bdo']->attr = array();
+
+ $this->info['br']->attr = array(
+ 'dir' => false,
+ 'lang' => false,
+ 'xml:lang' => false,
);
//////////////////////////////////////////////////////////////////////
@@ -275,9 +289,11 @@ class HTMLPurifier_Definition
// UNIMP : info[]->attr_transform : attribute transformations in elements
//////////////////////////////////////////////////////////////////////
- // UNIMP : info_attr_transform : global attribute transform (for xml:lang)
+ // info_attr_transform : global attribute transformation that is
+ // unconditionally called. Good for transformations that have complex
+ // start conditions
- // this might have bad implications for performance
+ $this->info_attr_transform[] = new HTMLPurifier_AttrTransform_Lang();
}
diff --git a/library/HTMLPurifier/Strategy/ValidateAttributes.php b/library/HTMLPurifier/Strategy/ValidateAttributes.php
index fa382cc3..8793f802 100644
--- a/library/HTMLPurifier/Strategy/ValidateAttributes.php
+++ b/library/HTMLPurifier/Strategy/ValidateAttributes.php
@@ -26,11 +26,16 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
$d_defs = $this->definition->info_global_attr;
foreach ($tokens as $key => $token) {
- if ($token->type !== 'start' && $token->type !== 'end') continue;
+ if ($token->type !== 'start' && $token->type !== 'empty') continue;
// DEFINITION CALL
$defs = $this->definition->info[$token->name]->attr;
+ // DEFINITION CALL
+ foreach ($this->definition->info_attr_transform as $transformer) {
+ $token = $transformer->transform($token);
+ }
+
$attr = $token->attributes;
$changed = false;
foreach ($attr as $attr_key => $value) {
diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php
index 87e37f73..ed46621b 100644
--- a/library/HTMLPurifier/Token.php
+++ b/library/HTMLPurifier/Token.php
@@ -59,7 +59,9 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
if (!isset($attributes[$new_key])) {
$attributes[$new_key] = $attributes[$key];
}
- unset($attributes[$key]);
+ if ($new_key !== $key) {
+ unset($attributes[$key]);
+ }
}
}
$this->attributes = $attributes;
@@ -72,6 +74,9 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
var $type = 'start';
+ function copy() { // returns a new Start token constructed from this token's name and attributes
+ return new HTMLPurifier_Token_Start($this->name, $this->attributes);
+ }
}
/**
@@ -80,6 +85,9 @@ class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
var $type = 'empty';
+ function copy() { // returns a new Empty token constructed from this token's name and attributes
+ return new HTMLPurifier_Token_Empty($this->name, $this->attributes);
+ }
}
/**
@@ -92,6 +100,9 @@ class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
var $type = 'end';
+ function copy() { // returns a new End token constructed from this token's name only
+ return new HTMLPurifier_Token_End($this->name);
+ }
}
/**
@@ -120,6 +131,9 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
$this->data = $data;
$this->is_whitespace = ctype_space($data);
}
+ function copy() { // returns a new Text token constructed from this token's data
+ return new HTMLPurifier_Token_Text($this->data);
+ }
}
@@ -138,6 +152,9 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
function HTMLPurifier_Token_Comment($data) {
$this->data = $data;
}
+ function copy() { // returns a new Comment token constructed from this token's data
+ return new HTMLPurifier_Token_Comment($this->data);
+ }
}
?>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/AttrDef/LangTest.php b/tests/HTMLPurifier/AttrDef/LangTest.php
new file mode 100644
index 00000000..216a7a88
--- /dev/null
+++ b/tests/HTMLPurifier/AttrDef/LangTest.php
@@ -0,0 +1,83 @@
+def = new HTMLPurifier_AttrDef_Lang();
+
+ // basic good uses
+ $this->assertDef('en');
+ $this->assertDef('en-us');
+
+ $this->assertDef(' en ', 'en'); // trim
+ $this->assertDef('EN', 'en'); // case insensitivity
+
+ $this->assertDef('fr en', false); // multiple languages
+ $this->assertDef('%', false); // bad character
+
+ // test overlong language according to syntax
+ $this->assertDef('thisistoolongsoitgetscut', false);
+
+ // primary subtag rules
+ // I'm somewhat hesitant to allow x and i as primary language codes,
+ // because they usually are never used in real life. However,
+ // theoretically speaking, having them alone is permissible, so
+ // I'll be lenient. No XML parser is going to complain anyway.
+ $this->assertDef('x');
+ $this->assertDef('i');
+ // real world use-cases
+ $this->assertDef('x-klingon');
+ $this->assertDef('i-mingo');
+ // because the RFC only defines two and three letter primary codes,
+ // anything with a length of four or greater is invalid, despite
+ // the syntax stipulation of 1 to 8 characters. Because the RFC
+ // specifically states that this reservation is in order to allow
+ // for future versions to expand, the adoption of a new RFC will
+ // require these test cases to be rewritten, even if backwards-
+ // compatibility is largely retained (i.e. this is not forwards
+ // compatible)
+ $this->assertDef('four', false);
+ // for similar reasons, disallow any other one character language
+ $this->assertDef('f', false);
+
+ // second subtag rules
+ // one letter subtags prohibited until revision. This is, however,
+ // less volatile than the restrictions on the primary subtags.
+ // Also note that this test-case tests fix-behavior: chop
+ // off subtags until you get a valid language code.
+ $this->assertDef('en-a', 'en');
+ // 2-8 chars are permitted, but have special meaning that cannot
+ // be checked without maintaining country code lookup tables (for
+ // two characters) or special registration tables (for all above).
+ $this->assertDef('en-uk', true);
+
+ // further subtag rules: only syntactic constraints
+ $this->assertDef('en-us-edison');
+ $this->assertDef('en-us-toolonghaha', 'en-us');
+ $this->assertDef('en-us-a-silly-long-one');
+
+ // rfc 3066 stipulates that if a three letter and a two letter code
+ // are available, the two letter one MUST be used. Without a language
+ // code lookup table, we cannot implement this functionality.
+
+ // although the HTML protocol, technically speaking, allows you to
+ // omit language tags, this implicitly means that the parent element's
+ // language is the one applicable, which, in some cases, is incorrect.
+ // Thus, we allow und, only slightly defying the RFC's SHOULD NOT
+ // designation.
+ $this->assertDef('und');
+
+ // because attributes only allow one language, mul is allowed, complying
+ // with the RFC's SHOULD NOT designation.
+ $this->assertDef('mul');
+
+ }
+
+}
+
+?>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/AttrTransform/LangTest.php b/tests/HTMLPurifier/AttrTransform/LangTest.php
new file mode 100644
index 00000000..fa9273c9
--- /dev/null
+++ b/tests/HTMLPurifier/AttrTransform/LangTest.php
@@ -0,0 +1,60 @@
+ 'en'));
+ $expect[1] = new HTMLPurifier_Token_Start('span',
+ array('lang' => 'en',
+ 'xml:lang' => 'en'));
+
+ // empty tags must work too, also test attribute preservation
+ $inputs[2] = new HTMLPurifier_Token_Empty('img',
+ array('src' => 'seine.png',
+ 'lang' => 'fr'));
+ $expect[2] = new HTMLPurifier_Token_Empty('img',
+ array('src' => 'seine.png',
+ 'lang' => 'fr',
+ 'xml:lang' => 'fr'));
+
+ // copy xml:lang to lang
+ $inputs[3] = new HTMLPurifier_Token_Start('span',
+ array('xml:lang' => 'en'));
+ $expect[3] = new HTMLPurifier_Token_Start('span',
+ array('lang' => 'en',
+ 'xml:lang' => 'en'));
+
+ // both set, override lang with xml:lang
+ $inputs[4] = new HTMLPurifier_Token_Start('span',
+ array('lang' => 'fr',
+ 'xml:lang' => 'de'));
+ $expect[4] = new HTMLPurifier_Token_Start('span',
+ array('lang' => 'de',
+ 'xml:lang' => 'de'));
+
+ foreach ($inputs as $i => $input) {
+ $result = $transform->transform($input);
+ $this->assertEqual($expect[$i], $result, "Test $i: %s");
+ }
+
+ }
+
+}
+
+?>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php b/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php
index dbf7aea2..078cde7a 100644
--- a/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php
+++ b/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php
@@ -59,6 +59,10 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
$inputs[10] = 'PHP';
$expect[10] = $inputs[10];
+ // test lang (NEEDS CORRECTION!)
+ $inputs[11] = 'La soupe.';
+ $expect[11] = 'La soupe.';
+
$this->assertStrategyWorks($strategy, $inputs, $expect, $config);
}
diff --git a/tests/index.php b/tests/index.php
index 190c3b91..5ff54b6c 100644
--- a/tests/index.php
+++ b/tests/index.php
@@ -42,8 +42,10 @@ $test->addTestFile('HTMLPurifier/AttrDef/EnumTest.php');
$test->addTestFile('HTMLPurifier/AttrDef/IDTest.php');
$test->addTestFile('HTMLPurifier/AttrDef/ClassTest.php');
$test->addTestFile('HTMLPurifier/AttrDef/TextTest.php');
+$test->addTestFile('HTMLPurifier/AttrDef/LangTest.php');
$test->addTestFile('HTMLPurifier/IDAccumulatorTest.php');
$test->addTestFile('HTMLPurifier/TagTransformTest.php');
+$test->addTestFile('HTMLPurifier/AttrTransform/LangTest.php');
if (SimpleReporter::inCli()) $reporter = new TextReporter();
else $reporter = new HTMLReporter();