1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-06 14:16:32 +02:00

Implement EntityLookup and put in the Lexer. Some behavior was migrated, since it looks like it will have to be used in all Lexers, not just DirectLex (which is the only one that uses it).

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@105 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2006-07-23 21:07:30 +00:00
parent 7268987846
commit 5ce0ae7056
9 changed files with 196 additions and 54 deletions

View File

@@ -0,0 +1,68 @@
#!/usr/bin/php
<?php
/**
* Parses *.ent files into an entity lookup table, then serializes the table
* and writes it to a file. The resulting file should be versioned.
*/
// Make this script's own directory the working directory, so the relative
// paths below resolve the same way no matter where the script is run from.
chdir(dirname(__FILE__));

// Destination of the serialized lookup table.
$output_file = '../library/HTMLPurifier/EntityLookup/data.txt';

// Directory containing the *.ent source files, relative to this script.
// The trailing slash is required: file names are appended to it directly.
$entity_dir = '../docs/entities/';
/**
 * Encodes a Unicode code point as a UTF-8 byte sequence.
 *
 * Produces one to four bytes depending on the magnitude of the code point.
 * The value is not validated: surrogates and values above 0x10FFFF are
 * encoded mechanically and will not yield well-formed UTF-8.
 *
 * @param int|string $dec Code point in decimal; numeric strings (such as
 *                        regex captures) are accepted and cast to int.
 * @return string The UTF-8 encoding of the code point.
 */
function unichr($dec) {
    $dec = (int) $dec;
    if ($dec < 128) {
        // 1 byte: 0xxxxxxx
        $utf = chr($dec);
    } else if ($dec < 2048) {
        // 2 bytes: 110xxxxx 10xxxxxx
        $utf  = chr(192 + ($dec >> 6));
        $utf .= chr(128 + ($dec & 63));
    } else if ($dec < 65536) {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        $utf  = chr(224 + ($dec >> 12));
        $utf .= chr(128 + (($dec >> 6) & 63));
        $utf .= chr(128 + ($dec & 63));
    } else {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // (previously these fell into the 3-byte branch and came out garbled)
        $utf  = chr(240 + ($dec >> 18));
        $utf .= chr(128 + (($dec >> 12) & 63));
        $utf .= chr(128 + (($dec >> 6) & 63));
        $utf .= chr(128 + ($dec & 63));
    }
    return $utf;
}
// --- Preconditions ---------------------------------------------------------
// The source directory must exist, and an existing output file is never
// overwritten: delete it manually to regenerate.
if ( !is_dir($entity_dir) ) exit("Fatal Error: Can't find entity directory.\n");
if ( file_exists($output_file) ) {
    // Report the actual output file name rather than a hard-coded one.
    exit("Fatal Error: " . basename($output_file) . " already exists.\n");
}

// --- Collect the *.ent files -----------------------------------------------
// Warning suppressed on purpose: failure is reported explicitly just below.
$dh = @opendir($entity_dir);
if ( !$dh ) exit("Fatal Error: Cannot read entity directory.\n");

$entity_files = array();
while (($file = readdir($dh)) !== false) {
    // Skip ".", ".." and hidden files.
    if ($file === '' || $file[0] === '.') continue;
    // Only files whose (last) extension is exactly "ent" are entity sets.
    if (pathinfo($file, PATHINFO_EXTENSION) !== 'ent') continue;
    $entity_files[] = $file;
}
closedir($dh);

if ( !$entity_files ) exit("Fatal Error: No entity files to parse.\n");

// readdir() returns entries in an unspecified order; sort so the generated
// (and versioned) file is reproducible across machines and runs.
sort($entity_files);

// --- Parse entity declarations ---------------------------------------------
$entity_table = array();

// Matches declarations such as
//   <!ENTITY nbsp "&#160;">   and   <!ENTITY lt "&#38;#60;">
// capturing the entity name and its decimal code point. Names may contain
// digits (sup2, frac12, there4, ...), so digits must be allowed in the name.
$regexp = '/<!ENTITY\s+([A-Za-z0-9]+)\s+"&#(?:38;#)?([0-9]+);">/';

foreach ( $entity_files as $file ) {
    $contents = file_get_contents($entity_dir . $file);
    if ($contents === false) {
        exit("Fatal Error: Cannot read entity file $file.\n");
    }
    $matches = array();
    preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // $match[1] = entity name, $match[2] = decimal code point.
        $entity_table[$match[1]] = unichr((int) $match[2]);
    }
}

// --- Write the serialized table --------------------------------------------
$output = serialize($entity_table);

$fh = fopen($output_file, 'w');
if ( !$fh ) exit("Fatal Error: Cannot open output file for writing.\n");
$written = fwrite($fh, $output);
fclose($fh);
if ($written === false || $written < strlen($output)) {
    exit("Fatal Error: Could not write output file.\n");
}

echo "Completed successfully.";
?>