mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-06 14:16:32 +02:00
Implement EntityLookup and put in the Lexer. Some behavior was migrated, since it looks like it will have to be used in all Lexers, not just DirectLex (which is the only one that uses it).
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@105 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
68
maintenance/generate-entity-file.php
Normal file
68
maintenance/generate-entity-file.php
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/php
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Parses *.ent files into an entity lookup table, and then serializes and
|
||||
* writes the whole kaboodle to a file. The resulting file should be versioned.
|
||||
*/
|
||||
|
||||
// Output file that receives the serialized lookup table. The path is
// relative to the directory containing this script.
$output_file = '../library/HTMLPurifier/EntityLookup/data.txt';

// Directory holding the *.ent source files, also relative to this script's
// directory. The trailing slash is required because file names are appended
// to it directly.
$entity_dir = '../docs/entities/';

// Make the relative paths above resolve from this script's own directory,
// whatever directory the script was launched from.
chdir(dirname(__FILE__));
|
||||
|
||||
/**
 * Encodes a Unicode code point as a UTF-8 byte string.
 *
 * Produces the 1-, 2-, 3- or 4-byte form depending on the magnitude of the
 * code point. The value is not validated: surrogates and values above
 * 0x10FFFF are encoded mechanically and will not yield well-formed UTF-8.
 *
 * @param int|string $dec Code point in decimal. Numeric strings (such as
 *                        regex captures) are accepted and cast to int.
 * @return string The UTF-8 encoded character.
 */
function unichr($dec) {
    $dec = (int) $dec;

    // U+0000..U+007F: a single byte, identical to ASCII.
    if ($dec < 0x80) {
        return chr($dec);
    }

    // U+0080..U+07FF: 110xxxxx 10xxxxxx
    if ($dec < 0x800) {
        return chr(0xC0 | ($dec >> 6))
             . chr(0x80 | ($dec & 0x3F));
    }

    // U+0800..U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx
    if ($dec < 0x10000) {
        return chr(0xE0 | ($dec >> 12))
             . chr(0x80 | (($dec >> 6) & 0x3F))
             . chr(0x80 | ($dec & 0x3F));
    }

    // U+10000 and above: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. Without this
    // branch such code points would be squeezed into the 3-byte form and
    // come out as an invalid lead byte followed by two continuation bytes.
    return chr(0xF0 | ($dec >> 18))
         . chr(0x80 | (($dec >> 12) & 0x3F))
         . chr(0x80 | (($dec >> 6) & 0x3F))
         . chr(0x80 | ($dec & 0x3F));
}
|
||||
|
||||
// Pre-flight checks. Every fatal path prints its message and then exits with
// a non-zero status so that shells and build scripts can detect the failure
// (exit() with a string argument would report status 0).
if (!is_dir($entity_dir)) {
    echo "Fatal Error: Can't find entity directory.\n";
    exit(1);
}

// Never overwrite an existing output file; it has to be removed by hand
// before the table can be regenerated. The message names the real path.
if (file_exists($output_file)) {
    echo "Fatal Error: $output_file already exists.\n";
    exit(1);
}

// The warning from opendir() is suppressed because the failure is reported
// explicitly just below.
$dh = @opendir($entity_dir);
if (!$dh) {
    echo "Fatal Error: Cannot read entity directory.\n";
    exit(1);
}

// Collect the names of all entity definition files in the directory.
$entity_files = array();
while (($file = readdir($dh)) !== false) {
    // Skip "." and ".." as well as hidden files.
    if ($file === '' || $file[0] === '.') continue;
    // Only files whose extension is exactly "ent" are entity files.
    if (pathinfo($file, PATHINFO_EXTENSION) !== 'ent') continue;
    $entity_files[] = $file;
}
closedir($dh);

// readdir() returns entries in filesystem order; sort them so the files are
// always processed in the same order regardless of the machine.
sort($entity_files);

if (!$entity_files) {
    echo "Fatal Error: No entity files to parse.\n";
    exit(1);
}
|
||||
|
||||
// Maps entity name => UTF-8 encoded character.
$entity_table = array();

// Matches declarations of the form
//     <!ENTITY name "&#NNN;">
// and the doubly escaped variant
//     <!ENTITY name "&#38;#NNN;">
// capturing the name and the decimal code point. Digits are allowed in the
// name so that entities such as sup2, frac12 and there4 are not dropped.
$regexp = '/<!ENTITY\s+([A-Za-z0-9]+)\s+"&#(?:38;#)?([0-9]+);">/';

foreach ($entity_files as $file) {
    $contents = file_get_contents($entity_dir . $file);
    if ($contents === false) {
        // Abort instead of silently producing an incomplete table.
        echo "Fatal Error: Cannot read entity file $file.\n";
        exit(1);
    }

    $matches = array();
    preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER);

    foreach ($matches as $match) {
        // $match[1] is the entity name, $match[2] the decimal code point.
        // A name declared more than once keeps the last value seen.
        $entity_table[$match[1]] = unichr((int) $match[2]);
    }
}
|
||||
|
||||
// Serialize the finished table and write it to the output file.
$output = serialize($entity_table);

$fh = fopen($output_file, 'w');
if ($fh === false) {
    // Without this check fwrite() and fclose() would be called on false.
    echo "Fatal Error: Cannot open output file for writing.\n";
    exit(1);
}

$written = fwrite($fh, $output);
fclose($fh);

// fwrite() returns the number of bytes written, or false on error. Anything
// other than the full length means the file on disk is incomplete.
if ($written !== strlen($output)) {
    echo "Fatal Error: Failed to write output file.\n";
    exit(1);
}

echo "Completed successfully.";
|
||||
|
||||
?>
|
Reference in New Issue
Block a user