Fixes parsing of HTM files

This commit is contained in:
diml 2008-04-07 20:36:47 +00:00
parent e1778c0c5e
commit 0f4471f5d3

View File

@@ -32,14 +32,24 @@ function get_text_for_indexing_htm(&$resource){
$meta_attributes = $matches[2];
$suffix = $matches[3];
if (preg_match('/name="(keywords|description)"/i', $meta_attributes)){
preg_match('/content="[^"]+"/i', $meta_attributes, $matches);
$text = $prefix.' '.$matches[0].' '.$suffix;
preg_match('/content="([^"]+)"/i', $meta_attributes, $matches);
$text = $prefix.' '.$matches[1].' '.$suffix;
}
}
// filter all html tags
// $text = clean_text($text, FORMAT_PLAIN);
// NOTE : this is done in ResourceSearchDocument __constructor
$text = preg_replace("/<!--[^>]*?-->/", '', $text);
// brutally filters all html tags
$text = preg_replace("/<[^>]*>/", '', $text);
$text = preg_replace("/<!--[^>]*-->/", '', $text);
$text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');
$text = mb_convert_encoding($text, 'UTF-8', 'AUTO');
/*
* debug code for tracing input
echo "<hr/>";
$FILE = fopen("filetrace.log", 'w');
fwrite($FILE, $text);
fclose($FILE);
echo "<hr/>";
*/
if (!empty($CFG->block_search_limit_index_body)){
$text = shorten($text, $CFG->block_search_limit_index_body);