fixes HTM files parsing

2025-01-18 22:08:20 +01:00 · 2008-04-07 20:36:47 +00:00 · 2008-04-07 20:36:47 +00:00 · 0f4471f5d3
commit 0f4471f5d3
parent e1778c0c5e
1 changed files with 16 additions and 6 deletions
--- a/search/documents/physical_htm.php
+++ b/search/documents/physical_htm.php
@@ -32,14 +32,24 @@ function get_text_for_indexing_htm(&$resource){
        $meta_attributes = $matches[2];
        $suffix = $matches[3];
        if (preg_match('/name="(keywords|description)"/i', $meta_attributes)){
-            preg_match('/content="[^"]+"/i', $meta_attributes, $matches);
-            $text = $prefix.' '.$matches[0].' '.$suffix;
+            preg_match('/content="([^"]+)"/i', $meta_attributes, $matches);
+            $text = $prefix.' '.$matches[1].' '.$suffix;
        }
    }
-    // filter all html tags
-    // $text = clean_text($text, FORMAT_PLAIN);
-    // NOTE : this is done in ResourceSearchDocument __constructor
-    $text = preg_replace("/<!--[^>]*?-->/", '', $text);
+    // brutally filters all html tags
+    $text = preg_replace("/<[^>]*>/", '', $text);
+    $text = preg_replace("/<!--[^>]*-->/", '', $text);
+    $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');
+    $text = mb_convert_encoding($text, 'UTF-8', 'AUTO');
+    
+    /*
+    * debug code for tracing input
+    echo "<hr/>";
+    $FILE = fopen("filetrace.log", 'w');
+    fwrite($FILE, $text);
+    fclose($FILE);
+    echo "<hr/>";
+    */
    
    if (!empty($CFG->block_search_limit_index_body)){
        $text = shorten($text, $CFG->block_search_limit_index_body);