moodle/search/documents/physical_htm.php
2008-04-07 20:36:47 +00:00

59 lines
1.8 KiB
PHP

<?php
/**
* Global Search Engine for Moodle
*
* @package search
* @category core
* @subpackage document_wrappers
* @author Valery Fremaux [valery.fremaux@club-internet.fr] > 1.8
* @date 2008/03/31
* @license http://www.gnu.org/copyleft/gpl.html GNU Public License
*
* this is a format handler for getting text out of a proprietary binary format
* so it can be indexed by Lucene search engine
*/
/**
* @param object $resource
* @uses CFG, USER
*/
function get_text_for_indexing_htm(&$resource){
global $CFG, $USER;
// SECURITY : do not allow non admin execute anything on system !!
if (!isadmin($USER->id)) return;
// just get text
$text = implode('', file("{$CFG->dataroot}/{$resource->course}/{$resource->reference}"));
// extract keywords and other interesting meta information and put it back as real content for indexing
if (preg_match('/(.*)<meta ([^>]*)>(.*)/is', $text, $matches)){
$prefix = $matches[1];
$meta_attributes = $matches[2];
$suffix = $matches[3];
if (preg_match('/name="(keywords|description)"/i', $meta_attributes)){
preg_match('/content="([^"]+)"/i', $meta_attributes, $matches);
$text = $prefix.' '.$matches[1].' '.$suffix;
}
}
// brutally filters all html tags
$text = preg_replace("/<[^>]*>/", '', $text);
$text = preg_replace("/<!--[^>]*-->/", '', $text);
$text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');
$text = mb_convert_encoding($text, 'UTF-8', 'AUTO');
/*
* debug code for tracing input
echo "<hr/>";
$FILE = fopen("filetrace.log", 'w');
fwrite($FILE, $text);
fclose($FILE);
echo "<hr/>";
*/
if (!empty($CFG->block_search_limit_index_body)){
$text = shorten($text, $CFG->block_search_limit_index_body);
}
return $text;
}
?>