moodle/search/documents/physical_ppt.php
diml 27af904b85 fixes result list ordering issue
engine give nicer results with adequate pagination.
Adding icon and reference to course origine.

Generalizing replacement of isadmin by doanithing capability check

should contribute to close many global search related issues from

http://tracker.moodle.org/browse/MDL-14646

startpoint
2008-05-31 17:09:59 +00:00

94 lines
3.4 KiB
PHP

<?php
/**
* Global Search Engine for Moodle
*
* @package search
* @category core
* @subpackage document_wrappers
* @author Valery Fremaux [valery.fremaux@club-internet.fr] > 1.8
* @date 2008/03/31
* @license http://www.gnu.org/copyleft/gpl.html GNU Public License
*
* this is a format handler for getting text out of a proprietary binary format
* so it can be indexed by Lucene search engine
*/
/*
* The first implementation is a trivial heuristic based on the PPT character stream:
* a text sequence always starts with a 00 9F 0F 04 sequence followed by a 15-byte
* sequence.
* Within this sequence is an A8 0F, A0 0F or AA 0F marker followed by a little-endian encoding of the text buffer size.
* A8 0F denotes ASCII text (local system single-byte encoding)
* A0 0F denotes UTF-16 encoded text
* AA 0F denotes non-textual sequences
* Texts are either in ASCII or in UTF-16.
* A text ends on a new sequence start, or on a 00 00 NULL UTF-16 end of stream.
*
* Based on these rules, here is a small empirical text extractor for PPT.
*/
/**
* Extracts the indexable text of a PowerPoint (.ppt) binary file.
*
* The file is scanned for the byte signature 00 9F 0F 04. The 15 bytes after the
* signature are treated as a header : 9 skipped bytes, a 2-byte sequence code and
* a 4-byte little-endian payload length. Payloads tagged A8 0F (single-byte text)
* and A0 0F (UTF-16 text) are collected; any other code is skipped.
*
* @param object $resource the resource record; its course and reference fields locate the file
*                         under $CFG->dataroot when $directfile is empty
* @param string $directfile optional file path, relative to $CFG->dataroot, read instead of the
*                         file designated by $resource
* @uses $CFG
* @return string|null the extracted text (an empty string when the file cannot be read or is empty),
*                     or null when the capability check fails
*/
function get_text_for_indexing_ppt(&$resource, $directfile = ''){
    global $CFG;

    // SECURITY : do not allow non admin execute anything on system !!
    if (!has_capability('moodle/site:doanything', get_context_instance(CONTEXT_SYSTEM))) return null;

    if ($directfile == ''){
        $path = "{$CFG->dataroot}/{$resource->course}/{$resource->reference}";
    } else {
        $path = "{$CFG->dataroot}/{$directfile}";
    }

    // Read the file as one binary string; a missing or unreadable file simply yields nothing to index.
    $text = is_readable($path) ? file_get_contents($path) : false;
    if ($text === false || $text === ''){
        return '';
    }

    $signature = "\x00\x9F\x0F\x04";
    $headersize = 19; // 4 signature bytes + 9 skipped bytes + 2 code bytes + 4 length bytes
    $textsize = strlen($text);
    $fragments = array();
    $offset = 0;

    // Walk the stream with an offset instead of re-matching (and re-copying) the remainder
    // of the file for every record.
    while ($offset < $textsize && ($start = strpos($text, $signature, $offset)) !== false){
        $datastart = $start + $headersize;
        if ($datastart > $textsize){
            break; // truncated header at end of file : nothing more to extract
        }

        // 'n' : 16-bit big-endian sequence code, 'V' : 32-bit little-endian payload length.
        // 'V' (rather than machine-order 'L') keeps the result identical on big-endian hosts.
        $unpacked = unpack('ncode/Vlength', substr($text, $start + 13, 6));
        $sequencecode = $unpacked['code'];
        $length = $unpacked['length'];

        // A declared length running past the end of the file (or overflowing to a negative
        // value on 32-bit PHP) is clamped to the bytes actually available.
        if ($length < 0 || $length > $textsize - $datastart){
            $length = $textsize - $datastart;
        }

        if ($sequencecode == 0xA80F){
            // local system encoding sequence
            $fragments[] = (string) substr($text, $datastart, $length);
            $offset = $datastart + $length;
        } elseif ($sequencecode == 0xA00F){
            // denotes unicode encoded sequence
            // NOTE(review): dropping the NUL bytes only decodes UTF-16 characters below U+0100
            // correctly; other characters are left as raw byte pairs. Kept as is because the
            // indexer's expectations are not visible from here.
            $fragment = (string) substr($text, $datastart, $length);
            $fragment = str_replace("\xA0\x00\x19\x20", "'", $fragment); // some quotes
            $fragments[] = str_replace("\x00", '', $fragment);
            $offset = $datastart + $length;
        } else {
            // non textual sequence : resume scanning right after the header
            $offset = $datastart;
        }
    }

    $indextext = implode(' ', $fragments);
    // Applied in order : leftover UTF-16 quotes become "'", tabs are dropped, CR becomes LF.
    $indextext = str_replace(array("\x19\x20", "\x09", "\x0D"), array("'", '', "\n"), $indextext);

    // Limit the size of the EXTRACTED text (not of the raw binary file) when configured.
    if (!empty($CFG->block_search_limit_index_body)){
        $indextext = shorten($indextext, $CFG->block_search_limit_index_body);
    }

    return mb_convert_encoding($indextext, 'UTF-8', 'auto');
}
?>