Initial commit

This commit is contained in:
mchampan 2006-06-25 23:07:36 +00:00
parent 1cd1252752
commit 682d403259
55 changed files with 7732 additions and 0 deletions

View File

@ -0,0 +1,70 @@
<?php
/* This is the global search shortcut block - a single query can be entered, and
the user will be redirected to the query page where they can enter more
advanced queries, and view the results of their search. When searching from
this block, the broadest possible selection of documents is searched.
Author: Michael Champanis (mchampan)
Date: 2006 06 23
Todo: make strings -> get_string()
*/
/**
 * Global search shortcut block.
 *
 * Shows a single-field search form whose query is posted to search/query.php.
 * The label and button captions can be overridden through the
 * $CFG->block_search_text and $CFG->block_search_button settings.
 */
class block_search extends block_base {

    /**
     * Sets the block title and version.
     */
    function init() {
        $this->title = "Global Search"; //get_string()
        $this->version = 20060625;
    } //init

    /**
     * Only one instance of this block is required.
     *
     * @return bool always false
     */
    function instance_allow_multiple() {
        return false;
    } //instance_allow_multiple

    /**
     * Label and button values can be set in admin.
     *
     * @return bool always true
     */
    function has_config() {
        return true;
    } //has_config

    /**
     * Builds (once) and returns the block content.
     *
     * @return object content object with ->text (form or PHP 5 notice) and an empty ->footer
     */
    function get_content() {
        global $CFG;

        //cache block contents
        if ($this->content !== NULL) {
            return $this->content;
        } //if

        $this->content = new stdClass;

        //lazy check for the moment
        if (check_php_version("5.0.0")) {
            //fetch values if defined in admin, otherwise use defaults
            $label  = (isset($CFG->block_search_text)) ? $CFG->block_search_text : "Search Moodle";
            $button = (isset($CFG->block_search_button)) ? $CFG->block_search_button : "Go";

            //both captions are free text from the settings form and land in HTML
            //(one of them inside an attribute), so they must be escaped
            $label  = htmlspecialchars($label, ENT_QUOTES);
            $button = htmlspecialchars($button, ENT_QUOTES);

            //absolute action: a relative "search/query.php" resolves against the
            //directory of whatever page displays the block
            //NOTE(review): assumes $CFG->wwwroot holds the site base URL - confirm
            $action = $CFG->wwwroot.'/search/query.php';

            //basic search form; the label is tied to the input through its id
            $this->content->text =
                '<form name="query" method="post" action="'.$action.'">'
              . '<label for="block_search_query">'.$label.'</label>'
              . '<input type="text" id="block_search_query" name="query_string" value=""/>'
              . '<input type="submit" value="'.$button.'"/>'
              . '</form>';
        } else {
            $this->content->text = "Sorry folks, PHP 5 is needed for the new search module.";
        } //else

        //no footer, thanks
        $this->content->footer = '';

        return $this->content;
    } //get_content

    /**
     * Nothing to specialise for this block.
     */
    function specialisation() {
        //empty!
    } //specialisation

} //block_search
?>

View File

@ -0,0 +1,19 @@
<?php
// Settings form fragment for the search block: two text fields holding the
// captions shown on the block's form. Each input carries an id so that the
// <label for="..."> next to it is actually associated with the field.
?>
<div style="text-align:center;">
<label for="block_search_text">Search label</label>
<input type="text" id="block_search_text" name="block_search_text" value="<?php
if(isset($CFG->block_search_text)) {
p($CFG->block_search_text);
} else {
p("Search Moodle");
} ?>"/><br>
<label for="block_search_button">Button label</label>
<input type="text" id="block_search_button" name="block_search_button" value="<?php
if(isset($CFG->block_search_button)) {
p($CFG->block_search_button);
} else {
p("Go");
} ?>"/><br><br>
<input type="submit" value="<?php print_string('savechanges'); ?>" />
</div>

View File

@ -352,6 +352,126 @@ function wiki_get_entries(&$wiki, $byindex=NULL) {
}
}
/*==== Global search modifications
* Author: Michael Champanis (mchampan)
* Last date: 2006 06 25
* These modifications allow wiki documents to be indexed in the new
* search engine module - they are probably not final, and as such
* shouldn't be used by other stuff for the time being
**/
/**
 * Retrieves one row of wiki_pages for a named page, the highest version first.
 * (rescued and converted from ewikimoodlelib.php)
 *
 * @param object $entry    wiki entry; its id restricts the lookup
 * @param string $pagename name of the page to look up
 * @param int    $version  when a positive integer, only that version is matched
 * @return object|false the page row with ->meta run through unserialize(),
 *                      or false when no row was found
 */
function wiki_get_latest_page(&$entry, $pagename, $version=0) {
    global $CFG;

    //need something like this in datalib.php?
    if ($CFG->dbtype == 'mysql') {
        $escape = 'mysql_real_escape_string';
    } else if ($CFG->dbtype == 'postgres7') {
        $escape = 'pg_escape_string';
    } else {
        $escape = 'addslashes';
    }
    $quotedname = "'".$escape($pagename)."'";

    //the version filter is only applied for real, positive integers
    $versionclause = '';
    if ($version > 0 and is_int($version)) {
        $versionclause = "AND (version=$version)";
    }

    $select = "(pagename=$quotedname) AND wiki=".$entry->id." $versionclause ";
    $sort = 'version DESC';

    //change this to recordset_select, as per http://docs.moodle.org/en/Datalib_Notes
    $records = get_records_select('wiki_pages', $select, $sort, '*', 0, 1);
    if ($records) {
        //keep the last row of the (one row limited) result
        foreach ($records as $record) {
            $found = $record;
        }
    }

    if (!isset($found)) {
        return false;
    }

    $found->meta = @unserialize($found->meta);
    return $found;
} //wiki_get_latest_page
/**
 * Fetches all page rows of a wiki entry, including old versions.
 *
 * @param object $entry wiki entry whose id selects the wiki_pages rows
 * @return mixed the result of the get_records() lookup
 */
function wiki_get_pages(&$entry) {
    $allversions = get_records('wiki_pages', 'wiki', $entry->id);
    return $allversions;
} //wiki_get_pages
/**
 * Fetches the latest version of every page of a wiki entry.
 *
 * @param object $entry wiki entry whose pages are wanted
 * @return array|false list of page objects as returned by wiki_get_latest_page()
 *                     (pages that could not be loaded are left out), or false
 *                     when the page name lookup itself failed
 */
function wiki_get_latest_pages(&$entry) {
    //== (My)SQL for this
    /* select * from wiki_pages
    inner join
    (select wiki_pages.pagename, max(wiki_pages.version) as ver
    from wiki_pages group by pagename) as a
    on ((wiki_pages.version = a.ver) and
    (wiki_pages.pagename like a.pagename)) */
    $pages = array();

    //http://moodle.org/bugs/bug.php?op=show&bugid=5877&pos=0
    //if ($ids = get_records('wiki_pages', 'wiki', $entry->id, '', 'distinct pagename')) {
    if (!($rs = get_recordset('wiki_pages', 'wiki', $entry->id, '', 'distinct pagename'))) {
        return false;
    }

    $rows = $rs->GetRows();
    if (!is_array($rows)) {
        return $pages;
    }

    foreach ($rows as $row) {
        //pagename is the only selected column; take the first element so the
        //row works whether it is keyed numerically or by column name
        $pagename = reset($row);

        //a failed lookup yields false, which is of no use to callers
        $page = wiki_get_latest_page($entry, $pagename);
        if ($page !== false) {
            $pages[] = $page;
        }
    } //foreach

    return $pages;
} //wiki_get_latest_pages
/**
 * Returns the wiki instances found in the courses reported by get_courses().
 *
 * @return mixed the result of get_all_instances_in_courses() for "wiki"
 */
function wiki_iterator() {
    $wikis = get_all_instances_in_courses("wiki", get_courses());
    return $wikis;
} //wiki_iterator
/**
 * Builds the search documents for one wiki.
 *
 * For every entry of the wiki the latest version of each page is loaded, and
 * each page with non-empty content becomes a WikiSearchDocument.
 *
 * @param object $wiki the wiki whose entries are indexed
 * @return array list of WikiSearchDocument objects (possibly empty)
 */
function wiki_get_content_for_index(&$wiki) {
    $documents = array();

    //nothing to index when the wiki has no entries
    $entries = wiki_get_entries($wiki);
    if (!$entries) {
        return $documents;
    }

    foreach ($entries as $entry) {
        //latest pages only; wiki_get_pages($entry) would give every stored version
        $pages = wiki_get_latest_pages($entry);
        if (!is_array($pages)) {
            //lookup failed for this entry, move on to the next one
            continue;
        }

        foreach ($pages as $page) {
            //skip anything that is not a loaded page, and pages without content
            if (is_object($page) and strlen($page->content) > 0) {
                $documents[] = new WikiSearchDocument($page, $entry->wikiid, $entry->course, $entry->userid, $entry->groupid);
            } //if
        } //foreach
    } //foreach

    return $documents;
} //wiki_get_content_for_index
/*==== Global search modifications end */
function wiki_get_default_entry(&$wiki, &$course, $userid=0, $groupid=0) {
/// Returns the wiki entry according to the wiki type.
/// Optionally, will return wiki entry for $userid student wiki, or

22
search/README.txt Normal file
View File

@ -0,0 +1,22 @@
This is the initial release (prototype) of Moodle's new search module -
so basically watch out for sharp edges.
The structure has not been finalised, but this is what is working at the
moment. When I start looking at other content to index, it will most likely
change. I don't recommend trying to make your own content modules indexable,
at least not until the whole flow is finalised. I will be implementing the
functions needed to index all of the default content modules on Moodle, so
expect that around mid-August.
Wiki pages were my goal for this release, they can be indexed and searched,
but not updated or deleted at this stage (was waiting for ZF 0.14 actually).
I still need to check the PostgreSQL SQL file: I don't have a PG7 install lying
around to test on, so the script is untested.
To index for the first time, login as an admin user and browse to /search/index.php
or /search/stats.php - there will be a message and a link telling you to go index.
-- Michael Champanis (mchampan)
cynnical@gmail.com
Summer of Code 2006

30
search/Zend/Exception.php Executable file
View File

@ -0,0 +1,30 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
 * Framework base exception; declares nothing beyond PHP's built-in Exception.
 *
 * @category Zend
 * @package Zend
 * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Exception extends Exception
{
}

15
search/Zend/IMPORTANT.txt Normal file
View File

@ -0,0 +1,15 @@
We are running cutting-edge (i.e. HEAD) Zend Framework:
URL: http://framework.zend.com/svn/framework/trunk
Revision: 696
Last Changed Rev: 696
Last Changed Date: 2006-06-23 02:14:54 +0200 (Fri, 23 Jun 2006)
The copy of Zend Framework present in this directory contains only the minimum
needed to run Zend_Search_Lucene - I don't foresee any problems, since the
license is new BSD...
To obtain a full Zend Framework package, please visit:
http://framework.zend.com/
Or alternatively check it out from SVN:
svn checkout http://framework.zend.com/svn/framework/trunk

27
search/Zend/LICENSE.txt Normal file
View File

@ -0,0 +1,27 @@
Copyright (c) 2006, Zend Technologies USA, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Zend Technologies USA, Inc. nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,36 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Framework base exception
*/
require_once 'Zend/Exception.php';
/**
 * Exception type of the Zend_Search package; declares nothing beyond Zend_Exception.
 *
 * @category Zend
 * @package Zend_Search
 * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Search_Exception extends Zend_Exception
{
}

View File

@ -0,0 +1,614 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_Document */
require_once 'Zend/Search/Lucene/Document.php';
/** Zend_Search_Lucene_Storage_Directory */
require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php';
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Index_TermInfo */
require_once 'Zend/Search/Lucene/Index/TermInfo.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/** Zend_Search_Lucene_Index_FieldInfo */
require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
/** Zend_Search_Lucene_Index_Writer */
require_once 'Zend/Search/Lucene/Index/Writer.php';
/** Zend_Search_Lucene_Search_QueryParser */
require_once 'Zend/Search/Lucene/Search/QueryParser.php';
/** Zend_Search_Lucene_Search_QueryHit */
require_once 'Zend/Search/Lucene/Search/QueryHit.php';
/** Zend_Search_Lucene_Search_Similarity */
require_once 'Zend/Search/Lucene/Search/Similarity.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene
{
    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    private $_directory = null;

    /**
     * File system adapter closing option
     *
     * @var boolean
     */
    private $_closeDirOnExit = true;

    /**
     * Writer for this index, not instantiated unless required.
     *
     * @var Zend_Search_Lucene_Index_Writer
     */
    private $_writer = null;

    /**
     * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
     *
     * @var array Zend_Search_Lucene_Index_SegmentInfo
     */
    private $_segmentInfos = array();

    /**
     * Number of documents in this index.
     *
     * @var integer
     */
    private $_docCount = 0;

    /**
     * Flag for index changes
     *
     * @var boolean
     */
    private $_hasChanges = false;

    /**
     * Opens the index.
     *
     * IndexReader constructor needs Directory as a parameter. It should be
     * a string with a path to the index folder or a Directory object.
     *
     * @param mixed   $directory path string or Zend_Search_Lucene_Storage_Directory_Filesystem
     * @param boolean $create    when true, a writer is created right away with its
     *                           second constructor argument set to true
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct($directory = null, $create = false)
    {
        if ($directory === null) {
            // Thrown as the Lucene-specific type announced in the docblock.
            // NOTE(review): assumes Zend_Search_Lucene_Exception extends
            // Zend_Search_Exception, so existing catch blocks keep working - confirm.
            throw new Zend_Search_Lucene_Exception('No index directory specified');
        }

        if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) {
            // A directory object handed in by the caller is not closed by us.
            $this->_directory      = $directory;
            $this->_closeDirOnExit = false;
        } else {
            $this->_directory      = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory);
            $this->_closeDirOnExit = true;
        }

        if ($create) {
            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true);
        } else {
            $this->_writer = null;
        }

        $this->_segmentInfos = array();

        $segmentsFile = $this->_directory->getFileObject('segments');

        $format = $segmentsFile->readInt();

        if ($format != (int)0xFFFFFFFF) {
            throw new Zend_Search_Lucene_Exception('Wrong segments file format');
        }

        // read version
        $segmentsFile->readLong();

        // read counter
        $segmentsFile->readInt();

        $segments = $segmentsFile->readInt();

        $this->_docCount = 0;

        // read segmentInfos
        for ($count = 0; $count < $segments; $count++) {
            $segName = $segmentsFile->readString();
            $segSize = $segmentsFile->readInt();
            $this->_docCount += $segSize;

            $this->_segmentInfos[$count] =
                new Zend_Search_Lucene_Index_SegmentInfo($segName,
                                                         $segSize,
                                                         $this->_directory);
        }
    }

    /**
     * Object destructor: commits pending changes and closes the directory
     * when it was opened by this object.
     */
    public function __destruct()
    {
        $this->commit();

        if ($this->_closeDirOnExit) {
            $this->_directory->close();
        }
    }

    /**
     * Returns an instance of Zend_Search_Lucene_Index_Writer for the index
     *
     * @return Zend_Search_Lucene_Index_Writer
     */
    public function getIndexWriter()
    {
        if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
        }

        return $this->_writer;
    }

    /**
     * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
     *
     * @return Zend_Search_Lucene_Storage_Directory
     */
    public function getDirectory()
    {
        return $this->_directory;
    }

    /**
     * Returns the total number of documents in this index.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Performs a query against the index and returns an array
     * of Zend_Search_Lucene_Search_QueryHit objects, highest score first.
     * Input is a string or Zend_Search_Lucene_Search_Query.
     *
     * @param mixed $query
     * @return array Zend_Search_Lucene_Search_QueryHit
     * @throws Zend_Search_Lucene_Exception
     */
    public function find($query)
    {
        if (is_string($query)) {
            $query = Zend_Search_Lucene_Search_QueryParser::parse($query);
        }

        if (!$query instanceof Zend_Search_Lucene_Search_Query) {
            throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object');
        }

        // make pending additions and deletions visible to the search
        $this->commit();

        $hits   = array();
        $scores = array();

        $docNum = $this->count();
        for ($count = 0; $count < $docNum; $count++) {
            $docScore = $query->score($count, $this);
            if ($docScore != 0) {
                $hit = new Zend_Search_Lucene_Search_QueryHit($this);
                $hit->id = $count;
                $hit->score = $docScore;

                $hits[]   = $hit;
                $scores[] = $docScore;
            }
        }

        // sort $hits by the parallel $scores array, best match first
        array_multisort($scores, SORT_DESC, SORT_REGULAR, $hits);

        return $hits;
    }

    /**
     * Returns the field names of all segments of this index, merged with
     * array_merge().
     *
     * @param boolean $indexed passed through to each segment's getFields()
     * @return array
     */
    public function getFieldNames($indexed = false)
    {
        $result = array();
        foreach ($this->_segmentInfos as $segmentInfo) {
            $result = array_merge($result, $segmentInfo->getFields($indexed));
        }
        return $result;
    }

    /**
     * Maps an index-wide document id to the segment holding that document.
     *
     * Walks the segments in order instead of addressing them by position, so
     * gaps in the keys of $_segmentInfos cannot lead to a missing segment.
     *
     * @param integer $id index-wide document id
     * @return array array(key into $_segmentInfos, id of the segment's first document)
     * @throws Zend_Search_Lucene_Exception when no segment holds the id
     */
    private function _findSegment($id)
    {
        if ($id < 0 || $id >= $this->_docCount) {
            throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
        }

        $segmentStartId = 0;
        foreach ($this->_segmentInfos as $segKey => $segInfo) {
            $segSize = $segInfo->count();
            if ($id < $segmentStartId + $segSize) {
                return array($segKey, $segmentStartId);
            }
            $segmentStartId += $segSize;
        }

        // the document counter claims more documents than the segments hold
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    /**
     * Returns a Zend_Search_Lucene_Document object for the document
     * number $id in this index.
     *
     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
     * @return Zend_Search_Lucene_Document
     * @throws Zend_Search_Lucene_Exception when $id is out of range
     */
    public function getDocument($id)
    {
        if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
            /* @var $id Zend_Search_Lucene_Search_QueryHit */
            $id = $id->id;
        }

        list($segKey, $segmentStartId) = $this->_findSegment($id);
        $segInfo = $this->_segmentInfos[$segKey];

        // the .fdx file is read 8 bytes per document: seek to this document's
        // slot and read the position of its field values in the .fdt file
        $fdxFile = $segInfo->openCompoundFile('.fdx');
        $fdxFile->seek(($id - $segmentStartId) * 8, SEEK_CUR);
        $fieldValuesPosition = $fdxFile->readLong();

        $fdtFile = $segInfo->openCompoundFile('.fdt');
        $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
        $fieldCount = $fdtFile->readVInt();

        $doc = new Zend_Search_Lucene_Document();
        for ($count = 0; $count < $fieldCount; $count++) {
            $fieldNum = $fdtFile->readVInt();
            $bits = $fdtFile->readByte();

            $fieldInfo = $segInfo->getField($fieldNum);

            // bit 2 selects binary data, otherwise the value is text;
            // bit 1 is handed to the field as its last constructor argument
            if (!($bits & 2)) { // Text data
                $value = $fdtFile->readString();
            } else {
                $value = $fdtFile->readBinary();
            }

            $field = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                  $value,
                                                  true,
                                                  $fieldInfo->isIndexed,
                                                  $bits & 1);
            $doc->addField($field);
        }

        return $doc;
    }

    /**
     * Returns an array of all the documents which contain term.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return array index-wide document ids
     */
    public function termDocs(Zend_Search_Lucene_Index_Term $term)
    {
        $result = array();
        $segmentStartDocId = 0;

        foreach ($this->_segmentInfos as $segInfo) {
            $termInfo = $segInfo->getTermInfo($term);

            if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
                // term not present in this segment
                $segmentStartDocId += $segInfo->count();
                continue;
            }

            $frqFile = $segInfo->openCompoundFile('.frq');
            $frqFile->seek($termInfo->freqPointer, SEEK_CUR);
            $docId = 0;
            for ($count = 0; $count < $termInfo->docFreq; $count++) {
                // document ids are delta encoded: the value read is twice the
                // delta, an odd value means no separate frequency follows
                $docDelta = $frqFile->readVInt();
                if ($docDelta % 2 == 1) {
                    $docId += ($docDelta - 1) / 2;
                } else {
                    $docId += $docDelta / 2;
                    // read freq
                    $frqFile->readVInt();
                }

                $result[] = $segmentStartDocId + $docId;
            }

            $segmentStartDocId += $segInfo->count();
        }

        return $result;
    }

    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return array
     */
    public function termPositions(Zend_Search_Lucene_Index_Term $term)
    {
        $result = array();
        $segmentStartDocId = 0;

        foreach ($this->_segmentInfos as $segInfo) {
            $termInfo = $segInfo->getTermInfo($term);

            if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
                // term not present in this segment
                $segmentStartDocId += $segInfo->count();
                continue;
            }

            // first pass: collect the term frequency for every matching document
            $frqFile = $segInfo->openCompoundFile('.frq');
            $frqFile->seek($termInfo->freqPointer, SEEK_CUR);
            $freqs = array();
            $docId = 0;

            for ($count = 0; $count < $termInfo->docFreq; $count++) {
                $docDelta = $frqFile->readVInt();
                if ($docDelta % 2 == 1) {
                    $docId += ($docDelta - 1) / 2;
                    $freqs[$docId] = 1;
                } else {
                    $docId += $docDelta / 2;
                    $freqs[$docId] = $frqFile->readVInt();
                }
            }

            // second pass: read that many delta encoded positions per document
            $prxFile = $segInfo->openCompoundFile('.prx');
            $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
            foreach ($freqs as $docId => $freq) {
                $termPosition = 0;
                $positions = array();

                for ($count = 0; $count < $freq; $count++) {
                    $termPosition += $prxFile->readVInt();
                    $positions[] = $termPosition;
                }

                $result[$segmentStartDocId + $docId] = $positions;
            }

            $segmentStartDocId += $segInfo->count();
        }

        return $result;
    }

    /**
     * Returns the number of documents in this index containing the $term.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return integer
     */
    public function docFreq(Zend_Search_Lucene_Index_Term $term)
    {
        $result = 0;
        foreach ($this->_segmentInfos as $segInfo) {
            $termInfo = $segInfo->getTermInfo($term);
            // same presence test as termDocs() and termPositions()
            if ($termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
                $result += $termInfo->docFreq;
            }
        }

        return $result;
    }

    /**
     * Retrieve similarity used by index reader
     *
     * @return Zend_Search_Lucene_Search_Similarity
     */
    public function getSimilarity()
    {
        return Zend_Search_Lucene_Search_Similarity::getDefault();
    }

    /**
     * Returns a normalization factor for "field, document" pair.
     *
     * @param integer $id
     * @param string $fieldName
     * @return mixed null when $id is out of range, 0 for a deleted document,
     *               otherwise the value reported by the segment's norm()
     */
    public function norm($id, $fieldName)
    {
        if ($id >= $this->_docCount) {
            return null;
        }

        // find the segment holding $id; $segInfo keeps pointing at it after the loop
        $segmentStartId = 0;
        foreach ($this->_segmentInfos as $segInfo) {
            if ($segmentStartId + $segInfo->count() > $id) {
                break;
            }

            $segmentStartId += $segInfo->count();
        }

        if ($segInfo->isDeleted($id - $segmentStartId)) {
            return 0;
        }

        return $segInfo->norm($id - $segmentStartId, $fieldName);
    }

    /**
     * Returns true if any documents have been deleted from this index.
     *
     * @return boolean
     */
    public function hasDeletions()
    {
        foreach ($this->_segmentInfos as $segmentInfo) {
            if ($segmentInfo->hasDeletions()) {
                return true;
            }
        }

        return false;
    }

    /**
     * Deletes a document from the index.
     * $id is an internal document id
     *
     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
     * @throws Zend_Search_Lucene_Exception when $id is out of range
     */
    public function delete($id)
    {
        if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
            /* @var $id Zend_Search_Lucene_Search_QueryHit */
            $id = $id->id;
        }

        list($segKey, $segmentStartId) = $this->_findSegment($id);

        // remembered so that commit() writes the segment changes out
        $this->_hasChanges = true;
        $this->_segmentInfos[$segKey]->delete($id - $segmentStartId);
    }

    /**
     * Adds a document to this index.
     *
     * @param Zend_Search_Lucene_Document $document
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
        }

        $this->_writer->addDocument($document);
    }

    /**
     * Commit changes resulting from delete() operations and from documents
     * added through the writer.
     *
     * @todo undeleteAll processing.
     */
    public function commit()
    {
        if ($this->_hasChanges) {
            foreach ($this->_segmentInfos as $segInfo) {
                $segInfo->writeChanges();
            }

            $this->_hasChanges = false;
        }

        if ($this->_writer !== null) {
            // the writer reports segments by name: an info object for a segment
            // to add, null for a segment to drop
            foreach ($this->_writer->commit() as $segmentName => $segmentInfo) {
                if ($segmentInfo !== null) {
                    $this->_segmentInfos[] = $segmentInfo;
                    $this->_docCount += $segmentInfo->count();
                } else {
                    // NOTE(review): _docCount is not reduced for dropped
                    // segments - confirm against the writer's contract.
                    foreach ($this->_segmentInfos as $segId => $segInfo) {
                        if ($segInfo->getName() == $segmentName) {
                            unset($this->_segmentInfos[$segId]);
                        }
                    }
                }
            }

            // close the key gaps left by unset() so the list stays 0..n-1
            $this->_segmentInfos = array_values($this->_segmentInfos);
        }
    }

    /*************************************************************************
    @todo UNIMPLEMENTED
    *************************************************************************/

    /**
     * Returns an array of all terms in this index.
     *
     * @todo Implementation (currently always returns an empty array)
     * @return array
     */
    public function terms()
    {
        return array();
    }

    /**
     * Undeletes all documents currently marked as deleted in this index.
     *
     * @todo Implementation (currently does nothing)
     */
    public function undeleteAll()
    {}
}

View File

@ -0,0 +1,96 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Token */
require_once 'Zend/Search/Lucene/Analysis/Token.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
/**
* An Analyzer is used to analyze text.
* It thus represents a policy for extracting index terms from text.
*
* Note:
* The Lucene Java implementation is oriented to streams. It works effectively
* with huge documents (more than 20Mb).
* But the engine itself is not oriented to such documents.
* Thus the Zend_Search_Lucene analysis API works with data strings and sets (arrays).
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * Analyzer instance handed out by getDefault(); created on first use
     * unless one was installed with setDefault().
     *
     * @var Zend_Search_Lucene_Analysis_Analyzer
     */
    static private $_defaultImpl;

    /**
     * Splits text into terms.
     * Implementations return an array of Zend_Search_Lucene_Analysis_Token objects.
     *
     * @param string $data
     * @return array
     */
    abstract public function tokenize($data);

    /**
     * Installs the Analyzer implementation used by default.
     *
     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
     */
    static public function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
    {
        self::$_defaultImpl = $analyzer;
    }

    /**
     * Hands out the default Analyzer implementation, falling back to a
     * case insensitive text analyzer when none has been installed yet.
     *
     * @return Zend_Search_Lucene_Analysis_Analyzer
     */
    static public function getDefault()
    {
        if (self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
            return self::$_defaultImpl;
        }

        self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
        return self::$_defaultImpl;
    }
}

View File

@ -0,0 +1,75 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/**
* Common implementation of the Zend_Search_Lucene_Analysis_Analyzer interface.
* There are several standard subclasses provided by Zend_Search_Lucene/Analysis
* subpackage: Zend_Search_Lucene_Analysis_Analyzer_Common_Text, ZSearchHTMLAnalyzer, ZSearchXMLAnalyzer.
*
* @todo ZSearchHTMLAnalyzer and ZSearchXMLAnalyzer implementation
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * Token filters, in the order they were added and are applied.
     * Array of Zend_Search_Lucene_Analysis_TokenFilter objects.
     *
     * @var array
     */
    private $_filters = array();

    /**
     * Appends a Token filter to this Analyzer's filter list.
     *
     * @param Zend_Search_Lucene_Analysis_TokenFilter $filter
     */
    public function addFilter(Zend_Search_Lucene_Analysis_TokenFilter $filter)
    {
        array_push($this->_filters, $filter);
    }

    /**
     * Runs a token through every registered filter, feeding each filter the
     * result of the previous one.
     *
     * @param Zend_Search_Lucene_Analysis_Token $token
     * @return Zend_Search_Lucene_Analysis_Token
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $token)
    {
        $normalized = $token;
        foreach ($this->_filters as $tokenFilter) {
            $normalized = $tokenFilter->normalize($normalized);
        }

        return $normalized;
    }
}

View File

@ -0,0 +1,78 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Tokenize text to terms.
     * Returns array of Zend_Search_Lucene_Analysis_Token objects.
     *
     * A term is a maximal run of bytes accepted by ctype_alpha(); every other
     * byte acts as a separator. Each token is created with its text and its
     * start and end byte offsets, then passed through normalize().
     *
     * @param string $data
     * @return array
     */
    public function tokenize($data)
    {
        $tokenStream = array();

        // the input does not change, so measure it once
        $length   = strlen($data);
        $position = 0;

        while ($position < $length) {
            // skip white space (and anything else that is not a letter);
            // square bracket offsets: the {} form is a parse error on PHP 8
            while ($position < $length && !ctype_alpha($data[$position])) {
                $position++;
            }

            $termStartPosition = $position;

            // read token
            while ($position < $length && ctype_alpha($data[$position])) {
                $position++;
            }

            // Empty token, end of stream.
            if ($position == $termStartPosition) {
                break;
            }

            $token = new Zend_Search_Lucene_Analysis_Token(substr($data,
                                                                  $termStartPosition,
                                                                  $position - $termStartPosition),
                                                           $termStartPosition,
                                                           $position);
            $tokenStream[] = $this->normalize($token);
        }

        return $tokenStream;
    }
}

View File

@ -0,0 +1,46 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Text
{
    /**
     * Text analyzer variant that registers a lower-casing token filter.
     *
     * The filter is handed to addFilter(), which is inherited (not shown
     * here); presumably every token produced by tokenize() is then passed
     * through it - confirm against the parent class.
     */
    public function __construct()
    {
        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();
        $this->addFilter($lowerCaseFilter);
    }
}

View File

@ -0,0 +1,171 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Token
{
    /**
     * Term text carried by this token.
     *
     * @var string
     */
    private $_termText;

    /**
     * Offset of the token's first character in the source text.
     *
     * @var integer
     */
    private $_startOffset;

    /**
     * Offset one past the token's last character in the source text.
     *
     * @var integer
     */
    private $_endOffset;

    /**
     * Lexical type of the token.
     *
     * @var string
     */
    private $_type;

    /**
     * Position of this token relative to the previous one (default: 1).
     *
     * Typical uses:
     *  - 0 places several terms at the same position, e.g. multiple stems of
     *    one word, so phrase searches match either stem. Only the first stem
     *    keeps an increment of 1. Repeating a token with increment 0 can also
     *    boost the score of matches on it.
     *  - Values above 1 inhibit exact phrase matches, e.g. a stop-word filter
     *    can set the increment to the number of stop words it removed so
     *    that phrases do not match across the gap.
     *
     * @var integer
     */
    private $_positionIncrement;

    /**
     * Creates a token with a position increment of 1.
     *
     * @param string  $text  term text
     * @param integer $start offset of the first character
     * @param integer $end   offset one past the last character
     * @param string  $type  lexical type
     */
    public function __construct($text, $start, $end, $type = 'word' )
    {
        $this->_positionIncrement = 1;
        $this->_type              = $type;
        $this->_termText          = $text;
        $this->_startOffset       = $start;
        $this->_endOffset         = $end;
    }

    /**
     * Sets the position increment.
     *
     * @param integer $positionIncrement
     */
    public function setPositionIncrement($positionIncrement)
    {
        $this->_positionIncrement = $positionIncrement;
    }

    /**
     * Gets the position increment.
     *
     * @return integer
     */
    public function getPositionIncrement()
    {
        return $this->_positionIncrement;
    }

    /**
     * Gets the term text.
     *
     * @return string
     */
    public function getTermText()
    {
        return $this->_termText;
    }

    /**
     * Gets the offset of the token's first character in the source text.
     *
     * Note: getEndOffset() - getStartOffset() is not guaranteed to equal
     * strlen(getTermText()), because a stemmer or another filter may have
     * altered the term text.
     *
     * @return integer
     */
    public function getStartOffset()
    {
        return $this->_startOffset;
    }

    /**
     * Gets the offset one past the token's last character in the source text.
     *
     * @return integer
     */
    public function getEndOffset()
    {
        return $this->_endOffset;
    }

    /**
     * Gets the lexical type ('word' unless another type was given).
     *
     * @return string
     */
    public function getType()
    {
        return $this->_type;
    }
}

View File

@ -0,0 +1,47 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Token */
require_once 'Zend/Search/Lucene/Analysis/Token.php';
/**
* Token filter converts (normalizes) a Token or removes it from a token stream.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_TokenFilter
{
/**
* Normalize a token, or remove it from the stream.
*
* Implementations return the token that should take the place of
* $srcToken, or null to signal that the token must be dropped.
*
* @param Zend_Search_Lucene_Analysis_Token $srcToken
* @return Zend_Search_Lucene_Analysis_Token|null
*/
abstract public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken);
}

View File

@ -0,0 +1,57 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Lower case Token filter.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_TokenFilter_LowerCase extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Produce a lower-cased copy of a token.
     *
     * The source token is left untouched; the copy keeps its offsets, type
     * and position increment and differs only in the term text, which is
     * run through strtolower(). This filter never returns null.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
    {
        $loweredText = strtolower($srcToken->getTermText());

        $lowered = new Zend_Search_Lucene_Analysis_Token(
            $loweredText,
            $srcToken->getStartOffset(),
            $srcToken->getEndOffset(),
            $srcToken->getType()
        );
        $lowered->setPositionIncrement($srcToken->getPositionIncrement());

        return $lowered;
    }
}

View File

@ -0,0 +1,111 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Field */
require_once 'Zend/Search/Lucene/Field.php';
/**
* A Document is a set of fields. Each field has a name and a textual value.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Document
{
    /**
     * Fields of this document, keyed by field name.
     * Values are Zend_Search_Lucene_Field objects.
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Document boost; initialised to 1.0.
     *
     * @var float
     */
    public $boost = 1.0;

    /**
     * Property-style shortcut for getFieldValue(): $doc->title is the string
     * value of the field named "title".
     *
     * @param string $offset field name
     * @return string
     */
    public function __get($offset)
    {
        return $this->getFieldValue($offset);
    }

    /**
     * Store a field under its name. A field added earlier under the same
     * name is replaced.
     *
     * @param Zend_Search_Lucene_Field $field
     */
    public function addField(Zend_Search_Lucene_Field $field)
    {
        $name = $field->name;
        $this->_fields[$name] = $field;
    }

    /**
     * List the names of all fields in this document.
     *
     * @return array
     */
    public function getFieldNames()
    {
        $names = array();
        foreach ($this->_fields as $name => $ignored) {
            $names[] = $name;
        }

        return $names;
    }

    /**
     * Fetch the field object stored under a name.
     *
     * @param string $fieldName
     * @return Zend_Search_Lucene_Field
     * @throws Zend_Search_Lucene_Exception if the document has no such field
     */
    public function getField($fieldName)
    {
        if (array_key_exists($fieldName, $this->_fields)) {
            return $this->_fields[$fieldName];
        }

        throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document.");
    }

    /**
     * Fetch the string value of the field stored under a name.
     *
     * @see __get()
     * @param string $fieldName
     * @return string
     * @throws Zend_Search_Lucene_Exception if the document has no such field
     */
    public function getFieldValue($fieldName)
    {
        $field = $this->getField($fieldName);

        return $field->stringValue;
    }
}

View File

@ -0,0 +1,32 @@
<?php
/*
 * Thin wrapper around iconv() that captures the message of any PHP error
 * raised during the conversion instead of letting it reach the active
 * error handler.
 */
class EncodingConverter {

    // Message of the last captured error, or FALSE when the most recent
    // convert() call raised none (unset until convert() is first called).
    private $last_error;

    // Source and target encodings, passed to iconv() as given.
    private $in_encoding;
    private $out_encoding;

    public function __construct($in_encoding, $out_encoding) {
        $this->out_encoding = $out_encoding;
        $this->in_encoding  = $in_encoding;
    } //constructor

    // Error-handler callback installed by convert(); only the message is kept.
    public function handleError($err, $msg) {
        $this->last_error = $msg;
    } //handleError

    // Converts $str from the input to the output encoding and returns
    // iconv()'s result unchanged. Afterwards getLastError() tells whether
    // an error was raised during this call.
    public function convert($str) {
        $this->last_error = FALSE;

        set_error_handler(array($this, 'handleError'));
        $converted = iconv($this->in_encoding, $this->out_encoding, $str);
        restore_error_handler();

        return $converted;
    } //convert

    public function getLastError() {
        return $this->last_error;
    } //getLastError

} //EncodingConverter
?>

View File

@ -0,0 +1,36 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Framework base exception
*/
require_once 'Zend/Search/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Exception extends Zend_Search_Exception
{
// Marker subclass: adds no members of its own. It only gives the Lucene
// component a distinct exception type to throw.
}

View File

@ -0,0 +1,161 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* A field is a section of a Document. Each field has two parts,
* a name and a value. Values may be free text or they may be atomic
* keywords, which are not further processed. Such keywords may
* be used to represent dates, urls, etc. Fields are optionally
* stored in the index, so that they may be returned with hits
* on the document.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
//require_once('EncodingConverter.php');
class Zend_Search_Lucene_Field
{
    public $kind;

    public $name            = 'body';
    public $stringValue     = null;
    public $isStored        = false;
    public $isIndexed       = true;
    public $isTokenized     = true;
    public $isBinary        = false;

    public $storeTermVector = false;

    public $boost           = 1.0;

    /**
     * Builds a field from explicit flags; the static factories below cover
     * the usual flag combinations.
     *
     * Binary values are stored byte-for-byte. Any other value is treated as
     * ISO-8859-1 text and transliterated to ASCII with iconv().
     *
     * @todo A correct UTF-8 string should be required in future. Until full
     *       UTF-8 support is completed, strings are normalized to ASCII.
     *       An mb_detect_encoding()-based conversion (through the
     *       EncodingConverter helper) was tried here and is disabled.
     *
     * @param string  $name
     * @param string  $stringValue
     * @param boolean $isStored
     * @param boolean $isIndexed
     * @param boolean $isTokenized
     * @param boolean $isBinary
     */
    public function __construct($name, $stringValue, $isStored, $isIndexed, $isTokenized, $isBinary = false)
    {
        $this->name = $name;

        if ($isBinary) {
            $this->stringValue = $stringValue;
        } else {
            $this->stringValue = iconv('ISO-8859-1', 'ASCII//TRANSLIT', $stringValue);
        }

        $this->isStored    = $isStored;
        $this->isIndexed   = $isIndexed;
        $this->isTokenized = $isTokenized;
        $this->isBinary    = $isBinary;

        // Term vectors and boost always start at their defaults.
        $this->storeTermVector = false;
        $this->boost           = 1.0;
    }

    /**
     * String field that is stored and indexed but not tokenized.
     * Useful for non-text values, e.g. a date or a url.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function Keyword($name, $value)
    {
        $stored    = true;
        $indexed   = true;
        $tokenized = false;

        return new self($name, $value, $stored, $indexed, $tokenized);
    }

    /**
     * String field that is stored for return with hits, but neither indexed
     * nor tokenized.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function UnIndexed($name, $value)
    {
        $stored    = true;
        $indexed   = false;
        $tokenized = false;

        return new self($name, $value, $stored, $indexed, $tokenized);
    }

    /**
     * Binary field that is stored for return with hits, but neither indexed
     * nor tokenized. The value is kept without any encoding conversion.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function Binary($name, $value)
    {
        $stored    = true;
        $indexed   = false;
        $tokenized = false;
        $binary    = true;

        return new self($name, $value, $stored, $indexed, $tokenized, $binary);
    }

    /**
     * String field that is stored, indexed and tokenized. Useful for short
     * text such as a "title" or "subject". No term vector is stored.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function Text($name, $value)
    {
        $stored    = true;
        $indexed   = true;
        $tokenized = true;

        return new self($name, $value, $stored, $indexed, $tokenized);
    }

    /**
     * String field that is indexed and tokenized but not stored.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function UnStored($name, $value)
    {
        $stored    = false;
        $indexed   = true;
        $tokenized = true;

        return new self($name, $value, $stored, $indexed, $tokenized);
    }
}

View File

@ -0,0 +1,45 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/*
 * Plain record describing one field of an index segment. All members are
 * public and stored exactly as passed to the constructor.
 */
class Zend_Search_Lucene_Index_FieldInfo
{
    // Field name.
    public $name;

    // Whether the field is indexed.
    public $isIndexed;

    // Field number.
    public $number;

    // Whether a term vector is stored for the field.
    public $storeTermVector;

    public function __construct( $name, $isIndexed, $number, $storeTermVector )
    {
        $this->number          = $number;
        $this->name            = $name;
        $this->storeTermVector = $storeTermVector;
        $this->isIndexed       = $isIndexed;
    }
}

View File

@ -0,0 +1,575 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentInfo
{
/**
* Number of docs in a segment
*
* @var integer
*/
private $_docCount;
/**
* Segment name
*
* @var string
*/
private $_name;
/**
* Term Dictionary Index
* Array of the Zend_Search_Lucene_Index_Term objects
* Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
*
* @var array
*/
private $_termDictionary;
/**
* Term Dictionary Index TermInfos
* Array of the Zend_Search_Lucene_Index_TermInfo objects
*
* @var array
*/
private $_termDictionaryInfos;
/**
* Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
*
* @var array
*/
private $_fields;
/**
* Field positions in a dictionary.
* (Term dictionary contains fields ordered by names)
*
* @var array
*/
private $_fieldsDicPositions;
/**
* Associative array where the key is the file name and the value is data offset
* in a compound segment file (.cfs).
*
* @var array
*/
private $_segFiles;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory_Filesystem
*/
private $_directory;
/**
* Normalization factors.
* An array fieldName => normVector
* normVector is a binary string.
* Each byte corresponds to an indexed document in a segment and
* encodes normalization factor (float value, encoded by
* Zend_Search_Lucene_Search_Similarity::encodeNorm())
*
* @var array
*/
private $_norms = array();
/**
* List of deleted documents.
* bitset if bitset extension is loaded or array otherwise.
*
* @var mixed
*/
private $_deleted;
/**
* $this->_deleted update flag
*
* @var boolean
*/
private $_deletedDirty = false;
/**
 * Loads the metadata of one index segment.
 *
 * Reads, in this order:
 *  1. the table of contents of the compound file "<name>.cfs", if present;
 *  2. the field infos from the ".fnm" file;
 *  3. the deleted-documents bit vector from the ".del" file, if present.
 *
 * @param string  $name      segment name
 * @param integer $docCount  number of documents in the segment
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @throws Zend_Search_Exception if a segment file cannot be read (a missing
 *                               ".del" file is not an error)
 */
public function __construct($name, $docCount, $directory)
{
    $this->_name           = $name;
    $this->_docCount       = $docCount;
    $this->_directory      = $directory;
    $this->_termDictionary = null;

    // Compound file directory: file name => data offset inside the .cfs file.
    $this->_segFiles = array();
    if ($this->_directory->fileExists($name . '.cfs')) {
        $cfsFile = $this->_directory->getFileObject($name . '.cfs');
        $segFilesCount = $cfsFile->readVInt();

        for ($count = 0; $count < $segFilesCount; $count++) {
            $dataOffset = $cfsFile->readLong();
            $fileName   = $cfsFile->readString();
            $this->_segFiles[$fileName] = $dataOffset;
        }
    }

    // Field infos.
    $fnmFile     = $this->openCompoundFile('.fnm');
    $fieldsCount = $fnmFile->readVInt();
    $fieldNames  = array();
    $fieldNums   = array();
    $this->_fields = array();
    for ($count = 0; $count < $fieldsCount; $count++) {
        $fieldName = $fnmFile->readString();
        $fieldBits = $fnmFile->readByte();
        // Bit 0x01 is passed on as "is indexed", bit 0x02 as "store term vector".
        $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                       $fieldBits & 1,
                                                                       $count,
                                                                       $fieldBits & 2 );
        if ($fieldBits & 0x10) {
            // norms are omitted for the indexed field
            $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
        }

        $fieldNums[$count]  = $count;
        $fieldNames[$count] = $fieldName;
    }

    // Sort the field numbers by field name, then invert the result so that
    // $_fieldsDicPositions maps field number => rank in name order.
    array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
    $this->_fieldsDicPositions = array_flip($fieldNums);

    // Deleted documents.
    try {
        $delFile = $this->openCompoundFile('.del');

        // The first int is a bit count; convert it to a whole number of bytes.
        $byteCount = $delFile->readInt();
        $byteCount = (int) ceil($byteCount / 8);
        $bitCount  = $delFile->readInt();

        if ($bitCount == 0) {
            $delBytes = '';
        } else {
            $delBytes = $delFile->readBytes($byteCount);
        }

        if (extension_loaded('bitset')) {
            $this->_deleted = $delBytes;
        } else {
            $this->_deleted = array();
            // Only walk bytes that were actually read: $delBytes is empty
            // when no bit is set, so looping up to $byteCount would index
            // past the end of the string.
            $availableBytes = min($byteCount, strlen($delBytes));
            for ($count = 0; $count < $availableBytes; $count++) {
                $byte = ord($delBytes[$count]);
                for ($bit = 0; $bit < 8; $bit++) {
                    if ($byte & (1 << $bit)) {
                        $this->_deleted[$count*8 + $bit] = 1;
                    }
                }
            }
        }
    } catch(Zend_Search_Exception $e) {
        // openCompoundFile() reports a missing file with this message; a
        // segment without a .del file simply has no deletions.
        if (strpos($e->getMessage(), 'compound file doesn\'t contain') !== false ) {
            $this->_deleted = null;
        } else {
            throw $e;
        }
    }
}
/**
 * Opens one of the segment's files by extension.
 *
 * A standalone file "<segment name><extension>" is preferred. Otherwise the
 * compound file "<segment name>.cfs" is opened and positioned at the offset
 * recorded for that file name.
 *
 * @param string $extension  file extension including the leading dot
 * @throws Zend_Search_Lucene_Exception if the file is neither standalone nor
 *                                      listed in the compound file
 * @return Zend_Search_Lucene_Storage_File
 */
public function openCompoundFile($extension)
{
    $filename = $this->_name . $extension;

    // Standalone file first.
    if ($this->_directory->fileExists($filename)) {
        return $this->_directory->getFileObject($filename);
    }

    if (isset($this->_segFiles[$filename])) {
        $container = $this->_directory->getFileObject( $this->_name . '.cfs' );
        $container->seek($this->_segFiles[$filename]);

        return $container;
    }

    throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                         . $filename . ' file.' );
}
/**
 * Finds the number of the field with the given name.
 *
 * @param string $fieldName
 * @return integer  field number, or -1 when the segment has no such field
 */
public function getFieldNum($fieldName)
{
    foreach ($this->_fields as $info) {
        if ($info->name != $fieldName) {
            continue;
        }

        return $info->number;
    }

    return -1;
}
/**
* Returns the field info stored under the given field number.
*
* The number is used directly as a key of $_fields; no existence check is
* made here.
*
* @param integer $fieldNum
* @return Zend_Search_Lucene_Index_FieldInfo
*/
public function getField($fieldNum)
{
return $this->_fields[$fieldNum];
}
/**
 * Lists the names of the segment's fields.
 *
 * The result maps each field name to itself. With $indexed set to true,
 * fields that are not indexed are left out.
 *
 * @param boolean $indexed
 * @return array
 */
public function getFields($indexed = false)
{
    $names = array();

    foreach ($this->_fields as $info) {
        if ($indexed && !$info->isIndexed) {
            continue;
        }
        $names[ $info->name ] = $info->name;
    }

    return $names;
}
/**
* Returns the document count this segment was constructed with.
*
* The value is not adjusted by delete().
*
* @return integer
*/
public function count()
{
return $this->_docCount;
}
/**
 * Translates a field number into its position in the fields dictionary.
 *
 * @param integer $fieldNum
 * @return integer
 */
private function _getFieldPosition($fieldNum) {
    if (isset($this->_fieldsDicPositions[$fieldNum])) {
        return $this->_fieldsDicPositions[$fieldNum];
    }

    // Numbers missing from the translation table are used as-is.
    return $fieldNum;
}
/**
* Loads the term dictionary index from the segment's '.tii' file.
*
* Fills $_termDictionary with Zend_Search_Lucene_Index_Term objects and
* $_termDictionaryInfos with the matching Zend_Search_Lucene_Index_TermInfo
* objects (same array index). Does nothing if the dictionary is already
* loaded.
*
* @throws Zend_Search_Lucene_Exception if the file does not start with the
*                                      expected format version
*/
protected function _loadDictionary()
{
// Already loaded by an earlier call.
if ($this->_termDictionary !== null) {
return;
}
$this->_termDictionary = array();
$this->_termDictionaryInfos = array();
$tiiFile = $this->openCompoundFile('.tii');
// File header: version, term count, index interval, skip interval.
$tiVersion = $tiiFile->readInt();
if ($tiVersion != (int)0xFFFFFFFE) {
throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
}
$indexTermCount = $tiiFile->readLong();
$tiiFile->readInt(); // IndexInterval
$skipInterval = $tiiFile->readInt();
// Term text is prefix-compressed against the previous term, and the three
// pointers are stored as deltas, so running values are kept across entries.
$prevTerm = '';
$freqPointer = 0;
$proxPointer = 0;
$indexPointer = 0;
for ($count = 0; $count < $indexTermCount; $count++) {
$termPrefixLength = $tiiFile->readVInt();
$termSuffix = $tiiFile->readString();
// Rebuild the full text: shared prefix of the previous term + stored suffix.
$termValue = substr( $prevTerm, 0, $termPrefixLength ) . $termSuffix;
$termFieldNum = $tiiFile->readVInt();
$docFreq = $tiiFile->readVInt();
$freqPointer += $tiiFile->readVInt();
$proxPointer += $tiiFile->readVInt();
// The skip delta is only read when docFreq reaches the skip interval.
if( $docFreq >= $skipInterval ) {
$skipDelta = $tiiFile->readVInt();
} else {
$skipDelta = 0;
}
$indexPointer += $tiiFile->readVInt();
// Note: the Term is built with the field *number*, not the field name.
$this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum);
$this->_termDictionaryInfos[] =
new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
$prevTerm = $termValue;
}
}
/**
* Returns the segment name this object was constructed with.
*
* @return string
*/
public function getName()
{
return $this->_name;
}
/**
 * Looks up the term info for a term.
 *
 * The in-memory dictionary index is binary-searched first. On an exact hit
 * its TermInfo is returned directly. Otherwise the '.tis' file is scanned
 * forward from the closest preceding index entry until a term that is not
 * smaller than the requested one is reached.
 *
 * Terms are ordered by field dictionary position, then by strcmp() on the
 * term text.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @throws Zend_Search_Lucene_Exception if the '.tis' file does not start
 *                                      with the expected format version
 * @return Zend_Search_Lucene_Index_TermInfo|null  null when the term is not
 *                                                 in this segment
 */
public function getTermInfo($term)
{
    $this->_loadDictionary();

    $searchField = $this->getFieldNum($term->field);
    if ($searchField == -1) {
        return null;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // Binary search over the dictionary index.
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary) - 1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        // Dictionary terms carry a field number in ->field.
        $fieldNum = $this->_getFieldPosition($midTerm->field);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($term->text, $midTerm->text);
        }

        if ($delta < 0) {
            $highIndex = $mid - 1;
        } elseif ($delta > 0) {
            $lowIndex = $mid + 1;
        } else {
            return $this->_termDictionaryInfos[$mid]; // We got it!
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        return null;
    }

    // $highIndex now points at the last index entry smaller than the term.
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ];

    $tisFile   = $this->openCompoundFile('.tis');
    $tiVersion = $tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $termCount     = $tisFile->readLong();
    $indexInterval = $tisFile->readInt();
    $skipInterval  = $tisFile->readInt();

    // The 20 header bytes were just consumed, so seek relative to here.
    $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR);

    $termValue    = $prevTerm->text;
    $termFieldNum = $prevTerm->field;
    $freqPointer  = $prevTermInfo->freqPointer;
    $proxPointer  = $prevTermInfo->proxPointer;
    for ($count = $prevPosition*$indexInterval + 1;
         $count < $termCount &&
         ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
          ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
           strcmp($termValue, $term->text) < 0) );
         $count++) {
        $termPrefixLength = $tisFile->readVInt();
        $termSuffix       = $tisFile->readString();
        $termFieldNum     = $tisFile->readVInt();
        $termValue        = substr( $termValue, 0, $termPrefixLength ) . $termSuffix;

        $docFreq      = $tisFile->readVInt();
        $freqPointer += $tisFile->readVInt();
        $proxPointer += $tisFile->readVInt();
        if( $docFreq >= $skipInterval ) {
            $skipOffset = $tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }
    }

    // Compare the text with strcmp(), as the scan does. A loose == treats
    // numeric-looking strings as numbers ("007" == "7", "1e3" == "1000")
    // and would hand back the TermInfo of a different term.
    if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
        return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    } else {
        return null;
    }
}
/**
 * Returns the normalization factor of a field for one document.
 *
 * The norm vector of a field is read from the segment's '.f<fieldNum>' file
 * on first use and cached in $_norms. It holds one byte per document.
 *
 * @param integer $id         document id within the segment
 * @param string  $fieldName
 * @return mixed  the decoded norm as returned by
 *                Zend_Search_Lucene_Search_Similarity::decodeNorm(), or null
 *                when the field is unknown or not indexed
 */
public function norm($id, $fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    // getFieldNum() answers -1 for an unknown field, which has no entry in
    // $_fields; treat it like a non-indexed field instead of dereferencing
    // a missing element.
    if ( !isset($this->_fields[$fieldNum]) || !($this->_fields[$fieldNum]->isIndexed) ) {
        return null;
    }

    if ( !isset( $this->_norms[$fieldNum] )) {
        $fFile = $this->openCompoundFile('.f' . $fieldNum);
        $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
    }

    // Square-bracket string offset: the {} form was removed in PHP 8.
    return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
}
/**
* Tells whether this segment carries a deletions list.
*
* The check is "$_deleted is not null": it is true once delete() has been
* called or a '.del' file was loaded by the constructor, even if that file
* had no bit set.
*
* @return boolean
*/
public function hasDeletions()
{
return $this->_deleted !== null;
}
/**
 * Marks a document as deleted in this segment.
 *
 * Only the in-memory deletions list is updated and flagged dirty;
 * writeChanges() persists it. The list is a bitset when the 'bitset'
 * extension is loaded and an array keyed by document id otherwise.
 *
 * @param integer $id  internal document id
 */
public function delete($id)
{
    $this->_deletedDirty = true;

    $useBitset = extension_loaded('bitset');

    // Create the deletions list lazily, in the representation in use.
    if ($this->_deleted === null) {
        $this->_deleted = $useBitset ? bitset_empty($id) : array();
    }

    if ($useBitset) {
        bitset_incl($this->_deleted, $id);
        return;
    }

    $this->_deleted[$id] = 1;
}
/**
 * Tells whether a document is marked as deleted.
 *
 * @param integer $id  internal document id
 * @return boolean
 */
public function isDeleted($id)
{
    // No deletions list at all: nothing is deleted.
    if ($this->_deleted === null) {
        return false;
    }

    return extension_loaded('bitset')
         ? bitset_in($this->_deleted, $id)
         : isset($this->_deleted[$id]);
}
/**
 * Writes the deletions list to the segment's '.del' file, if it changed.
 *
 * File layout as written here: document count (int), number of deleted
 * documents (int), then the bit vector with one bit per document id,
 * least significant bit first within each byte.
 */
public function writeChanges()
{
    if (!$this->_deletedDirty) {
        return;
    }

    if (extension_loaded('bitset')) {
        $delBytes = $this->_deleted;
        $bitCount = count(bitset_to_array($delBytes));
    } else {
        // floor() yields a float; use an integer for the byte count.
        $byteCount = (int) floor($this->_docCount / 8) + 1;

        // Build the vector by appending bytes. Writing through a
        // curly-brace string offset ($s{$i} = ...) is not valid in PHP 8.
        $delBytes = '';
        for ($count = 0; $count < $byteCount; $count++) {
            $byte = 0;
            for ($bit = 0; $bit < 8; $bit++) {
                if (isset($this->_deleted[$count*8 + $bit])) {
                    $byte |= (1 << $bit);
                }
            }
            $delBytes .= chr($byte);
        }
        $bitCount = count($this->_deleted);
    }

    $delFile = $this->_directory->createFile($this->_name . '.del');
    $delFile->writeInt($this->_docCount);
    $delFile->writeInt($bitCount);
    $delFile->writeBytes($delBytes);

    $this->_deletedDirty = false;
}
}

View File

@ -0,0 +1,519 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentWriter
{
/**
* Expert: The fraction of terms in the "dictionary" which should be stored
* in RAM. Smaller values use more memory, but make searching slightly
* faster, while larger values use less memory and make searching slightly
* slower. Searching is typically not dominated by dictionary lookup, so
* tweaking this is rarely useful.
*
* @var integer
*/
static public $indexInterval = 128;
/** Expert: The fraction of TermDocs entries stored in skip tables.
* Larger values result in smaller indexes, greater acceleration, but fewer
* accelerable cases, while smaller values result in bigger indexes,
* less acceleration and more
* accelerable cases. More detailed experiments would be useful here.
*
* 0x7FFFFFFF indicates that we don't use skip data
* Default value is 16
*
* @var integer
*/
static public $skipInterval = 0x7FFFFFFF;
/**
* Number of docs in a segment
*
* @var integer
*/
private $_docCount;
/**
* Segment name
*
* @var string
*/
private $_name;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory
*/
private $_directory;
/**
* List of the index files.
* Used for automatic compound file generation
*
* @var array
*/
private $_files;
/**
* Term Dictionary
* Array of the Zend_Search_Lucene_Index_Term objects
* Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
*
* @var array
*/
private $_termDictionary;
/**
* Documents, which contain the term
*
* @var array
*/
private $_termDocs;
/**
* Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
*
* @var array
*/
private $_fields;
/**
* Sizes of the indexed fields.
* Used for normalization factors calculation.
*
* @var array
*/
private $_fieldLengths;
/**
* '.fdx' file - Stored Fields, the field index.
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_fdxFile;
/**
* '.fdt' file - Stored Fields, the field data.
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_fdtFile;
/**
 * Scratch storage initialised by the constructor.
 *
 * Nothing else in this class reads it yet (the name suggests per-field norm
 * data - TODO confirm intended use). It is declared explicitly so that the
 * constructor no longer creates a dynamic property, which is deprecated
 * since PHP 8.2.
 *
 * @var array
 */
private $_norms;

/**
 * Object constructor.
 *
 * Remembers the target directory and the segment name and resets every
 * in-memory buffer. No file is created here: the stored-field files are
 * opened by addDocument() and the remaining segment files are written by
 * close().
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param string $name
 */
public function __construct($directory, $name)
{
    $this->_directory = $directory;
    $this->_name      = $name;
    $this->_docCount  = 0;

    $this->_fields         = array();
    $this->_termDocs       = array();
    $this->_files          = array();
    $this->_norms          = array();
    $this->_fieldLengths   = array();
    $this->_termDictionary = array();

    // Stored-field files are opened lazily by addDocument().
    $this->_fdxFile = null;
    $this->_fdtFile = null;
}
/**
 * Register a document field in the segment's field table.
 *
 * The first time a field name is seen a new FieldInfo is created; its number
 * is the current size of the table, so fields are numbered 0, 1, 2, ... in
 * order of first appearance. On later sightings only the flags are merged:
 * a field counts as indexed / term-vector-storing as soon as one document
 * has marked it so.
 *
 * @param Zend_Search_Lucene_Field $field
 */
private function _addFieldInfo(Zend_Search_Lucene_Field $field)
{
    $name = $field->name;

    if (!isset($this->_fields[$name])) {
        $this->_fields[$name] =
            new Zend_Search_Lucene_Index_FieldInfo($name,
                                                   $field->isIndexed,
                                                   count($this->_fields),
                                                   $field->storeTermVector);
        return;
    }

    // Logical OR keeps the flags boolean. The previous bitwise "|=" silently
    // converted them to the integers 0/1 on the second sighting of a field.
    $info = $this->_fields[$name];
    $info->isIndexed       = $info->isIndexed       || $field->isIndexed;
    $info->storeTermVector = $info->storeTermVector || $field->storeTermVector;
}
/**
 * Adds a document to this segment.
 *
 * Indexed fields are tokenized (or taken as one token when not tokenized)
 * and their term positions are buffered in memory until close(). Stored
 * fields are written to the '.fdt' file immediately, with a pointer to the
 * record appended to the '.fdx' file.
 *
 * The document is validated completely before any state is modified, so a
 * rejected document leaves the segment exactly as it was.
 *
 * @param Zend_Search_Lucene_Document $document
 * @throws Zend_Search_Lucene_Exception if any field asks for term vector storage
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    // Pass 1: resolve and validate every field up front. Previously the
    // exception was raised in the middle of indexing, after earlier fields had
    // already been registered and their terms buffered under a document id
    // that was never incremented - the next document then inherited them.
    $fields = array();
    foreach ($document->getFieldNames() as $fieldName) {
        $field = $document->getField($fieldName);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
        }

        $fields[] = $field;
    }

    // Pass 2: index and collect stored fields.
    $docId        = $this->_docCount;
    $storedFields = array();

    foreach ($fields as $field) {
        $this->_addFieldInfo($field);

        if ($field->isIndexed) {
            if ($field->isTokenized) {
                $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
            } else {
                // An untokenized field contributes its whole value as one token.
                $tokenList = array(
                    new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue))
                );
            }

            // Token count per field and document; used for the norms in _dumpFNM().
            $this->_fieldLengths[$field->name][$docId] = count($tokenList);

            $position = 0;
            foreach ($tokenList as $token) {
                $term    = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                $termKey = $term->key();

                if (!isset($this->_termDictionary[$termKey])) {
                    // New term
                    $this->_termDictionary[$termKey] = $term;
                    $this->_termDocs[$termKey]       = array();
                }
                if (!isset($this->_termDocs[$termKey][$docId])) {
                    // First occurrence of the term in this document
                    $this->_termDocs[$termKey][$docId] = array();
                }

                $position += $token->getPositionIncrement();
                $this->_termDocs[$termKey][$docId][] = $position;
            }
        }

        if ($field->isStored) {
            $storedFields[] = $field;
        }
    }

    // The stored-field index ('.fdx') is positional: it holds one pointer per
    // document, looked up by document number. Every document therefore gets
    // an entry, even when it stores no fields (its record is just a zero
    // count). Skipping such documents, as was done before, shifted the
    // entries of all later documents.
    if (!isset($this->_fdxFile)) {
        $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
        $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

        $this->_files[] = $this->_name . '.fdx';
        $this->_files[] = $this->_name . '.fdt';
    }

    $this->_fdxFile->writeLong($this->_fdtFile->tell());
    $this->_fdtFile->writeVInt(count($storedFields));

    foreach ($storedFields as $field) {
        $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);

        $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
                     ($field->isBinary    ? 0x02 : 0x00) |
                     0x00; /* 0x04 - third bit, compressed (ZLIB) */
        $this->_fdtFile->writeByte($fieldBits);

        if ($field->isBinary) {
            // Binary values are length-prefixed raw bytes.
            $this->_fdtFile->writeVInt(strlen($field->stringValue));
            $this->_fdtFile->writeBytes($field->stringValue);
        } else {
            $this->_fdtFile->writeString($field->stringValue);
        }
    }

    $this->_docCount++;
}
/**
 * Dump Field Info (.fnm) segment file.
 *
 * Writes the number of fields followed by one (name, flag byte) record per
 * field. For every indexed field it additionally writes a norm file named
 * '<segment>.f<fieldNumber>' holding one encoded norm byte per document;
 * documents that have no tokens for the field are treated as length 0.
 * All created file names are appended to the compound-file list.
 */
private function _dumpFNM()
{
    $fieldInfoFile = $this->_directory->createFile($this->_name . '.fnm');
    $fieldInfoFile->writeVInt(count($this->_fields));

    foreach ($this->_fields as $fieldInfo) {
        $fieldInfoFile->writeString($fieldInfo->name);

        $flags = $fieldInfo->isIndexed ? 0x01 : 0x00;
        if ($fieldInfo->storeTermVector) {
            $flags |= 0x02;
        }
        // not supported yet: 0x04 (term positions are stored with the term vectors)
        // not supported yet: 0x08 (term offsets are stored with the term vectors)
        $fieldInfoFile->writeByte($flags);

        if (!$fieldInfo->isIndexed) {
            // Only indexed fields carry norms.
            continue;
        }

        $fieldName  = $fieldInfo->name;
        $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();

        // One norm byte per document, in document order.
        $normBytes = '';
        $docId     = 0;
        while ($docId < $this->_docCount) {
            $tokenCount = 0;
            if (isset($this->_fieldLengths[$fieldName][$docId])) {
                $tokenCount = $this->_fieldLengths[$fieldName][$docId];
            }

            $normBytes .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, $tokenCount)));
            $docId++;
        }

        $normFileName = $this->_name . '.f' . $this->_fields[$fieldName]->number;
        $normFile     = $this->_directory->createFile($normFileName);
        $normFile->writeBytes($normBytes);

        $this->_files[] = $normFileName;
    }

    $this->_files[] = $this->_name . '.fnm';
}
/**
 * Dump Term Dictionary segment file entry.
 * Used to write entry to .tis or .tii files
 *
 * The term text is prefix-compressed against the previously written term of
 * the same field, and the frequency / position pointers are written as
 * deltas against the previously written term info. Both "previous" arguments
 * are passed by reference and are updated to the entry just written, so the
 * caller only has to keep the two variables alive between calls.
 *
 * @param Zend_Search_Lucene_Storage_File $dicFile
 * @param Zend_Search_Lucene_Index_Term|null $prevTerm          in/out: last term written to $dicFile
 * @param Zend_Search_Lucene_Index_Term $term                   term to write ($term->field is the field number)
 * @param Zend_Search_Lucene_Index_TermInfo|null $prevTermInfo  in/out: last term info written to $dicFile
 * @param Zend_Search_Lucene_Index_TermInfo $termInfo
 */
private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                    &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                    &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
{
    // Length of the prefix shared with the previous term. Terms of different
    // fields never share a prefix.
    $prefixLength = 0;
    if (isset($prevTerm) && $prevTerm->field == $term->field) {
        $limit = min(strlen($prevTerm->text), strlen($term->text));

        // Square-bracket offsets: the "$str{$i}" form used here before is a
        // parse error on PHP 8.
        while ($prefixLength < $limit &&
               $prevTerm->text[$prefixLength] === $term->text[$prefixLength]) {
            $prefixLength++;
        }
    }

    // Write prefix length
    $dicFile->writeVInt($prefixLength);
    // Write suffix. The cast covers older PHP versions where substr() returns
    // false when the whole term is the shared prefix.
    $suffix = ($prefixLength > 0) ? (string) substr($term->text, $prefixLength)
                                  : $term->text;
    $dicFile->writeString($suffix);

    // Write field number
    $dicFile->writeVInt($term->field);

    // DocFreq (the count of documents which contain the term)
    $dicFile->writeVInt($termInfo->docFreq);

    $prevTerm = $term;

    if (!isset($prevTermInfo)) {
        // First entry: pointers are written as absolute values.
        $dicFile->writeVInt($termInfo->freqPointer);
        $dicFile->writeVInt($termInfo->proxPointer);
    } else {
        // Write FreqDelta
        $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
        // Write ProxDelta
        $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
    }

    // Write SkipOffset - only present when the caller computed a non-zero one
    if ($termInfo->skipOffset != 0) {
        $dicFile->writeVInt($termInfo->skipOffset);
    }

    $prevTermInfo = $termInfo;
}
/**
 * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
 *
 * Terms are written in the order of their sorted keys. For each term the
 * document/frequency postings go to the '.frq' file, the position deltas to
 * the '.prx' file and a dictionary entry to the '.tis' file. Every
 * $indexInterval-th term is also written to the '.tii' file together with
 * the distance (in bytes of the '.tis' file) from the previous index entry.
 * All four file names are appended to the compound-file list.
 */
private function _dumpDictionary()
{
    $termKeys = array_keys($this->_termDictionary);
    sort($termKeys, SORT_STRING);

    $termTotal = count($termKeys);

    $tisFile = $this->_directory->createFile($this->_name . '.tis');
    $tisFile->writeInt((int)0xFFFFFFFE);
    $tisFile->writeLong($termTotal);
    $tisFile->writeInt(self::$indexInterval);
    $tisFile->writeInt(self::$skipInterval);

    // Number of entries in the index file: the special header entry written
    // below plus one entry for every $indexInterval-th term (terms number
    // interval, 2*interval, ... - see the "$termCount % interval" test in the
    // loop). The former ceil((n + 2) / interval) announced one entry too
    // many whenever n % interval == interval - 1, and handed a float to
    // writeLong().
    $indexEntries = ((int) floor($termTotal / self::$indexInterval)) + 1;

    $tiiFile = $this->_directory->createFile($this->_name . '.tii');
    $tiiFile->writeInt((int)0xFFFFFFFE);
    $tiiFile->writeLong($indexEntries);
    $tiiFile->writeInt(self::$indexInterval);
    $tiiFile->writeInt(self::$skipInterval);

    /** Dump dictionary header */
    $tiiFile->writeVInt(0);                    // prefix length
    $tiiFile->writeString('');                 // suffix
    // NOTE(review): these five bytes (FF FF FF FF 0F) look like the VInt
    // encoding of -1 used as the field number of the header entry - confirm.
    $tiiFile->writeInt((int)0xFFFFFFFF);       // field number
    $tiiFile->writeByte((int)0x0F);
    $tiiFile->writeVInt(0);                    // DocFreq
    $tiiFile->writeVInt(0);                    // FreqDelta
    $tiiFile->writeVInt(0);                    // ProxDelta
    // 20 matches the four header values written to the .tis file above,
    // assuming writeInt() emits 4 bytes and writeLong() 8 - TODO confirm.
    $tiiFile->writeVInt(20);                   // IndexDelta

    $frqFile = $this->_directory->createFile($this->_name . '.frq');
    $prxFile = $this->_directory->createFile($this->_name . '.prx');

    $termCount = 1;

    $prevTerm     = null;
    $prevTermInfo = null;

    $prevIndexTerm     = null;
    $prevIndexTermInfo = null;
    $prevIndexPosition = 20;

    foreach ($termKeys as $termId) {
        $freqPointer = $frqFile->tell();
        $proxPointer = $prxFile->tell();

        $prevDoc = 0;
        foreach ($this->_termDocs[$termId] as $docId => $termPositions) {
            // The doc delta is shifted left by one bit; the low bit set means
            // "frequency is exactly one", so no separate frequency follows.
            $docDelta = ($docId - $prevDoc)*2;
            $prevDoc  = $docId;

            if (count($termPositions) > 1) {
                $frqFile->writeVInt($docDelta);
                $frqFile->writeVInt(count($termPositions));
            } else {
                $frqFile->writeVInt($docDelta + 1);
            }

            // Positions are stored as deltas within the document.
            $prevPosition = 0;
            foreach ($termPositions as $position) {
                $prxFile->writeVInt($position - $prevPosition);
                $prevPosition = $position;
            }
        }

        if (count($this->_termDocs[$termId]) >= self::$skipInterval) {
            /**
             * @todo Write Skip Data to a freq file.
             * It's not used now, but make index more optimal
             */
            $skipOffset = $frqFile->tell() - $freqPointer;
        } else {
            $skipOffset = 0;
        }

        // The in-memory dictionary keeps field names; on disk the field is
        // referenced by its number.
        $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text,
                                                  $this->_fields[$this->_termDictionary[$termId]->field]->number);
        $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]),
                                                          $freqPointer, $proxPointer, $skipOffset);

        $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo);

        if ($termCount % self::$indexInterval == 0) {
            $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo);

            $indexPosition = $tisFile->tell();
            $tiiFile->writeVInt($indexPosition - $prevIndexPosition);
            $prevIndexPosition = $indexPosition;
        }

        $termCount++;
    }

    $this->_files[] = $this->_name . '.tis';
    $this->_files[] = $this->_name . '.tii';
    $this->_files[] = $this->_name . '.frq';
    $this->_files[] = $this->_name . '.prx';
}
/**
 * Generate compound index file
 *
 * Packs every file recorded in $_files into '<segment>.cfs'. The compound
 * file starts with the file count and a directory of (data offset, file name)
 * records. The offsets are first written as placeholders and patched once
 * the position of each file's data is known. Each source file is deleted
 * from the directory after it has been copied.
 */
private function _generateCFS()
{
    $directory = $this->_directory;

    $compound = $directory->createFile($this->_name . '.cfs');
    $compound->writeVInt(count($this->_files));

    // Directory section: remember where each offset lives, write a dummy.
    $offsetSlots = array();
    foreach ($this->_files as $memberName) {
        $offsetSlots[$memberName] = $compound->tell();
        $compound->writeLong(0); // placeholder, patched below
        $compound->writeString($memberName);
    }

    // Data section: patch the offset, then append the file contents.
    foreach ($this->_files as $memberName) {
        $dataStart = $compound->tell();

        $compound->seek($offsetSlots[$memberName]);
        $compound->writeLong($dataStart);
        $compound->seek($dataStart); // back to the end of the file

        $member   = $directory->getFileObject($memberName);
        $contents = $member->readBytes($directory->fileLength($memberName));
        $compound->writeBytes($contents);

        $directory->deleteFile($memberName);
    }
}
/**
 * Close segment, write it to disk and return segment info
 *
 * Writes the field info, the term dictionary with its postings and finally
 * the compound file. A segment that received no documents writes nothing.
 *
 * @return Zend_Search_Lucene_Index_SegmentInfo|null  null when the segment is empty
 */
public function close()
{
    if ($this->_docCount != 0) {
        $this->_dumpFNM();
        $this->_dumpDictionary();
        $this->_generateCFS();

        return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
                                                        $this->_docCount,
                                                        $this->_directory);
    }

    return null;
}
}

View File

@ -0,0 +1,72 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
 * A Term represents a word from text. This is the unit of search. It is
 * composed of two elements, the text of the word, as a string, and the name of
 * the field that the text occurred in.
 *
 * Note that terms may represent more than words from text fields, but also
 * things like dates, email addresses, urls, etc.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_Term
{
    /**
     * Field name or field number (depending on context)
     *
     * @var mixed
     */
    public $field;

    /**
     * Term value
     *
     * @var string
     */
    public $text;

    /**
     * Create a term.
     *
     * @param string $text   Term value
     * @param mixed  $field  Field name or number; defaults to 'contents'
     */
    public function __construct( $text, $field = 'contents' )
    {
        $this->text  = $text;
        $this->field = $field;
    }

    /**
     * Build the lookup key of the term: the field and the text joined by a
     * NUL byte.
     *
     * @return string
     */
    public function key()
    {
        return implode("\0", array($this->field, $this->text));
    }
}

View File

@ -0,0 +1,79 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
 * A Zend_Search_Lucene_Index_TermInfo is a plain record of the information
 * stored for a term in the term dictionary.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_TermInfo
{
    /**
     * The number of documents which contain the term.
     *
     * @var integer
     */
    public $docFreq;

    /**
     * Data offset in a Frequencies file.
     *
     * @var integer
     */
    public $freqPointer;

    /**
     * Data offset in a Positions file.
     *
     * @var integer
     */
    public $proxPointer;

    /**
     * SkipData offset in a Frequencies file.
     *
     * @var integer
     */
    public $skipOffset;

    /**
     * Term offset of the _next_ term in a TermDictionary file.
     * Used only for Term Index
     *
     * @var integer
     */
    public $indexPointer;

    /**
     * Fill the record; every argument is stored unchanged in the property of
     * the same name.
     *
     * @param integer      $docFreq
     * @param integer      $freqPointer
     * @param integer      $proxPointer
     * @param integer      $skipOffset
     * @param integer|null $indexPointer  only meaningful for Term Index entries
     */
    public function __construct($docFreq, $freqPointer, $proxPointer, $skipOffset, $indexPointer = null)
    {
        $this->indexPointer = $indexPointer;
        $this->skipOffset   = $skipOffset;
        $this->proxPointer  = $proxPointer;
        $this->freqPointer  = $freqPointer;
        $this->docFreq      = $docFreq;
    }
}

View File

@ -0,0 +1,331 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_SegmentWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_Writer
{
/**
* @todo Implement segment merger
* @todo Implement mergeFactor, minMergeDocs, maxMergeDocs usage.
* @todo Implement Analyzer substitution
* @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
* temporary index files
* @todo Directory lock processing
*/
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory
*/
private $_directory = null;
/**
* Index version
* Counts how often the index has been changed by adding or deleting docs
*
* @var integer
*/
private $_version;
/**
* Segment name counter.
* Used to name new segments.
*
* @var integer
*/
private $_segmentNameCounter;
/**
* Number of the segments in the index
*
* @var integer
*/
private $_segments;
/**
* Determines how often segment indices
* are merged by addDocument().
*
* @var integer
*/
public $mergeFactor;
/**
* Determines the minimal number of documents required before
* the buffered in-memory documents are merging and a new Segment
* is created.
*
* @var integer
*/
public $minMergeDocs;
/**
* Determines the largest number of documents ever merged by addDocument().
*
* @var integer
*/
public $maxMergeDocs;
/**
* List of the segments, created by index writer
* Array of Zend_Search_Lucene_Index_SegmentInfo objects
*
* @var array
*/
private $_newSegments;
/**
* Current segment to add documents
*
* @var Zend_Search_Lucene_Index_SegmentWriter
*/
private $_currentSegment;
/**
* List of indexfiles extensions
*
* @var array
*/
private static $_indexExtensions = array('.cfs' => '.cfs',
'.fnm' => '.fnm',
'.fdx' => '.fdx',
'.fdt' => '.fdt',
'.tis' => '.tis',
'.tii' => '.tii',
'.frq' => '.frq',
'.prx' => '.prx',
'.tvx' => '.tvx',
'.tvd' => '.tvd',
'.tvf' => '.tvf',
'.del' => '.del' );
/**
 * Opens the index for writing
 *
 * With $create set, every index file found in the directory is deleted and
 * empty 'segments' and 'deletable' files are written. Otherwise the header
 * of the existing 'segments' file (format marker, version, segment name
 * counter, segment count) is read into the writer.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory  directory object holding the index
 * @param boolean $create  true to create the index or overwrite the existing one
 * @throws Zend_Search_Lucene_Exception if an existing 'segments' file has an unexpected format marker
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $create = false)
{
    $this->_directory = $directory;

    if (!$create) {
        // Open an existing index: read the header of the segments file.
        $segmentsFile = $this->_directory->getFileObject('segments');

        $format = $segmentsFile->readInt();
        if ($format != (int)0xFFFFFFFF) {
            throw new Zend_Search_Lucene_Exception('Wrong segments file format');
        }

        $this->_version            = $segmentsFile->readLong(); // version
        $this->_segmentNameCounter = $segmentsFile->readInt();  // name counter
        $this->_segments           = $segmentsFile->readInt();  // segment counter
    } else {
        // Start from scratch: drop anything that looks like an index file.
        foreach ($this->_directory->fileList() as $fileName) {
            $isIndexFile = $fileName == 'deletable'
                        || $fileName == 'segments'
                        || isset(self::$_indexExtensions[ substr($fileName, strlen($fileName)-4)])
                        || preg_match('/\.f\d+$/i', $fileName); /* <segment_name>.f<decimal_number> norm files */

            if ($isIndexFile) {
                $this->_directory->deleteFile($fileName);
            }
        }

        $segmentsFile = $this->_directory->createFile('segments');
        $segmentsFile->writeInt((int)0xFFFFFFFF); // format marker
        $segmentsFile->writeLong(0);              // version
        $segmentsFile->writeInt(0);               // name counter
        $segmentsFile->writeInt(0);               // segment counter

        $deletableFile = $this->_directory->createFile('deletable');
        $deletableFile->writeInt(0);              // counter

        $this->_version            = 0;
        $this->_segmentNameCounter = 0;
        $this->_segments           = 0;
    }

    $this->_newSegments    = array();
    $this->_currentSegment = null;
}
/**
 * Adds a document to this index.
 *
 * The document goes into the current in-memory segment, which is started on
 * demand under a freshly generated segment name. The index version is
 * incremented for every added document.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    if ($this->_currentSegment === null) {
        $segmentName = $this->_newSegmentName();

        $this->_currentSegment =
            new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $segmentName);
    }

    $this->_currentSegment->addDocument($document);

    $this->_version++;
}
/**
 * Update segments file by adding the new segments to the list
 * @todo !!!!!Finish the implementation
 *
 * Writes 'segments.new' with a fresh header (format marker, version, name
 * counter, new segment count), copies the segment records of the current
 * 'segments' file behind it, appends one (name, document count) record per
 * new segment and finally renames 'segments.new' to 'segments'.
 *
 * @throws Zend_Search_Lucene_Exception
 */
private function _updateSegments()
{
    // Size of the header written by the constructor and below: the old file
    // is read from this offset on.
    $headerSize = 20;

    $segmentsFile   = $this->_directory->getFileObject('segments');
    $newSegmentFile = $this->_directory->createFile('segments.new');

    $newSegmentFile->writeInt((int)0xFFFFFFFF);
    $newSegmentFile->writeLong($this->_version);
    $newSegmentFile->writeInt($this->_segmentNameCounter);

    $this->_segments += count($this->_newSegments);
    $newSegmentFile->writeInt($this->_segments);

    // Carry over the records of the segments that already exist. A freshly
    // created index has nothing but the header, so there is nothing to copy;
    // skipping the copy avoids asking the file object for a zero-length read.
    $existingBytes = $this->_directory->fileLength('segments') - $headerSize;
    if ($existingBytes > 0) {
        $segmentsFile->seek($headerSize);
        $newSegmentFile->writeBytes($segmentsFile->readBytes($existingBytes));
    }

    foreach ($this->_newSegments as $segmentName => $segmentInfo) {
        $newSegmentFile->writeString($segmentName);
        $newSegmentFile->writeInt($segmentInfo->count());
    }

    $this->_directory->renameFile('segments.new', 'segments');
}
/**
 * Commit current changes
 *
 * Closes the segment that is currently being filled (if any), records it
 * under its name, rewrites the 'segments' file when there is at least one
 * new segment, and hands the list of new segments to the caller. The
 * writer's own list is emptied afterwards.
 *
 * @return array  new segments, keyed by segment name
 */
public function commit()
{
    $pending = $this->_currentSegment;

    if ($pending !== null) {
        $segmentInfo = $pending->close();

        // close() yields null for a segment without documents.
        if ($segmentInfo !== null) {
            $this->_newSegments[$segmentInfo->getName()] = $segmentInfo;
        }

        $this->_currentSegment = null;
    }

    if (count($this->_newSegments) != 0) {
        $this->_updateSegments();
    }

    $committed          = $this->_newSegments;
    $this->_newSegments = array();

    return $committed;
}
/**
 * Merges the provided indexes into this index.
 *
 * Not implemented yet: the body is empty, so calling this method currently
 * has no effect.
 *
 * @param array $readers
 * @return void
 */
public function addIndexes($readers)
{
    /**
     * @todo implementation
     */
}
/**
 * Returns the number of documents currently in this index.
 *
 * Not implemented yet: the body is empty, so the call currently yields null.
 *
 * @param mixed $readers  Unused. The parameter used to be mandatory although
 *                        nothing documents or reads it; it is optional now so
 *                        that the method can be called without arguments, as
 *                        its description implies.
 * @return integer  (once implemented)
 */
public function docCount($readers = null)
{
    /**
     * @todo implementation
     */
}
/**
 * Flushes all changes to an index and closes all associated files.
 *
 * Not implemented yet: the body is empty, so nothing is flushed or closed
 * by this call.
 */
public function close()
{
    /**
     * @todo implementation
     */
}
/**
 * Merges all segments together into a single segment, optimizing
 * an index for search.
 *
 * Not implemented yet: the body is empty, so calling this method currently
 * has no effect.
 *
 * @return void
 */
public function optimize()
{
    /**
     * @todo implementation
     */
}
/**
 * Get name for new segment
 *
 * The name is an underscore followed by the current value of the segment
 * name counter in base 36; the counter is advanced by one on every call.
 *
 * @return string
 */
private function _newSegmentName()
{
    $suffix = base_convert($this->_segmentNameCounter, 10, 36);
    $this->_segmentNameCounter++;

    return '_' . $suffix;
}
}

View File

@ -0,0 +1,100 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Query
{
    /**
     * Boost factor of the query; 1.0 unless changed through setBoost().
     *
     * @var float
     */
    private $_boost = 1.0;

    /**
     * Weight of the query, set by _initWeight().
     *
     * @var Zend_Search_Lucene_Search_Weight
     */
    protected $_weight;

    /**
     * Gets the boost for this clause. Documents matching
     * this clause will (in addition to the normal weightings) have their score
     * multiplied by boost. The boost is 1.0 by default.
     *
     * @return float
     */
    public function getBoost()
    {
        return $this->_boost;
    }

    /**
     * Sets the boost for this query clause to $boost.
     *
     * @param float $boost
     */
    public function setBoost($boost)
    {
        $this->_boost = $boost;
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param Zend_Search_Lucene $reader
     * @return float
     */
    abstract public function score($docId, $reader);

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    abstract protected function _createWeight($reader);

    /**
     * Constructs and initializes a Weight for the query.
     *
     * The weight created by _createWeight() is stored in $_weight and then
     * normalized with the query norm that the reader's similarity derives
     * from the weight's sum of squared weights.
     *
     * @param Zend_Search_Lucene $reader
     */
    protected function _initWeight($reader)
    {
        $weight        = $this->_createWeight($reader);
        $this->_weight = $weight;

        $sumOfSquares = $weight->sumOfSquaredWeights();
        $similarity   = $reader->getSimilarity();

        $weight->normalize($similarity->queryNorm($sumOfSquares));
    }
}

View File

@ -0,0 +1,439 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query
{
/**
* Terms to find.
* Array of Zend_Search_Lucene_Index_Term
*
* @var array
*/
private $_terms = array();
/**
* Term signs.
* If true then term is required.
* If false then term is prohibited.
* If null then term is neither prohibited, nor required
*
* If array is null then all terms are required
*
* @var array
*/
private $_signs = array();
/**
* Result vector.
* Bitset or array of document IDs
* (depending from Bitset extension availability).
*
* @var mixed
*/
private $_resVector = null;
/**
* Terms positions vectors.
* Array of Arrays:
* term1Id => (docId => array( pos1, pos2, ... ), ...)
* term2Id => (docId => array( pos1, pos2, ... ), ...)
*
* @var array
*/
private $_termsPositions = array();
/**
* A score factor based on the fraction of all query terms
* that a document contains.
* float for conjunction queries
* array of float for non conjunction queries
*
* @var mixed
*/
private $_coord = null;
/**
* Terms weights
* array of Zend_Search_Lucene_Search_Weight
*
* @var array
*/
private $_weights = array();
/**
 * Class constructor. Create a new multi-term query object.
 *
 * When $terms is not an array nothing is initialised and the query starts
 * empty. Otherwise the terms are taken over and the signs are normalised:
 * $_signs stays null (meaning "every term is required") unless $signs is an
 * array containing at least one entry that is not exactly true, in which
 * case the $signs array is stored as given.
 *
 * @param array $terms    Array of Zend_Search_Lucene_Index_Term objects
 * @param array $signs    Array of signs.  Sign is boolean|null.
 * @return void
 */
public function __construct($terms = null, $signs = null)
{
    /**
     * @todo Check contents of $terms and $signs before adding them.
     */
    if (!is_array($terms)) {
        return;
    }

    $this->_terms = $terms;
    $this->_signs = null;

    // Check if all terms are required
    if (is_array($signs)) {
        foreach ($signs as $sign) {
            if ($sign !== true) {
                $this->_signs = $signs;
                // One non-required term settles it. This used to be
                // "continue", which kept scanning the rest for nothing.
                break;
            }
        }
    }
}
/**
 * Add a $term (Zend_Search_Lucene_Index_Term) to this query.
 *
 * The sign is specified as:
 *     TRUE  - term is required
 *     FALSE - term is prohibited
 *     NULL  - term is neither prohibited, nor required
 *
 * A null $_signs array means "all terms are required". It is kept that way
 * as long as only required terms are added; the first term with another sign
 * turns it into an explicit array in which the terms added so far are marked
 * as required.
 *
 * @param  Zend_Search_Lucene_Index_Term $term
 * @param  boolean|null $sign
 * @return void
 */
public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign=null) {
    /**
     * @todo This is not good.  Sometimes $this->_signs is an array, sometimes
     * it is null, even when there are terms.  It will be changed so that
     * it is always an array.
     */
    if ($this->_signs === null && $sign !== true) {
        // Make the implicit "all required" explicit for the terms already
        // present. The earlier code ran this back-fill after the new term had
        // been appended (one sign too many, so the new term's own sign landed
        // on an index no term uses) and filled in null, silently turning the
        // required terms into optional ones.
        $this->_signs = array();
        foreach ($this->_terms as $termId => $existingTerm) {
            $this->_signs[$termId] = true;
        }
    }

    if ($this->_signs !== null) {
        $this->_signs[] = $sign;
    }

    // Appended last so that the term and its sign end up under the same index.
    $this->_terms[] = $term;
}
/**
 * Returns the terms of this query.
 *
 * @return array  Array of Zend_Search_Lucene_Index_Term objects, as passed to
 *                the constructor and/or appended by addTerm()
 */
public function getTerms()
{
return $this->_terms;
}
/**
 * Return terms signs
 *
 * Each sign is true (required), false (prohibited) or null (neither).
 *
 * @return array|null  null means that all terms are required
 */
public function getSigns()
{
return $this->_signs;
}
/**
 * Set weight for specified term
 *
 * The weight is stored under the term's index; the score calculators read it
 * back with the same index and call getValue() on it.
 * NOTE(review): presumably called by the weight object created in
 * _createWeight() - verify against Zend_Search_Lucene_Search_Weight_MultiTerm.
 *
 * @param integer $num  index of the term in the terms array
 * @param Zend_Search_Lucene_Search_Weight_Term $weight
 */
public function setWeight($num, $weight)
{
$this->_weights[$num] = $weight;
}
/**
 * Constructs an appropriate Weight implementation for this query:
 * a Zend_Search_Lucene_Search_Weight_MultiTerm built from this query and
 * the given reader.
 *
 * @param Zend_Search_Lucene $reader
 * @return Zend_Search_Lucene_Search_Weight
 */
protected function _createWeight($reader)
{
return new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader);
}
/**
 * Calculate result vector for Conjunction query
 * (like '+something +another')
 *
 * The result vector becomes the intersection of the document sets of all
 * terms: a bitset when the 'bitset' extension is loaded, otherwise an array
 * keyed by document id. The positions of every term are collected along the
 * way for the score calculation.
 *
 * @param Zend_Search_Lucene $reader
 */
private function _calculateConjunctionResult($reader)
{
    $useBitset = extension_loaded('bitset');

    foreach ($this->_terms as $termId => $term) {
        if ($useBitset) {
            if ($this->_resVector === null) {
                $this->_resVector = bitset_from_array($reader->termDocs($term));
            } else {
                $this->_resVector = bitset_intersection(
                    $this->_resVector,
                    bitset_from_array($reader->termDocs($term))
                );
            }
        } else {
            // Document ids become array keys so that membership is an isset().
            $docsOfTerm = array_flip($reader->termDocs($term));

            $this->_resVector = ($this->_resVector === null)
                              ? $docsOfTerm
                              : array_intersect_key($this->_resVector, $docsOfTerm);
        }

        $this->_termsPositions[$termId] = $reader->termPositions($term);
    }
}
/**
 * Calculate result vector for non Conjunction query
 * (like '+something -another')
 *
 * Three document sets are accumulated from the terms: the intersection of
 * all required terms, the union of all prohibited terms and the union of all
 * terms that are neither. The result is the required set (or, when no term
 * is required, the "neither" set) minus the prohibited set. Sets are bitsets
 * when the 'bitset' extension is loaded, otherwise arrays keyed by document
 * id. The positions of every term are collected for the score calculation.
 *
 * @param Zend_Search_Lucene $reader
 */
private function _calculateNonConjunctionResult($reader)
{
    $useBitset = extension_loaded('bitset');

    $mustMatch = null;
    if ($useBitset) {
        $mayMatch     = bitset_empty();
        $mustNotMatch = bitset_empty();
    } else {
        $mayMatch     = array();
        $mustNotMatch = array();
    }

    foreach ($this->_terms as $termId => $term) {
        $docsOfTerm = $useBitset ? bitset_from_array($reader->termDocs($term))
                                 : array_flip($reader->termDocs($term));

        if ($this->_signs[$termId] === true) {
            // required: intersect
            if ($mustMatch === null) {
                $mustMatch = $docsOfTerm;
            } elseif ($useBitset) {
                $mustMatch = bitset_intersection($mustMatch, $docsOfTerm);
            } else {
                $mustMatch = array_intersect_key($mustMatch, $docsOfTerm);
            }
        } elseif ($this->_signs[$termId] === false) {
            // prohibited: union
            if ($useBitset) {
                $mustNotMatch = bitset_union($mustNotMatch, $docsOfTerm);
            } else {
                foreach ($docsOfTerm as $docKey => $docValue) {
                    $mustNotMatch[$docKey] = $docValue;
                }
            }
        } else {
            // neither required, nor prohibited: union
            if ($useBitset) {
                $mayMatch = bitset_union($mayMatch, $docsOfTerm);
            } else {
                foreach ($docsOfTerm as $docKey => $docValue) {
                    $mayMatch[$docKey] = $docValue;
                }
            }
        }

        $this->_termsPositions[$termId] = $reader->termPositions($term);
    }

    // Without any required term the optional terms decide what matches.
    if ($mustMatch === null) {
        $mustMatch = $mayMatch;
    }

    if ($useBitset) {
        $this->_resVector = bitset_intersection( $mustMatch,
                                                 bitset_invert($mustNotMatch, $reader->count()) );
    } else {
        $this->_resVector = array_diff_key($mustMatch, $mustNotMatch);
    }
}
/**
 * Score calculator for conjunction queries (all terms are required)
 *
 * The score is the sum over all terms of
 * tf(occurrences in the document) * term weight * field norm,
 * multiplied by the coord factor for "all terms matched", which is computed
 * once and cached.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene $reader
 * @return float
 */
public function _conjunctionScore($docId, $reader)
{
    if ($this->_coord === null) {
        $this->_coord = $reader->getSimilarity()->coord(count($this->_terms),
                                                        count($this->_terms) );
    }

    $total = 0.0;

    foreach ($this->_terms as $termId => $term) {
        $termFrequency = $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) );
        $termWeight    = $this->_weights[$termId]->getValue();
        $fieldNorm     = $reader->norm($docId, $term->field);

        $total += $termFrequency * $termWeight * $fieldNorm;
    }

    return $total * $this->_coord;
}
/**
 * Score calculator for non conjunction queries (not all terms are required)
 *
 * On first use a table of coord factors is built, indexed by the number of
 * matched terms (0 .. number of non-prohibited terms). The score is the sum
 * of tf * term weight * field norm over the non-prohibited terms that occur
 * in the document, multiplied by the coord factor for the number of such
 * terms.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene $reader
 * @return float
 */
public function _nonConjunctionScore($docId, $reader)
{
    if ($this->_coord === null) {
        $this->_coord = array();

        // Prohibited terms can never count as matched.
        $maxCoord = 0;
        foreach ($this->_signs as $sign) {
            if ($sign === false) {
                continue;
            }
            $maxCoord++;
        }

        foreach (range(0, $maxCoord) as $matched) {
            $this->_coord[$matched] = $reader->getSimilarity()->coord($matched, $maxCoord);
        }
    }

    $total        = 0.0;
    $matchedTerms = 0;

    foreach ($this->_terms as $termId => $term) {
        if ($this->_signs[$termId] === false) {
            continue; // prohibited
        }
        if (!isset($this->_termsPositions[$termId][$docId])) {
            continue; // term does not occur in the document
        }

        $matchedTerms++;

        $termFrequency = $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) );
        $termWeight    = $this->_weights[$termId]->getValue();
        $fieldNorm     = $reader->norm($docId, $term->field);

        $total += $termFrequency * $termWeight * $fieldNorm;
    }

    return $total * $this->_coord[$matchedTerms];
}
/**
 * Score the specified document against this query.
 *
 * On the first call the result vector is calculated and the weights are
 * initialised. Documents that are not in the result vector score 0.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene $reader
 * @return float
 */
public function score($docId, $reader)
{
    if ($this->_resVector === null) {
        // A null sign list means that every term is required (conjunction).
        if ($this->_signs === null) {
            $this->_calculateConjunctionResult($reader);
        } else {
            $this->_calculateNonConjunctionResult($reader);
        }

        $this->_initWeight($reader);
    }

    // The result vector is a bitset when the bitset extension is loaded,
    // otherwise an array keyed by document id.
    if (extension_loaded('bitset')) {
        $isHit = bitset_in($this->_resVector, $docId);
    } else {
        $isHit = isset($this->_resVector[$docId]);
    }

    if (!$isHit) {
        return 0;
    }

    if ($this->_signs === null) {
        return $this->_conjunctionScore($docId, $reader);
    }

    return $this->_nonConjunctionScore($docId, $reader);
}
}

View File

@ -0,0 +1,426 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Zend_Search_Lucene_Search_Query
*/
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* Zend_Search_Lucene_Search_Weight_Phrase
*/
require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php';
/**
* A Query that matches documents containing a particular sequence of terms.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query
{
    /**
     * Terms to find.
     * Array of Zend_Search_Lucene_Index_Term objects.
     *
     * @var array
     */
    private $_terms;

    /**
     * Term positions (relative positions of terms within the phrase).
     * Array of integers
     *
     * @var array
     */
    private $_offsets;

    /**
     * Sets the number of other words permitted between words in query phrase.
     * If zero, then this is an exact phrase search.  For larger values this works
     * like a WITHIN or NEAR operator.
     *
     * The slop is in fact an edit-distance, where the units correspond to
     * moves of terms in the query phrase out of position.  For example, to switch
     * the order of two words requires two moves (the first move places the words
     * atop one another), so to permit re-orderings of phrases, the slop must be
     * at least two.
     * More exact matches are scored higher than sloppier matches, thus search
     * results are sorted by exactness.
     *
     * The slop is zero by default, requiring exact matches.
     *
     * @var integer
     */
    private $_slop;

    /**
     * Result vector.
     * Bitset or array of document IDs
     * (depending on Bitset extension availability).
     *
     * @var mixed
     */
    private $_resVector = null;

    /**
     * Terms positions vectors.
     * Array of Arrays:
     * term1Id => (docId => array( pos1, pos2, ... ), ...)
     * term2Id => (docId => array( pos1, pos2, ... ), ...)
     *
     * @var array
     */
    private $_termsPositions = array();

    /**
     * Class constructor.  Create a new phrase query.
     *
     * When $offsets is omitted, the terms are assigned consecutive
     * positions 0, 1, 2, ... in iteration order.
     *
     * @param array  $terms    Terms to search. Array of strings.
     * @param array  $offsets  Relative term positions. Array of integers.
     * @param string $field    Field to search.
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct($terms = null, $offsets = null, $field = null)
    {
        $this->_slop = 0;

        if (is_array($terms)) {
            $this->_terms = array();
            foreach ($terms as $termId => $termText) {
                $this->_terms[$termId] = ($field !== null)
                    ? new Zend_Search_Lucene_Index_Term($termText, $field)
                    : new Zend_Search_Lucene_Index_Term($termText);
            }
        } else if ($terms === null) {
            $this->_terms = array();
        } else {
            throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null');
        }

        if (is_array($offsets)) {
            if (count($this->_terms) != count($offsets)) {
                throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.');
            }
            $this->_offsets = $offsets;
        } else if ($offsets === null) {
            $this->_offsets = array();
            foreach ($this->_terms as $termId => $term) {
                // next free position == number of offsets assigned so far
                $position = count($this->_offsets);
                $this->_offsets[$termId] = $position;
            }
        } else {
            throw new Zend_Search_Lucene_Exception('offsets argument must be array of integers or null');
        }
    }

    /**
     * Set slop
     *
     * @param integer $slop
     */
    public function setSlop($slop)
    {
        $this->_slop = $slop;
    }

    /**
     * Get slop
     *
     * @return integer
     */
    public function getSlop()
    {
        return $this->_slop;
    }

    /**
     * Adds a term to the end of the query phrase.
     * The relative position of the term is specified explicitly or the one immediately
     * after the last term added.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $position
     * @throws Zend_Search_Lucene_Exception if the term's field differs from
     *         the field of the last term already in the phrase
     */
    public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null) {
        if ((count($this->_terms) != 0)&&(end($this->_terms)->field != $term->field)) {
            throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' .
                                                   $term->field . ':' . $term->text);
        }

        $this->_terms[] = $term;
        if ($position !== null) {
            $this->_offsets[] = $position;
        } else if (count($this->_offsets) != 0) {
            $this->_offsets[] = end($this->_offsets) + 1;
        } else {
            $this->_offsets[] = 0;
        }
    }

    /**
     * Returns query terms
     *
     * @return array
     */
    public function getTerms()
    {
        return $this->_terms;
    }

    /**
     * Set weight for specified term
     *
     * NOTE(review): $_weights is not declared in this class; presumably it
     * is provided by the parent class - confirm.
     *
     * @param integer $num
     * @param Zend_Search_Lucene_Search_Weight_Term $weight
     */
    public function setWeight($num, $weight)
    {
        $this->_weights[$num] = $weight;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    protected function _createWeight($reader)
    {
        return new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader);
    }

    /**
     * Calculate result vector
     *
     * The result vector is the intersection of the document sets of all
     * terms. Term positions are loaded for every term as a side effect.
     *
     * @param Zend_Search_Lucene $reader
     */
    private function _calculateResult($reader)
    {
        if (extension_loaded('bitset')) {
            foreach( $this->_terms as $termId=>$term ) {
                if($this->_resVector === null) {
                    $this->_resVector = bitset_from_array($reader->termDocs($term));
                } else {
                    $this->_resVector = bitset_intersection(
                                $this->_resVector,
                                bitset_from_array($reader->termDocs($term)) );
                }
                $this->_termsPositions[$termId] = $reader->termPositions($term);
            }
        } else {
            foreach( $this->_terms as $termId=>$term ) {
                if($this->_resVector === null) {
                    // array keyed by document id, so isset() can test membership
                    $this->_resVector = array_flip($reader->termDocs($term));
                } else {
                    // substitute for bitset_intersection
                    $termDocs = array_flip($reader->termDocs($term));
                    foreach($this->_resVector as $key=>$value) {
                        if (!isset( $termDocs[$key] )) {
                            unset( $this->_resVector[$key] );
                        }
                    }
                }
                $this->_termsPositions[$termId] = $reader->termPositions($term);
            }
        }
    }

    /**
     * Frequency calculator for exact phrase queries (terms sequence is fixed)
     *
     * Counts the positions of the rarest term at which every other term
     * is found at its expected relative offset.
     *
     * @param integer $docId
     * @return integer
     */
    public function _exactPhraseFreq($docId)
    {
        $freq = 0;

        // Term Id with lowest cardinality
        $lowCardTermId = null;

        // Calculate $lowCardTermId
        foreach ($this->_terms as $termId => $term) {
            if ($lowCardTermId === null ||
                count($this->_termsPositions[$termId][$docId]) <
                count($this->_termsPositions[$lowCardTermId][$docId]) ) {
                $lowCardTermId = $termId;
            }
        }

        // Walk through positions of the term with lowest cardinality
        foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) {
            // We expect phrase to be found
            $freq++;

            // Walk through other terms
            foreach ($this->_terms as $termId => $term) {
                if ($termId != $lowCardTermId) {
                    $expectedPosition = $lowCardPos +
                                            ($this->_offsets[$termId] -
                                             $this->_offsets[$lowCardTermId]);

                    if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
                        $freq--;  // Phrase wasn't found.
                        break;
                    }
                }
            }
        }

        return $freq;
    }

    /**
     * Frequency calculator for sloppy phrase queries (terms may be moved
     * out of position within the configured slop)
     *
     * First builds a queue of candidate phrases (termId => position maps),
     * then adds the similarity's sloppyFreq() for every candidate whose
     * minimal total displacement does not exceed the slop.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene $reader
     * @return float
     */
    public function _sloppyPhraseFreq($docId, Zend_Search_Lucene $reader)
    {
        $freq = 0;

        $phraseQueue = array();
        $phraseQueue[0] = array(); // empty phrase
        $lastTerm = null;

        // Walk through the terms to create phrases.
        foreach ($this->_terms as $termId => $term) {
            // Only candidates that existed before this term are extended.
            $queueSize = count($phraseQueue);
            $firstPass = true;

            // Walk through the term positions.
            // Each term position produces a set of phrases.
            foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) {
                if ($firstPass) {
                    // First position: extend the existing candidates in place.
                    for ($count = 0; $count < $queueSize; $count++) {
                        $phraseQueue[$count][$termId] = $termPosition;
                    }
                } else {
                    // Further positions: fork each existing candidate, unless
                    // this position is too far from the previous term's one.
                    for ($count = 0; $count < $queueSize; $count++) {
                        if ($lastTerm !== null &&
                            abs( $termPosition - $phraseQueue[$count][$lastTerm] -
                                 ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) {
                            continue;
                        }

                        $newPhraseId = count($phraseQueue);
                        $phraseQueue[$newPhraseId]          = $phraseQueue[$count];
                        $phraseQueue[$newPhraseId][$termId] = $termPosition;
                    }
                }

                $firstPass = false;
            }
            $lastTerm = $termId;
        }

        foreach ($phraseQueue as $phrasePos) {
            $minDistance = null;

            // Try every start shift within the slop and keep the smallest
            // total displacement of the terms from their expected positions.
            for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) {
                $distance = 0;
                $start = reset($phrasePos) - reset($this->_offsets) + $shift;

                foreach ($this->_terms as $termId => $term) {
                    $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start);

                    if($distance > $this->_slop) {
                        break;
                    }
                }

                if ($minDistance === null || $distance < $minDistance) {
                    $minDistance = $distance;
                }
            }

            if ($minDistance <= $this->_slop) {
                $freq += $reader->getSimilarity()->sloppyFreq($minDistance);
            }
        }

        return $freq;
    }

    /**
     * Score specified document
     *
     * Returns 0 when the query has no terms, when the document does not
     * contain all terms, or when the phrase itself does not occur in the
     * document.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene $reader
     * @return float
     */
    public function score($docId, $reader)
    {
        // optimize zero-term case
        if (count($this->_terms) == 0) {
            return 0;
        }

        if($this->_resVector === null) {
            $this->_calculateResult($reader);
            $this->_initWeight($reader);
        }

        $containsAllTerms = extension_loaded('bitset')
            ? bitset_in($this->_resVector, $docId)
            : isset($this->_resVector[$docId]);

        if (!$containsAllTerms) {
            return 0;
        }

        if ($this->_slop == 0) {
            $freq = $this->_exactPhraseFreq($docId);
        } else {
            $freq = $this->_sloppyPhraseFreq($docId, $reader);
        }

        // All terms are present but never as the requested phrase: the
        // document does not match, so score it 0 instead of returning null.
        if ($freq == 0) {
            return 0;
        }

        $tf     = $reader->getSimilarity()->tf($freq);
        $weight = $this->_weight->getValue();
        $norm   = $reader->norm($docId, reset($this->_terms)->field);

        return $tf*$weight*$norm;
    }
}

View File

@ -0,0 +1,128 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_Term */
require_once 'Zend/Search/Lucene/Search/Weight/Term.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Query
{
    /**
     * The single term this query looks for.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * Sign of the term: true means the term is required,
     * false means the term is prohibited.
     *
     * @var bool
     */
    private $_sign;

    /**
     * Documents containing the term.
     * A bitset when the bitset extension is loaded, otherwise an array
     * keyed by document id.
     *
     * @var mixed
     */
    private $_docVector = null;

    /**
     * Positions of the term, per document.
     * Array: docId => array( pos1, pos2, ... )
     *
     * @var array
     */
    private $_termPositions;

    /**
     * Create a query for a single term.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param boolean $sign
     */
    public function __construct( $term, $sign = true )
    {
        $this->_sign = $sign;
        $this->_term = $term;
    }

    /**
     * Build the Weight object that belongs to this query.
     *
     * @param Zend_Search_Lucene $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    protected function _createWeight($reader)
    {
        $weight = new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader);

        return $weight;
    }

    /**
     * Score the specified document.
     *
     * The document vector, the term positions and the weight are loaded
     * on the first call. A document scores 0 unless the term is signed as
     * required and the document contains it.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene $reader
     * @return float
     */
    public function score( $docId, $reader )
    {
        if ($this->_docVector === null) {
            $docIds = $reader->termDocs($this->_term);
            $this->_docVector = extension_loaded('bitset')
                ? bitset_from_array($docIds)
                : array_flip($docIds);

            $this->_termPositions = $reader->termPositions($this->_term);
            $this->_initWeight($reader);
        }

        if (extension_loaded('bitset')) {
            $found = bitset_in($this->_docVector, $docId);
        } else {
            $found = isset($this->_docVector[$docId]);
        }

        if (!$this->_sign || !$found) {
            return 0;
        }

        $termFreq    = $reader->getSimilarity()->tf(count($this->_termPositions[$docId]));
        $weightValue = $this->_weight->getValue();
        $fieldNorm   = $reader->norm($docId, $this->_term->field);

        return $termFreq * $weightValue * $fieldNorm;
    }
}

View File

@ -0,0 +1,108 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryHit
{
    /**
     * Index that produced this hit.
     * @var Zend_Search_Lucene
     */
    protected $_index = null;

    /**
     * Document belonging to this hit; loaded on demand by getDocument().
     * @var Zend_Search_Lucene_Document
     */
    protected $_document = null;

    /**
     * Number of the document in the index
     * @var integer
     */
    public $id;

    /**
     * Score of the hit
     * @var float
     */
    public $score;

    /**
     * Constructor. Takes the Zend_Search_Lucene index that produced the
     * hit, so the document can later be fetched through the hit itself.
     *
     * @param Zend_Search_Lucene $index
     */
    public function __construct(Zend_Search_Lucene $index)
    {
        $this->_index = $index;
    }

    /**
     * Shortcut for reading a field value of the document that belongs
     * to this hit.
     *
     * @param string $offset  Field name
     * @return string
     */
    public function __get($offset)
    {
        $document = $this->getDocument();

        return $document->getFieldValue($offset);
    }

    /**
     * Return the document object for this hit. The document is fetched
     * from the index by its id on first access and kept afterwards.
     *
     * @return Zend_Search_Lucene_Document
     */
    public function getDocument()
    {
        if ($this->_document instanceof Zend_Search_Lucene_Document) {
            return $this->_document;
        }

        $this->_document = $this->_index->getDocument($this->id);

        return $this->_document;
    }

    /**
     * Return the index object for this hit
     *
     * @return Zend_Search_Lucene
     */
    public function getIndex()
    {
        return $this->_index;
    }
}

View File

@ -0,0 +1,142 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_QueryTokenizer */
require_once 'Zend/Search/Lucene/Search/QueryTokenizer.php';
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_Query_Term */
require_once 'Zend/Search/Lucene/Search/Query/Term.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/** Zend_Search_Lucene_Search_Query_Phrase */
require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryParser
{
    /**
     * Parses a query string, returning a Zend_Search_Lucene_Search_Query
     *
     * A query with a single word token becomes a Term query on the
     * 'contents' field. Otherwise every word token becomes a term
     * (in the field named by a preceding 'field:' token, or 'contents'),
     * signed by a preceding '+' (required) or '-' (prohibited).
     *
     * @param string $strQuery
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception on syntax errors
     */
    static public function parse($strQuery)
    {
        $tokens = new Zend_Search_Lucene_Search_QueryTokenizer($strQuery);

        // Empty query
        if (!$tokens->count()) {
            throw new Zend_Search_Lucene_Exception('Syntax error: query string cannot be empty.');
        }

        // Term query
        if ($tokens->count() == 1) {
            if ($tokens->current()->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) {
                return new Zend_Search_Lucene_Search_Query_Term(new Zend_Search_Lucene_Index_Term($tokens->current()->text, 'contents'));
            } else {
                throw new Zend_Search_Lucene_Exception('Syntax error: query string must contain at least one word.');
            }
        }

        /**
         * MultiTerm Query
         *
         * Process each token that was returned by the tokenizer.
         */
        $terms = array();
        $signs = array();
        $prevToken = null;
        $openBrackets = 0;
        $field = 'contents';

        foreach ($tokens as $token) {
            switch ($token->type) {
                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD:
                    $terms[] = new Zend_Search_Lucene_Index_Term($token->text, $field);
                    // a field prefix applies to the next word only
                    $field = 'contents';

                    if ($prevToken !== null &&
                        $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) {
                        if ($prevToken->text == "+") {
                            $signs[] = true;
                        } else {
                            $signs[] = false;
                        }
                    } else {
                        $signs[] = null;
                    }
                    break;

                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN:
                    if ($prevToken !== null &&
                        $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) {
                        throw new Zend_Search_Lucene_Exception('Syntax error: sign operator must be followed by a word.');
                    }
                    break;

                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD:
                    $field = $token->text;
                    // let previous token to be signed as next $prevToken
                    $token = $prevToken;
                    break;

                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET:
                    $token->text=='(' ? $openBrackets++ : $openBrackets--;

                    // A closing bracket without a preceding opening one can
                    // never be balanced by brackets that follow it.
                    if ($openBrackets < 0) {
                        throw new Zend_Search_Lucene_Exception('Syntax Error: mismatched parentheses, every opening must have closing.');
                    }
                    break;
            }
            $prevToken = $token;
        }

        // Finish up parsing: check the last token in the query for an opening sign or parenthesis.
        // $prevToken is still null when the query consists of field tokens only.
        if ($prevToken !== null &&
            $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) {
            throw new Zend_Search_Lucene_Exception('Syntax Error: sign operator must be followed by a word.');
        }

        // Finish up parsing: check that every opening bracket has a matching closing bracket.
        if ($openBrackets != 0) {
            throw new Zend_Search_Lucene_Exception('Syntax Error: mismatched parentheses, every opening must have closing.');
        }

        switch (count($terms)) {
            case 0:
                throw new Zend_Search_Lucene_Exception('Syntax error: bad term count.');

            case 1:
                // a lone term is treated as required unless explicitly prohibited
                return new Zend_Search_Lucene_Search_Query_Term($terms[0],$signs[0] !== false);

            default:
                return new Zend_Search_Lucene_Search_Query_MultiTerm($terms,$signs);
        }
    }
}

View File

@ -0,0 +1,104 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryToken
{
    /**
     * Token type Word.
     */
    const TOKTYPE_WORD = 0;

    /**
     * Token type Field.
     * The field part of a 'field:word' pair
     */
    const TOKTYPE_FIELD = 1;

    /**
     * Token type Sign.
     * '+' (required) or '-' (absentee) sign
     */
    const TOKTYPE_SIGN = 2;

    /**
     * Token type Bracket.
     * '(' or ')'
     */
    const TOKTYPE_BRACKET = 3;

    /**
     * Token type, one of the TOKTYPE_* constants.
     *
     * @var integer
     */
    public $type;

    /**
     * Token text.
     *
     * @var string
     */
    public $text;

    /**
     * Create a query token from its type and its text.
     *
     * @param integer $tokType  One of the TOKTYPE_* constants
     * @param string  $tokText  Non-empty token text
     * @throws Zend_Search_Lucene_Exception if the type is not recognized
     *         or the text is empty
     */
    public function __construct($tokType, $tokText)
    {
        $knownTypes = array(
            self::TOKTYPE_BRACKET,
            self::TOKTYPE_FIELD,
            self::TOKTYPE_SIGN,
            self::TOKTYPE_WORD,
        );

        // Loose comparison on purpose: same matching rules as a switch statement.
        if (!in_array($tokType, $knownTypes)) {
            throw new Zend_Search_Lucene_Exception("Unrecognized token type \"$tokType\".");
        }

        if (strlen($tokText) == 0) {
            throw new Zend_Search_Lucene_Exception('Token text must be supplied.');
        }

        $this->text = $tokText;
        $this->type = $tokType;
    }
}

View File

@ -0,0 +1,164 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_QueryToken */
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryTokenizer implements Iterator
{
    /**
     * inputString tokens.
     * Array of Zend_Search_Lucene_Search_QueryToken objects.
     *
     * @var array
     */
    protected $_tokens = array();

    /**
     * tokens pointer.
     *
     * @var integer
     */
    protected $_currToken = 0;

    /**
     * QueryTokenizer constructor needs query string as a parameter.
     *
     * Runs of alphanumeric characters become word tokens, '+' and '-'
     * become sign tokens, '(' and ')' become bracket tokens. A ':' turns a
     * directly preceding word token into a field token. Every other
     * character only separates tokens.
     *
     * @param string $inputString
     * @throws Zend_Search_Lucene_Exception if the query string is empty
     */
    public function __construct($inputString)
    {
        if (!strlen($inputString)) {
            throw new Zend_Search_Lucene_Exception('Cannot tokenize empty query string.');
        }

        $length       = strlen($inputString);
        $currentToken = '';

        for ($count = 0; $count < $length; $count++) {
            // Square-bracket offsets: the curly-brace form is a parse error in PHP 8.
            $char = $inputString[$count];

            if (ctype_alnum($char)) {
                $currentToken .= $char;
                continue;
            }

            // Previous token is finished
            if (strlen($currentToken)) {
                $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD,
                                                                            $currentToken);
                $currentToken = '';
            }

            if ($char == '+' || $char == '-') {
                $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN,
                                                                            $char);
            } elseif ($char == '(' || $char == ')') {
                $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET,
                                                                            $char);
            } elseif ($char == ':' && $this->count()) {
                // 'field:' - the word collected just before the colon is a field name
                $lastIndex = count($this->_tokens) - 1;
                if ($this->_tokens[$lastIndex]->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) {
                    $this->_tokens[$lastIndex]->type = Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD;
                }
            }
        }

        // Flush the word that runs up to the end of the input
        if (strlen($currentToken)) {
            $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD, $currentToken);
        }
    }

    /**
     * Returns number of tokens
     *
     * @return integer
     */
    public function count()
    {
        return count($this->_tokens);
    }

    /**
     * Returns TRUE if a token exists at the current position.
     *
     * @return boolean
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return $this->_currToken < $this->count();
    }

    /**
     * Resets token stream to the first token.
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->_currToken = 0;
    }

    /**
     * Returns the token at the current position or FALSE if
     * the position does not contain a valid token.
     *
     * @return mixed
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        return $this->valid() ? $this->_tokens[$this->_currToken] : false;
    }

    /**
     * Moves to the next token.
     *
     * @return integer  The new position
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        return ++$this->_currToken;
    }

    /**
     * Return the position of the current token.
     *
     * @return integer
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this->_currToken;
    }
}

View File

@ -0,0 +1,553 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Similarity_Default */
require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Similarity
{
/**
* The Similarity implementation used by default.
*
* @var Zend_Search_Lucene_Search_Similarity
*/
static private $_defaultImpl;
/**
* Cache of decoded bytes.
* Array of floats
*
* @var array
*/
static private $_normTable = array( 0 => 0.0,
1 => 5.820766E-10,
2 => 6.9849193E-10,
3 => 8.1490725E-10,
4 => 9.313226E-10,
5 => 1.1641532E-9,
6 => 1.3969839E-9,
7 => 1.6298145E-9,
8 => 1.8626451E-9,
9 => 2.3283064E-9,
10 => 2.7939677E-9,
11 => 3.259629E-9,
12 => 3.7252903E-9,
13 => 4.656613E-9,
14 => 5.5879354E-9,
15 => 6.519258E-9,
16 => 7.4505806E-9,
17 => 9.313226E-9,
18 => 1.1175871E-8,
19 => 1.3038516E-8,
20 => 1.4901161E-8,
21 => 1.8626451E-8,
22 => 2.2351742E-8,
23 => 2.6077032E-8,
24 => 2.9802322E-8,
25 => 3.7252903E-8,
26 => 4.4703484E-8,
27 => 5.2154064E-8,
28 => 5.9604645E-8,
29 => 7.4505806E-8,
30 => 8.940697E-8,
31 => 1.0430813E-7,
32 => 1.1920929E-7,
33 => 1.4901161E-7,
34 => 1.7881393E-7,
35 => 2.0861626E-7,
36 => 2.3841858E-7,
37 => 2.9802322E-7,
38 => 3.5762787E-7,
39 => 4.172325E-7,
40 => 4.7683716E-7,
41 => 5.9604645E-7,
42 => 7.1525574E-7,
43 => 8.34465E-7,
44 => 9.536743E-7,
45 => 1.1920929E-6,
46 => 1.4305115E-6,
47 => 1.66893E-6,
48 => 1.9073486E-6,
49 => 2.3841858E-6,
50 => 2.861023E-6,
51 => 3.33786E-6,
52 => 3.8146973E-6,
53 => 4.7683716E-6,
54 => 5.722046E-6,
55 => 6.67572E-6,
56 => 7.6293945E-6,
57 => 9.536743E-6,
58 => 1.1444092E-5,
59 => 1.335144E-5,
60 => 1.5258789E-5,
61 => 1.9073486E-5,
62 => 2.2888184E-5,
63 => 2.670288E-5,
64 => 3.0517578E-5,
65 => 3.8146973E-5,
66 => 4.5776367E-5,
67 => 5.340576E-5,
68 => 6.1035156E-5,
69 => 7.6293945E-5,
70 => 9.1552734E-5,
71 => 1.0681152E-4,
72 => 1.2207031E-4,
73 => 1.5258789E-4,
74 => 1.8310547E-4,
75 => 2.1362305E-4,
76 => 2.4414062E-4,
77 => 3.0517578E-4,
78 => 3.6621094E-4,
79 => 4.272461E-4,
80 => 4.8828125E-4,
81 => 6.1035156E-4,
82 => 7.324219E-4,
83 => 8.544922E-4,
84 => 9.765625E-4,
85 => 0.0012207031,
86 => 0.0014648438,
87 => 0.0017089844,
88 => 0.001953125,
89 => 0.0024414062,
90 => 0.0029296875,
91 => 0.0034179688,
92 => 0.00390625,
93 => 0.0048828125,
94 => 0.005859375,
95 => 0.0068359375,
96 => 0.0078125,
97 => 0.009765625,
98 => 0.01171875,
99 => 0.013671875,
100 => 0.015625,
101 => 0.01953125,
102 => 0.0234375,
103 => 0.02734375,
104 => 0.03125,
105 => 0.0390625,
106 => 0.046875,
107 => 0.0546875,
108 => 0.0625,
109 => 0.078125,
110 => 0.09375,
111 => 0.109375,
112 => 0.125,
113 => 0.15625,
114 => 0.1875,
115 => 0.21875,
116 => 0.25,
117 => 0.3125,
118 => 0.375,
119 => 0.4375,
120 => 0.5,
121 => 0.625,
122 => 0.75,
123 => 0.875,
124 => 1.0,
125 => 1.25,
126 => 1.5,
127 => 1.75,
128 => 2.0,
129 => 2.5,
130 => 3.0,
131 => 3.5,
132 => 4.0,
133 => 5.0,
134 => 6.0,
135 => 7.0,
136 => 8.0,
137 => 10.0,
138 => 12.0,
139 => 14.0,
140 => 16.0,
141 => 20.0,
142 => 24.0,
143 => 28.0,
144 => 32.0,
145 => 40.0,
146 => 48.0,
147 => 56.0,
148 => 64.0,
149 => 80.0,
150 => 96.0,
151 => 112.0,
152 => 128.0,
153 => 160.0,
154 => 192.0,
155 => 224.0,
156 => 256.0,
157 => 320.0,
158 => 384.0,
159 => 448.0,
160 => 512.0,
161 => 640.0,
162 => 768.0,
163 => 896.0,
164 => 1024.0,
165 => 1280.0,
166 => 1536.0,
167 => 1792.0,
168 => 2048.0,
169 => 2560.0,
170 => 3072.0,
171 => 3584.0,
172 => 4096.0,
173 => 5120.0,
174 => 6144.0,
175 => 7168.0,
176 => 8192.0,
177 => 10240.0,
178 => 12288.0,
179 => 14336.0,
180 => 16384.0,
181 => 20480.0,
182 => 24576.0,
183 => 28672.0,
184 => 32768.0,
185 => 40960.0,
186 => 49152.0,
187 => 57344.0,
188 => 65536.0,
189 => 81920.0,
190 => 98304.0,
191 => 114688.0,
192 => 131072.0,
193 => 163840.0,
194 => 196608.0,
195 => 229376.0,
196 => 262144.0,
197 => 327680.0,
198 => 393216.0,
199 => 458752.0,
200 => 524288.0,
201 => 655360.0,
202 => 786432.0,
203 => 917504.0,
204 => 1048576.0,
205 => 1310720.0,
206 => 1572864.0,
207 => 1835008.0,
208 => 2097152.0,
209 => 2621440.0,
210 => 3145728.0,
211 => 3670016.0,
212 => 4194304.0,
213 => 5242880.0,
214 => 6291456.0,
215 => 7340032.0,
216 => 8388608.0,
217 => 1.048576E7,
218 => 1.2582912E7,
219 => 1.4680064E7,
220 => 1.6777216E7,
221 => 2.097152E7,
222 => 2.5165824E7,
223 => 2.9360128E7,
224 => 3.3554432E7,
225 => 4.194304E7,
226 => 5.0331648E7,
227 => 5.8720256E7,
228 => 6.7108864E7,
229 => 8.388608E7,
230 => 1.00663296E8,
231 => 1.17440512E8,
232 => 1.34217728E8,
233 => 1.6777216E8,
234 => 2.01326592E8,
235 => 2.34881024E8,
236 => 2.68435456E8,
237 => 3.3554432E8,
238 => 4.02653184E8,
239 => 4.69762048E8,
240 => 5.3687091E8,
241 => 6.7108864E8,
242 => 8.0530637E8,
243 => 9.395241E8,
244 => 1.07374182E9,
245 => 1.34217728E9,
246 => 1.61061274E9,
247 => 1.87904819E9,
248 => 2.14748365E9,
249 => 2.68435456E9,
250 => 3.22122547E9,
251 => 3.75809638E9,
252 => 4.2949673E9,
253 => 5.3687091E9,
254 => 6.4424509E9,
255 => 7.5161928E9 );
/**
 * Installs the Similarity implementation that getDefault() hands out
 * to indexing and search code from now on.
 *
 * @param Zend_Search_Lucene_Search_Similarity $similarity implementation stored in self::$_defaultImpl
 * @return void
 */
public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
{
    self::$_defaultImpl = $similarity;
}
/**
 * Returns the default Similarity implementation used by indexing and
 * search code.
 *
 * When self::$_defaultImpl does not yet hold a Similarity instance, a
 * Zend_Search_Lucene_Search_Similarity_Default is created, remembered and
 * returned.
 *
 * @return Zend_Search_Lucene_Search_Similarity
 */
public static function getDefault()
{
    // Fast path: an implementation has already been installed or created.
    if (self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
        return self::$_defaultImpl;
    }

    self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
    return self::$_defaultImpl;
}
/**
 * Computes the normalization value for a field given the total number of
 * terms contained in a field. These values, together with field boosts, are
 * stored in an index and multiplied into scores for hits on each field by the
 * search code.
 *
 * Matches in longer fields are less precise, so implementations of this
 * method usually return smaller values when 'numTokens' is large,
 * and larger values when 'numTokens' is small.
 *
 * Note that these values are computed under
 * IndexWriter::addDocument(Document) and then stored using
 * encodeNorm(float). Thus they have limited precision, and documents
 * must be re-indexed if this method is altered.
 *
 * @param string  $fieldName name of the field
 * @param integer $numTokens total number of tokens contained in fields
 *                           named 'fieldName' of the document
 * @return float a normalization factor for hits on this field of this document
 */
abstract public function lengthNorm($fieldName, $numTokens);
/**
 * Computes the normalization value for a query given the sum of the squared
 * weights of each of the query terms. This value is then multiplied into the
 * weight of each query term.
 *
 * This does not affect ranking, but rather just attempts to make scores
 * from different queries comparable.
 *
 * @param float $sumOfSquaredWeights the sum of the squares of query term weights
 * @return float a normalization factor for query weights
 */
abstract public function queryNorm($sumOfSquaredWeights);
/**
 * Decodes a normalization factor stored in an index.
 *
 * Only the low eight bits of $byte are used; they select the entry of
 * self::$_normTable that is returned.
 *
 * @param integer $byte encoded norm
 * @return float
 */
public static function decodeNorm($byte)
{
    $index = $byte & 0xFF;

    return self::$_normTable[$index];
}
/**
 * Encodes a normalization factor for storage in an index.
 *
 * The encoding uses a five-bit exponent and three-bit mantissa, thus
 * representing values from around 7x10^9 to 2x10^-9 with about one
 * significant decimal digit of accuracy. Zero is also represented.
 * Negative numbers are rounded up to zero. Values too large to represent
 * are rounded down to the largest representable value. Positive values too
 * small to represent are meant to be rounded up to the smallest positive
 * representable value.
 *
 * The actual conversion, including all rounding decisions, is delegated
 * to _floatToByte().
 *
 * @param float $f
 * @return integer
 */
public static function encodeNorm($f)
{
    $encoded = self::_floatToByte($f);

    return $encoded;
}
/**
 * Float to byte conversion.
 *
 * Looks up the index (0..255) of the self::$_normTable entry that best
 * represents $f: an exact match when one exists, otherwise the closer of
 * the two neighbouring entries.
 *
 * Non-positive input maps to 0. Input larger than the last table entry
 * maps to 255. Positive input never maps to 0, so that a positive norm is
 * not silently turned into a zero score factor (see encodeNorm()).
 *
 * @param float $f value to encode
 * @return integer index into self::$_normTable
 */
static private function _floatToByte($f)
{
    // round negatives up to zero
    if ($f <= 0.0) {
        return 0;
    }

    // binary search for an exact match
    $lowIndex  = 0;
    $highIndex = 255;
    while ($highIndex >= $lowIndex) {
        $mid   = ($highIndex + $lowIndex) >> 1;
        $delta = $f - self::$_normTable[$mid];

        if ($delta < 0) {
            $highIndex = $mid - 1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid + 1;
        } else {
            return $mid; // We got it!
        }
    }

    // No exact match. Assuming the table is sorted ascending (the visible
    // part is), $highIndex now addresses the largest entry below $f.
    // Round to whichever neighbour is closer.
    if ($highIndex != 255 &&
        $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f) {
        return $highIndex + 1;
    }

    // $f is strictly positive here. Byte 0 presumably decodes to 0.0 (the
    // start of the table is not visible from this chunk), so clamp to 1:
    // positive values too small to represent are rounded up to the smallest
    // positive representable value instead of collapsing to zero.
    return max($highIndex, 1);
}
/**
 * Computes a score factor based on a term or phrase's frequency in a
 * document. This value is multiplied by the idf(Term, Searcher)
 * factor for each term in the query and these products are then summed to
 * form the initial score for a document.
 *
 * Terms and phrases repeated in a document indicate the topic of the
 * document, so implementations of this method usually return larger values
 * when 'freq' is large, and smaller values when 'freq' is small.
 *
 * @param float $freq the frequency of a term within a document
 * @return float a score factor based on a term's within-document frequency
 */
abstract public function tf($freq);
/**
 * Computes the amount of a sloppy phrase match, based on an edit distance.
 * This value is summed for each sloppy phrase match in a document to form
 * the frequency that is passed to tf(float).
 *
 * A phrase match with a small edit distance to a document passage more
 * closely matches the document, so implementations of this method usually
 * return larger values when the edit distance is small and smaller values
 * when it is large.
 *
 * @param integer $distance the edit distance of this sloppy phrase match
 * @return float the frequency increment for this match
 */
abstract public function sloppyFreq($distance);
/**
 * Computes a score factor for a simple term or a phrase.
 *
 * For a single term the result is
 *   idfFreq($reader->docFreq($term), $reader->count()).
 * For an array of terms the same expression is evaluated for every
 * element and the results are added up.
 *
 * @param mixed $input the term in question or an array of terms
 * @param Zend_Search_Lucene $reader reader of the document collection being searched
 * @return float a score factor for the term(s)
 */
public function idf($input, $reader)
{
    if (is_array($input)) {
        // phrase: sum of the per-term factors
        $total = 0.0;
        foreach ($input as $term) {
            $total += $this->idfFreq($reader->docFreq($term), $reader->count());
        }

        return $total;
    }

    // single term
    return $this->idfFreq($reader->docFreq($input), $reader->count());
}
/**
 * Computes a score factor based on a term's document frequency (the number
 * of documents which contain the term). This value is multiplied by the
 * tf(int) factor for each term in the query and these products are
 * then summed to form the initial score for a document.
 *
 * Terms that occur in fewer documents are better indicators of topic, so
 * implementations of this method usually return larger values for rare terms,
 * and smaller values for common terms.
 *
 * @param integer $docFreq the number of documents which contain the term
 * @param integer $numDocs the total number of documents in the collection
 * @return float a score factor based on the term's document frequency
 */
abstract public function idfFreq($docFreq, $numDocs);
/**
 * Computes a score factor based on the fraction of all query terms that a
 * document contains. This value is multiplied into scores.
 *
 * The presence of a large portion of the query terms indicates a better
 * match with the query, so implementations of this method usually return
 * larger values when the ratio between these parameters is large and smaller
 * values when the ratio between them is small.
 *
 * @param integer $overlap    the number of query terms matched in the document
 * @param integer $maxOverlap the total number of terms in the query
 * @return float a score factor based on term overlap with the query
 */
abstract public function coord($overlap, $maxOverlap);
}

View File

@ -0,0 +1,105 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Similarity_Default extends Zend_Search_Lucene_Search_Similarity
{
    /**
     * Field length normalization: '1/sqrt(numTerms)'.
     * A field without terms (numTerms == 0) yields the constant 1E10.
     *
     * @param string  $fieldName not used by this implementation
     * @param integer $numTerms  number of terms in the field
     * @return float
     */
    public function lengthNorm($fieldName, $numTerms)
    {
        return ($numTerms == 0) ? 1E10 : 1.0 / sqrt($numTerms);
    }

    /**
     * Query normalization: '1/sqrt(sumOfSquaredWeights)'.
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    public function queryNorm($sumOfSquaredWeights)
    {
        $root = sqrt($sumOfSquaredWeights);

        return 1.0 / $root;
    }

    /**
     * Term frequency factor: 'sqrt(freq)'.
     *
     * @param float $freq
     * @return float
     */
    public function tf($freq)
    {
        $factor = sqrt($freq);

        return $factor;
    }

    /**
     * Sloppy phrase frequency increment: '1/(distance + 1)'.
     *
     * @param integer $distance
     * @return float
     */
    public function sloppyFreq($distance)
    {
        $denominator = $distance + 1;

        return 1.0 / $denominator;
    }

    /**
     * Inverse document frequency: 'log(numDocs/(docFreq+1)) + 1'.
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    public function idfFreq($docFreq, $numDocs)
    {
        // the cast forces a floating point division
        $ratio = $numDocs / (float)($docFreq + 1);

        return log($ratio) + 1.0;
    }

    /**
     * Coordination factor: 'overlap/maxOverlap'.
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    public function coord($overlap, $maxOverlap)
    {
        $total = (float)$maxOverlap;

        return $overlap / $total;
    }
}

View File

@ -0,0 +1,61 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Calculate query weights and build query scorers.
*
* A Weight is constructed by a query Query->createWeight().
* The sumOfSquaredWeights() method is then called on the top-level
* query to compute the query normalization factor Similarity->queryNorm(float).
* This factor is then passed to normalize(float). At this point the weighting
* is complete.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Weight
{
/**
 * The weight for this query.
 *
 * @return float
 */
abstract public function getValue();
/**
 * The sum of squared weights of contained query clauses.
 *
 * @return float
 */
abstract public function sumOfSquaredWeights();
/**
 * Assigns the query normalization factor to this weight.
 *
 * @param float $norm query normalization factor
 * @return void
 */
abstract public function normalize($norm);
}

View File

@ -0,0 +1,135 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Search_Weight
{
    /**
     * IndexReader.
     *
     * @var Zend_Search_Lucene
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query_MultiTerm
     */
    private $_query;

    /**
     * Per-term weights, keyed by the term's position in the query.
     * Array of Zend_Search_Lucene_Search_Weight_Term
     *
     * @var array
     */
    private $_weights;

    /**
     * Builds one Zend_Search_Lucene_Search_Weight_Term per query term and
     * registers it with the query through setWeight().
     *
     * A term is skipped only when the query supplies signs and the sign
     * for that term is set to a falsy value.
     *
     * @param Zend_Search_Lucene_Search_Query_MultiTerm $query  the query that this concerns
     * @param Zend_Search_Lucene                        $reader index reader
     */
    public function __construct($query, $reader)
    {
        $this->_query   = $query;
        $this->_reader  = $reader;
        $this->_weights = array();

        $signs = $query->getSigns();

        foreach ($query->getTerms() as $num => $term) {
            // sign present, not null and falsy: this term gets no weight
            if ($signs !== null && $signs[$num] !== null && !$signs[$num]) {
                continue;
            }

            $termWeight = new Zend_Search_Lucene_Search_Weight_Term($term, $query, $reader);
            $this->_weights[$num] = $termWeight;
            $query->setWeight($num, $termWeight);
        }
    }

    /**
     * The weight for this query, i.e. the boost of the query.
     *
     * @return float
     */
    public function getValue()
    {
        return $this->_query->getBoost();
    }

    /**
     * The sum of squared weights of contained query clauses, scaled by the
     * squared query boost. A result of zero is replaced by 1.0.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $sum = 0;

        // add up the sub-weights
        foreach ($this->_weights as $weight) {
            $sum += $weight->sumOfSquaredWeights();
        }

        // apply the query boost to every sub-weight at once
        $sum *= $this->_query->getBoost() * $this->_query->getBoost();

        // an empty query (like '-something -another') would give zero
        return ($sum == 0) ? 1.0 : $sum;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * The factor is multiplied by the query boost and then passed on to
     * every term weight.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $boostedNorm = $queryNorm * $this->_query->getBoost();

        foreach ($this->_weights as $weight) {
            $weight->normalize($boostedNorm);
        }
    }
}

View File

@ -0,0 +1,141 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Zend_Search_Lucene_Search_Weight
*/
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_Weight
{
    /**
     * IndexReader.
     *
     * @var Zend_Search_Lucene
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query_Phrase
     */
    private $_query;

    /**
     * Weight value; set by normalize().
     *
     * @var float
     */
    private $_value;

    /**
     * Score factor (idf of the phrase terms); set by sumOfSquaredWeights().
     *
     * @var float
     */
    private $_idf;

    /**
     * Normalization factor last passed to normalize().
     *
     * @var float
     */
    private $_queryNorm;

    /**
     * Query weight: idf * boost, later multiplied by the query norm.
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Zend_Search_Lucene_Search_Weight_Phrase constructor
     *
     * @param Zend_Search_Lucene_Search_Query_Phrase $query  the query that this concerns
     * @param Zend_Search_Lucene                     $reader index reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query_Phrase $query, Zend_Search_Lucene $reader)
    {
        $this->_query  = $query;
        $this->_reader = $reader;
    }

    /**
     * The weight for this query
     *
     * @return float
     */
    public function getValue()
    {
        return $this->_value;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * Stores the idf of the phrase terms and the query weight
     * (idf * boost) on the object and returns the squared query weight.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $similarity = $this->_reader->getSimilarity();

        // idf over all terms of the phrase
        $this->_idf = $similarity->idf($this->_query->getTerms(), $this->_reader);

        // query weight = idf * boost
        $weight = $this->_idf * $this->_query->getBoost();
        $this->_queryWeight = $weight;

        return $weight * $weight;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // normalized query weight
        $this->_queryWeight = $this->_queryWeight * $queryNorm;

        // multiply by idf once more to obtain the value used for documents
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}

View File

@ -0,0 +1,146 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_Weight
{
    /**
     * IndexReader.
     *
     * @var Zend_Search_Lucene
     */
    private $_reader;

    /**
     * Term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Weight value; set by normalize().
     *
     * @var float
     */
    private $_value;

    /**
     * Score factor (idf of the term); set by sumOfSquaredWeights().
     *
     * @var float
     */
    private $_idf;

    /**
     * Normalization factor last passed to normalize().
     *
     * @var float
     */
    private $_queryNorm;

    /**
     * Query weight: idf * boost, later multiplied by the query norm.
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Zend_Search_Lucene_Search_Weight_Term constructor
     *
     * @param Zend_Search_Lucene_Index_Term   $term   the term being weighted
     * @param Zend_Search_Lucene_Search_Query $query  the query that this concerns
     * @param Zend_Search_Lucene              $reader index reader
     */
    public function __construct($term, $query, $reader)
    {
        $this->_term   = $term;
        $this->_query  = $query;
        $this->_reader = $reader;
    }

    /**
     * The weight for this query
     *
     * @return float
     */
    public function getValue()
    {
        return $this->_value;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * Stores the idf of the term and the query weight (idf * boost) on
     * the object and returns the squared query weight.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $similarity = $this->_reader->getSimilarity();

        // idf of the single term
        $this->_idf = $similarity->idf($this->_term, $this->_reader);

        // query weight = idf * boost
        $weight = $this->_idf * $this->_query->getBoost();
        $this->_queryWeight = $weight;

        return $weight * $weight;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // normalized query weight
        $this->_queryWeight = $this->_queryWeight * $queryNorm;

        // multiply by idf once more to obtain the value used for documents
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}

View File

@ -0,0 +1,120 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Storage_Directory
{
// Abstraction of a flat collection of named files. Every operation is
// abstract; Zend_Search_Lucene_Storage_Directory_Filesystem is the
// filesystem-backed implementation.

/**
 * Closes the store.
 *
 * @return void
 */
abstract public function close();
/**
 * Returns an array of strings, one for each file in the directory.
 *
 * @return array
 */
abstract public function fileList();
/**
 * Creates a new, empty file in the directory with the given $filename.
 *
 * @param string $filename
 * @return Zend_Search_Lucene_Storage_File
 */
abstract public function createFile($filename);
/**
 * Removes an existing $filename in the directory.
 *
 * @param string $filename
 * @return void
 */
abstract public function deleteFile($filename);
/**
 * Returns true if a file with the given $filename exists.
 *
 * @param string $filename
 * @return boolean
 */
abstract public function fileExists($filename);
/**
 * Returns the length of a $filename in the directory.
 *
 * @param string $filename
 * @return integer
 */
abstract public function fileLength($filename);
/**
 * Returns the UNIX timestamp $filename was last modified.
 *
 * @param string $filename
 * @return integer
 */
abstract public function fileModified($filename);
/**
 * Renames an existing file in the directory.
 *
 * NOTE(review): declared void here, but the Filesystem implementation
 * returns the result of rename() - confirm which contract is intended.
 *
 * @param string $from
 * @param string $to
 * @return void
 */
abstract public function renameFile($from, $to);
/**
 * Sets the modified time of $filename to now.
 *
 * NOTE(review): declared void here, but the Filesystem implementation
 * returns the result of touch() - confirm which contract is intended.
 *
 * @param string $filename
 * @return void
 */
abstract public function touchFile($filename);
/**
 * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory.
 *
 * @param string $filename
 * @return Zend_Search_Lucene_Storage_File
 */
abstract public function getFileObject($filename);
}

View File

@ -0,0 +1,272 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Storage_Directory */
require_once 'Zend/Search/Lucene/Storage/Directory.php';
/** Zend_Search_Lucene_Storage_File_Filesystem */
require_once 'Zend/Search/Lucene/Storage/File/Filesystem.php';
/**
* FileSystem implementation of Directory abstraction.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene_Storage_Directory
{
    /**
     * Filesystem path to the directory
     *
     * @var string
     */
    private $_dirPath = null;

    /**
     * Cache for Zend_Search_Lucene_Storage_File_Filesystem objects
     * Array: filename => Zend_Search_Lucene_Storage_File object
     *
     * @var array
     */
    private $_fileHandlers;

    /**
     * Utility function for recursive directory creation.
     *
     * Creates $dir and every missing parent directory.
     *
     * @param string  $dir       directory to create
     * @param integer $mode      permissions passed to mkdir()
     * @param boolean $recursive not used; kept for interface compatibility
     * @return boolean true when $dir exists afterwards, false otherwise
     */
    static public function mkdirs($dir, $mode = 0777, $recursive = true)
    {
        if (is_null($dir) || $dir === '') {
            return false;
        }
        if (is_dir($dir) || $dir === '/') {
            return true;
        }

        $parent = dirname($dir);
        if ($parent === $dir) {
            // dirname() no longer shortens the path (e.g. the root of a
            // missing drive): nothing left to create, stop recursing.
            return false;
        }

        if (self::mkdirs($parent, $mode, $recursive)) {
            return mkdir($dir, $mode);
        }
        return false;
    }

    /**
     * Object constructor
     * Checks if $path is a directory or tries to create it.
     *
     * @param string $path
     * @throws Zend_Search_Lucene_Exception when $path exists but is not a
     *         directory, or when it cannot be created
     */
    public function __construct($path)
    {
        if (!is_dir($path)) {
            if (file_exists($path)) {
                throw new Zend_Search_Lucene_Exception('Path exists, but it\'s not a directory');
            } else {
                if (!self::mkdirs($path)) {
                    throw new Zend_Search_Lucene_Exception("Can't create directory '$path'.");
                }
            }
        }
        $this->_dirPath      = $path;
        $this->_fileHandlers = array();
    }

    /**
     * Closes the store: closes every cached file object and empties the cache.
     *
     * @return void
     */
    public function close()
    {
        foreach ($this->_fileHandlers as $fileObject) {
            $fileObject->close();
        }

        // Keep the property defined (as an empty array) so that the object
        // can still be used, or closed again, without touching an unset
        // property.
        $this->_fileHandlers = array();
    }

    /**
     * Returns an array of strings, one for each file in the directory.
     * Sub-directories are not listed. An empty array is returned when the
     * directory cannot be opened.
     *
     * @return array
     */
    public function fileList()
    {
        $result = array();

        $dirContent = opendir($this->_dirPath);
        if ($dirContent === false) {
            return $result;
        }

        // Compare against false explicitly: an entry named "0" is falsy and
        // would otherwise end the listing early.
        while (($file = readdir($dirContent)) !== false) {
            if (($file == '..') || ($file == '.')) {
                continue;
            }
            if (!is_dir($this->_dirPath . '/' . $file)) {
                $result[] = $file;
            }
        }

        // release the directory handle
        closedir($dirContent);

        return $result;
    }

    /**
     * Creates a new, empty file in the directory with the given $filename.
     * A cached file object of the same name is closed and replaced.
     *
     * @param string $filename
     * @return Zend_Search_Lucene_Storage_File
     */
    public function createFile($filename)
    {
        if (isset($this->_fileHandlers[$filename])) {
            $this->_fileHandlers[$filename]->close();
        }
        unset($this->_fileHandlers[$filename]);
        $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename, 'w+b');
        return $this->_fileHandlers[$filename];
    }

    /**
     * Removes an existing $filename in the directory.
     * A cached file object of that name is closed first.
     *
     * @param string $filename
     * @return void
     */
    public function deleteFile($filename)
    {
        if (isset($this->_fileHandlers[$filename])) {
            $this->_fileHandlers[$filename]->close();
        }
        unset($this->_fileHandlers[$filename]);
        unlink($this->_dirPath .'/'. $filename);
    }

    /**
     * Returns true if a file with the given $filename exists, either as a
     * cached file object or on disk.
     *
     * @param string $filename
     * @return boolean
     */
    public function fileExists($filename)
    {
        return isset($this->_fileHandlers[$filename]) ||
               file_exists($this->_dirPath . '/' . $filename);
    }

    /**
     * Returns the length of a $filename in the directory.
     * A cached file object is asked for its size; otherwise filesize() is used.
     *
     * @param string $filename
     * @return integer
     */
    public function fileLength($filename)
    {
        if (isset($this->_fileHandlers[$filename])) {
            return $this->_fileHandlers[$filename]->size();
        }
        return filesize($this->_dirPath .'/'. $filename);
    }

    /**
     * Returns the UNIX timestamp $filename was last modified.
     *
     * @param string $filename
     * @return integer
     */
    public function fileModified($filename)
    {
        return filemtime($this->_dirPath .'/'. $filename);
    }

    /**
     * Renames an existing file in the directory.
     *
     * Cached file objects for both names are closed and dropped, and an
     * existing target file is removed before renaming.
     *
     * @param string $from
     * @param string $to
     * @return boolean result of rename(); warnings are suppressed
     */
    public function renameFile($from, $to)
    {
        // isset() (as in the sibling methods) avoids reading array keys
        // that were never cached.
        if (isset($this->_fileHandlers[$from])) {
            $this->_fileHandlers[$from]->close();
        }
        unset($this->_fileHandlers[$from]);

        if (isset($this->_fileHandlers[$to])) {
            $this->_fileHandlers[$to]->close();
        }
        unset($this->_fileHandlers[$to]);

        if (file_exists($this->_dirPath . '/' . $to)) {
            unlink($this->_dirPath . '/' . $to);
        }
        return @rename($this->_dirPath . '/' . $from, $this->_dirPath . '/' . $to);
    }

    /**
     * Sets the modified time of $filename to now.
     *
     * @param string $filename
     * @return boolean result of touch()
     */
    public function touchFile($filename)
    {
        return touch($this->_dirPath .'/'. $filename);
    }

    /**
     * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory.
     *
     * A cached object is rewound to offset 0 and reused; otherwise a new
     * object is created and cached.
     *
     * @param string $filename
     * @return Zend_Search_Lucene_Storage_File
     */
    public function getFileObject($filename)
    {
        if (isset($this->_fileHandlers[$filename])) {
            $this->_fileHandlers[$filename]->seek(0);
            return $this->_fileHandlers[$filename];
        }
        $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename);
        return $this->_fileHandlers[$filename];
    }
}

View File

@ -0,0 +1,371 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Storage_File
{
/**
 * Reads $length number of bytes at the current position in the
 * file and advances the file pointer.
 *
 * @param integer $length number of bytes to read (default 1)
 * @return string the bytes read
 */
abstract protected function _fread($length=1);
/**
 * Sets the file position indicator and advances the file pointer.
 * The new position, measured in bytes from the beginning of the file,
 * is obtained by adding offset to the position specified by whence,
 * whose values are defined as follows:
 * SEEK_SET - Set position equal to offset bytes.
 * SEEK_CUR - Set position to current location plus offset.
 * SEEK_END - Set position to end-of-file plus offset. (To move to
 * a position before the end-of-file, you need to pass a negative value
 * in offset.)
 *
 * @param integer $offset
 * @param integer $whence one of SEEK_SET (default), SEEK_CUR, SEEK_END
 * @return integer 0 upon success, -1 otherwise
 */
abstract public function seek($offset, $whence=SEEK_SET);
/**
 * Get the current file position.
 *
 * @return integer
 */
abstract public function tell();
/**
 * Writes $length number of bytes (all, if $length===null) to the end
 * of the file.
 *
 * @param string  $data   bytes to write
 * @param integer $length number of bytes of $data to write, or null for all
 */
abstract protected function _fwrite($data, $length=null);
/**
 * Reads one byte at the current position in the file, advances the
 * file pointer and returns the byte's ordinal value.
 *
 * @return integer
 */
public function readByte()
{
    $char = $this->_fread(1);

    return ord($char);
}
/**
 * Writes a byte to the end of the file.
 *
 * The value is converted with chr() and handed to _fwrite(), whose
 * result is passed back to the caller.
 *
 * @param integer $byte
 */
public function writeByte($byte)
{
    $char = chr($byte);

    return $this->_fwrite($char, 1);
}
/**
 * Reads $num bytes from the current position in the file
 * and advances the file pointer.
 *
 * @param integer $num number of bytes to read
 * @return string
 */
public function readBytes($num)
{
    $data = $this->_fread($num);

    return $data;
}
/**
 * Writes $num bytes of $data (all of it, if $num===null) to the end
 * of the file. Thin wrapper around _fwrite(); nothing is returned.
 *
 * @param string  $data
 * @param integer $num
 */
public function writeBytes($data, $num=null)
{
    $length = $num;
    $this->_fwrite($data, $length);
}
/**
 * Reads an integer from the current position in the file
 * and advances the file pointer.
 *
 * Four bytes are read and combined most significant byte first.
 *
 * @return integer
 */
public function readInt()
{
    $str = $this->_fread(4);

    // Square brackets instead of the $str{n} form: curly-brace string
    // offsets are deprecated since PHP 7.4 and a parse error in PHP 8.
    return  ord($str[0]) << 24 |
            ord($str[1]) << 16 |
            ord($str[2]) << 8  |
            ord($str[3]);
}
/**
 * Writes an integer to the end of file.
 *
 * The value is converted to an integer and its low 32 bits are written
 * as four bytes, most significant byte first.
 *
 * @param integer $value
 */
public function writeInt($value)
{
    $value = (int)$value;

    $bytes = '';
    foreach (array(24, 16, 8, 0) as $shift) {
        $bytes .= chr(($value >> $shift) & 0xFF);
    }

    $this->_fwrite($bytes, 4);
}
/**
 * Returns a long integer from the current position in the file
 * and advances the file pointer.
 *
 * Eight bytes are consumed, but only the last four contribute to the
 * result.
 *
 * @return integer
 */
public function readLong()
{
    $str = $this->_fread(8);

    /**
     * PHP uses long as largest integer. fseek() uses long for offset.
     * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent
     * conversion to float.
     * So, largest index segment file is 2Gb
     *
     * Square brackets replace the $str{n} form, which is deprecated since
     * PHP 7.4 and a parse error in PHP 8.
     */
    return  /* ord($str[0]) << 56  | */
            /* ord($str[1]) << 48  | */
            /* ord($str[2]) << 40  | */
            /* ord($str[3]) << 32  | */
            ord($str[4]) << 24  |
            ord($str[5]) << 16  |
            ord($str[6]) << 8   |
            ord($str[7]);
}
/**
 * Writes long integer to the end of file.
 *
 * Eight bytes are written: four zero bytes followed by the low 32 bits
 * of the value, most significant byte first.
 *
 * @param integer $value
 */
public function writeLong($value)
{
    /**
     * PHP uses long as largest integer. fseek() uses long for offset.
     * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent
     * conversion to float.
     * So, largest index segment file is 2Gb
     */
    $value = (int)$value;

    $lowWord = '';
    foreach (array(24, 16, 8, 0) as $shift) {
        $lowWord .= chr(($value >> $shift) & 0xFF);
    }

    $this->_fwrite("\x00\x00\x00\x00" . $lowWord, 8);
}
/**
* Returns a variable-length integer from the current
* position in the file and advances the file pointer.
*
* @return integer
*/
public function readVInt()
{
    $result = 0;
    $shift  = 0;

    // Each byte carries seven payload bits (least significant group first);
    // a set high bit means another byte follows.
    do {
        $byte    = ord($this->_fread(1));
        $result |= ($byte & 0x7F) << $shift;
        $shift  += 7;
    } while (($byte & 0x80) != 0);

    return $result;
}
/**
* Writes a variable-length integer to the end of file.
*
* @param integer $value
*/
public function writeVInt($value)
{
    // Emit seven bits at a time, lowest group first, flagging every byte
    // except the last with the 0x80 continuation bit.
    for ($rest = (int) $value; $rest > 0x7F; $rest >>= 7) {
        $this->_fwrite(chr(($rest & 0x7F) | 0x80));
    }

    // Final byte: remaining bits, continuation bit clear.
    $this->_fwrite(chr($rest));
}
/**
* Reads a string from the current position in the file
* and advances the file pointer.
*
* @return string
*/
public function readString()
{
    // The length prefix counts characters, not bytes.
    $strlen = $this->readVInt();
    if ($strlen == 0) {
        return '';
    }

    /**
     * This implementation supports only Basic Multilingual Plane
     * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
     * "supplementary characters" (characters whose code points are
     * greater than 0xFFFF)
     * Java 2 represents these characters as a pair of char (16-bit)
     * values, the first from the high-surrogates range (0xD800-0xDBFF),
     * the second from the low-surrogates range (0xDC00-0xDFFF). Then
     * they are encoded as usual UTF-8 characters in six bytes.
     * Standard UTF-8 representation uses four bytes for supplementary
     * characters.
     */
    // Start by reading one byte per character; every multi-byte lead byte
    // found while scanning pulls its continuation bytes from the file.
    // Invariant: $strlen always equals the byte length of $str_val.
    $str_val = $this->_fread($strlen);

    for ($count = 0; $count < $strlen; $count++) {
        $lead = ord($str_val[$count]);
        if (($lead & 0xC0) != 0xC0) {
            continue; // single-byte character
        }

        $addBytes = 1;
        if ($lead & 0x20) {
            $addBytes++;

            // Never used. Java2 doesn't encode strings in four bytes
            if ($lead & 0x10) {
                $addBytes++;
            }
        }
        $str_val .= $this->_fread($addBytes);
        $strlen  += $addBytes;

        // Java2 encodes the null character in two bytes (0xC0 0x80).
        if ($lead == 0xC0 && ord($str_val[$count + 1]) == 0x80) {
            // Collapse the pair into a real NUL byte. Assigning the integer
            // 0 to a string offset would store the character "0" instead.
            $str_val = substr($str_val, 0, $count)
                     . "\x00"
                     . substr($str_val, $count + 2);
            // The string is now one byte shorter; the cursor stays on the
            // NUL so the loop increment lands on the byte that follows it.
            $strlen--;
        } else {
            // Skip the continuation bytes of this character.
            $count += $addBytes;
        }
    }

    return $str_val;
}
/**
* Writes a string to the end of file.
*
* @param string $str
* @throws Zend_Search_Lucene_Exception
*/
public function writeString($str)
{
    /**
     * This implementation supports only Basic Multilingual Plane
     * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
     * "supplementary characters" (characters whose code points are
     * greater than 0xFFFF)
     * Java 2 represents these characters as a pair of char (16-bit)
     * values, the first from the high-surrogates range (0xD800-0xDBFF),
     * the second from the low-surrogates range (0xDC00-0xDFFF). Then
     * they are encoded as usual UTF-8 characters in six bytes.
     * Standard UTF-8 representation uses four bytes for supplementary
     * characters.
     */

    // convert input to a string before iterating string characters
    settype($str, 'string');

    // $chars starts as the byte count and is reduced by every continuation
    // byte, leaving the character count that is written as length prefix.
    $chars = $strlen = strlen($str);
    $containNullChars = false;

    for ($count = 0; $count < $strlen; $count++) {
        /**
         * String is already in Java 2 representation.
         * We should only calculate actual string length and replace
         * \x00 by \xC0\x80
         */
        $byte = ord($str[$count]);

        if (($byte & 0xC0) == 0xC0) {
            $addBytes = 1;
            if ($byte & 0x20) {
                $addBytes++;

                // Never used. Java2 doesn't encode strings in four bytes
                // and we don't support non-BMP characters
                if ($byte & 0x10) {
                    $addBytes++;
                }
            }
            $chars -= $addBytes;
            $count += $addBytes;
        } elseif ($byte == 0) {
            // A NUL can never be a lead byte, so it has to be detected
            // outside the lead-byte branch.
            $containNullChars = true;
        }
    }

    if ($chars < 0) {
        throw new Zend_Search_Lucene_Exception('Invalid UTF-8 string');
    }

    $this->writeVInt($chars);
    if ($containNullChars) {
        // str_replace(search, replace, subject): every NUL becomes the
        // two-byte Java2 form; the character count is unaffected.
        $this->_fwrite(str_replace("\x00", "\xC0\x80", $str));
    } else {
        $this->_fwrite($str);
    }
}
/**
* Reads binary data from the current position in the file
* and advances the file pointer.
*
* @return string
*/
public function readBinary()
{
// The data is length-prefixed: a VInt byte count is read first, then that
// many raw bytes are returned without any decoding.
return $this->_fread($this->readVInt());
}
}

View File

@ -0,0 +1,171 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Storage_File */
require_once 'Zend/Search/Lucene/Storage/File.php';
/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Storage_File_Filesystem extends Zend_Search_Lucene_Storage_File
{
    /**
     * Resource of the open file
     *
     * @var resource
     */
    private $_fileHandle;

    /**
     * Class constructor. Open the file.
     *
     * @param string $filename
     * @param string $mode  fopen() mode string
     * @throws Zend_Search_Lucene_Exception if the file cannot be opened
     */
    public function __construct($filename, $mode='r+b')
    {
        global $php_errormsg;

        // Temporarily enable track_errors so the reason for a failed
        // (and silenced) fopen() can be reported through the exception.
        $trackErrors = ini_get( "track_errors");
        ini_set('track_errors', '1');

        $this->_fileHandle = @fopen($filename, $mode);
        $failed = ($this->_fileHandle === false);

        // Restore the previous setting on both the success and failure path.
        ini_set('track_errors', $trackErrors);

        if ($failed) {
            // $php_errormsg is not guaranteed to be populated (the
            // track_errors mechanism is absent from recent PHP versions),
            // so fall back to a descriptive message instead of an empty one.
            $message = isset($php_errormsg)
                     ? $php_errormsg
                     : "Unable to open file '" . $filename . "' in mode '" . $mode . "'";
            throw new Zend_Search_Lucene_Exception($message);
        }
    }

    /**
     * Sets the file position indicator and advances the file pointer.
     * The new position, measured in bytes from the beginning of the file,
     * is obtained by adding offset to the position specified by whence,
     * whose values are defined as follows:
     * SEEK_SET - Set position equal to offset bytes.
     * SEEK_CUR - Set position to current location plus offset.
     * SEEK_END - Set position to end-of-file plus offset. (To move to
     * a position before the end-of-file, you need to pass a negative value
     * in offset.)
     * SEEK_CUR is the only supported offset type for compound files
     *
     * Upon success, returns 0; otherwise, returns -1
     *
     * @param integer $offset
     * @param integer $whence
     * @return integer
     */
    public function seek($offset, $whence=SEEK_SET)
    {
        return fseek($this->_fileHandle, $offset, $whence);
    }

    /**
     * Get file position.
     *
     * @return integer
     */
    public function tell()
    {
        return ftell($this->_fileHandle);
    }

    /**
     * Close File object. Safe to call more than once: the handle is
     * cleared after the first call.
     */
    public function close()
    {
        if ($this->_fileHandle !== null ) {
            @fclose($this->_fileHandle);
            $this->_fileHandle = null;
        }
    }

    /**
     * Get the size of the already opened file.
     * The current file position is preserved.
     *
     * @return integer
     */
    public function size()
    {
        $position = ftell($this->_fileHandle);
        fseek($this->_fileHandle, 0, SEEK_END);
        $size = ftell($this->_fileHandle);
        fseek($this->_fileHandle,$position);

        return $size;
    }

    /**
     * Read a $length bytes from the file and advance the file pointer.
     *
     * Fewer bytes than requested are returned if the end of the file
     * (or a read error) is hit first.
     *
     * @param integer $length
     * @return string
     */
    protected function _fread($length=1)
    {
        if ($length == 0) {
            return '';
        }

        // Small reads are served by a single fread() call.
        if ($length < 1024) {
            return fread($this->_fileHandle, $length);
        }

        // Large reads may arrive in several chunks; keep reading until the
        // requested amount is collected or nothing more can be read.
        $data = '';
        while ($length > 0) {
            $nextBlock = fread($this->_fileHandle, $length);

            // Strict comparisons are required here: a chunk consisting of
            // the single character "0" is loosely equal to false and would
            // otherwise terminate the loop and drop data.
            if ($nextBlock === false || $nextBlock === '') {
                break;
            }

            $data   .= $nextBlock;
            $length -= strlen($nextBlock);
        }

        return $data;
    }

    /**
     * Writes $length number of bytes (all, if $length===null) to the end
     * of the file.
     *
     * @param string $data
     * @param integer $length
     */
    protected function _fwrite($data, $length=null)
    {
        if ($length === null ) {
            fwrite($this->_fileHandle, $data);
        } else {
            fwrite($this->_fileHandle, $data, $length);
        }
    }
}

View File

@ -0,0 +1,14 @@
@todo
- Improve API: fix ZSearchMultiTermQuery($terms, $signs);
- Analysis and indexing engine
- Additional queries: phrase, wildcard, proximity, and range
- Better class-level docblocks (most functions okay)
- Some Windows issues(?) during indexing
- Finish renaming classes to PEAR-like conventions

15
search/db/mysql.sql Normal file
View File

@ -0,0 +1,15 @@
-- One summary row per indexed document: its type, display title and url,
-- last-update timestamp, and the course/user/group ids it belongs to.
CREATE TABLE IF NOT EXISTS `search_documents` (
`id` int(11) NOT NULL auto_increment,
`type` varchar(12) NOT NULL default 'none',
`title` varchar(100) NOT NULL default '',
`url` varchar(100) NOT NULL default '',
`updated` timestamp NOT NULL default CURRENT_TIMESTAMP,
`courseid` int(11) NOT NULL default '0',
`userid` int(11) NOT NULL default '0',
`groupid` int(11) NOT NULL default '0',
PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=1;
-- The table may already exist from an earlier run (IF NOT EXISTS above), so
-- empty it and restart id numbering at 1.
DELETE FROM `search_documents` WHERE 1;
ALTER TABLE `search_documents` AUTO_INCREMENT =1;

21
search/db/postgres7.sql Normal file
View File

@ -0,0 +1,21 @@
--probably a bit suspect, need to explicitly create
--id sequence (i.e. don't depend on postgres default seq naming)?
--not sure about table owner either
--One summary row per indexed document. The id columns are NOT NULL DEFAULT 0
--to match the MySQL schema of the same table.
CREATE TABLE search_documents
(
id serial,
"type" varchar(12) NOT NULL DEFAULT 'none',
title varchar(100) NOT NULL default '',
url varchar(100) NOT NULL default '',
updated timestamp NOT NULL DEFAULT NOW(),
courseid int4 NOT NULL DEFAULT 0,
userid int4 NOT NULL DEFAULT 0,
groupid int4 NOT NULL DEFAULT 0,
CONSTRAINT id_pkey PRIMARY KEY (id)
) WITHOUT OIDS;
--ALTER TABLE search_documents OWNER TO postgres;
--Empty the table and restart id numbering. The third setval argument
--(is_called = false) makes the next generated id 1; without it the next
--id would be 2, unlike the MySQL schema's AUTO_INCREMENT = 1.
DELETE FROM search_documents;
SELECT setval('public.search_documents_id_seq', 1, false);

View File

@ -0,0 +1,12 @@
<?php
/**
 * Base class for every searchable document: stores the document type and
 * the course/user/group ids as keyword fields.
 */
class SearchDocument extends Zend_Search_Lucene_Document {

    /**
     * @param string $document_type one of the SEARCH_*_TYPE values
     * @param mixed  $cid           course id
     * @param mixed  $uid           user id
     * @param mixed  $gid           group id
     */
    public function __construct($document_type, $cid, $uid, $gid) {
        // Field order is kept: type, courseid, userid, groupid.
        $keywords = array(
            'type'     => $document_type,
            'courseid' => $cid,
            'userid'   => $uid,
            'groupid'  => $gid,
        );

        foreach ($keywords as $name => $value) {
            $this->addField(Zend_Search_Lucene_Field::Keyword($name, $value));
        }
    } //constructor
} //SearchDocument
?>

View File

@ -0,0 +1,28 @@
<?php
require_once("$CFG->dirroot/search/documents/document.php");
/**
 * Search document built from a single wiki page.
 */
class WikiSearchDocument extends SearchDocument {

    /**
     * @param object $page    wiki page; pagename, author, content, id and
     *                        version are read from it
     * @param mixed  $wiki_id id of the wiki the page belongs to
     * @param mixed  $cid     course id
     * @param mixed  $uid     user id
     * @param mixed  $gid     group id
     */
    public function __construct(&$page, $wiki_id, $cid, $uid, $gid) {
        // Text fields first: title and author, then the page body as an
        // UnStored field.
        $this->addField(Zend_Search_Lucene_Field::Text('title', $page->pagename));
        $this->addField(Zend_Search_Lucene_Field::Text('author', $page->author));
        $this->addField(Zend_Search_Lucene_Field::UnStored('contents', $page->content));

        // Identifiers used later to build a link back to the page.
        $keywords = array(
            'id'      => $page->id,
            'version' => $page->version,
            'wiki'    => $wiki_id,
        );
        foreach ($keywords as $name => $value) {
            $this->addField(Zend_Search_Lucene_Field::Keyword($name, $value));
        }

        // Common type/course/user/group fields are added last by the parent.
        parent::__construct(SEARCH_WIKI_TYPE, $cid, $uid, $gid);
    } //constructor
} //WikiSearchDocument
function wiki_name_convert($str) {
    // Make a page name safe for use as a URL query value. urlencode() still
    // turns spaces into '+', and additionally escapes characters such as
    // '&', '#', '?' and '=' that would otherwise corrupt the link.
    return urlencode($str);
} //wiki_name_convert
function wiki_make_link(&$doc) {
    // Builds the view.php URL for a wiki search hit from its stored
    // wiki id, page title and version.
    global $CFG;

    $base    = $CFG->wwwroot.'/mod/wiki/view.php?wid='.$doc->wiki;
    $page    = '&page='.wiki_name_convert($doc->title);
    $version = '&version='.$doc->version;

    return $base.$page.$version;
} //wiki_make_link
?>

10
search/index.php Normal file
View File

@ -0,0 +1,10 @@
<?php
// Disabled course-scoped access check and logging, kept for reference:
/*$id = required_param('id', PARAM_INT); // course
if (! $course = get_record("course", "id", $id)) {
error("Course ID is incorrect");
}
require_course_login($course);
add_to_log($course->id, "wiki", "view all", "index.php?id=$course->id", "");*/
// The only active statement: send the browser to the query page.
header("Location: query.php");
?>

152
search/indexer.php Normal file
View File

@ -0,0 +1,152 @@
<?php
// Full (re)indexing script: creates the Lucene index under the data
// directory, runs the search_documents schema file, then walks every
// activity module that provides the two indexing hooks and adds its
// documents to both the database table and the index.
//this'll take some time, set up the environment
@set_time_limit(0);
@ob_implicit_flush(true);
@ob_end_flush();
require_once('../config.php');
require_once("$CFG->dirroot/search/lib.php");
// Admin-only page.
require_login();
if (!isadmin()) {
error("You need to be an admin user to use this page.", "$CFG->wwwroot/login/index.php");
} //if
// Explicit confirmation is required via ?areyousure=yes (case-insensitive).
$sure = strtolower(optional_param('areyousure', '', PARAM_ALPHA));
if ($sure != 'yes') {
mtrace("Sorry, you weren't sure enough (<a href='index.php'>back to query page</a>).");
exit(0);
} //if
//check for php5 (lib.php)
if (!search_check_php5()) {
$phpversion = phpversion();
mtrace("Sorry, global search requires PHP 5.0.0 or later (currently using version $phpversion)");
exit(0);
} //if
// Loaded only after the PHP 5 check above has passed.
require_once("$CFG->dirroot/search/Zend/Search/Lucene.php");
//begin timer
search_stopwatch();
mtrace('<pre>Server Time: '.date('r',time())."\n");
//paths
$index_path = $CFG->dataroot.'/search';
// Schema file is selected by database type (e.g. db/mysql.sql, db/postgres7.sql).
$index_db_file = "$CFG->dirroot/search/db/$CFG->dbtype.sql";
if (!file_exists($index_path)) {
mtrace("Data directory ($index_path) does not exist, attempting to create.");
if (!mkdir($index_path)) {
search_pexit("Error creating data directory at: $index_path. Please correct.");
} else {
mtrace("Directory successfully created.");
} //else
} else {
mtrace("Using $index_path as data directory.");
} //else
//stop accidental re-indexing (zzz)
//search_pexit("Not indexing at this time.");
// Second argument true: presumably asks for a new index to be created -- TODO confirm
// against the Zend_Search_Lucene constructor.
$index = new Zend_Search_Lucene($index_path, true);
//create the database tables
ob_start(); //turn output buffering on - to hide modify_database() output
modify_database($index_db_file, '', false);
ob_end_clean(); //chuck the buffer and resume normal operation
//empty database table goes here
// delete * from search_documents;
// set auto_increment back to 1
//-------- debug stuff
/*
include_once("$CFG->dirroot/mod/wiki/lib.php");
$wikis = get_all_instances_in_courses("wiki", get_courses());
#search_pexit($wikis[1]);
$entries = wiki_get_entries($wikis[1]);
#search_pexit($entries);
#$r = wiki_get_pages($entries[134]);
$r = wiki_get_latest_pages($entries[95]);
search_pexit($r);
//ignore me --------*/
mtrace('Starting activity modules');
if ($mods = get_records_select('modules' /*'index this module?' where statement*/)) {
foreach ($mods as $mod) {
$libfile = "$CFG->dirroot/mod/$mod->name/lib.php";
if (file_exists($libfile)) {
include_once($libfile);
// A module takes part in indexing only if its lib.php defines both
// <name>_iterator() and <name>_get_content_for_index().
$iter_function = $mod->name.'_iterator';
$index_function = $mod->name.'_get_content_for_index';
$include_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
// $c counts the documents of the current module; $doc is reused as the
// database row object for every document.
$c = 0;
$doc = new stdClass;
if (function_exists($index_function) && function_exists($iter_function)) {
include_once($include_file);
mtrace("Processing module function $index_function ...");
foreach ($iter_function() as $i) {
$documents = $index_function($i);
//begin transaction
foreach($documents as $document) {
$c++;
//db sync increases indexing time from 55 sec to 73 (64 on Saturday?), so ~30%
//therefore, let us make a custom insert function for this search module
//data object for db
$doc->type = $document->type;
// NOTE(review): MySQL-specific escaping, although a postgres7.sql schema
// ships alongside this script -- confirm what insert_record() expects.
$doc->title = mysql_real_escape_string($document->title); //naughty
// NOTE(review): the shipped schemas define an `updated` column and no
// `update` or `permissions` column -- confirm these two fields are stored.
$doc->update = time();
$doc->permissions = 0;
$doc->url = 'none';
$doc->courseid = $document->courseid;
$doc->userid = $document->userid;
$doc->groupid = $document->groupid;
//insert summary into db
$id = insert_record($CFG->prefix.'search_documents', $doc);
//synchronise db with index
// The database row id is stored in the index as keyword field 'dbid'.
$document->addField(Zend_Search_Lucene_Field::Keyword('dbid', $id));
$index->addDocument($document);
//commit every 100 new documents, and print a status message
if (($c%100) == 0) {
$index->commit();
mtrace(".. $c");
} //if
} //foreach
//end transaction
} //foreach
//commit left over documents, and finish up
$index->commit();
mtrace("-- $c documents indexed");
mtrace('done.');
} //if
} //if
} //foreach
} //if
//done modules
mtrace('Finished activity modules');
// Second stopwatch call prints the elapsed time since the first one.
search_stopwatch();
mtrace(".<br><a href='index.php'>Back to query page</a>.");
mtrace('</pre>');
?>

44
search/indexersplash.php Normal file
View File

@ -0,0 +1,44 @@
<?php
// Confirmation page shown before (re)indexing: if an index or database
// records already exist the admin must confirm, otherwise indexing starts
// straight away.
require_once('../config.php');
require_once("$CFG->dirroot/search/lib.php");

// Admin-only page.
require_login();
if (!isadmin()) {
    error("You need to be an admin user to use this page.", "$CFG->wwwroot/login/index.php");
} //if

//check for php5 (lib.php)
if (!search_check_php5()) {
    $phpversion = phpversion();
    mtrace("Sorry, global search requires PHP 5.0.0 or later (currently using version $phpversion)");
    exit(0);
} //if

// Count the files currently in the index directory.
$index_path = "$CFG->dataroot/search";
$index_dir = get_directory_list($index_path, '', false, false);
$index_filecount = count($index_dir);

//check if the table exists in the db
$tables = $db->MetaTables();
// array_search() returns the matching KEY, which is 0 (falsy) when the table
// happens to be listed first, so the result must be compared strictly.
// The null check covers a failed lookup on a non-array table list.
$table_key = array_search('search_documents', $tables);
if ($table_key !== false and $table_key !== null) {
    $db_count = count_records($CFG->prefix.'search_documents');
} else {
    $db_count = 0;
} //else

//elaborate on error messages, when db!=0 and index=0 -> corrupt, etc.
if ($index_filecount != 0 or $db_count != 0) {
    mtrace("<pre>The data directory ($index_path) contains $index_filecount files, and "
          ."there are $db_count records in the <em>search_documents</em> table.");
    mtrace('');
    mtrace("This indicates that you have already indexed this site - click the following "
          ."link if you're sure you want to continue: <a href='indexer.php?areyousure=yes'>Go!</a>");
    mtrace('');
    mtrace("<a href='index.php'>Back to query page</a>.");
    mtrace("</pre>");
} else {
    // Nothing indexed yet: no confirmation needed.
    header('Location: indexer.php?areyousure=yes');
} //else
?>

59
search/lib.php Normal file
View File

@ -0,0 +1,59 @@
<?php
//Move this stuff to lib/searchlib.php?
// Author: Michael Champanis
//document types that can be searched
// 'none' is also the default value of the `type` column in the
// search_documents schema files.
define('SEARCH_NO_TYPE', 'none');
// Type assigned to wiki page documents (see WikiSearchDocument).
define('SEARCH_WIKI_TYPE', 'wiki');
//returns all the document type constants
function search_get_document_types() {
    // Every known document type constant, wiki first.
    return array(SEARCH_WIKI_TYPE, SEARCH_NO_TYPE);
} //search_get_document_types
//shortens a url so it can fit on the results page
function search_shorten_url($url, $length=30) {
    // Returns $url cut down to at most $length characters, with "..."
    // appended only when something was actually cut off. A URL that already
    // fits is returned unchanged.
    $url = (string) $url;

    // The cast guards against substr() returning false for an empty string
    // on older PHP versions.
    $short = (string) substr($url, 0, $length);

    if ($short === $url) {
        return $url;
    }
    return $short."...";
} //search_shorten_url
//get a real php 5 version number, using 5.0.0 arbitrarily
function search_check_php5($feedback=false) {
    // Returns true when the running PHP is at least 5.0.0. Otherwise returns
    // false, after printing an explanatory heading if $feedback is set.
    if (check_php_version("5.0.0")) {
        return true;
    } //if

    if ($feedback) {
        $phpversion = phpversion();
        print_heading("Sorry, global search requires PHP 5.0.0 or later (currently using version $phpversion)");
    } //if

    return false;
} //search_check_php5
//simple timer function, outputs result on 2nd call
function search_stopwatch($cli = false) {
    // Two-phase timer kept in a global: the first call records the start
    // time, the next call prints the elapsed seconds (wrapped in <em> unless
    // $cli is set) and resets the timer.
    $key = 'search_script_start_time';

    if (empty($GLOBALS[$key])) {
        $GLOBALS[$key] = microtime(true);
        return;
    } //if

    $elapsed = round(microtime(true) - $GLOBALS[$key], 6).' seconds';
    print $cli ? $elapsed : '<em>'.$elapsed.'</em>';

    unset($GLOBALS[$key]);
} //search_stopwatch
//print and exit (for debugging)
function search_pexit($str = "") {
    // Debug helper: dumps arrays/objects with print_r(), prints any other
    // non-empty value followed by <br>, then always terminates the script.
    $is_structured = is_array($str) || is_object($str);

    if ($is_structured) {
        print_r($str);
    } elseif ($str) {
        print $str."<br>";
    } //if

    exit(0);
} //search_pexit
?>

116
search/query.php Normal file
View File

@ -0,0 +1,116 @@
<?php
// Search query page: prints the query form, the number of indexed
// documents and, when a query was submitted, the list of hits.
require_once('../config.php');
require_once("$CFG->dirroot/search/lib.php");

//check for php5, but don't die yet (the page header has to be printed
//first; the !$check branch further down ends the page)
if ($check = search_check_php5()) {
    require_once("$CFG->dirroot/search/Zend/Search/Lucene.php");
    require_once("$CFG->dirroot/search/documents/wiki_document.php");

    $query_string = optional_param('query_string', '', PARAM_CLEAN);
    $index_path = "$CFG->dataroot/search";
    $no_index = false; //optimism!

    // Any failure to open the index is treated as "no index yet".
    try {
        $index = new Zend_Search_Lucene($index_path, false);
    } catch(Exception $e) {
        //print $e;
        $no_index = true;
    } //catch
} //if

if (!$site = get_site()) {
    redirect("index.php");
} //if

$strsearch = "Search"; //get_string();
$strquery = "Enter your search query"; //get_string();

print_header("$site->shortname: $strsearch: $strquery", "$site->fullname",
             "<a href=\"index.php\">$strsearch</a> -> $strquery");

//keep things pretty, even if php5 isn't available
if (!$check) {
    print_heading(search_check_php5(true));
    print_footer();
    exit(0);
} //if

print_simple_box_start('center', '100%', '', 20);
print_heading($strquery);
print_simple_box_start('center', '', '', 20);

// NOTE(review): $query_string is user input echoed into an HTML attribute
// below and into the results text further down -- confirm that PARAM_CLEAN
// makes it safe in both contexts.
?>
<form name="query" method="get" action="query.php">
<input type="text" name="query_string" length="50" value="<?php print $query_string ?>"/>
&nbsp;<input type="submit" value="Search"/>&nbsp;&nbsp;<a href="query.php?advanced=yes">Advanced search</a>
<a href="stats.php">Statistics</a>
</form>
<br>
<div align="center">
<?php
echo 'Searching: ';
if ($no_index) {
    print "0";
} else {
    print $index->count();
} //else
print ' documents.';

if ($no_index and isadmin()) {
    print "<br><br>Admin: There appears to be no index, click <a href='indexersplash.php'>here</a> to create one.";
} //if
?>
</div>
<?php
print_simple_box_end();

if (!empty($query_string) and !$no_index) {
    print_simple_box_start('center', '50%', 'white', 10);

    // Start the timer; its second call (below) prints the elapsed time.
    search_stopwatch();
    $hits = $index->find(strtolower($query_string));

    print "<br>";
    print count($hits)." results returned for '".$query_string."'.";
    print "<br><br>";
    print "<ol>";

    foreach ($hits as $listing) {
        // Resolve the link builder from each hit's own type rather than
        // from the first hit only, so mixed result sets link correctly.
        // The link is built once and reused for both the anchor and the
        // shortened display URL.
        $link_function = $listing->type.'_make_link';
        $link = $link_function($listing);

        print "<li><a href='".$link."'>$listing->title</a><br>\n"
             ."<em>".search_shorten_url($link, 70)."</em><br>\n"
             ."Type: ".$listing->type.", score: ".round($listing->score, 3)."<br>\n"
             ."<br></li>\n";
    } //foreach

    print "</ol>";
    print_simple_box_end();
?>
<div align="center">
It took <?php search_stopwatch(); ?> to fetch these results.
</div>
<?php
} //if

print_simple_box_end();
print_footer();
?>

91
search/stats.php Normal file
View File

@ -0,0 +1,91 @@
<?php
// Search statistics page: index directory statistics (admins only) and
// per-type document counts from the search_documents table.
require_once('../config.php');
require_once("$CFG->dirroot/search/lib.php");

//check for php5, but don't die yet
if ($check = search_check_php5()) {
    //filesystem stats
    $index_path = "$CFG->dataroot/search";
    $index_size = display_size(get_directory_size($index_path));
    $index_dir = get_directory_list($index_path, '', false, false);
    $index_filecount = count($index_dir);

    //indexed documents stats
    $tables = $db->MetaTables();
    // array_search() returns the matching KEY, which is 0 (falsy) when the
    // table happens to be listed first, so the result must be compared
    // strictly. The null check covers a failed lookup on a non-array list.
    $table_key = array_search('search_documents', $tables);
    if ($table_key !== false and $table_key !== null) {
        $types = search_get_document_types();
        sort($types);

        //total documents
        $type_counts['Total'] = count_records($CFG->prefix.'search_documents');
        foreach($types as $type) {
            $c = count_records($CFG->prefix.'search_documents', 'type', $type);
            $type_counts[$type] = (int)$c;
        } //foreach
    } else {
        $type_counts['Total'] = 0;
    } //else
} //if

if (!$site = get_site()) {
    redirect("index.php");
} //if

$strsearch = "Search"; //get_string();
$strquery = "Search statistics"; //get_string();

print_header("$site->shortname: $strsearch: $strquery", "$site->fullname",
             "<a href=\"index.php\">$strsearch</a> -> $strquery");

//keep things pretty, even if php5 isn't available
if (!$check) {
    print_heading(search_check_php5(true));
    print_footer();
    exit(0);
} //if

print_simple_box_start('center', '100%', '', 20);
print_heading($strquery);
print_simple_box_start('center', '', '', 20);

// Filesystem table. The object is created explicitly instead of relying on
// PHP auto-creating it on the first property assignment.
$table = new stdClass;
$table->tablealign = "center";
$table->align = array ("right", "left");
$table->wrap = array ("nowrap", "nowrap");
$table->cellpadding = 5;
$table->cellspacing = 0;
$table->width = '500';

$table->data[] = array('<strong>Data directory</strong>', '<em><strong>'.$index_path.'</strong></em>');
$table->data[] = array('Files in index directory', $index_filecount);
$table->data[] = array('Total size', $index_size);

if ($index_filecount == 0) {
    $table->data[] = array('Click to create index', "<a href='indexersplash.php'>Indexer</a>");
} //if

// Database table.
$return_of_table = new stdClass;
$return_of_table->tablealign = "center";
$return_of_table->align = array ("right", "left");
$return_of_table->wrap = array ("nowrap", "nowrap");
$return_of_table->cellpadding = 5;
$return_of_table->cellspacing = 0;
$return_of_table->width = '500';

// The header cell now closes its <strong> element; the original emitted a
// second opening tag, leaving the markup unbalanced.
$return_of_table->data[] = array('<strong>Database</strong>', '<em><strong>search_documents</strong></em>');
foreach($type_counts as $key => $value) {
    $return_of_table->data[] = array($key, $value);
} //foreach

// Filesystem details are shown to admins only.
if (isadmin()) {
    print_table($table);
    print_spacer(20);
} //if

print_table($return_of_table);

print_simple_box_end();
print_simple_box_end();
print_footer();
?>