Lucene Zend Implementation update (better handle of UTF8)
authordiml <diml>
Mon, 9 Jul 2007 20:57:29 +0000 (20:57 +0000)
committerdiml <diml>
Mon, 9 Jul 2007 20:57:29 +0000 (20:57 +0000)
36 files changed:
search/Zend/Search/Exception.php
search/Zend/Search/Lucene.php
search/Zend/Search/Lucene/Analysis/Analyzer.php
search/Zend/Search/Lucene/Analysis/Analyzer/Common.php
search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php
search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php
search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php [new file with mode: 0644]
search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php [new file with mode: 0644]
search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php [new file with mode: 0644]
search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php [new file with mode: 0644]
search/Zend/Search/Lucene/Analysis/Token.php
search/Zend/Search/Lucene/Analysis/TokenFilter.php
search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php
search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php [new file with mode: 0644]
search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php [new file with mode: 0644]
search/Zend/Search/Lucene/Document.php
search/Zend/Search/Lucene/Document/Html.php [new file with mode: 0644]
search/Zend/Search/Lucene/Exception.php
search/Zend/Search/Lucene/FSM.php [new file with mode: 0644]
search/Zend/Search/Lucene/FSMAction.php [new file with mode: 0644]
search/Zend/Search/Lucene/Field.php
search/Zend/Search/Lucene/Index/DictionaryLoader.php [new file with mode: 0644]
search/Zend/Search/Lucene/Index/FieldInfo.php
search/Zend/Search/Lucene/Index/SegmentInfo.php
search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php [new file with mode: 0644]
search/Zend/Search/Lucene/Index/SegmentMerger.php [new file with mode: 0644]
search/Zend/Search/Lucene/Index/SegmentWriter.php
search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php [new file with mode: 0644]
search/Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php [new file with mode: 0644]
search/Zend/Search/Lucene/Index/Term.php
search/Zend/Search/Lucene/Index/TermInfo.php
search/Zend/Search/Lucene/Index/Writer.php
search/Zend/Search/Lucene/Interface.php [new file with mode: 0644]
search/Zend/Search/Lucene/PriorityQueue.php [new file with mode: 0644]
search/Zend/Search/Lucene/Proxy.php [new file with mode: 0644]
search/Zend/Search/TODO.txt

index a111cf6..291cc43 100644 (file)
@@ -14,7 +14,7 @@
  *
  * @category   Zend
  * @package    Zend_Search
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 /**
  * Framework base exception
  */
-require_once 'Zend/Exception.php';
+require_once $CFG->dirroot.'/search/Zend/Exception.php';
 
 
 /**
  * @category   Zend
  * @package    Zend_Search
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Exception extends Zend_Exception
index 3e33b7c..1f15c9a 100644 (file)
  *
  * @category   Zend
  * @package    Zend_Search_Lucene
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
 /** Zend_Search_Lucene_Exception */
-require_once 'Zend/Search/Lucene/Exception.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
 
 /** Zend_Search_Lucene_Document */
-require_once 'Zend/Search/Lucene/Document.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document.php';
+
+/** Zend_Search_Lucene_Document_Html */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document/Html.php';
 
 /** Zend_Search_Lucene_Storage_Directory */
-require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/Directory/Filesystem.php';
+
+/** Zend_Search_Lucene_Storage_File_Memory */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/File/Memory.php';
 
 /** Zend_Search_Lucene_Index_Term */
-require_once 'Zend/Search/Lucene/Index/Term.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Term.php';
 
 /** Zend_Search_Lucene_Index_TermInfo */
-require_once 'Zend/Search/Lucene/Index/TermInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/TermInfo.php';
 
 /** Zend_Search_Lucene_Index_SegmentInfo */
-require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
 
 /** Zend_Search_Lucene_Index_FieldInfo */
-require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/FieldInfo.php';
 
 /** Zend_Search_Lucene_Index_Writer */
-require_once 'Zend/Search/Lucene/Index/Writer.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Writer.php';
 
 /** Zend_Search_Lucene_Search_QueryParser */
-require_once 'Zend/Search/Lucene/Search/QueryParser.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParser.php';
 
 /** Zend_Search_Lucene_Search_QueryHit */
-require_once 'Zend/Search/Lucene/Search/QueryHit.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryHit.php';
 
 /** Zend_Search_Lucene_Search_Similarity */
-require_once 'Zend/Search/Lucene/Search/Similarity.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Similarity.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfoPriorityQueue */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php';
+
+
+/** Zend_Search_Lucene_Interface */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Interface.php';
+
+/** Zend_Search_Lucene_Proxy */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Proxy.php';
 
 
 /**
  * @category   Zend
  * @package    Zend_Search_Lucene
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
-class Zend_Search_Lucene
+class Zend_Search_Lucene implements Zend_Search_Lucene_Interface
 {
+    /**
+     * Default field name for search
+     *
+     * Null means search through all fields
+     *
+     * @var string
+     */
+    private static $_defaultSearchField = null;
+
     /**
      * File system adapter.
      *
@@ -103,6 +128,51 @@ class Zend_Search_Lucene
      */
     private $_hasChanges = false;
 
+
+    /**
+     * Index lock object
+     *
+     * @var Zend_Search_Lucene_Storage_File
+     */
+    private $_lock;
+
+    /**
+     * Flag indicating that the index is already closed: changes are committed and resources are cleaned up
+     *
+     * @var boolean
+     */
+    private $_closed = false;
+
+    /**
+     * Number of references to the index object
+     *
+     * @var integer
+     */
+    private $_refCount = 0;
+
+
+    /**
+     * Create index
+     *
+     * @param mixed $directory
+     * @return Zend_Search_Lucene_Interface
+     */
+    public static function create($directory)
+    {
+        return new Zend_Search_Lucene_Proxy(new Zend_Search_Lucene($directory, true));
+    }
+
+    /**
+     * Open index
+     *
+     * @param mixed $directory
+     * @return Zend_Search_Lucene_Interface
+     */
+    public static function open($directory)
+    {
+        return new Zend_Search_Lucene_Proxy(new Zend_Search_Lucene($directory, false));
+    }
+
     /**
      * Opens the index.
      *
@@ -126,13 +196,32 @@ class Zend_Search_Lucene
             $this->_closeDirOnExit = true;
         }
 
+
+        // Create the index lock file object; the lock itself is acquired below (exclusive on create, shared on open)
+        $this->_lock = $this->_directory->createFile('index.lock');
+
+        $this->_segmentInfos = array();
+
         if ($create) {
-            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true);
+            // Throw an exception if index is under processing now
+            if (!$this->_lock->lock(LOCK_EX, true)) {
+                throw new Zend_Search_Lucene_Exception('Can\'t create index. It\'s under processing now');
+            }
+
+            // Writer will create segments file for empty segments list
+            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, $this->_segmentInfos, true);
+
+            if (!$this->_lock->lock(LOCK_SH)) {
+                throw new Zend_Search_Lucene_Exception('Can\'t reduce lock level from Exclusive to Shared');
+            }
         } else {
+            // Wait if the index is being switched from one set of segments to another (Index_Writer::_updateSegments())
+            if (!$this->_lock->lock(LOCK_SH)) {
+                throw new Zend_Search_Lucene_Exception('Can\'t obtain shared index lock');
+            }
             $this->_writer = null;
         }
 
-        $this->_segmentInfos = array();
 
         $segmentsFile = $this->_directory->getFileObject('segments');
 
@@ -143,9 +232,10 @@ class Zend_Search_Lucene
         }
 
         // read version
-        $segmentsFile->readLong();
+        // $segmentsFile->readLong();
+        $segmentsFile->readInt(); $segmentsFile->readInt();
 
-        // read counter
+        // read segment name counter
         $segmentsFile->readInt();
 
         $segments = $segmentsFile->readInt();
@@ -158,35 +248,83 @@ class Zend_Search_Lucene
             $segSize = $segmentsFile->readInt();
             $this->_docCount += $segSize;
 
-            $this->_segmentInfos[$count] =
+            $this->_segmentInfos[] =
                                 new Zend_Search_Lucene_Index_SegmentInfo($segName,
                                                                          $segSize,
                                                                          $this->_directory);
         }
     }
 
-
     /**
-     * Object destructor
+     * Close current index and free resources
      */
-    public function __destruct()
+    private function _close()
     {
+        if ($this->_closed) {
+            // index is already closed and resources are cleaned up
+            return;
+        }
+
         $this->commit();
 
+        // Free shared lock
+        $this->_lock->unlock();
+
         if ($this->_closeDirOnExit) {
             $this->_directory->close();
         }
+
+        $this->_directory    = null;
+        $this->_writer       = null;
+        $this->_segmentInfos = null;
+
+        $this->_closed = true;
+    }
+
+    /**
+     * Add reference to the index object
+     *
+     * @internal
+     */
+    public function addReference()
+    {
+        $this->_refCount++;
+    }
+
+    /**
+     * Remove reference from the index object
+     *
+     * When reference count becomes zero, index is closed and resources are cleaned up
+     *
+     * @internal
+     */
+    public function removeReference()
+    {
+        $this->_refCount--;
+
+        if ($this->_refCount == 0) {
+            $this->_close();
+        }
+    }
+
+    /**
+     * Object destructor
+     */
+    public function __destruct()
+    {
+        $this->_close();
     }
 
     /**
      * Returns an instance of Zend_Search_Lucene_Index_Writer for the index
      *
+     * @internal
      * @return Zend_Search_Lucene_Index_Writer
      */
     public function getIndexWriter()
     {
         if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
-            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
+            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, $this->_segmentInfos);
         }
 
         return $this->_writer;
@@ -205,7 +343,7 @@ class Zend_Search_Lucene
 
 
     /**
-     * Returns the total number of documents in this index.
+     * Returns the total number of documents in this index (including deleted documents).
      *
      * @return integer
      */
@@ -214,6 +352,192 @@ class Zend_Search_Lucene
         return $this->_docCount;
     }
 
+    /**
+     * Returns one greater than the largest possible document number.
+     * This may be used to, e.g., determine how big to allocate a structure which will have
+     * an element for every document number in an index.
+     *
+     * @return integer
+     */
+    public function maxDoc()
+    {
+        return $this->count();
+    }
+
+    /**
+     * Returns the total number of non-deleted documents in this index.
+     *
+     * @return integer
+     */
+    public function numDocs()
+    {
+        $numDocs = 0;
+
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            $numDocs += $segmentInfo->numDocs();
+        }
+
+        return $numDocs;
+    }
+
+    /**
+     * Checks whether the document is deleted
+     *
+     * @param integer $id
+     * @return boolean
+     * @throws Zend_Search_Lucene_Exception    Exception is thrown if $id is out of the range
+     */
+    public function isDeleted($id)
+    {
+        if ($id >= $this->_docCount) {
+            throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
+        }
+
+        $segmentStartId = 0;
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            if ($segmentStartId + $segmentInfo->count() > $id) {
+                break;
+            }
+
+            $segmentStartId += $segmentInfo->count();
+        }
+
+        return $segmentInfo->isDeleted($id - $segmentStartId);
+    }
+
+    /**
+     * Set default search field.
+     *
+     * Null means, that search is performed through all fields by default
+     *
+     * Default value is null
+     *
+     * @param string $fieldName
+     */
+    public static function setDefaultSearchField($fieldName)
+    {
+        self::$_defaultSearchField = $fieldName;
+    }
+
+    /**
+     * Get default search field.
+     *
+     * Null means, that search is performed through all fields by default
+     *
+     * @return string
+     */
+    public static function getDefaultSearchField()
+    {
+        return self::$_defaultSearchField;
+    }
+
+    /**
+     * Retrieve index maxBufferedDocs option
+     *
+     * maxBufferedDocs is the minimum number of documents required before
+     * the buffered in-memory documents are written into a new Segment
+     *
+     * Default value is 10
+     *
+     * @return integer
+     */
+    public function getMaxBufferedDocs()
+    {
+        return $this->getIndexWriter()->maxBufferedDocs;
+    }
+
+    /**
+     * Set index maxBufferedDocs option
+     *
+     * maxBufferedDocs is the minimum number of documents required before
+     * the buffered in-memory documents are written into a new Segment
+     *
+     * Default value is 10
+     *
+     * @param integer $maxBufferedDocs
+     */
+    public function setMaxBufferedDocs($maxBufferedDocs)
+    {
+        $this->getIndexWriter()->maxBufferedDocs = $maxBufferedDocs;
+    }
+
+    /**
+     * Retrieve index maxMergeDocs option
+     *
+     * maxMergeDocs is the largest number of documents ever merged by addDocument().
+     * Small values (e.g., less than 10,000) are best for interactive indexing,
+     * as this limits the length of pauses while indexing to a few seconds.
+     * Larger values are best for batched indexing and speedier searches.
+     *
+     * Default value is PHP_INT_MAX
+     *
+     * @return integer
+     */
+    public function getMaxMergeDocs()
+    {
+        return $this->getIndexWriter()->maxMergeDocs;
+    }
+
+    /**
+     * Set index maxMergeDocs option
+     *
+     * maxMergeDocs is the largest number of documents ever merged by addDocument().
+     * Small values (e.g., less than 10,000) are best for interactive indexing,
+     * as this limits the length of pauses while indexing to a few seconds.
+     * Larger values are best for batched indexing and speedier searches.
+     *
+     * Default value is PHP_INT_MAX
+     *
+     * @param integer $maxMergeDocs
+     */
+    public function setMaxMergeDocs($maxMergeDocs)
+    {
+        $this->getIndexWriter()->maxMergeDocs = $maxMergeDocs;
+    }
+
+    /**
+     * Retrieve index mergeFactor option
+     *
+     * mergeFactor determines how often segment indices are merged by addDocument().
+     * With smaller values, less RAM is used while indexing,
+     * and searches on unoptimized indices are faster,
+     * but indexing speed is slower.
+     * With larger values, more RAM is used during indexing,
+     * and while searches on unoptimized indices are slower,
+     * indexing is faster.
+     * Thus larger values (> 10) are best for batch index creation,
+     * and smaller values (< 10) for indices that are interactively maintained.
+     *
+     * Default value is 10
+     *
+     * @return integer
+     */
+    public function getMergeFactor()
+    {
+        return $this->getIndexWriter()->mergeFactor;
+    }
+
+    /**
+     * Set index mergeFactor option
+     *
+     * mergeFactor determines how often segment indices are merged by addDocument().
+     * With smaller values, less RAM is used while indexing,
+     * and searches on unoptimized indices are faster,
+     * but indexing speed is slower.
+     * With larger values, more RAM is used during indexing,
+     * and while searches on unoptimized indices are slower,
+     * indexing is faster.
+     * Thus larger values (> 10) are best for batch index creation,
+     * and smaller values (< 10) for indices that are interactively maintained.
+     *
+     * Default value is 10
+     *
+     * @param integer $mergeFactor
+     */
+    public function setMergeFactor($mergeFactor)
+    {
+        $this->getIndexWriter()->mergeFactor = $mergeFactor;
+    }
 
     /**
      * Performs a query against the index and returns an array
@@ -221,7 +545,8 @@ class Zend_Search_Lucene
      * Input is a string or Zend_Search_Lucene_Search_Query.
      *
      * @param mixed $query
-     * @return array ZSearchHit
+     * @return array Zend_Search_Lucene_Search_QueryHit
+     * @throws Zend_Search_Lucene_Exception
      */
     public function find($query)
     {
@@ -235,22 +560,115 @@ class Zend_Search_Lucene
 
         $this->commit();
 
-        $hits = array();
+        $hits   = array();
         $scores = array();
+        $ids    = array();
+
+        $query = $query->rewrite($this)->optimize($this);
+
+        $query->execute($this);
+
+        $topScore = 0;
 
-        $docNum = $this->count();
-        for( $count=0; $count < $docNum; $count++ ) {
-            $docScore = $query->score( $count, $this);
+        foreach ($query->matchedDocs() as $id => $num) {
+            $docScore = $query->score($id, $this);
             if( $docScore != 0 ) {
                 $hit = new Zend_Search_Lucene_Search_QueryHit($this);
-                $hit->id = $count;
+                $hit->id = $id;
                 $hit->score = $docScore;
 
-                $hits[] = $hit;
+                $hits[]   = $hit;
+                $ids[]    = $id;
                 $scores[] = $docScore;
+
+                if ($docScore > $topScore) {
+                    $topScore = $docScore;
+                }
+            }
+        }
+
+        if (count($hits) == 0) {
+            // skip sorting, which may cause an error when there are no hits (e.g. on an empty index)
+               return array();
+        }
+
+        if ($topScore > 1) {
+            foreach ($hits as $hit) {
+                $hit->score /= $topScore;
+            }
+        }
+
+        if (func_num_args() == 1) {
+            // sort by scores
+            array_multisort($scores, SORT_DESC, SORT_NUMERIC,
+                            $ids,    SORT_ASC,  SORT_NUMERIC,
+                            $hits);
+        } else {
+            // sort by given field names
+
+            $argList    = func_get_args();
+            $fieldNames = $this->getFieldNames();
+            $sortArgs   = array();
+
+            for ($count = 1; $count < count($argList); $count++) {
+                $fieldName = $argList[$count];
+
+                if (!is_string($fieldName)) {
+                    throw new Zend_Search_Lucene_Exception('Field name must be a string.');
+                }
+
+                if (!in_array($fieldName, $fieldNames)) {
+                    throw new Zend_Search_Lucene_Exception('Wrong field name.');
+                }
+
+                $valuesArray = array();
+                foreach ($hits as $hit) {
+                    try {
+                        $value = $hit->getDocument()->getFieldValue($fieldName);
+                    } catch (Zend_Search_Lucene_Exception $e) {
+                        if (strpos($e->getMessage(), 'not found') === false) {
+                            throw $e;
+                        } else {
+                            $value = null;
+                        }
+                    }
+
+                    $valuesArray[] = $value;
+                }
+
+                $sortArgs[] = $valuesArray;
+
+                if ($count + 1 < count($argList)  &&  is_integer($argList[$count+1])) {
+                    $count++;
+                    $sortArgs[] = $argList[$count];
+
+                    if ($count + 1 < count($argList)  &&  is_integer($argList[$count+1])) {
+                        $count++;
+                        $sortArgs[] = $argList[$count];
+                    } else {
+                        if ($argList[$count] == SORT_ASC  || $argList[$count] == SORT_DESC) {
+                            $sortArgs[] = SORT_REGULAR;
+                        } else {
+                            $sortArgs[] = SORT_ASC;
+                        }
+                    }
+                } else {
+                    $sortArgs[] = SORT_ASC;
+                    $sortArgs[] = SORT_REGULAR;
+                }
             }
+
+            // Sort by id's if values are equal
+            $sortArgs[] = $ids;
+            $sortArgs[] = SORT_ASC;
+            $sortArgs[] = SORT_NUMERIC;
+
+            // Array to be sorted
+            $sortArgs[] = &$hits;
+
+            // Do sort
+            call_user_func_array('array_multisort', $sortArgs);
         }
-        array_multisort($scores, SORT_DESC, SORT_REGULAR, $hits);
 
         return $hits;
     }
@@ -290,41 +708,45 @@ class Zend_Search_Lucene
             throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
         }
 
-        $segCount = 0;
-        $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count();
-        while( $nextSegmentStartId <= $id ) {
-               $segCount++;
-               $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count();
+        $segmentStartId = 0;
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            if ($segmentStartId + $segmentInfo->count() > $id) {
+                break;
+            }
+
+            $segmentStartId += $segmentInfo->count();
         }
-        $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count();
 
-        $fdxFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdx');
+        $fdxFile = $segmentInfo->openCompoundFile('.fdx');
         $fdxFile->seek( ($id-$segmentStartId)*8, SEEK_CUR );
         $fieldValuesPosition = $fdxFile->readLong();
 
-        $fdtFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdt');
-        $fdtFile->seek( $fieldValuesPosition, SEEK_CUR );
+        $fdtFile = $segmentInfo->openCompoundFile('.fdt');
+        $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
         $fieldCount = $fdtFile->readVInt();
 
         $doc = new Zend_Search_Lucene_Document();
-        for( $count = 0; $count < $fieldCount; $count++ ) {
+        for ($count = 0; $count < $fieldCount; $count++) {
             $fieldNum = $fdtFile->readVInt();
             $bits = $fdtFile->readByte();
 
-            $fieldInfo = $this->_segmentInfos[ $segCount ]->getField($fieldNum);
+            $fieldInfo = $segmentInfo->getField($fieldNum);
 
-            if( !($bits & 2) ) { // Text data
+            if (!($bits & 2)) { // Text data
                 $field = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                       $fdtFile->readString(),
+                                                      'UTF-8',
                                                       true,
                                                       $fieldInfo->isIndexed,
                                                       $bits & 1 );
-            } else {
+            } else {            // Binary data
                 $field = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                       $fdtFile->readBinary(),
+                                                      '',
                                                       true,
                                                       $fieldInfo->isIndexed,
-                                                      $bits & 1 );
+                                                      $bits & 1,
+                                                      true );
             }
 
             $doc->addField($field);
@@ -335,7 +757,26 @@ class Zend_Search_Lucene
 
 
     /**
-     * Returns an array of all the documents which contain term.
+     * Returns true if the index contains documents with the specified term.
+     *
+     * Is used for query optimization.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return boolean
+     */
+    public function hasTerm(Zend_Search_Lucene_Index_Term $term)
+    {
+        foreach ($this->_segmentInfos as $segInfo) {
+            if ($segInfo->getTermInfo($term) instanceof Zend_Search_Lucene_Index_TermInfo) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Returns IDs of all the documents containing term.
      *
      * @param Zend_Search_Lucene_Index_Term $term
      * @return array
@@ -377,55 +818,40 @@ class Zend_Search_Lucene
 
 
     /**
-     * Returns an array of all term positions in the documents.
-     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+     * Returns an array of all term freqs.
+     * Result array structure: array(docId => freq, ...)
      *
      * @param Zend_Search_Lucene_Index_Term $term
-     * @return array
+     * @return array
      */
-    public function termPositions(Zend_Search_Lucene_Index_Term $term)
+    public function termFreqs(Zend_Search_Lucene_Index_Term $term)
     {
         $result = array();
         $segmentStartDocId = 0;
-        foreach( $this->_segmentInfos as $segInfo ) {
-            $termInfo = $segInfo->getTermInfo($term);
-
-            if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
-                $segmentStartDocId += $segInfo->count();
-                continue;
-            }
-
-            $frqFile = $segInfo->openCompoundFile('.frq');
-            $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
-            $freqs = array();
-            $docId = 0;
-
-            for( $count = 0; $count < $termInfo->docFreq; $count++ ) {
-                $docDelta = $frqFile->readVInt();
-                if( $docDelta % 2 == 1 ) {
-                    $docId += ($docDelta-1)/2;
-                    $freqs[ $docId ] = 1;
-                } else {
-                    $docId += $docDelta/2;
-                    $freqs[ $docId ] = $frqFile->readVInt();
-                }
-            }
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            $result += $segmentInfo->termFreqs($term, $segmentStartDocId);
 
-            $prxFile = $segInfo->openCompoundFile('.prx');
-            $prxFile->seek($termInfo->proxPointer,SEEK_CUR);
-            foreach ($freqs as $docId => $freq) {
-                $termPosition = 0;
-                $positions = array();
+            $segmentStartDocId += $segmentInfo->count();
+        }
 
-                for ($count = 0; $count < $freq; $count++ ) {
-                    $termPosition += $prxFile->readVInt();
-                    $positions[] = $termPosition;
-                }
+        return $result;
+    }
 
-                $result[ $segmentStartDocId + $docId ] = $positions;
-            }
+    /**
+     * Returns an array of all term positions in the documents.
+     * Result array structure: array(docId => array(pos1, pos2, ...), ...)
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return array
+     */
+    public function termPositions(Zend_Search_Lucene_Index_Term $term)
+    {
+        $result = array();
+        $segmentStartDocId = 0;
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            $result += $segmentInfo->termPositions($term, $segmentStartDocId);
 
-            $segmentStartDocId += $segInfo->count();
+            $segmentStartDocId += $segmentInfo->count();
         }
 
         return $result;
@@ -468,9 +894,9 @@ class Zend_Search_Lucene
      *
      * @param integer $id
      * @param string $fieldName
-     * @return Zend_Search_Lucene_Document
+     * @return float
      */
-    public function norm( $id, $fieldName )
+    public function norm($id, $fieldName)
     {
         if ($id >= $this->_docCount) {
             return null;
@@ -527,16 +953,17 @@ class Zend_Search_Lucene
             throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
         }
 
-        $segCount = 0;
-        $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count();
-        while( $nextSegmentStartId <= $id ) {
-               $segCount++;
-               $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count();
+        $segmentStartId = 0;
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            if ($segmentStartId + $segmentInfo->count() > $id) {
+                break;
+            }
+
+            $segmentStartId += $segmentInfo->count();
         }
+        $segmentInfo->delete($id - $segmentStartId);
 
         $this->_hasChanges = true;
-        $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count();
-        $this->_segmentInfos[ $segCount ]->delete($id - $segmentStartId);
     }
 
 
@@ -548,18 +975,26 @@ class Zend_Search_Lucene
      */
     public function addDocument(Zend_Search_Lucene_Document $document)
     {
-        if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
-            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
-        }
-
-        $this->_writer->addDocument($document);
+        $this->getIndexWriter()->addDocument($document);
+        $this->_docCount++;
     }
 
 
+    /**
+     * Update document counter
+     */
+    private function _updateDocCount()
+    {
+        $this->_docCount = 0;
+        foreach ($this->_segmentInfos as $segInfo) {
+            $this->_docCount += $segInfo->count();
+        }
+    }
+
     /**
      * Commit changes resulting from delete() or undeleteAll() operations.
      *
-     * @todo delete() and undeleteAll processing.
+     * @todo undeleteAll processing.
      */
     public function commit()
     {
@@ -572,38 +1007,73 @@ class Zend_Search_Lucene
         }
 
         if ($this->_writer !== null) {
-            foreach ($this->_writer->commit() as $segmentName => $segmentInfo) {
-                if ($segmentInfo !== null) {
-                    $this->_segmentInfos[] = $segmentInfo;
-                    $this->_docCount += $segmentInfo->count();
-                } else {
-                    foreach ($this->_segmentInfos as $segId => $segInfo) {
-                        if ($segInfo->getName() == $segmentName) {
-                            unset($this->_segmentInfos[$segId]);
-                        }
-                    }
-                }
-            }
+            $this->_writer->commit();
+
+            $this->_updateDocCount();
         }
     }
 
 
-    /*************************************************************************
-    @todo UNIMPLEMENTED
-    *************************************************************************/
+    /**
+     * Optimize index.
+     *
+     * Commits pending changes, then merges all segments into one when the
+     * index has more than one segment or contains deletions.
+     */
+    public function optimize()
+    {
+        // Flush pending changes before deciding whether a merge is needed
+        $this->commit();
+
+        $hasSeveralSegments = count($this->_segmentInfos) > 1;
+        if (!$hasSeveralSegments && !$this->hasDeletions()) {
+            // Single segment without deletions: nothing to do
+            return;
+        }
+
+        $this->getIndexWriter()->optimize();
+        $this->_updateDocCount();
+    }
+
 
     /**
      * Returns an array of all terms in this index.
      *
-     * @todo Implementation
      * @return array
      */
     public function terms()
     {
-        return array();
+        $result = array();
+
+        // Merge the term streams of all segments through a priority queue
+        // NOTE(review): presumably the queue orders segments by their current term - confirm in SegmentInfoPriorityQueue
+        $segmentInfoQueue = new Zend_Search_Lucene_Index_SegmentInfoPriorityQueue();
+
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            $segmentInfo->reset();
+
+            // Skip "empty" segments
+            if ($segmentInfo->currentTerm() !== null) {
+                $segmentInfoQueue->put($segmentInfo);
+            }
+        }
+
+        while (($segmentInfo = $segmentInfoQueue->pop()) !== null) {
+            // Record the term only if the next queued segment is not positioned on
+            // the same term key, so a term present in several segments is added once
+            if ($segmentInfoQueue->top() === null ||
+                $segmentInfoQueue->top()->currentTerm()->key() !=
+                            $segmentInfo->currentTerm()->key()) {
+                // We got new term
+                $result[] = $segmentInfo->currentTerm();
+            }
+
+            $segmentInfo->nextTerm();
+            // check, if segment dictionary is finished
+            if ($segmentInfo->currentTerm() !== null) {
+                // Put segment back into the priority queue
+                $segmentInfoQueue->put($segmentInfo);
+            }
+        }
+
+        return $result;
     }
 
 
+    /*************************************************************************
+    @todo UNIMPLEMENTED
+    *************************************************************************/
     /**
      * Undeletes all documents currently marked as deleted in this index.
      *
@@ -611,4 +1081,4 @@ class Zend_Search_Lucene
      */
     public function undeleteAll()
     {}
-}
\ No newline at end of file
+}
index febf88e..e57f6a5 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
 /** Zend_Search_Lucene_Analysis_Token */
-require_once 'Zend/Search/Lucene/Analysis/Token.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Token.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
 
 /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
 
 /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
+
+/** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
 
+/** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';
 
 
 /**
@@ -44,7 +61,7 @@ require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.p
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
@@ -55,16 +72,74 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer
      *
      * @var Zend_Search_Lucene_Analysis_Analyzer
      */
-    static private $_defaultImpl;
+    private static $_defaultImpl;
+
+    /**
+     * Input string
+     *
+     * @var string
+     */
+    protected $_input = null;
+
+    /**
+     * Input string encoding
+     *
+     * @var string
+     */
+    protected $_encoding = '';
 
     /**
      * Tokenize text to a terms
      * Returns array of Zend_Search_Lucene_Analysis_Token objects
      *
+     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
+     *
      * @param string $data
+     * @param string $encoding
      * @return array
      */
-    abstract public function tokenize($data);
+    public function tokenize($data, $encoding = '')
+    {
+        $this->setInput($data, $encoding);
+
+        // Drain the token stream: nextToken() returns null at the end of stream
+        $tokenList = array();
+        while (($nextToken = $this->nextToken()) !== null) {
+            $tokenList[] = $nextToken;
+        }
+
+        return $tokenList;
+    }
+
+
+    /**
+     * Tokenization stream API
+     * Set input
+     *
+     * Stores the input string and its encoding, then resets the token stream.
+     *
+     * @param string $data
+     * @param string $encoding
+     */
+    public function setInput($data, $encoding = '')
+    {
+        $this->_encoding = $encoding;
+        $this->_input    = $data;
+
+        // reset() goes last so that it sees the values stored above
+        $this->reset();
+    }
+
+    /**
+     * Reset token stream
+     */
+    abstract public function reset();
+
+    /**
+     * Tokenization stream API
+     * Get next token
+     * Returns null at the end of stream
+     *
+     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    abstract public function nextToken();
+
+
 
 
     /**
@@ -72,7 +147,7 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer
      *
-     * @param Zend_Search_Lucene_Analysis_Analyzer $similarity
+     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
      */
-    static public function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
+    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
     {
         self::$_defaultImpl = $analyzer;
     }
@@ -83,7 +158,7 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer
      *
      * @return Zend_Search_Lucene_Analysis_Analyzer
      */
-    static public function getDefault()
+    public static function getDefault()
     {
         if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
             self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
@@ -91,6 +166,5 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer
 
         return self::$_defaultImpl;
     }
-
 }
 
index 2ad8a05..c518e93 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
 /** Zend_Search_Lucene_Analysis_Analyzer */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer.php';
 
 
 /**
@@ -34,7 +34,7 @@ require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_Lucene_Analysis_Analyzer
@@ -58,7 +58,7 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_L
     }
 
     /**
-     * Apply filters to the token.
+     * Apply filters to the token. Can return null when the token was removed.
      *
      * @param Zend_Search_Lucene_Analysis_Token $token
-     * @return Zend_Search_Lucene_Analysis_Token
+     * @return Zend_Search_Lucene_Analysis_Token|null
@@ -67,6 +67,11 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_L
     {
         foreach ($this->_filters as $filter) {
             $token = $filter->normalize($token);
+
+            // resulting token can be null if the filter removed it
+            if (is_null($token)) {
+                return null;
+            }
         }
 
         return $token;
index 6f6f0dd..d084ebc 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
 /** Zend_Search_Lucene_Analysis_Analyzer_Common */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
 
 
 /**
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common
 {
     /**
-     * Tokenize text to a terms
-     * Returns array of Zend_Search_Lucene_Analysis_Token objects
+     * Current position in a stream
      *
-     * @param string $data
-     * @return array
+     * @var integer
      */
-    public function tokenize($data)
+    private $_position;
+
+    /**
+     * Reset token stream
+     */
+    public function reset()
     {
-        $tokenStream = array();
+        $this->_position = 0;
 
-        $position = 0;
-        while ($position < strlen($data)) {
-            // skip white space
-            while ($position < strlen($data) && !ctype_alpha( $data{$position} )) {
-                $position++;
-            }
+        if ($this->_input === null) {
+            return;
+        }
 
-            $termStartPosition = $position;
+        // convert input into ascii
+        $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
+        $this->_encoding = 'ASCII';
+    }
 
-            // read token
-            while ($position < strlen($data) && ctype_alpha( $data{$position} )) {
-                $position++;
-            }
+    /**
+     * Tokenization stream API
+     * Get next token
+     * Returns null at the end of stream
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
 
-            // Empty token, end of stream.
-            if ($position == $termStartPosition) {
-                break;
+
+        do {
+            if (! preg_match('/[a-zA-Z]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position)) {
+                // It covers both cases a) there are no matches (preg_match(...) === 0)
+                // b) error occurred (preg_match(...) === FALSE)
+               return null;
             }
 
-            $token = new Zend_Search_Lucene_Analysis_Token(substr($data,
-                                             $termStartPosition,
-                                             $position-$termStartPosition),
-                                      $termStartPosition,
-                                      $position);
-            $tokenStream[] = $this->normalize($token);
-        }
+            $str = $match[0][0];
+            $pos = $match[0][1];
+            $endpos = $pos + strlen($str);
+
+            // the next search starts right after this match
+            $this->_position = $endpos;
+
+            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($str, $pos, $endpos));
+        } while ($token === null); // try again if token is skipped
 
-        return $tokenStream;
+        return $token;
     }
 }
 
index e5fc372..d9f786a 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
 /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
 
 /** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
-require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
 
 
 /**
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php
new file mode 100644 (file)
index 0000000..d68b594
--- /dev/null
@@ -0,0 +1,92 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Offset in the input from which the next token is searched
+     *
+     * @var integer
+     */
+    private $_position;
+
+    /**
+     * Reset token stream
+     *
+     * Rewinds the position and transliterates a non-null input to ASCII.
+     */
+    public function reset()
+    {
+        $this->_position = 0;
+
+        if ($this->_input !== null) {
+            // the token pattern only knows ASCII letters and digits
+            $this->_input    = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
+            $this->_encoding = 'ASCII';
+        }
+    }
+
+    /**
+     * Tokenization stream API
+     * Get next token
+     * Returns null at the end of stream
+     *
+     * Tokens removed by the filters are skipped.
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
+
+        while (true) {
+            $found = preg_match('/[a-zA-Z0-9]+/', $this->_input, $matches, PREG_OFFSET_CAPTURE, $this->_position);
+            if (!$found) {
+                // Either nothing is left to match (0) or an error occurred (FALSE)
+                return null;
+            }
+
+            list($termText, $startOffset) = $matches[0];
+            $endOffset = $startOffset + strlen($termText);
+
+            // the next search starts right after this match
+            $this->_position = $endOffset;
+
+            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($termText, $startOffset, $endOffset));
+            if ($token !== null) {
+                return $token;
+            }
+            // token was removed by a filter - try the next one
+        }
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php
new file mode 100644 (file)
index 0000000..6eab437
--- /dev/null
@@ -0,0 +1,46 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
+
+/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum
+{
+    /**
+     * Installs the LowerCase token filter on this analyzer
+     */
+    public function __construct()
+    {
+        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();
+        $this->addFilter($lowerCaseFilter);
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php
new file mode 100644 (file)
index 0000000..674a3d9
--- /dev/null
@@ -0,0 +1,169 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Current char position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_position;
+
+    /**
+     * Current binary position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_bytePosition;
+
+    /**
+     * Stream length in characters (as reported by iconv_strlen)
+     *
+     * @var integer
+     */
+    private $_streamLength;
+
+    /**
+     * Reset token stream
+     *
+     * Rewinds both positions, converts a non-UTF-8 input into UTF-8 and
+     * measures the stream length in characters.
+     */
+    public function reset()
+    {
+        $this->_position     = 0;
+        $this->_bytePosition = 0;
+
+        if ($this->_input === null) {
+            // No input yet: nothing to convert or measure, nextToken() returns null
+            $this->_streamLength = 0;
+            return;
+        }
+
+        // convert input into UTF-8
+        if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
+            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
+                $this->_input = iconv($this->_encoding, 'UTF-8', $this->_input);
+                $this->_encoding = 'UTF-8';
+        }
+
+        // Get UTF-8 string length.
+        // It also checks if it's a correct utf-8 string
+        $this->_streamLength = iconv_strlen($this->_input, 'UTF-8');
+    }
+
+    /**
+     * Check, that character is a letter
+     *
+     * Every multi-byte character is treated as a letter; single-byte
+     * characters are checked with ctype_alpha().
+     *
+     * @param string $char
+     * @return boolean
+     */
+    private static function _isAlpha($char)
+    {
+        if (strlen($char) > 1) {
+            // It's an UTF-8 character
+            return true;
+        }
+
+        return ctype_alpha($char);
+    }
+
+    /**
+     * Get next UTF-8 char
+     *
+     * Advances the byte position by the size of the character and the
+     * char position by one.
+     *
+     * @return string
+     */
+    private function _nextChar()
+    {
+        $char = $this->_input[$this->_bytePosition++];
+
+        // A lead byte (11xxxxxx) announces 1 to 3 continuation bytes
+        if (( ord($char) & 0xC0 ) == 0xC0) {
+            $addBytes = 1;
+            if (ord($char) & 0x20 ) {
+                $addBytes++;
+                if (ord($char) & 0x10 ) {
+                    $addBytes++;
+                }
+            }
+            $char .= substr($this->_input, $this->_bytePosition, $addBytes);
+            $this->_bytePosition += $addBytes;
+        }
+
+        $this->_position++;
+
+        return $char;
+    }
+
+    /**
+     * Tokenization stream API
+     * Get next token
+     * Returns null at the end of stream
+     *
+     * Token offsets are character positions; the end offset is exclusive.
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
+
+        while ($this->_position < $this->_streamLength) {
+            // skip non-letter characters
+            while ($this->_position < $this->_streamLength &&
+                   !self::_isAlpha($char = $this->_nextChar())) {
+                $char = '';
+            }
+
+            $termStartPosition = $this->_position - 1;
+            $termText = $char;
+
+            // read token
+            while ($this->_position < $this->_streamLength &&
+                   self::_isAlpha($char = $this->_nextChar())) {
+                $termText .= $char;
+            }
+
+            // Empty token, end of stream.
+            if ($termText === '') {
+                return null;
+            }
+
+            // Derive the end offset from the token length: "_position - 1" is only
+            // right when a delimiter was consumed and is one short for a token
+            // that runs up to the end of the stream.
+            $token = new Zend_Search_Lucene_Analysis_Token(
+                                      $termText,
+                                      $termStartPosition,
+                                      $termStartPosition + iconv_strlen($termText, 'UTF-8'));
+            $token = $this->normalize($token);
+            if ($token !== null) {
+                return $token;
+            }
+            // Continue if token is skipped
+        }
+
+        return null;
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php
new file mode 100644 (file)
index 0000000..982b554
--- /dev/null
@@ -0,0 +1,169 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Current char position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_position;
+
+    /**
+     * Current binary position in an UTF-8 stream
+     *
+     * @var integer
+     */
+    private $_bytePosition;
+
+    /**
+     * Stream length in characters (as reported by iconv_strlen)
+     *
+     * @var integer
+     */
+    private $_streamLength;
+
+    /**
+     * Reset token stream
+     *
+     * Rewinds both positions, converts a non-UTF-8 input into UTF-8 and
+     * measures the stream length in characters.
+     */
+    public function reset()
+    {
+        $this->_position     = 0;
+        $this->_bytePosition = 0;
+
+        if ($this->_input === null) {
+            // No input yet: nothing to convert or measure, nextToken() returns null
+            $this->_streamLength = 0;
+            return;
+        }
+
+        // convert input into UTF-8
+        if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
+            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
+                $this->_input = iconv($this->_encoding, 'UTF-8', $this->_input);
+                $this->_encoding = 'UTF-8';
+        }
+
+        // Get UTF-8 string length.
+        // It also checks if it's a correct utf-8 string
+        $this->_streamLength = iconv_strlen($this->_input, 'UTF-8');
+    }
+
+    /**
+     * Check, that character is a letter or a digit
+     *
+     * Every multi-byte character is accepted; single-byte characters are
+     * checked with ctype_alnum().
+     *
+     * @param string $char
+     * @return boolean
+     */
+    private static function _isAlNum($char)
+    {
+        if (strlen($char) > 1) {
+            // It's an UTF-8 character
+            return true;
+        }
+
+        return ctype_alnum($char);
+    }
+
+    /**
+     * Get next UTF-8 char
+     *
+     * Advances the byte position by the size of the character and the
+     * char position by one.
+     *
+     * @return string
+     */
+    private function _nextChar()
+    {
+        $char = $this->_input[$this->_bytePosition++];
+
+        // A lead byte (11xxxxxx) announces 1 to 3 continuation bytes
+        if (( ord($char) & 0xC0 ) == 0xC0) {
+            $addBytes = 1;
+            if (ord($char) & 0x20 ) {
+                $addBytes++;
+                if (ord($char) & 0x10 ) {
+                    $addBytes++;
+                }
+            }
+            $char .= substr($this->_input, $this->_bytePosition, $addBytes);
+            $this->_bytePosition += $addBytes;
+        }
+
+        $this->_position++;
+
+        return $char;
+    }
+
+    /**
+     * Tokenization stream API
+     * Get next token
+     * Returns null at the end of stream
+     *
+     * Token offsets are character positions; the end offset is exclusive.
+     *
+     * @return Zend_Search_Lucene_Analysis_Token|null
+     */
+    public function nextToken()
+    {
+        if ($this->_input === null) {
+            return null;
+        }
+
+        while ($this->_position < $this->_streamLength) {
+            // skip characters that are neither letters nor digits
+            while ($this->_position < $this->_streamLength &&
+                   !self::_isAlNum($char = $this->_nextChar())) {
+                $char = '';
+            }
+
+            $termStartPosition = $this->_position - 1;
+            $termText = $char;
+
+            // read token
+            while ($this->_position < $this->_streamLength &&
+                   self::_isAlNum($char = $this->_nextChar())) {
+                $termText .= $char;
+            }
+
+            // Empty token, end of stream.
+            if ($termText === '') {
+                return null;
+            }
+
+            // Derive the end offset from the token length: "_position - 1" is only
+            // right when a delimiter was consumed and is one short for a token
+            // that runs up to the end of the stream.
+            $token = new Zend_Search_Lucene_Analysis_Token(
+                                      $termText,
+                                      $termStartPosition,
+                                      $termStartPosition + iconv_strlen($termText, 'UTF-8'));
+            $token = $this->normalize($token);
+            if ($token !== null) {
+                return $token;
+            }
+            // Continue if token is skipped
+        }
+
+        return null;
+    }
+}
+
index f2e9ee7..91586b0 100644 (file)
@@ -15,7 +15,7 @@
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
@@ -24,7 +24,7 @@
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Analysis_Token
@@ -50,13 +50,6 @@ class Zend_Search_Lucene_Analysis_Token
      */
     private $_endOffset;
 
-    /**
-     * Lexical type.
-     *
-     * @var string
-     */
-    private $_type;
-
     /**
      * The position of this token relative to the previous Token.
      *
@@ -90,12 +83,11 @@ class Zend_Search_Lucene_Analysis_Token
      * @param integer $end
-     * @param string  $type
      */
-    public function __construct($text, $start, $end, $type = 'word' )
+    public function __construct($text, $start, $end)
     {
         $this->_termText    = $text;
         $this->_startOffset = $start;
         $this->_endOffset   = $end;
-        $this->_type        = $type;
 
         $this->_positionIncrement = 1;
     }
@@ -157,15 +149,5 @@ class Zend_Search_Lucene_Analysis_Token
     {
         return $this->_endOffset;
     }
-
-    /**
-     * Returns this Token's lexical type.  Defaults to 'word'.
-     *
-     * @return string
-     */
-    public function getType()
-    {
-        return $this->_type;
-    }
 }
 
index a363aa1..4d559a9 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
 /** Zend_Search_Lucene_Analysis_Token */
-require_once 'Zend/Search/Lucene/Analysis/Token.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Token.php';
 
 
 /**
@@ -30,7 +30,7 @@ require_once 'Zend/Search/Lucene/Analysis/Token.php';
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
index 5ea1edf..01d25c6 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
 /** Zend_Search_Lucene_Analysis_TokenFilter */
-require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php';
 
 
 /**
@@ -30,7 +30,7 @@ require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Analysis
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
@@ -44,10 +44,10 @@ class Zend_Search_Lucene_Analysis_TokenFilter_LowerCase extends Zend_Search_Luce
      */
     public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
     {
-        $newToken = new Zend_Search_Lucene_Analysis_Token(strtolower( $srcToken->getTermText() ),
+        $newToken = new Zend_Search_Lucene_Analysis_Token(
+                                     strtolower( $srcToken->getTermText() ),
                                      $srcToken->getStartOffset(),
-                                     $srcToken->getEndOffset(),
-                                     $srcToken->getType());
+                                     $srcToken->getEndOffset());
 
         $newToken->setPositionIncrement($srcToken->getPositionIncrement());
 
diff --git a/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php b/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php
new file mode 100644 (file)
index 0000000..83abfb2
--- /dev/null
@@ -0,0 +1,68 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_TokenFilter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php';
+
+
+/**
+ * Token filter that removes short words. The minimum length a word must have to pass is set in the constructor.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_TokenFilter_ShortWords extends Zend_Search_Lucene_Analysis_TokenFilter
+{
+    /**
+     * Shortest term length, as measured by strlen(), that survives the filter
+     *
+     * @var integer
+     */
+    private $length;
+
+    /**
+     * Creates the filter and remembers its length threshold.
+     *
+     * @param integer $length  terms shorter than this are dropped (default 2)
+     */
+    public function __construct($length = 2) {
+        $this->length = $length;
+    }
+
+    /**
+     * Passes a token through unchanged, or drops it when its term text is too short.
+     *
+     * @param Zend_Search_Lucene_Analysis_Token $srcToken
+     * @return Zend_Search_Lucene_Analysis_Token|null  the same token, or null when it is filtered out
+     */
+    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
+        // strlen() counts bytes, so a multi-byte character contributes more than 1 to the length.
+        $termSize = strlen($srcToken->getTermText());
+
+        return ($termSize < $this->length) ? null : $srcToken;
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php b/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php
new file mode 100644 (file)
index 0000000..f85d5d5
--- /dev/null
@@ -0,0 +1,101 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Analysis_TokenFilter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Exception.php';
+
+
+/**
+ * Token filter that removes stop words. These words must be provided as a plain list, example:
+ * $stopwords = array('the', 'an');
+ *
+ * We do recommend to provide all words in lowercase and concatenate this class after the lowercase filter.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+class Zend_Search_Lucene_Analysis_TokenFilter_StopWords extends Zend_Search_Lucene_Analysis_TokenFilter
+{
+    /**
+     * Stop words set, keyed by the word itself (the stored values are never read)
+     * @var array
+     */
+    private $_stopSet;
+
+    /**
+     * Constructs new instance of this filter.
+     *
+     * The list is flipped so that every word becomes an array key; normalize()
+     * looks terms up by key.
+     *
+     * @param array $stopwords list of words that will be filtered out, e.g. array('the', 'an')
+     */
+    public function __construct($stopwords = array()) {
+        $this->_stopSet = array_flip($stopwords);
+    }
+
+    /**
+     * Normalize Token or remove it (if null is returned)
+     *
+     * The comparison is an exact, case-sensitive match against the stop set.
+     *
+     * @param Zend_Search_Lucene_Analysis_Token $srcToken
+     * @return Zend_Search_Lucene_Analysis_Token|null  the unchanged token, or null for a stop word
+     */
+    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
+        if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) {
+            return null;
+        }
+
+        return $srcToken;
+    }
+
+    /**
+     * Fills stopwords set from a text file. Each line contains one stopword. Lines are trimmed first;
+     * empty lines and lines whose first remaining character is '#' are ignored (as comments).
+     *
+     * You can call this method one or more times. New stopwords are always added to current set.
+     *
+     * @param string $filepath full path for text file with stopwords
+     * @throws Zend_Search_Exception When the file doesn`t exists or is not readable.
+     */
+    public function loadFromFile($filepath = null) {
+        if (! $filepath || ! file_exists($filepath)) {
+            throw new Zend_Search_Exception('You have to provide valid file path');
+        }
+        $fd = fopen($filepath, "r");
+        if (! $fd) {
+            throw new Zend_Search_Exception('Cannot open file ' . $filepath);
+        }
+        // Loop on the result of fgets() rather than on feof(): a failed read returns
+        // false without necessarily setting EOF, which would make a feof() loop spin forever.
+        while (($line = fgets($fd)) !== false) {
+            $buffer = trim($line);
+            if (strlen($buffer) > 0 && $buffer[0] != '#') {
+                $this->_stopSet[$buffer] = 1;
+            }
+        }
+        if (!fclose($fd)) {
+            throw new Zend_Search_Exception('Cannot close file ' . $filepath);
+        }
+    }
+}
+
index 48e48cf..6309719 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Document
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
 /** Zend_Search_Lucene_Field */
-require_once 'Zend/Search/Lucene/Field.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Field.php';
 
 
 /**
@@ -30,7 +30,7 @@ require_once 'Zend/Search/Lucene/Field.php';
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Document
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Document
@@ -90,9 +90,9 @@ class Zend_Search_Lucene_Document
      */
     public function getField($fieldName)
     {
-               if (!array_key_exists($fieldName, $this->_fields)) {
-                       throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document.");
-               }
+        if (!array_key_exists($fieldName, $this->_fields)) {
+            throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document.");
+        }
         return $this->_fields[$fieldName];
     }
 
@@ -105,7 +105,17 @@ class Zend_Search_Lucene_Document
      */
     public function getFieldValue($fieldName)
     {
-       return $this->getField($fieldName)->stringValue;
+       return $this->getField($fieldName)->value;
     }
 
+    /**
+     * Returns the string value of a named field in UTF-8 encoding.
+     *
+     * The field is looked up with getField() and its getUtf8Value() result is returned.
+     *
+     * @param string $fieldName
+     * @return string
+     * @throws Zend_Search_Lucene_Exception when the document has no field with that name
+     */
+    public function getFieldUtf8Value($fieldName)
+    {
+        $field = $this->getField($fieldName);
+
+        return $field->getUtf8Value();
+    }
 }
diff --git a/search/Zend/Search/Lucene/Document/Html.php b/search/Zend/Search/Lucene/Document/Html.php
new file mode 100644 (file)
index 0000000..c10c823
--- /dev/null
@@ -0,0 +1,310 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Document
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Document */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document.php';
+
+
+/**
+ * HTML document.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Document
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
+{
+    /**
+     * List of document links
+     *
+     * @var array
+     */
+    private $_links = array();
+
+    /**
+     * List of document header links
+     *
+     * @var array
+     */
+    private $_headerLinks = array();
+
+    /**
+     * Stored DOM representation
+     *
+     * @var DOMDocument
+     */
+    private $_doc;
+
+    /**
+     * Object constructor
+     *
+     * Parses the HTML into a DOM document and fills this Lucene document from it:
+     * a 'title' field, one field per <meta name="..."> tag in the head, a 'body'
+     * field with the text found under <body> (scripts excluded), and the lists
+     * of <a> and header <link> hrefs.
+     *
+     * @param string  $data          HTML markup, or a file name when $isFile is true
+     * @param boolean $isFile        true to load $data as a file, false to parse it as markup
+     * @param boolean $storeContent  true to add the body as a Text field, false for UnStored
+     */
+    private function __construct($data, $isFile, $storeContent)
+    {
+        $this->_doc = new DOMDocument();
+        $this->_doc->substituteEntities = true;
+
+        // Parser warnings are deliberately silenced with @.
+        if (!$isFile) {
+            @$this->_doc->loadHTML($data);
+        } else {
+            @$this->_doc->loadHTMLFile($data);
+        }
+
+        $encoding = $this->_doc->actualEncoding;
+        $xpath    = new DOMXPath($this->_doc);
+
+        // title should always have only one entry, but we process all nodeset entries
+        $docTitle = '';
+        foreach ($xpath->query('/html/head/title') as $titleNode) {
+            $docTitle .= $titleNode->nodeValue . ' ';
+        }
+        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, $encoding));
+
+        foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
+            $metaName    = $metaNode->getAttribute('name');
+            $metaContent = $metaNode->getAttribute('content');
+            $this->addField(Zend_Search_Lucene_Field::Text($metaName, $metaContent, $encoding));
+        }
+
+        // body should always have only one entry, but we process all nodeset entries
+        $docBody = '';
+        foreach ($xpath->query('/html/body') as $bodyNode) {
+            $this->_retrieveNodeText($bodyNode, $docBody);
+        }
+        $bodyField = $storeContent
+                   ? Zend_Search_Lucene_Field::Text('body', $docBody, $encoding)
+                   : Zend_Search_Lucene_Field::UnStored('body', $docBody, $encoding);
+        $this->addField($bodyField);
+
+        foreach ($this->_doc->getElementsByTagName('a') as $anchorNode) {
+            $href = $anchorNode->getAttribute('href');
+            if ($href != '') {
+                $this->_links[] = $href;
+            }
+        }
+        $this->_links = array_unique($this->_links);
+
+        foreach ($xpath->query('/html/head/link') as $headLinkNode) {
+            $href = $headLinkNode->getAttribute('href');
+            if ($href != '') {
+                $this->_headerLinks[] = $href;
+            }
+        }
+        $this->_headerLinks = array_unique($this->_headerLinks);
+    }
+
+    /**
+     * Get node text
+     *
+     * Appends the value of every text node below $node to $text, each followed by
+     * a single space. Element nodes are descended into unless they are <script>
+     * elements; every other node type (comments, CDATA sections, ...) is ignored.
+     *
+     * @param DOMNode $node
+     * @param string &$text  accumulator, extended in place
+     */
+    private function _retrieveNodeText(DOMNode $node, &$text)
+    {
+        if ($node->nodeType == XML_TEXT_NODE) {
+            $text .= $node->nodeValue . ' ';
+            return;
+        }
+
+        if ($node->nodeType != XML_ELEMENT_NODE  ||  $node->nodeName == 'script') {
+            return;
+        }
+
+        foreach ($node->childNodes as $child) {
+            $this->_retrieveNodeText($child, $text);
+        }
+    }
+
+    /**
+     * Get document HREF links
+     *
+     * Returns the distinct, non-empty href values of the document's <a> elements,
+     * as collected by the constructor. Duplicates are removed with array_unique(),
+     * so the array keys are not necessarily consecutive.
+     *
+     * @return array
+     */
+    public function getLinks()
+    {
+        return $this->_links;
+    }
+
+    /**
+     * Get document header links
+     *
+     * Returns the distinct, non-empty href values of the <link> elements found
+     * under /html/head, as collected by the constructor. Duplicates are removed
+     * with array_unique(), so the array keys are not necessarily consecutive.
+     *
+     * @return array
+     */
+    public function getHeaderLinks()
+    {
+        return $this->_headerLinks;
+    }
+
+    /**
+     * Load HTML document from a string
+     *
+     * @param string $data           HTML markup to parse
+     * @param boolean $storeContent  whether the body text is stored in the index (default: not stored)
+     * @return Zend_Search_Lucene_Document_Html
+     */
+    public static function loadHTML($data, $storeContent = false)
+    {
+        $document = new self($data, false, $storeContent);
+
+        return $document;
+    }
+
+    /**
+     * Load HTML document from a file
+     *
+     * @param string $file           name of the HTML file to parse
+     * @param boolean $storeContent  whether the body text is stored in the index (default: not stored)
+     * @return Zend_Search_Lucene_Document_Html
+     */
+    public static function loadHTMLFile($file, $storeContent = false)
+    {
+        $document = new self($file, true, $storeContent);
+
+        return $document;
+    }
+
+
+    /**
+     * Highlight text in text node
+     *
+     * Tokenizes the node's text with the default analyzer and wraps every token
+     * whose term text is a key of $wordsToHighlight in a styled <b> element.
+     * The token start/end offsets are passed to DOMText::splitText() as they are.
+     *
+     * @param DOMText $node
+     * @param array   $wordsToHighlight  terms to highlight, given as array keys
+     * @param string  $color             CSS background color for the highlight
+     */
+    public function _highlightTextNode(DOMText $node, $wordsToHighlight, $color)
+    {
+        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
+        $analyzer->setInput($node->nodeValue, $this->_doc->encoding);
+
+        $hits = array();
+        for ($token = $analyzer->nextToken(); $token !== null; $token = $analyzer->nextToken()) {
+            if (isset($wordsToHighlight[$token->getTermText()])) {
+                $hits[] = $token;
+            }
+        }
+
+        // Work from the last match to the first, so that splitting the node never
+        // shifts the offsets of the matches that are still to be processed.
+        for ($idx = count($hits) - 1; $idx >= 0; $idx--) {
+            $token = $hits[$idx];
+
+            // Cut text after matched token
+            $node->splitText($token->getEndOffset());
+
+            // Cut matched node
+            $matchedWordNode = $node->splitText($token->getStartOffset());
+
+            $style           = 'color:black;background-color:' . $color;
+            $highlightedNode = $this->_doc->createElement('b', $matchedWordNode->nodeValue);
+            $highlightedNode->setAttribute('style', $style);
+
+            $node->parentNode->replaceChild($highlightedNode, $matchedWordNode);
+        }
+    }
+
+
+    /**
+     * highlight words in content of the specified node
+     *
+     * Recurses into every non-text child except <script> elements, then
+     * highlights the node's own text children.
+     *
+     * @param DOMNode $contextNode
+     * @param array $wordsToHighlight  terms to highlight, given as array keys
+     * @param string $color            CSS background color for the highlight
+     */
+    public function _highlightNode(DOMNode $contextNode, $wordsToHighlight, $color)
+    {
+        if (!$contextNode->hasChildNodes()) {
+            return;
+        }
+
+        // Text children are collected first and processed afterwards, to leave the
+        // childNodes structure untouched while it is being iterated.
+        $pendingTextNodes = array();
+
+        foreach ($contextNode->childNodes as $child) {
+            if ($child->nodeType == XML_TEXT_NODE) {
+                $pendingTextNodes[] = $child;
+                continue;
+            }
+
+            // Skip script nodes
+            if ($child->nodeName == 'script') {
+                continue;
+            }
+
+            $this->_highlightNode($child, $wordsToHighlight, $color);
+        }
+
+        foreach ($pendingTextNodes as $pendingTextNode) {
+            $this->_highlightTextNode($pendingTextNode, $wordsToHighlight, $color);
+        }
+    }
+
+
+
+    /**
+     * Highlight text with specified color
+     *
+     * Every given word is tokenized with the default analyzer, and each occurrence
+     * of a resulting term inside the child elements of <body> is wrapped in a
+     * styled <b> element. The stored DOM is modified in place.
+     *
+     * @param string|array $words  word, or list of words, to highlight
+     * @param string $color        CSS background color for highlighted terms
+     * @return string              HTML of the document after highlighting
+     */
+    public function highlight($words, $color = '#66ffff')
+    {
+        if (!is_array($words)) {
+            $words = array($words);
+        }
+        $wordsToHighlight = array();
+
+        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
+        foreach ($words as $wordString) {
+            $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
+        }
+
+        if (count($wordsToHighlight) == 0) {
+            // Nothing to highlight: hand back the document as it is.
+            return $this->_doc->saveHTML();
+        }
+
+        // Index the terms by their text so text nodes can test membership with isset().
+        $wordsToHighlightFlipped = array();
+        foreach ($wordsToHighlight as $id => $token) {
+            $wordsToHighlightFlipped[$token->getTermText()] = $id;
+        }
+
+        $xpath = new DOMXPath($this->_doc);
+
+        $matchedNodes = $xpath->query("/html/body/*");
+        foreach ($matchedNodes as $matchedNode) {
+            $this->_highlightNode($matchedNode, $wordsToHighlightFlipped, $color);
+        }
+
+        // Return the highlighted markup on this path too, as the @return contract
+        // and the early-exit branch above already promise.
+        return $this->_doc->saveHTML();
+    }
+
+    /**
+     * Get HTML
+     *
+     * Serializes the stored DOM document with DOMDocument::saveHTML(), so the
+     * result includes any highlighting previously applied by highlight().
+     *
+     * @return string
+     */
+    public function getHTML()
+    {
+        return $this->_doc->saveHTML();
+    }
+}
+
index 5b73b29..9d06e89 100644 (file)
@@ -14,7 +14,7 @@
  *
  * @category   Zend
  * @package    Zend_Search_Lucene
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 /**
  * Framework base exception
  */
-require_once 'Zend/Search/Exception.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Exception.php';
 
 
 /**
  * @category   Zend
  * @package    Zend_Search_Lucene
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Exception extends Zend_Search_Exception
diff --git a/search/Zend/Search/Lucene/FSM.php b/search/Zend/Search/Lucene/FSM.php
new file mode 100644 (file)
index 0000000..31c9069
--- /dev/null
@@ -0,0 +1,433 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_FSMAction */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/FSMAction.php';
+
+/** Zend_Search_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Exception.php';
+
+
+/**
+ * Abstract Finite State Machine
+ *
+ * See the Wikipedia description of state machines: http://en.wikipedia.org/wiki/Finite_state_machine
+ *
+ * Any type of transducer (Moore machine or Mealy machine) may also be implemented by using this abstract FSM.
+ * The process() method invokes the specified actions, which may construct the FSM output.
+ * Actions may also be used to signal that an accept state has been reached.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+abstract class Zend_Search_Lucene_FSM
+{
+    /**
+     * Machine States alphabet
+     *
+     * Keyed by the state value itself: [state] => state
+     *
+     * @var array
+     */
+    private $_states = array();
+
+    /**
+     * Current state (null until the first state is added)
+     *
+     * @var integer|string
+     */
+    private $_currentState = null;
+
+    /**
+     * Input alphabet
+     *
+     * Keyed by the symbol value itself: [symbol] => symbol
+     *
+     * @var array
+     */
+    private $_inputAphabet = array();
+
+    /**
+     * State transition table
+     *
+     * [sourceState][input] => targetState
+     *
+     * @var array
+     */
+    private $_rules = array();
+
+    /**
+     * List of entry actions
+     * Each action executes when entering the state
+     *
+     * [state] => array of actions
+     *
+     * @var array
+     */
+    private $_entryActions =  array();
+
+    /**
+     * List of exit actions
+     * Each action executes when exiting the state
+     *
+     * [state] => array of actions
+     *
+     * @var array
+     */
+    private $_exitActions =  array();
+
+    /**
+     * List of input actions
+     * Each action executes when the given input is processed in the given state
+     *
+     * [state][input] => array of actions
+     *
+     * @var array
+     */
+    private $_inputActions =  array();
+
+    /**
+     * List of transition actions
+     * Each action executes when the machine moves from state1 to state2
+     *
+     * [state1][state2] => array of actions
+     *
+     * @var array
+     */
+    private $_transitionActions =  array();
+
+    /**
+     * Finite State machine constructor
+     *
+     * $states is an array of integers or strings with a list of possible machine states.
+     * The constructor treats the first list element as the start state (assigns it to $_currentState).
+     * It may be reassigned by setState() call.
+     * States list may be empty and can be extended later by addState() or addStates() calls.
+     *
+     * $inputAphabet is the same as $states, but represents the input alphabet;
+     * it also may be extended later by addInputSymbols() or addInputSymbol() calls.
+     *
+     * $rules parameter describes FSM transitions and has a structure:
+     * array( array(sourceState, input, targetState[, inputAction]),
+     *        array(sourceState, input, targetState[, inputAction]),
+     *        array(sourceState, input, targetState[, inputAction]),
+     *        ...
+     *      )
+     * Rules also can be added later by addRules() and addRule() calls.
+     *
+     * FSM actions are very flexible and may be defined by addEntryAction(), addExitAction(),
+     * addInputAction() and addTransitionAction() calls.
+     *
+     * @param array $states
+     * @param array $inputAphabet
+     * @param array $rules
+     */
+    public function __construct($states = array(), $inputAphabet = array(), $rules = array())
+    {
+        // States and input symbols must be registered first: addRule() rejects
+        // rules that mention unknown states or symbols.
+        $this->addStates($states);
+        $this->addInputSymbols($inputAphabet);
+        $this->addRules($rules);
+    }
+
+    /**
+     * Add states to the state machine
+     *
+     * Each element is passed to addState(), in list order.
+     *
+     * @param array $states  list of integer|string states
+     */
+    public function addStates($states)
+    {
+        foreach ($states as $state) {
+            $this->addState($state);
+        }
+    }
+
+    /**
+     * Add state to the state machine
+     *
+     * If no current state is set yet, the added state also becomes the current state.
+     *
+     * @param integer|string $state
+     */
+    public function addState($state)
+    {
+        // States are stored under their own value, so adding a state twice is harmless.
+        $this->_states[$state] = $state;
+
+        if ($this->_currentState === null) {
+            $this->_currentState = $state;
+        }
+    }
+
+    /**
+     * Set FSM state.
+     * No action is invoked.
+     *
+     * @param integer|string $state  must be one of the states already added
+     * @throws Zend_Search_Exception when $state is not a known state
+     */
+    public function setState($state)
+    {
+        if (isset($this->_states[$state])) {
+            $this->_currentState = $state;
+            return;
+        }
+
+        throw new Zend_Search_Exception('State \'' . $state . '\' is not on of the possible FSM states.');
+    }
+
+    /**
+     * Get FSM state.
+     *
+     * @return integer|string|null  current state, or null when no state has been added yet
+     */
+    public function getState()
+    {
+        return $this->_currentState;
+    }
+
+    /**
+     * Add symbols to the input alphabet
+     *
+     * Each element is passed to addInputSymbol().
+     *
+     * @param array $inputAphabet  list of integer|string symbols
+     */
+    public function addInputSymbols($inputAphabet)
+    {
+        foreach ($inputAphabet as $inputSymbol) {
+            $this->addInputSymbol($inputSymbol);
+        }
+    }
+
+    /**
+     * Add symbol to the input alphabet
+     *
+     * The symbol is stored under its own value, so adding it twice is harmless.
+     *
+     * @param integer|string $inputSymbol
+     */
+    public function addInputSymbol($inputSymbol)
+    {
+        $this->_inputAphabet[$inputSymbol] = $inputSymbol;
+    }
+
+
+    /**
+     * Add transition rules
+     *
+     * Each rule is forwarded to addRule(); the fourth element (input action) is optional.
+     *
+     * array structure:
+     * array( array(sourceState, input, targetState[, inputAction]),
+     *        array(sourceState, input, targetState[, inputAction]),
+     *        array(sourceState, input, targetState[, inputAction]),
+     *        ...
+     *      )
+     *
+     * @param array $rules
+     */
+    public function addRules($rules)
+    {
+        foreach ($rules as $rule) {
+            $inputAction = null;
+            if (isset($rule[3])) {
+                $inputAction = $rule[3];
+            }
+
+            $this->addRule($rule[0], $rule[1], $rule[2], $inputAction);
+        }
+    }
+
+    /**
+     * Add a transition rule
+     *
+     * Registers "in $sourceState, on $input, go to $targetState". Only one rule is
+     * allowed per {state, input} pair. When $inputAction is given it is also
+     * registered through addInputAction().
+     *
+     * @param integer|string $sourceState
+     * @param integer|string $input
+     * @param integer|string $targetState
+     * @param Zend_Search_Lucene_FSMAction|null $inputAction
+     * @throws Zend_Search_Exception when a state or the input symbol is unknown,
+     *                               or when the pair already has a rule
+     */
+    public function addRule($sourceState, $input, $targetState, $inputAction = null)
+    {
+        if (!isset($this->_states[$sourceState])) {
+            throw new Zend_Search_Exception('Undefined source state (' . $sourceState . ').');
+        }
+        if (!isset($this->_states[$targetState])) {
+            throw new Zend_Search_Exception('Undefined target state (' . $targetState . ').');
+        }
+        if (!isset($this->_inputAphabet[$input])) {
+            throw new Zend_Search_Exception('Undefined input symbol (' . $input . ').');
+        }
+
+        // isset() on the nested key is also false when the state has no rules at all.
+        if (isset($this->_rules[$sourceState][$input])) {
+            throw new Zend_Search_Exception('Rule for {state,input} pair (' . $sourceState . ', '. $input . ') is already defined.');
+        }
+
+        // The per-state array is created implicitly by this assignment.
+        $this->_rules[$sourceState][$input] = $targetState;
+
+        if ($inputAction === null) {
+            return;
+        }
+
+        $this->addInputAction($sourceState, $input, $inputAction);
+    }
+
+
+    /**
+     * Add state entry action.
+     * Several entry actions are allowed.
+     * Action execution order is defined by addEntryAction() calls
+     *
+     * @param integer|string $state
+     * @param Zend_Search_Lucene_FSMAction $action
+     * @throws Zend_Search_Exception when $state is not a known state
+     */
+    public function addEntryAction($state, Zend_Search_Lucene_FSMAction $action)
+    {
+        $isKnownState = isset($this->_states[$state]);
+        if (!$isKnownState) {
+            throw new Zend_Search_Exception('Undefined state (' . $state. ').');
+        }
+
+        // Appending creates the per-state list on first use.
+        $this->_entryActions[$state][] = $action;
+    }
+
+    /**
+     * Add state exit action.
+     * Several exit actions are allowed.
+     * Action execution order is defined by addExitAction() calls
+     *
+     * @param integer|string $state
+     * @param Zend_Search_Lucene_FSMAction $action
+     * @throws Zend_Search_Exception when $state is not a known state
+     */
+    public function addExitAction($state, Zend_Search_Lucene_FSMAction $action)
+    {
+        $isKnownState = isset($this->_states[$state]);
+        if (!$isKnownState) {
+            throw new Zend_Search_Exception('Undefined state (' . $state. ').');
+        }
+
+        // Appending creates the per-state list on first use.
+        $this->_exitActions[$state][] = $action;
+    }
+
+    /**
+     * Add input action (defined by {state, input} pair).
+     * Several input actions are allowed.
+     * Action execution order is defined by addInputAction() calls
+     *
+     * @param integer|string $state
+     * @param integer|string $inputSymbol
+     * @param Zend_Search_Lucene_FSMAction $action
+     * @throws Zend_Search_Exception when the state or the input symbol is unknown
+     */
+    public function addInputAction($state, $inputSymbol, Zend_Search_Lucene_FSMAction $action)
+    {
+        if (!isset($this->_states[$state])) {
+            throw new Zend_Search_Exception('Undefined state (' . $state. ').');
+        }
+        if (!isset($this->_inputAphabet[$inputSymbol])) {
+            throw new Zend_Search_Exception('Undefined input symbol (' . $inputSymbol. ').');
+        }
+
+        // Appending creates both nesting levels ([state] and [state][input]) on first use.
+        $this->_inputActions[$state][$inputSymbol][] = $action;
+    }
+
+    /**
+     * Add transition action (defined by {sourceState, targetState} pair).
+     * Several transition actions are allowed.
+     * Action execution order is defined by addTransitionAction() calls
+     *
+     * @param integer|string $sourceState
+     * @param integer|string $targetState
+     * @param Zend_Search_Lucene_FSMAction $action
+     * @throws Zend_Search_Exception when either state is unknown
+     */
+    public function addTransitionAction($sourceState, $targetState, Zend_Search_Lucene_FSMAction $action)
+    {
+        if (!isset($this->_states[$sourceState])) {
+            throw new Zend_Search_Exception('Undefined source state (' . $sourceState. ').');
+        }
+        if (!isset($this->_states[$targetState])) {
+            // Name the state that is actually missing (same wording as addRule()).
+            throw new Zend_Search_Exception('Undefined target state (' . $targetState. ').');
+        }
+
+        if (!isset($this->_transitionActions[$sourceState])) {
+            $this->_transitionActions[$sourceState] = array();
+        }
+        if (!isset($this->_transitionActions[$sourceState][$targetState])) {
+            $this->_transitionActions[$sourceState][$targetState] = array();
+        }
+
+        $this->_transitionActions[$sourceState][$targetState][] = $action;
+    }
+
+
+    /**
+     * Process an input
+     *
+     * Looks up the rule for {current state, input} and moves to its target state.
+     * Actions run in this order: exit actions of the source state (only when the
+     * state changes), input actions, [the state is switched], transition actions,
+     * entry actions of the target state (only when the state changes).
+     *
+     * @param mixed $input
+     * @throws Zend_Search_Exception when no rule matches the current state and input
+     */
+    public function process($input)
+    {
+        $sourceState = $this->_currentState;
+
+        if (!isset($this->_rules[$sourceState])) {
+            throw new Zend_Search_Exception('There is no any rule for current state (' . $sourceState . ').');
+        }
+        if (!isset($this->_rules[$sourceState][$input])) {
+            throw new Zend_Search_Exception('There is no any rule for {current state, input} pair (' . $sourceState . ', ' . $input . ').');
+        }
+
+        $targetState  = $this->_rules[$sourceState][$input];
+        $stateChanges = ($sourceState != $targetState);
+
+        if ($stateChanges  &&  isset($this->_exitActions[$sourceState])) {
+            foreach ($this->_exitActions[$sourceState] as $exitAction) {
+                $exitAction->doAction();
+            }
+        }
+        // isset() on the nested key is also false when the first level is missing.
+        if (isset($this->_inputActions[$sourceState][$input])) {
+            foreach ($this->_inputActions[$sourceState][$input] as $inputAction) {
+                $inputAction->doAction();
+            }
+        }
+
+        $this->_currentState = $targetState;
+
+        if (isset($this->_transitionActions[$sourceState][$targetState])) {
+            foreach ($this->_transitionActions[$sourceState][$targetState] as $transitionAction) {
+                $transitionAction->doAction();
+            }
+        }
+        if ($stateChanges  &&  isset($this->_entryActions[$targetState])) {
+            foreach ($this->_entryActions[$targetState] as $entryAction) {
+                $entryAction->doAction();
+            }
+        }
+    }
+
+    /**
+     * Reset the machine to its start state (the first state that was added).
+     * No action is invoked.
+     *
+     * @throws Zend_Search_Exception when no state has been defined
+     */
+    public function reset()
+    {
+        if (count($this->_states) == 0) {
+            throw new Zend_Search_Exception('There is no any state defined for FSM.');
+        }
+
+        // $_states is keyed by the state values themselves, so index 0 exists only
+        // when some state happens to equal 0. Take the first inserted element instead.
+        $this->_currentState = reset($this->_states);
+    }
+}
+
diff --git a/search/Zend/Search/Lucene/FSMAction.php b/search/Zend/Search/Lucene/FSMAction.php
new file mode 100644 (file)
index 0000000..606de7b
--- /dev/null
@@ -0,0 +1,65 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/**
+ * Finite State Machine action
+ *
+ * Pairs an object with the name of one of its methods so that the method can be
+ * invoked later, without arguments, through doAction().
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_FSMAction
+{
+    /**
+     * Object whose method is invoked by doAction()
+     *
+     * @var object
+     */
+    private $_object;
+
+    /**
+     * Name of the method invoked on $_object
+     *
+     * @var string
+     */
+    private $_method;
+
+    /**
+     * Object constructor
+     *
+     * @param object $object  target of the call
+     * @param string $method  name of the method to call on the target
+     */
+    public function __construct($object, $method)
+    {
+        $this->_method = $method;
+        $this->_object = $object;
+    }
+
+    /**
+     * Invoke the stored method on the stored object.
+     *
+     * The method is called without arguments; its return value is discarded.
+     */
+    public function doAction()
+    {
+        $this->_object->{$this->_method}();
+    }
+}
+
index 5a18fcf..86cd22c 100644 (file)
@@ -15,7 +15,7 @@
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Document
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Document
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Field
 {
-    public $kind;
+    /**
+     * Field name
+     *
+     * @var string
+     */
+    public $name;
 
-    public $name        = 'body';
-    public $stringValue = null;
+
+    public $value;
     public $isStored    = false;
     public $isIndexed   = true;
     public $isTokenized = true;
@@ -47,26 +52,48 @@ class Zend_Search_Lucene_Field
 
     public $storeTermVector = false;
 
+    /**
+     * Field boost factor
+     * It's not stored directly in the index, but affects the normalization factor
+     *
+     * @var float
+     */
     public $boost = 1.0;
 
-    public function __construct($name, $stringValue, $isStored, $isIndexed, $isTokenized, $isBinary = false)
+    /**
+     * Field value encoding.
+     *
+     * @var string
+     */
+    public $encoding;
+
+    /**
+     * Object constructor
+     *
+     * @param string $name
+     * @param string $value
+     * @param string $encoding
+     * @param boolean $isStored
+     * @param boolean $isIndexed
+     * @param boolean $isTokenized
+     * @param boolean $isBinary
+     */
+    public function __construct($name, $value, $encoding, $isStored, $isIndexed, $isTokenized, $isBinary = false)
     {
-        $this->name        = $name;
+        $this->name  = $name;
+        $this->value = $value;
 
         if (!$isBinary) {
-            /**
-             * @todo Correct UTF-8 string should be required in future
-             * Until full UTF-8 support is not completed, string should be normalized to ANSII encoding
-             */
-            $this->stringValue = iconv(mb_detect_encoding($stringValue), 'ASCII//TRANSLIT', $stringValue);
-            //$this->stringValue = iconv('', 'ASCII//TRANSLIT', $stringValue);
+            $this->encoding    = $encoding;
+            $this->isTokenized = $isTokenized;
         } else {
-            $this->stringValue = $stringValue;
+            $this->encoding    = '';
+            $this->isTokenized = false;
         }
-        $this->isStored    = $isStored;
-        $this->isIndexed   = $isIndexed;
-        $this->isTokenized = $isTokenized;
-        $this->isBinary    = $isBinary;
+
+        $this->isStored  = $isStored;
+        $this->isIndexed = $isIndexed;
+        $this->isBinary  = $isBinary;
 
         $this->storeTermVector = false;
         $this->boost           = 1.0;
@@ -79,11 +106,12 @@ class Zend_Search_Lucene_Field
      *
      * @param string $name
      * @param string $value
+     * @param string $encoding
      * @return Zend_Search_Lucene_Field
      */
-    static public function Keyword($name, $value)
+    public static function Keyword($name, $value, $encoding = '')
     {
-        return new self($name, $value, true, true, false);
+        return new self($name, $value, $encoding, true, true, false);
     }
 
 
@@ -93,11 +121,12 @@ class Zend_Search_Lucene_Field
      *
      * @param string $name
      * @param string $value
+     * @param string $encoding
      * @return Zend_Search_Lucene_Field
      */
-    static public function UnIndexed($name, $value)
+    public static function UnIndexed($name, $value, $encoding = '')
     {
-        return new self($name, $value, true, false, false);
+        return new self($name, $value, $encoding, true, false, false);
     }
 
 
@@ -107,11 +136,12 @@ class Zend_Search_Lucene_Field
      *
      * @param string $name
      * @param string $value
+     * @param string $encoding
      * @return Zend_Search_Lucene_Field
      */
-    static public function Binary($name, $value)
+    public static function Binary($name, $value)
     {
-        return new self($name, $value, true, false, false, true);
+        return new self($name, $value, '', true, false, false, true);
     }
 
     /**
@@ -121,11 +151,12 @@ class Zend_Search_Lucene_Field
      *
      * @param string $name
      * @param string $value
+     * @param string $encoding
      * @return Zend_Search_Lucene_Field
      */
-    static public function Text($name, $value)
+    public static function Text($name, $value, $encoding = '')
     {
-        return new self($name, $value, true, true, true);
+        return new self($name, $value, $encoding, true, true, true);
     }
 
 
@@ -135,12 +166,27 @@ class Zend_Search_Lucene_Field
      *
      * @param string $name
      * @param string $value
+     * @param string $encoding
      * @return Zend_Search_Lucene_Field
      */
-    static public function UnStored($name, $value)
+    public static function UnStored($name, $value, $encoding = '')
     {
-        return new self($name, $value, false, true, true);
+        return new self($name, $value, $encoding, false, true, true);
     }
 
+    /**
+     * Get field value in UTF-8 encoding
+     *
+     * The value is returned untouched when the field encoding is 'utf8' or 'utf-8'
+     * (compared case-insensitively); otherwise it is converted with iconv().
+     *
+     * @return string
+     */
+    public function getUtf8Value()
+    {
+        $isUtf8 = strcasecmp($this->encoding, 'utf8' ) == 0  ||
+                  strcasecmp($this->encoding, 'utf-8') == 0;
+
+        if (!$isUtf8) {
+            return iconv($this->encoding, 'UTF-8', $this->value);
+        }
+
+        return $this->value;
+    }
 }
 
diff --git a/search/Zend/Search/Lucene/Index/DictionaryLoader.php b/search/Zend/Search/Lucene/Index/DictionaryLoader.php
new file mode 100644 (file)
index 0000000..d3f0669
--- /dev/null
@@ -0,0 +1,254 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+
+/**
+ * Dictionary loader
+ *
+ * It's a helper class which is created to encapsulate deliberately unstructured code.
+ * Manual "method inlining" is performed to speed up the dictionary index loading operation,
+ * which is a major bottleneck for search performance.
+ *
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Index_DictionaryLoader
+{
+    /**
+     * Dictionary index loader.
+     *
+     * It takes a string which is actually <segment_name>.tii index file data and
+     * returns two arrays - term and termInfo lists:
+     *  - each term is     array($termFieldNum, $termValue)
+     *  - each termInfo is array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer)
+     *
+     * The commented-out "$tiiFile->read...()" lines show the file-object call each
+     * inlined fragment stands for.
+     *
+     * See Zend_Search_Lucene_Index_SegmentInfo class for details
+     *
+     * @param string $data
+     * @return array  array(&$termDictionary, &$termInfos)
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public static function load($data)
+    {
+        $termDictionary = array();
+        $termInfos      = array();
+        $pos = 0;
+
+        // $tiVersion = $tiiFile->readInt();
+        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
+        $pos += 4;
+        if ($tiVersion != (int)0xFFFFFFFE) {
+            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
+        }
+
+        // $indexTermCount = $tiiFile->readLong();
+        if (PHP_INT_SIZE > 4) {
+            $indexTermCount = ord($data[$pos]) << 56  |
+                              ord($data[$pos+1]) << 48  |
+                              ord($data[$pos+2]) << 40  |
+                              ord($data[$pos+3]) << 32  |
+                              ord($data[$pos+4]) << 24  |
+                              ord($data[$pos+5]) << 16  |
+                              ord($data[$pos+6]) << 8   |
+                              ord($data[$pos+7]);
+        } else {
+            // 32-bit mode: the upper four bytes and the sign bit of the lower four must be clear
+            if ((ord($data[$pos])            != 0) ||
+                (ord($data[$pos+1])          != 0) ||
+                (ord($data[$pos+2])          != 0) ||
+                (ord($data[$pos+3])          != 0) ||
+                ((ord($data[$pos+4]) & 0x80) != 0)) {
+                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
+            }
+
+            $indexTermCount = ord($data[$pos+4]) << 24  |
+                              ord($data[$pos+5]) << 16  |
+                              ord($data[$pos+6]) << 8   |
+                              ord($data[$pos+7]);
+        }
+        $pos += 8;
+
+        //                  $tiiFile->readInt();  // IndexInterval
+        $pos += 4;
+
+        // $skipInterval   = $tiiFile->readInt();
+        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
+        $pos += 4;
+        if ($indexTermCount < 1) {
+            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
+        }
+
+        $prevTerm     = '';
+        $freqPointer  =  0;
+        $proxPointer  =  0;
+        $indexPointer =  0;
+        for ($count = 0; $count < $indexTermCount; $count++) {
+            //$termPrefixLength = $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $termPrefixLength = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
+            }
+
+            // $termSuffix       = $tiiFile->readString();
+            $nbyte = ord($data[$pos++]);
+            $len = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $len |= ($nbyte & 0x7F) << $shift;
+            }
+            if ($len == 0) {
+                $termSuffix = '';
+            } else {
+                // $len bytes are read first; every multi-byte lead byte found among them
+                // pulls in its continuation bytes and extends $len accordingly.
+                $termSuffix = substr($data, $pos, $len);
+                $pos += $len;
+                for ($count1 = 0; $count1 < $len; $count1++ ) {
+                    if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
+                        $addBytes = 1;
+                        if (ord($termSuffix[$count1]) & 0x20 ) {
+                            $addBytes++;
+                        }
+                        $termSuffix .= substr($data, $pos, $addBytes);
+                        $pos += $addBytes;
+                        $len += $addBytes;
+
+                        // Check for null character. Java2 encodes null character
+                        // in two bytes.
+                        if (ord($termSuffix[$count1]) == 0xC0 &&
+                            ord($termSuffix[$count1+1]) == 0x80   ) {
+                            // Collapse the two-byte form into one real NUL byte. The string
+                            // got one byte shorter, so $len shrinks too, and $count1 is not
+                            // advanced: the byte after the NUL still has to be inspected.
+                            $termSuffix = substr($termSuffix, 0, $count1)
+                                        . chr(0)
+                                        . substr($termSuffix, $count1 + 2);
+                            $len--;
+                        } else {
+                            // Skip the continuation bytes of this character
+                            $count1 += $addBytes;
+                        }
+                    }
+                }
+            }
+
+            // $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
+            // $pb counts bytes and $pc counts characters of the shared prefix taken from $prevTerm
+            $prevTermLength = strlen($prevTerm);
+            $pb = 0; $pc = 0;
+            while ($pb < $prevTermLength  &&  $pc < $termPrefixLength) {
+                $charBytes = 1;
+                if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
+                    $charBytes++;
+                    if (ord($prevTerm[$pb]) & 0x20 ) {
+                        $charBytes++;
+                        if (ord($prevTerm[$pb]) & 0x10 ) {
+                            $charBytes++;
+                        }
+                    }
+                }
+
+                // $pb is an offset into $prevTerm, so that is the string to check against
+                if ($pb + $charBytes > $prevTermLength) {
+                    // wrong character (truncated at the end of the previous term)
+                    break;
+                }
+
+                $pc++;
+                $pb += $charBytes;
+            }
+            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;
+
+            // $termFieldNum     = $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $termFieldNum = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $termFieldNum |= ($nbyte & 0x7F) << $shift;
+            }
+
+            // $docFreq          = $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $docFreq = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $docFreq |= ($nbyte & 0x7F) << $shift;
+            }
+
+            // $freqPointer     += $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $vint = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $vint |= ($nbyte & 0x7F) << $shift;
+            }
+            $freqPointer += $vint;
+
+            // $proxPointer     += $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $vint = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $vint |= ($nbyte & 0x7F) << $shift;
+            }
+            $proxPointer += $vint;
+
+            // The skip delta is only present in the data when $docFreq >= $skipInterval
+            if( $docFreq >= $skipInterval ) {
+                // $skipDelta = $tiiFile->readVInt();
+                $nbyte = ord($data[$pos++]);
+                $vint = $nbyte & 0x7F;
+                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                    $nbyte = ord($data[$pos++]);
+                    $vint |= ($nbyte & 0x7F) << $shift;
+                }
+                $skipDelta = $vint;
+            } else {
+                $skipDelta = 0;
+            }
+
+            // $indexPointer += $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $vint = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $vint |= ($nbyte & 0x7F) << $shift;
+            }
+            $indexPointer += $vint;
+
+
+            // $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
+            $termDictionary[] = array($termFieldNum, $termValue);
+
+            $termInfos[] =
+                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
+                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
+
+            $prevTerm = $termValue;
+        }
+
+        // Check special index entry mark
+        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
+            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
+        } else if (PHP_INT_SIZE > 4){
+            // Treat 64-bit 0xFFFFFFFF as -1
+            $termDictionary[0][0] = -1;
+        }
+
+        return array(&$termDictionary, &$termInfos);
+    }
+}
+
index 4c11aaa..1d138b6 100644 (file)
@@ -15,7 +15,7 @@
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
@@ -24,7 +24,7 @@
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Index_FieldInfo
index aeceab6..c6f7868 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
+/** Zend_Search_Lucene_Index_DictionaryLoader */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/DictionaryLoader.php';
+
 
 /** Zend_Search_Lucene_Exception */
-require_once 'Zend/Search/Lucene/Exception.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
 
 
 /**
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Index_SegmentInfo
@@ -49,7 +52,12 @@ class Zend_Search_Lucene_Index_SegmentInfo
 
     /**
      * Term Dictionary Index
-     * Array of the Zend_Search_Lucene_Index_Term objects
+     *
+     * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
+     * of performance considerations)
+     * [0] -> $termFieldNum
+     * [1] -> $termValue
+     *
      * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
      *
      * @var array
@@ -58,7 +66,14 @@ class Zend_Search_Lucene_Index_SegmentInfo
 
     /**
      * Term Dictionary Index TermInfos
-     * Array of the Zend_Search_Lucene_Index_TermInfo objects
+     *
+     * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
+     * of performance considerations)
+     * [0] -> $docFreq
+     * [1] -> $freqPointer
+     * [2] -> $proxPointer
+     * [3] -> $skipOffset
+     * [4] -> $indexPointer
      *
      * @var array
      */
@@ -88,6 +103,14 @@ class Zend_Search_Lucene_Index_SegmentInfo
      */
     private $_segFiles;
 
+    /**
+     * Associative array where the key is the file name and the value is file size (.cfs).
+     *
+     * @var array
+     */
+    private $_segFileSizes;
+
+
     /**
      * File system adapter.
      *
@@ -122,6 +145,7 @@ class Zend_Search_Lucene_Index_SegmentInfo
      */
     private $_deletedDirty = false;
 
+
     /**
      * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname,
      * Documents count and Directory as a parameter.
@@ -144,9 +168,15 @@ class Zend_Search_Lucene_Index_SegmentInfo
 
             for ($count = 0; $count < $segFilesCount; $count++) {
                 $dataOffset = $cfsFile->readLong();
+                if ($count != 0) {
+                    $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
+                }
                 $fileName = $cfsFile->readString();
                 $this->_segFiles[$fileName] = $dataOffset;
             }
+            if ($count != 0) {
+                $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
+            }
         }
 
         $fnmFile = $this->openCompoundFile('.fnm');
@@ -197,7 +227,6 @@ class Zend_Search_Lucene_Index_SegmentInfo
                         }
                     }
                 }
-
             }
         } catch(Zend_Search_Exception $e) {
             if (strpos($e->getMessage(), 'compound file doesn\'t contain') !== false ) {
@@ -212,16 +241,17 @@ class Zend_Search_Lucene_Index_SegmentInfo
      * Opens index file stoted within compound index file
      *
      * @param string $extension
+     * @param boolean $shareHandler
      * @throws Zend_Search_Lucene_Exception
      * @return Zend_Search_Lucene_Storage_File
      */
-    public function openCompoundFile($extension)
+    public function openCompoundFile($extension, $shareHandler = true)
     {
         $filename = $this->_name . $extension;
 
         // Try to open common file first
         if ($this->_directory->fileExists($filename)) {
-            return $this->_directory->getFileObject($filename);
+            return $this->_directory->getFileObject($filename, $shareHandler);
         }
 
         if( !isset($this->_segFiles[$filename]) ) {
@@ -229,11 +259,34 @@ class Zend_Search_Lucene_Index_SegmentInfo
                                        . $filename . ' file.' );
         }
 
-        $file = $this->_directory->getFileObject( $this->_name.".cfs" );
+        $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
         $file->seek($this->_segFiles[$filename]);
         return $file;
     }
 
+    /**
+     * Get compound file length
+     *
+     * A separate file named <segment name><extension> takes precedence when the directory
+     * has one; otherwise the size recorded in $_segFileSizes for that name is returned.
+     *
+     * @param string $extension
+     * @throws Zend_Search_Lucene_Exception
+     * @return integer
+     */
+    public function compoundFileLength($extension)
+    {
+        $filename = $this->_name . $extension;
+
+        if ($this->_directory->fileExists($filename)) {
+            // Common (non-compound) file is present - report its own length
+            return $this->_directory->fileLength($filename);
+        }
+
+        if (isset($this->_segFileSizes[$filename])) {
+            return $this->_segFileSizes[$filename];
+        }
+
+        throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
+                                   . $filename . ' file.' );
+    }
+
     /**
      * Returns field index or -1 if field is not found
      *
@@ -255,7 +308,7 @@ class Zend_Search_Lucene_Index_SegmentInfo
      * Returns field info for specified field
      *
      * @param integer $fieldNum
-     * @return ZSearchFieldInfo
+     * @return Zend_Search_Lucene_Index_FieldInfo
      */
     public function getField($fieldNum)
     {
@@ -281,77 +334,68 @@ class Zend_Search_Lucene_Index_SegmentInfo
     }
 
     /**
-     * Returns the total number of documents in this segment.
+     * Returns array of FieldInfo objects.
      *
-     * @return integer
+     * @return array
      */
-    public function count()
+    public function getFieldInfos()
     {
-        return $this->_docCount;
+        return $this->_fields;
     }
 
     /**
-     * Get field position in a fields dictionary
+     * Returns the total number of documents in this segment (including deleted documents).
      *
-     * @param integer $fieldNum
      * @return integer
      */
-    private function _getFieldPosition($fieldNum) {
-        // Treat values which are not in a translation table as a 'direct value'
-        return isset($this->_fieldsDicPositions[$fieldNum]) ?
-                           $this->_fieldsDicPositions[$fieldNum] : $fieldNum;
+    public function count()
+    {
+        return $this->_docCount;
     }
 
     /**
-     * Loads Term dictionary from TermInfoIndex file
+     * Returns number of deleted documents.
+     *
+     * @return integer
      */
-    protected function _loadDictionary()
+    private function _deletedCount()
     {
-        if ($this->_termDictionary !== null) {
-            return;
+        if ($this->_deleted === null) {
+            return 0;
         }
 
-        $this->_termDictionary = array();
-        $this->_termDictionaryInfos = array();
-
-        $tiiFile = $this->openCompoundFile('.tii');
-        $tiVersion = $tiiFile->readInt();
-        if ($tiVersion != (int)0xFFFFFFFE) {
-            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
-        }
-
-        $indexTermCount = $tiiFile->readLong();
-                          $tiiFile->readInt();  // IndexInterval
-        $skipInterval   = $tiiFile->readInt();
-
-        $prevTerm     = '';
-        $freqPointer  =  0;
-        $proxPointer  =  0;
-        $indexPointer =  0;
-        for ($count = 0; $count < $indexTermCount; $count++) {
-            $termPrefixLength = $tiiFile->readVInt();
-            $termSuffix       = $tiiFile->readString();
-            $termValue        = substr( $prevTerm, 0, $termPrefixLength ) . $termSuffix;
-
-            $termFieldNum     = $tiiFile->readVInt();
-            $docFreq          = $tiiFile->readVInt();
-            $freqPointer     += $tiiFile->readVInt();
-            $proxPointer     += $tiiFile->readVInt();
-            if( $docFreq >= $skipInterval ) {
-                $skipDelta = $tiiFile->readVInt();
-            } else {
-                $skipDelta = 0;
-            }
-
-            $indexPointer += $tiiFile->readVInt();
+        if (extension_loaded('bitset')) {
+            return count(bitset_to_array($this->_deleted));
+        } else {
+            return count($this->_deleted);
+        }
+    }
 
-            $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum);
-            $this->_termDictionaryInfos[] =
-                new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
-            $prevTerm = $termValue;
+    /**
+     * Returns the total number of non-deleted documents in this segment.
+     *
+     * When hasDeletions() reports deletions, the result is the stored document count
+     * minus _deletedCount(); otherwise it is the stored document count itself.
+     *
+     * @return integer
+     */
+    public function numDocs()
+    {
+        if ($this->hasDeletions()) {
+            return $this->_docCount - $this->_deletedCount();
+        } else {
+            return $this->_docCount;
         }
     }
 
+    /**
+     * Get field position in a fields dictionary
+     *
+     * Field numbers without an entry in $_fieldsDicPositions are returned unchanged.
+     *
+     * @param integer $fieldNum
+     * @return integer
+     */
+    private function _getFieldPosition($fieldNum) {
+        if (isset($this->_fieldsDicPositions[$fieldNum])) {
+            return $this->_fieldsDicPositions[$fieldNum];
+        }
+
+        // Treat values which are not in a translation table as a 'direct value'
+        return $fieldNum;
+    }
 
     /**
      * Return segment name
@@ -364,15 +408,75 @@ class Zend_Search_Lucene_Index_SegmentInfo
     }
 
 
+    /**
+     * TermInfo cache
+     *
+     * Size is 1024.
+     * Numbers are used instead of class constants because of performance considerations
+     *
+     * @var array
+     */
+    private $_termInfoCache = array();
+
+    /**
+     * Evict the oldest entries of the TermInfo cache until 768 remain.
+     *
+     * Entries are removed in array order, starting from the front. getTermInfo()
+     * re-appends an entry on every cache hit, so the front of the array holds the
+     * least recently used entries. For a cache of 1024 entries this removes 256.
+     *
+     * Nothing is removed when the cache already holds 768 entries or fewer.
+     */
+    private function _cleanUpTermInfoCache()
+    {
+        foreach ($this->_termInfoCache as $key => $termInfo) {
+            // leave 768 last used term infos
+            // ("<=" rather than "==": an exact-match test is never met when the cache
+            // starts at or below the limit, which would empty the cache completely)
+            if (count($this->_termInfoCache) <= 768) {
+                break;
+            }
+
+            unset($this->_termInfoCache[$key]);
+        }
+    }
+
     /**
      * Scans terms dictionary and returns term info
      *
      * @param Zend_Search_Lucene_Index_Term $term
      * @return Zend_Search_Lucene_Index_TermInfo
      */
-    public function getTermInfo($term)
+    public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
     {
-        $this->_loadDictionary();
+        $termKey = $term->key();
+        if (isset($this->_termInfoCache[$termKey])) {
+            $termInfo = $this->_termInfoCache[$termKey];
+
+            // Move termInfo to the end of cache
+            unset($this->_termInfoCache[$termKey]);
+            $this->_termInfoCache[$termKey] = $termInfo;
+
+            return $termInfo;
+        }
+
+
+        if ($this->_termDictionary === null) {
+            // Check, if index is already serialized
+            if ($this->_directory->fileExists($this->_name . '.sti')) {
+                // Prefetch dictionary index data
+                $stiFile = $this->_directory->getFileObject($this->_name . '.sti');
+                $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));
+
+                // Load dictionary index data
+                list($this->_termDictionary, $this->_termDictionaryInfos) = unserialize($stiFileData);
+            } else {
+                // Prefetch dictionary index data
+                $tiiFile = $this->openCompoundFile('.tii');
+                $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));
+
+                // Load dictionary index data
+                list($this->_termDictionary, $this->_termDictionaryInfos) =
+                            Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);
+
+                $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
+                $stiFile = $this->_directory->createFile($this->_name . '.sti');
+                $stiFile->writeBytes($stiFileData);
+            }
+
+        }
+
+
 
         $searchField = $this->getFieldNum($term->field);
 
@@ -389,10 +493,10 @@ class Zend_Search_Lucene_Index_SegmentInfo
             $mid = ($highIndex + $lowIndex) >> 1;
             $midTerm = $this->_termDictionary[$mid];
 
-            $fieldNum = $this->_getFieldPosition($midTerm->field);
+            $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
             $delta = $searchDicField - $fieldNum;
             if ($delta == 0) {
-                $delta = strcmp($term->text, $midTerm->text);
+                $delta = strcmp($term->text, $midTerm[1] /* text */);
             }
 
             if ($delta < 0) {
@@ -400,7 +504,14 @@ class Zend_Search_Lucene_Index_SegmentInfo
             } elseif ($delta > 0) {
                 $lowIndex  = $mid+1;
             } else {
-                return $this->_termDictionaryInfos[$mid]; // We got it!
+                // return $this->_termDictionaryInfos[$mid]; // We got it!
+                $a = $this->_termDictionaryInfos[$mid];
+                $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);
+
+                // Put loaded termInfo into cache
+                $this->_termInfoCache[$termKey] = $termInfo;
+
+                return $termInfo;
             }
         }
 
@@ -411,7 +522,7 @@ class Zend_Search_Lucene_Index_SegmentInfo
 
         $prevPosition = $highIndex;
         $prevTerm = $this->_termDictionary[$prevPosition];
-        $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ];
+        $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];
 
         $tisFile = $this->openCompoundFile('.tis');
         $tiVersion = $tisFile->readInt();
@@ -423,12 +534,12 @@ class Zend_Search_Lucene_Index_SegmentInfo
         $indexInterval = $tisFile->readInt();
         $skipInterval  = $tisFile->readInt();
 
-        $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR);
+        $tisFile->seek($prevTermInfo[4] /* indexPointer */ - 20 /* header size*/, SEEK_CUR);
 
-        $termValue    = $prevTerm->text;
-        $termFieldNum = $prevTerm->field;
-        $freqPointer = $prevTermInfo->freqPointer;
-        $proxPointer = $prevTermInfo->proxPointer;
+        $termValue    = $prevTerm[1] /* text */;
+        $termFieldNum = $prevTerm[0] /* field */;
+        $freqPointer = $prevTermInfo[1] /* freqPointer */;
+        $proxPointer = $prevTermInfo[2] /* proxPointer */;
         for ($count = $prevPosition*$indexInterval + 1;
              $count <= $termCount &&
              ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
@@ -438,7 +549,7 @@ class Zend_Search_Lucene_Index_SegmentInfo
             $termPrefixLength = $tisFile->readVInt();
             $termSuffix       = $tisFile->readString();
             $termFieldNum     = $tisFile->readVInt();
-            $termValue        = substr( $termValue, 0, $termPrefixLength ) . $termSuffix;
+            $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;
 
             $docFreq      = $tisFile->readVInt();
             $freqPointer += $tisFile->readVInt();
@@ -451,10 +562,115 @@ class Zend_Search_Lucene_Index_SegmentInfo
         }
 
         if ($termFieldNum == $searchField && $termValue == $term->text) {
-            return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
+            $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
         } else {
-            return null;
+            $termInfo = null;
+        }
+
+        // Put loaded termInfo into cache
+        $this->_termInfoCache[$termKey] = $termInfo;
+
+        if (count($this->_termInfoCache) == 1024) {
+            $this->_cleanUpTermInfoCache();
+        }
+
+        return $termInfo;
+    }
+
+    /**
+     * Collects the frequency of a term for every document of this segment
+     * that contains it.
+     * Result array structure: array(docId => freq, ...)
+     *
+     * $shift is added to every document id used as a result key.
+     * An empty array is returned when getTermInfo() yields no TermInfo
+     * object for $term.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param integer $shift
+     * @return array
+     */
+    public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0)
+    {
+        $info = $this->getTermInfo($term);
+        if (!($info instanceof Zend_Search_Lucene_Index_TermInfo)) {
+            return array();
+        }
+
+        $frqStream = $this->openCompoundFile('.frq');
+        $frqStream->seek($info->freqPointer, SEEK_CUR);
+
+        $freqsByDoc = array();
+        $currentDoc = 0;
+        $entry      = 0;
+        while ($entry < $info->docFreq) {
+            $delta = $frqStream->readVInt();
+
+            if ($delta % 2 == 1) {
+                // Odd value: document delta with an implicit frequency of 1
+                $currentDoc += ($delta-1)/2;
+                $frequency   = 1;
+            } else {
+                // Even value: document delta, the frequency follows as its own VInt
+                $currentDoc += $delta/2;
+                $frequency   = $frqStream->readVInt();
+            }
+
+            $freqsByDoc[$shift + $currentDoc] = $frequency;
+            $entry++;
+        }
+
+        return $freqsByDoc;
+    }
+
+    /**
+     * Collects the positions of a term for every document of this segment
+     * that contains it.
+     * Result array structure: array(docId => array(pos1, pos2, ...), ...)
+     *
+     * $shift is added to every document id used as a result key.
+     * An empty array is returned when getTermInfo() yields no TermInfo
+     * object for $term.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param integer $shift
+     * @return array
+     */
+    public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0)
+    {
+        $info = $this->getTermInfo($term);
+        if (!($info instanceof Zend_Search_Lucene_Index_TermInfo)) {
+            return array();
+        }
+
+        // Pass 1: read the per-document frequencies (unshifted document ids)
+        $frqStream = $this->openCompoundFile('.frq');
+        $frqStream->seek($info->freqPointer, SEEK_CUR);
+
+        $freqsByDoc = array();
+        $currentDoc = 0;
+        $entry      = 0;
+        while ($entry < $info->docFreq) {
+            $delta = $frqStream->readVInt();
+
+            if ($delta % 2 == 1) {
+                // Odd value: document delta with an implicit frequency of 1
+                $currentDoc += ($delta-1)/2;
+                $frequency   = 1;
+            } else {
+                // Even value: document delta, the frequency follows as its own VInt
+                $currentDoc += $delta/2;
+                $frequency   = $frqStream->readVInt();
+            }
+
+            $freqsByDoc[$currentDoc] = $frequency;
+            $entry++;
+        }
+
+        // Pass 2: read as many delta-encoded positions per document as its frequency says.
+        // The '.prx' file is opened only after the '.frq' pass is complete.
+        $prxStream = $this->openCompoundFile('.prx');
+        $prxStream->seek($info->proxPointer, SEEK_CUR);
+
+        $positionsByDoc = array();
+        foreach ($freqsByDoc as $docId => $frequency) {
+            $docPositions = array();
+            $position     = 0;
+
+            for ($read = 0; $read < $frequency; $read++) {
+                $position      += $prxStream->readVInt();
+                $docPositions[] = $position;
+            }
+
+            $positionsByDoc[$shift + $docId] = $docPositions;
+        }
+
+        return $positionsByDoc;
+    }
+
+    /**
+     * Load normalization factors of a field from its index file
+     *
+     * Reads $this->_docCount bytes from the '.f<fieldNum>' file and keeps
+     * the resulting byte string in $this->_norms[$fieldNum].
+     *
+     * @param integer $fieldNum
+     */
+    private function _loadNorm($fieldNum)
+    {
+        $this->_norms[$fieldNum] = $this->openCompoundFile('.f' . $fieldNum)
+                                        ->readBytes($this->_docCount);
+    }
 
     /**
@@ -462,7 +678,7 @@ class Zend_Search_Lucene_Index_SegmentInfo
      *
      * @param integer $id
      * @param string $fieldName
-     * @return string
+     * @return float
      */
     public function norm($id, $fieldName)
     {
@@ -472,14 +688,37 @@ class Zend_Search_Lucene_Index_SegmentInfo
             return null;
         }
 
-        if ( !isset( $this->_norms[$fieldNum] )) {
-            $fFile = $this->openCompoundFile('.f' . $fieldNum);
-            $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
+        if (!isset($this->_norms[$fieldNum])) {
+            $this->_loadNorm($fieldNum);
         }
 
         return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) );
     }
 
+    /**
+     * Returns norm vector, encoded in a byte string
+     *
+     * For a known, indexed field the vector is loaded on first use and taken
+     * from $this->_norms. For an unknown or non-indexed field a vector of
+     * $this->_docCount identical bytes is built from the default similarity's
+     * encoded lengthNorm($fieldName, 0).
+     *
+     * @param string $fieldName
+     * @return string
+     */
+    public function normVector($fieldName)
+    {
+        $fieldNum = $this->getFieldNum($fieldName);
+
+        // The field lookup is only done for a valid field number (short-circuit)
+        $hasStoredNorms = ($fieldNum != -1)  &&  $this->_fields[$fieldNum]->isIndexed;
+
+        if ($hasStoredNorms) {
+            if (!isset($this->_norms[$fieldNum])) {
+                $this->_loadNorm($fieldNum);
+            }
+
+            return $this->_norms[$fieldNum];
+        }
+
+        $similarity      = Zend_Search_Lucene_Search_Similarity::getDefault();
+        $defaultNormByte = chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
+
+        return str_repeat($defaultNormByte, $this->_docCount);
+    }
+
 
     /**
      * Returns true if any documents have been deleted from this index segment.
@@ -571,5 +810,247 @@ class Zend_Search_Lucene_Index_SegmentInfo
 
         $this->_deletedDirty = false;
     }
+
+
+
+    /**
+     * Term Dictionary File object for stream like terms reading
+     *
+     * @var Zend_Search_Lucene_Storage_File
+     */
+    private $_tisFile = null;
+
+    /**
+     * Frequencies File object for stream like terms reading
+     *
+     * @var Zend_Search_Lucene_Storage_File
+     */
+    private $_frqFile = null;
+
+    /**
+     * Offset of the .frq file in the compound file
+     *
+     * @var integer
+     */
+    private $_frqFileOffset;
+
+    /**
+     * Positions File object for stream like terms reading
+     *
+     * @var Zend_Search_Lucene_Storage_File
+     */
+    private $_prxFile = null;
+
+    /**
+     * Offset of the .prx file in the compound file
+     *
+     * @var integer
+     */
+    private $_prxFileOffset;
+
+
+    /**
+     * Number of terms in term stream
+     *
+     * @var integer
+     */
+    private $_termCount = 0;
+
+    /**
+     * Segment skip interval
+     *
+     * @var integer
+     */
+    private $_skipInterval;
+
+    /**
+     * Last TermInfo in a terms stream
+     *
+     * @var Zend_Search_Lucene_Index_TermInfo
+     */
+    private $_lastTermInfo = null;
+
+    /**
+     * Last Term in a terms stream
+     *
+     * @var Zend_Search_Lucene_Index_Term
+     */
+    private $_lastTerm = null;
+
+    /**
+     * Map of the document IDs
+     * Used to get new docID after removing deleted documents.
+     * It's not very effective from memory usage point of view,
+     * but much more faster, then other methods
+     *
+     * @var array|null
+     */
+    private $_docMap = null;
+
+    /**
+     * An array of all term positions in the documents.
+     * Array structure: array( docId => array( pos1, pos2, ...), ...)
+     *
+     * @var array
+     */
+    private $_lastTermPositions;
+
+    /**
+     * Reset terms stream
+     *
+     * (Re)opens the '.tis', '.frq' and '.prx' files, checks the '.tis' format
+     * marker, reads the term count and skip interval, rebuilds the document
+     * id map and positions the stream on the first term by calling nextTerm().
+     *
+     * $startId - id for the first document
+     * $compact - remove deleted documents: non-deleted documents are numbered
+     *            consecutively starting from $startId. Without it each
+     *            non-deleted document is mapped to $startId + <its id>.
+     *            Deleted documents never get a map entry.
+     *
+     * Returns start document id for the next segment
+     * ($startId + number of mapped documents when compacting,
+     *  $startId + total document count otherwise)
+     *
+     * @param integer $startId
+     * @param boolean $compact
+     * @throws Zend_Search_Lucene_Exception  if the '.tis' file doesn't start with the expected format marker
+     * @return integer
+     */
+    public function reset($startId = 0, $compact = false)
+    {
+        if ($this->_tisFile !== null) {
+            $this->_tisFile = null;  // drop the previous file object before a new one is opened
+        }
+
+        $this->_tisFile = $this->openCompoundFile('.tis', false);
+        $tiVersion = $this->_tisFile->readInt();
+        if ($tiVersion != (int)0xFFFFFFFE) {
+            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
+        }
+
+        $this->_termCount    = $this->_tisFile->readLong();
+                               $this->_tisFile->readInt();  // Read Index interval
+        $this->_skipInterval = $this->_tisFile->readInt();  // Read skip interval
+
+        if ($this->_frqFile !== null) {
+            $this->_frqFile = null;
+        }
+        $this->_frqFile = $this->openCompoundFile('.frq', false);
+        $this->_frqFileOffset = $this->_frqFile->tell();  // base for the SEEK_SET seeks in nextTerm()
+
+        if ($this->_prxFile !== null) {
+            $this->_prxFile = null;
+        }
+        $this->_prxFile = $this->openCompoundFile('.prx', false);
+        $this->_prxFileOffset = $this->_prxFile->tell();  // base for the SEEK_SET seeks in nextTerm()
+
+        $this->_lastTerm     = new Zend_Search_Lucene_Index_Term('', -1);  // empty "previous" term for the first prefix lookup
+        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);  // zero base for the pointer deltas
+
+        $this->_docMap = array();
+        for ($count = 0; $count < $this->_docCount; $count++) {
+            if (!$this->isDeleted($count)) {
+                $this->_docMap[$count] = $startId + ($compact ? count($this->_docMap) : $count);
+            }
+        }
+
+        $this->nextTerm();
+        return $startId + ($compact ? count($this->_docMap) : $this->_docCount);
+    }
+
+
+    /**
+     * Scans terms dictionary and returns next term
+     *
+     * Reads the next entry of the '.tis' stream: the term text is rebuilt from
+     * a prefix of the previous term plus the stored suffix, the freq/prox
+     * pointers are stored as deltas to the previous term's pointers.
+     * Then all positions of the term are read from the '.frq' and '.prx'
+     * streams and kept for currentTermPositions(), keyed by the document ids
+     * of the map built in reset(). Documents without a map entry are left out.
+     *
+     * Returns null (and drops the stream file objects) if the stream was not
+     * opened by reset() or no terms are left. After the last term has been
+     * read the file objects are dropped too, but that term is still returned.
+     *
+     * @return Zend_Search_Lucene_Index_Term|null
+     */
+    public function nextTerm()
+    {
+        if ($this->_tisFile === null  ||  $this->_termCount == 0) {
+            $this->_lastTerm     = null;
+            $this->_lastTermInfo = null;
+
+            // may be necessary for "empty" segment
+            $this->_tisFile = null;
+            $this->_frqFile = null;
+            $this->_prxFile = null;
+
+            return null;
+        }
+
+        $termPrefixLength = $this->_tisFile->readVInt();
+        $termSuffix       = $this->_tisFile->readString();
+        $termFieldNum     = $this->_tisFile->readVInt();
+        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix;
+
+        $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name);
+
+        $docFreq     = $this->_tisFile->readVInt();
+        $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
+        $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
+        if ($docFreq >= $this->_skipInterval) {  // a skip offset is only read for terms with at least _skipInterval documents
+            $skipOffset = $this->_tisFile->readVInt();
+        } else {
+            $skipOffset = 0;
+        }
+
+        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
+
+
+        $this->_lastTermPositions = array();
+
+        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
+        $freqs = array();   $docId = 0;
+        for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
+            $docDelta = $this->_frqFile->readVInt();
+            if( $docDelta % 2 == 1 ) {  // odd value: document delta with an implicit frequency of 1
+                $docId += ($docDelta-1)/2;
+                $freqs[ $docId ] = 1;
+            } else {  // even value: document delta, the frequency follows as its own VInt
+                $docId += $docDelta/2;
+                $freqs[ $docId ] = $this->_frqFile->readVInt();
+            }
+        }
+
+        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
+        foreach ($freqs as $docId => $freq) {
+            $termPosition = 0;  $positions = array();
+
+            for ($count = 0; $count < $freq; $count++ ) {
+                $termPosition += $this->_prxFile->readVInt();  // positions are stored as deltas
+                $positions[] = $termPosition;
+            }
+
+            if (isset($this->_docMap[$docId])) {  // positions of unmapped documents are read but not kept
+                $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
+            }
+        }
+
+
+        $this->_termCount--;
+        if ($this->_termCount == 0) {
+            $this->_tisFile = null;
+            $this->_frqFile = null;
+            $this->_prxFile = null;
+        }
+
+        return $this->_lastTerm;
+    }
+
+
+    /**
+     * Returns term in current position
+     *
+     * This is the term most recently read by nextTerm(), or null when
+     * nextTerm() found no further term.
+     *
+     * @return Zend_Search_Lucene_Index_Term|null
+     */
+    public function currentTerm()
+    {
+        return $this->_lastTerm;
+    }
+
+
+    /**
+     * Returns an array of all term positions in the documents.
+     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+     *
+     * The array is filled by nextTerm() for the current term. Its keys are
+     * the document ids taken from the map built in reset(), documents
+     * without a map entry are not included.
+     *
+     * @return array
+     */
+    public function currentTermPositions()
+    {
+        return $this->_lastTermPositions;
+    }
 }
 
diff --git a/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php b/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php
new file mode 100644 (file)
index 0000000..4d0f346
--- /dev/null
@@ -0,0 +1,53 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/PriorityQueue.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Index_SegmentInfoPriorityQueue extends Zend_Search_Lucene_PriorityQueue
+{
+    /**
+     * Compare elements
+     *
+     * Segments are ordered by the key of the term they are currently
+     * positioned on (currentTerm()->key()), compared with strcmp().
+     *
+     * Returns true, if the current term key of $segmentInfo1 is less than
+     * the one of $segmentInfo2; false otherwise
+     *
+     * @param mixed $segmentInfo1
+     * @param mixed $segmentInfo2
+     * @return boolean
+     */
+    protected function _less($segmentInfo1, $segmentInfo2)
+    {
+        $firstKey  = $segmentInfo1->currentTerm()->key();
+        $secondKey = $segmentInfo2->currentTerm()->key();
+
+        return strcmp($firstKey, $secondKey) < 0;
+    }
+
+}
diff --git a/search/Zend/Search/Lucene/Index/SegmentMerger.php b/search/Zend/Search/Lucene/Index/SegmentMerger.php
new file mode 100644 (file)
index 0000000..157489c
--- /dev/null
@@ -0,0 +1,273 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfo */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
+
+/** Zend_Search_Lucene_Index_SegmentWriter_StreamWriter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfoPriorityQueue */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Index_SegmentMerger
+{
+    /**
+     * Target segment writer
+     *
+     * @var Zend_Search_Lucene_Index_SegmentWriter_StreamWriter
+     */
+    private $_writer;
+
+    /**
+     * Number of docs in a new segment
+     * (counted in _mergeStoredFields(), deleted documents are not counted)
+     *
+     * @var integer
+     */
+    private $_docCount;
+
+    /**
+     * A set of segments to be merged, indexed by segment name
+     *
+     * @var array Zend_Search_Lucene_Index_SegmentInfo
+     */
+    private $_segmentInfos = array();
+
+    /**
+     * Flag to signal, that merge is already done
+     *
+     * @var boolean
+     */
+    private $_mergeDone = false;
+
+    /**
+     * Field map
+     * [<segment_name>][<field_number>] => <target_field_number>
+     *
+     * @var array
+     */
+    private $_fieldsMap = array();
+
+
+
+    /**
+     * Object constructor.
+     *
+     * Creates new segment merger with $directory as target to merge segments into
+     * and $name as a name of new segment
+     *
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param string $name
+     */
+    public function __construct($directory, $name)
+    {
+        $this->_writer = new Zend_Search_Lucene_Index_SegmentWriter_StreamWriter($directory, $name);
+    }
+
+
+    /**
+     * Add segment to a collection of segments to be merged
+     *
+     * Segments are stored by name, so adding a segment with an already
+     * registered name replaces the earlier entry.
+     *
+     * @param Zend_Search_Lucene_Index_SegmentInfo $segmentInfo
+     */
+    public function addSource(Zend_Search_Lucene_Index_SegmentInfo $segmentInfo)
+    {
+        $this->_segmentInfos[$segmentInfo->getName()] = $segmentInfo;
+    }
+
+
+    /**
+     * Do merge.
+     *
+     * Merges field infos, norms, stored fields and terms of all added
+     * segments (in that order) and returns the result of closing the
+     * target segment writer.
+     *
+     * NOTE(review): the original description said "number of documents"
+     * while @return names a SegmentInfo; the value comes from
+     * $this->_writer->close() - confirm against the writer.
+     *
+     * @return Zend_Search_Lucene_Index_SegmentInfo
+     * @throws Zend_Search_Lucene_Exception  if merge() was already performed
+     *                                       or no source segment was added
+     */
+    public function merge()
+    {
+        if ($this->_mergeDone) {
+            throw new Zend_Search_Lucene_Exception('Merge is already done.');
+        }
+
+        if (count($this->_segmentInfos) < 1) {
+            throw new Zend_Search_Lucene_Exception('Wrong number of segments to be merged ('
+                                                 . count($this->_segmentInfos)
+                                                 . ').');
+        }
+
+        $this->_mergeFields();
+        $this->_mergeNorms();
+        $this->_mergeStoredFields();
+        $this->_mergeTerms();
+
+        $this->_mergeDone = true;
+
+        return $this->_writer->close();
+    }
+
+
+    /**
+     * Merge fields information
+     *
+     * Registers every field info of every source segment with the writer and
+     * records the field number returned by the writer in $this->_fieldsMap.
+     */
+    private function _mergeFields()
+    {
+        foreach ($this->_segmentInfos as $segName => $segmentInfo) {
+            foreach ($segmentInfo->getFieldInfos() as $fieldInfo) {
+                $this->_fieldsMap[$segName][$fieldInfo->number] = $this->_writer->addFieldInfo($fieldInfo);
+            }
+        }
+    }
+
+    /**
+     * Merge field's normalization factors
+     *
+     * For each indexed field of the target segment the norm vectors of all
+     * source segments are passed to the writer in segment order. Bytes of
+     * deleted documents are dropped from the vector first.
+     */
+    private function _mergeNorms()
+    {
+        foreach ($this->_writer->getFieldInfos() as $fieldInfo) {
+            if ($fieldInfo->isIndexed) {
+                foreach ($this->_segmentInfos as $segmentInfo) {
+                    if ($segmentInfo->hasDeletions()) {
+                        $srcNorm = $segmentInfo->normVector($fieldInfo->name);
+                        $norm    = '';
+                        $docs    = $segmentInfo->count();
+                        for ($count = 0; $count < $docs; $count++) {
+                            if (!$segmentInfo->isDeleted($count)) {
+                                $norm .= $srcNorm[$count];
+                            }
+                        }
+                        $this->_writer->addNorm($fieldInfo->name, $norm);
+                    } else {
+                        $this->_writer->addNorm($fieldInfo->name, $segmentInfo->normVector($fieldInfo->name));
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Merge stored fields
+     *
+     * Reads the stored fields of every document from each segment's '.fdt'
+     * file and passes those of non-deleted documents to the writer.
+     * $this->_docCount is set to the number of documents passed on.
+     */
+    private function _mergeStoredFields()
+    {
+        $this->_docCount = 0;
+
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            $fdtFile = $segmentInfo->openCompoundFile('.fdt');
+
+            // Document count is fetched once per segment (as in _mergeNorms())
+            $docs = $segmentInfo->count();
+            for ($count = 0; $count < $docs; $count++) {
+                // Fields of deleted documents are read as well, since the file is
+                // read sequentially; they are only skipped when adding to the writer
+                $fieldCount = $fdtFile->readVInt();
+                $storedFields = array();
+
+                for ($count2 = 0; $count2 < $fieldCount; $count2++) {
+                    $fieldNum = $fdtFile->readVInt();
+                    $bits = $fdtFile->readByte();
+                    $fieldInfo = $segmentInfo->getField($fieldNum);
+
+                    if (!($bits & 2)) { // Text data
+                        $storedFields[] =
+                                 new Zend_Search_Lucene_Field($fieldInfo->name,
+                                                              $fdtFile->readString(),
+                                                              'UTF-8',
+                                                              true,
+                                                              $fieldInfo->isIndexed,
+                                                              $bits & 1 );
+                    } else {            // Binary data
+                        $storedFields[] =
+                                 new Zend_Search_Lucene_Field($fieldInfo->name,
+                                                              $fdtFile->readBinary(),
+                                                              '',
+                                                              true,
+                                                              $fieldInfo->isIndexed,
+                                                              $bits & 1,
+                                                              true);
+                    }
+                }
+
+                if (!$segmentInfo->isDeleted($count)) {
+                    $this->_docCount++;
+                    $this->_writer->addStoredFields($storedFields);
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Merge terms and their positions
+     *
+     * Every source segment's term stream is reset (with compaction, using
+     * consecutive document id ranges) and put into a priority queue ordered
+     * by current term key. Positions of equal terms coming from different
+     * segments are collected into one array, which is written out as soon as
+     * the queue is empty or its top holds a different term.
+     */
+    private function _mergeTerms()
+    {
+        $segmentInfoQueue = new Zend_Search_Lucene_Index_SegmentInfoPriorityQueue();
+
+        $segmentStartId = 0;
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            $segmentStartId = $segmentInfo->reset($segmentStartId, true);
+
+            // Skip "empty" segments
+            if ($segmentInfo->currentTerm() !== null) {
+                $segmentInfoQueue->put($segmentInfo);
+            }
+        }
+
+        $this->_writer->initializeDictionaryFiles();
+
+        $termDocs = array();
+        while (($segmentInfo = $segmentInfoQueue->pop()) !== null) {
+            // Merge positions array
+            $termDocs += $segmentInfo->currentTermPositions();
+
+            if ($segmentInfoQueue->top() === null ||
+                $segmentInfoQueue->top()->currentTerm()->key() !=
+                            $segmentInfo->currentTerm()->key()) {
+                // We got new term
+                ksort($termDocs, SORT_NUMERIC);
+
+                // Add term if it's contained in any document
+                if (count($termDocs) > 0) {
+                    $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs);
+                }
+                $termDocs = array();
+            }
+
+            $segmentInfo->nextTerm();
+            // check, if segment dictionary is finished
+            if ($segmentInfo->currentTerm() !== null) {
+                // Put segment back into the priority queue
+                $segmentInfoQueue->put($segmentInfo);
+            }
+        }
+
+        $this->_writer->closeDictionaryFiles();
+    }
+}
index 6cb4477..2f1a05e 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
 /** Zend_Search_Lucene_Exception */
-require_once 'Zend/Search/Lucene/Exception.php';
-
-/** Zend_Search_Lucene_Analysis_Analyzer */
-require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
 
 /** Zend_Search_Lucene_Index_SegmentInfo */
-require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
 
 
 /**
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
-class Zend_Search_Lucene_Index_SegmentWriter
+abstract class Zend_Search_Lucene_Index_SegmentWriter
 {
     /**
      * Expert: The fraction of terms in the "dictionary" which should be stored
@@ -48,7 +45,7 @@ class Zend_Search_Lucene_Index_SegmentWriter
      *
      * @var integer
      */
-    static public $indexInterval = 128;
+    public static $indexInterval = 128;
 
     /** Expert: The fraction of TermDocs entries stored in skip tables.
      * Larger values result in smaller indexes, greater acceleration, but fewer
@@ -61,28 +58,28 @@ class Zend_Search_Lucene_Index_SegmentWriter
      *
      * @var integer
      */
-    static public $skipInterval = 0x7FFFFFFF;
+    public static $skipInterval = 0x7FFFFFFF;
 
     /**
      * Number of docs in a segment
      *
      * @var integer
      */
-    private $_docCount;
+    protected $_docCount = 0;
 
     /**
      * Segment name
      *
      * @var string
      */
-    private $_name;
+    protected $_name;
 
     /**
      * File system adapter.
      *
      * @var Zend_Search_Lucene_Storage_Directory
      */
-    private $_directory;
+    protected $_directory;
 
     /**
      * List of the index files.
@@ -90,52 +87,41 @@ class Zend_Search_Lucene_Index_SegmentWriter
      *
      * @var unknown_type
      */
-    private $_files;
-
-    /**
-     * Term Dictionary
-     * Array of the Zend_Search_Lucene_Index_Term objects
-     * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
-     *
-     * @var array
-     */
-    private $_termDictionary;
-
-    /**
-     * Documents, which contain the term
-     *
-     * @var array
-     */
-    private $_termDocs;
+    protected $_files = array();
 
     /**
      * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
      *
      * @var array
      */
-    private $_fields;
+    protected $_fields = array();
 
     /**
-     * Sizes of the indexed fields.
-     * Used for normalization factors calculation.
+     * Normalization factors.
+     * An array fieldName => normVector
+     * normVector is a binary string.
+     * Each byte corresponds to an indexed document in a segment and
+     * encodes normalization factor (float value, encoded by
+     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
      *
      * @var array
      */
-    private $_fieldLengths;
+    protected $_norms = array();
+
 
     /**
      * '.fdx'  file - Stored Fields, the field index.
      *
      * @var Zend_Search_Lucene_Storage_File
      */
-    private $_fdxFile;
+    protected $_fdxFile = null;
 
     /**
      * '.fdt'  file - Stored Fields, the field data.
      *
      * @var Zend_Search_Lucene_Storage_File
      */
-    private $_fdtFile;
+    protected $_fdtFile = null;
 
 
     /**
@@ -144,132 +130,125 @@ class Zend_Search_Lucene_Index_SegmentWriter
      * @param Zend_Search_Lucene_Storage_Directory $directory
      * @param string $name
      */
-    public function __construct($directory, $name)
+    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
     {
         $this->_directory = $directory;
         $this->_name      = $name;
-        $this->_docCount  = 0;
-
-        $this->_fields         = array();
-        $this->_termDocs       = array();
-        $this->_files          = array();
-        $this->_norms          = array();
-        $this->_fieldLengths   = array();
-        $this->_termDictionary = array();
-
-        $this->_fdxFile = null;
-        $this->_fdtFile = null;
     }
 
 
     /**
      * Add field to the segment
      *
+     * Returns actual field number
+     *
      * @param Zend_Search_Lucene_Field $field
+     * @return integer
      */
-    private function _addFieldInfo(Zend_Search_Lucene_Field $field)
+    public function addField(Zend_Search_Lucene_Field $field)
     {
         if (!isset($this->_fields[$field->name])) {
+            $fieldNumber = count($this->_fields);
             $this->_fields[$field->name] =
                                 new Zend_Search_Lucene_Index_FieldInfo($field->name,
                                                                        $field->isIndexed,
-                                                                       count($this->_fields),
+                                                                       $fieldNumber,
                                                                        $field->storeTermVector);
+
+            return $fieldNumber;
         } else {
             $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
             $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
+
+            return $this->_fields[$field->name]->number;
         }
     }
 
-
     /**
-     * Adds a document to this segment.
+     * Add fieldInfo to the segment
      *
-     * @param Zend_Search_Lucene_Document $document
-     * @throws Zend_Search_Lucene_Exception
+     * Returns actual field number
+     *
+     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
+     * @return integer
      */
-    public function addDocument(Zend_Search_Lucene_Document $document)
+    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
     {
-        $storedFields = array();
+        if (!isset($this->_fields[$fieldInfo->name])) {
+            $fieldNumber = count($this->_fields);
+            $this->_fields[$fieldInfo->name] =
+                                new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
+                                                                       $fieldInfo->isIndexed,
+                                                                       $fieldNumber,
+                                                                       $fieldInfo->storeTermVector);
+
+            return $fieldNumber;
+        } else {
+            $this->_fields[$fieldInfo->name]->isIndexed       |= $fieldInfo->isIndexed;
+            $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;
 
-        foreach ($document->getFieldNames() as $fieldName) {
-            $field = $document->getField($fieldName);
-            $this->_addFieldInfo($field);
+            return $this->_fields[$fieldInfo->name]->number;
+        }
+    }
 
-            if ($field->storeTermVector) {
-                /**
-                 * @todo term vector storing support
-                 */
-                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
-            }
+    /**
+     * Returns array of FieldInfo objects.
+     *
+     * @return array
+     */
+    public function getFieldInfos()
+    {
+        return $this->_fields;
+    }
 
-            if ($field->isIndexed) {
-                if ($field->isTokenized) {
-                    $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
-                } else {
-                    $tokenList = array();
-                    $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
-                }
-                $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList);
-
-                $position = 0;
-                foreach ($tokenList as $token) {
-                    $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
-                    $termKey = $term->key();
-
-                    if (!isset($this->_termDictionary[$termKey])) {
-                        // New term
-                        $this->_termDictionary[$termKey] = $term;
-                        $this->_termDocs[$termKey] = array();
-                        $this->_termDocs[$termKey][$this->_docCount] = array();
-                    } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
-                        // Existing term, but new term entry
-                        $this->_termDocs[$termKey][$this->_docCount] = array();
-                    }
-                    $position += $token->getPositionIncrement();
-                    $this->_termDocs[$termKey][$this->_docCount][] = $position;
-                }
-            }
+    /**
+     * Add stored fields information
+     *
+     * @param array $storedFields array of Zend_Search_Lucene_Field objects
+     */
+    public function addStoredFields($storedFields)
+    {
+        if (!isset($this->_fdxFile)) {
+            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
+            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
 
-            if ($field->isStored) {
-                $storedFields[] = $field;
-            }
+            $this->_files[] = $this->_name . '.fdx';
+            $this->_files[] = $this->_name . '.fdt';
         }
 
-        if (count($storedFields) != 0) {
-            if (!isset($this->_fdxFile)) {
-                $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
-                $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
-
-                $this->_files[] = $this->_name . '.fdx';
-                $this->_files[] = $this->_name . '.fdt';
-            }
-
-            $this->_fdxFile->writeLong($this->_fdtFile->tell());
-            $this->_fdtFile->writeVInt(count($storedFields));
-            foreach ($storedFields as $field) {
-                $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
-                $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
-                             ($field->isBinary ?    0x02 : 0x00) |
-                             0x00; /* 0x04 - third bit, compressed (ZLIB) */
-                $this->_fdtFile->writeByte($fieldBits);
-                if ($field->isBinary) {
-                    $this->_fdtFile->writeVInt(strlen($field->stringValue));
-                    $this->_fdtFile->writeBytes($field->stringValue);
-                } else {
-                    $this->_fdtFile->writeString($field->stringValue);
-                }
+        $this->_fdxFile->writeLong($this->_fdtFile->tell());
+        $this->_fdtFile->writeVInt(count($storedFields));
+        foreach ($storedFields as $field) {
+            $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
+            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
+                         ($field->isBinary ?    0x02 : 0x00) |
+                         0x00; /* 0x04 - third bit, compressed (ZLIB) */
+            $this->_fdtFile->writeByte($fieldBits);
+            if ($field->isBinary) {
+                $this->_fdtFile->writeVInt(strlen($field->value));
+                $this->_fdtFile->writeBytes($field->value);
+            } else {
+                $this->_fdtFile->writeString($field->getUtf8Value());
             }
         }
 
         $this->_docCount++;
     }
 
+    /**
+     * Returns the total number of documents in this segment.
+     *
+     * This is the running counter kept in $_docCount, which
+     * addStoredFields() advances by one on every call.
+     *
+     * @return integer
+     */
+    public function count()
+    {
+        return $this->_docCount;
+    }
 
     /**
      * Dump Field Info (.fnm) segment file
      */
-    private function _dumpFNM()
+    protected function _dumpFNM()
     {
         $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
         $fnmFile->writeVInt(count($this->_fields));
@@ -283,20 +262,9 @@ class Zend_Search_Lucene_Index_SegmentWriter
                                );
 
             if ($field->isIndexed) {
-                $fieldNum   = $this->_fields[$field->name]->number;
-                $fieldName  = $field->name;
-                $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();
-                $norm       = '';
-
-                for ($count = 0; $count < $this->_docCount; $count++) {
-                    $numTokens = isset($this->_fieldLengths[$fieldName][$count]) ?
-                                      $this->_fieldLengths[$fieldName][$count] : 0;
-                    $norm .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, $numTokens)));
-                }
-
-                $normFileName = $this->_name . '.f' . $fieldNum;
+                $normFileName = $this->_name . '.f' . $field->number;
                 $fFile = $this->_directory->createFile($normFileName);
-                $fFile->writeBytes($norm);
+                $fFile->writeBytes($this->_norms[$field->name]);
                 $this->_files[] = $normFileName;
             }
         }
@@ -305,6 +273,194 @@ class Zend_Search_Lucene_Index_SegmentWriter
     }
 
 
+
+    /**
+     * Term Dictionary file
+     *
+     * @var Zend_Search_Lucene_Storage_File
+     */
+    private $_tisFile = null;
+
+    /**
+     * Term Dictionary index file
+     *
+     * @var Zend_Search_Lucene_Storage_File
+     */
+    private $_tiiFile = null;
+
+    /**
+     * Frequencies file
+     *
+     * @var Zend_Search_Lucene_Storage_File
+     */
+    private $_frqFile = null;
+
+    /**
+     * Positions file
+     *
+     * @var Zend_Search_Lucene_Storage_File
+     */
+    private $_prxFile = null;
+
+    /**
+     * Number of written terms
+     *
+     * @var integer
+     */
+    private $_termCount;
+
+
+    /**
+     * Last saved term
+     *
+     * @var Zend_Search_Lucene_Index_Term
+     */
+    private $_prevTerm;
+
+    /**
+     * Last saved term info
+     *
+     * @var Zend_Search_Lucene_Index_TermInfo
+     */
+    private $_prevTermInfo;
+
+    /**
+     * Last saved index term
+     *
+     * @var Zend_Search_Lucene_Index_Term
+     */
+    private $_prevIndexTerm;
+
+    /**
+     * Last saved index term info
+     *
+     * @var Zend_Search_Lucene_Index_TermInfo
+     */
+    private $_prevIndexTermInfo;
+
+    /**
+     * Last term dictionary file position
+     *
+     * @var integer
+     */
+    private $_lastIndexPosition;
+
+    /**
+     * Create dictionary (.tis), dictionary index (.tii), frequency (.frq) and
+     * positions (.prx) files and write the necessary headers.
+     *
+     * The term counts in the .tis and .tii headers are written as zero
+     * placeholders here; closeDictionaryFiles() seeks back and overwrites them.
+     * The per-term writer state used by addTerm() is reset as well.
+     */
+    public function initializeDictionaryFiles()
+    {
+        // NOTE(review): 0xFFFFFFFE is presumably the dictionary format marker - confirm against the index format spec
+        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
+        $this->_tisFile->writeInt((int)0xFFFFFFFE);
+        $this->_tisFile->writeLong(0 /* dummy data for terms count */);
+        $this->_tisFile->writeInt(self::$indexInterval);
+        $this->_tisFile->writeInt(self::$skipInterval);
+
+        // The .tii file starts with the same header layout as the .tis file
+        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
+        $this->_tiiFile->writeInt((int)0xFFFFFFFE);
+        $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
+        $this->_tiiFile->writeInt(self::$indexInterval);
+        $this->_tiiFile->writeInt(self::$skipInterval);
+
+        /** Dump dictionary header */
+        $this->_tiiFile->writeVInt(0);                    // prefix length
+        $this->_tiiFile->writeString('');                 // suffix
+        $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
+        $this->_tiiFile->writeByte((int)0x0F);
+        $this->_tiiFile->writeVInt(0);                    // DocFreq
+        $this->_tiiFile->writeVInt(0);                    // FreqDelta
+        $this->_tiiFile->writeVInt(0);                    // ProxDelta
+        $this->_tiiFile->writeVInt(20);                   // IndexDelta
+
+        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
+        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');
+
+        // Register the new files in the segment file list
+        $this->_files[] = $this->_name . '.tis';
+        $this->_files[] = $this->_name . '.tii';
+        $this->_files[] = $this->_name . '.frq';
+        $this->_files[] = $this->_name . '.prx';
+
+        $this->_prevTerm          = null;
+        $this->_prevTermInfo      = null;
+        $this->_prevIndexTerm     = null;
+        $this->_prevIndexTermInfo = null;
+        // Matches the IndexDelta written above; presumably the byte size of the
+        // .tis header (int + long + int + int) - TODO confirm against Storage_File
+        $this->_lastIndexPosition = 20;
+        $this->_termCount         = 0;
+
+    }
+
+    /**
+     * Write a single term together with its postings.
+     *
+     * Document/frequency data goes to the frequencies file, positions go to the
+     * positions file and the term entry itself goes to the term dictionary.
+     * Every self::$indexInterval-th term is additionally written to the
+     * term dictionary index file.
+     *
+     * $termDocs has the form array( docId => array(pos1, pos2, pos3, ...), ... )
+     *
+     * @param Zend_Search_Lucene_Index_Term $termEntry
+     * @param array $termDocs
+     */
+    public function addTerm($termEntry, $termDocs)
+    {
+        $frqStart = $this->_frqFile->tell();
+        $prxStart = $this->_prxFile->tell();
+        $docFreq  = count($termDocs);
+
+        $lastDocId = 0;
+        foreach ($termDocs as $docId => $positions) {
+            // The doc id delta is doubled; an odd value means that
+            // no separate frequency value follows it
+            $shiftedDelta = ($docId - $lastDocId)*2;
+            $lastDocId    = $docId;
+            $occurrences  = count($positions);
+
+            if ($occurrences <= 1) {
+                $this->_frqFile->writeVInt($shiftedDelta + 1);
+            } else {
+                $this->_frqFile->writeVInt($shiftedDelta);
+                $this->_frqFile->writeVInt($occurrences);
+            }
+
+            // Positions are stored as deltas from the previous position
+            $lastPosition = 0;
+            foreach ($positions as $pos) {
+                $this->_prxFile->writeVInt($pos - $lastPosition);
+                $lastPosition = $pos;
+            }
+        }
+
+        /**
+         * @todo Write Skip Data to a freq file.
+         * It's not used now, but make index more optimal
+         */
+        $skipOffset = ($docFreq >= self::$skipInterval)
+                    ? $this->_frqFile->tell() - $frqStart
+                    : 0;
+
+        // The dictionary stores the field number instead of the field name
+        $termText    = $termEntry->text;
+        $fieldNumber = $this->_fields[$termEntry->field]->number;
+
+        $term     = new Zend_Search_Lucene_Index_Term($termText, $fieldNumber);
+        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $frqStart, $prxStart, $skipOffset);
+
+        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);
+
+        $isIndexEntry = (($this->_termCount + 1) % self::$indexInterval == 0);
+        if ($isIndexEntry) {
+            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);
+
+            $tisPosition = $this->_tisFile->tell();
+            $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
+            $this->_lastIndexPosition = $tisPosition;
+        }
+
+        $this->_termCount++;
+    }
+
+    /**
+     * Close dictionary
+     *
+     * Replaces the zero term counts written by initializeDictionaryFiles()
+     * with the real values: the number of written terms for the dictionary
+     * file and the derived entry count for the dictionary index file.
+     */
+    public function closeDictionaryFiles()
+    {
+        // The count placeholder follows the leading int of each header
+        $countOffset     = 4;
+        $indexEntryCount = ceil(($this->_termCount + 2)/self::$indexInterval);
+
+        $this->_tisFile->seek($countOffset);
+        $this->_tisFile->writeLong($this->_termCount);
+
+        $this->_tiiFile->seek($countOffset);
+        $this->_tiiFile->writeLong($indexEntryCount);
+    }
+
+
     /**
      * Dump Term Dictionary segment file entry.
      * Used to write entry to .tis or .tii files
@@ -315,22 +471,47 @@ class Zend_Search_Lucene_Index_SegmentWriter
      * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
      * @param Zend_Search_Lucene_Index_TermInfo $termInfo
      */
-    private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
+    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                         &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                         &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
     {
         if (isset($prevTerm) && $prevTerm->field == $term->field) {
-            $prefixLength = 0;
-            while ($prefixLength < strlen($prevTerm->text) &&
-                   $prefixLength < strlen($term->text) &&
-                   $prevTerm->text{$prefixLength} == $term->text{$prefixLength}
-                  ) {
-                $prefixLength++;
+            $matchedBytes = 0;
+            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
+            while ($matchedBytes < $maxBytes  &&
+                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
+                $matchedBytes++;
+            }
+
+            // Calculate actual matched UTF-8 pattern
+            $prefixBytes = 0;
+            $prefixChars = 0;
+            while ($prefixBytes < $matchedBytes) {
+                $charBytes = 1;
+                if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
+                    $charBytes++;
+                    if (ord($term->text[$prefixBytes]) & 0x20 ) {
+                        $charBytes++;
+                        if (ord($term->text[$prefixBytes]) & 0x10 ) {
+                            $charBytes++;
+                        }
+                    }
+                }
+
+                if ($prefixBytes + $charBytes > $matchedBytes) {
+                    // char crosses matched bytes boundary
+                    // skip char
+                    break;
+                }
+
+                $prefixChars++;
+                $prefixBytes += $charBytes;
             }
+
             // Write prefix length
-            $dicFile->writeVInt($prefixLength);
+            $dicFile->writeVInt($prefixChars);
             // Write suffix
-            $dicFile->writeString( substr($term->text, $prefixLength) );
+            $dicFile->writeString(substr($term->text, $prefixBytes));
         } else {
             // Write prefix length
             $dicFile->writeVInt(0);
@@ -363,107 +544,11 @@ class Zend_Search_Lucene_Index_SegmentWriter
         $prevTermInfo = $termInfo;
     }
 
-    /**
-     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
-     */
-    private function _dumpDictionary()
-    {
-        $termKeys = array_keys($this->_termDictionary);
-        sort($termKeys, SORT_STRING);
-
-        $tisFile = $this->_directory->createFile($this->_name . '.tis');
-        $tisFile->writeInt((int)0xFFFFFFFE);
-        $tisFile->writeLong(count($termKeys));
-        $tisFile->writeInt(self::$indexInterval);
-        $tisFile->writeInt(self::$skipInterval);
-
-        $tiiFile = $this->_directory->createFile($this->_name . '.tii');
-        $tiiFile->writeInt((int)0xFFFFFFFE);
-        $tiiFile->writeLong(ceil((count($termKeys) + 2)/self::$indexInterval));
-        $tiiFile->writeInt(self::$indexInterval);
-        $tiiFile->writeInt(self::$skipInterval);
-
-        /** Dump dictionary header */
-        $tiiFile->writeVInt(0);                    // preffix length
-        $tiiFile->writeString('');                 // suffix
-        $tiiFile->writeInt((int)0xFFFFFFFF);       // field number
-        $tiiFile->writeByte((int)0x0F);
-        $tiiFile->writeVInt(0);                    // DocFreq
-        $tiiFile->writeVInt(0);                    // FreqDelta
-        $tiiFile->writeVInt(0);                    // ProxDelta
-        $tiiFile->writeVInt(20);                   // IndexDelta
-
-        $frqFile = $this->_directory->createFile($this->_name . '.frq');
-        $prxFile = $this->_directory->createFile($this->_name . '.prx');
-
-        $termCount = 1;
-
-        $prevTerm     = null;
-        $prevTermInfo = null;
-        $prevIndexTerm     = null;
-        $prevIndexTermInfo = null;
-        $prevIndexPosition = 20;
-
-        foreach ($termKeys as $termId) {
-            $freqPointer = $frqFile->tell();
-            $proxPointer = $prxFile->tell();
-
-            $prevDoc = 0;
-            foreach ($this->_termDocs[$termId] as $docId => $termPositions) {
-                $docDelta = ($docId - $prevDoc)*2;
-                $prevDoc = $docId;
-                if (count($termPositions) > 1) {
-                    $frqFile->writeVInt($docDelta);
-                    $frqFile->writeVInt(count($termPositions));
-                } else {
-                    $frqFile->writeVInt($docDelta + 1);
-                }
-
-                $prevPosition = 0;
-                foreach ($termPositions as $position) {
-                    $prxFile->writeVInt($position - $prevPosition);
-                    $prevPosition = $position;
-                }
-            }
-
-            if (count($this->_termDocs[$termId]) >= self::$skipInterval) {
-                /**
-                 * @todo Write Skip Data to a freq file.
-                 * It's not used now, but make index more optimal
-                 */
-                $skipOffset = $frqFile->tell() - $freqPointer;
-            } else {
-                $skipOffset = 0;
-            }
-
-            $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text,
-                                                      $this->_fields[$this->_termDictionary[$termId]->field]->number);
-            $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]),
-                                            $freqPointer, $proxPointer, $skipOffset);
-
-            $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo);
-
-            if ($termCount % self::$indexInterval == 0) {
-                $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo);
-
-                $indexPosition = $tisFile->tell();
-                $tiiFile->writeVInt($indexPosition - $prevIndexPosition);
-                $prevIndexPosition = $indexPosition;
-            }
-            $termCount++;
-        }
-
-        $this->_files[] = $this->_name . '.tis';
-        $this->_files[] = $this->_name . '.tii';
-        $this->_files[] = $this->_name . '.frq';
-        $this->_files[] = $this->_name . '.prx';
-    }
-
 
     /**
      * Generate compound index file
      */
-    private function _generateCFS()
+    protected function _generateCFS()
     {
         $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
         $cfsFile->writeVInt(count($this->_files));
@@ -486,8 +571,13 @@ class Zend_Search_Lucene_Index_SegmentWriter
             $cfsFile->seek($dataOffset);
 
             $dataFile = $this->_directory->getFileObject($fileName);
-            $data = $dataFile->readBytes($this->_directory->fileLength($fileName));
-            $cfsFile->writeBytes($data);
+
+            $byteCount = $this->_directory->fileLength($fileName);
+            while ($byteCount > 0) {
+                $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
+                $byteCount -= strlen($data);
+                $cfsFile->writeBytes($data);
+            }
 
             $this->_directory->deleteFile($fileName);
         }
@@ -499,21 +589,6 @@ class Zend_Search_Lucene_Index_SegmentWriter
      *
      * @return Zend_Search_Lucene_Index_SegmentInfo
      */
-    public function close()
-    {
-        if ($this->_docCount == 0) {
-            return null;
-        }
-
-        $this->_dumpFNM();
-        $this->_dumpDictionary();
-
-        $this->_generateCFS();
-
-        return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
-                                                        $this->_docCount,
-                                                        $this->_directory);
-    }
-
+    abstract public function close();
 }
 
diff --git a/search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php b/search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php
new file mode 100644 (file)
index 0000000..7dd2bf9
--- /dev/null
@@ -0,0 +1,213 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer.php';
+
+/** Zend_Search_Lucene_Index_SegmentWriter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
+{
+    /**
+     * Term Dictionary
+     * Array of the Zend_Search_Lucene_Index_Term objects, keyed by the term key
+     * (Zend_Search_Lucene_Index_Term::key()).
+     * Corresponding postings are stored in $_termDocs under the same key.
+     *
+     * @var array
+     */
+    protected $_termDictionary;
+
+    /**
+     * Documents, which contain the term
+     *
+     * array( termKey => array( docId => array(pos1, pos2, ...), ... ), ... )
+     *
+     * @var array
+     */
+    protected $_termDocs;
+
+    /**
+     * Object constructor.
+     *
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param string $name
+     */
+    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
+    {
+        parent::__construct($directory, $name);
+
+        $this->_termDocs       = array();
+        $this->_termDictionary = array();
+    }
+
+
+    /**
+     * Adds a document to this segment.
+     *
+     * Indexed fields contribute terms and a per-document norm byte, stored
+     * fields are passed on to addStoredFields(), which also advances the
+     * document counter.
+     *
+     * Unsupported fields are rejected before anything is recorded, so a
+     * failed call leaves the segment untouched.
+     *
+     * @param Zend_Search_Lucene_Document $document
+     * @throws Zend_Search_Lucene_Exception if a field requests term vector storing
+     */
+    public function addDocument(Zend_Search_Lucene_Document $document)
+    {
+        $fieldNames = $document->getFieldNames();
+
+        // Validate the whole document first. Throwing in the middle of the
+        // indexing loop would leave postings registered for the current
+        // document number, which the next added document would then inherit.
+        foreach ($fieldNames as $fieldName) {
+            if ($document->getField($fieldName)->storeTermVector) {
+                /**
+                 * @todo term vector storing support
+                 */
+                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
+            }
+        }
+
+        $storedFields = array();
+        $docNorms     = array();
+        $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();
+
+        foreach ($fieldNames as $fieldName) {
+            $field = $document->getField($fieldName);
+            $this->addField($field);
+
+            if ($field->isIndexed) {
+                if ($field->isTokenized) {
+                    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
+                    $analyzer->setInput($field->value, $field->encoding);
+
+                    $position     = 0;
+                    $tokenCounter = 0;
+                    while (($token = $analyzer->nextToken()) !== null) {
+                        $tokenCounter++;
+
+                        $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
+                        $position += $token->getPositionIncrement();
+                        $this->_addTermPosition($term, $position);
+                    }
+                } else {
+                    // The whole field value is indexed as a single term at position 0
+                    $term = new Zend_Search_Lucene_Index_Term($field->getUtf8Value(), $field->name);
+                    $this->_addTermPosition($term, 0);
+
+                    $tokenCounter = 1;
+                }
+
+                $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name,
+                                                                                               $tokenCounter)*
+                                                                       $document->boost*
+                                                                       $field->boost ));
+            }
+
+            if ($field->isStored) {
+                $storedFields[] = $field;
+            }
+        }
+
+
+        // Every indexed field of the segment gets exactly one norm byte per document
+        foreach ($this->_fields as $fieldName => $field) {
+            if (!$field->isIndexed) {
+                continue;
+            }
+
+            if (!isset($this->_norms[$fieldName])) {
+                // Field is new for the segment: fill in norms for all previous documents
+                $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
+                                                       $this->_docCount);
+            }
+
+            if (isset($docNorms[$fieldName])){
+                $this->_norms[$fieldName] .= $docNorms[$fieldName];
+            } else {
+                // Field is absent (or not indexed) in this document
+                $this->_norms[$fieldName] .= chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
+            }
+        }
+
+        $this->addStoredFields($storedFields);
+    }
+
+
+    /**
+     * Register one occurrence of a term in the document which is currently being added.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param integer $position
+     */
+    private function _addTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
+    {
+        $termKey = $term->key();
+
+        if (!isset($this->_termDictionary[$termKey])) {
+            // New term
+            $this->_termDictionary[$termKey] = $term;
+            $this->_termDocs[$termKey] = array();
+        }
+
+        if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
+            // First occurrence of the term in the current document
+            $this->_termDocs[$termKey][$this->_docCount] = array();
+        }
+
+        $this->_termDocs[$termKey][$this->_docCount][] = $position;
+    }
+
+
+    /**
+     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
+     *
+     * Terms are written in term key order (string comparison).
+     */
+    protected function _dumpDictionary()
+    {
+        ksort($this->_termDictionary, SORT_STRING);
+
+        $this->initializeDictionaryFiles();
+
+        foreach ($this->_termDictionary as $termId => $term) {
+            $this->addTerm($term, $this->_termDocs[$termId]);
+        }
+
+        $this->closeDictionaryFiles();
+    }
+
+
+    /**
+     * Close segment, write it to disk and return segment info
+     *
+     * Nothing is written and null is returned if no documents were added.
+     *
+     * @return Zend_Search_Lucene_Index_SegmentInfo|null
+     */
+    public function close()
+    {
+        if ($this->_docCount == 0) {
+            return null;
+        }
+
+        $this->_dumpFNM();
+        $this->_dumpDictionary();
+
+        $this->_generateCFS();
+
+        return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
+                                                        $this->_docCount,
+                                                        $this->_directory);
+    }
+
+}
+
diff --git a/search/Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php b/search/Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php
new file mode 100644 (file)
index 0000000..ba0e202
--- /dev/null
@@ -0,0 +1,94 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfo */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
+
+/** Zend_Search_Lucene_Index_SegmentWriter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter.php';
+
+
+/**
+ * Segment writer whose stored fields files and norm vectors are driven by the
+ * caller through public methods. Its close() calls _dumpFNM() and
+ * _generateCFS() only; no term dictionary is dumped by this class.
+ *
+ * NOTE(review): presumably used by Zend_Search_Lucene_Index_SegmentMerger to
+ * stream already indexed data into a new segment - confirm against the merger.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Index_SegmentWriter_StreamWriter extends Zend_Search_Lucene_Index_SegmentWriter
+{
+    /**
+     * Object constructor.
+     *
+     * Only forwards both arguments to the parent constructor.
+     *
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param string $name
+     */
+    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
+    {
+        parent::__construct($directory, $name);
+    }
+
+
+    /**
+     * Create stored fields files and open them for write
+     *
+     * Creates '<segment name>.fdx' and '<segment name>.fdt' in the segment
+     * directory and registers both file names in $_files.
+     */
+    public function createStoredFieldsFiles()
+    {
+        $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
+        $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
+
+        $this->_files[] = $this->_name . '.fdx';
+        $this->_files[] = $this->_name . '.fdt';
+    }
+
+    /**
+     * Add a norm vector for the specified field
+     *
+     * The vector is appended to the norms already collected for $fieldName,
+     * or stored as the initial value if the field has no norms yet.
+     *
+     * @param string $fieldName
+     * @param string $normVector
+     */
+    public function addNorm($fieldName, $normVector)
+    {
+        if (isset($this->_norms[$fieldName])) {
+            $this->_norms[$fieldName] .= $normVector;
+        } else {
+            $this->_norms[$fieldName] = $normVector;
+        }
+    }
+
+    /**
+     * Close segment, write it to disk and return segment info
+     *
+     * If the segment's document counter ($_docCount) is zero, nothing is
+     * dumped and null is returned instead of a segment info.
+     *
+     * @return Zend_Search_Lucene_Index_SegmentInfo|null
+     */
+    public function close()
+    {
+        if ($this->_docCount == 0) {
+            return null;
+        }
+
+        $this->_dumpFNM();
+        $this->_generateCFS();
+
+        return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
+                                                        $this->_docCount,
+                                                        $this->_directory);
+    }
+}
+
index 3deffa9..465b4ef 100644 (file)
@@ -15,7 +15,7 @@
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
@@ -31,7 +31,7 @@
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Index_Term
@@ -52,21 +52,57 @@ class Zend_Search_Lucene_Index_Term
 
 
     /**
-     * @todo docblock
+     * Object constructor
      */
-    public function __construct( $text, $field = 'contents' )
+    public function __construct($text, $field = null)
     {
-        $this->field = $field;
-        $this->text = $text;
+        $this->field = ($field === null)?  Zend_Search_Lucene::getDefaultSearchField() : $field;
+        $this->text  = $text;
     }
 
 
     /**
-     * @todo docblock
+     * Returns term key
+     *
+     * @return string
      */
     public function key()
     {
         return $this->field . chr(0) . $this->text;
     }
+
+    /**
+     * Get term prefix
+     *
+     * Returns the leading $length characters of $str, where $str is walked
+     * as a UTF-8 byte sequence: the number of bytes of each character is
+     * derived from the high bits of its first byte. Bytes that do not have
+     * both top bits set (including stray continuation bytes) are counted as
+     * one character each. If $str ends in the middle of a multi-byte
+     * character, the prefix stops before that character.
+     *
+     * @param string $str
+     * @param integer $length
+     * @return string
+     */
+    public static function getPrefix($str, $length)
+    {
+        $prefixBytes = 0;
+        $prefixChars = 0;
+        while ($prefixBytes < strlen($str)  &&  $prefixChars < $length) {
+            // Determine the byte length of the current character from its first byte
+            $charBytes = 1;
+            if ((ord($str[$prefixBytes]) & 0xC0) == 0xC0) {
+                // both top bits set: first byte of a multi-byte sequence (at least 2 bytes)
+                $charBytes++;
+                if (ord($str[$prefixBytes]) & 0x20 ) {
+                    // next bit set as well: at least 3 bytes
+                    $charBytes++;
+                    if (ord($str[$prefixBytes]) & 0x10 ) {
+                        // and the following one: 4 bytes
+                        $charBytes++;
+                    }
+                }
+            }
+
+            if ($prefixBytes + $charBytes > strlen($str)) {
+                // wrong character
+                // (the sequence announced by the first byte is longer than the rest of the string)
+                break;
+            }
+
+            $prefixChars++;
+            $prefixBytes += $charBytes;
+        }
+
+        return substr($str, 0, $prefixBytes);
+    }
 }
 
index 7dcfcc8..95f7cfc 100644 (file)
@@ -15,7 +15,7 @@
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
@@ -26,7 +26,7 @@
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Index_TermInfo
index ef6c655..8e32f4e 100644 (file)
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 
 
-/** Zend_Search_Lucene_Index_SegmentWriter */
-require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
+/** Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php';
 
 /** Zend_Search_Lucene_Index_SegmentInfo */
-require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
+
+/** Zend_Search_Lucene_Index_SegmentMerger */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentMerger.php';
+
 
 
 /**
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
- * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Index_Writer
 {
     /**
-     * @todo Implement segment merger
-     * @todo Implement mergeFactor, minMergeDocs, maxMergeDocs usage.
      * @todo Implement Analyzer substitution
      * @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
      *       temporary index files
@@ -46,74 +48,92 @@ class Zend_Search_Lucene_Index_Writer
      */
 
     /**
-     * File system adapter.
+     * Number of documents required before the buffered in-memory
+     * documents are written into a new Segment
      *
-     * @var Zend_Search_Lucene_Storage_Directory
+     * Default value is 10
+     *
+     * @var integer
      */
-    private $_directory = null;
-
+    public $maxBufferedDocs = 10;
 
     /**
-     * Index version
-     * Counts how often the index has been changed by adding or deleting docs
+     * Largest number of documents ever merged by addDocument().
+     * Small values (e.g., less than 10,000) are best for interactive indexing,
+     * as this limits the length of pauses while indexing to a few seconds.
+     * Larger values are best for batched indexing and speedier searches.
+     *
+     * Default value is PHP_INT_MAX
      *
      * @var integer
      */
-    private $_version;
+    public $maxMergeDocs = PHP_INT_MAX;
 
     /**
-     * Segment name counter.
-     * Used to name new segments .
+     * Determines how often segment indices are merged by addDocument().
+     *
+     * With smaller values, less RAM is used while indexing,
+     * and searches on unoptimized indices are faster,
+     * but indexing speed is slower.
+     *
+     * With larger values, more RAM is used during indexing,
+     * and while searches on unoptimized indices are slower,
+     * indexing is faster.
+     *
+     * Thus larger values (> 10) are best for batch index creation,
+     * and smaller values (< 10) for indices that are interactively maintained.
+     *
+     * Default value is 10
      *
      * @var integer
      */
-    private $_segmentNameCounter;
+    public $mergeFactor = 10;
 
     /**
-     * Number of the segments in the index
+     * File system adapter.
      *
-     * @var inteher
+     * @var Zend_Search_Lucene_Storage_Directory
      */
-    private $_segments;
+    private $_directory = null;
+
 
     /**
-     * Determines how often segment indices
-     * are merged by addDocument().
+     * Changes counter.
      *
      * @var integer
      */
-    public $mergeFactor;
+    private $_versionUpdate = 0;
 
     /**
-     * Determines the minimal number of documents required before
-     * the buffered in-memory documents are merging and a new Segment
-     * is created.
+     * List of the segments, created by index writer
+     * Array of Zend_Search_Lucene_Index_SegmentInfo objects
      *
-     * @var integer
+     * @var array
      */
-    public $minMergeDocs;
+    private $_newSegments = array();
 
     /**
-     * Determines the largest number of documents ever merged by addDocument().
+     * List of segments to be deleted on commit
      *
-     * @var integer
+     * @var array
      */
-    public $maxMergeDocs;
+    private $_segmentsToDelete = array();
 
     /**
-     * List of the segments, created by index writer
-     * Array of Zend_Search_Lucene_Index_SegmentInfo objects
+     * Current segment to add documents
      *
-     * @var array
+     * @var Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter
      */
-    private $_newSegments;
+    private $_currentSegment = null;
 
     /**
-     * Current segment to add documents
+     * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
+     *
+     * It's a reference to the corresponding Zend_Search_Lucene::$_segmentInfos array
      *
-     * @var Zend_Search_Lucene_Index_SegmentWriter
+     * @var array Zend_Search_Lucene_Index_SegmentInfo
      */
-    private $_currentSegment;
+    private $_segmentInfos;
 
     /**
      * List of indexfiles extensions
@@ -131,7 +151,8 @@ class Zend_Search_Lucene_Index_Writer
                                              '.tvx' => '.tvx',
                                              '.tvd' => '.tvd',
                                              '.tvf' => '.tvf',
-                                             '.del' => '.del'  );
+                                             '.del' => '.del',
+                                             '.sti' => '.sti' );
 
     /**
      * Opens the index for writing
@@ -142,11 +163,13 @@ class Zend_Search_Lucene_Index_Writer
      * index or overwrite the existing one.
      *
      * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param array $segmentInfos
      * @param boolean $create
      */
-    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $create = false)
+    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, &$segmentInfos, $create = false)
     {
-        $this->_directory = $directory;
+        $this->_directory    = $directory;
+        $this->_segmentInfos = &$segmentInfos;
 
         if ($create) {
             foreach ($this->_directory->fileList() as $file) {
@@ -159,8 +182,13 @@ class Zend_Search_Lucene_Index_Writer
             }
             $segmentsFile = $this->_directory->createFile('segments');
             $segmentsFile->writeInt((int)0xFFFFFFFF);
-            // write version
-            $segmentsFile->writeLong(0);
+
+            // write version (it is initialized with the current time)
+            // $segmentsFile->writeLong((int)microtime(true));
+            $version = microtime(true);
+            $segmentsFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
+            $segmentsFile->writeInt((int)($version & 0xFFFFFFFF));
+
             // write name counter
             $segmentsFile->writeInt(0);
             // write segment counter
@@ -169,27 +197,13 @@ class Zend_Search_Lucene_Index_Writer
             $deletableFile = $this->_directory->createFile('deletable');
             // write counter
             $deletableFile->writeInt(0);
-
-            $this->_version            = 0;
-            $this->_segmentNameCounter = 0;
-            $this->_segments           = 0;
         } else {
             $segmentsFile = $this->_directory->getFileObject('segments');
             $format = $segmentsFile->readInt();
             if ($format != (int)0xFFFFFFFF) {
                 throw new Zend_Search_Lucene_Exception('Wrong segments file format');
             }
-
-            // read version
-            $this->_version            = $segmentsFile->readLong();
-            // read counter
-            $this->_segmentNameCounter = $segmentsFile->readInt();
-            // read segment counter
-            $this->_segments           = $segmentsFile->readInt();
         }
-
-        $this->_newSegments = array();
-        $this->_currentSegment = null;
     }
 
     /**
@@ -201,49 +215,218 @@ class Zend_Search_Lucene_Index_Writer
     {
         if ($this->_currentSegment === null) {
             $this->_currentSegment =
-                new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $this->_newSegmentName());
+                new Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter($this->_directory, $this->_newSegmentName());
         }
         $this->_currentSegment->addDocument($document);
-        $this->_version++;
+
+        if ($this->_currentSegment->count() >= $this->maxBufferedDocs) {
+            $this->commit();
+        }
+
+        $this->_versionUpdate++;
+
+        $this->_maybeMergeSegments();
+    }
+
+
+    /**
+     * Merge segments if necessary
+     *
+     * Segments are visited in ascending order of their document count and
+     * collected into a pool. The merge threshold starts at $maxBufferedDocs and
+     * is multiplied by $mergeFactor every time a segment reaches it; before the
+     * threshold is raised, the pool gathered so far is merged if its total
+     * document count has reached the threshold. Processing stops as soon as the
+     * threshold exceeds $maxMergeDocs.
+     *
+     * NOTE(review): the inner loop only ends once the threshold outgrows the
+     * segment size or $maxMergeDocs, so it looks like $mergeFactor <= 1 (or
+     * $maxBufferedDocs <= 0) can keep it spinning - confirm that these options
+     * are validated before they reach this method.
+     */
+    private function _maybeMergeSegments()
+    {
+        $segmentSizes = array();
+        foreach ($this->_segmentInfos as $segId => $segmentInfo) {
+            $segmentSizes[$segId] = $segmentInfo->count();
+        }
+
+        $mergePool   = array();
+        $poolSize    = 0;
+        $sizeToMerge = $this->maxBufferedDocs;
+        // asort() orders by size but keeps the segment ids as array keys
+        asort($segmentSizes, SORT_NUMERIC);
+        foreach ($segmentSizes as $segId => $size) {
+            // Check, if segment comes into a new merging block
+            while ($size >= $sizeToMerge) {
+                // Merge previous block if it's large enough
+                if ($poolSize >= $sizeToMerge) {
+                    $this->_mergeSegments($mergePool);
+                }
+                $mergePool   = array();
+                $poolSize    = 0;
+
+                $sizeToMerge *= $this->mergeFactor;
+
+                // Blocks above the configured limit are never merged here
+                if ($sizeToMerge > $this->maxMergeDocs) {
+                    return;
+                }
+            }
+
+            $mergePool[] = $this->_segmentInfos[$segId];
+            $poolSize += $size;
+        }
+
+        // Handle the pool collected for the last block
+        if ($poolSize >= $sizeToMerge) {
+            $this->_mergeSegments($mergePool);
+        }
     }
 
+    /**
+     * Merge specified segments
+     *
+     * $segments is an array of SegmentInfo objects
+     *
+     * All given segments are added as sources to a new segment merger and
+     * their names are scheduled for deletion. The merged segment (if merge()
+     * returns one) is registered as a new segment and the change is committed.
+     * The method returns without doing anything if the optimization lock
+     * can't be obtained.
+     *
+     * NOTE(review): unlock() is not reached if merge() or commit() throws -
+     * confirm that the lock file object releases the lock on destruction.
+     *
+     * @param array $segments
+     */
+    private function _mergeSegments($segments)
+    {
+        // Try to get exclusive non-blocking lock to the 'index.optimization.lock'
+        // Skip optimization if it's performed by other process right now
+        $optimizationLock = $this->_directory->createFile('index.optimization.lock');
+        if (!$optimizationLock->lock(LOCK_EX,true)) {
+            return;
+        }
+
+        $newName = $this->_newSegmentName();
+        $merger = new Zend_Search_Lucene_Index_SegmentMerger($this->_directory,
+                                                             $newName);
+        foreach ($segments as $segmentInfo) {
+            $merger->addSource($segmentInfo);
+            // The list is keyed by segment name, so a segment is scheduled only once
+            $this->_segmentsToDelete[$segmentInfo->getName()] = $segmentInfo->getName();
+        }
+
+        $newSegment = $merger->merge();
+        if ($newSegment !== null) {
+            $this->_newSegments[$newSegment->getName()] = $newSegment;
+        }
+
+        // Write the updated segments list (new segment added, merged ones scheduled for deletion)
+        $this->commit();
 
+        // optimization is finished
+        $optimizationLock->unlock();
+    }
 
     /**
      * Update segments file by adding current segment to a list
-     * @todo !!!!!Finish the implementation
      *
      * @throws Zend_Search_Lucene_Exception
      */
     private function _updateSegments()
     {
-        $segmentsFile   = $this->_directory->getFileObject('segments');
-        $newSegmentFile = $this->_directory->createFile('segments.new');
+        // Get an exclusive index lock
+        // Wait until all parallel searchers and indexers have stopped,
+        // and block any new searchers while we are updating the segments file
+        $lock = $this->_directory->getFileObject('index.lock');
+        if (!$lock->lock(LOCK_EX)) {
+            throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive index lock');
+        }
 
-        $newSegmentFile->writeInt((int)0xFFFFFFFF);
-        $newSegmentFile->writeLong($this->_version);
-        $newSegmentFile->writeInt($this->_segmentNameCounter);
 
-        $this->_segments += count($this->_newSegments);
-        $newSegmentFile->writeInt($this->_segments);
+        // Do not share file handlers to get file updates from other sessions.
+        $segmentsFile   = $this->_directory->getFileObject('segments', false);
+        $newSegmentFile = $this->_directory->createFile('segments.new', false);
 
-        $segmentsFile->seek(20);
-        $newSegmentFile->writeBytes($segmentsFile->readBytes($this->_directory->fileLength('segments') - 20));
+        // Write format marker
+        $newSegmentFile->writeInt((int)0xFFFFFFFF);
 
-        foreach ($this->_newSegments as $segmentName => $segmentInfo) {
-            $newSegmentFile->writeString($segmentName);
+        // Write index version
+        $segmentsFile->seek(4, SEEK_CUR);
+        // $version = $segmentsFile->readLong() + $this->_versionUpdate;
+        // Process version on 32-bit platforms
+        $versionHigh = $segmentsFile->readInt();
+        $versionLow  = $segmentsFile->readInt();
+        $version = $versionHigh * ((double)0xFFFFFFFF + 1) +
+                   (($versionLow < 0)? (double)0xFFFFFFFF - (-1 - $versionLow) : $versionLow);
+        $version += $this->_versionUpdate;
+        $this->_versionUpdate = 0;
+        $newSegmentFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
+        $newSegmentFile->writeInt((int)($version & 0xFFFFFFFF));
+
+        // Write segment name counter
+        $newSegmentFile->writeInt($segmentsFile->readInt());
+
+        // Get number of segments offset
+        $numOfSegmentsOffset = $newSegmentFile->tell();
+        // Write number of segments
+        $segmentsCount = $segmentsFile->readInt();
+        $newSegmentFile->writeInt(0);  // Write dummy data (segment counter)
+
+        $segments = array();
+        for ($count = 0; $count < $segmentsCount; $count++) {
+            $segName = $segmentsFile->readString();
+            $segSize = $segmentsFile->readInt();
+
+            if (!in_array($segName, $this->_segmentsToDelete)) {
+                $newSegmentFile->writeString($segName);
+                $newSegmentFile->writeInt($segSize);
+
+                $segments[$segName] = $segSize;
+            }
+        }
+        $segmentsFile->close();
+
+        $segmentsCount = count($segments) + count($this->_newSegments);
+
+        // Remove segments, not listed in $segments (deleted)
+        // Load segments, not listed in $this->_segmentInfos
+        foreach ($this->_segmentInfos as $segId => $segInfo) {
+            if (isset($segments[$segInfo->getName()])) {
+                // Segment is already included into $this->_segmentInfos
+                unset($segments[$segInfo->getName()]);
+            } else {
+                // remove deleted segment from a list
+                unset($this->_segmentInfos[$segId]);
+            }
+        }
+        // $segments contains a list of segments to load
+        // do it later
+
+        foreach ($this->_newSegments as $segName => $segmentInfo) {
+            $newSegmentFile->writeString($segName);
             $newSegmentFile->writeInt($segmentInfo->count());
+
+            $this->_segmentInfos[] = $segmentInfo;
         }
+        $this->_newSegments = array();
 
+        $newSegmentFile->seek($numOfSegmentsOffset);
+        $newSegmentFile->writeInt($segmentsCount);  // Update segments count
+        $newSegmentFile->close();
         $this->_directory->renameFile('segments.new', 'segments');
+
+
+        // Segments file update is finished
+        // Switch back to shared lock mode
+        $lock->lock(LOCK_SH);
+
+
+        $fileList = $this->_directory->fileList();
+        foreach ($this->_segmentsToDelete as $nameToDelete) {
+            foreach (self::$_indexExtensions as $ext) {
+                if ($this->_directory->fileExists($nameToDelete . $ext)) {
+                    $this->_directory->deleteFile($nameToDelete . $ext);
+                }
+            }
+
+            foreach ($fileList as $file) {
+                if (substr($file, 0, strlen($nameToDelete) + 2) == ($nameToDelete . '.f') &&
+                    ctype_digit( substr($file, strlen($nameToDelete) + 2) )) {
+                        $this->_directory->deleteFile($file);
+                    }
+            }
+        }
+        $this->_segmentsToDelete = array();
+
+        // Load segments, created by other process
+        foreach ($segments as $segName => $segSize) {
+            // Load new segments
+            $this->_segmentInfos[] = new Zend_Search_Lucene_Index_SegmentInfo($segName,
+                                                                              $segSize,
+                                                                              $this->_directory);
+        }
     }
 
 
     /**
      * Commit current changes
-     * returns array of new segments
-     *
-     * @return array
      */
     public function commit()
     {
@@ -255,14 +438,10 @@ class Zend_Search_Lucene_Index_Writer
             $this->_currentSegment = null;
         }
 
-        if (count($this->_newSegments) != 0) {
+        if (count($this->_newSegments)      != 0 ||
+            count($this->_segmentsToDelete) != 0) {
             $this->_updateSegments();
         }
-
-        $result = $this->_newSegments;
-        $this->_newSegments = array();
-
-        return $result;
     }
 
 
@@ -279,43 +458,16 @@ class Zend_Search_Lucene_Index_Writer
          */
     }
 
-
-    /**
-     * Returns the number of documents currently in this index.
-     *
-     * @return integer
-     */
-    public function docCount($readers)
-    {
-        /**
-         * @todo implementation
-         */
-    }
-
-
-    /**
-     * Flushes all changes to an index and closes all associated files.
-     *
-     */
-    public function close()
-    {
-        /**
-         * @todo implementation
-         */
-    }
-
-
     /**
      * Merges all segments together into a single segment, optimizing
      * an index for search.
+     * Input is an array of Zend_Search_Lucene_Index_SegmentInfo objects
      *
-     * return void
+     * @throws Zend_Search_Lucene_Exception
      */
     public function optimize()
     {
-        /**
-         * @todo implementation
-         */
+        $this->_mergeSegments($this->_segmentInfos);
     }
 
     /**
@@ -325,7 +477,30 @@ class Zend_Search_Lucene_Index_Writer
      */
     private function _newSegmentName()
     {
-        return '_' . base_convert($this->_segmentNameCounter++, 10, 36);
+        // Do not share file handler to get file updates from other sessions.
+        $segmentsFile = $this->_directory->getFileObject('segments', false);
+
+        // Get exclusive segments file lock
+        // We are guaranteed not to overlap with an _updateSegments() call
+        // of another process, because it needs the exclusive index lock and waits
+        // until all other searchers have stopped
+        if (!$segmentsFile->lock(LOCK_EX)) {
+            throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive index lock');
+        }
+
+        $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
+        $segmentNameCounter = $segmentsFile->readInt();
+
+        $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
+        $segmentsFile->writeInt($segmentNameCounter + 1);
+
+        // Flush output to guarantee that a wrong value will not be loaded between unlock and
+        // return (which calls $segmentsFile destructor)
+        $segmentsFile->flush();
+
+        $segmentsFile->unlock();
+
+        return '_' . base_convert($segmentNameCounter, 10, 36);
     }
 
 }
diff --git a/search/Zend/Search/Lucene/Interface.php b/search/Zend/Search/Lucene/Interface.php
new file mode 100644 (file)
index 0000000..58c75b6
--- /dev/null
@@ -0,0 +1,330 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+interface Zend_Search_Lucene_Interface
+{
+    /**
+     * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
+     *
+     * @return Zend_Search_Lucene_Storage_Directory
+     */
+    public function getDirectory();
+
+    /**
+     * Returns the total number of documents in this index (including deleted documents).
+     *
+     * @return integer
+     */
+    public function count();
+
+    /**
+     * Returns one greater than the largest possible document number.
+     * This may be used to, e.g., determine how big to allocate a structure which will have
+     * an element for every document number in an index.
+     *
+     * @return integer
+     */
+    public function maxDoc();
+
+    /**
+     * Returns the total number of non-deleted documents in this index.
+     *
+     * @return integer
+     */
+    public function numDocs();
+
+    /**
+     * Checks, that document is deleted
+     *
+     * @param integer $id
+     * @return boolean
+     * @throws Zend_Search_Lucene_Exception    Exception is thrown if $id is out of the range
+     */
+    public function isDeleted($id);
+
+    /**
+     * Set default search field.
+     *
+     * Null means, that search is performed through all fields by default
+     *
+     * Default value is null
+     *
+     * @param string $fieldName
+     */
+    public static function setDefaultSearchField($fieldName);
+
+    /**
+     * Get default search field.
+     *
+     * Null means, that search is performed through all fields by default
+     *
+     * @return string
+     */
+    public static function getDefaultSearchField();
+
+    /**
+     * Retrieve index maxBufferedDocs option
+     *
+     * maxBufferedDocs is a minimal number of documents required before
+     * the buffered in-memory documents are written into a new Segment
+     *
+     * Default value is 10
+     *
+     * @return integer
+     */
+    public function getMaxBufferedDocs();
+
+    /**
+     * Set index maxBufferedDocs option
+     *
+     * maxBufferedDocs is a minimal number of documents required before
+     * the buffered in-memory documents are written into a new Segment
+     *
+     * Default value is 10
+     *
+     * @param integer $maxBufferedDocs
+     */
+    public function setMaxBufferedDocs($maxBufferedDocs);
+
+    /**
+     * Retrieve index maxMergeDocs option
+     *
+     * maxMergeDocs is a largest number of documents ever merged by addDocument().
+     * Small values (e.g., less than 10,000) are best for interactive indexing,
+     * as this limits the length of pauses while indexing to a few seconds.
+     * Larger values are best for batched indexing and speedier searches.
+     *
+     * Default value is PHP_INT_MAX
+     *
+     * @return integer
+     */
+    public function getMaxMergeDocs();
+
+    /**
+     * Set index maxMergeDocs option
+     *
+     * maxMergeDocs is a largest number of documents ever merged by addDocument().
+     * Small values (e.g., less than 10,000) are best for interactive indexing,
+     * as this limits the length of pauses while indexing to a few seconds.
+     * Larger values are best for batched indexing and speedier searches.
+     *
+     * Default value is PHP_INT_MAX
+     *
+     * @param integer $maxMergeDocs
+     */
+    public function setMaxMergeDocs($maxMergeDocs);
+
+    /**
+     * Retrieve index mergeFactor option
+     *
+     * mergeFactor determines how often segment indices are merged by addDocument().
+     * With smaller values, less RAM is used while indexing,
+     * and searches on unoptimized indices are faster,
+     * but indexing speed is slower.
+     * With larger values, more RAM is used during indexing,
+     * and while searches on unoptimized indices are slower,
+     * indexing is faster.
+     * Thus larger values (> 10) are best for batch index creation,
+     * and smaller values (< 10) for indices that are interactively maintained.
+     *
+     * Default value is 10
+     *
+     * @return integer
+     */
+    public function getMergeFactor();
+
+    /**
+     * Set index mergeFactor option
+     *
+     * mergeFactor determines how often segment indices are merged by addDocument().
+     * With smaller values, less RAM is used while indexing,
+     * and searches on unoptimized indices are faster,
+     * but indexing speed is slower.
+     * With larger values, more RAM is used during indexing,
+     * and while searches on unoptimized indices are slower,
+     * indexing is faster.
+     * Thus larger values (> 10) are best for batch index creation,
+     * and smaller values (< 10) for indices that are interactively maintained.
+     *
+     * Default value is 10
+     *
+     * @param integer $mergeFactor
+     */
+    public function setMergeFactor($mergeFactor);
+
+    /**
+     * Performs a query against the index and returns an array
+     * of Zend_Search_Lucene_Search_QueryHit objects.
+     * Input is a string or Zend_Search_Lucene_Search_Query.
+     *
+     * @param mixed $query
+     * @return array Zend_Search_Lucene_Search_QueryHit
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public function find($query);
+
+    /**
+     * Returns a list of all unique field names that exist in this index.
+     *
+     * @param boolean $indexed
+     * @return array
+     */
+    public function getFieldNames($indexed = false);
+
+    /**
+     * Returns a Zend_Search_Lucene_Document object for the document
+     * number $id in this index.
+     *
+     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+     * @return Zend_Search_Lucene_Document
+     */
+    public function getDocument($id);
+
+    /**
+     * Returns true if the index contains documents with the specified term.
+     *
+     * Is used for query optimization.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return boolean
+     */
+    public function hasTerm(Zend_Search_Lucene_Index_Term $term);
+
+    /**
+     * Returns IDs of all the documents containing term.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return array
+     */
+    public function termDocs(Zend_Search_Lucene_Index_Term $term);
+
+    /**
+     * Returns an array of all term freqs.
+     * Return array structure: array( docId => freq, ...)
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return array
+     */
+    public function termFreqs(Zend_Search_Lucene_Index_Term $term);
+
+    /**
+     * Returns an array of all term positions in the documents.
+     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return array
+     */
+    public function termPositions(Zend_Search_Lucene_Index_Term $term);
+
+    /**
+     * Returns the number of documents in this index containing the $term.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return integer
+     */
+    public function docFreq(Zend_Search_Lucene_Index_Term $term);
+
+    /**
+     * Retrieve similarity used by index reader
+     *
+     * @return Zend_Search_Lucene_Search_Similarity
+     */
+    public function getSimilarity();
+
+    /**
+     * Returns a normalization factor for "field, document" pair.
+     *
+     * @param integer $id
+     * @param string $fieldName
+     * @return float
+     */
+    public function norm($id, $fieldName);
+
+    /**
+     * Returns true if any documents have been deleted from this index.
+     *
+     * @return boolean
+     */
+    public function hasDeletions();
+
+    /**
+     * Deletes a document from the index.
+     * $id is an internal document id
+     *
+     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public function delete($id);
+
+    /**
+     * Adds a document to this index.
+     *
+     * @param Zend_Search_Lucene_Document $document
+     */
+    public function addDocument(Zend_Search_Lucene_Document $document);
+
+    /**
+     * Commit changes resulting from delete() or undeleteAll() operations.
+     */
+    public function commit();
+
+    /**
+     * Optimize index.
+     *
+     * Merges all segments into one
+     */
+    public function optimize();
+
+    /**
+     * Returns an array of all terms in this index.
+     *
+     * @return array
+     */
+    public function terms();
+
+    /**
+     * Undeletes all documents currently marked as deleted in this index.
+     */
+    public function undeleteAll();
+
+
+    /**
+     * Add reference to the index object
+     *
+     * @internal
+     */
+    public function addReference();
+
+    /**
+     * Remove reference from the index object
+     *
+     * When reference count becomes zero, index is closed and resources are cleaned up
+     *
+     * @internal
+     */
+    public function removeReference();
+}
diff --git a/search/Zend/Search/Lucene/PriorityQueue.php b/search/Zend/Search/Lucene/PriorityQueue.php
new file mode 100644 (file)
index 0000000..4e844b1
--- /dev/null
@@ -0,0 +1,170 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+
+/**
+ * Abstract Priority Queue
+ *
+ * Array-backed binary heap. The ordering is supplied by the subclass
+ * through _less(); the least element is always kept at the top.
+ * Please go to "Data Structures and Algorithms",
+ * Aho, Hopcroft, and Ullman, Addison-Wesley, 1983 (corrected 1987 edition),
+ * for implementation details.
+ *
+ * put() and pop() take O(log(N)) time, where N is the size of the queue
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+abstract class Zend_Search_Lucene_PriorityQueue
+{
+    /**
+     * Heap storage
+     *
+     * Balanced, partially ordered binary tree flattened into an array:
+     * [0]       - top of the tree
+     * [2*n + 1] - first child of [n]
+     * [2*n + 2] - second child of [n]
+     *
+     * Keys always form the contiguous range 0..N-1.
+     *
+     * @var array
+     */
+    private $_heap = array();
+
+
+    /**
+     * Insert an element into the queue
+     *
+     * O(log(N)) time
+     *
+     * @param mixed $element
+     */
+    public function put($element)
+    {
+        // Open a hole at the end of the heap and bubble it towards the top
+        $hole = count($this->_heap);
+
+        while ($hole > 0) {
+            $above = ($hole - 1) >> 1;   // index of the hole's parent
+
+            if (!$this->_less($element, $this->_heap[$above])) {
+                break;
+            }
+
+            // New element sorts before its parent: pull the parent down
+            $this->_heap[$hole] = $this->_heap[$above];
+            $hole = $above;
+        }
+
+        // The hole is now where the new element belongs
+        $this->_heap[$hole] = $element;
+    }
+
+
+    /**
+     * Peek at the least element without removing it
+     *
+     * Constant time
+     *
+     * @return mixed  least element, or null if the queue is empty
+     */
+    public function top()
+    {
+        return empty($this->_heap) ? null : $this->_heap[0];
+    }
+
+
+    /**
+     * Remove and return the least element of the queue
+     *
+     * O(log(N)) time
+     *
+     * @return mixed  least element, or null if the queue is empty
+     */
+    public function pop()
+    {
+        if (empty($this->_heap)) {
+            return null;
+        }
+
+        $least  = $this->_heap[0];
+        $tailId = count($this->_heap) - 1;
+        $tail   = $this->_heap[$tailId];
+
+        // Sink a hole from the top until the tail element fits into it.
+        // Slot $tailId is about to be dropped, so it never counts as a child.
+        $hole = 0;
+        while (true) {
+            $pick = ($hole << 1) + 1;   // first child of the hole
+
+            // Prefer the second child when it is the smaller of the two
+            if ($pick + 1 < $tailId  &&
+                $this->_less($this->_heap[$pick + 1], $this->_heap[$pick])
+               ) {
+                $pick++;
+            }
+
+            if ($pick >= $tailId  ||  !$this->_less($this->_heap[$pick], $tail)) {
+                break;
+            }
+
+            // Smaller child moves up, the hole moves down
+            $this->_heap[$hole] = $this->_heap[$pick];
+            $hole = $pick;
+        }
+
+        // Drop the tail element into the hole and shrink the heap by one
+        $this->_heap[$hole] = $tail;
+        unset($this->_heap[$tailId]);
+
+        return $least;
+    }
+
+
+    /**
+     * Remove all elements from the queue
+     */
+    public function clear()
+    {
+        $this->_heap = array();
+    }
+
+
+    /**
+     * Compare elements
+     *
+     * Returns true if $el1 is less than $el2; false otherwise
+     *
+     * @param mixed $el1
+     * @param mixed $el2
+     * @return boolean
+     */
+    abstract protected function _less($el1, $el2);
+}
+
diff --git a/search/Zend/Search/Lucene/Proxy.php b/search/Zend/Search/Lucene/Proxy.php
new file mode 100644 (file)
index 0000000..53fb150
--- /dev/null
@@ -0,0 +1,468 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+
+/** Zend_Search_Lucene_Interface */
+require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Interface.php';
+
+
+/**
+ * Proxy class intended to be used in userland.
+ *
+ * It tracks when the index object goes out of scope and forces index closing:
+ * the constructor adds a reference to the wrapped index and the destructor
+ * removes it again. Every other method simply delegates to the wrapped index.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Proxy implements Zend_Search_Lucene_Interface
+{
+    /**
+     * Index object
+     *
+     * @var Zend_Search_Lucene_Interface
+     */
+    private $_index;
+
+    /**
+     * Object constructor
+     *
+     * Stores the index and registers one reference on it via addReference().
+     *
+     * @param Zend_Search_Lucene_Interface $index
+     */
+    public function __construct(Zend_Search_Lucene_Interface $index)
+    {
+        $this->_index = $index;
+        $this->_index->addReference();
+    }
+
+    /**
+     * Object destructor
+     *
+     * Releases the reference registered by the constructor and drops the
+     * link to the wrapped index.
+     */
+    public function __destruct()
+    {
+        if ($this->_index !== null) {
+            // $_index is still null if the constructor never assigned it; otherwise release its reference
+            $this->_index->removeReference();
+        }
+        $this->_index = null;
+    }
+
+    /**
+     * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
+     *
+     * @return Zend_Search_Lucene_Storage_Directory
+     */
+    public function getDirectory()
+    {
+        return $this->_index->getDirectory();
+    }
+
+    /**
+     * Returns the total number of documents in this index (including deleted documents).
+     *
+     * @return integer
+     */
+    public function count()
+    {
+        return $this->_index->count();
+    }
+
+    /**
+     * Returns one greater than the largest possible document number.
+     * This may be used to, e.g., determine how big to allocate a structure which will have
+     * an element for every document number in an index.
+     *
+     * @return integer
+     */
+    public function maxDoc()
+    {
+        return $this->_index->maxDoc();
+    }
+
+    /**
+     * Returns the total number of non-deleted documents in this index.
+     *
+     * @return integer
+     */
+    public function numDocs()
+    {
+        return $this->_index->numDocs();
+    }
+
+    /**
+     * Checks whether the document is deleted
+     *
+     * @param integer $id
+     * @return boolean
+     * @throws Zend_Search_Lucene_Exception    Exception is thrown if $id is out of the range
+     */
+    public function isDeleted($id)
+    {
+        return $this->_index->isDeleted($id);
+    }
+
+    /**
+     * Set default search field.
+     *
+     * Null means that search is performed through all fields by default
+     *
+     * Default value is null
+     *
+     * Static: forwards to Zend_Search_Lucene::setDefaultSearchField() and does
+     * not touch the wrapped index instance.
+     *
+     * @param string $fieldName
+     */
+    public static function setDefaultSearchField($fieldName)
+    {
+        Zend_Search_Lucene::setDefaultSearchField($fieldName);
+    }
+
+    /**
+     * Get default search field.
+     *
+     * Null means that search is performed through all fields by default
+     *
+     * Static: forwards to Zend_Search_Lucene::getDefaultSearchField().
+     *
+     * @return string
+     */
+    public static function getDefaultSearchField()
+    {
+        return Zend_Search_Lucene::getDefaultSearchField();
+    }
+
+    /**
+     * Retrieve index maxBufferedDocs option
+     *
+     * maxBufferedDocs is the minimal number of documents required before
+     * the buffered in-memory documents are written into a new Segment
+     *
+     * Default value is 10
+     *
+     * @return integer
+     */
+    public function getMaxBufferedDocs()
+    {
+        return $this->_index->getMaxBufferedDocs();
+    }
+
+    /**
+     * Set index maxBufferedDocs option
+     *
+     * maxBufferedDocs is the minimal number of documents required before
+     * the buffered in-memory documents are written into a new Segment
+     *
+     * Default value is 10
+     *
+     * @param integer $maxBufferedDocs
+     */
+    public function setMaxBufferedDocs($maxBufferedDocs)
+    {
+        $this->_index->setMaxBufferedDocs($maxBufferedDocs);
+    }
+
+
+    /**
+     * Retrieve index maxMergeDocs option
+     *
+     * maxMergeDocs is the largest number of documents ever merged by addDocument().
+     * Small values (e.g., less than 10,000) are best for interactive indexing,
+     * as this limits the length of pauses while indexing to a few seconds.
+     * Larger values are best for batched indexing and speedier searches.
+     *
+     * Default value is PHP_INT_MAX
+     *
+     * @return integer
+     */
+    public function getMaxMergeDocs()
+    {
+        return $this->_index->getMaxMergeDocs();
+    }
+
+    /**
+     * Set index maxMergeDocs option
+     *
+     * maxMergeDocs is the largest number of documents ever merged by addDocument().
+     * Small values (e.g., less than 10,000) are best for interactive indexing,
+     * as this limits the length of pauses while indexing to a few seconds.
+     * Larger values are best for batched indexing and speedier searches.
+     *
+     * Default value is PHP_INT_MAX
+     *
+     * @param integer $maxMergeDocs
+     */
+    public function setMaxMergeDocs($maxMergeDocs)
+    {
+        $this->_index->setMaxMergeDocs($maxMergeDocs);
+    }
+
+
+    /**
+     * Retrieve index mergeFactor option
+     *
+     * mergeFactor determines how often segment indices are merged by addDocument().
+     * With smaller values, less RAM is used while indexing,
+     * and searches on unoptimized indices are faster,
+     * but indexing speed is slower.
+     * With larger values, more RAM is used during indexing,
+     * and while searches on unoptimized indices are slower,
+     * indexing is faster.
+     * Thus larger values (> 10) are best for batch index creation,
+     * and smaller values (< 10) for indices that are interactively maintained.
+     *
+     * Default value is 10
+     *
+     * @return integer
+     */
+    public function getMergeFactor()
+    {
+        return $this->_index->getMergeFactor();
+    }
+
+    /**
+     * Set index mergeFactor option
+     *
+     * mergeFactor determines how often segment indices are merged by addDocument().
+     * With smaller values, less RAM is used while indexing,
+     * and searches on unoptimized indices are faster,
+     * but indexing speed is slower.
+     * With larger values, more RAM is used during indexing,
+     * and while searches on unoptimized indices are slower,
+     * indexing is faster.
+     * Thus larger values (> 10) are best for batch index creation,
+     * and smaller values (< 10) for indices that are interactively maintained.
+     *
+     * Default value is 10
+     *
+     * @param integer $mergeFactor
+     */
+    public function setMergeFactor($mergeFactor)
+    {
+        $this->_index->setMergeFactor($mergeFactor);
+    }
+
+    /**
+     * Performs a query against the index and returns an array
+     * of Zend_Search_Lucene_Search_QueryHit objects.
+     * Input is a string or Zend_Search_Lucene_Search_Query.
+     *
+     * Every argument the caller passes (not only $query) is forwarded
+     * unchanged to the wrapped index's find() method.
+     *
+     * @param mixed $query
+     * @return array Zend_Search_Lucene_Search_QueryHit
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public function find($query)
+    {
+        // capture the full actual argument list, including any extras beyond $query
+        $parameters = func_get_args();
+
+        // invoke $this->_index->find() with exactly those arguments
+        return call_user_func_array(array(&$this->_index, 'find'), $parameters);
+    }
+
+    /**
+     * Returns a list of all unique field names that exist in this index.
+     *
+     * @param boolean $indexed
+     * @return array
+     */
+    public function getFieldNames($indexed = false)
+    {
+        return $this->_index->getFieldNames($indexed);
+    }
+
+    /**
+     * Returns a Zend_Search_Lucene_Document object for the document
+     * number $id in this index.
+     *
+     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+     * @return Zend_Search_Lucene_Document
+     */
+    public function getDocument($id)
+    {
+        return $this->_index->getDocument($id);
+    }
+
+    /**
+     * Returns true if the index contains documents with the specified term.
+     *
+     * Is used for query optimization.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return boolean
+     */
+    public function hasTerm(Zend_Search_Lucene_Index_Term $term)
+    {
+        return $this->_index->hasTerm($term);
+    }
+
+    /**
+     * Returns IDs of all the documents containing term.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return array
+     */
+    public function termDocs(Zend_Search_Lucene_Index_Term $term)
+    {
+        return $this->_index->termDocs($term);
+    }
+
+    /**
+     * Returns an array of all term freqs.
+     * Return array structure: array( docId => freq, ...)
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return array
+     */
+    public function termFreqs(Zend_Search_Lucene_Index_Term $term)
+    {
+        return $this->_index->termFreqs($term);
+    }
+
+    /**
+     * Returns an array of all term positions in the documents.
+     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return array
+     */
+    public function termPositions(Zend_Search_Lucene_Index_Term $term)
+    {
+        return $this->_index->termPositions($term);
+    }
+
+    /**
+     * Returns the number of documents in this index containing the $term.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return integer
+     */
+    public function docFreq(Zend_Search_Lucene_Index_Term $term)
+    {
+        return $this->_index->docFreq($term);
+    }
+
+    /**
+     * Retrieve similarity used by index reader
+     *
+     * @return Zend_Search_Lucene_Search_Similarity
+     */
+    public function getSimilarity()
+    {
+        return $this->_index->getSimilarity();
+    }
+
+    /**
+     * Returns a normalization factor for "field, document" pair.
+     *
+     * @param integer $id
+     * @param string $fieldName
+     * @return float
+     */
+    public function norm($id, $fieldName)
+    {
+        return $this->_index->norm($id, $fieldName);
+    }
+
+    /**
+     * Returns true if any documents have been deleted from this index.
+     *
+     * @return boolean
+     */
+    public function hasDeletions()
+    {
+        return $this->_index->hasDeletions();
+    }
+
+    /**
+     * Deletes a document from the index.
+     * $id is an internal document id
+     *
+     * Passes through whatever the wrapped index's delete() returns.
+     *
+     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public function delete($id)
+    {
+        return $this->_index->delete($id);
+    }
+
+    /**
+     * Adds a document to this index.
+     *
+     * @param Zend_Search_Lucene_Document $document
+     */
+    public function addDocument(Zend_Search_Lucene_Document $document)
+    {
+        $this->_index->addDocument($document);
+    }
+
+    /**
+     * Commit changes resulting from delete() or undeleteAll() operations.
+     */
+    public function commit()
+    {
+        $this->_index->commit();
+    }
+
+    /**
+     * Optimize index.
+     *
+     * Merges all segments into one
+     */
+    public function optimize()
+    {
+        $this->_index->optimize();
+    }
+
+    /**
+     * Returns an array of all terms in this index.
+     *
+     * @return array
+     */
+    public function terms()
+    {
+        return $this->_index->terms();
+    }
+
+    /**
+     * Undeletes all documents currently marked as deleted in this index.
+     *
+     * Passes through whatever the wrapped index's undeleteAll() returns.
+     */
+    public function undeleteAll()
+    {
+        return $this->_index->undeleteAll();
+    }
+
+    /**
+     * Add reference to the index object
+     *
+     * Forwards to the wrapped index, so its reference count is the one affected.
+     *
+     * @internal
+     */
+    public function addReference()
+    {
+        return $this->_index->addReference();
+    }
+
+    /**
+     * Remove reference from the index object
+     *
+     * When reference count becomes zero, index is closed and resources are cleaned up
+     *
+     * Forwards to the wrapped index, so its reference count is the one affected.
+     *
+     * @internal
+     */
+    public function removeReference()
+    {
+        return $this->_index->removeReference();
+    }
+}
index 06f7b48..799a19e 100644 (file)
@@ -1,14 +1,7 @@
 @todo
 
-- Improve API: fix ZSearchMultiTermQuery($terms, $signs);
-
-- Analysis and indexing engine
-
-- Additional queries: phrase, wildcard, proximity, and range
+- Additional queries: wildcard, proximity, and range
 
 - Better class-level docblocks (most functions okay)
 
-- Some Windows issues(?) during indexing
-
-- Finish renaming classes to PEAR-like conventions