Initial commit
[moodle.git] / search / Zend / Search / Lucene / Index / SegmentWriter.php
1 <?php
2 /**
3  * Zend Framework
4  *
5  * LICENSE
6  *
7  * This source file is subject to the new BSD license that is bundled
8  * with this package in the file LICENSE.txt.
9  * It is also available through the world-wide-web at this URL:
10  * http://framework.zend.com/license/new-bsd
11  * If you did not receive a copy of the license and are unable to
12  * obtain it through the world-wide-web, please send an email
13  * to license@zend.com so we can send you a copy immediately.
14  *
15  * @category   Zend
16  * @package    Zend_Search_Lucene
17  * @subpackage Index
18  * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
20  */
23 /** Zend_Search_Lucene_Exception */
24 require_once 'Zend/Search/Lucene/Exception.php';
26 /** Zend_Search_Lucene_Analysis_Analyzer */
27 require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
29 /** Zend_Search_Lucene_Index_SegmentInfo */
30 require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
33 /**
34  * @category   Zend
35  * @package    Zend_Search_Lucene
36  * @subpackage Index
37  * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
38  * @license    http://framework.zend.com/license/new-bsd     New BSD License
39  */
40 class Zend_Search_Lucene_Index_SegmentWriter
41 {
42     /**
43      * Expert: The fraction of terms in the "dictionary" which should be stored
44      * in RAM.  Smaller values use more memory, but make searching slightly
45      * faster, while larger values use less memory and make searching slightly
46      * slower.  Searching is typically not dominated by dictionary lookup, so
47      * tweaking this is rarely useful.
48      *
49      * @var integer
50      */
51     static public $indexInterval = 128;
53     /** Expert: The fraction of TermDocs entries stored in skip tables.
54      * Larger values result in smaller indexes, greater acceleration, but fewer
55      * accelerable cases, while smaller values result in bigger indexes,
56      * less acceleration and more
57      * accelerable cases. More detailed experiments would be useful here.
58      *
     * 0x7FFFFFFF indicates that we don't use skip data; this is the value
     * assigned below. Lucene's conventional default is 16.
61      *
62      * @var integer
63      */
64     static public $skipInterval = 0x7FFFFFFF;
66     /**
67      * Number of docs in a segment
68      *
69      * @var integer
70      */
71     private $_docCount;
73     /**
74      * Segment name
75      *
76      * @var string
77      */
78     private $_name;
80     /**
81      * File system adapter.
82      *
83      * @var Zend_Search_Lucene_Storage_Directory
84      */
85     private $_directory;
87     /**
88      * List of the index files.
89      * Used for automatic compound file generation
90      *
     * @var array
92      */
93     private $_files;
95     /**
96      * Term Dictionary
     * Array of the Zend_Search_Lucene_Index_Term objects, keyed by the term key.
     * The documents and positions for each term are stored in $_termDocs under the same key.
99      *
100      * @var array
101      */
102     private $_termDictionary;
104     /**
105      * Documents, which contain the term
106      *
107      * @var array
108      */
109     private $_termDocs;
111     /**
112      * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
113      *
114      * @var array
115      */
116     private $_fields;
118     /**
119      * Sizes of the indexed fields.
120      * Used for normalization factors calculation.
121      *
122      * @var array
123      */
124     private $_fieldLengths;
126     /**
127      * '.fdx'  file - Stored Fields, the field index.
128      *
129      * @var Zend_Search_Lucene_Storage_File
130      */
131     private $_fdxFile;
133     /**
134      * '.fdt'  file - Stored Fields, the field data.
135      *
136      * @var Zend_Search_Lucene_Storage_File
137      */
138     private $_fdtFile;
141     /**
142      * Object constructor.
143      *
144      * @param Zend_Search_Lucene_Storage_Directory $directory
145      * @param string $name
146      */
147     public function __construct($directory, $name)
148     {
149         $this->_directory = $directory;
150         $this->_name      = $name;
151         $this->_docCount  = 0;
153         $this->_fields         = array();
154         $this->_termDocs       = array();
155         $this->_files          = array();
156         $this->_norms          = array();
157         $this->_fieldLengths   = array();
158         $this->_termDictionary = array();
160         $this->_fdxFile = null;
161         $this->_fdtFile = null;
162     }
165     /**
166      * Add field to the segment
167      *
168      * @param Zend_Search_Lucene_Field $field
169      */
170     private function _addFieldInfo(Zend_Search_Lucene_Field $field)
171     {
172         if (!isset($this->_fields[$field->name])) {
173             $this->_fields[$field->name] =
174                                 new Zend_Search_Lucene_Index_FieldInfo($field->name,
175                                                                        $field->isIndexed,
176                                                                        count($this->_fields),
177                                                                        $field->storeTermVector);
178         } else {
179             $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
180             $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
181         }
182     }
185     /**
186      * Adds a document to this segment.
187      *
188      * @param Zend_Search_Lucene_Document $document
189      * @throws Zend_Search_Lucene_Exception
190      */
191     public function addDocument(Zend_Search_Lucene_Document $document)
192     {
193         $storedFields = array();
195         foreach ($document->getFieldNames() as $fieldName) {
196             $field = $document->getField($fieldName);
197             $this->_addFieldInfo($field);
199             if ($field->storeTermVector) {
200                 /**
201                  * @todo term vector storing support
202                  */
203                 throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
204             }
206             if ($field->isIndexed) {
207                 if ($field->isTokenized) {
208                     $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
209                 } else {
210                     $tokenList = array();
211                     $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
212                 }
213                 $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList);
215                 $position = 0;
216                 foreach ($tokenList as $token) {
217                     $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
218                     $termKey = $term->key();
220                     if (!isset($this->_termDictionary[$termKey])) {
221                         // New term
222                         $this->_termDictionary[$termKey] = $term;
223                         $this->_termDocs[$termKey] = array();
224                         $this->_termDocs[$termKey][$this->_docCount] = array();
225                     } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
226                         // Existing term, but new term entry
227                         $this->_termDocs[$termKey][$this->_docCount] = array();
228                     }
229                     $position += $token->getPositionIncrement();
230                     $this->_termDocs[$termKey][$this->_docCount][] = $position;
231                 }
232             }
234             if ($field->isStored) {
235                 $storedFields[] = $field;
236             }
237         }
239         if (count($storedFields) != 0) {
240             if (!isset($this->_fdxFile)) {
241                 $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
242                 $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
244                 $this->_files[] = $this->_name . '.fdx';
245                 $this->_files[] = $this->_name . '.fdt';
246             }
248             $this->_fdxFile->writeLong($this->_fdtFile->tell());
249             $this->_fdtFile->writeVInt(count($storedFields));
250             foreach ($storedFields as $field) {
251                 $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
252                 $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
253                              ($field->isBinary ?    0x02 : 0x00) |
254                              0x00; /* 0x04 - third bit, compressed (ZLIB) */
255                 $this->_fdtFile->writeByte($fieldBits);
256                 if ($field->isBinary) {
257                     $this->_fdtFile->writeVInt(strlen($field->stringValue));
258                     $this->_fdtFile->writeBytes($field->stringValue);
259                 } else {
260                     $this->_fdtFile->writeString($field->stringValue);
261                 }
262             }
263         }
265         $this->_docCount++;
266     }
269     /**
270      * Dump Field Info (.fnm) segment file
271      */
272     private function _dumpFNM()
273     {
274         $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
275         $fnmFile->writeVInt(count($this->_fields));
277         foreach ($this->_fields as $field) {
278             $fnmFile->writeString($field->name);
279             $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
280                                 ($field->storeTermVector ? 0x02 : 0x00)
281 // not supported yet            0x04 /* term positions are stored with the term vectors */ |
282 // not supported yet            0x08 /* term offsets are stored with the term vectors */   |
283                                );
285             if ($field->isIndexed) {
286                 $fieldNum   = $this->_fields[$field->name]->number;
287                 $fieldName  = $field->name;
288                 $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();
289                 $norm       = '';
291                 for ($count = 0; $count < $this->_docCount; $count++) {
292                     $numTokens = isset($this->_fieldLengths[$fieldName][$count]) ?
293                                       $this->_fieldLengths[$fieldName][$count] : 0;
294                     $norm .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, $numTokens)));
295                 }
297                 $normFileName = $this->_name . '.f' . $fieldNum;
298                 $fFile = $this->_directory->createFile($normFileName);
299                 $fFile->writeBytes($norm);
300                 $this->_files[] = $normFileName;
301             }
302         }
304         $this->_files[] = $this->_name . '.fnm';
305     }
308     /**
309      * Dump Term Dictionary segment file entry.
310      * Used to write entry to .tis or .tii files
311      *
312      * @param Zend_Search_Lucene_Storage_File $dicFile
313      * @param Zend_Search_Lucene_Index_Term $prevTerm
314      * @param Zend_Search_Lucene_Index_Term $term
315      * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
316      * @param Zend_Search_Lucene_Index_TermInfo $termInfo
317      */
318     private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
319                                         &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
320                                         &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
321     {
322         if (isset($prevTerm) && $prevTerm->field == $term->field) {
323             $prefixLength = 0;
324             while ($prefixLength < strlen($prevTerm->text) &&
325                    $prefixLength < strlen($term->text) &&
326                    $prevTerm->text{$prefixLength} == $term->text{$prefixLength}
327                   ) {
328                 $prefixLength++;
329             }
330             // Write preffix length
331             $dicFile->writeVInt($prefixLength);
332             // Write suffix
333             $dicFile->writeString( substr($term->text, $prefixLength) );
334         } else {
335             // Write preffix length
336             $dicFile->writeVInt(0);
337             // Write suffix
338             $dicFile->writeString($term->text);
339         }
340         // Write field number
341         $dicFile->writeVInt($term->field);
342         // DocFreq (the count of documents which contain the term)
343         $dicFile->writeVInt($termInfo->docFreq);
345         $prevTerm = $term;
347         if (!isset($prevTermInfo)) {
348             // Write FreqDelta
349             $dicFile->writeVInt($termInfo->freqPointer);
350             // Write ProxDelta
351             $dicFile->writeVInt($termInfo->proxPointer);
352         } else {
353             // Write FreqDelta
354             $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
355             // Write ProxDelta
356             $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
357         }
358         // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval
359         if ($termInfo->skipOffset != 0) {
360             $dicFile->writeVInt($termInfo->skipOffset);
361         }
363         $prevTermInfo = $termInfo;
364     }
366     /**
367      * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
368      */
369     private function _dumpDictionary()
370     {
371         $termKeys = array_keys($this->_termDictionary);
372         sort($termKeys, SORT_STRING);
374         $tisFile = $this->_directory->createFile($this->_name . '.tis');
375         $tisFile->writeInt((int)0xFFFFFFFE);
376         $tisFile->writeLong(count($termKeys));
377         $tisFile->writeInt(self::$indexInterval);
378         $tisFile->writeInt(self::$skipInterval);
380         $tiiFile = $this->_directory->createFile($this->_name . '.tii');
381         $tiiFile->writeInt((int)0xFFFFFFFE);
382         $tiiFile->writeLong(ceil((count($termKeys) + 2)/self::$indexInterval));
383         $tiiFile->writeInt(self::$indexInterval);
384         $tiiFile->writeInt(self::$skipInterval);
386         /** Dump dictionary header */
387         $tiiFile->writeVInt(0);                    // preffix length
388         $tiiFile->writeString('');                 // suffix
389         $tiiFile->writeInt((int)0xFFFFFFFF);       // field number
390         $tiiFile->writeByte((int)0x0F);
391         $tiiFile->writeVInt(0);                    // DocFreq
392         $tiiFile->writeVInt(0);                    // FreqDelta
393         $tiiFile->writeVInt(0);                    // ProxDelta
394         $tiiFile->writeVInt(20);                   // IndexDelta
396         $frqFile = $this->_directory->createFile($this->_name . '.frq');
397         $prxFile = $this->_directory->createFile($this->_name . '.prx');
399         $termCount = 1;
401         $prevTerm     = null;
402         $prevTermInfo = null;
403         $prevIndexTerm     = null;
404         $prevIndexTermInfo = null;
405         $prevIndexPosition = 20;
407         foreach ($termKeys as $termId) {
408             $freqPointer = $frqFile->tell();
409             $proxPointer = $prxFile->tell();
411             $prevDoc = 0;
412             foreach ($this->_termDocs[$termId] as $docId => $termPositions) {
413                 $docDelta = ($docId - $prevDoc)*2;
414                 $prevDoc = $docId;
415                 if (count($termPositions) > 1) {
416                     $frqFile->writeVInt($docDelta);
417                     $frqFile->writeVInt(count($termPositions));
418                 } else {
419                     $frqFile->writeVInt($docDelta + 1);
420                 }
422                 $prevPosition = 0;
423                 foreach ($termPositions as $position) {
424                     $prxFile->writeVInt($position - $prevPosition);
425                     $prevPosition = $position;
426                 }
427             }
429             if (count($this->_termDocs[$termId]) >= self::$skipInterval) {
430                 /**
431                  * @todo Write Skip Data to a freq file.
432                  * It's not used now, but make index more optimal
433                  */
434                 $skipOffset = $frqFile->tell() - $freqPointer;
435             } else {
436                 $skipOffset = 0;
437             }
439             $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text,
440                                                       $this->_fields[$this->_termDictionary[$termId]->field]->number);
441             $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]),
442                                             $freqPointer, $proxPointer, $skipOffset);
444             $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo);
446             if ($termCount % self::$indexInterval == 0) {
447                 $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo);
449                 $indexPosition = $tisFile->tell();
450                 $tiiFile->writeVInt($indexPosition - $prevIndexPosition);
451                 $prevIndexPosition = $indexPosition;
452             }
453             $termCount++;
454         }
456         $this->_files[] = $this->_name . '.tis';
457         $this->_files[] = $this->_name . '.tii';
458         $this->_files[] = $this->_name . '.frq';
459         $this->_files[] = $this->_name . '.prx';
460     }
463     /**
464      * Generate compound index file
465      */
466     private function _generateCFS()
467     {
468         $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
469         $cfsFile->writeVInt(count($this->_files));
471         $dataOffsetPointers = array();
472         foreach ($this->_files as $fileName) {
473             $dataOffsetPointers[$fileName] = $cfsFile->tell();
474             $cfsFile->writeLong(0); // write dummy data
475             $cfsFile->writeString($fileName);
476         }
478         foreach ($this->_files as $fileName) {
479             // Get actual data offset
480             $dataOffset = $cfsFile->tell();
481             // Seek to the data offset pointer
482             $cfsFile->seek($dataOffsetPointers[$fileName]);
483             // Write actual data offset value
484             $cfsFile->writeLong($dataOffset);
485             // Seek back to the end of file
486             $cfsFile->seek($dataOffset);
488             $dataFile = $this->_directory->getFileObject($fileName);
489             $data = $dataFile->readBytes($this->_directory->fileLength($fileName));
490             $cfsFile->writeBytes($data);
492             $this->_directory->deleteFile($fileName);
493         }
494     }
497     /**
498      * Close segment, write it to disk and return segment info
499      *
500      * @return Zend_Search_Lucene_Index_SegmentInfo
501      */
502     public function close()
503     {
504         if ($this->_docCount == 0) {
505             return null;
506         }
508         $this->_dumpFNM();
509         $this->_dumpDictionary();
511         $this->_generateCFS();
513         return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
514                                                         $this->_docCount,
515                                                         $this->_directory);
516     }