Initial commit
[moodle.git] / search / Zend / Search / Lucene / Index / SegmentInfo.php
1 <?php
2 /**
3  * Zend Framework
4  *
5  * LICENSE
6  *
7  * This source file is subject to the new BSD license that is bundled
8  * with this package in the file LICENSE.txt.
9  * It is also available through the world-wide-web at this URL:
10  * http://framework.zend.com/license/new-bsd
11  * If you did not receive a copy of the license and are unable to
12  * obtain it through the world-wide-web, please send an email
13  * to license@zend.com so we can send you a copy immediately.
14  *
15  * @category   Zend
16  * @package    Zend_Search_Lucene
17  * @subpackage Index
18  * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
20  */
23 /** Zend_Search_Lucene_Exception */
24 require_once 'Zend/Search/Lucene/Exception.php';
27 /**
28  * @category   Zend
29  * @package    Zend_Search_Lucene
30  * @subpackage Index
31  * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
32  * @license    http://framework.zend.com/license/new-bsd     New BSD License
33  */
34 class Zend_Search_Lucene_Index_SegmentInfo
35 {
36     /**
37      * Number of docs in a segment
38      *
39      * @var integer
40      */
41     private $_docCount;
43     /**
44      * Segment name
45      *
46      * @var string
47      */
48     private $_name;
50     /**
51      * Term Dictionary Index
52      * Array of the Zend_Search_Lucene_Index_Term objects
53      * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
54      *
55      * @var array
56      */
57     private $_termDictionary;
59     /**
60      * Term Dictionary Index TermInfos
61      * Array of the Zend_Search_Lucene_Index_TermInfo objects
62      *
63      * @var array
64      */
65     private $_termDictionaryInfos;
67     /**
68      * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
69      *
70      * @var array
71      */
72     private $_fields;
74     /**
75      * Field positions in a dictionary.
76      * (Term dictionary contains fields ordered by names)
77      *
78      * @var array
79      */
80     private $_fieldsDicPositions;
83     /**
84      * Associative array where the key is the file name and the value is data offset
85      * in a compound segment file (.cfs).
86      *
87      * @var array
88      */
89     private $_segFiles;
91     /**
92      * File system adapter.
93      *
94      * @var Zend_Search_Lucene_Storage_Directory_Filesystem
95      */
96     private $_directory;
98     /**
99      * Normalization factors.
100      * An array fieldName => normVector
101      * normVector is a binary string.
102      * Each byte corresponds to an indexed document in a segment and
103      * encodes normalization factor (float value, encoded by
104      * Zend_Search_Lucene_Search_Similarity::encodeNorm())
105      *
106      * @var array
107      */
108     private $_norms = array();
110     /**
111      * List of deleted documents.
112      * bitset if bitset extension is loaded or array otherwise.
113      *
114      * @var mixed
115      */
116     private $_deleted;
118     /**
119      * $this->_deleted update flag
120      *
121      * @var boolean
122      */
123     private $_deletedDirty = false;
/**
 * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname,
 * Documents count and Directory as a parameter.
 *
 * Loads the segment metadata in this order:
 *  - the table of contents of the compound file (<name>.cfs), if it exists;
 *  - the field infos (.fnm);
 *  - the list of deleted documents (.del), if it exists.
 *
 * @param string $name
 * @param integer $docCount
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @throws Zend_Search_Lucene_Exception
 */
public function __construct($name, $docCount, $directory)
{
    $this->_name           = $name;
    $this->_docCount       = $docCount;
    $this->_directory      = $directory;
    $this->_termDictionary = null;

    // Table of contents of the compound file: file name => data offset
    $this->_segFiles = array();
    if ($this->_directory->fileExists($name . '.cfs')) {
        $cfsFile       = $this->_directory->getFileObject($name . '.cfs');
        $segFilesCount = $cfsFile->readVInt();

        for ($count = 0; $count < $segFilesCount; $count++) {
            $dataOffset = $cfsFile->readLong();
            $fileName   = $cfsFile->readString();
            $this->_segFiles[$fileName] = $dataOffset;
        }
    }

    // Field infos. The position of a field in the file is its field number.
    $fnmFile       = $this->openCompoundFile('.fnm');
    $fieldsCount   = $fnmFile->readVInt();
    $fieldNames    = array();
    $fieldNums     = array();
    $this->_fields = array();
    for ($count = 0; $count < $fieldsCount; $count++) {
        $fieldName = $fnmFile->readString();
        $fieldBits = $fnmFile->readByte();
        $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                        $fieldBits & 1,
                                                                        $count,
                                                                        $fieldBits & 2);
        if ($fieldBits & 0x10) {
            // norms are omitted for the indexed field
            $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
        }

        $fieldNums[$count]  = $count;
        $fieldNames[$count] = $fieldName;
    }

    // Order the field numbers by field name; after the flip the map is
    // field number => position of the field in the name-ordered list.
    // NOTE(review): SORT_REGULAR compares numeric-looking names numerically -
    // confirm that this matches the ordering used by the term dictionary.
    array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
    $this->_fieldsDicPositions = array_flip($fieldNums);

    // Deleted documents
    try {
        $delFile = $this->openCompoundFile('.del');

        // Header: two integers. writeChanges() stores the document count first
        // (one bit per document) and the number of deleted documents second.
        $byteCount = (int) ceil($delFile->readInt() / 8);
        $bitCount  = $delFile->readInt();

        if ($bitCount == 0) {
            $delBytes = '';
        } else {
            $delBytes = $delFile->readBytes($byteCount);
        }

        if (extension_loaded('bitset')) {
            $this->_deleted = $delBytes;
        } else {
            $this->_deleted = array();

            // Walk only the bytes that were actually read: $delBytes is empty
            // when no bit is set, whatever the header says about the size.
            $delBytesLength = strlen($delBytes);
            for ($count = 0; $count < $delBytesLength; $count++) {
                $byte = ord($delBytes[$count]);
                for ($bit = 0; $bit < 8; $bit++) {
                    if ($byte & (1 << $bit)) {
                        $this->_deleted[$count*8 + $bit] = 1;
                    }
                }
            }
        }
    } catch (Zend_Search_Lucene_Exception $e) {
        // openCompoundFile() reports a missing file with this message;
        // a segment without a .del file simply has no deletions.
        if (strpos($e->getMessage(), 'compound file doesn\'t contain') !== false) {
            $this->_deleted = null;
        } else {
            throw $e;
        }
    }
}
/**
 * Opens one of the files of this segment.
 *
 * A file stored on its own in the directory is preferred; otherwise the file
 * is looked up in the table of contents of the compound file (<name>.cfs) and
 * the compound file object is returned, positioned at the start of the data.
 *
 * @param string $extension  File extension including the leading dot (e.g. '.fnm')
 * @throws Zend_Search_Lucene_Exception  if the file is neither stand-alone nor in the compound file
 * @return Zend_Search_Lucene_Storage_File
 */
public function openCompoundFile($extension)
{
    $segmentFile = $this->_name . $extension;

    // A stand-alone file wins over a compound file entry
    if ($this->_directory->fileExists($segmentFile)) {
        return $this->_directory->getFileObject($segmentFile);
    }

    if (isset($this->_segFiles[$segmentFile])) {
        $cfsFile = $this->_directory->getFileObject($this->_name . '.cfs');
        $cfsFile->seek($this->_segFiles[$segmentFile]);

        return $cfsFile;
    }

    throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                         . $segmentFile . ' file.');
}
/**
 * Returns field index or -1 if field is not found
 *
 * Field names are compared as strings, byte for byte.
 *
 * @param string $fieldName
 * @return integer
 */
public function getFieldNum($fieldName)
{
    $fieldName = (string) $fieldName;

    foreach ($this->_fields as $field) {
        // Strict comparison: "==" compares numeric-looking strings by value,
        // so e.g. '1e3' and '1000' would be treated as the same field name.
        if ((string) $field->name === $fieldName) {
            return $field->number;
        }
    }

    return -1;
}
/**
 * Returns the field info stored under the given field number.
 *
 * No check is made that the field number exists.
 *
 * @param integer $fieldNum
 * @return Zend_Search_Lucene_Index_FieldInfo
 */
public function getField($fieldNum)
{
    $fieldInfo = $this->_fields[$fieldNum];

    return $fieldInfo;
}
/**
 * Returns the names of the fields of this segment.
 *
 * The result is an array fieldName => fieldName. If $indexed is true,
 * only the fields flagged as indexed are included.
 *
 * @param boolean $indexed
 * @return array
 */
public function getFields($indexed = false)
{
    $names = array();

    foreach ($this->_fields as $fieldInfo) {
        // Skip non-indexed fields when only indexed ones are requested
        if ($indexed && !$fieldInfo->isIndexed) {
            continue;
        }

        $names[$fieldInfo->name] = $fieldInfo->name;
    }

    return $names;
}
/**
 * Returns the number of documents this segment was created with.
 *
 * delete() does not change this value.
 *
 * @return integer
 */
public function count()
{
    $docCount = $this->_docCount;

    return $docCount;
}
/**
 * Translates a field number into the position of the field
 * in the fields dictionary.
 *
 * A value without an entry in the translation table is returned unchanged.
 *
 * @param integer $fieldNum
 * @return integer
 */
private function _getFieldPosition($fieldNum)
{
    if (isset($this->_fieldsDicPositions[$fieldNum])) {
        return $this->_fieldsDicPositions[$fieldNum];
    }

    // Not in the translation table: treat as a 'direct value'
    return $fieldNum;
}
/**
 * Loads Term dictionary from TermInfoIndex file
 *
 * Does nothing if the dictionary is already loaded. The dictionary is
 * stored on the object only after the whole file has been read, so a failed
 * load is retried on the next call instead of leaving an empty dictionary.
 *
 * @throws Zend_Search_Lucene_Exception
 */
protected function _loadDictionary()
{
    if ($this->_termDictionary !== null) {
        return;
    }

    $tiiFile   = $this->openCompoundFile('.tii');
    $tiVersion = $tiiFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
    }

    $indexTermCount = $tiiFile->readLong();
                      $tiiFile->readInt();  // IndexInterval (read to advance the file, value unused)
    $skipInterval   = $tiiFile->readInt();

    $termDictionary      = array();
    $termDictionaryInfos = array();

    // Term texts are prefix-compressed against the previous term and the
    // three pointers are stored as deltas, hence the running values.
    $prevTerm     = '';
    $freqPointer  =  0;
    $proxPointer  =  0;
    $indexPointer =  0;
    for ($count = 0; $count < $indexTermCount; $count++) {
        $termPrefixLength = $tiiFile->readVInt();
        $termSuffix       = $tiiFile->readString();
        $termValue        = substr($prevTerm, 0, $termPrefixLength) . $termSuffix;

        $termFieldNum     = $tiiFile->readVInt();
        $docFreq          = $tiiFile->readVInt();
        $freqPointer     += $tiiFile->readVInt();
        $proxPointer     += $tiiFile->readVInt();
        // The skip value is present in the file only for frequent terms
        if ($docFreq >= $skipInterval) {
            $skipDelta = $tiiFile->readVInt();
        } else {
            $skipDelta = 0;
        }

        $indexPointer += $tiiFile->readVInt();

        $termDictionary[]      = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
        $termDictionaryInfos[] =
            new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
        $prevTerm = $termValue;
    }

    // Publish the result only now that the file has been read completely
    $this->_termDictionary      = $termDictionary;
    $this->_termDictionaryInfos = $termDictionaryInfos;
}
/**
 * Returns the name this segment was created with.
 *
 * The name is also the prefix of the segment's file names.
 *
 * @return string
 */
public function getName()
{
    $segmentName = $this->_name;

    return $segmentName;
}
/**
 * Scans terms dictionary and returns term info
 *
 * The in-memory dictionary index (loaded from the .tii file) is searched
 * first. If the term is not one of the index terms, the .tis file is scanned
 * forward, starting at the closest preceding index term.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return Zend_Search_Lucene_Index_TermInfo|null  null if the term is not in this segment
 * @throws Zend_Search_Lucene_Exception
 */
public function getTermInfo($term)
{
    $this->_loadDictionary();

    $searchField = $this->getFieldNum($term->field);

    if ($searchField == -1) {
        return null;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // Binary search in the dictionary index.
    // Terms are ordered by field position first, then by text.
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary) - 1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        $fieldNum = $this->_getFieldPosition($midTerm->field);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($term->text, $midTerm->text);
        }

        if ($delta < 0) {
            $highIndex = $mid - 1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid + 1;
        } else {
            return $this->_termDictionaryInfos[$mid]; // We got it!
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        return null;
    }

    // $highIndex now points to the last index term preceding the searched one
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    $tisFile   = $this->openCompoundFile('.tis');
    $tiVersion = $tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $termCount     = $tisFile->readLong();
    $indexInterval = $tisFile->readInt();
    $skipInterval  = $tisFile->readInt();

    // The 20 header bytes (4 + 8 + 4 + 4) have just been read, so the
    // relative seek is reduced by the header size.
    $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR);

    $termValue    = $prevTerm->text;
    $termFieldNum = $prevTerm->field;
    $freqPointer  = $prevTermInfo->freqPointer;
    $proxPointer  = $prevTermInfo->proxPointer;
    // Read terms until the first one that is not smaller than the searched term
    for ($count = $prevPosition*$indexInterval + 1;
         $count < $termCount &&
         ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
          ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
           strcmp($termValue, $term->text) < 0) );
         $count++) {
        $termPrefixLength = $tisFile->readVInt();
        $termSuffix       = $tisFile->readString();
        $termFieldNum     = $tisFile->readVInt();
        $termValue        = substr($termValue, 0, $termPrefixLength) . $termSuffix;

        $docFreq      = $tisFile->readVInt();
        $freqPointer += $tisFile->readVInt();
        $proxPointer += $tisFile->readVInt();
        if ($docFreq >= $skipInterval) {
            $skipOffset = $tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }
    }

    // Byte-wise comparison, consistent with the strcmp() ordering used above.
    // A loose "==" would match numeric-looking terms by value ('1e1' == '10').
    if ($termFieldNum == $searchField && strcmp($termValue, $term->text) === 0) {
        return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    }

    return null;
}
/**
 * Returns normalization factor for specified documents
 *
 * The norm vector of a field is read from the '.f<fieldNum>' file on first
 * use and cached, unless it was already set up by the constructor.
 *
 * @param integer $id         Internal document id within this segment
 * @param string  $fieldName
 * @return float|null  The value returned by Zend_Search_Lucene_Search_Similarity::decodeNorm(),
 *                     or null if the field is unknown or not indexed
 * @throws Zend_Search_Lucene_Exception
 */
public function norm($id, $fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    // getFieldNum() returns -1 for an unknown field, which has no field info
    if (!isset($this->_fields[$fieldNum]) || !$this->_fields[$fieldNum]->isIndexed) {
        return null;
    }

    if (!isset($this->_norms[$fieldNum])) {
        $fFile = $this->openCompoundFile('.f' . $fieldNum);
        $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
    }

    // One byte per document; square brackets because the curly brace
    // string offset syntax is not accepted by PHP 8.
    return Zend_Search_Lucene_Search_Similarity::decodeNorm(ord($this->_norms[$fieldNum][$id]));
}
/**
 * Tells whether this segment has a list of deleted documents.
 *
 * The list exists once a .del file was loaded or delete() was called.
 *
 * @return boolean
 */
public function hasDeletions()
{
    $hasNoList = ($this->_deleted === null);

    return !$hasNoList;
}
/**
 * Marks a document of this segment as deleted.
 *
 * The change is kept in memory and flagged as pending;
 * writeChanges() stores it.
 *
 * @param integer $id  Internal document id
 */
public function delete($id)
{
    $this->_deletedDirty = true;

    $useBitset = extension_loaded('bitset');

    // Create the list of deleted documents on first use
    if ($this->_deleted === null) {
        $this->_deleted = $useBitset ? bitset_empty($id) : array();
    }

    if ($useBitset) {
        bitset_incl($this->_deleted, $id);
        return;
    }

    $this->_deleted[$id] = 1;
}
/**
 * Tells whether a document of this segment is marked as deleted.
 *
 * @param integer $id  Internal document id
 * @return boolean
 */
public function isDeleted($id)
{
    // No list of deleted documents at all
    if ($this->_deleted === null) {
        return false;
    }

    return extension_loaded('bitset')
         ? bitset_in($this->_deleted, $id)
         : isset($this->_deleted[$id]);
}
/**
 * Write changes if it's necessary.
 *
 * Does nothing unless delete() was called since the last write. Otherwise
 * creates the '<name>.del' file and writes, in this order: the document
 * count, the number of deleted documents and the bit vector of deleted
 * documents (one bit per document, lowest bit of each byte first).
 */
public function writeChanges()
{
    if (!$this->_deletedDirty) {
        return;
    }

    // Size of the stored bit vector in bytes
    $byteCount = (int) floor($this->_docCount/8) + 1;

    if (extension_loaded('bitset')) {
        // The bitset may be shorter than the vector the constructor reads back
        // (it reads ceil(docCount/8) bytes), so pad it with zero bytes.
        $delBytes = str_pad($this->_deleted, $byteCount, chr(0));
        $bitCount = count(bitset_to_array($this->_deleted));
    } else {
        $delBytes = '';
        for ($count = 0; $count < $byteCount; $count++) {
            $byte = 0;
            for ($bit = 0; $bit < 8; $bit++) {
                if (isset($this->_deleted[$count*8 + $bit])) {
                    $byte |= (1 << $bit);
                }
            }
            $delBytes .= chr($byte);
        }
        $bitCount = count($this->_deleted);
    }

    $delFile = $this->_directory->createFile($this->_name . '.del');
    $delFile->writeInt($this->_docCount);
    $delFile->writeInt($bitCount);
    $delFile->writeBytes($delBytes);

    $this->_deletedDirty = false;
}