MDL-12234, fixing unicode issues with global search
[moodle.git] / search / Zend / Search / Lucene.php
CommitLineData
682d4032 1<?php
2/**
3 * Zend Framework
4 *
5 * LICENSE
6 *
7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
14 *
15 * @category Zend
16 * @package Zend_Search_Lucene
8cfbeb81 17 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
682d4032 18 * @license http://framework.zend.com/license/new-bsd New BSD License
19 */
20
21
22/** Zend_Search_Lucene_Exception */
8cfbeb81 23require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
682d4032 24
25/** Zend_Search_Lucene_Document */
8cfbeb81 26require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document.php';
27
28/** Zend_Search_Lucene_Document_Html */
29require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document/Html.php';
682d4032 30
31/** Zend_Search_Lucene_Storage_Directory */
8cfbeb81 32require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/Directory/Filesystem.php';
33
34/** Zend_Search_Lucene_Storage_File_Memory */
35require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/File/Memory.php';
682d4032 36
37/** Zend_Search_Lucene_Index_Term */
8cfbeb81 38require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Term.php';
682d4032 39
40/** Zend_Search_Lucene_Index_TermInfo */
8cfbeb81 41require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/TermInfo.php';
682d4032 42
43/** Zend_Search_Lucene_Index_SegmentInfo */
8cfbeb81 44require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
682d4032 45
46/** Zend_Search_Lucene_Index_FieldInfo */
8cfbeb81 47require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/FieldInfo.php';
682d4032 48
49/** Zend_Search_Lucene_Index_Writer */
8cfbeb81 50require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Writer.php';
682d4032 51
52/** Zend_Search_Lucene_Search_QueryParser */
8cfbeb81 53require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParser.php';
682d4032 54
55/** Zend_Search_Lucene_Search_QueryHit */
8cfbeb81 56require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryHit.php';
682d4032 57
58/** Zend_Search_Lucene_Search_Similarity */
8cfbeb81 59require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Similarity.php';
60
61/** Zend_Search_Lucene_Index_SegmentInfoPriorityQueue */
62require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php';
63
64
65/** Zend_Search_Lucene_Interface */
66require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Interface.php';
67
68/** Zend_Search_Lucene_Proxy */
69require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Proxy.php';
682d4032 70
71
72/**
73 * @category Zend
74 * @package Zend_Search_Lucene
8cfbeb81 75 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
682d4032 76 * @license http://framework.zend.com/license/new-bsd New BSD License
77 */
8cfbeb81 78class Zend_Search_Lucene implements Zend_Search_Lucene_Interface
682d4032 79{
8cfbeb81 80 /**
81 * Default field name for search
82 *
83 * Null means search through all fields
84 *
85 * @var string
86 */
87 private static $_defaultSearchField = null;
88
682d4032 89 /**
90 * File system adapter.
91 *
92 * @var Zend_Search_Lucene_Storage_Directory
93 */
94 private $_directory = null;
95
96 /**
97 * File system adapter closing option
98 *
99 * @var boolean
100 */
101 private $_closeDirOnExit = true;
102
103 /**
104 * Writer for this index, not instantiated unless required.
105 *
106 * @var Zend_Search_Lucene_Index_Writer
107 */
108 private $_writer = null;
109
110 /**
111 * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
112 *
113 * @var array Zend_Search_Lucene_Index_SegmentInfo
114 */
115 private $_segmentInfos = array();
116
117 /**
118 * Number of documents in this index.
119 *
120 * @var integer
121 */
122 private $_docCount = 0;
123
124 /**
125 * Flag for index changes
126 *
127 * @var boolean
128 */
129 private $_hasChanges = false;
130
8cfbeb81 131
132 /**
133 * Index lock object
134 *
135 * @var Zend_Search_Lucene_Storage_File
136 */
137 private $_lock;
138
139 /**
140 * Signal, that index is already closed, changes are fixed and resources are cleaned up
141 *
142 * @var boolean
143 */
144 private $_closed = false;
145
146 /**
147 * Number of references to the index object
148 *
149 * @var integer
150 */
151 private $_refCount = 0;
152
153
/**
 * Create a new index.
 *
 * Instantiates this class with the create flag set to true and returns
 * the instance wrapped in a Zend_Search_Lucene_Proxy.
 *
 * @param mixed $directory Handed unchanged to the constructor.
 * @return Zend_Search_Lucene_Interface
 */
public static function create($directory)
{
    $index = new Zend_Search_Lucene($directory, true);

    return new Zend_Search_Lucene_Proxy($index);
}
164
/**
 * Open an existing index.
 *
 * Instantiates this class with the create flag set to false and returns
 * the instance wrapped in a Zend_Search_Lucene_Proxy.
 *
 * @param mixed $directory Handed unchanged to the constructor.
 * @return Zend_Search_Lucene_Interface
 */
public static function open($directory)
{
    $index = new Zend_Search_Lucene($directory, false);

    return new Zend_Search_Lucene_Proxy($index);
}
175
/**
 * Opens the index, optionally creating it first.
 *
 * $directory is either a Zend_Search_Lucene_Storage_Directory_Filesystem
 * object (used as is and left open when the index is closed) or any other
 * value, which is passed to the Filesystem directory constructor; in that
 * case the directory is closed together with the index.
 *
 * After the lock is obtained the 'segments' file is read to build the
 * list of segment descriptors and the total document count.
 *
 * @param mixed   $directory Index location (see above). Must not be null.
 * @param boolean $create    When true, an empty index is written before reading.
 * @throws Zend_Search_Lucene_Exception If no directory is given, a lock cannot
 *                                      be obtained, or the segments file has
 *                                      an unexpected format marker.
 */
public function __construct($directory = null, $create = false)
{
    if ($directory === null) {
        // Use the Lucene exception class, as documented and as every other
        // failure in this class does (it is the class loaded by this file).
        throw new Zend_Search_Lucene_Exception('No index directory specified');
    }

    if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) {
        // Caller owns the directory object: do not close it on exit.
        $this->_directory      = $directory;
        $this->_closeDirOnExit = false;
    } else {
        $this->_directory      = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory);
        $this->_closeDirOnExit = true;
    }

    // Lock file used for both the exclusive (create) and shared (read) lock
    $this->_lock = $this->_directory->createFile('index.lock');

    $this->_segmentInfos = array();

    if ($create) {
        // Throw an exception if index is under processing now
        // NOTE(review): the second argument presumably requests a non-blocking lock - confirm in Storage_File
        if (!$this->_lock->lock(LOCK_EX, true)) {
            throw new Zend_Search_Lucene_Exception('Can\'t create index. It\'s under processing now');
        }

        // Writer will create segments file for empty segments list
        $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, $this->_segmentInfos, true);

        if (!$this->_lock->lock(LOCK_SH)) {
            throw new Zend_Search_Lucene_Exception('Can\'t reduce lock level from Exclusive to Shared');
        }
    } else {
        // Wait if index is under switching from one set of segments to another (Index_Writer::_updateSegments())
        if (!$this->_lock->lock(LOCK_SH)) {
            throw new Zend_Search_Lucene_Exception('Can\'t obtain shared index lock');
        }
        $this->_writer = null;
    }

    $segmentsFile = $this->_directory->getFileObject('segments');

    $format = $segmentsFile->readInt();

    if ($format != (int)0xFFFFFFFF) {
        throw new Zend_Search_Lucene_Exception('Wrong segments file format');
    }

    // read version (skipped as two ints instead of one readLong() call)
    $segmentsFile->readInt();
    $segmentsFile->readInt();

    // read segment name counter (value not used here)
    $segmentsFile->readInt();

    $segments = $segmentsFile->readInt();

    $this->_docCount = 0;

    // read segmentInfos: one (name, size) pair per segment
    for ($count = 0; $count < $segments; $count++) {
        $segName = $segmentsFile->readString();
        $segSize = $segmentsFile->readInt();
        $this->_docCount += $segSize;

        $this->_segmentInfos[] =
            new Zend_Search_Lucene_Index_SegmentInfo($segName,
                                                     $segSize,
                                                     $this->_directory);
    }
}
257
/**
 * Close current index and free resources.
 *
 * Does nothing when the index has already been closed. Otherwise pending
 * changes are committed, the index lock is released, the directory is
 * closed if this object opened it, and internal references are dropped.
 */
private function _close()
{
    if (!$this->_closed) {
        $this->commit();

        // Release the lock taken in the constructor
        $this->_lock->unlock();

        if ($this->_closeDirOnExit) {
            $this->_directory->close();
        }

        $this->_directory    = null;
        $this->_writer       = null;
        $this->_segmentInfos = null;

        // Remember that cleanup is done so later calls are no-ops
        $this->_closed = true;
    }
}
283
/**
 * Register one more reference to the index object.
 *
 * @internal
 */
public function addReference()
{
    $this->_refCount += 1;
}
293
/**
 * Drop one reference to the index object.
 *
 * Once the reference counter reaches zero the index is closed and its
 * resources are cleaned up.
 *
 * @internal
 */
public function removeReference()
{
    $this->_refCount -= 1;

    if (0 == $this->_refCount) {
        $this->_close();
    }
}
309
/**
 * Object destructor.
 *
 * Closes the index; _close() itself ignores repeated calls.
 */
public function __destruct()
{
    $this->_close();
}
317
/**
 * Returns the Zend_Search_Lucene_Index_Writer for this index.
 *
 * The writer is created on first use and then reused.
 *
 * @internal
 * @return Zend_Search_Lucene_Index_Writer
 */
public function getIndexWriter()
{
    if ($this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
        return $this->_writer;
    }

    $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, $this->_segmentInfos);

    return $this->_writer;
}
332
333
/**
 * Returns the storage directory object used by this index.
 *
 * @return Zend_Search_Lucene_Storage_Directory
 */
public function getDirectory()
{
    return $this->_directory;
}
343
344
/**
 * Returns the total number of documents in this index,
 * deleted documents included.
 *
 * @return integer
 */
public function count()
{
    return $this->_docCount;
}
354
/**
 * Returns one greater than the largest possible document number.
 *
 * Useful, for example, to size a structure that holds one element per
 * document number. Implemented as an alias of count().
 *
 * @return integer
 */
public function maxDoc()
{
    $upperBound = $this->count();

    return $upperBound;
}
366
/**
 * Returns the total number of non-deleted documents in this index.
 *
 * The value is the sum of numDocs() over all segments.
 *
 * @return integer
 */
public function numDocs()
{
    $total = 0;

    foreach ($this->_segmentInfos as $segment) {
        $total = $total + $segment->numDocs();
    }

    return $total;
}
382
/**
 * Checks whether a document is marked as deleted.
 *
 * @param integer $id Index-wide document id.
 * @return boolean
 * @throws Zend_Search_Lucene_Exception If $id is negative, not below the
 *         document count, or not covered by any loaded segment.
 */
public function isDeleted($id)
{
    if ($id < 0 || $id >= $this->_docCount) {
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Walk the segments, translating the index-wide id to a segment-local one
    $segmentStartId = 0;
    foreach ($this->_segmentInfos as $segmentInfo) {
        $segmentEndId = $segmentStartId + $segmentInfo->count();

        if ($id < $segmentEndId) {
            return $segmentInfo->isDeleted($id - $segmentStartId);
        }

        $segmentStartId = $segmentEndId;
    }

    // The document counter can run ahead of the segment list (addDocument()
    // increments it before a commit), so the id may belong to no segment yet.
    throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
}
407
/**
 * Set the default search field.
 *
 * Null means that searches run through all fields by default;
 * null is also the initial value.
 *
 * @param string $fieldName
 */
public static function setDefaultSearchField($fieldName)
{
    self::$_defaultSearchField = $fieldName;
}
421
/**
 * Get the default search field.
 *
 * Null means that searches run through all fields by default.
 *
 * @return string
 */
public static function getDefaultSearchField()
{
    return self::$_defaultSearchField;
}
433
/**
 * Retrieve the index maxBufferedDocs option.
 *
 * maxBufferedDocs is the minimal number of documents required before the
 * buffered in-memory documents are written into a new segment.
 * Default value is 10.
 *
 * The value is read from the index writer.
 *
 * @return integer
 */
public function getMaxBufferedDocs()
{
    $writer = $this->getIndexWriter();

    return $writer->maxBufferedDocs;
}
448
/**
 * Set the index maxBufferedDocs option.
 *
 * maxBufferedDocs is the minimal number of documents required before the
 * buffered in-memory documents are written into a new segment.
 * Default value is 10.
 *
 * The value is stored on the index writer.
 *
 * @param integer $maxBufferedDocs
 */
public function setMaxBufferedDocs($maxBufferedDocs)
{
    $writer = $this->getIndexWriter();

    $writer->maxBufferedDocs = $maxBufferedDocs;
}
463
/**
 * Retrieve the index maxMergeDocs option.
 *
 * maxMergeDocs is the largest number of documents ever merged by
 * addDocument(). Small values (e.g., less than 10,000) are best for
 * interactive indexing, as this limits the length of pauses while
 * indexing to a few seconds. Larger values are best for batched indexing
 * and speedier searches. Default value is PHP_INT_MAX.
 *
 * The value is read from the index writer.
 *
 * @return integer
 */
public function getMaxMergeDocs()
{
    $writer = $this->getIndexWriter();

    return $writer->maxMergeDocs;
}
480
/**
 * Set the index maxMergeDocs option.
 *
 * maxMergeDocs is the largest number of documents ever merged by
 * addDocument(). Small values (e.g., less than 10,000) are best for
 * interactive indexing, as this limits the length of pauses while
 * indexing to a few seconds. Larger values are best for batched indexing
 * and speedier searches. Default value is PHP_INT_MAX.
 *
 * The value is stored on the index writer.
 *
 * @param integer $maxMergeDocs
 */
public function setMaxMergeDocs($maxMergeDocs)
{
    $writer = $this->getIndexWriter();

    $writer->maxMergeDocs = $maxMergeDocs;
}
497
/**
 * Retrieve the index mergeFactor option.
 *
 * mergeFactor determines how often segment indices are merged by
 * addDocument(). With smaller values, less RAM is used while indexing and
 * searches on unoptimized indices are faster, but indexing is slower.
 * With larger values, more RAM is used during indexing and searches on
 * unoptimized indices are slower, but indexing is faster. Thus larger
 * values (> 10) are best for batch index creation, and smaller values
 * (< 10) for indices that are interactively maintained.
 * Default value is 10.
 *
 * The value is read from the index writer.
 *
 * @return integer
 */
public function getMergeFactor()
{
    $writer = $this->getIndexWriter();

    return $writer->mergeFactor;
}
519
/**
 * Set the index mergeFactor option.
 *
 * mergeFactor determines how often segment indices are merged by
 * addDocument(). With smaller values, less RAM is used while indexing and
 * searches on unoptimized indices are faster, but indexing is slower.
 * With larger values, more RAM is used during indexing and searches on
 * unoptimized indices are slower, but indexing is faster. Thus larger
 * values (> 10) are best for batch index creation, and smaller values
 * (< 10) for indices that are interactively maintained.
 * Default value is 10.
 *
 * The value is stored on the index writer.
 *
 * @param integer $mergeFactor
 */
public function setMergeFactor($mergeFactor)
{
    $writer = $this->getIndexWriter();

    $writer->mergeFactor = $mergeFactor;
}
682d4032 541
/**
 * Performs a query against the index and returns an array of
 * Zend_Search_Lucene_Search_QueryHit objects.
 *
 * $query is a query string (parsed with the query parser) or a
 * Zend_Search_Lucene_Search_Query object. Pending changes are committed
 * before the query is executed. Documents with a zero score are dropped,
 * and hit scores are divided by the top score when that exceeds 1.
 *
 * Called with the query only, hits are ordered by score (descending),
 * then by document id. Any further arguments (read via func_get_args())
 * describe a field-based ordering: a field name, optionally followed by
 * one or two integer array_multisort() flags, repeated as needed.
 *
 * @param mixed $query
 * @return array Zend_Search_Lucene_Search_QueryHit
 * @throws Zend_Search_Lucene_Exception
 */
public function find($query)
{
    if (is_string($query)) {
        $query = Zend_Search_Lucene_Search_QueryParser::parse($query);
    }

    if (!$query instanceof Zend_Search_Lucene_Search_Query) {
        throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object');
    }

    $this->commit();

    // Three parallel arrays: hit objects, their raw scores and document ids
    $hits   = array();
    $scores = array();
    $ids    = array();

    $query = $query->rewrite($this)->optimize($this);

    $query->execute($this);

    $topScore = 0;

    foreach ($query->matchedDocs() as $id => $num) {
        $docScore = $query->score($id, $this);

        if ($docScore == 0) {
            // Zero-score documents are not reported
            continue;
        }

        $hit        = new Zend_Search_Lucene_Search_QueryHit($this);
        $hit->id    = $id;
        $hit->score = $docScore;

        $hits[]   = $hit;
        $ids[]    = $id;
        $scores[] = $docScore;

        if ($docScore > $topScore) {
            $topScore = $docScore;
        }
    }

    if (count($hits) == 0) {
        // skip sorting, which may cause a error on empty index
        return array();
    }

    if ($topScore > 1) {
        // Only the hit objects are normalised; $scores keeps raw values,
        // which does not affect the resulting order.
        foreach ($hits as $hit) {
            $hit->score /= $topScore;
        }
    }

    if (func_num_args() == 1) {
        // sort by scores, then by id for equal scores
        array_multisort($scores, SORT_DESC, SORT_NUMERIC,
                        $ids,    SORT_ASC,  SORT_NUMERIC,
                        $hits);

        return $hits;
    }

    // sort by given field names
    $argList    = func_get_args();
    $argCount   = count($argList);
    $fieldNames = $this->getFieldNames();
    $sortArgs   = array();

    for ($count = 1; $count < $argCount; $count++) {
        $fieldName = $argList[$count];

        if (!is_string($fieldName)) {
            throw new Zend_Search_Lucene_Exception('Field name must be a string.');
        }

        if (!in_array($fieldName, $fieldNames)) {
            throw new Zend_Search_Lucene_Exception('Wrong field name.');
        }

        // Collect this field's value for every hit, in hit order
        $valuesArray = array();
        foreach ($hits as $hit) {
            try {
                $value = $hit->getDocument()->getFieldValue($fieldName);
            } catch (Zend_Search_Lucene_Exception $e) {
                if (strpos($e->getMessage(), 'not found') === false) {
                    throw $e;
                }

                // A "not found" failure is treated as an empty value
                $value = null;
            }

            $valuesArray[] = $value;
        }

        $sortArgs[] = $valuesArray;

        $flagFollows = ($count + 1 < $argCount) && is_integer($argList[$count + 1]);

        if (!$flagFollows) {
            // No flags supplied for this field: use the defaults
            $sortArgs[] = SORT_ASC;
            $sortArgs[] = SORT_REGULAR;
            continue;
        }

        // First integer flag
        $count++;
        $sortArgs[] = $argList[$count];

        if ($count + 1 < $argCount && is_integer($argList[$count + 1])) {
            // Second integer flag
            $count++;
            $sortArgs[] = $argList[$count];
        } elseif ($argList[$count] == SORT_ASC || $argList[$count] == SORT_DESC) {
            // Only an order flag was given: add the default sort type
            $sortArgs[] = SORT_REGULAR;
        } else {
            // Only a sort type flag was given: add the default order
            $sortArgs[] = SORT_ASC;
        }
    }

    // Sort by id's if values are equal
    $sortArgs[] = $ids;
    $sortArgs[] = SORT_ASC;
    $sortArgs[] = SORT_NUMERIC;

    // Array to be sorted (by reference, so the result lands in $hits)
    $sortArgs[] = &$hits;

    // Do sort
    call_user_func_array('array_multisort', $sortArgs);

    return $hits;
}
675
676
/**
 * Returns a list of the field names that exist in this index.
 *
 * The result is the array_merge() of getFields($indexed) over all segments.
 *
 * @param boolean $indexed Passed through to each segment's getFields().
 * @return array
 */
public function getFieldNames($indexed = false)
{
    $names = array();

    foreach ($this->_segmentInfos as $segment) {
        $segmentFields = $segment->getFields($indexed);
        $names         = array_merge($names, $segmentFields);
    }

    return $names;
}
691
692
/**
 * Returns a Zend_Search_Lucene_Document object for the document
 * number $id in this index.
 *
 * The owning segment is located first; the document's offset is then read
 * from the segment's '.fdx' file (8 bytes per document) and its fields
 * are read from the '.fdt' file.
 *
 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
 * @return Zend_Search_Lucene_Document
 * @throws Zend_Search_Lucene_Exception If the id is negative, not below the
 *         document count, or not covered by any loaded segment.
 */
public function getDocument($id)
{
    if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
        /* @var $id Zend_Search_Lucene_Search_QueryHit */
        $id = $id->id;
    }

    if ($id < 0 || $id >= $this->_docCount) {
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Find the segment holding the document and that segment's first id
    $segmentInfo    = null;
    $segmentStartId = 0;
    foreach ($this->_segmentInfos as $candidate) {
        $segmentEndId = $segmentStartId + $candidate->count();

        if ($id < $segmentEndId) {
            $segmentInfo = $candidate;
            break;
        }

        $segmentStartId = $segmentEndId;
    }

    if ($segmentInfo === null) {
        // The document counter can run ahead of the segment list
        // (addDocument() increments it before a commit).
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Field index: one 8-byte position per document
    $fdxFile = $segmentInfo->openCompoundFile('.fdx');
    $fdxFile->seek(($id - $segmentStartId) * 8, SEEK_CUR);
    $fieldValuesPosition = $fdxFile->readLong();

    // Field data: field count followed by the fields themselves
    $fdtFile = $segmentInfo->openCompoundFile('.fdt');
    $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
    $fieldCount = $fdtFile->readVInt();

    $doc = new Zend_Search_Lucene_Document();
    for ($count = 0; $count < $fieldCount; $count++) {
        $fieldNum = $fdtFile->readVInt();
        $bits     = $fdtFile->readByte();

        $fieldInfo = $segmentInfo->getField($fieldNum);

        if (!($bits & 2)) { // Text data
            $field = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                  $fdtFile->readString(),
                                                  'UTF-8',
                                                  true,
                                                  $fieldInfo->isIndexed,
                                                  $bits & 1);
        } else {            // Binary data
            $field = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                  $fdtFile->readBinary(),
                                                  '',
                                                  true,
                                                  $fieldInfo->isIndexed,
                                                  $bits & 1,
                                                  true);
        }

        $doc->addField($field);
    }

    return $doc;
}
757
758
/**
 * Returns true if the index contains documents with the specified term.
 *
 * Is used for query optimization. The answer is true as soon as one
 * segment returns a term info object for the term.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return boolean
 */
public function hasTerm(Zend_Search_Lucene_Index_Term $term)
{
    foreach ($this->_segmentInfos as $segment) {
        $termInfo = $segment->getTermInfo($term);

        if ($termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
            return true;
        }
    }

    return false;
}
777
/**
 * Returns IDs of all the documents containing term.
 *
 * For every segment that knows the term, the segment's '.frq' file is read
 * starting at the term's frequency pointer. Document ids are stored as
 * deltas; segment-local ids are shifted by the segment's first id.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return array
 */
public function termDocs(Zend_Search_Lucene_Index_Term $term)
{
    $docIds            = array();
    $segmentStartDocId = 0;

    foreach ($this->_segmentInfos as $segment) {
        $termInfo = $segment->getTermInfo($term);

        if ($termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
            $frqFile = $segment->openCompoundFile('.frq');
            $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

            $docId = 0;
            for ($entry = 0; $entry < $termInfo->docFreq; $entry++) {
                $docDelta = $frqFile->readVInt();

                if ($docDelta % 2 == 1) {
                    // Odd value: no separate frequency entry follows
                    $docId += ($docDelta - 1) / 2;
                } else {
                    $docId += $docDelta / 2;
                    // read freq (value is skipped)
                    $frqFile->readVInt();
                }

                $docIds[] = $segmentStartDocId + $docId;
            }
        }

        // Advance the base id whether or not the segment has the term
        $segmentStartDocId += $segment->count();
    }

    return $docIds;
}
818
819
/**
 * Returns an array of all term freqs.
 * Result array structure: array(docId => freq, ...)
 *
 * Per-segment results are combined with the array union operator; each
 * segment receives its first document id as an offset.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return array
 */
public function termFreqs(Zend_Search_Lucene_Index_Term $term)
{
    $freqs             = array();
    $segmentStartDocId = 0;

    foreach ($this->_segmentInfos as $segment) {
        $freqs = $freqs + $segment->termFreqs($term, $segmentStartDocId);

        $segmentStartDocId = $segmentStartDocId + $segment->count();
    }

    return $freqs;
}
682d4032 839
/**
 * Returns an array of all term positions in the documents.
 * Result array structure: array(docId => array(pos1, pos2, ...), ...)
 *
 * Per-segment results are combined with the array union operator; each
 * segment receives its first document id as an offset.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return array
 */
public function termPositions(Zend_Search_Lucene_Index_Term $term)
{
    $positions         = array();
    $segmentStartDocId = 0;

    foreach ($this->_segmentInfos as $segment) {
        $positions = $positions + $segment->termPositions($term, $segmentStartDocId);

        $segmentStartDocId = $segmentStartDocId + $segment->count();
    }

    return $positions;
}
859
860
/**
 * Returns the number of documents in this index containing the $term.
 *
 * Sums the docFreq value of the term info over all segments that
 * return one for the term.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return integer
 */
public function docFreq(Zend_Search_Lucene_Index_Term $term)
{
    $total = 0;

    foreach ($this->_segmentInfos as $segment) {
        $termInfo = $segment->getTermInfo($term);

        if ($termInfo === null) {
            // Segment does not know the term
            continue;
        }

        $total += $termInfo->docFreq;
    }

    return $total;
}
879
880
/**
 * Retrieve the similarity used by the index reader.
 *
 * Always the default similarity object.
 *
 * @return Zend_Search_Lucene_Search_Similarity
 */
public function getSimilarity()
{
    $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();

    return $similarity;
}
890
891
/**
 * Returns a normalization factor for "field, document" pair.
 *
 * Unlike the other id-based methods, an id that cannot be resolved does not
 * raise an exception: null is returned instead. Deleted documents yield 0.
 *
 * @param integer $id        Index-wide document id.
 * @param string  $fieldName
 * @return float|null Null if $id is negative, not below the document count,
 *                    or not covered by any loaded segment.
 */
public function norm($id, $fieldName)
{
    if ($id < 0 || $id >= $this->_docCount) {
        return null;
    }

    // Walk the segments, translating the index-wide id to a segment-local one
    $segmentStartId = 0;
    foreach ($this->_segmentInfos as $segInfo) {
        $segmentEndId = $segmentStartId + $segInfo->count();

        if ($id < $segmentEndId) {
            $localId = $id - $segmentStartId;

            if ($segInfo->isDeleted($localId)) {
                return 0;
            }

            return $segInfo->norm($localId, $fieldName);
        }

        $segmentStartId = $segmentEndId;
    }

    // The document counter can run ahead of the segment list (addDocument()
    // increments it before a commit); treat such ids as out of range.
    return null;
}
920
/**
 * Returns true if any documents have been deleted from this index.
 *
 * The check stops at the first segment that reports deletions.
 *
 * @return boolean
 */
public function hasDeletions()
{
    $found = false;

    foreach ($this->_segmentInfos as $segment) {
        if ($segment->hasDeletions()) {
            $found = true;
            break;
        }
    }

    return $found;
}
936
937
/**
 * Deletes a document from the index.
 * $id is an internal document id.
 *
 * The deletion is recorded on the owning segment and the index is flagged
 * as changed; commit() later writes the segment changes.
 *
 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
 * @throws Zend_Search_Lucene_Exception If the id is negative, not below the
 *         document count, or not covered by any loaded segment.
 */
public function delete($id)
{
    if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
        /* @var $id Zend_Search_Lucene_Search_QueryHit */
        $id = $id->id;
    }

    if ($id < 0 || $id >= $this->_docCount) {
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Walk the segments, translating the index-wide id to a segment-local one
    $segmentStartId = 0;
    foreach ($this->_segmentInfos as $segmentInfo) {
        $segmentEndId = $segmentStartId + $segmentInfo->count();

        if ($id < $segmentEndId) {
            $segmentInfo->delete($id - $segmentStartId);
            $this->_hasChanges = true;

            return;
        }

        $segmentStartId = $segmentEndId;
    }

    // The document counter can run ahead of the segment list (addDocument()
    // increments it before a commit). Refuse rather than mark a wrong
    // document in the last segment as deleted.
    throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
}
968
969
970
/**
 * Adds a document to this index.
 *
 * The document is handed to the index writer and the document counter
 * is incremented.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $writer = $this->getIndexWriter();
    $writer->addDocument($document);

    $this->_docCount += 1;
}
981
982
/**
 * Update the document counter.
 *
 * Recomputes it as the sum of count() over all segments.
 */
private function _updateDocCount()
{
    $total = 0;

    foreach ($this->_segmentInfos as $segment) {
        $total += $segment->count();
    }

    $this->_docCount = $total;
}
993
/**
 * Commit changes resulting from delete() or undeleteAll() operations.
 *
 * Segment changes are written only when the change flag is set. If a
 * writer has been instantiated it is committed as well, after which the
 * document counter is recomputed.
 *
 * @todo undeleteAll processing.
 */
public function commit()
{
    if ($this->_hasChanges) {
        foreach ($this->_segmentInfos as $segment) {
            $segment->writeChanges();
        }

        $this->_hasChanges = false;
    }

    if ($this->_writer === null) {
        // Nothing was written through a writer: counter is still valid
        return;
    }

    $this->_writer->commit();

    $this->_updateDocCount();
}
1015
1016
/**
 * Optimize index.
 *
 * Merges all segments into one. Pending changes are committed first; the
 * writer is only asked to optimize when there is more than one segment or
 * the index has deletions.
 */
public function optimize()
{
    // Commit changes if any changes have been made
    $this->commit();

    if (count($this->_segmentInfos) <= 1 && !$this->hasDeletions()) {
        // Single (or no) segment without deletions: nothing to merge
        return;
    }

    $this->getIndexWriter()->optimize();
    $this->_updateDocCount();
}
1032
682d4032 1033
/**
 * Returns an array of all terms in this index.
 *
 * Every segment's term stream is reset and the non-empty ones are placed
 * in a priority queue. Segments are then popped one at a time; a term is
 * recorded unless the segment now on top of the queue is positioned on a
 * term with the same key, so each key is reported once.
 *
 * @return array
 */
public function terms()
{
    $allTerms = array();

    $queue = new Zend_Search_Lucene_Index_SegmentInfoPriorityQueue();

    foreach ($this->_segmentInfos as $segment) {
        $segment->reset();

        // Skip "empty" segments
        if ($segment->currentTerm() === null) {
            continue;
        }

        $queue->put($segment);
    }

    while (($segment = $queue->pop()) !== null) {
        $next = $queue->top();

        if ($next === null ||
            $next->currentTerm()->key() != $segment->currentTerm()->key()) {
            // We got new term
            $allTerms[] = $segment->currentTerm();
        }

        $segment->nextTerm();

        // check, if segment dictionary is finished
        if ($segment->currentTerm() !== null) {
            // Put segment back into the priority queue
            $queue->put($segment);
        }
    }

    return $allTerms;
}
1072
1073
8cfbeb81 1074 /*************************************************************************
1075 @todo UNIMPLEMENTED
1076 *************************************************************************/
/**
 * Undeletes all documents currently marked as deleted in this index.
 *
 * Not implemented: the method body is empty and the call has no effect.
 *
 * @todo Implementation
 */
public function undeleteAll()
{
}
8cfbeb81 1084}