search engine lang file (EN-UTF8)
[moodle.git] / search / Zend / Search / Lucene.php
CommitLineData
682d4032 1<?php
2/**
3 * Zend Framework
4 *
5 * LICENSE
6 *
7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
14 *
15 * @category Zend
16 * @package Zend_Search_Lucene
17 * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
18 * @license http://framework.zend.com/license/new-bsd New BSD License
19 */
20
21
22/** Zend_Search_Lucene_Exception */
23require_once 'Zend/Search/Lucene/Exception.php';
24
25/** Zend_Search_Lucene_Document */
26require_once 'Zend/Search/Lucene/Document.php';
27
28/** Zend_Search_Lucene_Storage_Directory */
29require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php';
30
31/** Zend_Search_Lucene_Index_Term */
32require_once 'Zend/Search/Lucene/Index/Term.php';
33
34/** Zend_Search_Lucene_Index_TermInfo */
35require_once 'Zend/Search/Lucene/Index/TermInfo.php';
36
37/** Zend_Search_Lucene_Index_SegmentInfo */
38require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
39
40/** Zend_Search_Lucene_Index_FieldInfo */
41require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
42
43/** Zend_Search_Lucene_Index_Writer */
44require_once 'Zend/Search/Lucene/Index/Writer.php';
45
46/** Zend_Search_Lucene_Search_QueryParser */
47require_once 'Zend/Search/Lucene/Search/QueryParser.php';
48
49/** Zend_Search_Lucene_Search_QueryHit */
50require_once 'Zend/Search/Lucene/Search/QueryHit.php';
51
52/** Zend_Search_Lucene_Search_Similarity */
53require_once 'Zend/Search/Lucene/Search/Similarity.php';
54
55
56/**
57 * @category Zend
58 * @package Zend_Search_Lucene
59 * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
60 * @license http://framework.zend.com/license/new-bsd New BSD License
61 */
62class Zend_Search_Lucene
63{
64 /**
65 * File system adapter.
66 *
67 * @var Zend_Search_Lucene_Storage_Directory
68 */
69 private $_directory = null;
70
71 /**
72 * File system adapter closing option
73 *
74 * @var boolean
75 */
76 private $_closeDirOnExit = true;
77
78 /**
79 * Writer for this index, not instantiated unless required.
80 *
81 * @var Zend_Search_Lucene_Index_Writer
82 */
83 private $_writer = null;
84
85 /**
86 * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
87 *
88 * @var array Zend_Search_Lucene_Index_SegmentInfo
89 */
90 private $_segmentInfos = array();
91
92 /**
93 * Number of documents in this index.
94 *
95 * @var integer
96 */
97 private $_docCount = 0;
98
99 /**
100 * Flag for index changes
101 *
102 * @var boolean
103 */
104 private $_hasChanges = false;
105
106 /**
107 * Opens the index.
108 *
109 * IndexReader constructor needs Directory as a parameter. It should be
110 * a string with a path to the index folder or a Directory object.
111 *
112 * @param mixed $directory
113 * @throws Zend_Search_Lucene_Exception
114 */
115 public function __construct($directory = null, $create = false)
116 {
117 if ($directory === null) {
118 throw new Zend_Search_Exception('No index directory specified');
119 }
120
121 if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) {
122 $this->_directory = $directory;
123 $this->_closeDirOnExit = false;
124 } else {
125 $this->_directory = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory);
126 $this->_closeDirOnExit = true;
127 }
128
129 if ($create) {
130 $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true);
131 } else {
132 $this->_writer = null;
133 }
134
135 $this->_segmentInfos = array();
136
137 $segmentsFile = $this->_directory->getFileObject('segments');
138
139 $format = $segmentsFile->readInt();
140
141 if ($format != (int)0xFFFFFFFF) {
142 throw new Zend_Search_Lucene_Exception('Wrong segments file format');
143 }
144
145 // read version
146 $segmentsFile->readLong();
147
148 // read counter
149 $segmentsFile->readInt();
150
151 $segments = $segmentsFile->readInt();
152
153 $this->_docCount = 0;
154
155 // read segmentInfos
156 for ($count = 0; $count < $segments; $count++) {
157 $segName = $segmentsFile->readString();
158 $segSize = $segmentsFile->readInt();
159 $this->_docCount += $segSize;
160
161 $this->_segmentInfos[$count] =
162 new Zend_Search_Lucene_Index_SegmentInfo($segName,
163 $segSize,
164 $this->_directory);
165 }
166 }
167
168
169 /**
170 * Object destructor
171 */
172 public function __destruct()
173 {
174 $this->commit();
175
176 if ($this->_closeDirOnExit) {
177 $this->_directory->close();
178 }
179 }
180
181 /**
182 * Returns an instance of Zend_Search_Lucene_Index_Writer for the index
183 *
184 * @return Zend_Search_Lucene_Index_Writer
185 */
186 public function getIndexWriter()
187 {
188 if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
189 $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
190 }
191
192 return $this->_writer;
193 }
194
195
196 /**
197 * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
198 *
199 * @return Zend_Search_Lucene_Storage_Directory
200 */
201 public function getDirectory()
202 {
203 return $this->_directory;
204 }
205
206
207 /**
208 * Returns the total number of documents in this index.
209 *
210 * @return integer
211 */
212 public function count()
213 {
214 return $this->_docCount;
215 }
216
217
218 /**
219 * Performs a query against the index and returns an array
220 * of Zend_Search_Lucene_Search_QueryHit objects.
221 * Input is a string or Zend_Search_Lucene_Search_Query.
222 *
223 * @param mixed $query
224 * @return array ZSearchHit
225 */
226 public function find($query)
227 {
228 if (is_string($query)) {
229 $query = Zend_Search_Lucene_Search_QueryParser::parse($query);
230 }
231
232 if (!$query instanceof Zend_Search_Lucene_Search_Query) {
233 throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object');
234 }
235
236 $this->commit();
237
238 $hits = array();
239 $scores = array();
240
241 $docNum = $this->count();
242 for( $count=0; $count < $docNum; $count++ ) {
243 $docScore = $query->score( $count, $this);
244 if( $docScore != 0 ) {
245 $hit = new Zend_Search_Lucene_Search_QueryHit($this);
246 $hit->id = $count;
247 $hit->score = $docScore;
248
249 $hits[] = $hit;
250 $scores[] = $docScore;
251 }
252 }
253 array_multisort($scores, SORT_DESC, SORT_REGULAR, $hits);
254
255 return $hits;
256 }
257
258
259 /**
260 * Returns a list of all unique field names that exist in this index.
261 *
262 * @param boolean $indexed
263 * @return array
264 */
265 public function getFieldNames($indexed = false)
266 {
267 $result = array();
268 foreach( $this->_segmentInfos as $segmentInfo ) {
269 $result = array_merge($result, $segmentInfo->getFields($indexed));
270 }
271 return $result;
272 }
273
274
275 /**
276 * Returns a Zend_Search_Lucene_Document object for the document
277 * number $id in this index.
278 *
279 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
280 * @return Zend_Search_Lucene_Document
281 */
282 public function getDocument($id)
283 {
284 if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
285 /* @var $id Zend_Search_Lucene_Search_QueryHit */
286 $id = $id->id;
287 }
288
289 if ($id >= $this->_docCount) {
290 throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
291 }
292
293 $segCount = 0;
294 $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count();
295 while( $nextSegmentStartId <= $id ) {
296 $segCount++;
297 $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count();
298 }
299 $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count();
300
301 $fdxFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdx');
302 $fdxFile->seek( ($id-$segmentStartId)*8, SEEK_CUR );
303 $fieldValuesPosition = $fdxFile->readLong();
304
305 $fdtFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdt');
306 $fdtFile->seek( $fieldValuesPosition, SEEK_CUR );
307 $fieldCount = $fdtFile->readVInt();
308
309 $doc = new Zend_Search_Lucene_Document();
310 for( $count = 0; $count < $fieldCount; $count++ ) {
311 $fieldNum = $fdtFile->readVInt();
312 $bits = $fdtFile->readByte();
313
314 $fieldInfo = $this->_segmentInfos[ $segCount ]->getField($fieldNum);
315
316 if( !($bits & 2) ) { // Text data
317 $field = new Zend_Search_Lucene_Field($fieldInfo->name,
318 $fdtFile->readString(),
319 true,
320 $fieldInfo->isIndexed,
321 $bits & 1 );
322 } else {
323 $field = new Zend_Search_Lucene_Field($fieldInfo->name,
324 $fdtFile->readBinary(),
325 true,
326 $fieldInfo->isIndexed,
327 $bits & 1 );
328 }
329
330 $doc->addField($field);
331 }
332
333 return $doc;
334 }
335
336
337 /**
338 * Returns an array of all the documents which contain term.
339 *
340 * @param Zend_Search_Lucene_Index_Term $term
341 * @return array
342 */
343 public function termDocs(Zend_Search_Lucene_Index_Term $term)
344 {
345 $result = array();
346 $segmentStartDocId = 0;
347
348 foreach ($this->_segmentInfos as $segInfo) {
349 $termInfo = $segInfo->getTermInfo($term);
350
351 if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
352 $segmentStartDocId += $segInfo->count();
353 continue;
354 }
355
356 $frqFile = $segInfo->openCompoundFile('.frq');
357 $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
358 $docId = 0;
359 for( $count=0; $count < $termInfo->docFreq; $count++ ) {
360 $docDelta = $frqFile->readVInt();
361 if( $docDelta % 2 == 1 ) {
362 $docId += ($docDelta-1)/2;
363 } else {
364 $docId += $docDelta/2;
365 // read freq
366 $frqFile->readVInt();
367 }
368
369 $result[] = $segmentStartDocId + $docId;
370 }
371
372 $segmentStartDocId += $segInfo->count();
373 }
374
375 return $result;
376 }
377
378
379 /**
380 * Returns an array of all term positions in the documents.
381 * Return array structure: array( docId => array( pos1, pos2, ...), ...)
382 *
383 * @param Zend_Search_Lucene_Index_Term $term
384 * @return array
385 */
386 public function termPositions(Zend_Search_Lucene_Index_Term $term)
387 {
388 $result = array();
389 $segmentStartDocId = 0;
390 foreach( $this->_segmentInfos as $segInfo ) {
391 $termInfo = $segInfo->getTermInfo($term);
392
393 if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
394 $segmentStartDocId += $segInfo->count();
395 continue;
396 }
397
398 $frqFile = $segInfo->openCompoundFile('.frq');
399 $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
400 $freqs = array();
401 $docId = 0;
402
403 for( $count = 0; $count < $termInfo->docFreq; $count++ ) {
404 $docDelta = $frqFile->readVInt();
405 if( $docDelta % 2 == 1 ) {
406 $docId += ($docDelta-1)/2;
407 $freqs[ $docId ] = 1;
408 } else {
409 $docId += $docDelta/2;
410 $freqs[ $docId ] = $frqFile->readVInt();
411 }
412 }
413
414 $prxFile = $segInfo->openCompoundFile('.prx');
415 $prxFile->seek($termInfo->proxPointer,SEEK_CUR);
416 foreach ($freqs as $docId => $freq) {
417 $termPosition = 0;
418 $positions = array();
419
420 for ($count = 0; $count < $freq; $count++ ) {
421 $termPosition += $prxFile->readVInt();
422 $positions[] = $termPosition;
423 }
424
425 $result[ $segmentStartDocId + $docId ] = $positions;
426 }
427
428 $segmentStartDocId += $segInfo->count();
429 }
430
431 return $result;
432 }
433
434
435 /**
436 * Returns the number of documents in this index containing the $term.
437 *
438 * @param Zend_Search_Lucene_Index_Term $term
439 * @return integer
440 */
441 public function docFreq(Zend_Search_Lucene_Index_Term $term)
442 {
443 $result = 0;
444 foreach ($this->_segmentInfos as $segInfo) {
445 $termInfo = $segInfo->getTermInfo($term);
446 if ($termInfo !== null) {
447 $result += $termInfo->docFreq;
448 }
449 }
450
451 return $result;
452 }
453
454
455 /**
456 * Retrive similarity used by index reader
457 *
458 * @return Zend_Search_Lucene_Search_Similarity
459 */
460 public function getSimilarity()
461 {
462 return Zend_Search_Lucene_Search_Similarity::getDefault();
463 }
464
465
466 /**
467 * Returns a normalization factor for "field, document" pair.
468 *
469 * @param integer $id
470 * @param string $fieldName
471 * @return Zend_Search_Lucene_Document
472 */
473 public function norm( $id, $fieldName )
474 {
475 if ($id >= $this->_docCount) {
476 return null;
477 }
478
479 $segmentStartId = 0;
480 foreach ($this->_segmentInfos as $segInfo) {
481 if ($segmentStartId + $segInfo->count() > $id) {
482 break;
483 }
484
485 $segmentStartId += $segInfo->count();
486 }
487
488 if ($segInfo->isDeleted($id - $segmentStartId)) {
489 return 0;
490 }
491
492 return $segInfo->norm($id - $segmentStartId, $fieldName);
493 }
494
495 /**
496 * Returns true if any documents have been deleted from this index.
497 *
498 * @return boolean
499 */
500 public function hasDeletions()
501 {
502 foreach ($this->_segmentInfos as $segmentInfo) {
503 if ($segmentInfo->hasDeletions()) {
504 return true;
505 }
506 }
507
508 return false;
509 }
510
511
512 /**
513 * Deletes a document from the index.
514 * $id is an internal document id
515 *
516 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
517 * @throws Zend_Search_Lucene_Exception
518 */
519 public function delete($id)
520 {
521 if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
522 /* @var $id Zend_Search_Lucene_Search_QueryHit */
523 $id = $id->id;
524 }
525
526 if ($id >= $this->_docCount) {
527 throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
528 }
529
530 $segCount = 0;
531 $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count();
532 while( $nextSegmentStartId <= $id ) {
533 $segCount++;
534 $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count();
535 }
536
537 $this->_hasChanges = true;
538 $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count();
539 $this->_segmentInfos[ $segCount ]->delete($id - $segmentStartId);
540 }
541
542
543
544 /**
545 * Adds a document to this index.
546 *
547 * @param Zend_Search_Lucene_Document $document
548 */
549 public function addDocument(Zend_Search_Lucene_Document $document)
550 {
551 if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
552 $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
553 }
554
555 $this->_writer->addDocument($document);
556 }
557
558
559 /**
560 * Commit changes resulting from delete() or undeleteAll() operations.
561 *
562 * @todo delete() and undeleteAll processing.
563 */
564 public function commit()
565 {
566 if ($this->_hasChanges) {
567 foreach ($this->_segmentInfos as $segInfo) {
568 $segInfo->writeChanges();
569 }
570
571 $this->_hasChanges = false;
572 }
573
574 if ($this->_writer !== null) {
575 foreach ($this->_writer->commit() as $segmentName => $segmentInfo) {
576 if ($segmentInfo !== null) {
577 $this->_segmentInfos[] = $segmentInfo;
578 $this->_docCount += $segmentInfo->count();
579 } else {
580 foreach ($this->_segmentInfos as $segId => $segInfo) {
581 if ($segInfo->getName() == $segmentName) {
582 unset($this->_segmentInfos[$segId]);
583 }
584 }
585 }
586 }
587 }
588 }
589
590
591 /*************************************************************************
592 @todo UNIMPLEMENTED
593 *************************************************************************/
594
595 /**
596 * Returns an array of all terms in this index.
597 *
598 * @todo Implementation
599 * @return array
600 */
601 public function terms()
602 {
603 return array();
604 }
605
606
607 /**
608 * Undeletes all documents currently marked as deleted in this index.
609 *
610 * @todo Implementation
611 */
612 public function undeleteAll()
613 {}
614}