682d4032 |
1 | <?php |
2 | /** |
3 | * Zend Framework |
4 | * |
5 | * LICENSE |
6 | * |
7 | * This source file is subject to the new BSD license that is bundled |
8 | * with this package in the file LICENSE.txt. |
9 | * It is also available through the world-wide-web at this URL: |
10 | * http://framework.zend.com/license/new-bsd |
11 | * If you did not receive a copy of the license and are unable to |
12 | * obtain it through the world-wide-web, please send an email |
13 | * to license@zend.com so we can send you a copy immediately. |
14 | * |
15 | * @category Zend |
16 | * @package Zend_Search_Lucene |
17 | * @copyright Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com) |
18 | * @license http://framework.zend.com/license/new-bsd New BSD License |
19 | */ |
20 | |
21 | |
22 | /** Zend_Search_Lucene_Exception */ |
23 | require_once 'Zend/Search/Lucene/Exception.php'; |
24 | |
25 | /** Zend_Search_Lucene_Document */ |
26 | require_once 'Zend/Search/Lucene/Document.php'; |
27 | |
28 | /** Zend_Search_Lucene_Storage_Directory */ |
29 | require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php'; |
30 | |
31 | /** Zend_Search_Lucene_Index_Term */ |
32 | require_once 'Zend/Search/Lucene/Index/Term.php'; |
33 | |
34 | /** Zend_Search_Lucene_Index_TermInfo */ |
35 | require_once 'Zend/Search/Lucene/Index/TermInfo.php'; |
36 | |
37 | /** Zend_Search_Lucene_Index_SegmentInfo */ |
38 | require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; |
39 | |
40 | /** Zend_Search_Lucene_Index_FieldInfo */ |
41 | require_once 'Zend/Search/Lucene/Index/FieldInfo.php'; |
42 | |
43 | /** Zend_Search_Lucene_Index_Writer */ |
44 | require_once 'Zend/Search/Lucene/Index/Writer.php'; |
45 | |
46 | /** Zend_Search_Lucene_Search_QueryParser */ |
47 | require_once 'Zend/Search/Lucene/Search/QueryParser.php'; |
48 | |
49 | /** Zend_Search_Lucene_Search_QueryHit */ |
50 | require_once 'Zend/Search/Lucene/Search/QueryHit.php'; |
51 | |
52 | /** Zend_Search_Lucene_Search_Similarity */ |
53 | require_once 'Zend/Search/Lucene/Search/Similarity.php'; |
54 | |
55 | |
/**
 * Entry point for reading, searching and modifying one Lucene-format index.
 *
 * On construction the 'segments' file of the index directory is read and one
 * Zend_Search_Lucene_Index_SegmentInfo object is created per listed segment.
 * Index-wide document ids are formed by laying the segments end to end in
 * list order: a document's id is the sum of the sizes of all preceding
 * segments plus its position inside its own segment.
 *
 * Additions go through a lazily created Zend_Search_Lucene_Index_Writer;
 * deletions are recorded on the segment objects. Both are flushed by
 * commit(), which is also invoked before every search and on destruction.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene
{
    /**
     * Storage adapter the index files are read from and written to.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    private $_directory = null;

    /**
     * Whether the destructor must close $_directory. True only when the
     * directory object was created by this class from a path.
     *
     * @var boolean
     */
    private $_closeDirOnExit = true;

    /**
     * Writer for this index, not instantiated unless required.
     *
     * @var Zend_Search_Lucene_Index_Writer
     */
    private $_writer = null;

    /**
     * Segments of this index, in document-id order.
     *
     * @var array of Zend_Search_Lucene_Index_SegmentInfo
     */
    private $_segmentInfos = array();

    /**
     * Number of documents in this index (sum of all segment sizes).
     *
     * @var integer
     */
    private $_docCount = 0;

    /**
     * True while there are deletions that have not been written by commit().
     *
     * @var boolean
     */
    private $_hasChanges = false;

    /**
     * Opens the index.
     *
     * $directory is either a path to the index folder or an already opened
     * Zend_Search_Lucene_Storage_Directory_Filesystem object. An object
     * passed in by the caller is never closed by this class.
     *
     * @param mixed   $directory path string or directory object
     * @param boolean $create    when true a writer is created in "create" mode
     *                           before the segments file is read
     * @throws Zend_Search_Lucene_Exception when no directory is given or the
     *                                      segments file has an unexpected format
     */
    public function __construct($directory = null, $create = false)
    {
        if ($directory === null) {
            // Throw the documented exception type, which this file loads
            // explicitly, so callers catching it actually see this failure.
            throw new Zend_Search_Lucene_Exception('No index directory specified');
        }

        if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) {
            // The caller owns the directory object and is responsible for closing it.
            $this->_directory      = $directory;
            $this->_closeDirOnExit = false;
        } else {
            $this->_directory      = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory);
            $this->_closeDirOnExit = true;
        }

        $this->_writer = $create
                       ? new Zend_Search_Lucene_Index_Writer($this->_directory, true)
                       : null;

        $this->_segmentInfos = array();
        $this->_docCount     = 0;

        $segmentsFile = $this->_directory->getFileObject('segments');

        // NOTE(review): this comparison relies on how readInt() represents the
        // 0xFFFFFFFF marker on the running platform - confirm on 64-bit builds.
        $format = $segmentsFile->readInt();
        if ($format != (int)0xFFFFFFFF) {
            throw new Zend_Search_Lucene_Exception('Wrong segments file format');
        }

        // The version and the name counter are stored next; they are read
        // only to advance the file position.
        $segmentsFile->readLong();
        $segmentsFile->readInt();

        $segments = $segmentsFile->readInt();

        // Each segment entry is a name followed by its document count.
        for ($count = 0; $count < $segments; $count++) {
            $segName = $segmentsFile->readString();
            $segSize = $segmentsFile->readInt();

            $this->_docCount            += $segSize;
            $this->_segmentInfos[$count] = new Zend_Search_Lucene_Index_SegmentInfo($segName,
                                                                                    $segSize,
                                                                                    $this->_directory);
        }
    }


    /**
     * Object destructor.
     *
     * Flushes pending changes and closes the directory if this object
     * created it.
     */
    public function __destruct()
    {
        $this->commit();

        if ($this->_closeDirOnExit) {
            $this->_directory->close();
        }
    }

    /**
     * Returns the Zend_Search_Lucene_Index_Writer for the index, creating it
     * on first use.
     *
     * @return Zend_Search_Lucene_Index_Writer
     */
    public function getIndexWriter()
    {
        if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
        }

        return $this->_writer;
    }


    /**
     * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
     *
     * @return Zend_Search_Lucene_Storage_Directory
     */
    public function getDirectory()
    {
        return $this->_directory;
    }


    /**
     * Returns the total number of documents in this index.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }


    /**
     * Performs a query against the index and returns an array
     * of Zend_Search_Lucene_Search_QueryHit objects, best score first.
     * Input is a string or Zend_Search_Lucene_Search_Query.
     *
     * Pending changes are committed before searching. Every document id is
     * scored; documents with a score of 0 are left out. Hits with equal
     * scores are returned in ascending document-id order.
     *
     * @param mixed $query
     * @return array of Zend_Search_Lucene_Search_QueryHit
     * @throws Zend_Search_Lucene_Exception when $query has an unsupported type
     */
    public function find($query)
    {
        if (is_string($query)) {
            $query = Zend_Search_Lucene_Search_QueryParser::parse($query);
        }

        if (!$query instanceof Zend_Search_Lucene_Search_Query) {
            throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object');
        }

        $this->commit();

        $hits   = array();
        $scores = array();
        $ids    = array();

        $docNum = $this->count();
        for ($id = 0; $id < $docNum; $id++) {
            $docScore = $query->score($id, $this);
            if ($docScore == 0) {
                continue;
            }

            $hit        = new Zend_Search_Lucene_Search_QueryHit($this);
            $hit->id    = $id;
            $hit->score = $docScore;

            $hits[]   = $hit;
            $scores[] = $docScore;
            $ids[]    = $id;
        }

        // The unique id column breaks score ties. Without it array_multisort
        // would fall back to comparing the hit objects themselves, which is
        // not a meaningful order and walks each hit's reference to the index.
        array_multisort($scores, SORT_DESC, SORT_REGULAR,
                        $ids,    SORT_ASC,  SORT_NUMERIC,
                        $hits);

        return $hits;
    }


    /**
     * Returns a list of all unique field names that exist in this index.
     *
     * @param boolean $indexed passed through to each segment's getFields()
     * @return array
     */
    public function getFieldNames($indexed = false)
    {
        $result = array();
        foreach ($this->_segmentInfos as $segmentInfo) {
            $result = array_merge($result, $segmentInfo->getFields($indexed));
        }

        // The same field usually exists in several segments; keep the first
        // occurrence of each name only. (Assumes getFields() returns plain
        // name strings - array_unique() leaves existing keys untouched.)
        return array_unique($result);
    }


    /**
     * Returns a Zend_Search_Lucene_Document object for the document
     * number $id in this index.
     *
     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
     * @return Zend_Search_Lucene_Document
     * @throws Zend_Search_Lucene_Exception when $id is outside the index
     */
    public function getDocument($id)
    {
        if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
            /* @var $id Zend_Search_Lucene_Search_QueryHit */
            $id = $id->id;
        }

        $location = $this->_locateDocument($id);
        if ($location === null) {
            throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
        }
        list($segInfo, $localId) = $location;

        // The field index holds one 8-byte entry per document: the position
        // of that document's stored fields inside the field data file.
        $fdxFile = $segInfo->openCompoundFile('.fdx');
        $fdxFile->seek($localId * 8, SEEK_CUR);
        $fieldValuesPosition = $fdxFile->readLong();

        $fdtFile = $segInfo->openCompoundFile('.fdt');
        $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
        $fieldCount = $fdtFile->readVInt();

        $doc = new Zend_Search_Lucene_Document();
        for ($count = 0; $count < $fieldCount; $count++) {
            $fieldNum = $fdtFile->readVInt();
            $bits     = $fdtFile->readByte();

            $fieldInfo = $segInfo->getField($fieldNum);

            // Bit 2 selects binary data over text; bit 1 is handed to the
            // field as its fifth constructor argument.
            $value = ($bits & 2) ? $fdtFile->readBinary() : $fdtFile->readString();

            $doc->addField(new Zend_Search_Lucene_Field($fieldInfo->name,
                                                        $value,
                                                        true,
                                                        $fieldInfo->isIndexed,
                                                        $bits & 1));
        }

        return $doc;
    }


    /**
     * Returns an array of all the documents which contain term.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return array index-wide document ids
     */
    public function termDocs(Zend_Search_Lucene_Index_Term $term)
    {
        $result            = array();
        $segmentStartDocId = 0;

        foreach ($this->_segmentInfos as $segInfo) {
            $termInfo = $segInfo->getTermInfo($term);

            if ($termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
                $frqFile = $segInfo->openCompoundFile('.frq');
                $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

                $docId = 0;
                for ($count = 0; $count < $termInfo->docFreq; $count++) {
                    $docDelta = $frqFile->readVInt();
                    if ($docDelta % 2 == 1) {
                        // Odd value: no separate frequency entry follows.
                        $docId += ($docDelta - 1) / 2;
                    } else {
                        $docId += $docDelta / 2;
                        // Skip the frequency, it is not needed here.
                        $frqFile->readVInt();
                    }

                    $result[] = $segmentStartDocId + $docId;
                }
            }

            // Advance the id offset whether or not the term occurs in this segment.
            $segmentStartDocId += $segInfo->count();
        }

        return $result;
    }


    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return array
     */
    public function termPositions(Zend_Search_Lucene_Index_Term $term)
    {
        $result            = array();
        $segmentStartDocId = 0;

        foreach ($this->_segmentInfos as $segInfo) {
            $termInfo = $segInfo->getTermInfo($term);

            if ($termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
                // First pass: per-document frequencies from the .frq data.
                $frqFile = $segInfo->openCompoundFile('.frq');
                $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

                $freqs = array();
                $docId = 0;
                for ($count = 0; $count < $termInfo->docFreq; $count++) {
                    $docDelta = $frqFile->readVInt();
                    if ($docDelta % 2 == 1) {
                        // Odd value: the term occurs exactly once.
                        $docId         += ($docDelta - 1) / 2;
                        $freqs[$docId]  = 1;
                    } else {
                        $docId         += $docDelta / 2;
                        $freqs[$docId]  = $frqFile->readVInt();
                    }
                }

                // Second pass: positions are stored as deltas, one run of
                // $freq values per document, in the same document order.
                $prxFile = $segInfo->openCompoundFile('.prx');
                $prxFile->seek($termInfo->proxPointer, SEEK_CUR);

                foreach ($freqs as $localDocId => $freq) {
                    $termPosition = 0;
                    $positions    = array();

                    for ($occurrence = 0; $occurrence < $freq; $occurrence++) {
                        $termPosition += $prxFile->readVInt();
                        $positions[]   = $termPosition;
                    }

                    $result[$segmentStartDocId + $localDocId] = $positions;
                }
            }

            $segmentStartDocId += $segInfo->count();
        }

        return $result;
    }


    /**
     * Returns the number of documents in this index containing the $term.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return integer
     */
    public function docFreq(Zend_Search_Lucene_Index_Term $term)
    {
        $result = 0;
        foreach ($this->_segmentInfos as $segInfo) {
            $termInfo = $segInfo->getTermInfo($term);

            // Same "term found" test as termDocs() and termPositions().
            if ($termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
                $result += $termInfo->docFreq;
            }
        }

        return $result;
    }


    /**
     * Retrieve similarity used by index reader
     *
     * @return Zend_Search_Lucene_Search_Similarity
     */
    public function getSimilarity()
    {
        return Zend_Search_Lucene_Search_Similarity::getDefault();
    }


    /**
     * Returns a normalization factor for "field, document" pair.
     *
     * @param integer $id
     * @param string $fieldName
     * @return mixed null when $id is outside the index, 0 when the document
     *               is deleted, otherwise the value of the segment's norm()
     */
    public function norm($id, $fieldName)
    {
        $location = $this->_locateDocument($id);
        if ($location === null) {
            return null;
        }
        list($segInfo, $localId) = $location;

        if ($segInfo->isDeleted($localId)) {
            return 0;
        }

        return $segInfo->norm($localId, $fieldName);
    }

    /**
     * Returns true if any documents have been deleted from this index.
     *
     * @return boolean
     */
    public function hasDeletions()
    {
        foreach ($this->_segmentInfos as $segmentInfo) {
            if ($segmentInfo->hasDeletions()) {
                return true;
            }
        }

        return false;
    }


    /**
     * Deletes a document from the index.
     * $id is an internal document id
     *
     * The deletion is recorded on the owning segment and written by the
     * next commit().
     *
     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
     * @throws Zend_Search_Lucene_Exception when $id is outside the index
     */
    public function delete($id)
    {
        if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
            /* @var $id Zend_Search_Lucene_Search_QueryHit */
            $id = $id->id;
        }

        $location = $this->_locateDocument($id);
        if ($location === null) {
            throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
        }
        list($segInfo, $localId) = $location;

        $this->_hasChanges = true;
        $segInfo->delete($localId);
    }



    /**
     * Adds a document to this index.
     *
     * @param Zend_Search_Lucene_Document $document
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        $this->getIndexWriter()->addDocument($document);
    }


    /**
     * Writes pending deletions and picks up segment changes made by the writer.
     *
     * The writer's commit() result is treated as a map of segment name to
     * either a new segment object (to be added to this index) or null
     * (a segment with that name is no longer part of the index).
     */
    public function commit()
    {
        if ($this->_hasChanges) {
            foreach ($this->_segmentInfos as $segInfo) {
                $segInfo->writeChanges();
            }

            $this->_hasChanges = false;
        }

        if ($this->_writer === null) {
            return;
        }

        $segmentsRemoved = false;

        foreach ($this->_writer->commit() as $segmentName => $segmentInfo) {
            if ($segmentInfo !== null) {
                $this->_segmentInfos[] = $segmentInfo;
                $this->_docCount      += $segmentInfo->count();
                continue;
            }

            foreach ($this->_segmentInfos as $segId => $segInfo) {
                if ($segInfo->getName() == $segmentName) {
                    // Keep the document count equal to the sum of the
                    // remaining segment sizes, which id lookups rely on.
                    $this->_docCount -= $segInfo->count();
                    unset($this->_segmentInfos[$segId]);
                    $segmentsRemoved = true;
                }
            }
        }

        if ($segmentsRemoved) {
            // unset() leaves gaps in the keys; restore a dense 0..n-1 list.
            $this->_segmentInfos = array_values($this->_segmentInfos);
        }
    }


    /**
     * Finds the segment that holds an index-wide document id.
     *
     * @param integer $id index-wide document id
     * @return array|null array(segment info object, id local to that segment),
     *                    or null when $id does not belong to any segment
     */
    private function _locateDocument($id)
    {
        if ($id < 0 || $id >= $this->_docCount) {
            return null;
        }

        $segmentStartId = 0;
        foreach ($this->_segmentInfos as $segInfo) {
            $segmentSize = $segInfo->count();
            if ($id < $segmentStartId + $segmentSize) {
                return array($segInfo, $id - $segmentStartId);
            }

            $segmentStartId += $segmentSize;
        }

        return null;
    }


    /*************************************************************************
    @todo UNIMPLEMENTED
    *************************************************************************/

    /**
     * Returns an array of all terms in this index.
     *
     * Not implemented yet: always returns an empty array.
     *
     * @todo Implementation
     * @return array
     */
    public function terms()
    {
        return array();
    }


    /**
     * Undeletes all documents currently marked as deleted in this index.
     *
     * Not implemented yet: calling it has no effect.
     *
     * @todo Implementation
     */
    public function undeleteAll()
    {}
}