Initial commit
[moodle.git] / search / Zend / Search / Lucene / Search / Similarity.php
1 <?php
2 /**
3  * Zend Framework
4  *
5  * LICENSE
6  *
7  * This source file is subject to the new BSD license that is bundled
8  * with this package in the file LICENSE.txt.
9  * It is also available through the world-wide-web at this URL:
10  * http://framework.zend.com/license/new-bsd
11  * If you did not receive a copy of the license and are unable to
12  * obtain it through the world-wide-web, please send an email
13  * to license@zend.com so we can send you a copy immediately.
14  *
15  * @category   Zend
16  * @package    Zend_Search_Lucene
17  * @subpackage Search
18  * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
20  */
23 /** Zend_Search_Lucene_Search_Similarity_Default */
24 require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
27 /**
28  * @category   Zend
29  * @package    Zend_Search_Lucene
30  * @subpackage Search
31  * @copyright  Copyright (c) 2006 Zend Technologies USA Inc. (http://www.zend.com)
32  * @license    http://framework.zend.com/license/new-bsd     New BSD License
33  */
34 abstract class Zend_Search_Lucene_Search_Similarity
35 {
36     /**
37      * The Similarity implementation used by default.
38      *
39      * @var Zend_Search_Lucene_Search_Similarity
40      */
41     static private $_defaultImpl;
43     /**
44      * Lookup table of decoded norm values.
45      * Array of 256 floats indexed by the encoded norm byte (0-255)
46      *
47      * @var array
48      */
49     static private $_normTable = array( 0   => 0.0,
50                                         1   => 5.820766E-10,
51                                         2   => 6.9849193E-10,
52                                         3   => 8.1490725E-10,
53                                         4   => 9.313226E-10,
54                                         5   => 1.1641532E-9,
55                                         6   => 1.3969839E-9,
56                                         7   => 1.6298145E-9,
57                                         8   => 1.8626451E-9,
58                                         9   => 2.3283064E-9,
59                                         10  => 2.7939677E-9,
60                                         11  => 3.259629E-9,
61                                         12  => 3.7252903E-9,
62                                         13  => 4.656613E-9,
63                                         14  => 5.5879354E-9,
64                                         15  => 6.519258E-9,
65                                         16  => 7.4505806E-9,
66                                         17  => 9.313226E-9,
67                                         18  => 1.1175871E-8,
68                                         19  => 1.3038516E-8,
69                                         20  => 1.4901161E-8,
70                                         21  => 1.8626451E-8,
71                                         22  => 2.2351742E-8,
72                                         23  => 2.6077032E-8,
73                                         24  => 2.9802322E-8,
74                                         25  => 3.7252903E-8,
75                                         26  => 4.4703484E-8,
76                                         27  => 5.2154064E-8,
77                                         28  => 5.9604645E-8,
78                                         29  => 7.4505806E-8,
79                                         30  => 8.940697E-8,
80                                         31  => 1.0430813E-7,
81                                         32  => 1.1920929E-7,
82                                         33  => 1.4901161E-7,
83                                         34  => 1.7881393E-7,
84                                         35  => 2.0861626E-7,
85                                         36  => 2.3841858E-7,
86                                         37  => 2.9802322E-7,
87                                         38  => 3.5762787E-7,
88                                         39  => 4.172325E-7,
89                                         40  => 4.7683716E-7,
90                                         41  => 5.9604645E-7,
91                                         42  => 7.1525574E-7,
92                                         43  => 8.34465E-7,
93                                         44  => 9.536743E-7,
94                                         45  => 1.1920929E-6,
95                                         46  => 1.4305115E-6,
96                                         47  => 1.66893E-6,
97                                         48  => 1.9073486E-6,
98                                         49  => 2.3841858E-6,
99                                         50  => 2.861023E-6,
100                                         51  => 3.33786E-6,
101                                         52  => 3.8146973E-6,
102                                         53  => 4.7683716E-6,
103                                         54  => 5.722046E-6,
104                                         55  => 6.67572E-6,
105                                         56  => 7.6293945E-6,
106                                         57  => 9.536743E-6,
107                                         58  => 1.1444092E-5,
108                                         59  => 1.335144E-5,
109                                         60  => 1.5258789E-5,
110                                         61  => 1.9073486E-5,
111                                         62  => 2.2888184E-5,
112                                         63  => 2.670288E-5,
113                                         64  => 3.0517578E-5,
114                                         65  => 3.8146973E-5,
115                                         66  => 4.5776367E-5,
116                                         67  => 5.340576E-5,
117                                         68  => 6.1035156E-5,
118                                         69  => 7.6293945E-5,
119                                         70  => 9.1552734E-5,
120                                         71  => 1.0681152E-4,
121                                         72  => 1.2207031E-4,
122                                         73  => 1.5258789E-4,
123                                         74  => 1.8310547E-4,
124                                         75  => 2.1362305E-4,
125                                         76  => 2.4414062E-4,
126                                         77  => 3.0517578E-4,
127                                         78  => 3.6621094E-4,
128                                         79  => 4.272461E-4,
129                                         80  => 4.8828125E-4,
130                                         81  => 6.1035156E-4,
131                                         82  => 7.324219E-4,
132                                         83  => 8.544922E-4,
133                                         84  => 9.765625E-4,
134                                         85  => 0.0012207031,
135                                         86  => 0.0014648438,
136                                         87  => 0.0017089844,
137                                         88  => 0.001953125,
138                                         89  => 0.0024414062,
139                                         90  => 0.0029296875,
140                                         91  => 0.0034179688,
141                                         92  => 0.00390625,
142                                         93  => 0.0048828125,
143                                         94  => 0.005859375,
144                                         95  => 0.0068359375,
145                                         96  => 0.0078125,
146                                         97  => 0.009765625,
147                                         98  => 0.01171875,
148                                         99  => 0.013671875,
149                                         100 => 0.015625,
150                                         101 => 0.01953125,
151                                         102 => 0.0234375,
152                                         103 => 0.02734375,
153                                         104 => 0.03125,
154                                         105 => 0.0390625,
155                                         106 => 0.046875,
156                                         107 => 0.0546875,
157                                         108 => 0.0625,
158                                         109 => 0.078125,
159                                         110 => 0.09375,
160                                         111 => 0.109375,
161                                         112 => 0.125,
162                                         113 => 0.15625,
163                                         114 => 0.1875,
164                                         115 => 0.21875,
165                                         116 => 0.25,
166                                         117 => 0.3125,
167                                         118 => 0.375,
168                                         119 => 0.4375,
169                                         120 => 0.5,
170                                         121 => 0.625,
171                                         122 => 0.75,
172                                         123 => 0.875,
173                                         124 => 1.0,
174                                         125 => 1.25,
175                                         126 => 1.5,
176                                         127 => 1.75,
177                                         128 => 2.0,
178                                         129 => 2.5,
179                                         130 => 3.0,
180                                         131 => 3.5,
181                                         132 => 4.0,
182                                         133 => 5.0,
183                                         134 => 6.0,
184                                         135 => 7.0,
185                                         136 => 8.0,
186                                         137 => 10.0,
187                                         138 => 12.0,
188                                         139 => 14.0,
189                                         140 => 16.0,
190                                         141 => 20.0,
191                                         142 => 24.0,
192                                         143 => 28.0,
193                                         144 => 32.0,
194                                         145 => 40.0,
195                                         146 => 48.0,
196                                         147 => 56.0,
197                                         148 => 64.0,
198                                         149 => 80.0,
199                                         150 => 96.0,
200                                         151 => 112.0,
201                                         152 => 128.0,
202                                         153 => 160.0,
203                                         154 => 192.0,
204                                         155 => 224.0,
205                                         156 => 256.0,
206                                         157 => 320.0,
207                                         158 => 384.0,
208                                         159 => 448.0,
209                                         160 => 512.0,
210                                         161 => 640.0,
211                                         162 => 768.0,
212                                         163 => 896.0,
213                                         164 => 1024.0,
214                                         165 => 1280.0,
215                                         166 => 1536.0,
216                                         167 => 1792.0,
217                                         168 => 2048.0,
218                                         169 => 2560.0,
219                                         170 => 3072.0,
220                                         171 => 3584.0,
221                                         172 => 4096.0,
222                                         173 => 5120.0,
223                                         174 => 6144.0,
224                                         175 => 7168.0,
225                                         176 => 8192.0,
226                                         177 => 10240.0,
227                                         178 => 12288.0,
228                                         179 => 14336.0,
229                                         180 => 16384.0,
230                                         181 => 20480.0,
231                                         182 => 24576.0,
232                                         183 => 28672.0,
233                                         184 => 32768.0,
234                                         185 => 40960.0,
235                                         186 => 49152.0,
236                                         187 => 57344.0,
237                                         188 => 65536.0,
238                                         189 => 81920.0,
239                                         190 => 98304.0,
240                                         191 => 114688.0,
241                                         192 => 131072.0,
242                                         193 => 163840.0,
243                                         194 => 196608.0,
244                                         195 => 229376.0,
245                                         196 => 262144.0,
246                                         197 => 327680.0,
247                                         198 => 393216.0,
248                                         199 => 458752.0,
249                                         200 => 524288.0,
250                                         201 => 655360.0,
251                                         202 => 786432.0,
252                                         203 => 917504.0,
253                                         204 => 1048576.0,
254                                         205 => 1310720.0,
255                                         206 => 1572864.0,
256                                         207 => 1835008.0,
257                                         208 => 2097152.0,
258                                         209 => 2621440.0,
259                                         210 => 3145728.0,
260                                         211 => 3670016.0,
261                                         212 => 4194304.0,
262                                         213 => 5242880.0,
263                                         214 => 6291456.0,
264                                         215 => 7340032.0,
265                                         216 => 8388608.0,
266                                         217 => 1.048576E7,
267                                         218 => 1.2582912E7,
268                                         219 => 1.4680064E7,
269                                         220 => 1.6777216E7,
270                                         221 => 2.097152E7,
271                                         222 => 2.5165824E7,
272                                         223 => 2.9360128E7,
273                                         224 => 3.3554432E7,
274                                         225 => 4.194304E7,
275                                         226 => 5.0331648E7,
276                                         227 => 5.8720256E7,
277                                         228 => 6.7108864E7,
278                                         229 => 8.388608E7,
279                                         230 => 1.00663296E8,
280                                         231 => 1.17440512E8,
281                                         232 => 1.34217728E8,
282                                         233 => 1.6777216E8,
283                                         234 => 2.01326592E8,
284                                         235 => 2.34881024E8,
285                                         236 => 2.68435456E8,
286                                         237 => 3.3554432E8,
287                                         238 => 4.02653184E8,
288                                         239 => 4.69762048E8,
289                                         240 => 5.3687091E8,
290                                         241 => 6.7108864E8,
291                                         242 => 8.0530637E8,
292                                         243 => 9.395241E8,
293                                         244 => 1.07374182E9,
294                                         245 => 1.34217728E9,
295                                         246 => 1.61061274E9,
296                                         247 => 1.87904819E9,
297                                         248 => 2.14748365E9,
298                                         249 => 2.68435456E9,
299                                         250 => 3.22122547E9,
300                                         251 => 3.75809638E9,
301                                         252 => 4.2949673E9,
302                                         253 => 5.3687091E9,
303                                         254 => 6.4424509E9,
304                                         255 => 7.5161928E9 );
/**
 * Install the Similarity implementation that indexing and search code
 * should use by default.
 *
 * @param Zend_Search_Lucene_Search_Similarity $similarity
 */
public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
{
    self::$_defaultImpl = $similarity;
}
/**
 * Fetch the default Similarity implementation used by indexing and search
 * code.
 *
 * When no implementation has been stored yet, a
 * Zend_Search_Lucene_Search_Similarity_Default instance is created,
 * remembered, and returned.
 *
 * @return Zend_Search_Lucene_Search_Similarity
 */
public static function getDefault()
{
    // Already initialised: hand back the stored implementation
    if (self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
        return self::$_defaultImpl;
    }

    self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();

    return self::$_defaultImpl;
}
335     /**
336      * Computes the normalization value for a field given the total number of
337      * terms contained in a field.  These values, together with field boosts, are
338      * stored in an index and multiplied into scores for hits on each field by the
339      * search code.
340      *
341      * Matches in longer fields are less precise, so implementations of this
342      * method usually return smaller values when 'numTokens' is large,
343      * and larger values when 'numTokens' is small.
344      *
345      * Note that these values are computed under
346      * IndexWriter::addDocument(Document) and then stored using
347      * encodeNorm($f).  Thus they have limited precision, and documents
348      * must be re-indexed if this method is altered.
349      *
350      * fieldName - name of field
351      * numTokens - the total number of tokens contained in fields named
352      *             'fieldName' of 'doc'.
353      * Returns a normalization factor for hits on this field of this document
354      *
355      * @param string $fieldName
356      * @param integer $numTokens
357      * @return float
358      */
359     abstract public function lengthNorm($fieldName, $numTokens);
361     /**
362      * Computes the normalization value for a query given the sum of the squared
363      * weights of each of the query terms.  This value is then multiplied into the
364      * weight of each query term.
365      *
366      * This does not affect ranking, but rather just attempts to make scores
367      * from different queries comparable.
368      *
369      * sumOfSquaredWeights - the sum of the squares of query term weights
370      * Returns a normalization factor for query weights
371      *
372      * @param float $sumOfSquaredWeights
373      * @return float
374      */
375     abstract public function queryNorm($sumOfSquaredWeights);
/**
 * Look up the float normalization factor for a norm byte stored in an index.
 *
 * Only the low eight bits of $byte are used to select the table entry.
 *
 * @param integer $byte
 * @return float
 */
public static function decodeNorm($byte)
{
    $tableIndex = $byte & 0xFF;

    return self::$_normTable[$tableIndex];
}
/**
 * Encodes a normalization factor as a single byte for storage in an index.
 *
 * The result is an index (0-255) into the table read by decodeNorm(), so
 * only 256 distinct values can be represented: zero, plus positive values
 * from about 5.8x10^-10 up to about 7.5x10^9 in four steps per power of
 * two (roughly one significant decimal digit of accuracy).
 *
 * Negative numbers are rounded up to zero.  Values too large to represent
 * are rounded down to the largest representable value.  The conversion
 * itself is delegated to _floatToByte().
 *
 * @param float $f
 * @return integer
 */
public static function encodeNorm($f)
{
    return self::_floatToByte($f);
}
/**
 * Convert a float to the index of the closest entry in the norm table.
 *
 * Zero and negative values map to 0.  Positive values are located with a
 * binary search over self::$_normTable and rounded to the nearest entry;
 * values above the largest entry map to 255.  A strictly positive value
 * never maps to 0: anything too small to represent is rounded up to 1,
 * the smallest positive entry, so that a tiny norm does not decode to
 * exactly zero.
 *
 * @param float $f
 * @return integer byte value in the range 0-255
 */
private static function _floatToByte($f)
{
    // round negatives up to zero
    if ($f <= 0.0) {
        return 0;
    }

    // binary search for an exact table value
    $lowIndex  = 0;
    $highIndex = 255;
    while ($highIndex >= $lowIndex) {
        $mid   = ($highIndex + $lowIndex) >> 1;
        $delta = $f - self::$_normTable[$mid];

        if ($delta < 0) {
            $highIndex = $mid - 1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid + 1;
        } else {
            return $mid; // We got it!
        }
    }

    // No exact match: $highIndex is now the last entry smaller than $f.
    // Entry 0 is exact zero; $f is positive here, so underflow is rounded
    // up to the smallest positive entry instead of collapsing to zero.
    if ($highIndex == 0) {
        return 1;
    }

    // round to closest value (there is no entry above 255 to round up to)
    if ($highIndex != 255 &&
        $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f) {
        return $highIndex + 1;
    }

    return $highIndex;
}
449     /**
450      * Computes a score factor based on a term or phrase's frequency in a
451      * document.  This value is multiplied by the idf($input, $reader)
452      * factor for each term in the query and these products are then summed to
453      * form the initial score for a document.
454      *
455      * Terms and phrases repeated in a document indicate the topic of the
456      * document, so implementations of this method usually return larger values
457      * when 'freq' is large, and smaller values when 'freq'
458      * is small.
459      *
460      * freq - the frequency of a term within a document
461      * Returns a score factor based on a term's within-document frequency
462      *
463      * @param float $freq
464      * @return float
465      */
466     abstract public function tf($freq);
468     /**
469      * Computes the amount of a sloppy phrase match, based on an edit distance.
470      * This value is summed for each sloppy phrase match in a document to form
471      * the frequency that is passed to tf($freq).
472      *
473      * A phrase match with a small edit distance to a document passage more
474      * closely matches the document, so implementations of this method usually
475      * return larger values when the edit distance is small and smaller values
476      * when it is large.
477      *
478      * distance - the edit distance of this sloppy phrase match
479      * Returns the frequency increment for this match
480      *
481      * @param integer $distance
482      * @return float
483      */
484     abstract public function sloppyFreq($distance);
/**
 * Computes a score factor for a simple term or a phrase.
 *
 * A single term is scored by passing its document frequency and the
 * reader's document count to idfFreq().  For an array of terms the
 * per-term factors are summed.
 *
 * @param mixed $input  the term in question, or an array of terms
 * @param Zend_Search_Lucene $reader  the document collection being searched
 * @return float a score factor for the term(s)
 */
public function idf($input, $reader)
{
    if (!is_array($input)) {
        return $this->idfFreq($reader->docFreq($input), $reader->count());
    }

    // count() is assumed not to change between terms, so it is read once
    // here instead of on every iteration.
    $numDocs = $reader->count();

    $idf = 0.0;
    foreach ($input as $term) {
        $idf += $this->idfFreq($reader->docFreq($term), $numDocs);
    }

    return $idf;
}
514     /**
515      * Computes a score factor based on a term's document frequency (the number
516      * of documents which contain the term).  This value is multiplied by the
517      * tf($freq) factor for each term in the query and these products are
518      * then summed to form the initial score for a document.
519      *
520      * Terms that occur in fewer documents are better indicators of topic, so
521      * implementations of this method usually return larger values for rare terms,
522      * and smaller values for common terms.
523      *
524      * docFreq - the number of documents which contain the term
525      * numDocs - the total number of documents in the collection
526      * Returns a score factor based on the term's document frequency
527      *
528      * @param integer $docFreq
529      * @param integer $numDocs
530      * @return float
531      */
532     abstract public function idfFreq($docFreq, $numDocs);
534     /**
535      * Computes a score factor based on the fraction of all query terms that a
536      * document contains.  This value is multiplied into scores.
537      *
538      * The presence of a large portion of the query terms indicates a better
539      * match with the query, so implementations of this method usually return
540      * larger values when the ratio between these parameters is large and smaller
541      * values when the ratio between them is small.
542      *
543      * overlap - the number of query terms matched in the document
544      * maxOverlap - the total number of terms in the query
545      * Returns a score factor based on term overlap with the query
546      *
547      * @param integer $overlap
548      * @param integer $maxOverlap
549      * @return float
550      */
551     abstract public function coord($overlap, $maxOverlap);