8cfbeb81 |
1 | <?php |
2 | /** |
3 | * Zend Framework |
4 | * |
5 | * LICENSE |
6 | * |
7 | * This source file is subject to the new BSD license that is bundled |
8 | * with this package in the file LICENSE.txt. |
9 | * It is also available through the world-wide-web at this URL: |
10 | * http://framework.zend.com/license/new-bsd |
11 | * If you did not receive a copy of the license and are unable to |
12 | * obtain it through the world-wide-web, please send an email |
13 | * to license@zend.com so we can send you a copy immediately. |
14 | * |
15 | * @category Zend |
16 | * @package Zend_Search_Lucene |
17 | * @subpackage Analysis |
18 | * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) |
19 | * @license http://framework.zend.com/license/new-bsd New BSD License |
20 | */ |
21 | |
22 | |
23 | /** Zend_Search_Lucene_Analysis_TokenFilter */ |
24 | require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter.php'; |
25 | require_once $CFG->dirroot.'/search/Zend/Search/Exception.php'; |
26 | |
27 | |
/**
 * Token filter that removes stop words.
 *
 * The words must be passed to the constructor as a plain list of strings, example:
 * $stopwords = array('the', 'an');
 * The list is converted internally into a lookup set keyed by the word.
 *
 * Matching is an exact, case-sensitive comparison against the token text, so it is
 * recommended to provide all words in lowercase and to chain this filter after the
 * lowercase filter.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */

class Zend_Search_Lucene_Analysis_TokenFilter_StopWords extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Stop words lookup set: each key is a stop word, the value is irrelevant.
     *
     * @var array
     */
    private $_stopSet;

    /**
     * Constructs new instance of this filter.
     *
     * @param array $stopwords list of words that will be filtered out, e.g. array('the', 'an')
     */
    public function __construct($stopwords = array()) {
        // Flip the list so that the words become keys, which allows a direct key lookup in normalize().
        $this->_stopSet = array_flip($stopwords);
    }

    /**
     * Normalize Token or remove it (if null is returned)
     *
     * The token is removed when its term text is present in the stop words set,
     * otherwise it is returned unchanged.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
        if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) {
            return null;
        }

        return $srcToken;
    }

    /**
     * Fills stopwords set from a text file. Each line contains one stopword. Surrounding whitespace
     * is stripped from every line; empty lines and lines whose first non-blank character is '#'
     * are ignored (as comments).
     *
     * You can call this method one or more times. New stopwords are always added to current set.
     *
     * @param string $filepath full path for text file with stopwords
     * @throws Zend_Search_Exception When the file doesn't exist, is not a regular readable file,
     *                               or cannot be opened or closed.
     */
    public function loadFromFile($filepath = null) {
        // file_exists() alone also accepts directories, which cannot be read line by line.
        if (! $filepath || ! is_file($filepath) || ! is_readable($filepath)) {
            throw new Zend_Search_Exception('You have to provide valid file path');
        }

        $fd = fopen($filepath, "r");
        if (! $fd) {
            throw new Zend_Search_Exception('Cannot open file ' . $filepath);
        }

        // Stop on the first failed read: fgets() returns false both at end of file and on a read
        // error, whereas looping on !feof() never terminates if the stream keeps failing.
        while (($line = fgets($fd)) !== false) {
            $word = trim($line);
            if ($word !== '' && $word[0] !== '#') {
                $this->_stopSet[$word] = 1;
            }
        }

        if (! fclose($fd)) {
            throw new Zend_Search_Exception('Cannot close file ' . $filepath);
        }
    }
}

101 | |