MDL-12234, fixing unicode issues with global search
[moodle.git] / search / Zend / Search / Lucene / Analysis / Analyzer.php
CommitLineData
682d4032 1<?php
2/**
3 * Zend Framework
4 *
5 * LICENSE
6 *
7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
14 *
15 * @category Zend
16 * @package Zend_Search_Lucene
17 * @subpackage Analysis
8cfbeb81 18 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
682d4032 19 * @license http://framework.zend.com/license/new-bsd New BSD License
20 */
21
22
23/** Zend_Search_Lucene_Analysis_Token */
8cfbeb81 24require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Token.php';
25
26/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
27require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
28
29/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
30require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
682d4032 31
32/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
8cfbeb81 33require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
682d4032 34
35/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
8cfbeb81 36require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
37
38/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
39require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
40
41/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
42require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
43
44/** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
45require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
682d4032 46
8cfbeb81 47/** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
48require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';
682d4032 49
50
/**
 * An Analyzer is used to analyze text.
 * It thus represents a policy for extracting index terms from text.
 *
 * Note:
 * The Java Lucene implementation is stream-oriented, which lets it work
 * efficiently with huge documents (more than 20Mb).
 * This engine itself is not designed for such documents, so the
 * Zend_Search_Lucene analysis API works with data strings and sets (arrays).
 *
 * Concrete analyzers must implement reset() and nextToken(); this base class
 * provides the input bookkeeping, the convenience tokenize() wrapper and the
 * process-wide default analyzer registry.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */

abstract class Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * The Analyzer implementation used by default.
     *
     * Populated by setDefault(), or lazily by getDefault() on first use.
     *
     * @var Zend_Search_Lucene_Analysis_Analyzer
     */
    private static $_defaultImpl;

    /**
     * Input string
     *
     * @var string
     */
    protected $_input = null;

    /**
     * Input string encoding
     *
     * @var string
     */
    protected $_encoding = '';

    /**
     * Tokenize text to terms.
     * Returns an array of Zend_Search_Lucene_Analysis_Token objects.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * The input is installed through setInput() and nextToken() is then
     * drained until it returns null.
     *
     * Note that the default encoding here is 'UTF-8', whereas setInput()
     * itself defaults to an empty string.
     *
     * @param string $data     text to tokenize
     * @param string $encoding encoding of $data
     * @return array
     */
    public function tokenize($data, $encoding = 'UTF-8')
    {
        $this->setInput($data, $encoding);

        $tokenList = array();
        // Strict comparison: only null marks the end of the stream.
        while (($nextToken = $this->nextToken()) !== null) {
            $tokenList[] = $nextToken;
        }

        return $tokenList;
    }


    /**
     * Tokenization stream API
     * Set input
     *
     * Stores the data and its encoding, then calls reset() so that the
     * concrete analyzer can restart its token stream on the new input.
     *
     * @param string $data     text to be tokenized
     * @param string $encoding encoding of $data; how an empty value is
     *                         interpreted is up to the concrete analyzer
     */
    public function setInput($data, $encoding = '')
    {
        $this->_input    = $data;
        $this->_encoding = $encoding;
        $this->reset();
    }

    /**
     * Reset token stream
     */
    abstract public function reset();

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    abstract public function nextToken();


    /**
     * Set the default Analyzer implementation used by indexing code.
     *
     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
     */
    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
    {
        self::$_defaultImpl = $analyzer;
    }


    /**
     * Return the default Analyzer implementation used by indexing code.
     *
     * If no analyzer has been set yet, a
     * Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 instance is created,
     * remembered and returned.
     *
     * @return Zend_Search_Lucene_Analysis_Analyzer
     */
    public static function getDefault()
    {
        // Parentheses make the precedence explicit: negate the instanceof result.
        if (!(self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer)) {
            self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();
        }

        return self::$_defaultImpl;
    }
}
169