Commit | Line | Data |
---|---|---|
95c6aeaf DM |
1 | <?php |
2 | // This file is part of Moodle - http://moodle.org/ | |
3 | // | |
4 | // Moodle is free software: you can redistribute it and/or modify | |
5 | // it under the terms of the GNU General Public License as published by | |
6 | // the Free Software Foundation, either version 3 of the License, or | |
7 | // (at your option) any later version. | |
8 | // | |
9 | // Moodle is distributed in the hope that it will be useful, | |
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | // GNU General Public License for more details. | |
13 | // | |
14 | // You should have received a copy of the GNU General Public License | |
15 | // along with Moodle. If not, see <http://www.gnu.org/licenses/>. | |
16 | ||
17 | /** | |
18 | * Solr engine. | |
19 | * | |
20 | * @package search_solr | |
21 | * @copyright 2015 Daniel Neis Araujo | |
22 | * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later | |
23 | */ | |
24 | ||
25 | namespace search_solr; | |
26 | ||
27 | defined('MOODLE_INTERNAL') || die(); | |
28 | ||
29 | /** | |
30 | * Solr engine. | |
31 | * | |
32 | * @package search_solr | |
33 | * @copyright 2015 Daniel Neis Araujo | |
34 | * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later | |
35 | */ | |
36 | class engine extends \core_search\engine { | |
37 | ||
38 | /** | |
39 | * @var string The date format used by solr. | |
40 | */ | |
41 | const DATE_FORMAT = 'Y-m-d\TH:i:s\Z'; | |
42 | ||
43 | /** | |
44 | * @var int Commit documents interval (number of milliseconds). | |
45 | */ | |
46 | const AUTOCOMMIT_WITHIN = 15000; | |
47 | ||
48 | /** | |
4894840d | 49 | * Highlighting fragsize. Slightly larger than output size (500) to allow for ... appending. |
95c6aeaf | 50 | */ |
4894840d EM |
51 | const FRAG_SIZE = 510; |
52 | ||
53 | /** | |
54 | * Marker for the start of a highlight. | |
55 | */ | |
56 | const HIGHLIGHT_START = '@@HI_S@@'; | |
57 | ||
58 | /** | |
59 | * Marker for the end of a highlight. | |
60 | */ | |
61 | const HIGHLIGHT_END = '@@HI_E@@'; | |
95c6aeaf DM |
62 | |
63 | /** | |
64 | * @var \SolrClient | |
65 | */ | |
66 | protected $client = null; | |
67 | ||
7a4a0bc8 EM |
68 | /** |
69 | * @var bool True if we should reuse SolrClients, false if not. | |
70 | */ | |
71 | protected $cacheclient = true; | |
72 | ||
5dc4624c EM |
73 | /** |
74 | * @var \curl Direct curl object. | |
75 | */ | |
76 | protected $curl = null; | |
77 | ||
95c6aeaf DM |
78 | /** |
79 | * @var array Fields that can be highlighted. | |
80 | */ | |
4894840d | 81 | protected $highlightfields = array('title', 'content', 'description1', 'description2'); |
95c6aeaf | 82 | |
7a4a0bc8 EM |
83 | /** |
84 | * Initialises the search engine configuration. | |
85 | * | |
86 | * @return void | |
87 | */ | |
public function __construct() {
    parent::__construct();

    // curl 7.35.0 has a flaw that causes problems with client reuse. Any version string
    // beginning with "7.35." switches client caching off.
    $info = curl_version();
    $version = isset($info['version']) ? $info['version'] : null;
    if ($version !== null && stripos($version, '7.35.') === 0) {
        $this->cacheclient = false;
    }
}
97 | ||
95c6aeaf DM |
98 | /** |
99 | * Prepares a Solr query, applies filters and executes it returning its results. | |
100 | * | |
101 | * @throws \core_search\engine_exception | |
f6b425e2 EM |
102 | * @param stdClass $filters Containing query and filters. |
103 | * @param array $usercontexts Contexts where the user has access. True if the user can access all contexts. | |
95c6aeaf DM |
104 | * @return \core_search\document[] Results, or an empty array when there are none or the query fails |
105 | */ | |
public function execute_query($filters, $usercontexts) {
    global $USER;

    // Work on a copy so the changes made below stay internal to this method.
    $data = clone $filters;

    // If there is any problem with the client we want the exception as soon as possible.
    $client = $this->get_search_client();

    $serverstatus = $this->is_server_ready();
    if ($serverstatus !== true) {
        throw new \core_search\engine_exception('engineserverstatus', 'search');
    }

    $query = new \SolrQuery();
    $maxrows = \core_search\manager::MAX_RESULTS;
    if ($this->file_indexing_enabled()) {
        // When using file indexing and grouping, results get collapsed, so ask for extra rows.
        $maxrows *= 2;
    }
    $this->set_query($query, $data->q, $maxrows);
    $this->add_fields($query);

    // Search filters applied, we don't cache these filters as we don't want to pollute the cache with tmp filters
    // we are really interested in caching contexts filters instead.
    if (!empty($data->title)) {
        $query->addFilterQuery('{!field cache=false f=title}' . $data->title);
    }
    if (!empty($data->areaid)) {
        // Even if it is only supposed to contain PARAM_ALPHANUMEXT, better to prevent.
        $query->addFilterQuery('{!field cache=false f=areaid}' . $data->areaid);
    }

    if (!empty($data->timestart) or !empty($data->timeend)) {
        // An open end of the range is expressed with '*'.
        if (empty($data->timestart)) {
            $data->timestart = '*';
        } else {
            $data->timestart = \search_solr\document::format_time_for_engine($data->timestart);
        }
        if (empty($data->timeend)) {
            $data->timeend = '*';
        } else {
            $data->timeend = \search_solr\document::format_time_for_engine($data->timeend);
        }

        // No cache.
        $query->addFilterQuery('{!cache=false}modified:[' . $data->timestart . ' TO ' . $data->timeend . ']');
    }

    // Restrict to users who are supposed to be able to see a particular result.
    $query->addFilterQuery('owneruserid:(' . \core_search\manager::NO_OWNER_ID . ' OR ' . $USER->id . ')');

    // And finally restrict it to the context where the user can access, we want this one cached.
    // If the user can access all contexts $usercontexts value is just true, we don't need to filter
    // in that case.
    if ($usercontexts && is_array($usercontexts)) {
        $contextids = array();
        if (!empty($data->areaid)) {
            // Only the contexts of the requested area matter. The area may be missing from the list.
            if (isset($usercontexts[$data->areaid])) {
                $contextids = $usercontexts[$data->areaid];
            }
        } else {
            // Join all area contexts into a single array, keyed by id so they stay unique.
            foreach ($usercontexts as $areacontexts) {
                foreach ($areacontexts as $contextid) {
                    $contextids[$contextid] = $contextid;
                }
            }
        }

        if (empty($contextids)) {
            // There are no valid contexts for this user. Building 'contextid:()' would be an invalid
            // filter that only fails once it reaches the server, so return no results right away.
            return array();
        }

        $query->addFilterQuery('contextid:(' . implode(' OR ', $contextids) . ')');
    }

    try {
        if ($this->file_indexing_enabled()) {
            // Now group records by solr_filegroupingid. Limit to 3 results per group.
            $query->setGroup(true);
            $query->setGroupLimit(3);
            $query->addGroupField('solr_filegroupingid');
            return $this->grouped_files_query_response($client->query($query));
        } else {
            return $this->query_response($client->query($query));
        }
    } catch (\SolrClientException $ex) {
        // Query failures are recorded and reported as "no results" rather than propagated.
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array();
    } catch (\SolrServerException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array();
    }
}
198 | ||
199 | /** | |
200 | * Prepares a new query by setting the query, start offset and rows to return. | |
201 | * @param SolrQuery $query | |
cd894f84 EM |
202 | * @param object $q Containing query and filters. |
203 | * @param null|int $maxresults The number of results to limit. manager::MAX_RESULTS if not set. | |
95c6aeaf | 204 | */ |
cd894f84 EM |
protected function set_query($query, $q, $maxresults = null) {
    // Fall back to the manager-wide limit when no usable row count was supplied.
    $rows = is_numeric($maxresults) ? $maxresults : \core_search\manager::MAX_RESULTS;

    // Highlighting: enable it for every highlightable field.
    $query->setHighlight(true);
    foreach ($this->highlightfields as $fieldname) {
        $query->addHighlightField($fieldname);
    }
    // Matches are wrapped in marker strings and contiguous fragments are merged.
    $query->setHighlightFragsize(static::FRAG_SIZE);
    $query->setHighlightSimplePre(self::HIGHLIGHT_START);
    $query->setHighlightSimplePost(self::HIGHLIGHT_END);
    $query->setHighlightMergeContiguous(true);

    $query->setQuery($q);

    // Cap the number of rows returned.
    $query->setRows($rows);
}
225 | ||
226 | /** | |
227 | * Sets fields to be returned in the result. | |
228 | * | |
229 | * @param SolrQuery $query object. | |
230 | */ | |
public function add_fields($query) {
    // The field names are the keys of the document class' default field definition.
    $documentclass = $this->get_document_classname();
    foreach ($documentclass::get_default_fields_definition() as $fieldname => $unused) {
        $query->addField($fieldname);
    }
}
238 | ||
239 | /** | |
240 | * Finds the key common to both highlighting and docs array returned from response. | |
241 | * @param object $response containing results. | |
242 | */ | |
public function add_highlight_content($response) {
    if (isset($response->highlighting)) {
        // Highlighting entries are looked up by document id.
        $highlights = $response->highlighting;
        foreach ($response->response->docs as $doc) {
            $docid = $doc->id;
            $this->merge_highlight_field_values($doc, $highlights->$docid);
        }
    }
}
256 | ||
257 | /** | |
258 | * Adds the highlighting array values to docs array values. | |
259 | * | |
260 | * @throws \core_search\engine_exception | |
261 | * @param object $doc containing the results. | |
262 | * @param object $highlighteddoc containing the highlighted results values. | |
263 | */ | |
public function merge_highlight_field_values($doc, $highlighteddoc) {
    foreach ($this->highlightfields as $field) {
        if (empty($doc->$field)) {
            // Nothing stored in this field, so there is nothing to replace.
            continue;
        }

        // Multivalued solr fields cannot be handled here, so they are rejected outright.
        if (is_array($doc->{$field})) {
            throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $field);
        }

        if (empty($highlighteddoc->$field)) {
            // No highlighted version available, keep the original value.
            continue;
        }

        // Swap in the first highlighted fragment.
        $doc->$field = reset($highlighteddoc->$field);
    }
}
281 | ||
282 | /** | |
283 | * Filters the response on Moodle side. | |
284 | * | |
285 | * @param object $queryresponse containing the response return from solr server. | |
286 | * @return array $results containing final results to be displayed. | |
287 | */ | |
public function query_response($queryresponse) {
    global $USER;

    $currentuserid = $USER->id;
    $noowner = \core_search\manager::NO_OWNER_ID;
    $limit = \core_search\manager::MAX_RESULTS;

    $response = $queryresponse->getResponse();

    $docs = $response->response->docs;
    if (!$docs) {
        return array();
    }

    if (empty($response->response->numFound)) {
        // Nothing reported as found, hand the docs back untouched.
        return $docs;
    }

    $this->add_highlight_content($response);

    // Walk the results, dropping everything the current user must not see and converting the rest.
    $granted = 0;
    foreach ($docs as $key => $rawdoc) {
        $ownerid = $rawdoc['owneruserid'];
        if ($ownerid != $noowner && $ownerid != $currentuserid) {
            // If owneruserid is set, no other user should be able to access this record.
            unset($docs[$key]);
            continue;
        }

        $searcharea = $this->get_search_area($rawdoc->areaid);
        if (!$searcharea) {
            unset($docs[$key]);
            continue;
        }

        $docdata = $this->standarize_solr_obj($rawdoc);

        $access = $searcharea->check_access($docdata['itemid']);
        if ($access == \core_search\manager::ACCESS_DELETED) {
            // Deleted results are also removed from the index.
            $this->delete_by_id($docdata['id']);
            unset($docs[$key]);
        } else if ($access == \core_search\manager::ACCESS_DENIED) {
            unset($docs[$key]);
        } else if ($access == \core_search\manager::ACCESS_GRANTED) {
            $granted++;

            // Replace the raw result with a proper document.
            $docs[$key] = $this->to_document($searcharea, $docdata);
        }

        // This should never happen.
        if ($granted >= $limit) {
            $docs = array_slice($docs, 0, $limit, true);
            break;
        }
    }

    return $docs;
}
346 | ||
cd894f84 EM |
347 | /** |
348 | * Processes grouped file results into documents, with attached matching files. | |
349 | * | |
350 | * @param SolrQueryResponse $queryresponse The response returned from solr server | |
351 | * @return array Final results to be displayed. | |
352 | */ | |
protected function grouped_files_query_response($queryresponse) {
    $response = $queryresponse->getResponse();

    // Without the grouping, or without any match inside it, there is nothing to return.
    $hasgrouping = isset($response->grouped->solr_filegroupingid);
    if (!$hasgrouping || empty($response->grouped->solr_filegroupingid->matches)) {
        return array();
    }

    $limit = \core_search\manager::MAX_RESULTS;
    $granted = 0;
    // Group ids in result order, used to restore the ordering at the end.
    $groupidsinorder = array();
    // Group id => finished document, for groups whose main doc was part of the results.
    $founddocs = array();
    // Group id => file ids, for groups whose main doc still has to be fetched.
    $pendingfiles = array();

    $highlights = $response->highlighting;

    // Each group represents a "master document".
    foreach ($response->grouped->solr_filegroupingid->groups as $group) {
        $groupid = $group->groupValue;
        $groupdocs = $group->doclist->docs;
        $firstdoc = reset($groupdocs);

        $searcharea = $this->get_search_area($firstdoc->areaid);
        if (!$searcharea) {
            // Well, this is a problem.
            continue;
        }

        // Access is checked against the first doc of the group.
        $access = $searcharea->check_access($firstdoc->itemid);
        if ($access == \core_search\manager::ACCESS_DELETED) {
            // If deleted from Moodle, delete from index and move on to the next group.
            $this->delete_by_id($firstdoc->id);
            continue;
        }
        if ($access == \core_search\manager::ACCESS_DENIED) {
            // This means we should just skip for the current user.
            continue;
        }
        $granted++;

        // Separate the main document from any files returned alongside it.
        $maindoc = false;
        $fileids = array();
        foreach ($groupdocs as $groupdoc) {
            if ($groupdoc->id == $groupid) {
                $maindoc = $groupdoc;
            } else if (isset($groupdoc->solr_fileid)) {
                $fileids[] = $groupdoc->solr_fileid;
            }
        }

        // Remember the position of this group for the final merge.
        $groupidsinorder[] = $groupid;

        if ($maindoc) {
            if (isset($highlights->$groupid)) {
                // Merge the highlighting for this doc.
                $this->merge_highlight_field_values($maindoc, $highlights->$groupid);
            }
            $doc = $this->to_document($searcharea, $this->standarize_solr_obj($maindoc));
            // Attach the matching files to the document.
            foreach ($fileids as $fileid) {
                $doc->add_stored_file($fileid);
            }
            $founddocs[$groupid] = $doc;
        } else {
            // The main doc was not returned, keep what we know so it can be built later.
            $pendingfiles[$groupid] = $fileids;
        }

        if ($granted >= $limit) {
            // We have hit the max results, we will just ignore the rest.
            break;
        }
    }

    // Fetch the main documents that were not part of the grouped results.
    $fetcheddocs = $this->get_missing_docs($pendingfiles);

    // Merge both sets back together, in results order.
    $out = array();
    foreach ($groupidsinorder as $docid) {
        if (isset($founddocs[$docid])) {
            $out[] = $founddocs[$docid];
            continue;
        }
        if (isset($fetcheddocs[$docid])) {
            $out[] = $fetcheddocs[$docid];
        }
    }

    return $out;
}
446 | ||
447 | /** | |
448 | * Retrieve any missing main documents and attach provided files. | |
449 | * | |
450 | * The missingdocs array should be an array, indexed by document id, of main documents we need to retrieve. The value | |
451 | * associated to the key should be an array of stored_files or stored file ids to attach to the result document. | |
452 | * | |
453 | * Return array also indexed by document id. | |
454 | * | |
455 | * @param array() $missingdocs An array, indexed by document id, with arrays of files/ids to attach. | |
456 | * @return document[] | |
457 | */ | |
protected function get_missing_docs($missingdocs) {
    if (empty($missingdocs)) {
        return array();
    }

    $wantedids = array_keys($missingdocs);

    // One custom query fetches all the missing documents by id.
    $query = new \SolrQuery();
    $this->set_query($query, '*', count($wantedids));
    $this->add_fields($query);
    $query->addFilterQuery('{!cache=false}id:(' . implode(' OR ', $wantedids) . ')');

    try {
        $response = $this->get_search_client()->query($query);
        $founddocs = $this->query_response($response);
    } catch (\SolrClientException $ex) {
        // A failed lookup simply yields no extra documents.
        return array();
    } catch (\SolrServerException $ex) {
        return array();
    }

    $out = array();
    foreach ($founddocs as $founddoc) {
        $id = $founddoc->get('id');
        // Results we did not ask for are ignored.
        if (isset($missingdocs[$id])) {
            // Attach the files.
            foreach ($missingdocs[$id] as $filedoc) {
                $founddoc->add_stored_file($filedoc);
            }
            $out[$id] = $founddoc;
        }
    }

    return $out;
}
495 | ||
95c6aeaf DM |
496 | /** |
497 | * Returns a standard php array from a \SolrObject instance. | |
498 | * | |
499 | * @param \SolrObject $obj | |
500 | * @return array The returned document as an array. | |
501 | */ | |
public function standarize_solr_obj(\SolrObject $obj) {
    $docdata = array();
    foreach ($obj->getPropertyNames() as $rawname) {
        // Names are trimmed before use, see http://php.net/manual/en/solrobject.getpropertynames.php#98018.
        $key = trim($rawname);
        $docdata[$key] = $obj->offsetGet($key);
    }
    return $docdata;
}
513 | ||
514 | /** | |
515 | * Adds a document to the search engine. | |
516 | * | |
517 | * This does not commit to the search engine. | |
518 | * | |
091973db EM |
519 | * @param document $document |
520 | * @param bool $fileindexing True if file indexing is to be used | |
521 | * @return bool | |
95c6aeaf | 522 | */ |
091973db EM |
public function add_document($document, $fileindexing = false) {
    $exported = $document->export_for_engine();

    // The files are only processed once the document itself made it into the index.
    $indexed = $this->add_solr_document($exported) ? true : false;
    if ($indexed && $fileindexing) {
        // This will take care of updating all attached files in the index.
        $this->process_document_files($document);
    }

    return $indexed;
}
95c6aeaf | 537 | |
091973db EM |
538 | /** |
539 | * Adds a text document to the search engine. | |
540 | * | |
cd894f84 | 541 | * @param array $doc |
091973db EM |
542 | * @return bool |
543 | */ | |
protected function add_solr_document($doc) {
    // Copy every exported field onto a Solr input document.
    $solrdoc = new \SolrInputDocument();
    foreach ($doc as $field => $value) {
        $solrdoc->addField($field, $value);
    }

    try {
        $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN);
    } catch (\SolrClientException $e) {
        debugging('Solr client error adding document with id ' . $doc['id'] . ': ' . $e->getMessage(), DEBUG_DEVELOPER);
        return false;
    } catch (\SolrServerException $e) {
        // We only use the first line of the message, as there is a full java stacktrace behind it.
        $msg = strtok($e->getMessage(), "\n");
        debugging('Solr server error adding document with id ' . $doc['id'] . ': ' . $msg, DEBUG_DEVELOPER);
        return false;
    }

    return true;
}
563 | ||
cd894f84 EM |
564 | /** |
565 | * Index files attached to the document, ensuring the index matches the current document files. | |
566 | * | |
567 | * For documents that aren't known to be new, we check the index for existing files. | |
568 | * - New files we will add. | |
569 | * - Existing and unchanged files we will skip. | |
570 | * - File that are in the index but not on the document will be deleted from the index. | |
571 | * - Files that have changed will be re-indexed. | |
572 | * | |
573 | * @param document $document | |
574 | */ | |
protected function process_document_files($document) {
    if (!$this->file_indexing_enabled()) {
        return;
    }

    // Maximum rows to process at a time.
    $batchsize = 500;

    // Files currently attached to the document. Whatever is left in here at the end gets indexed.
    $files = $document->get_files();

    // For a document that is not new, compare against the files already in the index.
    if (!$document->get_is_new()) {
        $offset = 0;
        $staleids = array();

        // The index is read progressively, so lots of files can be handled cleanly.
        list($total, $batch) = $this->get_indexed_files($document, $offset, $batchsize);

        do {
            foreach ($batch as $indexedfile) {
                $fileid = $indexedfile->solr_fileid;

                if (!isset($files[$fileid])) {
                    // The file is no longer attached, so it must go from the index.
                    // Deleting is deferred, since doing it now could reorder the paged results.
                    $staleids[] = $indexedfile->id;
                    continue;
                }

                $current = $files[$fileid];

                // Filelib does not guarantee time modified is updated, so the important values are checked too.
                // The last condition covers files that filtering blocked last time but that are indexable now.
                $needsreindex = $indexedfile->modified < $current->get_timemodified()
                    || strcmp($indexedfile->title, $current->get_filename()) !== 0
                    || $indexedfile->solr_filecontenthash != $current->get_contenthash()
                    || ($indexedfile->solr_fileindexedcontent == document::INDEXED_FILE_FALSE
                        && $this->file_is_indexable($current));

                if (!$needsreindex) {
                    // Already indexed and unchanged, no need to send it again.
                    unset($files[$fileid]);
                }
            }

            $offset += $batchsize;
            $more = $offset < $total;
            if ($more) {
                // Not at the total count yet, fetch the next batch and re-check against the fresh total.
                list($total, $batch) = $this->get_indexed_files($document, $offset, $batchsize);
                $more = $offset < $total;
            }
        } while ($more);

        // Delete files that are no longer attached.
        foreach ($staleids as $staleid) {
            // We directly delete the item using the client, as the engine delete_by_id won't work on file docs.
            $this->get_search_client()->deleteById($staleid);
        }
    }

    // Now we can actually index all the remaining files.
    foreach ($files as $file) {
        $this->add_stored_file($document, $file);
    }
}
646 | ||
647 | /** | |
648 | * Get the currently indexed files for a particular document, returns the total count, and a subset of files. | |
649 | * | |
650 | * @param document $document | |
651 | * @param int $start The row to start the results on. Zero indexed. | |
652 | * @param int $rows The number of rows to fetch | |
653 | * @return array A two element array, the first is the total number of available results, the second is an array | |
654 | * of documents for the current request. | |
655 | */ | |
protected function get_indexed_files($document, $start = 0, $rows = 500) {
    // A stripped down query of our own: every file record tied to the document, one page at a time.
    $query = new \SolrQuery();
    $query->setQuery('*');
    $query->setRows($rows);
    $query->setStart($start);
    // We want a consistent sorting.
    $query->addSortField('id');

    // We only want the bare minimum of fields.
    $wantedfields = array('id', 'modified', 'title', 'solr_fileid', 'solr_filecontenthash', 'solr_fileindexedcontent');
    foreach ($wantedfields as $fieldname) {
        $query->addField($fieldname);
    }

    // Only file documents that are grouped under this document.
    $query->addFilterQuery('{!cache=false}solr_filegroupingid:(' . $document->get('id') . ')');
    $query->addFilterQuery('type:' . \core_search\manager::TYPE_FILE);

    try {
        $responsedoc = $this->get_search_client()->query($query)->getResponse();

        if (empty($responsedoc->response->numFound)) {
            return array(0, array());
        }
        $numfound = $responsedoc->response->numFound;

        return array($numfound, $this->convert_file_results($responsedoc));
    } catch (\SolrClientException $ex) {
        // Failures are recorded and reported as "nothing indexed".
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array(0, array());
    } catch (\SolrServerException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array(0, array());
    }
}
699 | ||
700 | /** | |
701 | * A very lightweight handler for getting information about already indexed files from a Solr response. | |
702 | * | |
703 | * @param SolrObject $responsedoc A Solr response document | |
704 | * @return stdClass[] An array of objects that contain the basic information for file processing. | |
705 | */ | |
protected function convert_file_results($responsedoc) {
    $docs = $responsedoc->response->docs;
    if (!$docs) {
        return array();
    }

    $out = array();
    foreach ($docs as $doc) {
        // Copy the bare minimum needed info, converting the modified time on the way.
        $out[] = (object) array(
            'id' => $doc->id,
            'modified' => document::import_time_from_engine($doc->modified),
            'title' => $doc->title,
            'solr_fileid' => $doc->solr_fileid,
            'solr_filecontenthash' => $doc->solr_filecontenthash,
            'solr_fileindexedcontent' => $doc->solr_fileindexedcontent,
        );
    }

    return $out;
}
727 | ||
728 | /** | |
729 | * Adds a file to the search engine. | |
730 | * | |
731 | * Notes about Solr and Tika indexing. We do not send the mime type, only the filename. | |
732 | * Tika has much better content type detection than Moodle, and we will have many more doc failures | |
733 | * if we try to send mime types. | |
734 | * | |
735 | * @param document $document | |
736 | * @param \stored_file $storedfile | |
737 | * @return void | |
738 | */ | |
739 | protected function add_stored_file($document, $storedfile) { | |
740 | $filedoc = $document->export_file_for_engine($storedfile); | |
741 | ||
742 | if (!$this->file_is_indexable($storedfile)) { | |
743 | // For files that we don't consider indexable, we will still place a reference in the search engine. | |
744 | $filedoc['solr_fileindexedcontent'] = document::INDEXED_FILE_FALSE; | |
745 | $this->add_solr_document($filedoc); | |
746 | return; | |
747 | } | |
748 | ||
749 | $curl = $this->get_curl_object(); | |
750 | ||
751 | $url = $this->get_connection_url('/update/extract'); | |
752 | ||
753 | // This will prevent solr from automatically making fields for every tika output. | |
754 | $url->param('uprefix', 'ignored_'); | |
755 | ||
756 | // These are common fields that matches the standard *_point dynamic field and causes an error. | |
757 | $url->param('fmap.media_white_point', 'ignored_mwp'); | |
758 | $url->param('fmap.media_black_point', 'ignored_mbp'); | |
759 | ||
760 | // Copy each key to the url with literal. | |
761 | // We place in a temp name then copy back to the true field, which prevents errors or Tika overwriting common field names. | |
762 | foreach ($filedoc as $key => $value) { | |
763 | // This will take any fields from tika that match our schema and discard them, so they don't overwrite ours. | |
764 | $url->param('fmap.'.$key, 'ignored_'.$key); | |
765 | // Place data in a tmp field. | |
766 | $url->param('literal.mdltmp_'.$key, $value); | |
767 | // Then move to the final field. | |
768 | $url->param('fmap.mdltmp_'.$key, $key); | |
769 | } | |
770 | ||
771 | // This sets the true filename for Tika. | |
772 | $url->param('resource.name', $storedfile->get_filename()); | |
773 | ||
774 | // A giant block of code that is really just error checking around the curl request. | |
775 | try { | |
776 | // Now actually do the request. | |
777 | $result = $curl->post($url->out(false), array('myfile' => $storedfile)); | |
778 | ||
779 | $code = $curl->get_errno(); | |
780 | $info = $curl->get_info(); | |
781 | ||
782 | // Now error handling. It is just informational, since we aren't tracking per file/doc results. | |
783 | if ($code != 0) { | |
784 | // This means an internal cURL error occurred error is in result. | |
785 | $message = 'Curl error '.$code.' while indexing file with document id '.$filedoc['id'].': '.$result.'.'; | |
786 | debugging($message, DEBUG_DEVELOPER); | |
787 | } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) { | |
788 | // Unexpected HTTP response code. | |
789 | $message = 'Error while indexing file with document id '.$filedoc['id']; | |
790 | // Try to get error message out of msg or title if it exists. | |
791 | if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) { | |
792 | $message .= ': '.$matches[1]; | |
793 | } else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) { | |
794 | $message .= ': '.$matches[1]; | |
795 | } | |
796 | // This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter. | |
797 | if (CLI_SCRIPT && !PHPUNIT_TEST) { | |
798 | mtrace($message); | |
799 | } | |
800 | } else { | |
801 | // Check for the expected status field. | |
802 | if (preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $result, $matches)) { | |
803 | // Now check for the expected status of 0, if not, error. | |
804 | if ((int)$matches[1] !== 0) { | |
805 | $message = 'Unexpected Solr status code '.(int)$matches[1]; | |
806 | $message .= ' while indexing file with document id '.$filedoc['id'].'.'; | |
807 | debugging($message, DEBUG_DEVELOPER); | |
808 | } else { | |
809 | // The document was successfully indexed. | |
810 | return; | |
811 | } | |
812 | } else { | |
813 | // We received an unprocessable response. | |
814 | $message = 'Unexpected Solr response while indexing file with document id '.$filedoc['id'].': '; | |
815 | $message .= strtok($result, "\n"); | |
816 | debugging($message, DEBUG_DEVELOPER); | |
817 | } | |
818 | } | |
819 | } catch (\Exception $e) { | |
820 | // There was an error, but we are not tracking per-file success, so we just continue on. | |
821 | debugging('Unknown exception while indexing file "'.$storedfile->get_filename().'".', DEBUG_DEVELOPER); | |
822 | } | |
823 | ||
824 | // If we get here, the document was not indexed due to an error. So we will index just the base info without the file. | |
825 | $filedoc['solr_fileindexedcontent'] = document::INDEXED_FILE_ERROR; | |
826 | $this->add_solr_document($filedoc); | |
827 | } | |
828 | ||
/**
 * Decides whether the content of a stored file should be sent for indexing.
 *
 * A file is rejected when a maximum size (maxindexfilekb) is configured and the file
 * is larger than it, or when the file is a Moodle backup.
 *
 * @param \stored_file $file The file to check
 * @return bool True if the file can be indexed
 */
protected function file_is_indexable($file) {
    // Only enforce a size limit when one has been configured.
    if (!empty($this->config->maxindexfilekb)) {
        $limitbytes = $this->config->maxindexfilekb * 1024;
        if ($file->get_filesize() > $limitbytes) {
            // The file is too big to index.
            return false;
        }
    }

    // We don't index Moodle backup files. There is nothing usefully indexable in them.
    return $file->get_mimetype() != 'application/vnd.moodle.backup';
}
850 | ||
95c6aeaf DM |
/**
 * Asks the Solr client to commit all pending changes.
 *
 * @return void
 */
protected function commit() {
    $client = $this->get_search_client();
    $client->commit();
}
859 | ||
075fa912 EM |
/**
 * Called once a search area has finished indexing; commits the pending changes.
 *
 * Returning false would prevent the search area completed time and stats from being updated.
 * This implementation always commits and then reports the data as indexed.
 *
 * @param \core_search\area\base $searcharea The search area that was complete
 * @param int $numdocs The number of documents that were added to the index
 * @param bool $fullindex True if a full index is being performed
 * @return bool True means that data is considered indexed
 */
public function area_index_complete($searcharea, $numdocs = 0, $fullindex = false) {
    // Make everything added for this area visible in the index.
    $this->commit();

    $indexed = true;
    return $indexed;
}
875 | ||
cd894f84 EM |
/**
 * Return true if file indexing is supported and enabled. False otherwise.
 *
 * The setting is read with !empty() so that a missing fileindexing config value is
 * treated as "disabled" instead of raising an undefined property notice.
 *
 * @return bool
 */
public function file_indexing_enabled() {
    return !empty($this->config->fileindexing);
}
884 | ||
95c6aeaf DM |
/**
 * Defragments the index.
 *
 * @return void
 */
public function optimize() {
    // Positional arguments of SolrClient::optimize(), named here for readability.
    $maxsegments = 1;
    $softcommit = true;
    $waitsearcher = false;

    $this->get_search_client()->optimize($maxsegments, $softcommit, $waitsearcher);
}
893 | ||
/**
 * Deletes the specified document, then commits.
 *
 * @param string $id The document id to delete
 * @return void
 */
public function delete_by_id($id) {
    $client = $this->get_search_client();

    // Deleting by solr_filegroupingid removes the item and all of its related files in one go.
    $query = 'solr_filegroupingid:' . $id;
    $client->deleteByQuery($query);

    $this->commit();
}
905 | ||
/**
 * Deletes the documents of one search area, or every document when no area is given.
 *
 * @param string $areaid The area to clear; a falsy value clears the whole index
 * @return void
 */
public function delete($areaid = null) {
    // Restrict the deletion to the area when one was passed, otherwise match everything.
    $query = $areaid ? 'areaid:' . $areaid : '*:*';

    $this->get_search_client()->deleteByQuery($query);
    $this->commit();
}
920 | ||
/**
 * Pings the Solr server using search_solr config
 *
 * Checks, in order: that a hostname and index name are configured, that a client can be
 * obtained, that the server answers a ping, and that the schema setup validates.
 *
 * @return true|string Returns true if all good or an error string.
 */
public function is_server_ready() {
    if (empty($this->config->server_hostname) || empty($this->config->indexname)) {
        return 'No solr configuration found';
    }

    // Ask for the client without triggering an exception so the failure can be reported as a string.
    $solrclient = $this->get_search_client(false);
    if (!$solrclient) {
        return get_string('engineserverstatus', 'search');
    }

    try {
        @$solrclient->ping();
    } catch (\SolrClientException $clienterror) {
        return 'Solr client error: ' . $clienterror->getMessage();
    } catch (\SolrServerException $servererror) {
        return 'Solr server error: ' . $servererror->getMessage();
    }

    // Check that setup schema has already run.
    try {
        $solrschema = new \search_solr\schema();
        $solrschema->validate_setup();
    } catch (\moodle_exception $schemaerror) {
        return $schemaerror->getMessage();
    }

    return true;
}
954 | ||
/**
 * Checks if the PHP Solr extension is available.
 *
 * Availability is detected through the presence of the solr_get_version() function.
 *
 * @return bool
 */
public function is_installed() {
    $extensionavailable = function_exists('solr_get_version');

    return $extensionavailable;
}
963 | ||
/**
 * Returns the solr client instance.
 *
 * A client already stored on this object is returned as is. A newly built client is only stored
 * for reuse when $this->cacheclient is set.
 * We don't reuse SolrClient if we are on libcurl 7.35.0, due to a bug in that version of curl.
 *
 * @throws \core_search\engine_exception If the client cannot be created and $triggerexception is true
 * @param bool $triggerexception Throw an exception (true) or return false (false) when the client cannot be created
 * @return \SolrClient|false The client, or false when it could not be created and $triggerexception is false
 */
protected function get_search_client($triggerexception = true) {

    // Reuse the client if one has been stored already.
    if ($this->client !== null) {
        return $this->client;
    }

    $options = array(
        'hostname' => $this->config->server_hostname,
        'path' => '/solr/' . $this->config->indexname,
        'login' => !empty($this->config->server_username) ? $this->config->server_username : '',
        'password' => !empty($this->config->server_password) ? $this->config->server_password : '',
        'port' => !empty($this->config->server_port) ? $this->config->server_port : '',
        'secure' => !empty($this->config->secure) ? true : false,
        'ssl_cert' => !empty($this->config->ssl_cert) ? $this->config->ssl_cert : '',
        'ssl_key' => !empty($this->config->ssl_key) ? $this->config->ssl_key : '',
        'ssl_keypassword' => !empty($this->config->ssl_keypassword) ? $this->config->ssl_keypassword : '',
        'ssl_cainfo' => !empty($this->config->ssl_cainfo) ? $this->config->ssl_cainfo : '',
        'ssl_capath' => !empty($this->config->ssl_capath) ? $this->config->ssl_capath : '',
        'timeout' => !empty($this->config->server_timeout) ? $this->config->server_timeout : '30'
    );

    try {
        $client = new \SolrClient($options);
    } catch (\SolrException $e) {
        // The SolrClient constructor reports failure by throwing, it never evaluates to false,
        // so this is the only place where $triggerexception can be honoured.
        debugging('Solr client could not be created: ' . $e->getMessage(), DEBUG_DEVELOPER);
        if ($triggerexception) {
            throw new \core_search\engine_exception('engineserverstatus', 'search');
        }
        return false;
    }

    if ($this->cacheclient) {
        $this->client = $client;
    }

    return $client;
}
5dc4624c EM |
1007 | |
/**
 * Returns a curl object for connecting to solr.
 *
 * The object is built on first use and then reused from $this->curl. SSL options are only
 * applied when the secure setting is on, and HTTP basic authentication is only added when
 * both a username and a password are configured.
 *
 * @return \curl
 */
public function get_curl_object() {
    if (!is_null($this->curl)) {
        return $this->curl;
    }

    $this->curl = new \curl();

    $options = array();
    // Build the SSL options. Based on pecl-solr and general testing.
    if (!empty($this->config->secure)) {
        if (!empty($this->config->ssl_cert)) {
            $options['CURLOPT_SSLCERT'] = $this->config->ssl_cert;
            $options['CURLOPT_SSLCERTTYPE'] = 'PEM';
        }

        if (!empty($this->config->ssl_key)) {
            $options['CURLOPT_SSLKEY'] = $this->config->ssl_key;
            $options['CURLOPT_SSLKEYTYPE'] = 'PEM';
        }

        if (!empty($this->config->ssl_keypassword)) {
            $options['CURLOPT_KEYPASSWD'] = $this->config->ssl_keypassword;
        }

        if (!empty($this->config->ssl_cainfo)) {
            $options['CURLOPT_CAINFO'] = $this->config->ssl_cainfo;
        }

        if (!empty($this->config->ssl_capath)) {
            $options['CURLOPT_CAPATH'] = $this->config->ssl_capath;
        }
    }

    $this->curl->setopt($options);

    if (!empty($this->config->server_username) && !empty($this->config->server_password)) {
        $authorization = $this->config->server_username . ':' . $this->config->server_password;
        // The curl class setHeader() takes one complete header line, so the name and the
        // value must be passed together or the credentials are never sent.
        $this->curl->setHeader('Authorization: Basic ' . base64_encode($authorization));
    }

    return $this->curl;
}
1055 | ||
/**
 * Return a Moodle url object for the server connection.
 *
 * The url is assembled as protocol://hostname[:port]/solr/indexname/path from the engine config.
 *
 * @param string $path The solr path to append.
 * @return \moodle_url
 */
public function get_connection_url($path) {
    // Must use the proper protocol, or SSL will fail.
    $scheme = empty($this->config->secure) ? 'http' : 'https';
    $host = rtrim($this->config->server_hostname, '/');
    // The port part is only present when a port has been configured.
    $port = empty($this->config->server_port) ? '' : ':' . $this->config->server_port;
    $endpoint = ltrim($path, '/');

    $address = $scheme . '://' . $host . $port . '/solr/' . $this->config->indexname . '/' . $endpoint;

    return new \moodle_url($address);
}
95c6aeaf | 1073 | } |