weekly release 3.1dev
[moodle.git] / search / engine / solr / classes / engine.php
CommitLineData
95c6aeaf
DM
1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
16
17/**
18 * Solr engine.
19 *
20 * @package search_solr
21 * @copyright 2015 Daniel Neis Araujo
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23 */
24
25namespace search_solr;
26
27defined('MOODLE_INTERNAL') || die();
28
29/**
30 * Solr engine.
31 *
32 * @package search_solr
33 * @copyright 2015 Daniel Neis Araujo
34 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
35 */
36class engine extends \core_search\engine {
37
38 /**
39 * @var string The date format used by solr.
40 */
41 const DATE_FORMAT = 'Y-m-d\TH:i:s\Z';
42
43 /**
44 * @var int Commit documents interval (number of milliseconds).
45 */
46 const AUTOCOMMIT_WITHIN = 15000;
47
48 /**
4894840d 49 * Highlighting fragsize. Slightly larger than output size (500) to allow for ... appending.
95c6aeaf 50 */
4894840d
EM
51 const FRAG_SIZE = 510;
52
53 /**
54 * Marker for the start of a highlight.
55 */
56 const HIGHLIGHT_START = '@@HI_S@@';
57
58 /**
59 * Marker for the end of a highlight.
60 */
61 const HIGHLIGHT_END = '@@HI_E@@';
95c6aeaf
DM
62
63 /**
64 * @var \SolrClient
65 */
66 protected $client = null;
67
7a4a0bc8
EM
68 /**
69 * @var bool True if we should reuse SolrClients, false if not.
70 */
71 protected $cacheclient = true;
72
5dc4624c
EM
73 /**
74 * @var \curl Direct curl object.
75 */
76 protected $curl = null;
77
95c6aeaf
DM
78 /**
79 * @var array Fields that can be highlighted.
80 */
4894840d 81 protected $highlightfields = array('title', 'content', 'description1', 'description2');
95c6aeaf 82
7a4a0bc8
EM
83 /**
84 * Initialises the search engine configuration.
85 *
86 * @return void
87 */
public function __construct() {
    parent::__construct();

    // Reusing a SolrClient is known to misbehave on the curl 7.35.x series,
    // so client caching is switched off when that version prefix is detected.
    $curlinfo = curl_version();
    $version = isset($curlinfo['version']) ? $curlinfo['version'] : null;
    if ($version !== null && stripos($version, '7.35.') === 0) {
        $this->cacheclient = false;
    }
}
97
95c6aeaf
DM
98 /**
99 * Prepares a Solr query, applies filters and executes it returning its results.
100 *
101 * @throws \core_search\engine_exception
f6b425e2
EM
102 * @param stdClass $filters Containing query and filters.
103 * @param array $usercontexts Contexts where the user has access. True if the user can access all contexts.
95c6aeaf
DM
104 * @return \core_search\document[] Results, or an empty array if there are no results or the query fails
105 */
public function execute_query($filters, $usercontexts) {
    global $USER;

    // Operate on a copy so the caller's filter object is never modified.
    $data = clone $filters;

    // Fetch the client up front: any connection problem should surface before more work is done.
    $client = $this->get_search_client();

    if ($this->is_server_ready() !== true) {
        throw new \core_search\engine_exception('engineserverstatus', 'search');
    }

    $query = new \SolrDisMaxQuery();

    // Grouped file results are collapsed afterwards, so twice as many rows are requested when file indexing is on.
    $maxrows = $this->file_indexing_enabled()
        ? \core_search\manager::MAX_RESULTS * 2
        : \core_search\manager::MAX_RESULTS;
    $this->set_query($query, $data->q, $maxrows);
    $this->add_fields($query);

    // The filters below are specific to this search, so they are flagged as non-cacheable;
    // only the context restriction further down is left cacheable.
    if (!empty($data->title)) {
        $query->addFilterQuery('{!field cache=false f=title}' . $data->title);
    }
    if (!empty($data->areaids)) {
        // A document matching any of the requested areas is acceptable.
        $query->addFilterQuery('{!cache=false}areaid:(' . implode(' OR ', $data->areaids) . ')');
    }
    if (!empty($data->courseids)) {
        $query->addFilterQuery('{!cache=false}courseid:(' . implode(' OR ', $data->courseids) . ')');
    }

    // Modification time range; a missing bound becomes the open-ended '*'.
    $hasstart = !empty($data->timestart);
    $hasend = !empty($data->timeend);
    if ($hasstart || $hasend) {
        $from = $hasstart ? \search_solr\document::format_time_for_engine($data->timestart) : '*';
        $to = $hasend ? \search_solr\document::format_time_for_engine($data->timeend) : '*';
        $query->addFilterQuery('{!cache=false}modified:[' . $from . ' TO ' . $to . ']');
    }

    // Only documents without an owner, or owned by the current user, may be returned.
    $query->addFilterQuery('owneruserid:(' . \core_search\manager::NO_OWNER_ID . ' OR ' . $USER->id . ')');

    // Context restriction. When $usercontexts is not an array (e.g. true) the user can access
    // every context and no filter is required.
    if ($usercontexts && is_array($usercontexts)) {
        // Flatten the per-area context lists into one de-duplicated list.
        $allcontexts = array();
        foreach ($usercontexts as $areaid => $areacontexts) {
            $areawanted = empty($data->areaids) || in_array($areaid, $data->areaids);
            if (!$areawanted) {
                // This area was not requested, so its contexts are irrelevant.
                continue;
            }
            foreach ($areacontexts as $contextid) {
                // Keyed by id so duplicates collapse.
                $allcontexts[$contextid] = $contextid;
            }
        }
        if (empty($allcontexts)) {
            // No accessible context is left, so there cannot be any result.
            return array();
        }
        $query->addFilterQuery('contextid:(' . implode(' OR ', $allcontexts) . ')');
    }

    try {
        if ($this->file_indexing_enabled()) {
            // Group records by solr_filegroupingid, keeping at most 3 documents per group.
            $query->setGroup(true);
            $query->setGroupLimit(3);
            $query->addGroupField('solr_filegroupingid');
            return $this->grouped_files_query_response($client->query($query));
        }
        return $this->query_response($client->query($query));
    } catch (\SolrClientException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array();
    } catch (\SolrServerException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array();
    }
}
205
206 /**
207 * Prepares a new query by setting the query, start offset and rows to return.
3744ceb6 208 *
95c6aeaf 209 * @param SolrQuery $query
cd894f84
EM
210 * @param string $q The query string to run.
211 * @param null|int $maxresults The number of results to limit. manager::MAX_RESULTS if not set.
95c6aeaf 212 */
cd894f84
EM
protected function set_query($query, $q, $maxresults = null) {
    // Fall back to the manager-wide limit when no usable row count was supplied.
    $rowlimit = is_numeric($maxresults) ? $maxresults : \core_search\manager::MAX_RESULTS;

    // Highlighting: enabled for every highlightable field, using this class's markers and fragment size.
    $query->setHighlight(true);
    foreach ($this->highlightfields as $highlightfield) {
        $query->addHighlightField($highlightfield);
    }
    $query->setHighlightFragsize(static::FRAG_SIZE);
    $query->setHighlightSimplePre(self::HIGHLIGHT_START);
    $query->setHighlightSimplePost(self::HIGHLIGHT_END);
    $query->setHighlightMergeContiguous(true);

    $query->setQuery($q);

    // Cap the number of rows returned.
    $query->setRows($rowlimit);
}
233
234 /**
235 * Sets fields to be returned in the result.
236 *
3744ceb6 237 * @param SolrDisMaxQuery|SolrQuery $query object.
95c6aeaf
DM
238 */
public function add_fields($query) {
    $documentclass = $this->get_document_classname();
    $definitions = $documentclass::get_default_fields_definition();

    // Query fields only exist on dismax queries.
    $isdismax = ($query instanceof \SolrDisMaxQuery);

    foreach ($definitions as $fieldname => $definition) {
        // Every default field is returned in the results.
        $query->addField($fieldname);

        if (!$isdismax || empty($definition['mainquery'])) {
            continue;
        }
        // Fields flagged as 'mainquery' are the ones the main query is run against.
        $query->addQueryField($fieldname);
    }
}
256
257 /**
258 * Finds the key common to both highlighting and docs array returned from response.
259 * @param object $response containing results.
260 */
public function add_highlight_content($response) {
    if (!isset($response->highlighting)) {
        // There is no highlighting to add.
        return;
    }

    if (empty($response->response->docs)) {
        // No documents to merge highlighting into; avoids iterating over a non-array value.
        return;
    }

    $highlighting = $response->highlighting;
    foreach ($response->response->docs as $doc) {
        // Highlighting entries are keyed by the document id.
        $docid = $doc->id;

        // A document may have no highlighting entry. In that case an empty object is passed on,
        // so the multivalued-field validation still runs but no field value is replaced.
        $highlighteddoc = isset($highlighting->$docid) ? $highlighting->$docid : new \stdClass();

        $this->merge_highlight_field_values($doc, $highlighteddoc);
    }
}
274
275 /**
276 * Adds the highlighting array values to docs array values.
277 *
278 * @throws \core_search\engine_exception
279 * @param object $doc containing the results.
280 * @param object $highlighteddoc containing the highlighted results values.
281 */
public function merge_highlight_field_values($doc, $highlighteddoc) {
    foreach ($this->highlightfields as $field) {
        if (empty($doc->$field)) {
            // Nothing stored in this field for the document.
            continue;
        }

        // Multivalued solr fields cannot be merged with highlighting, so they are rejected outright.
        if (is_array($doc->{$field})) {
            throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $field);
        }

        if (empty($highlighteddoc->$field)) {
            // No highlighted version available, the stored value is kept.
            continue;
        }

        // Swap the stored value for the first highlighted fragment.
        $doc->$field = reset($highlighteddoc->$field);
    }
}
299
300 /**
301 * Filters the response on Moodle side.
302 *
303 * @param object $queryresponse containing the response return from solr server.
304 * @return array $results containing final results to be displayed.
305 */
public function query_response($queryresponse) {
    global $USER;

    $userid = $USER->id;
    $noownerid = \core_search\manager::NO_OWNER_ID;

    $response = $queryresponse->getResponse();
    $numgranted = 0;

    if (!$docs = $response->response->docs) {
        return array();
    }

    if (empty($response->response->numFound)) {
        return $docs;
    }

    $this->add_highlight_content($response);

    // Walk the results, dropping every entry the current user must not see.
    foreach ($docs as $key => $docdata) {
        $ownerid = $docdata['owneruserid'];
        if ($ownerid != $noownerid && $ownerid != $userid) {
            // Documents with an owner are only visible to that owner.
            unset($docs[$key]);
            continue;
        }

        if (!$searcharea = $this->get_search_area($docdata->areaid)) {
            unset($docs[$key]);
            continue;
        }

        $docdata = $this->standarize_solr_obj($docdata);

        $access = $searcharea->check_access($docdata['itemid']);
        if ($access == \core_search\manager::ACCESS_DELETED) {
            // The item is flagged as deleted, so it is removed from the index as well as the results.
            $this->delete_by_id($docdata['id']);
            unset($docs[$key]);
        } else if ($access == \core_search\manager::ACCESS_DENIED) {
            unset($docs[$key]);
        } else if ($access == \core_search\manager::ACCESS_GRANTED) {
            $numgranted++;

            // Replace the raw solr data by a proper document.
            $docs[$key] = $this->to_document($searcharea, $docdata);
        }

        // This should never happen.
        if ($numgranted >= \core_search\manager::MAX_RESULTS) {
            $docs = array_slice($docs, 0, \core_search\manager::MAX_RESULTS, true);
            break;
        }
    }

    return $docs;
}
364
cd894f84
EM
365 /**
366 * Processes grouped file results into documents, with attached matching files.
367 *
368 * @param SolrQueryResponse $queryresponse The response returned from solr server
369 * @return array Final results to be displayed.
370 */
protected function grouped_files_query_response($queryresponse) {
    $response = $queryresponse->getResponse();

    // If we can't find the grouping, or there are no matches in the grouping, return empty.
    if (!isset($response->grouped->solr_filegroupingid) || empty($response->grouped->solr_filegroupingid->matches)) {
        return array();
    }

    $numgranted = 0;
    $orderedids = array();
    $completedocs = array();
    $incompletedocs = array();

    // Highlighting is optional in the response (add_highlight_content() treats it the same way),
    // so it is only read when present.
    $highlightingobj = isset($response->highlighting) ? $response->highlighting : null;

    // Each group represents a "master document".
    $groups = $response->grouped->solr_filegroupingid->groups;
    foreach ($groups as $group) {
        $groupid = $group->groupValue;
        $groupdocs = $group->doclist->docs;

        if (empty($groupdocs)) {
            // A group without documents has nothing to check access against or to display.
            continue;
        }
        $firstdoc = reset($groupdocs);

        if (!$searcharea = $this->get_search_area($firstdoc->areaid)) {
            // The search area is not available, so the group cannot be processed.
            continue;
        }

        // Access is checked once per group, using the first document returned for it.
        $access = $searcharea->check_access($firstdoc->itemid);
        if ($access == \core_search\manager::ACCESS_DELETED) {
            // If deleted from Moodle, delete from index and then move on to the next group.
            $this->delete_by_id($firstdoc->id);
            continue;
        }
        if ($access == \core_search\manager::ACCESS_DENIED) {
            // This means we should just skip for the current user.
            continue;
        }
        $numgranted++;

        // Separate the main document (its id equals the group id) from any file documents.
        $maindoc = false;
        $fileids = array();
        foreach ($groupdocs as $groupdoc) {
            if ($groupdoc->id == $groupid) {
                $maindoc = $groupdoc;
            } else if (isset($groupdoc->solr_fileid)) {
                $fileids[] = $groupdoc->solr_fileid;
            }
        }

        // Store the id of this group, in order, for later merging.
        $orderedids[] = $groupid;

        if (!$maindoc) {
            // We don't have the main doc, store what we know for later building.
            $incompletedocs[$groupid] = $fileids;
        } else {
            if (isset($highlightingobj->$groupid)) {
                // Merge the highlighting for this doc.
                $this->merge_highlight_field_values($maindoc, $highlightingobj->$groupid);
            }
            $docdata = $this->standarize_solr_obj($maindoc);
            $doc = $this->to_document($searcharea, $docdata);
            // Now we need to attach the result files to the doc.
            foreach ($fileids as $fileid) {
                $doc->add_stored_file($fileid);
            }
            $completedocs[$groupid] = $doc;
        }

        if ($numgranted >= \core_search\manager::MAX_RESULTS) {
            // We have hit the max results, we will just ignore the rest.
            break;
        }
    }

    // Fetch the main documents for groups where only file documents were returned.
    $incompletedocs = $this->get_missing_docs($incompletedocs);

    $out = array();
    // Now merge the complete and incomplete documents, in results order.
    foreach ($orderedids as $docid) {
        if (isset($completedocs[$docid])) {
            $out[] = $completedocs[$docid];
        } else if (isset($incompletedocs[$docid])) {
            $out[] = $incompletedocs[$docid];
        }
    }

    return $out;
}
464
465 /**
466 * Retrieve any missing main documents and attach provided files.
467 *
468 * The missingdocs array should be an array, indexed by document id, of main documents we need to retrieve. The value
469 * associated to the key should be an array of stored_files or stored file ids to attach to the result document.
470 *
471 * Return array also indexed by document id.
472 *
473 * @param array() $missingdocs An array, indexed by document id, with arrays of files/ids to attach.
474 * @return document[]
475 */
protected function get_missing_docs($missingdocs) {
    if (empty($missingdocs)) {
        return array();
    }

    $wantedids = array_keys($missingdocs);

    // One query, restricted by id, fetches every missing main document at once.
    $query = new \SolrQuery();
    $this->set_query($query, '*', count($wantedids));
    $this->add_fields($query);
    $query->addFilterQuery('{!cache=false}id:(' . implode(' OR ', $wantedids) . ')');

    try {
        $results = $this->query_response($this->get_search_client()->query($query));
    } catch (\SolrClientException $ex) {
        return array();
    } catch (\SolrServerException $ex) {
        return array();
    }

    $found = array();
    foreach ($results as $result) {
        $resultid = $result->get('id');
        if (isset($missingdocs[$resultid])) {
            // Attach the files that were returned for this document's group.
            foreach ($missingdocs[$resultid] as $filedoc) {
                $result->add_stored_file($filedoc);
            }
            $found[$resultid] = $result;
        }
        // Results that were not asked for are ignored.
    }

    return $found;
}
513
95c6aeaf
DM
514 /**
515 * Returns a standard php array from a \SolrObject instance.
516 *
517 * @param \SolrObject $obj
518 * @return array The returned document as an array.
519 */
public function standarize_solr_obj(\SolrObject $obj) {
    $docdata = array();
    foreach ($obj->getPropertyNames() as $rawname) {
        // Property names are trimmed before use, see
        // http://php.net/manual/en/solrobject.getpropertynames.php#98018.
        $key = trim($rawname);
        $docdata[$key] = $obj->offsetGet($key);
    }
    return $docdata;
}
531
532 /**
533 * Adds a document to the search engine.
534 *
535 * This does not commit to the search engine.
536 *
091973db
EM
537 * @param document $document
538 * @param bool $fileindexing True if file indexing is to be used
539 * @return bool
95c6aeaf 540 */
091973db
EM
public function add_document($document, $fileindexing = false) {
    $docdata = $document->export_for_engine();

    if ($this->add_solr_document($docdata)) {
        if ($fileindexing) {
            // Bring the indexed files of this document in line with its current attachments.
            $this->process_document_files($document);
        }
        return true;
    }

    // The text document could not be added, so its files are not processed either.
    return false;
}
95c6aeaf 555
091973db
EM
556 /**
557 * Adds a text document to the search engine.
558 *
cd894f84 559 * @param array $doc
091973db
EM
560 * @return bool
561 */
protected function add_solr_document($doc) {
    // Copy every field/value pair into a solr input document.
    $solrdoc = new \SolrInputDocument();
    foreach ($doc as $fieldname => $fieldvalue) {
        $solrdoc->addField($fieldname, $fieldvalue);
    }

    try {
        $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN);
    } catch (\SolrClientException $e) {
        debugging('Solr client error adding document with id ' . $doc['id'] . ': ' . $e->getMessage(), DEBUG_DEVELOPER);
        return false;
    } catch (\SolrServerException $e) {
        // Only the first line of the message is reported, the rest is a java stacktrace.
        $msg = strtok($e->getMessage(), "\n");
        debugging('Solr server error adding document with id ' . $doc['id'] . ': ' . $msg, DEBUG_DEVELOPER);
        return false;
    }

    return true;
}
581
cd894f84
EM
582 /**
583 * Index files attached to the document, ensuring the index matches the current document files.
584 *
585 * For documents that aren't known to be new, we check the index for existing files.
586 * - New files we will add.
587 * - Existing and unchanged files we will skip.
588 * - File that are in the index but not on the document will be deleted from the index.
589 * - Files that have changed will be re-indexed.
590 *
591 * @param document $document
592 */
protected function process_document_files($document) {
    if (!$this->file_indexing_enabled()) {
        return;
    }

    // Batch size used when paging through the already indexed files.
    $rows = 500;

    // Files currently attached to the document. Whatever is left in here at the end gets (re)indexed.
    $files = $document->get_files();

    // Existing documents may already have files in the index, which need to be compared.
    if (!$document->get_is_new()) {
        // The indexed files are read in batches so documents with many files are handled cleanly.
        list($numfound, $indexedfiles) = $this->get_indexed_files($document, 0, $rows);
        $count = 0;
        $idstodelete = array();

        do {
            foreach ($indexedfiles as $indexedfile) {
                $fileid = $indexedfile->solr_fileid;

                if (!isset($files[$fileid])) {
                    // Indexed but no longer attached: remember it for deletion once paging is done,
                    // since deleting now could reorder the remaining results.
                    $idstodelete[] = $indexedfile->id;
                    continue;
                }

                $file = $files[$fileid];

                // Filelib does not guarantee time modified is updated, so the other important values
                // are compared too. The last condition covers files that were previously stored as
                // not indexable but are indexable under the current settings.
                $needsreindex = $indexedfile->modified < $file->get_timemodified()
                    || strcmp($indexedfile->title, $file->get_filename()) !== 0
                    || $indexedfile->solr_filecontenthash != $file->get_contenthash()
                    || ($indexedfile->solr_fileindexstatus == document::INDEXED_FILE_FALSE &&
                        $this->file_is_indexable($file));

                if (!$needsreindex) {
                    // Already indexed and unchanged, nothing to do for this file.
                    unset($files[$fileid]);
                }
            }
            $count += $rows;

            if ($count < $numfound) {
                // There are more indexed files than processed so far, fetch the next batch.
                list($numfound, $indexedfiles) = $this->get_indexed_files($document, $count, $rows);
            }

        } while ($count < $numfound);

        // Remove the files that are no longer attached. The client is used directly because
        // the engine's delete_by_id() won't work on file docs.
        foreach ($idstodelete as $staleid) {
            $this->get_search_client()->deleteById($staleid);
        }
    }

    // Index everything that is new or changed.
    foreach ($files as $file) {
        $this->add_stored_file($document, $file);
    }
}
664
665 /**
666 * Get the currently indexed files for a particular document, returns the total count, and a subset of files.
667 *
668 * @param document $document
669 * @param int $start The row to start the results on. Zero indexed.
670 * @param int $rows The number of rows to fetch
671 * @return array A two element array, the first is the total number of available results, the second is an array
672 * of documents for the current request.
673 */
protected function get_indexed_files($document, $start = 0, $rows = 500) {
    // A stripped down query is built by hand: it only needs the file records of one document.
    $query = new \SolrQuery();
    $query->setQuery('*');
    $query->setRows($rows);
    $query->setStart($start);
    // Sorting by id keeps paging consistent between requests.
    $query->addSortField('id');

    // Only the fields needed to compare indexed files with stored files are requested.
    $wantedfields = array('id', 'modified', 'title', 'solr_fileid', 'solr_filecontenthash', 'solr_fileindexstatus');
    foreach ($wantedfields as $fieldname) {
        $query->addField($fieldname);
    }

    // Restrict to file-type records in this document's file grouping.
    $query->addFilterQuery('{!cache=false}solr_filegroupingid:(' . $document->get('id') . ')');
    $query->addFilterQuery('type:' . \core_search\manager::TYPE_FILE);

    try {
        $responsedoc = $this->get_search_client()->query($query)->getResponse();

        if (empty($responsedoc->response->numFound)) {
            return array(0, array());
        }

        return array($responsedoc->response->numFound, $this->convert_file_results($responsedoc));
    } catch (\SolrClientException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array(0, array());
    } catch (\SolrServerException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array(0, array());
    }
}
717
718 /**
719 * A very lightweight handler for getting information about already indexed files from a Solr response.
720 *
721 * @param SolrObject $responsedoc A Solr response document
722 * @return stdClass[] An array of objects that contain the basic information for file processing.
723 */
protected function convert_file_results($responsedoc) {
    if (!$docs = $responsedoc->response->docs) {
        return array();
    }

    $converted = array();
    foreach ($docs as $doc) {
        // Keep just the values needed for file processing; the modified time is converted
        // from the engine format.
        $converted[] = (object) array(
            'id' => $doc->id,
            'modified' => document::import_time_from_engine($doc->modified),
            'title' => $doc->title,
            'solr_fileid' => $doc->solr_fileid,
            'solr_filecontenthash' => $doc->solr_filecontenthash,
            'solr_fileindexstatus' => $doc->solr_fileindexstatus,
        );
    }

    return $converted;
}
745
746 /**
747 * Adds a file to the search engine.
748 *
749 * Notes about Solr and Tika indexing. We do not send the mime type, only the filename.
750 * Tika has much better content type detection than Moodle, and we will have many more doc failures
751 * if we try to send mime types.
752 *
753 * @param document $document
754 * @param \stored_file $storedfile
755 * @return void
756 */
protected function add_stored_file($document, $storedfile) {
    $filedoc = $document->export_file_for_engine($storedfile);

    if (!$this->file_is_indexable($storedfile)) {
        // Files that are not indexable still get a reference record, without their content.
        $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE;
        $this->add_solr_document($filedoc);
        return;
    }

    $curl = $this->get_curl_object();
    $url = $this->get_connection_url('/update/extract');

    // Stops solr from automatically creating a field for every tika output.
    $url->param('uprefix', 'ignored_');

    // Controls how content is captured, keeping the file content clean of non-important metadata.
    $url->param('captureAttr', 'true');
    // The extracted content goes to the field used for indexing.
    $url->param('fmap.content', 'solr_filecontent');

    // Common tika fields that match the standard *_point dynamic field and would cause an error.
    $url->param('fmap.media_white_point', 'ignored_mwp');
    $url->param('fmap.media_black_point', 'ignored_mbp');

    // Each document value is sent as a literal under a temporary name and mapped back to the real
    // field, so tika output with the same name cannot overwrite it or cause errors.
    foreach ($filedoc as $key => $value) {
        // Discard any tika field that matches one of our schema fields.
        $url->param('fmap.'.$key, 'ignored_'.$key);
        // Our value, under a temporary name.
        $url->param('literal.mdltmp_'.$key, $value);
        // And the mapping to the final field.
        $url->param('fmap.mdltmp_'.$key, $key);
    }

    // Tika gets the true filename.
    $url->param('resource.name', $storedfile->get_filename());

    // The request itself. Everything after the post is error reporting, which is informational only
    // since results are not tracked per file/doc.
    try {
        $result = $curl->post($url->out(false), array('myfile' => $storedfile));

        $code = $curl->get_errno();
        $info = $curl->get_info();

        if ($code != 0) {
            // An internal cURL error occurred, the error text is in the result.
            $message = 'Curl error '.$code.' while indexing file with document id '.$filedoc['id'].': '.$result.'.';
            debugging($message, DEBUG_DEVELOPER);
        } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) {
            // Unexpected HTTP response code.
            $message = 'Error while indexing file with document id '.$filedoc['id'];
            // Use the error message from msg or title when one can be found.
            if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) {
                $message .= ': '.$matches[1];
            } else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) {
                $message .= ': '.$matches[1];
            }
            // This happens whenever a file fails to index for any reason, so it is reported more quietly.
            if (CLI_SCRIPT && !PHPUNIT_TEST) {
                mtrace($message);
            }
        } else if (!preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $result, $matches)) {
            // The response does not contain the expected status field.
            $message = 'Unexpected Solr response while indexing file with document id '.$filedoc['id'].': ';
            $message .= strtok($result, "\n");
            debugging($message, DEBUG_DEVELOPER);
        } else if ((int)$matches[1] !== 0) {
            // A status other than 0 is an error.
            $message = 'Unexpected Solr status code '.(int)$matches[1];
            $message .= ' while indexing file with document id '.$filedoc['id'].'.';
            debugging($message, DEBUG_DEVELOPER);
        } else {
            // The document was successfully indexed.
            return;
        }
    } catch (\Exception $e) {
        // Per-file success is not tracked, so the error is reported and processing carries on.
        debugging('Unknown exception while indexing file "'.$storedfile->get_filename().'".', DEBUG_DEVELOPER);
    }

    // Reaching this point means the file content was not indexed, so only the base info is stored.
    $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR;
    $this->add_solr_document($filedoc);
}
851
852 /**
853 * Checks to see if a passed file is indexable.
854 *
855 * @param \stored_file $file The file to check
856 * @return bool True if the file can be indexed
857 */
protected function file_is_indexable($file) {
    // Size limit, only enforced when a maximum has been configured.
    if (!empty($this->config->maxindexfilekb)) {
        $maxbytes = $this->config->maxindexfilekb * 1024;
        if ($file->get_filesize() > $maxbytes) {
            return false;
        }
    }

    // Moodle backup files are never indexed, every other mime type is accepted.
    return $file->get_mimetype() != 'application/vnd.moodle.backup';
}
873
95c6aeaf
DM
874 /**
875 * Commits all pending changes.
876 *
877 * @return void
878 */
protected function commit() {
    // Ask the solr client to commit whatever changes are pending.
    $client = $this->get_search_client();
    $client->commit();
}
882
075fa912
EM
883 /**
884 * Do any area cleanup needed, and do anything to confirm contents.
885 *
886 * Return false to prevent the search area completed time and stats from being updated.
887 *
888 * @param \core_search\area\base $searcharea The search area that was complete
889 * @param int $numdocs The number of documents that were added to the index
890 * @param bool $fullindex True if a full index is being performed
891 * @return bool True means that data is considered indexed
892 */
public function area_index_complete($searcharea, $numdocs = 0, $fullindex = false) {
    // Nothing area-specific is done here: pending changes are committed
    // and the area is always reported as indexed.
    $this->commit();

    return true;
}
898
cd894f84
EM
899 /**
900 * Return true if file indexing is supported and enabled. False otherwise.
901 *
902 * @return bool
903 */
public function file_indexing_enabled() {
    // Use !empty() rather than a (bool) cast: the truthiness is identical for every value,
    // but a missing fileindexing setting yields false instead of an undefined property notice.
    // This also matches how the other config settings are read in this class.
    return !empty($this->config->fileindexing);
}
907
95c6aeaf
DM
908 /**
909 * Defragments the index.
910 *
911 * @return void
912 */
public function optimize() {
    // Arguments named after the SolrClient::optimize() parameters they fill.
    $maxsegments = 1;
    $softcommit = true;
    $waitsearcher = false;

    $this->get_search_client()->optimize($maxsegments, $softcommit, $waitsearcher);
}
916
917 /**
918 * Deletes the specified document.
919 *
920 * @param string $id The document id to delete
921 * @return void
922 */
public function delete_by_id($id) {
    // The id is embedded in a Solr query string, so escape query metacharacters first.
    // Otherwise an id containing characters such as ':', ' ', '(' or '*' could produce a
    // malformed query or match (and delete) more documents than intended.
    $groupingid = \SolrUtils::escapeQueryChars($id);

    // We need to make sure we delete the item and all related files, which can be done with solr_filegroupingid.
    $this->get_search_client()->deleteByQuery('solr_filegroupingid:' . $groupingid);
    $this->commit();
}
928
929 /**
930 * Delete all area's documents.
931 *
932 * @param string $areaid
933 * @return void
934 */
public function delete($areaid = null) {
    $client = $this->get_search_client();

    if ($areaid) {
        // The area id is embedded in a Solr query string, so escape query metacharacters
        // to make sure only documents of exactly this area are matched.
        $client->deleteByQuery('areaid:' . \SolrUtils::escapeQueryChars($areaid));
    } else {
        // No area given: remove every document from the index.
        $client->deleteByQuery('*:*');
    }

    $this->commit();
}
943
944 /**
945 * Pings the Solr server using search_solr config
946 *
947 * @return true|string Returns true if all good or an error string.
948 */
public function is_server_ready() {

    // Both a hostname and an index name are required before anything else is attempted.
    $hashost = !empty($this->config->server_hostname);
    $hasindex = !empty($this->config->indexname);
    if (!$hashost || !$hasindex) {
        return 'No solr configuration found';
    }

    // Ask for a client without triggering an exception; a falsy result is reported as a status string.
    $client = $this->get_search_client(false);
    if (!$client) {
        return get_string('engineserverstatus', 'search');
    }

    // Ping the server, turning client/server exceptions into error strings.
    try {
        @$client->ping();
    } catch (\SolrClientException $clienterror) {
        return 'Solr client error: ' . $clienterror->getMessage();
    } catch (\SolrServerException $servererror) {
        return 'Solr server error: ' . $servererror->getMessage();
    }

    // Check that setup schema has already run.
    try {
        $solrschema = new \search_solr\schema();
        $solrschema->validate_setup();
    } catch (\moodle_exception $setuperror) {
        return $setuperror->getMessage();
    }

    return true;
}
977
978 /**
979 * Checks if the PHP Solr extension is available.
980 *
981 * @return bool
982 */
public function is_installed() {
    // The check is simply whether the solr_get_version() function is defined.
    $available = function_exists('solr_get_version');
    return $available;
}
986
987 /**
988 * Returns the solr client instance.
989 *
7a4a0bc8
EM
990 * We don't reuse SolrClient if we are on libcurl 7.35.0, due to a bug in that version of curl.
991 *
95c6aeaf
DM
992 * @throws \core_search\engine_exception
993 * @param bool $triggerexception
994 * @return \SolrClient
995 */
protected function get_search_client($triggerexception = true) {

    // Reuse the previously stored client, if there is one.
    if ($this->client !== null) {
        return $this->client;
    }

    // Connection options, built from the plugin configuration with fallbacks for unset values.
    $options = array(
        'hostname' => $this->config->server_hostname,
        'path' => '/solr/' . $this->config->indexname,
        'login' => !empty($this->config->server_username) ? $this->config->server_username : '',
        'password' => !empty($this->config->server_password) ? $this->config->server_password : '',
        'port' => !empty($this->config->server_port) ? $this->config->server_port : '',
        'secure' => !empty($this->config->secure) ? true : false,
        'ssl_cert' => !empty($this->config->ssl_cert) ? $this->config->ssl_cert : '',
        'ssl_key' => !empty($this->config->ssl_key) ? $this->config->ssl_key : '',
        'ssl_keypassword' => !empty($this->config->ssl_keypassword) ? $this->config->ssl_keypassword : '',
        'ssl_cainfo' => !empty($this->config->ssl_cainfo) ? $this->config->ssl_cainfo : '',
        'ssl_capath' => !empty($this->config->ssl_capath) ? $this->config->ssl_capath : '',
        'timeout' => !empty($this->config->server_timeout) ? $this->config->server_timeout : '30'
    );

    // A "new" expression can never evaluate to false, so comparing the result with false is dead code.
    // The SolrClient constructor reports unusable options by throwing \SolrIllegalArgumentException,
    // so that is what has to be handled in order to honour $triggerexception.
    try {
        $client = new \SolrClient($options);
    } catch (\SolrIllegalArgumentException $e) {
        if ($triggerexception) {
            throw new \core_search\engine_exception('engineserverstatus', 'search');
        }
        // The caller asked not to be interrupted: report the failure with false. Nothing is stored,
        // so a later call will try to build the client again.
        return false;
    }

    // Only keep the client for later calls when client reuse is turned on.
    if ($this->cacheclient) {
        $this->client = $client;
    }

    return $client;
}
5dc4624c
EM
1030
1031 /**
1032 * Returns a curl object for connecting to solr.
1033 *
1034 * @return \curl
1035 */
public function get_curl_object() {
    // Hand back the already-built object when there is one.
    if ($this->curl !== null) {
        return $this->curl;
    }

    $this->curl = new \curl();

    // Config setting => array(curl option receiving the value, companion option set to 'PEM' or null).
    // Based on pecl-solr and general testing.
    $sslmap = array(
        'ssl_cert' => array('CURLOPT_SSLCERT', 'CURLOPT_SSLCERTTYPE'),
        'ssl_key' => array('CURLOPT_SSLKEY', 'CURLOPT_SSLKEYTYPE'),
        'ssl_keypassword' => array('CURLOPT_KEYPASSWD', null),
        'ssl_cainfo' => array('CURLOPT_CAINFO', null),
        'ssl_capath' => array('CURLOPT_CAPATH', null),
    );

    // SSL options are only collected when the secure setting is on; empty settings are skipped.
    $curlopts = array();
    if (!empty($this->config->secure)) {
        foreach ($sslmap as $setting => $names) {
            if (empty($this->config->$setting)) {
                continue;
            }
            list($valueopt, $typeopt) = $names;
            $curlopts[$valueopt] = $this->config->$setting;
            if ($typeopt !== null) {
                $curlopts[$typeopt] = 'PEM';
            }
        }
    }

    $this->curl->setopt($curlopts);

    // A basic authorization header is added only when both username and password are configured.
    $hasuser = !empty($this->config->server_username);
    $haspass = !empty($this->config->server_password);
    if ($hasuser && $haspass) {
        $credentials = $this->config->server_username . ':' . $this->config->server_password;
        $this->curl->setHeader('Authorization', 'Basic ' . base64_encode($credentials));
    }

    return $this->curl;
}
1078
1079 /**
1080 * Return a Moodle url object for the server connection.
1081 *
1082 * @param string $path The solr path to append.
1083 * @return \moodle_url
1084 */
public function get_connection_url($path) {
    // Must use the proper protocol, or SSL will fail.
    $scheme = empty($this->config->secure) ? 'http' : 'https';

    // Host without trailing slashes, followed by the port only when one is configured.
    $host = rtrim($this->config->server_hostname, '/');
    $port = empty($this->config->server_port) ? '' : ':' . $this->config->server_port;

    // Index location plus the requested path, stripped of leading slashes.
    $location = '/solr/' . $this->config->indexname . '/' . ltrim($path, '/');

    return new \moodle_url($scheme . '://' . $host . $port . $location);
}
1095 }
95c6aeaf 1096}