Merge branch 'MDL-53643_master' of git://github.com/dmonllao/moodle
[moodle.git] / search / engine / solr / classes / engine.php
CommitLineData
95c6aeaf
DM
1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
16
17/**
18 * Solr engine.
19 *
20 * @package search_solr
21 * @copyright 2015 Daniel Neis Araujo
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23 */
24
25namespace search_solr;
26
27defined('MOODLE_INTERNAL') || die();
28
29/**
30 * Solr engine.
31 *
32 * @package search_solr
33 * @copyright 2015 Daniel Neis Araujo
34 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
35 */
36class engine extends \core_search\engine {
37
38 /**
39 * @var string The date format used by solr.
40 */
41 const DATE_FORMAT = 'Y-m-d\TH:i:s\Z';
42
43 /**
44 * @var int Commit documents interval (number of milliseconds).
45 */
46 const AUTOCOMMIT_WITHIN = 15000;
47
48 /**
4894840d 49 * Highlighting fragsize. Slightly larger than output size (500) to allow for ... appending.
95c6aeaf 50 */
4894840d
EM
51 const FRAG_SIZE = 510;
52
53 /**
54 * Marker for the start of a highlight.
55 */
56 const HIGHLIGHT_START = '@@HI_S@@';
57
58 /**
59 * Marker for the end of a highlight.
60 */
61 const HIGHLIGHT_END = '@@HI_E@@';
95c6aeaf
DM
62
63 /**
64 * @var \SolrClient
65 */
66 protected $client = null;
67
5dc4624c
EM
68 /**
69 * @var \curl Direct curl object.
70 */
71 protected $curl = null;
72
95c6aeaf
DM
73 /**
74 * @var array Fields that can be highlighted.
75 */
4894840d 76 protected $highlightfields = array('title', 'content', 'description1', 'description2');
95c6aeaf
DM
77
/**
 * Prepares a Solr query, applies filters and executes it returning its results.
 *
 * The filters object is cloned, so the caller's object is never modified.
 *
 * If $usercontexts is an array and it yields no context ids to filter by (either the requested area has no
 * entry, or the lists are empty), an empty result set is returned without querying Solr, because an empty
 * "contextid:()" filter is not a valid query.
 *
 * @throws \core_search\engine_exception
 * @param stdClass $filters Containing query and filters.
 * @param array $usercontexts Contexts where the user has access. True if the user can access all contexts.
 * @return \core_search\document[] Results, or an empty array if there are no results or the query failed.
 */
public function execute_query($filters, $usercontexts) {
    global $USER;

    // Let's keep these changes internal.
    $data = clone $filters;

    // If there is any problem we trigger the exception as soon as possible.
    $this->client = $this->get_search_client();

    $serverstatus = $this->is_server_ready();
    if ($serverstatus !== true) {
        throw new \core_search\engine_exception('engineserverstatus', 'search');
    }

    $fileindexing = $this->file_indexing_enabled();

    $query = new \SolrQuery();
    $maxrows = \core_search\manager::MAX_RESULTS;
    if ($fileindexing) {
        // When using file indexing and grouping, we are going to collapse results, so we want extra results.
        $maxrows *= 2;
    }
    $this->set_query($query, $data->q, $maxrows);
    $this->add_fields($query);

    // Search filters applied, we don't cache these filters as we don't want to pollute the cache with tmp filters
    // we are really interested in caching contexts filters instead.
    if (!empty($data->title)) {
        $query->addFilterQuery('{!field cache=false f=title}' . $data->title);
    }
    if (!empty($data->areaid)) {
        // Even if it is only supposed to contain PARAM_ALPHANUMEXT, better to prevent.
        $query->addFilterQuery('{!field cache=false f=areaid}' . $data->areaid);
    }

    if (!empty($data->timestart) || !empty($data->timeend)) {
        // An open-ended side of the range is expressed with the '*' wildcard.
        $timestart = '*';
        if (!empty($data->timestart)) {
            $timestart = \search_solr\document::format_time_for_engine($data->timestart);
        }
        $timeend = '*';
        if (!empty($data->timeend)) {
            $timeend = \search_solr\document::format_time_for_engine($data->timeend);
        }

        // No cache.
        $query->addFilterQuery('{!cache=false}modified:[' . $timestart . ' TO ' . $timeend . ']');
    }

    // Restrict to users who are supposed to be able to see a particular result.
    $query->addFilterQuery('owneruserid:(' . \core_search\manager::NO_OWNER_ID . ' OR ' . $USER->id . ')');

    // And finally restrict it to the context where the user can access, we want this one cached.
    // If the user can access all contexts $usercontexts value is just true, we don't need to filter
    // in that case.
    if ($usercontexts && is_array($usercontexts)) {
        $allcontexts = array();
        if (!empty($data->areaid)) {
            // Only the contexts of the requested area are relevant. The area may be missing from the list.
            if (!empty($usercontexts[$data->areaid]) && is_array($usercontexts[$data->areaid])) {
                $allcontexts = $usercontexts[$data->areaid];
            }
        } else {
            // Join all area contexts into a single array.
            foreach ($usercontexts as $areacontexts) {
                foreach ($areacontexts as $contextid) {
                    // Ensure they are unique.
                    $allcontexts[$contextid] = $contextid;
                }
            }
        }

        if (empty($allcontexts)) {
            // There are no contexts to search in, so there can not be any results. We must not
            // send 'contextid:()' to Solr as it is not a valid query.
            return array();
        }
        $query->addFilterQuery('contextid:(' . implode(' OR ', $allcontexts) . ')');
    }

    try {
        if ($fileindexing) {
            // Now group records by solr_filegroupingid. Limit to 3 results per group.
            $query->setGroup(true);
            $query->setGroupLimit(3);
            $query->addGroupField('solr_filegroupingid');
            return $this->grouped_files_query_response($this->client->query($query));
        } else {
            return $this->query_response($this->client->query($query));
        }
    } catch (\SolrClientException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array();
    } catch (\SolrServerException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array();
    }
}
178
/**
 * Initialises a query: highlighting options, the query string and the maximum number of rows.
 *
 * @param SolrQuery $query The query object to configure.
 * @param object $q The query passed straight to SolrQuery::setQuery().
 * @param null|int $maxresults The number of results to limit. manager::MAX_RESULTS if not numeric.
 */
protected function set_query($query, $q, $maxresults = null) {
    // Highlighting: every highlightable field, with our own start/end markers.
    $query->setHighlight(true);
    foreach ($this->highlightfields as $highlightfield) {
        $query->addHighlightField($highlightfield);
    }
    $query->setHighlightFragsize(static::FRAG_SIZE);
    $query->setHighlightSimplePre(self::HIGHLIGHT_START);
    $query->setHighlightSimplePost(self::HIGHLIGHT_END);
    $query->setHighlightMergeContiguous(true);

    $query->setQuery($q);

    // A reasonable max: fall back to the manager limit when no usable value was provided.
    $rows = is_numeric($maxresults) ? $maxresults : \core_search\manager::MAX_RESULTS;
    $query->setRows($rows);
}
205
/**
 * Requests every field of the document class' default fields definition in the query results.
 *
 * @param SolrQuery $query The query object the fields are added to.
 */
public function add_fields($query) {
    $classname = $this->get_document_classname();
    $definition = $classname::get_default_fields_definition();
    foreach ($definition as $fieldname => $unused) {
        // Only the field names (the definition keys) are of interest here.
        $query->addField($fieldname);
    }
}
218
/**
 * Merges the highlighting section of a response into the matching result docs.
 *
 * Highlighting entries are looked up by document id. Nothing is done when the response has no
 * highlighting section.
 *
 * @param object $response containing results.
 */
public function add_highlight_content($response) {
    if (isset($response->highlighting)) {
        $highlighting = $response->highlighting;
        foreach ($response->response->docs as $doc) {
            // The document id is the key shared by the docs and the highlighting data.
            $docid = $doc->id;
            $this->merge_highlight_field_values($doc, $highlighting->$docid);
        }
    }
}
236
/**
 * Replaces the highlightable field values of a doc with their highlighted versions.
 *
 * Only non-empty fields of the doc are considered, and a field is only replaced when the
 * highlighted data has a non-empty value for it (the first returned value is used).
 *
 * @throws \core_search\engine_exception If a highlightable field of the doc is multivalued.
 * @param object $doc containing the results.
 * @param object $highlighteddoc containing the highlighted results values.
 */
public function merge_highlight_field_values($doc, $highlighteddoc) {

    foreach ($this->highlightfields as $fieldname) {
        if (empty($doc->$fieldname)) {
            // Nothing to highlight in this field.
            continue;
        }

        // Multivalued solr fields can not be handled here, so we refuse to continue.
        if (is_array($doc->{$fieldname})) {
            throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $fieldname);
        }

        if (empty($highlighteddoc->$fieldname)) {
            // No highlighted version available, keep the original value.
            continue;
        }

        // Replace by the highlighted result.
        $doc->$fieldname = reset($highlighteddoc->$fieldname);
    }
}
261
/**
 * Filters a Solr response on Moodle side and converts the accessible results into documents.
 *
 * Results owned by another user, results of unknown search areas and results the search area denies
 * access to are dropped. Results the search area reports as deleted are also removed from the index.
 *
 * @param object $queryresponse containing the response return from solr server.
 * @return array $results containing final results to be displayed.
 */
public function query_response($queryresponse) {
    global $USER;

    $currentuserid = $USER->id;
    $noowner = \core_search\manager::NO_OWNER_ID;
    $maxresults = \core_search\manager::MAX_RESULTS;

    $response = $queryresponse->getResponse();

    $docs = $response->response->docs;
    if (!$docs) {
        return array();
    }

    if (empty($response->response->numFound)) {
        return $docs;
    }

    $this->add_highlight_content($response);

    // Iterate through the results checking its availability and whether they are available for the user or not.
    $granted = 0;
    foreach ($docs as $key => $solrdoc) {
        $owner = $solrdoc['owneruserid'];
        if ($owner != $noowner && $owner != $currentuserid) {
            // If owneruserid is set, no other user should be able to access this record.
            unset($docs[$key]);
            continue;
        }

        $searcharea = $this->get_search_area($solrdoc->areaid);
        if (!$searcharea) {
            unset($docs[$key]);
            continue;
        }

        $docdata = $this->standarize_solr_obj($solrdoc);

        $access = $searcharea->check_access($docdata['itemid']);
        if ($access == \core_search\manager::ACCESS_DELETED) {
            // Gone from Moodle: clean the index and drop the result.
            $this->delete_by_id($docdata['id']);
            unset($docs[$key]);
        } else if ($access == \core_search\manager::ACCESS_DENIED) {
            unset($docs[$key]);
        } else if ($access == \core_search\manager::ACCESS_GRANTED) {
            $granted++;

            // Add the doc.
            $docs[$key] = $this->to_document($searcharea, $docdata);
        }

        // This should never happen.
        if ($granted >= $maxresults) {
            $docs = array_slice($docs, 0, $maxresults, true);
            break;
        }
    }

    return $docs;
}
326
cd894f84
EM
/**
 * Processes grouped file results into documents, with attached matching files.
 *
 * Each group of the response is keyed by the id of its main document. The access check is done on the
 * first doc of the group. When the main document itself is not part of the group (only files matched),
 * it is retrieved afterwards through get_missing_docs(). Results keep the order of the groups.
 *
 * @param SolrQueryResponse $queryresponse The response returned from solr server
 * @return array Final results to be displayed.
 */
protected function grouped_files_query_response($queryresponse) {
    $response = $queryresponse->getResponse();

    // If we can't find the grouping, or there are no matches in the grouping, return empty.
    if (!isset($response->grouped->solr_filegroupingid) || empty($response->grouped->solr_filegroupingid->matches)) {
        return array();
    }

    $numgranted = 0;
    $orderedids = array();
    $completedocs = array();
    $incompletedocs = array();

    // The highlighting section may be missing from the response, same as in add_highlight_content().
    $highlightingobj = isset($response->highlighting) ? $response->highlighting : null;

    // Each group represents a "master document".
    $groups = $response->grouped->solr_filegroupingid->groups;
    foreach ($groups as $group) {
        $groupid = $group->groupValue;
        $groupdocs = $group->doclist->docs;
        if (empty($groupdocs)) {
            // A group without docs gives us nothing to check access against or to display.
            continue;
        }
        $firstdoc = reset($groupdocs);

        if (!$searcharea = $this->get_search_area($firstdoc->areaid)) {
            // Well, this is a problem.
            continue;
        }

        // Check for access.
        $access = $searcharea->check_access($firstdoc->itemid);
        if ($access == \core_search\manager::ACCESS_DELETED) {
            // If deleted from Moodle, delete from index and then continue.
            $this->delete_by_id($firstdoc->id);
            continue;
        }
        if ($access == \core_search\manager::ACCESS_DENIED) {
            // This means we should just skip for the current user.
            continue;
        }
        $numgranted++;

        $maindoc = false;
        $fileids = array();
        // Separate the main document and any files returned.
        foreach ($groupdocs as $groupdoc) {
            if ($groupdoc->id == $groupid) {
                $maindoc = $groupdoc;
            } else if (isset($groupdoc->solr_fileid)) {
                $fileids[] = $groupdoc->solr_fileid;
            }
        }

        // Store the id of this group, in order, for later merging.
        $orderedids[] = $groupid;

        if (!$maindoc) {
            // We don't have the main doc, store what we know for later building.
            $incompletedocs[$groupid] = $fileids;
        } else {
            if (isset($highlightingobj->$groupid)) {
                // Merge the highlighting for this doc.
                $this->merge_highlight_field_values($maindoc, $highlightingobj->$groupid);
            }
            $docdata = $this->standarize_solr_obj($maindoc);
            $doc = $this->to_document($searcharea, $docdata);
            // Now we need to attach the result files to the doc.
            foreach ($fileids as $fileid) {
                $doc->add_stored_file($fileid);
            }
            $completedocs[$groupid] = $doc;
        }

        if ($numgranted >= \core_search\manager::MAX_RESULTS) {
            // We have hit the max results, we will just ignore the rest.
            break;
        }
    }

    $incompletedocs = $this->get_missing_docs($incompletedocs);

    $out = array();
    // Now merge the complete and incomplete documents, in results order.
    foreach ($orderedids as $docid) {
        if (isset($completedocs[$docid])) {
            $out[] = $completedocs[$docid];
        } else if (isset($incompletedocs[$docid])) {
            $out[] = $incompletedocs[$docid];
        }
    }

    return $out;
}
426
/**
 * Retrieve any missing main documents and attach provided files.
 *
 * The missingdocs array should be an array, indexed by document id, of main documents we need to retrieve. The value
 * associated to the key should be an array of stored_files or stored file ids to attach to the result document.
 *
 * The returned array is also indexed by document id. An empty array is returned when there is nothing
 * to retrieve or when the query to Solr fails.
 *
 * @param array() $missingdocs An array, indexed by document id, with arrays of files/ids to attach.
 * @return document[]
 */
protected function get_missing_docs($missingdocs) {
    $found = array();
    if (empty($missingdocs)) {
        return $found;
    }

    $wantedids = array_keys($missingdocs);

    // A custom query matching everything, restricted to the ids we are looking for.
    $query = new \SolrQuery();
    $this->set_query($query, '*', count($wantedids));
    $this->add_fields($query);
    $query->addFilterQuery('{!cache=false}id:(' . implode(' OR ', $wantedids) . ')');

    try {
        $client = $this->get_search_client();
        $results = $this->query_response($client->query($query));
    } catch (\SolrClientException $ex) {
        return array();
    } catch (\SolrServerException $ex) {
        return array();
    }

    foreach ($results as $result) {
        $resultid = $result->get('id');
        if (isset($missingdocs[$resultid])) {
            // Attach the files.
            foreach ($missingdocs[$resultid] as $filedoc) {
                $result->add_stored_file($filedoc);
            }
            $found[$resultid] = $result;
        }
        // Otherwise we got a result we didn't expect, so it is skipped.
    }

    return $found;
}
475
95c6aeaf
DM
/**
 * Converts a \SolrObject instance into a plain php array.
 *
 * @param \SolrObject $obj
 * @return array The object properties, indexed by their trimmed names.
 */
public function standarize_solr_obj(\SolrObject $obj) {
    $docdata = array();
    foreach ($obj->getPropertyNames() as $rawname) {
        // Names are trimmed first, see http://php.net/manual/en/solrobject.getpropertynames.php#98018.
        $key = trim($rawname);
        $docdata[$key] = $obj->offsetGet($key);
    }
    return $docdata;
}
493
/**
 * Adds a document to the search engine.
 *
 * This does not commit to the search engine.
 *
 * @param document $document
 * @param bool $fileindexing True if file indexing is to be used
 * @return bool False if the document could not be added, true otherwise.
 */
public function add_document($document, $fileindexing = false) {
    $docdata = $document->export_for_engine();

    if ($this->add_solr_document($docdata)) {
        if ($fileindexing) {
            // This will take care of updating all attached files in the index.
            $this->process_document_files($document);
        }
        return true;
    }

    return false;
}
95c6aeaf 517
091973db
EM
/**
 * Sends a document, given as an array of field values, to Solr.
 *
 * Client and server errors are reported through debugging() and make the method return false.
 *
 * @param array $doc Field values indexed by field name.
 * @return bool True if the document was accepted, false on error.
 */
protected function add_solr_document($doc) {
    $solrdoc = new \SolrInputDocument();
    foreach ($doc as $fieldname => $fieldvalue) {
        $solrdoc->addField($fieldname, $fieldvalue);
    }

    $added = false;
    try {
        $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN);
        $added = true;
    } catch (\SolrClientException $e) {
        debugging('Solr client error adding document with id ' . $doc['id'] . ': ' . $e->getMessage(), DEBUG_DEVELOPER);
    } catch (\SolrServerException $e) {
        // We only use the first line of the message, as it's a fully java stacktrace behind it.
        $msg = strtok($e->getMessage(), "\n");
        debugging('Solr server error adding document with id ' . $doc['id'] . ': ' . $msg, DEBUG_DEVELOPER);
    }

    return $added;
}
543
cd894f84
EM
/**
 * Index files attached to the document, ensuring the index matches the current document files.
 *
 * For documents that aren't known to be new, we check the index for existing files.
 * - New files we will add.
 * - Existing and unchanged files we will skip.
 * - File that are in the index but not on the document will be deleted from the index.
 * - Files that have changed will be re-indexed.
 *
 * Nothing is done when file indexing is not enabled.
 *
 * @param document $document
 */
protected function process_document_files($document) {
    if (!$this->file_indexing_enabled()) {
        return;
    }

    // Maximum rows to process at a time.
    $batchsize = 500;

    // The files currently attached. Whatever is left in here at the end gets (re)indexed.
    $files = $document->get_files();

    // If this isn't a new document, we need to check the existing indexed files.
    if (!$document->get_is_new()) {
        $offset = 0;
        $idstodelete = array();

        // We do this progressively, so we can handle lots of files cleanly.
        list($total, $batch) = $this->get_indexed_files($document, 0, $batchsize);

        do {
            foreach ($batch as $indexedfile) {
                $fileid = $indexedfile->solr_fileid;

                if (!isset($files[$fileid])) {
                    // This means we have found a file that is no longer attached, so we need to delete from the index.
                    // We do it later, since this is progressive, and it could reorder results.
                    $idstodelete[] = $indexedfile->id;
                    continue;
                }

                $file = $files[$fileid];

                // Check for changes that would mean we need to re-index the file.
                // Filelib does not guarantee time modified is updated, so we will check important values.
                // The last check covers files that filtering blocked the last time they were indexed
                // but that the current settings consider indexable.
                $needsreindex = $indexedfile->modified < $file->get_timemodified()
                        || strcmp($indexedfile->title, $file->get_filename()) !== 0
                        || $indexedfile->solr_filecontenthash != $file->get_contenthash()
                        || ($indexedfile->solr_fileindexedcontent == document::INDEXED_FILE_FALSE &&
                            $this->file_is_indexable($file));

                if (!$needsreindex) {
                    // If the file is already indexed, we can just remove it from the files array and skip it.
                    unset($files[$fileid]);
                }
            }
            $offset += $batchsize;

            $more = $offset < $total;
            if ($more) {
                // If we haven't hit the total count yet, fetch the next batch. The total is refreshed too.
                list($total, $batch) = $this->get_indexed_files($document, $offset, $batchsize);
                $more = $offset < $total;
            }
        } while ($more);

        // Delete files that are no longer attached.
        foreach ($idstodelete as $id) {
            // We directly delete the item using the client, as the engine delete_by_id won't work on file docs.
            $this->get_search_client()->deleteById($id);
        }
    }

    // Now we can actually index all the remaining files.
    foreach ($files as $file) {
        $this->add_stored_file($document, $file);
    }
}
626
/**
 * Get the currently indexed files for a particular document, returns the total count, and a subset of files.
 *
 * On a query error the error message is stored in $this->queryerror and an empty result is returned.
 *
 * @param document $document
 * @param int $start The row to start the results on. Zero indexed.
 * @param int $rows The number of rows to fetch
 * @return array A two element array, the first is the total number of available results, the second is an array
 *               of documents for the current request.
 */
protected function get_indexed_files($document, $start = 0, $rows = 500) {
    $nothing = array(0, array());

    // For efficiency, we are building our own, stripped down, query that will get
    // all file records tied to the document through solr_filegroupingid.
    $query = new \SolrQuery();
    $query->setQuery('*');
    $query->setRows($rows);
    $query->setStart($start);
    // We want a consistent sorting.
    $query->addSortField('id');

    // We only want the bare minimum of fields.
    $wantedfields = array('id', 'modified', 'title', 'solr_fileid', 'solr_filecontenthash', 'solr_fileindexedcontent');
    foreach ($wantedfields as $wantedfield) {
        $query->addField($wantedfield);
    }

    $query->addFilterQuery('{!cache=false}solr_filegroupingid:(' . $document->get('id') . ')');
    $query->addFilterQuery('type:' . \core_search\manager::TYPE_FILE);

    try {
        $responsedoc = $this->get_search_client()->query($query)->getResponse();

        if (empty($responsedoc->response->numFound)) {
            return $nothing;
        }

        return array($responsedoc->response->numFound, $this->convert_file_results($responsedoc));
    } catch (\SolrClientException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return $nothing;
    } catch (\SolrServerException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return $nothing;
    }
}
679
/**
 * A very lightweight handler for getting information about already indexed files from a Solr response.
 *
 * @param SolrObject $responsedoc A Solr response document
 * @return stdClass[] An array of objects that contain the basic information for file processing.
 */
protected function convert_file_results($responsedoc) {
    $converted = array();

    $docs = $responsedoc->response->docs;
    if (!$docs) {
        return $converted;
    }

    foreach ($docs as $doc) {
        // Copy the bare minimum needed info. The modified time is converted from the engine format.
        $converted[] = (object) array(
            'id' => $doc->id,
            'modified' => document::import_time_from_engine($doc->modified),
            'title' => $doc->title,
            'solr_fileid' => $doc->solr_fileid,
            'solr_filecontenthash' => $doc->solr_filecontenthash,
            'solr_fileindexedcontent' => $doc->solr_fileindexedcontent,
        );
    }

    return $converted;
}
707
/**
 * Adds a file to the search engine.
 *
 * Notes about Solr and Tika indexing. We do not send the mime type, only the filename.
 * Tika has much better content type detection than Moodle, and we will have many more doc failures
 * if we try to send mime types.
 *
 * Files that are not considered indexable, and files whose extraction request fails, are still added
 * to the index as a plain document, flagged through the solr_fileindexedcontent field.
 *
 * @param document $document
 * @param \stored_file $storedfile
 * @return void
 */
protected function add_stored_file($document, $storedfile) {
    $filedoc = $document->export_file_for_engine($storedfile);

    if (!$this->file_is_indexable($storedfile)) {
        // For files that we don't consider indexable, we will still place a reference in the search engine.
        $filedoc['solr_fileindexedcontent'] = document::INDEXED_FILE_FALSE;
        $this->add_solr_document($filedoc);
        return;
    }

    $curl = $this->get_curl_object();

    $url = $this->get_connection_url('/update/extract');

    // This will prevent solr from automatically making fields for every tika output.
    $url->param('uprefix', 'ignored_');

    // These are common fields that matches the standard *_point dynamic field and causes an error.
    $url->param('fmap.media_white_point', 'ignored_mwp');
    $url->param('fmap.media_black_point', 'ignored_mbp');

    // Copy each key to the url with literal.
    // We place in a temp name then copy back to the true field, which prevents errors or Tika overwriting common field names.
    foreach ($filedoc as $fieldname => $fieldvalue) {
        // This will take any fields from tika that match our schema and discard them, so they don't overwrite ours.
        $url->param('fmap.'.$fieldname, 'ignored_'.$fieldname);
        // Place data in a tmp field.
        $url->param('literal.mdltmp_'.$fieldname, $fieldvalue);
        // Then move to the final field.
        $url->param('fmap.mdltmp_'.$fieldname, $fieldname);
    }

    // This sets the true filename for Tika.
    $url->param('resource.name', $storedfile->get_filename());

    // A giant block of code that is really just error checking around the curl request.
    try {
        // Now actually do the request.
        $result = $curl->post($url->out(false), array('myfile' => $storedfile));

        $code = $curl->get_errno();
        $info = $curl->get_info();

        // Now error handling. It is just informational, since we aren't tracking per file/doc results.
        if ($code != 0) {
            // This means an internal cURL error occurred error is in result.
            $message = 'Curl error '.$code.' while indexing file with document id '.$filedoc['id'].': '.$result.'.';
            debugging($message, DEBUG_DEVELOPER);
        } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) {
            // Unexpected HTTP response code.
            $message = 'Error while indexing file with document id '.$filedoc['id'];
            // Try to get error message out of msg or title if it exists.
            if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) {
                $message .= ': '.$matches[1];
            } else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) {
                $message .= ': '.$matches[1];
            }
            // This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter.
            if (CLI_SCRIPT && !PHPUNIT_TEST) {
                mtrace($message);
            }
        } else if (!preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $result, $matches)) {
            // The expected status field is missing: we received an unprocessable response.
            $message = 'Unexpected Solr response while indexing file with document id '.$filedoc['id'].': ';
            $message .= strtok($result, "\n");
            debugging($message, DEBUG_DEVELOPER);
        } else if ((int)$matches[1] !== 0) {
            // The expected status is 0, anything else is an error.
            $message = 'Unexpected Solr status code '.(int)$matches[1];
            $message .= ' while indexing file with document id '.$filedoc['id'].'.';
            debugging($message, DEBUG_DEVELOPER);
        } else {
            // The document was successfully indexed.
            return;
        }
    } catch (\Exception $e) {
        // There was an error, but we are not tracking per-file success, so we just continue on.
        debugging('Unknown exception while indexing file "'.$storedfile->get_filename().'".', DEBUG_DEVELOPER);
    }

    // If we get here, the document was not indexed due to an error. So we will index just the base info without the file.
    $filedoc['solr_fileindexedcontent'] = document::INDEXED_FILE_ERROR;
    $this->add_solr_document($filedoc);
}
808
/**
 * Checks to see if a passed file is indexable.
 *
 * A file is rejected when it is bigger than the maxindexfilekb setting (if set), or when it is a
 * Moodle backup file.
 *
 * @param \stored_file $file The file to check
 * @return bool True if the file can be indexed
 */
protected function file_is_indexable($file) {
    $toobig = !empty($this->config->maxindexfilekb) &&
            ($file->get_filesize() > ($this->config->maxindexfilekb * 1024));
    if ($toobig) {
        // The file is too big to index.
        return false;
    }

    // We don't index Moodle backup files. There is nothing usefully indexable in them.
    return $file->get_mimetype() != 'application/vnd.moodle.backup';
}
830
95c6aeaf
DM
/**
 * Asks the Solr client to commit all pending changes.
 *
 * @return void
 */
protected function commit() {
    $client = $this->get_search_client();
    $client->commit();
}
839
075fa912
EM
/**
 * Do any area cleanup needed, and do anything to confirm contents.
 *
 * Return false to prevent the search area completed time and stats from being updated.
 * This implementation commits the pending changes and always reports the data as indexed.
 *
 * @param \core_search\area\base $searcharea The search area that was complete
 * @param int $numdocs The number of documents that were added to the index
 * @param bool $fullindex True if a full index is being performed
 * @return bool True means that data is considered indexed
 */
public function area_index_complete($searcharea, $numdocs = 0, $fullindex = false) {
    // Make everything that was added for this area visible in the index.
    $this->commit();

    $indexed = true;
    return $indexed;
}
855
cd894f84
EM
/**
 * Return true if file indexing is supported and enabled. False otherwise.
 *
 * A missing fileindexing setting is treated as disabled.
 *
 * @return bool
 */
public function file_indexing_enabled() {
    // !empty() gives the same result as a bool cast, without a notice when the setting is not defined.
    return !empty($this->config->fileindexing);
}
864
95c6aeaf
DM
/**
 * Defragments the index by calling optimize on the Solr client.
 *
 * @return void
 */
public function optimize() {
    $client = $this->get_search_client();
    $client->optimize(1, true, false);
}
873
/**
 * Deletes the specified document, along with its related files, and commits the change.
 *
 * @param string $id The document id to delete
 * @return void
 */
public function delete_by_id($id) {
    // We need to make sure we delete the item and all related files, which can be done with solr_filegroupingid.
    $filter = 'solr_filegroupingid:' . $id;
    $this->get_search_client()->deleteByQuery($filter);
    $this->commit();
}
885
/**
 * Deletes all the documents of an area, or the whole index contents if no area is specified.
 *
 * The deletion is committed straight away.
 *
 * @param string $areaid
 * @return void
 */
public function delete($areaid = null) {
    // Without an area, everything in the index matches.
    $filter = $areaid ? 'areaid:' . $areaid : '*:*';
    $this->get_search_client()->deleteByQuery($filter);
    $this->commit();
}
900
/**
 * Pings the Solr server using search_solr config
 *
 * The checks are, in order: the hostname and index name settings are present, a client can be created,
 * the server answers a ping, and the schema setup validates.
 *
 * @return true|string Returns true if all good or an error string.
 */
public function is_server_ready() {

    $missingconfig = empty($this->config->server_hostname) || empty($this->config->indexname);
    if ($missingconfig) {
        return 'No solr configuration found';
    }

    $this->client = $this->get_search_client(false);
    if (!$this->client) {
        return get_string('engineserverstatus', 'search');
    }

    try {
        @$this->client->ping();
    } catch (\SolrClientException $clienterror) {
        return 'Solr client error: ' . $clienterror->getMessage();
    } catch (\SolrServerException $servererror) {
        return 'Solr server error: ' . $servererror->getMessage();
    }

    // Check that setup schema has already run.
    try {
        $schema = new \search_solr\schema();
        $schema->validate_setup();
    } catch (\moodle_exception $schemaerror) {
        return $schemaerror->getMessage();
    }

    return true;
}
934
/**
 * Tells whether the PHP Solr extension is available.
 *
 * The check relies on the solr_get_version() function being defined.
 *
 * @return bool
 */
public function is_installed() {
    $extensionloaded = function_exists('solr_get_version');

    return $extensionloaded;
}
943
/**
 * Returns the solr client instance, creating it on first use.
 *
 * The client is built from the engine config and cached in $this->client. If the
 * client can not be created the failure is cached as false, so the null check
 * below keeps working as "not created yet".
 *
 * @throws \core_search\engine_exception If the client is not available and $triggerexception is true.
 * @param bool $triggerexception Whether to throw an exception when the client is not available.
 * @return \SolrClient|false The client, or false if it is not available and $triggerexception is false.
 */
protected function get_search_client($triggerexception = true) {

    // Type comparison as it is set to false if not available.
    if ($this->client === null) {
        $options = array(
            'hostname' => $this->config->server_hostname,
            'path' => '/solr/' . $this->config->indexname,
            'login' => !empty($this->config->server_username) ? $this->config->server_username : '',
            'password' => !empty($this->config->server_password) ? $this->config->server_password : '',
            'port' => !empty($this->config->server_port) ? $this->config->server_port : '',
            'secure' => !empty($this->config->secure) ? true : false,
            'ssl_cert' => !empty($this->config->ssl_cert) ? $this->config->ssl_cert : '',
            'ssl_key' => !empty($this->config->ssl_key) ? $this->config->ssl_key : '',
            'ssl_keypassword' => !empty($this->config->ssl_keypassword) ? $this->config->ssl_keypassword : '',
            'ssl_cainfo' => !empty($this->config->ssl_cainfo) ? $this->config->ssl_cainfo : '',
            'ssl_capath' => !empty($this->config->ssl_capath) ? $this->config->ssl_capath : '',
            'timeout' => !empty($this->config->server_timeout) ? $this->config->server_timeout : '30'
        );

        try {
            $this->client = new \SolrClient($options);
        } catch (\SolrIllegalArgumentException $ex) {
            // The constructor signals failure with an exception, "new" never evaluates to false.
            // Cache the failure so it is reported consistently through the check below.
            $this->client = false;
        }
    }

    if ($this->client === false && $triggerexception) {
        throw new \core_search\engine_exception('engineserverstatus', 'search');
    }

    return $this->client;
}
5dc4624c
EM
981
/**
 * Returns a curl object for connecting to solr.
 *
 * The object is created once and cached in $this->curl. When the 'secure' setting
 * is enabled the configured SSL settings are applied as curl options, and a basic
 * authorization header is added if both a username and a password are configured.
 *
 * @return \curl
 */
public function get_curl_object() {
    if ($this->curl !== null) {
        return $this->curl;
    }

    $this->curl = new \curl();
    $config = $this->config;

    // Build the SSL options. Based on pecl-solr and general testing.
    $options = array();
    if (!empty($config->secure)) {
        // The certificate and the key are both declared as PEM.
        if (!empty($config->ssl_cert)) {
            $options['CURLOPT_SSLCERT'] = $config->ssl_cert;
            $options['CURLOPT_SSLCERTTYPE'] = 'PEM';
        }
        if (!empty($config->ssl_key)) {
            $options['CURLOPT_SSLKEY'] = $config->ssl_key;
            $options['CURLOPT_SSLKEYTYPE'] = 'PEM';
        }

        // The remaining settings map one to one onto a curl option.
        $directoptions = array(
            'ssl_keypassword' => 'CURLOPT_KEYPASSWD',
            'ssl_cainfo' => 'CURLOPT_CAINFO',
            'ssl_capath' => 'CURLOPT_CAPATH',
        );
        foreach ($directoptions as $setting => $curloption) {
            if (!empty($config->$setting)) {
                $options[$curloption] = $config->$setting;
            }
        }
    }

    $this->curl->setopt($options);

    // Credentials are only sent when both parts are configured.
    $hascredentials = !empty($config->server_username) && !empty($config->server_password);
    if ($hascredentials) {
        $encoded = base64_encode($config->server_username . ':' . $config->server_password);
        $this->curl->setHeader('Authorization', 'Basic ' . $encoded);
    }

    return $this->curl;
}
1029
/**
 * Builds the Moodle url object for a path on the configured solr index.
 *
 * The url is made of the protocol (https when the 'secure' setting is enabled, http
 * otherwise), the configured hostname, the optional port, '/solr/', the index name
 * and the requested path.
 *
 * @param string $path The solr path to append.
 * @return \moodle_url
 */
public function get_connection_url($path) {
    // Must use the proper protocol, or SSL will fail.
    $scheme = empty($this->config->secure) ? 'http' : 'https';

    // Trailing slashes in the hostname and leading slashes in the path are dropped to avoid doubled separators.
    $host = rtrim($this->config->server_hostname, '/');
    $port = empty($this->config->server_port) ? '' : ':' . $this->config->server_port;
    $location = '/solr/' . $this->config->indexname . '/' . ltrim($path, '/');

    return new \moodle_url($scheme . '://' . $host . $port . $location);
}
95c6aeaf 1047}