MDL-57161 search: Set the correct HTTP header
[moodle.git] / search / engine / solr / classes / engine.php
CommitLineData
95c6aeaf
DM
1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
16
17/**
18 * Solr engine.
19 *
20 * @package search_solr
21 * @copyright 2015 Daniel Neis Araujo
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23 */
24
25namespace search_solr;
26
27defined('MOODLE_INTERNAL') || die();
28
29/**
30 * Solr engine.
31 *
32 * @package search_solr
33 * @copyright 2015 Daniel Neis Araujo
34 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
35 */
36class engine extends \core_search\engine {
37
38 /**
39 * @var string The date format used by solr.
40 */
41 const DATE_FORMAT = 'Y-m-d\TH:i:s\Z';
42
43 /**
44 * @var int Commit documents interval (number of milliseconds).
45 */
46 const AUTOCOMMIT_WITHIN = 15000;
47
053118a1
EM
48 /**
49 * The maximum number of results to fetch at a time.
50 */
51 const QUERY_SIZE = 120;
52
95c6aeaf 53 /**
4894840d 54 * Highlighting fragsize. Slightly larger than output size (500) to allow for ... appending.
95c6aeaf 55 */
4894840d
EM
56 const FRAG_SIZE = 510;
57
58 /**
59 * Marker for the start of a highlight.
60 */
61 const HIGHLIGHT_START = '@@HI_S@@';
62
63 /**
64 * Marker for the end of a highlight.
65 */
66 const HIGHLIGHT_END = '@@HI_E@@';
95c6aeaf
DM
67
68 /**
69 * @var \SolrClient
70 */
71 protected $client = null;
72
7a4a0bc8
EM
73 /**
74 * @var bool True if we should reuse SolrClients, false if not.
75 */
76 protected $cacheclient = true;
77
5dc4624c
EM
78 /**
79 * @var \curl Direct curl object.
80 */
81 protected $curl = null;
82
95c6aeaf
DM
83 /**
84 * @var array Fields that can be highlighted.
85 */
4894840d 86 protected $highlightfields = array('title', 'content', 'description1', 'description2');
95c6aeaf 87
053118a1
EM
88 /**
89 * @var int Number of total docs reported by Solr for the last query.
90 */
91 protected $totalenginedocs = 0;
92
93 /**
94 * @var int Number of docs we have processed for the last query.
95 */
96 protected $processeddocs = 0;
97
98 /**
99 * @var int Number of docs that have been skipped while processing the last query.
100 */
101 protected $skippeddocs = 0;
102
7a4a0bc8
EM
/**
 * Sets up the engine and decides whether Solr client objects may be reused.
 *
 * @return void
 */
public function __construct() {
    parent::__construct();

    // Client reuse is known to misbehave with curl 7.35.0, so caching is switched off for 7.35.x.
    $curlinfo = curl_version();
    $hasversion = isset($curlinfo['version']);
    if ($hasversion && stripos($curlinfo['version'], '7.35.') === 0) {
        $this->cacheclient = false;
    }
}
117
95c6aeaf
DM
/**
 * Prepares a Solr query, applies filters and executes it returning its results.
 *
 * Results are pulled from Solr in batches: a small first batch, then batches of QUERY_SIZE,
 * until either $limit documents have been accepted or Solr has no more documents to offer.
 * The per-query counters used by get_query_total_count() are reset on every call.
 *
 * @throws \core_search\engine_exception
 * @param stdClass $filters Containing query and filters.
 * @param array $usercontexts Contexts where the user has access. True if the user can access all contexts.
 * @param int $limit The maximum number of results to return.
 * @return \core_search\document[] Results, or an empty array if there are none.
 */
public function execute_query($filters, $usercontexts, $limit = 0) {
    if (empty($limit)) {
        $limit = \core_search\manager::MAX_RESULTS;
    }

    // If there is any problem we trigger the exception as soon as possible.
    $this->get_search_client();

    // Reset the counters up front so that every early return leaves them describing this query.
    $this->totalenginedocs = 0;
    $this->processeddocs = 0;
    $this->skippeddocs = 0;

    // Create the query object.
    $query = $this->create_user_query($filters, $usercontexts);
    if (empty($query)) {
        // create_user_query() hands back an empty value when the user has no usable contexts;
        // there is nothing to send to Solr in that case.
        return array();
    }

    // We expect good match rates, so for our first get, we will get a small number of records.
    // This significantly speeds solr response time for first few pages.
    $query->setRows(min($limit * 3, static::QUERY_SIZE));
    $response = $this->get_query_response($query);

    // Get count data out of the response.
    list($included, $found) = $this->get_response_counts($response);
    $this->totalenginedocs = $found;
    if ($included == 0 || $this->totalenginedocs == 0) {
        // No results.
        return array();
    }

    // Get valid documents out of the response.
    $results = $this->process_response($response, $limit);

    // We have processed all the docs in the response at this point.
    $this->processeddocs += $included;

    // If we haven't reached the limit, and there are more docs left in Solr, lets keep trying.
    while (count($results) < $limit && ($this->totalenginedocs - $this->processeddocs) > 0) {
        // Offset the start of the query, and since we are making another call, get more per call.
        $query->setStart($this->processeddocs);
        $query->setRows(static::QUERY_SIZE);

        $response = $this->get_query_response($query);
        list($included, $found) = $this->get_response_counts($response);
        if ($included == 0 || $found == 0) {
            // No new results were found. Found being empty would be weird, so we will just return.
            return $results;
        }
        $this->totalenginedocs = $found;

        // Get the new response docs, limiting to remaining we need, then add it to the end of the results array.
        $newdocs = $this->process_response($response, $limit - count($results));
        $results = array_merge($results, $newdocs);

        // Add to our processed docs count.
        $this->processeddocs += $included;
    }

    return $results;
}
185
/**
 * Runs a query against Solr and hands back the decoded response.
 *
 * Client and server exceptions are not propagated: the message is reported through
 * debugging(), stored in $this->queryerror, and false is returned instead.
 *
 * @param SolrQuery $query Solr query object.
 * @return SolrObject|false Response document or false on error.
 */
protected function get_query_response($query) {
    $failure = null;

    try {
        $queryresponse = $this->get_search_client()->query($query);
        return $queryresponse->getResponse();
    } catch (\SolrClientException $clienterror) {
        $failure = $clienterror->getMessage();
    } catch (\SolrServerException $servererror) {
        $failure = $servererror->getMessage();
    }

    // Only reached when one of the catch blocks above ran.
    debugging('Error executing the provided query: ' . $failure, DEBUG_DEVELOPER);
    $this->queryerror = $failure;

    return false;
}
205
/**
 * Returns how many documents are available for the most recent call to execute_query.
 *
 * @return int
 */
public function get_query_total_count() {
    // Documents that were skipped while processing are not counted as available.
    $available = $this->totalenginedocs - $this->skippeddocs;

    return $available;
}
215
/**
 * Extracts count information from a response. Invalid or empty responses yield 0, 0.
 *
 * @param SolrDocument $response The response document from Solr.
 * @return array A two part array. First how many response docs are in the response.
 *               Second, how many results are available in the engine.
 */
protected function get_response_counts($response) {
    if (isset($response->grouped->solr_filegroupingid->ngroups)) {
        // File grouped query: count groups rather than individual docs.
        $grouping = $response->grouped->solr_filegroupingid;
        $found = $grouping->ngroups;
        return array(count($grouping->groups), $found);
    }

    if (isset($response->response->numFound)) {
        // Standard query.
        $body = $response->response;
        $found = $body->numFound;
        return array(count($body->docs), $found);
    }

    return array(0, 0);
}
239
/**
 * Builds the query object for a user search, with all filters and grouping applied.
 *
 * @param stdClass $filters Containing query and filters.
 * @param array $usercontexts Contexts where the user has access. True if the user can access all contexts.
 * @return SolrDisMaxQuery|array The prepared query, or an empty array when none of the user's
 *                               contexts apply (meaning the search cannot return anything).
 */
protected function create_user_query($filters, $usercontexts) {
    global $USER;

    // Work on a copy so the caller's filters object is left untouched.
    $data = clone $filters;

    $query = new \SolrDisMaxQuery();
    $this->set_query($query, $data->q);
    $this->add_fields($query);

    // The filters the user typed in are not cached: they are throwaway and would only pollute
    // the filter cache. The contexts filter further down is the one worth caching.
    if (!empty($data->title)) {
        $query->addFilterQuery('{!field cache=false f=title}' . $data->title);
    }
    if (!empty($data->areaids)) {
        // Any of the requested areas is a match.
        $areaclause = implode(' OR ', $data->areaids);
        $query->addFilterQuery('{!cache=false}areaid:(' . $areaclause . ')');
    }
    if (!empty($data->courseids)) {
        $courseclause = implode(' OR ', $data->courseids);
        $query->addFilterQuery('{!cache=false}courseid:(' . $courseclause . ')');
    }

    $hasstart = !empty($data->timestart);
    $hasend = !empty($data->timeend);
    if ($hasstart || $hasend) {
        // An open end of the range is expressed with a wildcard.
        if ($hasstart) {
            $data->timestart = \search_solr\document::format_time_for_engine($data->timestart);
        } else {
            $data->timestart = '*';
        }
        if ($hasend) {
            $data->timeend = \search_solr\document::format_time_for_engine($data->timeend);
        } else {
            $data->timeend = '*';
        }

        // No cache.
        $query->addFilterQuery('{!cache=false}modified:[' . $data->timestart . ' TO ' . $data->timeend . ']');
    }

    // Only documents without an owner, or owned by the current user, may be returned.
    $query->addFilterQuery('owneruserid:(' . \core_search\manager::NO_OWNER_ID . ' OR ' . $USER->id . ')');

    // Finally limit to the contexts the user can access; this filter is left cacheable.
    // When $usercontexts is simply true the user can access everything and no filter is needed.
    if ($usercontexts && is_array($usercontexts)) {
        $restricttoareas = !empty($data->areaids);

        // Flatten the per-area context lists into one set, keyed by id to keep it unique.
        $allcontexts = array();
        foreach ($usercontexts as $areaid => $areacontexts) {
            if ($restricttoareas && !in_array($areaid, $data->areaids)) {
                // This area was not asked for.
                continue;
            }
            foreach ($areacontexts as $contextid) {
                $allcontexts[$contextid] = $contextid;
            }
        }

        if (empty($allcontexts)) {
            // No valid contexts for this user, so there can be no results.
            return array();
        }
        $query->addFilterQuery('contextid:(' . implode(' OR ', $allcontexts) . ')');
    }

    if (!$this->file_indexing_enabled()) {
        // Make sure we only get text files, in case the index has pre-existing files.
        $query->addFilterQuery('type:'.\core_search\manager::TYPE_TEXT);
        return $query;
    }

    // Group records by solr_filegroupingid, with at most 3 results per group.
    $query->setGroup(true);
    $query->setGroupLimit(3);
    $query->setGroupNGroups(true);
    $query->addGroupField('solr_filegroupingid');

    return $query;
}
326
/**
 * Configures a query object with the query string, a default row count and highlighting.
 *
 * @param SolrQuery $query
 * @param string $q The query string to search for.
 */
protected function set_query($query, $q) {
    $query->setQuery($q);

    // A reasonable max.
    $query->setRows(static::QUERY_SIZE);

    // Highlighting: which fields, how long each fragment is, and the markers wrapped around matches.
    $query->setHighlight(true);
    foreach ($this->highlightfields as $fieldname) {
        $query->addHighlightField($fieldname);
    }
    $query->setHighlightFragsize(static::FRAG_SIZE);
    $query->setHighlightSimplePre(self::HIGHLIGHT_START);
    $query->setHighlightSimplePost(self::HIGHLIGHT_END);
    $query->setHighlightMergeContiguous(true);
}
349
/**
 * Tells the query which fields to return and, for dismax queries, which fields to search.
 *
 * @param SolrDisMaxQuery|SolrQuery $query object.
 */
public function add_fields($query) {
    $documentclass = $this->get_document_classname();
    $definitions = $documentclass::get_default_fields_definition();

    $isdismax = ($query instanceof \SolrDisMaxQuery);

    foreach ($definitions as $fieldname => $definition) {
        $query->addField($fieldname);

        if (!$isdismax || empty($definition['mainquery'])) {
            continue;
        }
        // Fields flagged as mainquery are the ones the main query is run against.
        $query->addQueryField($fieldname);
    }
}
372
/**
 * Merges the highlighting section of a response into the matching response documents.
 *
 * Documents and highlighting entries are matched on the document id. A document that has no
 * highlighting entry is still passed through merge_highlight_field_values() (with an empty
 * highlighting object) so that its multivalued-field validation keeps running.
 *
 * @throws \core_search\engine_exception
 * @param object $response containing results.
 */
public function add_highlight_content($response) {
    if (!isset($response->highlighting)) {
        // There is no highlighting to add.
        return;
    }
    if (empty($response->response->docs)) {
        // There are no documents to add highlighting to.
        return;
    }

    $highlightedobject = $response->highlighting;
    foreach ($response->response->docs as $doc) {
        $docid = $doc->id;
        // Do not read a highlighting entry that is not there.
        if (isset($highlightedobject->$docid)) {
            $highlighteddoc = $highlightedobject->$docid;
        } else {
            $highlighteddoc = new \stdClass();
        }
        $this->merge_highlight_field_values($doc, $highlighteddoc);
    }
}
390
/**
 * Overwrites a document's highlightable fields with their highlighted versions, where present.
 *
 * @throws \core_search\engine_exception
 * @param object $doc containing the results.
 * @param object $highlighteddoc containing the highlighted results values.
 */
public function merge_highlight_field_values($doc, $highlighteddoc) {
    foreach ($this->highlightfields as $fieldname) {
        if (empty($doc->$fieldname)) {
            continue;
        }

        // Multivalued solr fields cannot be handled here, so an array value is treated as an error.
        if (is_array($doc->{$fieldname})) {
            throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $fieldname);
        }

        if (empty($highlighteddoc->$fieldname)) {
            continue;
        }

        // Use the first highlighted value in place of the plain one.
        $doc->$fieldname = reset($highlighteddoc->$fieldname);
    }
}
415
/**
 * Turns a Solr response into documents, applying the Moodle side access checks.
 *
 * Grouped responses are delegated to grouped_files_process_response().
 *
 * @param SolrObject $response Solr object containing the response return from solr server.
 * @param int $limit The maximum number of results to return. 0 for all.
 * @param bool $skipaccesscheck Don't use check_access() on results. Only to be used when results have known access.
 * @return array $results containing final results to be displayed.
 */
protected function process_response($response, $limit = 0, $skipaccesscheck = false) {
    global $USER;

    if (empty($response)) {
        return array();
    }
    if (isset($response->grouped)) {
        return $this->grouped_files_process_response($response, $limit);
    }

    $docs = $response->response->docs;
    if (!$docs) {
        return array();
    }
    if (empty($response->response->numFound)) {
        return array();
    }

    $userid = $USER->id;
    $noownerid = \core_search\manager::NO_OWNER_ID;

    $this->add_highlight_content($response);

    // Walk the docs, keeping only those that still exist and that this user may see.
    $out = array();
    foreach ($docs as $docdata) {
        $ownerid = $docdata['owneruserid'];
        if ($ownerid != $noownerid && $ownerid != $userid) {
            // An owned record is only ever visible to its owner.
            continue;
        }

        $searcharea = $this->get_search_area($docdata->areaid);
        if (!$searcharea) {
            continue;
        }

        $docdata = $this->standarize_solr_obj($docdata);

        if ($skipaccesscheck) {
            $access = \core_search\manager::ACCESS_GRANTED;
        } else {
            $access = $searcharea->check_access($docdata['itemid']);
        }

        if ($access == \core_search\manager::ACCESS_DELETED) {
            $this->delete_by_id($docdata['id']);
            // Remove one from our processed and total counters, since we promptly deleted.
            $this->processeddocs--;
            $this->totalenginedocs--;
        } else if ($access == \core_search\manager::ACCESS_DENIED) {
            $this->skippeddocs++;
        } else if ($access == \core_search\manager::ACCESS_GRANTED) {
            // Add the doc.
            $out[] = $this->to_document($searcharea, $docdata);
        }

        // Stop when we hit our limit.
        if (!empty($limit) && count($out) >= $limit) {
            break;
        }
    }

    return $out;
}
493
cd894f84
EM
/**
 * Processes grouped file results into documents, with attached matching files.
 *
 * Each group is one "master document" plus the files that matched for it. Access is checked
 * against the first document in the group. When the master document itself is not part of
 * the group, it is fetched afterwards through get_missing_docs().
 *
 * @param SolrObject $response The response returned from solr server
 * @param int $limit The maximum number of results to return. 0 for all.
 * @return array Final results to be displayed.
 */
protected function grouped_files_process_response($response, $limit = 0) {
    // If we can't find the grouping, or there are no matches in the grouping, return empty.
    if (!isset($response->grouped->solr_filegroupingid) || empty($response->grouped->solr_filegroupingid->matches)) {
        return array();
    }

    $numgranted = 0;
    $orderedids = array();
    $completedocs = array();
    $incompletedocs = array();

    // Highlighting may be missing from the response; the per-group isset() below copes with null.
    $highlightingobj = isset($response->highlighting) ? $response->highlighting : null;

    // Each group represents a "master document".
    $groups = $response->grouped->solr_filegroupingid->groups;
    foreach ($groups as $group) {
        $groupid = $group->groupValue;
        $groupdocs = $group->doclist->docs;
        if (empty($groupdocs)) {
            // A group without documents gives us nothing to check access on or to build from.
            continue;
        }
        $firstdoc = reset($groupdocs);

        if (!$searcharea = $this->get_search_area($firstdoc->areaid)) {
            // Well, this is a problem.
            continue;
        }

        // Check for access.
        $access = $searcharea->check_access($firstdoc->itemid);
        switch ($access) {
            case \core_search\manager::ACCESS_DELETED:
                // If deleted from Moodle, delete from index and then continue.
                $this->delete_by_id($firstdoc->id);
                // Remove one from our processed and total counters, since we promptly deleted.
                $this->processeddocs--;
                $this->totalenginedocs--;
                continue 2;
            case \core_search\manager::ACCESS_DENIED:
                // This means we should just skip for the current user.
                $this->skippeddocs++;
                continue 2;
        }
        $numgranted++;

        $maindoc = false;
        $fileids = array();
        // Separate the main document and any files returned.
        foreach ($groupdocs as $groupdoc) {
            if ($groupdoc->id == $groupid) {
                $maindoc = $groupdoc;
            } else if (isset($groupdoc->solr_fileid)) {
                $fileids[] = $groupdoc->solr_fileid;
            }
        }

        // Store the id of this group, in order, for later merging.
        $orderedids[] = $groupid;

        if (!$maindoc) {
            // We don't have the main doc, store what we know for later building.
            $incompletedocs[$groupid] = $fileids;
        } else {
            if (isset($highlightingobj->$groupid)) {
                // Merge the highlighting for this doc.
                $this->merge_highlight_field_values($maindoc, $highlightingobj->$groupid);
            }
            $docdata = $this->standarize_solr_obj($maindoc);
            $doc = $this->to_document($searcharea, $docdata);
            // Now we need to attach the result files to the doc.
            foreach ($fileids as $fileid) {
                $doc->add_stored_file($fileid);
            }
            $completedocs[$groupid] = $doc;
        }

        if (!empty($limit) && $numgranted >= $limit) {
            // We have hit the max results, we will just ignore the rest.
            break;
        }
    }

    $incompletedocs = $this->get_missing_docs($incompletedocs);

    $out = array();
    // Now merge the complete and incomplete documents, in results order.
    foreach ($orderedids as $docid) {
        if (isset($completedocs[$docid])) {
            $out[] = $completedocs[$docid];
        } else if (isset($incompletedocs[$docid])) {
            $out[] = $incompletedocs[$docid];
        }
    }

    return $out;
}
596
/**
 * Fetches main documents that were absent from a grouped result and attaches their files.
 *
 * $missingdocs is keyed by the id of each main document to fetch; each value is an array of
 * stored_files or stored file ids to attach to that document once it has been retrieved.
 *
 * The returned array is keyed by document id as well.
 *
 * @param array() $missingdocs An array, indexed by document id, with arrays of files/ids to attach.
 * @return document[]
 */
protected function get_missing_docs($missingdocs) {
    if (empty($missingdocs)) {
        return array();
    }

    $wantedids = array_keys($missingdocs);
    $idfilter = '{!cache=false}id:(' . implode(' OR ', $wantedids) . ')';

    // One query matching everything, narrowed by an id filter to exactly the wanted documents.
    $query = new \SolrQuery();
    $this->set_query($query, '*');
    $this->add_fields($query);
    $query->setRows(count($wantedids));
    $query->addFilterQuery($idfilter);

    // We know the missing docs have already been checked for access, so don't recheck.
    $fetched = $this->process_response($this->get_query_response($query), 0, true);

    $out = array();
    foreach ($fetched as $document) {
        $documentid = $document->get('id');
        if (isset($missingdocs[$documentid])) {
            // Attach the files.
            foreach ($missingdocs[$documentid] as $filedoc) {
                $document->add_stored_file($filedoc);
            }
            $out[$documentid] = $document;
        }
        // Anything we did not ask for is silently dropped.
    }

    return $out;
}
642
95c6aeaf
DM
/**
 * Converts a \SolrObject instance into a plain php array keyed by property name.
 *
 * @param \SolrObject $obj
 * @return array The returned document as an array.
 */
public function standarize_solr_obj(\SolrObject $obj) {
    $docdata = array();

    foreach ($obj->getPropertyNames() as $rawname) {
        // Names are trimmed before use, see
        // http://php.net/manual/en/solrobject.getpropertynames.php#98018.
        $cleanname = trim($rawname);
        $docdata[$cleanname] = $obj->offsetGet($cleanname);
    }

    return $docdata;
}
660
/**
 * Sends a document, and optionally its files, to the search engine.
 *
 * This does not commit to the search engine.
 *
 * @param document $document
 * @param bool $fileindexing True if file indexing is to be used
 * @return bool
 */
public function add_document($document, $fileindexing = false) {
    $stored = $this->add_solr_document($document->export_for_engine());

    if ($stored && $fileindexing) {
        // This will take care of updating all attached files in the index.
        $this->process_document_files($document);
    }

    return $stored ? true : false;
}
95c6aeaf 684
091973db
EM
/**
 * Sends one document, given as a field => value array, to the search engine.
 *
 * Client and server exceptions are reported through debugging() and turned into a false return.
 *
 * @param array $doc
 * @return bool
 */
protected function add_solr_document($doc) {
    $solrdoc = new \SolrInputDocument();
    foreach ($doc as $fieldname => $fieldvalue) {
        $solrdoc->addField($fieldname, $fieldvalue);
    }

    $added = false;
    try {
        $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN);
        $added = true;
    } catch (\SolrClientException $e) {
        debugging('Solr client error adding document with id ' . $doc['id'] . ': ' . $e->getMessage(), DEBUG_DEVELOPER);
    } catch (\SolrServerException $e) {
        // Only the first line is reported; the rest of the message is a full java stacktrace.
        $firstline = strtok($e->getMessage(), "\n");
        debugging('Solr server error adding document with id ' . $doc['id'] . ': ' . $firstline, DEBUG_DEVELOPER);
    }

    return $added;
}
710
cd894f84
EM
/**
 * Indexes the files attached to a document, bringing the index in line with the document.
 *
 * For documents that aren't known to be new, the index is checked for existing files:
 * - New files are added.
 * - Existing and unchanged files are skipped.
 * - Files that are in the index but no longer on the document are deleted from the index.
 * - Files that have changed are re-indexed.
 *
 * @param document $document
 */
protected function process_document_files($document) {
    if (!$this->file_indexing_enabled()) {
        return;
    }

    // Maximum rows to process at a time.
    $batchsize = 500;

    // Files currently attached; entries are removed as we find they need no indexing.
    $files = $document->get_files();

    if (!$document->get_is_new()) {
        $staleids = array();
        $offset = 0;

        // Indexed files are read in batches, so documents with lots of files are handled cleanly.
        list($numfound, $indexedfiles) = $this->get_indexed_files($document, 0, $batchsize);

        do {
            foreach ($indexedfiles as $indexedfile) {
                $fileid = $indexedfile->solr_fileid;

                if (!isset($files[$fileid])) {
                    // No longer attached. Deletion is deferred: we are paging through results
                    // and deleting now could reorder them.
                    $staleids[] = $indexedfile->id;
                    continue;
                }

                $current = $files[$fileid];

                // Filelib does not guarantee time modified is updated, so the other important
                // values are compared too. The last clause covers files that filtering blocked
                // last time but that current settings consider indexable.
                $needsreindex = $indexedfile->modified != $current->get_timemodified()
                    || strcmp($indexedfile->title, $current->get_filename()) !== 0
                    || $indexedfile->solr_filecontenthash != $current->get_contenthash()
                    || ($indexedfile->solr_fileindexstatus == document::INDEXED_FILE_FALSE
                        && $this->file_is_indexable($current));

                if (!$needsreindex) {
                    // Already indexed and unchanged, so there is nothing to send.
                    unset($files[$fileid]);
                }
            }

            $offset += $batchsize;
            if ($offset < $numfound) {
                // If we haven't hit the total count yet, fetch the next batch.
                list($numfound, $indexedfiles) = $this->get_indexed_files($document, $offset, $batchsize);
            }
        } while ($offset < $numfound);

        // Delete files that are no longer attached.
        foreach ($staleids as $staleid) {
            // We directly delete the item using the client, as the engine delete_by_id won't work on file docs.
            $this->get_search_client()->deleteById($staleid);
        }
    }

    // Whatever is left in $files is new or changed and gets indexed now.
    foreach ($files as $file) {
        $this->add_stored_file($document, $file);
    }
}
793
/**
 * Looks up the files currently indexed for a document, one page at a time.
 *
 * @param document $document
 * @param int $start The row to start the results on. Zero indexed.
 * @param int $rows The number of rows to fetch
 * @return array A two element array, the first is the total number of available results, the second is an array
 *               of documents for the current request.
 */
protected function get_indexed_files($document, $start = 0, $rows = 500) {
    // A stripped down, purpose built query: everything of type file in this document's grouping.
    $query = new \SolrQuery();
    $query->setQuery('*');
    $query->setRows($rows);
    $query->setStart($start);
    // We want a consistent sorting.
    $query->addSortField('id');

    // We only want the bare minimum of fields.
    $wantedfields = array('id', 'modified', 'title', 'solr_fileid', 'solr_filecontenthash', 'solr_fileindexstatus');
    foreach ($wantedfields as $fieldname) {
        $query->addField($fieldname);
    }

    $query->addFilterQuery('{!cache=false}solr_filegroupingid:(' . $document->get('id') . ')');
    $query->addFilterQuery('type:' . \core_search\manager::TYPE_FILE);

    $response = $this->get_query_response($query);
    if (!empty($response->response->numFound)) {
        return array($response->response->numFound, $this->convert_file_results($response));
    }

    return array(0, array());
}
833
/**
 * A very lightweight converter from a Solr response to the details kept about indexed files.
 *
 * @param SolrObject $responsedoc A Solr response document
 * @return stdClass[] An array of objects that contain the basic information for file processing.
 */
protected function convert_file_results($responsedoc) {
    $docs = $responsedoc->response->docs;
    if (!$docs) {
        return array();
    }

    $out = array();
    foreach ($docs as $doc) {
        // Copy across only what file processing needs.
        $out[] = (object) array(
            'id' => $doc->id,
            'modified' => document::import_time_from_engine($doc->modified),
            'title' => $doc->title,
            'solr_fileid' => $doc->solr_fileid,
            'solr_filecontenthash' => $doc->solr_filecontenthash,
            'solr_fileindexstatus' => $doc->solr_fileindexstatus,
        );
    }

    return $out;
}
861
/**
 * Sends a stored file to Solr's extracting handler so its content gets indexed.
 *
 * Only the filename is sent to Solr/Tika, never the mime type: Tika has much better content
 * type detection than Moodle, and sending mime types would cause many more document failures.
 *
 * When the file is not considered indexable, or when the extract request fails for any reason,
 * the file's base fields are still added through add_solr_document() with a
 * solr_fileindexstatus flag recording why the content is missing.
 *
 * @param document $document The document the file belongs to
 * @param \stored_file $storedfile The file to send to the search engine
 * @return void
 */
protected function add_stored_file($document, $storedfile) {
    $filedata = $document->export_file_for_engine($storedfile);

    if (!$this->file_is_indexable($storedfile)) {
        // Files we don't consider indexable still get a reference in the search engine.
        $filedata['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE;
        $this->add_solr_document($filedata);
        return;
    }

    $client = $this->get_curl_object();
    $extracturl = $this->get_connection_url('/update/extract');

    // Fixed parameters, applied in this order.
    $fixedparams = array(
        // Stops solr from automatically making fields for every tika output.
        'uprefix' => 'ignored_',
        // Controls how content is captured, keeping the file content clean of non-important metadata.
        'captureAttr' => 'true',
        // Moves the content to a field for indexing.
        'fmap.content' => 'solr_filecontent',
        // Common fields that match the standard *_point dynamic field and cause an error.
        'fmap.media_white_point' => 'ignored_mwp',
        'fmap.media_black_point' => 'ignored_mbp',
    );
    foreach ($fixedparams as $name => $setting) {
        $extracturl->param($name, $setting);
    }

    // Every document field is passed as a literal. It goes into a temporary name first and is then
    // mapped back to the real field, which prevents errors or Tika overwriting common field names.
    foreach ($filedata as $field => $fieldvalue) {
        // Discard any Tika field that matches our schema so it can't overwrite ours.
        $extracturl->param('fmap.'.$field, 'ignored_'.$field);
        // Our value, stored under the temporary name.
        $extracturl->param('literal.mdltmp_'.$field, $fieldvalue);
        // The temporary name mapped to the final field.
        $extracturl->param('fmap.mdltmp_'.$field, $field);
    }

    // The true filename, for Tika.
    $extracturl->param('resource.name', $storedfile->get_filename());

    // Everything below is error checking around one curl request. The outcome is informational only,
    // since per file/doc results are not tracked.
    try {
        $response = $client->post($extracturl->out(false), array('myfile' => $storedfile));

        $errno = $client->get_errno();
        $transferinfo = $client->get_info();

        if ($errno != 0) {
            // An internal cURL error occurred; the error text is in the response.
            $message = 'Curl error '.$errno.' while indexing file with document id '.$filedata['id'].': '.$response.'.';
            debugging($message, DEBUG_DEVELOPER);
        } else if (isset($transferinfo['http_code']) && ($transferinfo['http_code'] !== 200)) {
            // Unexpected HTTP response code.
            $message = 'Error while indexing file with document id '.$filedata['id'];
            // Use the error text from msg, or failing that from title, when one is present.
            $found = array();
            if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $response, $found)
                    || preg_match('|<title[^>]*>([^>]*)</title>|i', $response, $found)) {
                $message .= ': '.$found[1];
            }
            // This happens whenever a file fails to index for any reason, so it is kept quieter.
            if (CLI_SCRIPT && !PHPUNIT_TEST) {
                mtrace($message);
            }
        } else if (!preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $response, $found)) {
            // The expected status field is missing: the response is unprocessable.
            $message = 'Unexpected Solr response while indexing file with document id '.$filedata['id'].': ';
            $message .= strtok($response, "\n");
            debugging($message, DEBUG_DEVELOPER);
        } else if ((int)$found[1] !== 0) {
            // A status field was returned, but not the expected 0.
            $message = 'Unexpected Solr status code '.(int)$found[1];
            $message .= ' while indexing file with document id '.$filedata['id'].'.';
            debugging($message, DEBUG_DEVELOPER);
        } else {
            // The document was successfully indexed.
            return;
        }
    } catch (\Exception $e) {
        // Per-file success is not tracked, so report and carry on.
        debugging('Unknown exception while indexing file "'.$storedfile->get_filename().'".', DEBUG_DEVELOPER);
    }

    // Reaching this point means the file content was not indexed: store just the base info, flagged as an error.
    $filedata['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR;
    $this->add_solr_document($filedata);
}
967
/**
 * Decides whether the content of a file should be sent for indexing.
 *
 * A file is rejected when a maxindexfilekb limit is configured and the file is larger than it,
 * or when the file is a Moodle backup.
 *
 * @param \stored_file $file The file to check
 * @return bool True if the file can be indexed
 */
protected function file_is_indexable($file) {
    if (!empty($this->config->maxindexfilekb)) {
        // The limit is configured in kilobytes; the file size is compared against it in bytes.
        $maxbytes = $this->config->maxindexfilekb * 1024;
        if ($file->get_filesize() > $maxbytes) {
            return false;
        }
    }

    // Moodle backup files are not indexed, there is nothing usefully indexable in them.
    return $file->get_mimetype() != 'application/vnd.moodle.backup';
}
989
95c6aeaf
DM
/**
 * Asks the Solr client to commit all pending changes.
 *
 * @return void
 */
protected function commit() {
    $client = $this->get_search_client();
    $client->commit();
}
998
075fa912
EM
/**
 * Finishes indexing of a search area by committing the pending changes.
 *
 * Returning false would prevent the search area completed time and stats from being updated;
 * this implementation always returns true once the commit call has returned.
 *
 * @param \core_search\base $searcharea The search area that was complete (not used here)
 * @param int $numdocs The number of documents that were added to the index (not used here)
 * @param bool $fullindex True if a full index is being performed (not used here)
 * @return bool True means that data is considered indexed
 */
public function area_index_complete($searcharea, $numdocs = 0, $fullindex = false) {
    // Make everything added for this area visible in the index.
    $this->commit();

    $indexed = true;
    return $indexed;
}
1014
cd894f84
EM
/**
 * Return true if file indexing is supported and enabled. False otherwise.
 *
 * The fileindexing setting is read with !empty(), like the other settings in this class, so a
 * setting that has never been saved simply counts as disabled instead of raising an
 * undefined property notice.
 *
 * @return bool
 */
public function file_indexing_enabled() {
    return !empty($this->config->fileindexing);
}
1023
95c6aeaf
DM
/**
 * Defragments the index by asking the Solr client to optimize it.
 *
 * @return void
 */
public function optimize() {
    $client = $this->get_search_client();
    $client->optimize(1, true, false);
}
1032
/**
 * Deletes the specified document, together with all of its related files.
 *
 * The document and its files share the same solr_filegroupingid, so a single delete query on
 * that field removes them all. The id is escaped before being placed in the query so that any
 * character with a meaning in the Solr query syntax is matched literally.
 *
 * @param string $id The document id to delete
 * @return void
 */
public function delete_by_id($id) {
    // Fetch the client first: it reports a missing Solr extension before \SolrUtils is touched.
    $client = $this->get_search_client();

    $client->deleteByQuery('solr_filegroupingid:' . \SolrUtils::escapeQueryChars($id));
    $this->commit();
}
1044
/**
 * Deletes the documents of one search area, or every document in the index.
 *
 * With a non-empty area id only the documents of that area are deleted; otherwise the whole
 * index is emptied. The area id is escaped before being placed in the query so that any
 * character with a meaning in the Solr query syntax is matched literally.
 *
 * @param string $areaid The search area id, or null (or any empty value) to delete everything
 * @return void
 */
public function delete($areaid = null) {
    // Fetch the client first: it reports a missing Solr extension before \SolrUtils is touched.
    $client = $this->get_search_client();

    if ($areaid) {
        $client->deleteByQuery('areaid:' . \SolrUtils::escapeQueryChars($areaid));
    } else {
        $client->deleteByQuery('*:*');
    }
    $this->commit();
}
1059
/**
 * Checks that the Solr server is configured and that its schema is set up.
 *
 * @return true|string Returns true if all good or an error string.
 */
public function is_server_ready() {

    // A configuration problem is reported as is.
    $status = $this->is_server_configured();
    if ($status !== true) {
        return $status;
    }

    // The schema must already be set up; a validation failure is reported through its message.
    try {
        $solrschema = new \search_solr\schema();
        $solrschema->validate_setup();
    } catch (\moodle_exception $problem) {
        return $problem->getMessage();
    }

    return true;
}
1082
/**
 * Is the solr server properly configured?
 *
 * Checks, in order: that a hostname and an index name are set, that a client can be obtained,
 * and that the server reports a major version of at least 4.
 *
 * @return true|string Returns true if all good or an error string.
 */
public function is_server_configured() {

    $hashostname = !empty($this->config->server_hostname);
    $hasindexname = !empty($this->config->indexname);
    if (!$hashostname || !$hasindexname) {
        return 'No solr configuration found';
    }

    // Passing false asks get_search_client() not to trigger its own exception.
    $client = $this->get_search_client(false);
    if (!$client) {
        return get_string('engineserverstatus', 'search');
    }

    try {
        $majorversion = $this->get_solr_major_version();
    } catch (\SolrClientException $ex) {
        debugging('Solr client error: ' . html_to_text($ex->getMessage()), DEBUG_DEVELOPER);
        return get_string('engineserverstatus', 'search');
    } catch (\SolrServerException $ex) {
        debugging('Solr server error: ' . html_to_text($ex->getMessage()), DEBUG_DEVELOPER);
        return get_string('engineserverstatus', 'search');
    }

    if ($majorversion < 4) {
        // Minimum solr 4.0.
        return get_string('minimumsolr4', 'search_solr');
    }

    return true;
}
1113
23fc1be8
DM
/**
 * Returns the solr server major version.
 *
 * The version is read from the 'solr-spec-version' entry of the 'lucene' section of the
 * server's system information. Only the leading integer of that string is used, so both
 * "6.1.0" and a version string without any dot give the expected major number.
 *
 * @return int
 */
public function get_solr_major_version() {
    // We should really ping first the server to see if the specified indexname is valid but
    // we want to minimise solr server requests as they are expensive. system() emits a warning
    // if it can not connect to the configured index in the configured server, hence the @.
    $systemdata = @$this->get_search_client()->system();
    $solrversion = $systemdata->getResponse()->offsetGet('lucene')->offsetGet('solr-spec-version');

    // intval() stops at the first non-digit, which avoids the substr()/strpos() pair that
    // produced 0 whenever the string contained no dot.
    return intval($solrversion);
}
1127
95c6aeaf
DM
/**
 * Checks if the PHP Solr extension is available.
 *
 * The check looks for the solr_get_version() function.
 *
 * @return bool
 */
public function is_installed() {
    $available = function_exists('solr_get_version');
    return $available;
}
1136
/**
 * Returns the solr client instance.
 *
 * The client is only kept for reuse when $this->cacheclient is set. The original note on this
 * method says SolrClient is not reused on libcurl 7.35.0, due to a bug in that version of curl.
 *
 * A failure to construct the client is reported according to $triggerexception: an
 * engine_exception when it is true, a false return value when it is not. A missing Solr
 * extension always results in an engine_exception.
 *
 * @throws \core_search\engine_exception
 * @param bool $triggerexception Whether a client construction failure should throw
 * @return \SolrClient|false The client, or false on failure when $triggerexception is false
 */
protected function get_search_client($triggerexception = true) {

    // Type comparison as it is set to false if not available.
    if ($this->client !== null) {
        return $this->client;
    }

    $options = array(
        'hostname' => $this->config->server_hostname,
        'path' => '/solr/' . $this->config->indexname,
        'login' => !empty($this->config->server_username) ? $this->config->server_username : '',
        'password' => !empty($this->config->server_password) ? $this->config->server_password : '',
        'port' => !empty($this->config->server_port) ? $this->config->server_port : '',
        'secure' => !empty($this->config->secure) ? true : false,
        'ssl_cert' => !empty($this->config->ssl_cert) ? $this->config->ssl_cert : '',
        'ssl_key' => !empty($this->config->ssl_key) ? $this->config->ssl_key : '',
        'ssl_keypassword' => !empty($this->config->ssl_keypassword) ? $this->config->ssl_keypassword : '',
        'ssl_cainfo' => !empty($this->config->ssl_cainfo) ? $this->config->ssl_cainfo : '',
        'ssl_capath' => !empty($this->config->ssl_capath) ? $this->config->ssl_capath : '',
        'timeout' => !empty($this->config->server_timeout) ? $this->config->server_timeout : '30'
    );

    if (!class_exists('\SolrClient')) {
        throw new \core_search\engine_exception('enginenotinstalled', 'search', '', 'solr');
    }

    // "new" can never evaluate to false; a constructor that rejects the options throws instead.
    // That exception is translated here so $triggerexception is actually honoured.
    try {
        $client = new \SolrClient($options);
    } catch (\SolrIllegalArgumentException $ex) {
        debugging('Solr client error: ' . html_to_text($ex->getMessage()), DEBUG_DEVELOPER);
        if ($triggerexception) {
            throw new \core_search\engine_exception('engineserverstatus', 'search');
        }
        return false;
    }

    if ($this->cacheclient) {
        $this->client = $client;
    }

    return $client;
}
5dc4624c
EM
1184
/**
 * Returns a curl object for connecting to solr.
 *
 * The object is created on first use and kept in $this->curl; later calls return that same
 * object. SSL options are only set when the secure setting is on, and a basic Authorization
 * header is only added when both a username and a password are configured.
 *
 * @return \curl
 */
public function get_curl_object() {
    if ($this->curl !== null) {
        return $this->curl;
    }

    $this->curl = new \curl();

    // Build the SSL options. Based on pecl-solr and general testing.
    $ssloptions = array();
    if (!empty($this->config->secure)) {
        if (!empty($this->config->ssl_cert)) {
            $ssloptions['CURLOPT_SSLCERT'] = $this->config->ssl_cert;
            $ssloptions['CURLOPT_SSLCERTTYPE'] = 'PEM';
        }

        if (!empty($this->config->ssl_key)) {
            $ssloptions['CURLOPT_SSLKEY'] = $this->config->ssl_key;
            $ssloptions['CURLOPT_SSLKEYTYPE'] = 'PEM';
        }

        // The remaining settings each map straight onto a single curl option.
        $directoptions = array(
            'ssl_keypassword' => 'CURLOPT_KEYPASSWD',
            'ssl_cainfo' => 'CURLOPT_CAINFO',
            'ssl_capath' => 'CURLOPT_CAPATH',
        );
        foreach ($directoptions as $setting => $curloption) {
            if (!empty($this->config->{$setting})) {
                $ssloptions[$curloption] = $this->config->{$setting};
            }
        }
    }

    $this->curl->setopt($ssloptions);

    $hasusername = !empty($this->config->server_username);
    $haspassword = !empty($this->config->server_password);
    if ($hasusername && $haspassword) {
        $credentials = $this->config->server_username . ':' . $this->config->server_password;
        $this->curl->setHeader('Authorization: Basic ' . base64_encode($credentials));
    }

    return $this->curl;
}
1232
/**
 * Return a Moodle url object for the server connection.
 *
 * The url is built as protocol://hostname[:port]/solr/indexname/path, with trailing slashes
 * removed from the hostname and leading slashes removed from the path.
 *
 * @param string $path The solr path to append.
 * @return \moodle_url
 */
public function get_connection_url($path) {
    // Must use the proper protocol, or SSL will fail.
    $scheme = empty($this->config->secure) ? 'http' : 'https';

    $host = rtrim($this->config->server_hostname, '/');

    // The port part is only present when a port is configured.
    $port = '';
    if (!empty($this->config->server_port)) {
        $port = ':' . $this->config->server_port;
    }

    $location = $scheme . '://' . $host . $port . '/solr/' . $this->config->indexname . '/' . ltrim($path, '/');

    return new \moodle_url($location);
}
95c6aeaf 1250}