Merge branch 'MDL-62899-search-icons-master' of https://github.com/dmitriim/moodle
[moodle.git] / search / classes / document.php
CommitLineData
db48207e
DM
1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
16
17/**
18 * Document representation.
19 *
20 * @package core_search
21 * @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23 */
24
25namespace core_search;
26
27defined('MOODLE_INTERNAL') || die();
28
29/**
30 * Represents a document to index.
31 *
32 * Note that, if you are writing a search engine and you want to change \core_search\document
33 * behaviour, you can override this class; it will be automatically loaded from \search_YOURENGINE\document.
34 *
35 * @package core_search
36 * @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
37 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
38 */
39class document implements \renderable, \templatable {
40
41 /**
42 * @var array $data The document data.
43 */
44 protected $data = array();
45
46 /**
47 * @var array Extra data needed to render the document.
48 */
49 protected $extradata = array();
50
51 /**
52 * @var \moodle_url Link to the document.
53 */
54 protected $docurl = null;
55
56 /**
57 * @var \moodle_url Link to the document context.
58 */
59 protected $contexturl = null;
60
66f145ef
DM
61 /**
62 * @var \core_search\document_icon Document icon instance.
63 */
64 protected $docicon = null;
65
db48207e
DM
66 /**
67 * @var int|null The content field filearea.
68 */
69 protected $contentfilearea = null;
70
71 /**
72 * @var int|null The content field itemid.
73 */
74 protected $contentitemid = null;
75
091973db
EM
76 /**
77 * @var bool Should be set to true if document hasn't been indexed before. False if unknown.
78 */
79 protected $isnew = false;
80
81 /**
82 * @var \stored_file[] An array of stored files to attach to the document.
83 */
84 protected $files = array();
85
4359ef18 86 /**
87 * Change list (for engine implementers):
88 * 2017091700 - add optional field groupid
89 *
90 * @var int Schema version number (update if any change)
91 */
92 const SCHEMA_VERSION = 2017091700;
93
db48207e
DM
94 /**
95 * All required fields any doc should contain.
96 *
97 * We have to choose a format to specify field types, using solr format as we have to choose one and solr is the
98 * default search engine.
99 *
100 * Search engine plugins are responsible for setting their appropriate field types and mapping these names to whatever format
101 * they need.
102 *
103 * @var array
104 */
105 protected static $requiredfields = array(
106 'id' => array(
107 'type' => 'string',
108 'stored' => true,
109 'indexed' => false
110 ),
111 'itemid' => array(
112 'type' => 'int',
113 'stored' => true,
114 'indexed' => true
115 ),
116 'title' => array(
4894840d 117 'type' => 'text',
db48207e 118 'stored' => true,
3744ceb6
EM
119 'indexed' => true,
120 'mainquery' => true
db48207e
DM
121 ),
122 'content' => array(
4894840d 123 'type' => 'text',
db48207e 124 'stored' => true,
3744ceb6
EM
125 'indexed' => true,
126 'mainquery' => true
db48207e
DM
127 ),
128 'contextid' => array(
129 'type' => 'int',
130 'stored' => true,
131 'indexed' => true
132 ),
133 'areaid' => array(
134 'type' => 'string',
135 'stored' => true,
136 'indexed' => true
137 ),
138 'type' => array(
139 'type' => 'int',
140 'stored' => true,
141 'indexed' => true
142 ),
143 'courseid' => array(
144 'type' => 'int',
145 'stored' => true,
a4902f66 146 'indexed' => true
db48207e 147 ),
f6b425e2
EM
148 'owneruserid' => array(
149 'type' => 'int',
150 'stored' => true,
151 'indexed' => true
152 ),
db48207e
DM
153 'modified' => array(
154 'type' => 'tdate',
155 'stored' => true,
156 'indexed' => true
157 ),
158 );
159
160 /**
161 * All optional fields docs can contain.
162 *
163 * Although it matches solr fields format, this is just to define the field types. Search
164 * engine plugins are responsible for setting their appropriate field types and mapping these
165 * names to whatever format they need.
166 *
167 * @var array
168 */
169 protected static $optionalfields = array(
170 'userid' => array(
171 'type' => 'int',
172 'stored' => true,
a4902f66 173 'indexed' => true
db48207e 174 ),
4359ef18 175 'groupid' => array(
176 'type' => 'int',
177 'stored' => true,
178 'indexed' => true
179 ),
db48207e 180 'description1' => array(
4894840d 181 'type' => 'text',
db48207e 182 'stored' => true,
3744ceb6
EM
183 'indexed' => true,
184 'mainquery' => true
db48207e
DM
185 ),
186 'description2' => array(
4894840d 187 'type' => 'text',
db48207e 188 'stored' => true,
3744ceb6
EM
189 'indexed' => true,
190 'mainquery' => true
091973db 191 )
db48207e
DM
192 );
193
091973db
EM
194 /**
195 * Any fields that are engine specific. These are fields that are solely used by a search engine plugin
196 * for internal purposes.
197 *
198 * Field names should be prefixed with engine name to avoid potential conflict with core fields.
199 *
200 * Uses same format as fields above.
201 *
202 * @var array
203 */
204 protected static $enginefields = array();
205
db48207e
DM
/**
 * Builds a document whose id is unique across search areas.
 *
 * The document id is "<areaid>-<itemid>", where the area id is generated by
 * \core_search\manager::generate_areaid() from the component and area names.
 *
 * @throws \coding_exception If $itemid is not numeric.
 * @param int $itemid An id unique to the search area
 * @param string $componentname The search area component Frankenstyle name
 * @param string $areaname The area name (the search area class name)
 * @return void
 */
public function __construct($itemid, $componentname, $areaname) {

    if (!is_numeric($itemid)) {
        throw new \coding_exception('The itemid should be an integer');
    }

    // Normalise once so that the 'id' and 'itemid' fields can never disagree:
    // is_numeric() also accepts values such as "5.7" or " 5".
    $itemid = intval($itemid);

    $this->data['areaid'] = \core_search\manager::generate_areaid($componentname, $areaname);
    $this->data['id'] = $this->data['areaid'] . '-' . $itemid;
    $this->data['itemid'] = $itemid;
}
224
091973db
EM
/**
 * Attaches a file to this document.
 *
 * Either a \stored_file instance or a numeric file id can be supplied; in both
 * cases the entry is kept in the files list keyed by the file id.
 *
 * @param \stored_file|int $file The file to add, or file id.
 * @return void
 */
public function add_stored_file($file) {
    $fileid = is_numeric($file) ? $file : $file->get_id();
    $this->files[$fileid] = $file;
}
238
/**
 * Returns the files attached to this document.
 *
 * Entries that were added as plain file ids are replaced by the file returned
 * by the file storage; ids for which no file is found are dropped from the list.
 *
 * @return \stored_file[]
 */
public function get_files() {
    foreach ($this->files as $id => $listfile) {
        if (!is_numeric($listfile)) {
            // Not a plain id, nothing to resolve.
            continue;
        }

        $fs = get_file_storage();
        $file = $fs->get_file_by_id($id);

        if ($file) {
            $this->files[$id] = $file;
        } else {
            // Index is out of date and referencing a file that does not exist.
            unset($this->files[$id]);
        }
    }

    return $this->files;
}
260
db48207e
DM
/**
 * Setter.
 *
 * Basic checks to prevent common issues.
 *
 * The field must be defined in the required, optional or engine field lists.
 * 'int' and 'tdate' values must be numeric and are cast to a PHP integer (tdate
 * values are expected to be timestamps). Any other value has disallowed Unicode
 * characters removed and every run of whitespace collapsed to a single space.
 *
 * @throws \coding_exception If the field is unknown or an int/tdate value is not numeric.
 * @throws \moodle_exception If the whitespace clean-up fails (preg_replace returns null).
 * @param string $fieldname The field name
 * @param string|int $value The value to store
 * @return string|int The stored value
 */
public function set($fieldname, $value) {

    // Look the field up in the required, optional and engine definitions, in that order.
    $fielddata = null;
    foreach (array(static::$requiredfields, static::$optionalfields, static::$enginefields) as $definitions) {
        if (!empty($definitions[$fieldname])) {
            $fielddata = $definitions[$fieldname];
            break;
        }
    }

    if (empty($fielddata)) {
        throw new \coding_exception('"' . $fieldname . '" field does not exist.');
    }

    // We want to be strict here, there might be engines that expect us to
    // provide them data with the proper type already set.
    if ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate') {
        // tdate fields should be set as timestamps, later they might be converted to
        // a date format, it depends on the search engine.
        if (!is_numeric($value)) {
            // Arrays and objects cannot be safely concatenated into the message,
            // so report their type instead of the value itself.
            $shown = (is_scalar($value) || $value === null) ? $value : gettype($value);
            throw new \coding_exception('"' . $fieldname . '" value should be an integer and its value is "' . $shown . '"');
        }

        $this->data[$fieldname] = intval($value);
        return $this->data[$fieldname];
    }

    // Remove disallowed Unicode characters.
    $value = \core_text::remove_unicode_non_characters($value);

    // Replace all groups of line breaks and spaces by single spaces.
    $this->data[$fieldname] = preg_replace("/\s+/u", " ", $value);
    if ($this->data[$fieldname] === null) {
        $docid = isset($this->data['id']) ? $this->data['id'] : '(unknown)';
        throw new \moodle_exception('error_indexing', 'search', '', null, '"' . $fieldname .
                '" value causes preg_replace error (may be caused by unusual characters) ' .
                'in document with id "' . $docid . '"');
    }

    return $this->data[$fieldname];
}
318
/**
 * Stores a value in the extra data of the document.
 *
 * Extra data is kept apart from the document fields, but it can still be read
 * through \core_search\document->get($fieldname).
 *
 * @param string $fieldname Name under which the value is stored
 * @param string $value The value to store
 * @return void
 */
public function set_extra($fieldname, $value) {
    $this->extradata[$fieldname] = $value;
}
331
/**
 * Getter.
 *
 * Looks the field up in the document data first and falls back to the extra data.
 * Use self::is_set beforehand if you are not sure whether the field is set, as a
 * missing field triggers a \coding_exception.
 *
 * @throws \coding_exception If the field is set in neither place.
 * @param string $field
 * @return string|int
 */
public function get($field) {

    // Document data takes precedence over extra data.
    foreach (array($this->data, $this->extradata) as $source) {
        if (isset($source[$field])) {
            return $source[$field];
        }
    }

    throw new \coding_exception('Field "' . $field . '" is not set in the document');
}
355
/**
 * Tells whether a field is set, either in the document data or in the extra data.
 *
 * @param string $field
 * @return bool
 */
public function is_set($field) {
    if (isset($this->data[$field])) {
        return true;
    }

    return isset($this->extradata[$field]);
}
365
091973db
EM
/**
 * Flags the document as new (or not). Use false if unknown.
 *
 * The given value is converted to a boolean before being stored.
 *
 * @param bool $new
 */
public function set_is_new($new) {
    $this->isnew = $new ? true : false;
}
374
/**
 * Tells whether the document has been flagged as new. False if unknown.
 *
 * @return bool
 */
public function get_is_new() {
    return $this->isnew;
}
383
db48207e
DM
/**
 * Returns the definitions of all fields: required, optional and engine specific.
 *
 * The lists are combined with the array union operator, so when a field name
 * appears in more than one list the first definition (in that order) is kept.
 *
 * @return array
 */
public static function get_default_fields_definition() {
    $definitions = static::$requiredfields;
    $definitions += static::$optionalfields;
    $definitions += static::$enginefields;

    return $definitions;
}
392
/**
 * Prepares a timestamp to be inserted into the search engine.
 *
 * This default implementation returns the timestamp untouched, so any search engine
 * can store integers and use integer comparison to get documents between two
 * timestamps. Search engines interested in their own field format can provide it by
 * extending this class in \search_xxx\document.
 *
 * @param int $timestamp
 * @return int|string The value to send to the engine; here, $timestamp unchanged.
 */
public static function format_time_for_engine($timestamp) {
    $formatted = $timestamp;

    return $formatted;
}
407
/**
 * Prepares a string value to be sent to the search engine.
 *
 * Search engines may override this method to apply restrictions, like limiting
 * the size. This default implementation returns the string untouched.
 *
 * @param string $string
 * @return string
 */
public static function format_string_for_engine($string) {
    $formatted = $string;

    return $formatted;
}
420
4894840d
EM
/**
 * Prepares a text value to be sent to the search engine.
 *
 * Search engines may override this method to apply restrictions, like limiting
 * the size. This default implementation returns the text untouched.
 *
 * @param string $text
 * @return string
 */
public static function format_text_for_engine($text) {
    $formatted = $text;

    return $formatted;
}
433
db48207e
DM
/**
 * Converts a time value stored in the search engine back to a timestamp.
 *
 * This default implementation returns the value untouched, matching the default
 * behaviour of storing plain timestamps. Search engines using their own field
 * format should convert it back by extending this class in \search_xxx\document.
 *
 * @param string $time
 * @return int
 */
public static function import_time_from_engine($time) {
    $timestamp = $time;

    return $timestamp;
}
448
/**
 * Returns the format of the text coming back from the search engine.
 *
 * This default implementation reports FORMAT_PLAIN.
 *
 * @return int
 */
protected function get_text_format() {
    $format = FORMAT_PLAIN;

    return $format;
}
457
/**
 * Fills the document with data coming from the search engine.
 *
 * Every known field (required, optional and engine specific) present in $docdata
 * is stored through self::set(). tdate values are first passed through
 * import_time_from_engine(); any other value must not be an array.
 *
 * @throws \core_search\engine_exception If a non-tdate field holds multiple values.
 * @param array $docdata
 * @return void
 */
public function set_data_from_engine($docdata) {
    $fields = static::$requiredfields + static::$optionalfields + static::$enginefields;

    foreach ($fields as $fieldname => $field) {

        // Optional params might not be there.
        if (!isset($docdata[$fieldname])) {
            continue;
        }

        $value = $docdata[$fieldname];

        if ($field['type'] === 'tdate') {
            // Time fields may need a preprocessing.
            $this->set($fieldname, static::import_time_from_engine($value));
            continue;
        }

        // No way we can make this work if there is any multivalue field.
        if (is_array($value)) {
            throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $fieldname);
        }

        $this->set($fieldname, $value);
    }
}
484
/**
 * Stores the link to the document.
 *
 * @param \moodle_url $url Link to the document
 * @return void
 */
public function set_doc_url(\moodle_url $url) {
    $this->docurl = $url;
}
494
/**
 * Returns the link to the document, as stored by set_doc_url().
 *
 * @return \moodle_url|null Null when no url has been set.
 */
public function get_doc_url() {
    return $this->docurl;
}
503
66f145ef
DM
/**
 * Stores the icon instance of the document.
 *
 * @param \core_search\document_icon $docicon Document icon instance
 */
public function set_doc_icon(document_icon $docicon) {
    $this->docicon = $docicon;
}
512
/**
 * Returns the icon instance of the document, as stored by set_doc_icon().
 *
 * @return \core_search\document_icon|null Null when no icon has been set.
 */
public function get_doc_icon() {
    return $this->docicon;
}
521
db48207e
DM
/**
 * Stores the link to the document context.
 *
 * @param \moodle_url $url Link to the document context
 * @return void
 */
public function set_context_url(\moodle_url $url) {
    $this->contexturl = $url;
}
525
/**
 * Returns the link to the document context, as stored by set_context_url().
 *
 * @return \moodle_url|null Null when no url has been set.
 */
public function get_context_url() {
    return $this->contexturl;
}
534
/**
 * Returns the document ready to submit to the search engine.
 *
 * Defaults are applied first. Then every required field is checked for presence,
 * and each known field value that is set (required, optional or engine specific)
 * is passed through the engine formatter matching its type. Formatting is applied
 * to a copy of the data, not to the data held by this instance.
 *
 * @throws \coding_exception If a required field is missing.
 * @return array
 */
public function export_for_engine() {
    // Set any unset defaults.
    $this->apply_defaults();

    // We don't want to affect the document instance.
    $data = $this->data;

    // Required fields: check that we have everything we need, then format.
    foreach (static::$requiredfields as $fieldname => $field) {
        if (!isset($data[$fieldname])) {
            throw new \coding_exception('Missing "' . $fieldname . '" field in document with id "' . $this->data['id'] . '"');
        }

        $data[$fieldname] = $this->format_value_for_engine($field['type'], $data[$fieldname]);
    }

    // Optional and engine specific fields: format only the ones that are set.
    $fields = static::$optionalfields + static::$enginefields;
    foreach ($fields as $fieldname => $field) {
        if (isset($data[$fieldname])) {
            $data[$fieldname] = $this->format_value_for_engine($field['type'], $data[$fieldname]);
        }
    }

    return $data;
}

/**
 * Applies the engine dependant format matching a field type to a single value.
 *
 * @param string $type The field type, as declared in the field definitions
 * @param mixed $value The value currently stored in the document
 * @return mixed The formatted value; values of other types are returned untouched.
 */
private function format_value_for_engine($type, $value) {
    if ($type === 'tdate') {
        // Overwrite the timestamp with the engine dependant format.
        return static::format_time_for_engine($value);
    }

    if ($type === 'string') {
        // Overwrite the string with the engine dependant format.
        return static::format_string_for_engine($value);
    }

    if ($type === 'text') {
        // Overwrite the text with the engine dependant format.
        return static::format_text_for_engine($value);
    }

    return $value;
}
588
091973db
EM
/**
 * Fills in default values for fields that are still unset. Called after the document
 * has been built, but before it is exported.
 *
 * Sub-classes of this should make sure to call parent::apply_defaults().
 */
protected function apply_defaults() {
    if (isset($this->data['type'])) {
        return;
    }

    // Set the default type, TYPE_TEXT.
    $this->data['type'] = manager::TYPE_TEXT;
}
600
db48207e
DM
/**
 * Export the document data to be used as a template context.
 *
 * More info than strictly required is added, as people might be interested in
 * extending the template.
 *
 * Although content is a required field when setting up the document, it accepts ''
 * (empty) values as they may be the result of stripping out HTML.
 *
 * SECURITY NOTE: It is the responsibility of the document to properly escape any text to be displayed.
 * The renderer will output the content without any further cleaning.
 *
 * @param \renderer_base $output The renderer.
 * @return array
 */
public function export_for_template(\renderer_base $output) {
    list($componentname, $areaname) = \core_search\manager::extract_areaid_parts($this->get('areaid'));

    $searcharea = \core_search\manager::get_search_area($this->data['areaid']);

    $title = '';
    if ($this->is_set('title')) {
        $title = $this->format_text($searcharea->get_document_display_title($this));
    }

    // The entries are filled in one by one, in the order the template context lists them.
    $data = array();
    $data['componentname'] = $componentname;
    $data['areaname'] = $areaname;
    $data['courseurl'] = course_get_url($this->get('courseid'));
    $data['coursefullname'] = format_string($this->get('coursefullname'), true, array('context' => $this->get('contextid')));
    $data['modified'] = userdate($this->get('modified'));
    $data['title'] = ($title !== '') ? $title : get_string('notitle', 'search');
    $data['docurl'] = $this->get_doc_url();
    $data['content'] = $this->is_set('content') ? $this->format_text($this->get('content')) : null;
    $data['contexturl'] = $this->get_context_url();
    $data['description1'] = $this->is_set('description1') ? $this->format_text($this->get('description1')) : null;
    $data['description2'] = $this->is_set('description2') ? $this->format_text($this->get('description2')) : null;

    // Now take any attached files.
    $files = $this->get_files();
    if (count($files) > 1) {
        $filenames = array();
        foreach ($files as $file) {
            $filenames[] = format_string($file->get_filename(), true, array('context' => $this->get('contextid')));
        }
        $data['multiplefiles'] = true;
        $data['filenames'] = $filenames;
    } else if (!empty($files)) {
        $file = reset($files);
        $data['filename'] = format_string($file->get_filename(), true, array('context' => $this->get('contextid')));
    }

    if ($this->is_set('userid')) {
        $data['userurl'] = new \moodle_url('/user/view.php', array('id' => $this->get('userid'), 'course' => $this->get('courseid')));
        $data['userfullname'] = format_string($this->get('userfullname'), true, array('context' => $this->get('contextid')));
    }

    $docicon = $this->get_doc_icon();
    if ($docicon) {
        $data['icon'] = $output->image_url($docicon->get_name(), $docicon->get_component());
    }

    return $data;
}
661
/**
 * Formats a text string coming from the search engine.
 *
 * The text is passed to format_text() using the format reported by
 * get_text_format() and the document context id:
 * - Search areas are responsible for sending just plain data, the search engine may
 *   append HTML or markdown to it (highlighting for example).
 * - The view is responsible for shortening the text if it is too big
 *
 * @param string $text Text to format
 * @return string HTML text to be rendered
 */
protected function format_text($text) {
    $format = $this->get_text_format();
    $options = array('context' => $this->get('contextid'));

    return format_text($text, $format, $options);
}
676}