2 // This file is part of Moodle - http://moodle.org/
4 // Moodle is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
9 // Moodle is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
18 * Document representation.
20 * @package core_search
21 * @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
25 namespace core_search;
27 defined('MOODLE_INTERNAL') || die();
30 * Represents a document to index.
32 * Note that, if you are writing a search engine and you want to change \core_search\document
33 * behaviour, you can override this class; it will be automatically loaded from \search_YOURENGINE\document.
35 * @package core_search
36 * @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
37 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
39 class document implements \renderable, \templatable {
42 * @var array $data The document data.
44 protected $data = array();
47 * @var array Extra data needed to render the document.
49 protected $extradata = array();
52 * @var \moodle_url Link to the document.
54 protected $docurl = null;
57 * @var \moodle_url Link to the document context.
59 protected $contexturl = null;
62 * @var \core_search\document_icon Document icon instance.
64 protected $docicon = null;
67 * @var int|null The content field filearea.
69 protected $contentfilearea = null;
72 * @var int|null The content field itemid.
74 protected $contentitemid = null;
77 * @var bool Should be set to true if document hasn't been indexed before. False if unknown.
79 protected $isnew = false;
82 * @var \stored_file[] An array of stored files to attach to the document.
84 protected $files = array();
87 * Change list (for engine implementers):
88 * 2017091700 - add optional field groupid
90 * @var int Schema version number (update if any change)
92 const SCHEMA_VERSION = 2017091700;
95 * All required fields any doc should contain.
97 * We have to choose a format to specify field types, using solr format as we have to choose one and solr is the
98 * default search engine.
100 * Search engine plugins are responsible for setting their appropriate field types and mapping these names to whatever format
105 protected static $requiredfields = array(
128 'contextid' => array(
148 'owneruserid' => array(
161 * All optional fields docs can contain.
163 * Although it matches the Solr field format, this is just to define the field types. Search
164 * engine plugins are responsible for setting their appropriate field types and mapping these
165 * names to whatever format they need.
169 protected static $optionalfields = array(
180 'description1' => array(
186 'description2' => array(
195 * Any fields that are engine specific. These are fields that are solely used by a search engine plugin
196 * for internal purposes.
198 * Field names should be prefixed with engine name to avoid potential conflict with core fields.
200 * Uses same format as fields above.
204 protected static $enginefields = array();
/**
 * Builds a document whose id is unique across search areas.
 *
 * The document id is the generated area id and the item id joined by a dash.
 *
 * @throws \coding_exception If $itemid is not numeric.
 * @param int $itemid An id unique to the search area
 * @param string $componentname The search area component Frankenstyle name
 * @param string $areaname The area name (the search area class name)
 */
public function __construct($itemid, $componentname, $areaname) {

    if (!is_numeric($itemid)) {
        throw new \coding_exception('The itemid should be an integer');
    }

    $areaid = \core_search\manager::generate_areaid($componentname, $areaname);

    $this->data['areaid'] = $areaid;
    $this->data['id'] = $areaid . '-' . $itemid;
    // The id keeps $itemid as given; the itemid field stores it as an integer.
    $this->data['itemid'] = intval($itemid);
}
/**
 * Attaches a stored file to the document.
 *
 * A numeric argument is treated as a file id and stored as-is under that id;
 * any other argument is stored under the value of its get_id() method.
 *
 * @param \stored_file|int $file The file to add, or file id.
 */
public function add_stored_file($file) {
    $fileid = is_numeric($file) ? $file : $file->get_id();
    $this->files[$fileid] = $file;
}
240 * Returns the array of attached files.
242 * @return \stored_file[]
244 public function get_files() {
245 // The files array can contain stored file ids, so we need to get instances if asked.
246 foreach ($this->files as $id => $listfile) {
247 if (is_numeric($listfile)) {
248 $fs = get_file_storage();
250 if ($file = $fs->get_file_by_id($id)) {
251 $this->files[$id] = $file;
253 unset($this->files[$id]); // Index is out of date and referencing a file that does not exist.
/**
 * Stores a field value after some basic checks that prevent common issues.
 *
 * 'int' and 'tdate' fields must be numeric and are cast to a PHP integer (tdate values
 * are expected to be timestamps). Any other field has disallowed Unicode characters
 * removed and every run of whitespace replaced by a single space.
 *
 * @throws \coding_exception If the field is not defined, or an int/tdate value is not numeric.
 * @throws \moodle_exception If the whitespace replacement fails.
 * @param string $fieldname The field name
 * @param string|int $value The value to store
 * @return string|int The stored value
 */
public function set($fieldname, $value) {

    // Required fields take precedence, then optional ones, then engine-specific ones.
    $fielddata = null;
    foreach (array(static::$requiredfields, static::$optionalfields, static::$enginefields) as $definitions) {
        if (!empty($definitions[$fieldname])) {
            $fielddata = $definitions[$fieldname];
            break;
        }
    }

    if (empty($fielddata)) {
        throw new \coding_exception('"' . $fieldname . '" field does not exist.');
    }

    // We want to be strict here, there might be engines that expect us to
    // provide them data with the proper type already set.
    if ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate') {
        // tdate fields should be set as timestamps, later they might be converted to
        // a date format, it depends on the search engine.
        if (!is_numeric($value)) {
            throw new \coding_exception('"' . $fieldname . '" value should be an integer and its value is "' . $value . '"');
        }

        $this->data[$fieldname] = intval($value);
        return $this->data[$fieldname];
    }

    // Remove disallowed Unicode characters.
    $value = \core_text::remove_unicode_non_characters($value);

    // Replace all groups of line breaks and spaces by single spaces.
    $collapsed = preg_replace("/\s+/u", " ", $value);
    $this->data[$fieldname] = $collapsed;

    if ($collapsed === null) {
        // preg_replace returns null on error; report the document id if we know it.
        $docid = isset($this->data['id']) ? $this->data['id'] : '(unknown)';
        throw new \moodle_exception('error_indexing', 'search', '', null, '"' . $fieldname .
                '" value causes preg_replace error (may be caused by unusual characters) ' .
                'in document with id "' . $docid . '"');
    }

    return $collapsed;
}
/**
 * Stores a value in the extra data of the document.
 *
 * No validation is applied here. The value can be read back through
 * \core_search\document->get($fieldname).
 *
 * @param string $fieldname
 * @param string $value
 */
public function set_extra($fieldname, $value) {
    $this->extradata[$fieldname] = $value;
}
/**
 * Returns the value of a field.
 *
 * The document data is checked first and the extra data is used as a fallback.
 * Use self::is_set if you are not sure whether the field is set, as an unset
 * field triggers a \coding_exception.
 *
 * @throws \coding_exception If the field is set in neither the data nor the extra data.
 * @param string $field
 * @return mixed The stored value.
 */
public function get($field) {

    // Document data wins over extra data.
    foreach (array($this->data, $this->extradata) as $source) {
        if (isset($source[$field])) {
            return $source[$field];
        }
    }

    throw new \coding_exception('Field "' . $field . '" is not set in the document');
}
/**
 * Tells whether a field is set, either in the document data or in the extra data.
 *
 * @param string $field
 * @return bool
 */
public function is_set($field) {
    if (isset($this->data[$field])) {
        return true;
    }
    return isset($this->extradata[$field]);
}
/**
 * Flags whether this is a new document. Use false if unknown.
 *
 * @param bool $new The flag; it is cast to bool before being stored.
 */
public function set_is_new($new) {
    $flag = (bool)$new;
    $this->isnew = $flag;
}
376 * Returns if the document is new. False if unknown.
380 public function get_is_new() {
/**
 * Returns the definitions of all default fields.
 *
 * Required, optional and engine-specific definitions are combined with the array
 * union operator, so for a repeated field name the earlier list wins.
 *
 * @return array
 */
public static function get_default_fields_definition() {
    $definitions = static::$requiredfields;
    $definitions += static::$optionalfields;
    $definitions += static::$enginefields;
    return $definitions;
}
394 * Formats the timestamp preparing the time fields to be inserted into the search engine.
396 * By default it just returns a timestamp so any search engine could just store integers
397 * and use integers comparison to get documents between x and y timestamps, but search
398 * engines might be interested in using their own field formats. They can do so by extending
399 * this class in \search_xxx\document.
401 * @param int $timestamp
404 public static function format_time_for_engine($timestamp) {
409 * Formats a string value for the search engine.
411 * Search engines may override this method to apply restrictions, like limiting the size.
412 * The default behaviour is just returning the string.
414 * @param string $string
417 public static function format_string_for_engine($string) {
422 * Formats a text value for the search engine.
424 * Search engines may override this method to apply restrictions, like limiting the size.
425 * The default behaviour is just returning the string.
427 * @param string $text
430 public static function format_text_for_engine($text) {
435 * Returns a timestamp from the value stored in the search engine.
437 * By default it just returns a timestamp so any search engine could just store integers
438 * and use integers comparison to get documents between x and y timestamps, but search
439 * engines might be interested in using their own field formats. They should do so by extending
440 * this class in \search_xxx\document.
442 * @param string $time
445 public static function import_time_from_engine($time) {
450 * Returns how text is returned from the search engine.
454 protected function get_text_format() {
/**
 * Fills the document with data coming from the search engine.
 *
 * Only fields that have a definition and are present in $docdata are copied, each one
 * through self::set(). tdate values go through import_time_from_engine() first.
 *
 * @throws \core_search\engine_exception If a field other than a tdate one holds an array.
 * @param array $docdata
 */
public function set_data_from_engine($docdata) {
    $definitions = static::$requiredfields + static::$optionalfields + static::$enginefields;

    foreach ($definitions as $name => $definition) {

        // Optional params might not be there.
        if (!isset($docdata[$name])) {
            continue;
        }

        if ($definition['type'] === 'tdate') {
            // Time fields may need a preprocessing.
            $this->set($name, static::import_time_from_engine($docdata[$name]));
            continue;
        }

        // No way we can make this work if there is any multivalue field.
        if (is_array($docdata[$name])) {
            throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $name);
        }

        $this->set($name, $docdata[$name]);
    }
}
/**
 * Stores the link to the document.
 *
 * @param \moodle_url $url
 */
public function set_doc_url(\moodle_url $url) {
    $this->docurl = $url;
}
/**
 * Returns the link to the document.
 *
 * @return \moodle_url|null The url, or null (the property's initial value) if none was set.
 */
public function get_doc_url() {
    return $this->docurl;
}
/**
 * Stores the document icon instance.
 *
 * @param \core_search\document_icon $docicon
 */
public function set_doc_icon(document_icon $docicon) {
    $this->docicon = $docicon;
}
/**
 * Returns the document icon instance.
 *
 * @return \core_search\document_icon|null The icon, or null (the property's initial value) if none was set.
 */
public function get_doc_icon() {
    return $this->docicon;
}
/**
 * Stores the link to the document context.
 *
 * @param \moodle_url $url
 */
public function set_context_url(\moodle_url $url) {
    $this->contexturl = $url;
}
/**
 * Returns the link to the document context.
 *
 * @return \moodle_url|null The url, or null (the property's initial value) if none was set.
 */
public function get_context_url() {
    return $this->contexturl;
}
536 * Returns the document ready to submit to the search engine.
538 * @throws \coding_exception
541 public function export_for_engine() {
542 // Set any unset defaults.
543 $this->apply_defaults();
545 // We don't want to affect the document instance.
548 // Apply specific engine-dependent formats and restrictions.
549 foreach (static::$requiredfields as $fieldname => $field) {
551 // We also check that we have everything we need.
552 if (!isset($data[$fieldname])) {
553 throw new \coding_exception('Missing "' . $fieldname . '" field in document with id "' . $this->data['id'] . '"');
556 if ($field['type'] === 'tdate') {
557 // Overwrite the timestamp with the engine-dependent format.
558 $data[$fieldname] = static::format_time_for_engine($data[$fieldname]);
559 } else if ($field['type'] === 'string') {
560 // Overwrite the string with the engine-dependent format.
561 $data[$fieldname] = static::format_string_for_engine($data[$fieldname]);
562 } else if ($field['type'] === 'text') {
563 // Overwrite the text with the engine-dependent format.
564 $data[$fieldname] = static::format_text_for_engine($data[$fieldname]);
569 $fields = static::$optionalfields + static::$enginefields;
570 foreach ($fields as $fieldname => $field) {
571 if (!isset($data[$fieldname])) {
574 if ($field['type'] === 'tdate') {
575 // Overwrite the timestamp with the engine-dependent format.
576 $data[$fieldname] = static::format_time_for_engine($data[$fieldname]);
577 } else if ($field['type'] === 'string') {
578 // Overwrite the string with the engine-dependent format.
579 $data[$fieldname] = static::format_string_for_engine($data[$fieldname]);
580 } else if ($field['type'] === 'text') {
581 // Overwrite the text with the engine-dependent format.
582 $data[$fieldname] = static::format_text_for_engine($data[$fieldname]);
/**
 * Fills in defaults for fields that are still unset. Called after the document is built
 * and before it is exported.
 *
 * The only default applied here is the document type, manager::TYPE_TEXT.
 *
 * Sub-classes of this should make sure to call parent::apply_defaults().
 */
protected function apply_defaults() {
    // A type has been provided already, nothing to default.
    if (isset($this->data['type'])) {
        return;
    }

    $this->data['type'] = manager::TYPE_TEXT;
}
602 * Export the document data to be used as a template context.
604 * Adding more info than the required one as people might be interested in extending the template.
606 * Although content is a required field when setting up the document, it accepts '' (empty) values
607 * as they may be the result of stripping out HTML.
609 * SECURITY NOTE: It is the responsibility of the document to properly escape any text to be displayed.
610 * The renderer will output the content without any further cleaning.
612 * @param renderer_base $output The renderer.
615 public function export_for_template(\renderer_base $output) {
616 list($componentname, $areaname) = \core_search\manager::extract_areaid_parts($this->get('areaid'));
618 $searcharea = \core_search\manager::get_search_area($this->data['areaid']);
619 $title = $this->is_set('title') ? $this->format_text($searcharea->get_document_display_title($this)) : '';
621 'componentname' => $componentname,
622 'areaname' => $areaname,
623 'courseurl' => course_get_url($this->get('courseid')),
624 'coursefullname' => format_string($this->get('coursefullname'), true, array('context' => $this->get('contextid'))),
625 'modified' => userdate($this->get('modified')),
626 'title' => ($title !== '') ? $title : get_string('notitle', 'search'),
627 'docurl' => $this->get_doc_url(),
628 'content' => $this->is_set('content') ? $this->format_text($this->get('content')) : null,
629 'contexturl' => $this->get_context_url(),
630 'description1' => $this->is_set('description1') ? $this->format_text($this->get('description1')) : null,
631 'description2' => $this->is_set('description2') ? $this->format_text($this->get('description2')) : null,
634 // Now add any attached files.
635 $files = $this->get_files();
636 if (!empty($files)) {
637 if (count($files) > 1) {
638 $filenames = array();
639 foreach ($files as $file) {
640 $filenames[] = format_string($file->get_filename(), true, array('context' => $this->get('contextid')));
642 $data['multiplefiles'] = true;
643 $data['filenames'] = $filenames;
645 $file = reset($files);
646 $data['filename'] = format_string($file->get_filename(), true, array('context' => $this->get('contextid')));
650 if ($this->is_set('userid')) {
651 $data['userurl'] = new \moodle_url('/user/view.php', array('id' => $this->get('userid'), 'course' => $this->get('courseid')));
652 $data['userfullname'] = format_string($this->get('userfullname'), true, array('context' => $this->get('contextid')));
655 if ($docicon = $this->get_doc_icon()) {
656 $data['icon'] = $output->image_url($docicon->get_name(), $docicon->get_component());
/**
 * Formats a text string coming from the search engine.
 *
 * The text is passed to format_text() using the format given by get_text_format() and
 * the document's contextid as the 'context' option. Beyond that it is returned as it is:
 * - Search areas are responsible for sending just plain data, the search engine may
 *   append HTML or markdown to it (highlighting for example).
 * - The view is responsible for shortening the text if it is too big.
 *
 * @param string $text Text to format
 * @return string HTML text to be rendered
 */
protected function format_text($text) {
    $format = $this->get_text_format();
    $options = array('context' => $this->get('contextid'));

    return format_text($text, $format, $options);
}