MDL-59661 analytics: Export models training data
[moodle.git] / analytics / classes / local / analyser / base.php
CommitLineData
369389c9
DM
1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
16
17/**
413f19bc 18 * Analysers base class.
369389c9
DM
19 *
20 * @package core_analytics
21 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23 */
24
25namespace core_analytics\local\analyser;
26
27defined('MOODLE_INTERNAL') || die();
28
29/**
413f19bc 30 * Analysers base class.
369389c9
DM
31 *
32 * @package core_analytics
33 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
34 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
35 */
36abstract class base {
37
413f19bc
DM
/**
 * Id of the model this analyser is working for.
 *
 * @var int
 */
protected $modelid;

/**
 * The model target.
 *
 * @var \core_analytics\local\target\base
 */
protected $target;

/**
 * The model indicators.
 *
 * @var \core_analytics\local\indicator\base[]
 */
protected $indicators;

/**
 * Time splitting methods to use.
 *
 * Several time splitting methods are used while a model is being evaluated,
 * a single one once the model is enabled.
 *
 * @var \core_analytics\local\time_splitting\base[]
 */
protected $timesplittings;

/**
 * Execution options. The 'evaluation' key is always set by the constructor.
 *
 * @var array
 */
protected $options;

/**
 * Simple log array, filled through add_log().
 *
 * @var string[]
 */
protected $log;
80
413f19bc
DM
/**
 * Constructor method.
 *
 * Stores the model components, normalises the 'evaluation' option and verifies that this
 * analyser is able to feed every indicator.
 *
 * @throws \core_analytics\requirements_exception If an indicator requires sample data this analyser does not provide.
 * @param int $modelid
 * @param \core_analytics\local\target\base $target
 * @param \core_analytics\local\indicator\base[] $indicators
 * @param \core_analytics\local\time_splitting\base[] $timesplittings
 * @param array $options
 * @return void
 */
public function __construct($modelid, \core_analytics\local\target\base $target, $indicators, $timesplittings, $options) {
    $this->modelid = $modelid;
    $this->target = $target;
    $this->indicators = $indicators;
    $this->timesplittings = $timesplittings;

    // A missing or falsy 'evaluation' option is stored as a strict false, other methods compare it with ===.
    $options['evaluation'] = !empty($options['evaluation']) ? $options['evaluation'] : false;
    $this->options = $options;

    // The indicators requirements have to be satisfied by the sample data this analyser provides.
    $this->check_indicators_requirements();

    $this->log = [];
}
107
/**
 * Returns the complete list of samples of the provided analysable.
 *
 * @param \core_analytics\analysable $analysable
 * @return array array[0] = int[] (sampleids) and array[1] = array (samplesdata)
 */
abstract protected function get_all_samples(\core_analytics\analysable $analysable);
115
/**
 * Returns the samples data that corresponds to a list of sample ids.
 *
 * @param int[] $sampleids
 * @return array array[0] = int[] (sampleids) and array[1] = array (samplesdata)
 */
abstract public function get_samples($sampleids);
123
/**
 * Returns the analysable a sample belongs to.
 *
 * @param int $sampleid
 * @return \core_analytics\analysable
 */
abstract public function get_sample_analysable($sampleid);
131
/**
 * Returns the origin of the samples in the moodle database.
 *
 * The returned value is also the only sample data element that provided_sample_data() reports by default.
 *
 * @return string
 */
abstract protected function get_samples_origin();
138
/**
 * Returns the context of a sample.
 *
 * moodle/analytics:listinsights will be required at this level to access the sample predictions.
 *
 * @param int $sampleid
 * @return \context
 */
abstract public function sample_access_context($sampleid);
148
/**
 * Describes a sample using a description summary and a \renderable (an image for example).
 *
 * @param int $sampleid
 * @param int $contextid
 * @param array $sampledata
 * @return array array(string, \renderable)
 */
abstract public function sample_description($sampleid, $contextid, $sampledata);
158
369389c9
DM
/**
 * Main analyser method, it processes the site analysables.
 *
 * \core_analytics\local\analyser\by_course and \core_analytics\local\analyser\sitewide already implement
 * this method, returning the site courses (by_course) and the whole system (sitewide) as analysables.
 * Extending one of these classes should be enough in most cases, so there is usually no need to
 * reimplement this method.
 *
 * @param bool $includetarget
 * @return \stored_file[]
 */
abstract public function get_analysable_data($includetarget);
171
413f19bc
DM
/**
 * Samples data this analyser provides.
 *
 * By default the samples origin is the only element provided.
 *
 * @return string[]
 */
protected function provided_sample_data() {
    $origin = $this->get_samples_origin();
    return [$origin];
}
180
/**
 * Returns labelled data (training and evaluation).
 *
 * Delegates to get_analysable_data() asking it to include the target.
 *
 * @return \stored_file[]
 */
public function get_labelled_data() {
    $includetarget = true;
    return $this->get_analysable_data($includetarget);
}
189
413f19bc
DM
/**
 * Returns unlabelled data (prediction).
 *
 * Delegates to get_analysable_data() asking it not to include the target.
 *
 * @return \stored_file[]
 */
public function get_unlabelled_data() {
    $includetarget = false;
    return $this->get_analysable_data($includetarget);
}
198
/**
 * Checks that this analyser satisfies the requirements of all the model indicators.
 *
 * Stops at the first indicator whose required sample data is not provided.
 *
 * @throws \core_analytics\requirements_exception
 * @return void
 */
protected function check_indicators_requirements() {

    foreach ($this->indicators as $indicator) {
        $missing = $this->check_indicator_requirements($indicator);
        if ($missing === true) {
            // This indicator is fine, move on to the next one.
            continue;
        }

        $errormessage = get_class($indicator) . ' indicator requires ' . json_encode($missing) .
            ' sample data which is not provided by ' . get_class($this);
        throw new \core_analytics\requirements_exception($errormessage);
    }
}
215
/**
 * Checks that this analyser satisfies the requirements of the provided indicator.
 *
 * @param \core_analytics\local\indicator\base $indicator
 * @return true|string[] True if all good, missing requirements list otherwise
 */
public function check_indicator_requirements(\core_analytics\local\indicator\base $indicator) {

    $provided = $this->provided_sample_data();
    $required = $indicator::required_sample_data();

    // Indicators that do not declare any required sample data are always satisfied.
    if (!empty($required)) {
        $missing = array_diff($required, $provided);
        if (!empty($missing)) {
            return $missing;
        }
    }

    return true;
}
239
/**
 * Processes an analysable.
 *
 * Runs every time splitting method on the analysable and returns the generated (or reused)
 * dataset files. When no file is available, or the analysable is not valid for the target,
 * the reason is added to the analysis log (see get_logs()) and an empty array is returned.
 *
 * @param \core_analytics\analysable $analysable
 * @param bool $includetarget
 * @return \stored_file[] Files indexed by time splitting method id
 */
public function process_analysable($analysable, $includetarget) {

    // Target instances scope is per-analysable (it can't be lower as calculations run once per
    // analysable, not time splitting method nor time range).
    $target = call_user_func(array($this->target, 'instance'));

    // We need to check that the analysable is valid for the target even if we don't include targets
    // as we still need to discard invalid analysables for the target.
    $result = $target->is_valid_analysable($analysable, $includetarget);
    if ($result !== true) {
        $a = new \stdClass();
        $a->analysableid = $analysable->get_id();
        $a->result = $result;
        $this->add_log(get_string('analysablenotvalidfortarget', 'analytics', $a));
        return array();
    }

    // Process all provided time splitting methods.
    $files = array();
    $results = array();
    foreach ($this->timesplittings as $timesplitting) {

        $timesplittingid = $timesplitting->get_id();

        // For evaluation purposes we don't need to be that strict about how updated the data is,
        // if this analysable was analysed less than 1 week ago we skip generating a new file. This
        // helps scale the evaluation process as sites with tons of courses may take a lot of time to
        // complete an evaluation.
        if (!empty($this->options['evaluation']) && !empty($this->options['reuseprevanalysed'])) {

            $previousanalysis = \core_analytics\dataset_manager::get_evaluation_analysable_file($this->modelid,
                $analysable->get_id(), $timesplittingid);
            // 1 week is a partly random time interval, no need to worry about DST.
            $boundary = time() - WEEKSECS;
            if ($previousanalysis && $previousanalysis->get_timecreated() > $boundary) {
                // Recover the previous analysed file and avoid generating a new one.

                // Don't bother filling a result object as it is only useful when there are no files generated.
                $files[$timesplittingid] = $previousanalysis;
                continue;
            }
        }

        if ($includetarget) {
            $result = $this->process_time_splitting($timesplitting, $analysable, $target);
        } else {
            $result = $this->process_time_splitting($timesplitting, $analysable);
        }

        if (!empty($result->file)) {
            $files[$timesplittingid] = $result->file;
        }

        // Results are indexed by time splitting id so the errors reported below identify the
        // time splitting method they belong to rather than a meaningless position in the list.
        $results[$timesplittingid] = $result;
    }

    if (empty($files)) {
        $errors = array();
        foreach ($results as $timesplittingid => $result) {
            $errors[] = $timesplittingid . ': ' . $result->message;
        }

        $a = new \stdClass();
        $a->analysableid = $analysable->get_id();
        $a->errors = implode(', ', $errors);
        $this->add_log(get_string('analysablenotused', 'analytics', $a));
    }

    return $files;
}
320
/**
 * Appends a new entry to the analysis log.
 *
 * @param string $string The log entry
 * @return void
 */
public function add_log($string) {
    $this->log[] = $string;
}
330
/**
 * Returns the entries added to the analysis log so far.
 *
 * @return string[]
 */
public function get_logs() {
    $entries = $this->log;
    return $entries;
}
339
413f19bc
DM
/**
 * Processes the analysable samples using the provided time splitting method.
 *
 * The returned object always contains 'status' and 'message'; 'file' is only set when a
 * dataset file has been successfully generated.
 *
 * @param \core_analytics\local\time_splitting\base $timesplitting
 * @param \core_analytics\analysable $analysable
 * @param \core_analytics\local\target\base|false $target False when getting data for predictions.
 * @return \stdClass Results object.
 */
protected function process_time_splitting($timesplitting, $analysable, $target = false) {

    $result = new \stdClass();

    if (!$timesplitting->is_valid_analysable($analysable)) {
        $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
        $result->message = get_string('invalidanalysablefortimesplitting', 'analytics',
            $timesplitting->get_name());
        return $result;
    }
    $timesplitting->set_analysable($analysable);

    if (CLI_SCRIPT && !PHPUNIT_TEST) {
        mtrace('Analysing id "' . $analysable->get_id() . '" with "' . $timesplitting->get_name() .
            '" time splitting method...');
    }

    // What is a sample is defined by the analyser, it can be an enrolment, a course, a user, a question
    // attempt... it is on what we will base indicators calculations.
    list($sampleids, $samplesdata) = $this->get_all_samples($analysable);

    if (count($sampleids) === 0) {
        $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
        $result->message = get_string('nodata', 'analytics');
        return $result;
    }

    if ($target) {
        // All ranges are used when we are calculating data for training.
        $ranges = $timesplitting->get_all_ranges();
    } else {
        // The latest range that has not yet been used for prediction (it depends on the time range where we are right now).
        $ranges = $this->get_most_recent_prediction_range($timesplitting);
    }

    // There is no need to keep track of the evaluated samples and ranges as we always evaluate the whole dataset.
    if ($this->options['evaluation'] === false) {

        if (empty($ranges)) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('noranges', 'analytics');
            return $result;
        }

        // We skip all samples that are already part of a training dataset, even if they have not been used for prediction.
        $this->filter_out_train_samples($sampleids, $timesplitting);

        if (count($sampleids) === 0) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('nonewdata', 'analytics');
            return $result;
        }

        // Only when processing data for predictions.
        if ($target === false) {
            // We also filter out samples and ranges that have already been used for predictions.
            $this->filter_out_prediction_samples_and_ranges($sampleids, $ranges, $timesplitting);
        }

        if (count($sampleids) === 0) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('nonewdata', 'analytics');
            return $result;
        }

        if (count($ranges) === 0) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('nonewranges', 'analytics');
            return $result;
        }
    }

    if (!empty($target)) {
        $filearea = \core_analytics\dataset_manager::LABELLED_FILEAREA;
    } else {
        $filearea = \core_analytics\dataset_manager::UNLABELLED_FILEAREA;
    }
    $dataset = new \core_analytics\dataset_manager($this->modelid, $analysable->get_id(), $timesplitting->get_id(),
        $filearea, $this->options['evaluation']);

    // Flag the model + analysable + timesplitting as being analysed (prevent concurrent executions).
    if (!$dataset->init_process()) {
        // If this model + analysable + timesplitting combination is being analysed we skip this process.
        $result->status = \core_analytics\model::NO_DATASET;
        $result->message = get_string('analysisinprogress', 'analytics');
        return $result;
    }

    // From this point on the process has to be closed no matter how we leave, otherwise an exception
    // thrown by the target, the indicators or the dataset storage would leave this
    // model + analysable + timesplitting combination flagged as being analysed.
    try {
        // Remove samples the target consider invalid. Note that we use $this->target, $target will be false
        // during prediction, but we still need to discard samples the target considers invalid.
        $this->target->add_sample_data($samplesdata);
        $this->target->filter_out_invalid_samples($sampleids, $analysable, $target);

        if (!$sampleids) {
            $result->status = \core_analytics\model::NO_DATASET;
            $result->message = get_string('novalidsamples', 'analytics');
            return $result;
        }

        foreach ($this->indicators as $key => $indicator) {
            // The analyser attaches the main entities the sample depends on and are provided to the
            // indicator to calculate the sample.
            $this->indicators[$key]->add_sample_data($samplesdata);
        }

        // Provide samples to the target instance (different than $this->target) $target is the new instance we get
        // for each analysis in progress.
        if ($target) {
            $target->add_sample_data($samplesdata);
        }

        // Here we start the memory intensive process that will last until $data var is
        // unset (until the method is finished basically).
        $data = $timesplitting->calculate($sampleids, $this->get_samples_origin(), $this->indicators, $ranges, $target);

        if (!$data) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('novaliddata', 'analytics');
            return $result;
        }

        // Write all calculated data to a file.
        $file = $dataset->store($data);
    } finally {
        // Flag the model + analysable + timesplitting as analysed.
        $dataset->close_process();
    }

    // No need to keep track of analysed stuff when evaluating.
    if ($this->options['evaluation'] === false) {
        // Save the samples that have been already analysed so they are not analysed again in future.

        if ($target) {
            $this->save_train_samples($sampleids, $timesplitting, $file);
        } else {
            $this->save_prediction_samples($sampleids, $ranges, $timesplitting);
        }
    }

    $result->status = \core_analytics\model::OK;
    $result->message = get_string('successfullyanalysed', 'analytics');
    $result->file = $file;
    return $result;
}
492
/**
 * Returns the most recent range that can be used to predict.
 *
 * @param \core_analytics\local\time_splitting\base $timesplitting
 * @return array Single element array (range index => range), empty if no range is ready to predict.
 */
protected function get_most_recent_prediction_range($timesplitting) {

    $ranges = $timesplitting->get_all_ranges();

    // Opposite order as we are interested in the last range that can be used for prediction.
    krsort($ranges);

    // We already provided the analysable to the time splitting method, there is no need to feed it back.
    foreach ($ranges as $rangeindex => $range) {
        if ($timesplitting->ready_to_predict($range)) {
            // We need to maintain the same indexes.
            return array($rangeindex => $range);
        }
    }

    return array();
}
517
413f19bc
DM
/**
 * Filters out samples that have already been used for training.
 *
 * $sampleids is received by reference and updated in place. Samples are matched by array key.
 *
 * @param int[] $sampleids
 * @param \core_analytics\local\time_splitting\base $timesplitting
 * @return void
 */
protected function filter_out_train_samples(&$sampleids, $timesplitting) {
    global $DB;

    $conditions = [
        'modelid' => $this->modelid,
        'analysableid' => $timesplitting->get_analysable()->get_id(),
        'timesplitting' => $timesplitting->get_id(),
    ];

    // There is one record for each training file that was generated.
    foreach ($DB->get_records('analytics_train_samples', $conditions) as $trainingrecord) {

        $alreadytrained = json_decode($trainingrecord->sampleids, true);
        if (empty($alreadytrained)) {
            continue;
        }

        // Keep only the samples that are not part of this training file.
        $sampleids = array_diff_key($sampleids, $alreadytrained);
    }
}
543
413f19bc
DM
/**
 * Filters out samples that have already been used for prediction.
 *
 * Both $sampleids and $ranges are received by reference. If all samples already got a prediction
 * for the range, the range is removed from $ranges and $sampleids is left untouched; otherwise
 * $sampleids is reduced to the samples that still have no prediction. Samples are matched by array key.
 *
 * @throws \coding_exception If more than one range is provided.
 * @param int[] $sampleids
 * @param array $ranges A single range, indexed by its range index.
 * @param \core_analytics\local\time_splitting\base $timesplitting
 * @return void
 */
protected function filter_out_prediction_samples_and_ranges(&$sampleids, &$ranges, $timesplitting) {
    global $DB;

    if (count($ranges) > 1) {
        throw new \coding_exception('$ranges argument should only contain one range');
    }

    if (count($ranges) === 0) {
        // No range to look for, avoid querying the database using a null range index.
        return;
    }

    $rangeindex = key($ranges);

    $params = array('modelid' => $this->modelid, 'analysableid' => $timesplitting->get_analysable()->get_id(),
        'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex);
    $predictedrange = $DB->get_record('analytics_predict_samples', $params);

    if (!$predictedrange) {
        // Nothing to filter out.
        return;
    }

    $predictedsamples = json_decode($predictedrange->sampleids, true);
    if (!is_array($predictedsamples)) {
        // json_decode returns null when the stored value can not be decoded; treat it as no samples
        // predicted instead of passing a non-array to array_diff_key.
        $predictedsamples = array();
    }

    $missingsamples = array_diff_key($sampleids, $predictedsamples);
    if (count($missingsamples) === 0) {
        // All samples already calculated.
        unset($ranges[$rangeindex]);
        return;
    }

    // Replace the list of samples by the one excluding samples that already got predictions at this range.
    $sampleids = $missingsamples;
}
580
413f19bc
DM
/**
 * Saves samples that have just been used for training.
 *
 * A new analytics_train_samples record is inserted, linked to the generated dataset file.
 *
 * @param int[] $sampleids
 * @param \core_analytics\local\time_splitting\base $timesplitting
 * @param \stored_file $file
 * @return void
 */
protected function save_train_samples($sampleids, $timesplitting, $file) {
    global $DB;

    $record = (object)[
        'modelid' => $this->modelid,
        'analysableid' => $timesplitting->get_analysable()->get_id(),
        'timesplitting' => $timesplitting->get_id(),
        'fileid' => $file->get_id(),
        'sampleids' => json_encode($sampleids),
        'timecreated' => time(),
    ];

    $DB->insert_record('analytics_train_samples', $record);
}
603
413f19bc
DM
/**
 * Saves samples that have just been used for prediction.
 *
 * If there is already a record for this model, analysable, time splitting method and range
 * the new samples are appended to it, otherwise a new record is inserted.
 *
 * @throws \coding_exception If more than one range is provided.
 * @param int[] $sampleids
 * @param array $ranges A single range, indexed by its range index.
 * @param \core_analytics\local\time_splitting\base $timesplitting
 * @return void
 */
protected function save_prediction_samples($sampleids, $ranges, $timesplitting) {
    global $DB;

    if (count($ranges) > 1) {
        throw new \coding_exception('$ranges argument should only contain one range');
    }

    $rangeindex = key($ranges);

    $params = array('modelid' => $this->modelid, 'analysableid' => $timesplitting->get_analysable()->get_id(),
        'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex);
    if ($predictionrange = $DB->get_record('analytics_predict_samples', $params)) {
        // Append the new samples used for prediction.
        $prevsamples = json_decode($predictionrange->sampleids, true);
        if (!is_array($prevsamples)) {
            // json_decode returns null when the stored value can not be decoded and the array union
            // below is a fatal error on a non-array operand.
            $prevsamples = array();
        }
        $predictionrange->sampleids = json_encode($prevsamples + $sampleids);
        $predictionrange->timemodified = time();
        $DB->update_record('analytics_predict_samples', $predictionrange);
    } else {
        $predictionrange = (object)$params;
        $predictionrange->sampleids = json_encode($sampleids);
        $predictionrange->timecreated = time();
        $predictionrange->timemodified = $predictionrange->timecreated;
        $DB->insert_record('analytics_predict_samples', $predictionrange);
    }
}
637}