MDL-62218 analytics: Privacy API implementation
[moodle.git] / analytics / classes / local / analyser / base.php
CommitLineData
369389c9
DM
1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
16
17/**
413f19bc 18 * Analysers base class.
369389c9
DM
19 *
20 * @package core_analytics
21 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23 */
24
25namespace core_analytics\local\analyser;
26
27defined('MOODLE_INTERNAL') || die();
28
/**
 * Analysers base class.
 *
 * An analyser defines which elements of the site can be analysed (analysables), which
 * samples each analysable contains and drives the generation of the dataset files
 * (one per time splitting method) that are later merged and returned to the caller.
 *
 * @package   core_analytics
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
abstract class base {

    /**
     * The id of the model this analyser works for.
     *
     * @var int
     */
    protected $modelid;

    /**
     * The model target.
     *
     * @var \core_analytics\local\target\base
     */
    protected $target;

    /**
     * A $this->$target copy loaded with the ongoing analysis analysable.
     *
     * @var \core_analytics\local\target\base
     */
    protected $analysabletarget;

    /**
     * The model indicators.
     *
     * @var \core_analytics\local\indicator\base[]
     */
    protected $indicators;

    /**
     * Time splitting methods to use.
     *
     * Multiple time splitting methods during evaluation and 1 single
     * time splitting method once the model is enabled.
     *
     * @var \core_analytics\local\time_splitting\base[]
     */
    protected $timesplittings;

    /**
     * Execution options.
     *
     * @var array
     */
    protected $options;

    /**
     * Simple log array.
     *
     * @var string[]
     */
    protected $log;

    /**
     * Constructor method.
     *
     * @throws \core_analytics\requirements_exception If an indicator requires sample data this analyser does not provide.
     * @param int $modelid
     * @param \core_analytics\local\target\base $target
     * @param \core_analytics\local\indicator\base[] $indicators
     * @param \core_analytics\local\time_splitting\base[] $timesplittings
     * @param array $options
     * @return void
     */
    public function __construct($modelid, \core_analytics\local\target\base $target, $indicators, $timesplittings, $options) {
        $this->modelid = $modelid;
        $this->target = $target;
        $this->indicators = $indicators;
        $this->timesplittings = $timesplittings;

        // Normalise a missing / empty 'evaluation' option to a strict false, other methods compare it using ===.
        if (empty($options['evaluation'])) {
            $options['evaluation'] = false;
        }
        $this->options = $options;

        // Checks if the analyser satisfies the indicators requirements.
        $this->check_indicators_requirements();

        $this->log = array();
    }

    /**
     * Returns the list of analysable elements available on the site.
     *
     * \core_analytics\local\analyser\by_course and \core_analytics\local\analyser\sitewide are implementing
     * this method returning site courses (by_course) and the whole system (sitewide) as analysables.
     *
     * @return \core_analytics\analysable[] Array of analysable elements using the analysable id as array key.
     */
    abstract public function get_analysables();

    /**
     * This function returns this analysable list of samples.
     *
     * @param \core_analytics\analysable $analysable
     * @return array array[0] = int[] (sampleids) and array[1] = array (samplesdata)
     */
    abstract protected function get_all_samples(\core_analytics\analysable $analysable);

    /**
     * This function returns the samples data from a list of sample ids.
     *
     * @param int[] $sampleids
     * @return array array[0] = int[] (sampleids) and array[1] = array (samplesdata)
     */
    abstract public function get_samples($sampleids);

    /**
     * Returns the analysable of a sample.
     *
     * @param int $sampleid
     * @return \core_analytics\analysable
     */
    abstract public function get_sample_analysable($sampleid);

    /**
     * Returns the sample's origin in moodle database.
     *
     * @return string
     */
    abstract public function get_samples_origin();

    /**
     * Returns the context of a sample.
     *
     * moodle/analytics:listinsights will be required at this level to access the sample predictions.
     *
     * @param int $sampleid
     * @return \context
     */
    abstract public function sample_access_context($sampleid);

    /**
     * Describes a sample with a description summary and a \renderable (an image for example)
     *
     * @param int $sampleid
     * @param int $contextid
     * @param array $sampledata
     * @return array array(string, \renderable)
     */
    abstract public function sample_description($sampleid, $contextid, $sampledata);

    /**
     * Main analyser method which processes the site analysables.
     *
     * Analysables are processed in the order returned by get_sorted_analysables(). Unless we are
     * evaluating the model, the loop stops once the 'modeltimelimit' analytics setting (seconds)
     * has been reached. The files generated for each time splitting method are merged into one.
     *
     * @param bool $includetarget True to generate labelled data (training), false for unlabelled data (prediction).
     * @return \stored_file[] Merged dataset files indexed by time splitting method id.
     */
    public function get_analysable_data($includetarget) {

        // Time limit control.
        $modeltimelimit = intval(get_config('analytics', 'modeltimelimit'));

        $filesbytimesplitting = array();

        // Only filled when not evaluating, initialised here so it is always defined.
        $pendingfiles = array();

        list($analysables, $processedanalysables) = $this->get_sorted_analysables($includetarget);

        $inittime = time();
        foreach ($analysables as $key => $analysable) {

            $files = $this->process_analysable($analysable, $includetarget);

            // Later we will need to aggregate data by time splitting method.
            foreach ($files as $timesplittingid => $file) {
                $filesbytimesplitting[$timesplittingid][] = $file;
            }

            $this->update_analysable_analysed_time($processedanalysables, $analysable->get_id(), $includetarget);

            // Apply time limit.
            if (!$this->options['evaluation']) {
                $timespent = time() - $inittime;
                if ($modeltimelimit <= $timespent) {
                    break;
                }
            }

            // The analysable has been processed, we don't need to keep it around.
            unset($analysables[$key]);
        }

        if ($this->options['evaluation'] === false) {
            // Look for previous training and prediction files we generated and couldn't be used
            // by machine learning backends because they weren't big enough.

            $pendingfiles = \core_analytics\dataset_manager::get_pending_files($this->modelid, $includetarget,
                array_keys($filesbytimesplitting));
            foreach ($pendingfiles as $timesplittingid => $files) {
                foreach ($files as $file) {
                    $filesbytimesplitting[$timesplittingid][] = $file;
                }
            }
        }

        // We join the datasets by time splitting method.
        $timesplittingfiles = $this->merge_analysable_files($filesbytimesplitting, $includetarget);

        if (!empty($pendingfiles)) {
            // We must remove them now as they are already part of another dataset.
            foreach ($pendingfiles as $timesplittingid => $files) {
                foreach ($files as $file) {
                    $file->delete();
                }
            }
        }

        return $timesplittingfiles;
    }

    /**
     * Samples data this analyser provides.
     *
     * By default only the samples origin is provided.
     *
     * @return string[]
     */
    protected function provided_sample_data() {
        return array($this->get_samples_origin());
    }

    /**
     * Returns labelled data (training and evaluation).
     *
     * @return \stored_file[] Dataset files indexed by time splitting method id.
     */
    public function get_labelled_data() {
        return $this->get_analysable_data(true);
    }

    /**
     * Returns unlabelled data (prediction).
     *
     * @return \stored_file[] Dataset files indexed by time splitting method id.
     */
    public function get_unlabelled_data() {
        return $this->get_analysable_data(false);
    }

    /**
     * Checks if the analyser satisfies all the model indicators requirements.
     *
     * @throws \core_analytics\requirements_exception
     * @return void
     */
    protected function check_indicators_requirements() {

        foreach ($this->indicators as $indicator) {
            $missingrequired = $this->check_indicator_requirements($indicator);
            if ($missingrequired !== true) {
                throw new \core_analytics\requirements_exception(get_class($indicator) . ' indicator requires ' .
                    json_encode($missingrequired) . ' sample data which is not provided by ' . get_class($this));
            }
        }
    }

    /**
     * Merges analysable dataset files into 1.
     *
     * @param array $filesbytimesplitting Lists of \stored_file indexed by time splitting method id.
     * @param bool $includetarget
     * @return \stored_file[] One merged file per time splitting method id.
     */
    protected function merge_analysable_files($filesbytimesplitting, $includetarget) {

        $timesplittingfiles = array();
        foreach ($filesbytimesplitting as $timesplittingid => $files) {

            if ($this->options['evaluation'] === true) {
                // Delete the previous copy. Only when evaluating.
                \core_analytics\dataset_manager::delete_previous_evaluation_file($this->modelid, $timesplittingid);
            }

            // Merge all course files into one.
            if ($includetarget) {
                $filearea = \core_analytics\dataset_manager::LABELLED_FILEAREA;
            } else {
                $filearea = \core_analytics\dataset_manager::UNLABELLED_FILEAREA;
            }
            $timesplittingfiles[$timesplittingid] = \core_analytics\dataset_manager::merge_datasets($files,
                $this->modelid, $timesplittingid, $filearea, $this->options['evaluation']);
        }

        return $timesplittingfiles;
    }

    /**
     * Checks that this analyser satisfies the provided indicator requirements.
     *
     * @param \core_analytics\local\indicator\base $indicator
     * @return true|string[] True if all good, missing requirements list otherwise
     */
    public function check_indicator_requirements(\core_analytics\local\indicator\base $indicator) {

        $providedsampledata = $this->provided_sample_data();

        $requiredsampledata = $indicator::required_sample_data();
        if (empty($requiredsampledata)) {
            // The indicator does not need any sample data.
            return true;
        }
        $missingrequired = array_diff($requiredsampledata, $providedsampledata);

        if (empty($missingrequired)) {
            return true;
        }

        return $missingrequired;
    }

    /**
     * Processes an analysable
     *
     * This method returns an array of files by time splitting method. If the analysable is not valid
     * for the target, or no file could be generated for any of the time splitting methods, the reasons
     * are added to the analysis log (see get_logs()) and an empty array is returned.
     *
     * @param \core_analytics\analysable $analysable
     * @param bool $includetarget
     * @return \stored_file[] Files by time splitting method
     */
    public function process_analysable($analysable, $includetarget) {

        // Default returns.
        $files = array();

        // Target instances scope is per-analysable (it can't be lower as calculations run once per
        // analysable, not time splitting method nor time range).
        $this->analysabletarget = call_user_func(array($this->target, 'instance'));

        // We need to check that the analysable is valid for the target even if we don't include targets
        // as we still need to discard invalid analysables for the target.
        $result = $this->analysabletarget->is_valid_analysable($analysable, $includetarget);
        if ($result !== true) {
            $a = new \stdClass();
            $a->analysableid = $analysable->get_name();
            $a->result = $result;
            $this->add_log(get_string('analysablenotvalidfortarget', 'analytics', $a));
            return array();
        }

        // Process all provided time splitting methods.
        $results = array();
        foreach ($this->timesplittings as $timesplitting) {

            // For evaluation purposes we don't need to be that strict about how updated the data is,
            // if this analyser was analysed less than 1 week ago we skip generating a new one. This
            // helps scale the evaluation process as sites with tons of courses may take a lot of time to
            // complete an evaluation.
            if (!empty($this->options['evaluation']) && !empty($this->options['reuseprevanalysed'])) {

                $previousanalysis = \core_analytics\dataset_manager::get_evaluation_analysable_file($this->modelid,
                    $analysable->get_id(), $timesplitting->get_id());
                // 1 week is a partly random time interval, no need to worry about DST.
                $boundary = time() - WEEKSECS;
                if ($previousanalysis && $previousanalysis->get_timecreated() > $boundary) {
                    // Recover the previous analysed file and avoid generating a new one.

                    // Don't bother filling a result object as it is only useful when there are no files generated.
                    $files[$timesplitting->get_id()] = $previousanalysis;
                    continue;
                }
            }

            $result = $this->process_time_splitting($timesplitting, $analysable, $includetarget);

            if (!empty($result->file)) {
                $files[$timesplitting->get_id()] = $result->file;
            }

            // Index the results by time splitting method id so the error report below can
            // tell which time splitting method each message belongs to.
            $results[$timesplitting->get_id()] = $result;
        }

        if (empty($files)) {
            $errors = array();
            foreach ($results as $timesplittingid => $result) {
                $errors[] = $timesplittingid . ': ' . $result->message;
            }

            $a = new \stdClass();
            $a->analysableid = $analysable->get_name();
            $a->errors = implode(', ', $errors);
            $this->add_log(get_string('analysablenotused', 'analytics', $a));
        }

        return $files;
    }

    /**
     * Adds a register to the analysis log.
     *
     * @param string $string
     * @return void
     */
    public function add_log($string) {
        $this->log[] = $string;
    }

    /**
     * Returns the analysis logs.
     *
     * @return string[]
     */
    public function get_logs() {
        return $this->log;
    }

    /**
     * Whether the plugin needs user data clearing or not.
     *
     * This is related to privacy. Override this method if your analyser samples have any relation
     * to the 'user' database entity. We need to clean the site from all user-related data if a user
     * request their data to be deleted from the system. A static::provided_sample_data returning 'user'
     * is an indicator that you should be returning true.
     *
     * @return bool
     */
    public function processes_user_data() {
        return false;
    }

    /**
     * SQL JOIN from a sample to users table.
     *
     * This function should be defined if static::processes_user_data returns true and it is related to analytics API
     * privacy API implementation. It allows the analytics API to identify data associated to users that needs to be
     * deleted or exported.
     *
     * This function receives the alias of a table with a 'sampleid' field and it should return a SQL join
     * with static::get_samples_origin and with 'user' table. Note that:
     * - The function caller expects the returned 'user' table to be aliased as 'u' (defacto standard in moodle).
     * - You can join with other tables if your samples origin table does not contain a 'userid' field (if that would be
     *   a requirement this solution would be automated for you) you can't though use the following
     *   aliases: 'ap', 'apa', 'aic' and 'am'.
     *
     * Some examples (the sample id is the id of the record in the samples origin table):
     *
     * static::get_samples_origin() === 'user':
     *   JOIN {user} u ON {$sampletablealias}.sampleid = u.id
     *
     * static::get_samples_origin() === 'role_assignments':
     *   JOIN {role_assignments} ra ON {$sampletablealias}.sampleid = ra.id JOIN {user} u ON u.id = ra.userid
     *
     * static::get_samples_origin() === 'user_enrolments':
     *   JOIN {user_enrolments} ue ON {$sampletablealias}.sampleid = ue.id JOIN {user} u ON u.id = ue.userid
     *
     * @throws \coding_exception
     * @param string $sampletablealias The alias of the table with a sampleid field that will join with this SQL string
     * @return string
     */
    public function join_sample_user($sampletablealias) {
        throw new \coding_exception('This method should be implemented if static::processes_user_data returns true.');
    }

    /**
     * Processes the analysable samples using the provided time splitting method.
     *
     * The returned object always contains 'status' and 'message' attributes, 'file' is only
     * set when a dataset file has been successfully generated.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param \core_analytics\analysable $analysable
     * @param bool $includetarget
     * @return \stdClass Results object.
     */
    protected function process_time_splitting($timesplitting, $analysable, $includetarget = false) {

        $result = new \stdClass();

        if (!$timesplitting->is_valid_analysable($analysable)) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('invalidanalysablefortimesplitting', 'analytics',
                $timesplitting->get_name());
            return $result;
        }
        $timesplitting->set_analysable($analysable);

        if (CLI_SCRIPT && !PHPUNIT_TEST) {
            mtrace('Analysing id "' . $analysable->get_id() . '" with "' . $timesplitting->get_name() .
                '" time splitting method...');
        }

        // What is a sample is defined by the analyser, it can be an enrolment, a course, a user, a question
        // attempt... it is on what we will base indicators calculations.
        list($sampleids, $samplesdata) = $this->get_all_samples($analysable);

        if (count($sampleids) === 0) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('nodata', 'analytics');
            return $result;
        }

        if ($includetarget) {
            // All ranges are used when we are calculating data for training.
            $ranges = $timesplitting->get_all_ranges();
        } else {
            // The latest range that has not yet been used for prediction (it depends on the time range where we are right now).
            $ranges = $this->get_most_recent_prediction_range($timesplitting);
        }

        // There is no need to keep track of the evaluated samples and ranges as we always evaluate the whole dataset.
        if ($this->options['evaluation'] === false) {

            if (empty($ranges)) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('noranges', 'analytics');
                return $result;
            }

            // We skip all samples that are already part of a training dataset, even if they have not been used for prediction.
            $this->filter_out_train_samples($sampleids, $timesplitting);

            if (count($sampleids) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewdata', 'analytics');
                return $result;
            }

            // Only when processing data for predictions.
            if (!$includetarget) {
                // We also filter out samples and ranges that have already been used for predictions.
                $this->filter_out_prediction_samples_and_ranges($sampleids, $ranges, $timesplitting);
            }

            if (count($sampleids) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewdata', 'analytics');
                return $result;
            }

            if (count($ranges) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewranges', 'analytics');
                return $result;
            }
        }

        if (!empty($includetarget)) {
            $filearea = \core_analytics\dataset_manager::LABELLED_FILEAREA;
        } else {
            $filearea = \core_analytics\dataset_manager::UNLABELLED_FILEAREA;
        }
        $dataset = new \core_analytics\dataset_manager($this->modelid, $analysable->get_id(), $timesplitting->get_id(),
            $filearea, $this->options['evaluation']);

        // Flag the model + analysable + timesplitting as being analysed (prevent concurrent executions).
        if (!$dataset->init_process()) {
            // If this model + analysable + timesplitting combination is being analysed we skip this process.
            $result->status = \core_analytics\model::NO_DATASET;
            $result->message = get_string('analysisinprogress', 'analytics');
            return $result;
        }

        // Remove samples the target consider invalid.
        $this->analysabletarget->add_sample_data($samplesdata);
        $this->analysabletarget->filter_out_invalid_samples($sampleids, $analysable, $includetarget);

        if (!$sampleids) {
            $result->status = \core_analytics\model::NO_DATASET;
            $result->message = get_string('novalidsamples', 'analytics');
            $dataset->close_process();
            return $result;
        }

        foreach ($this->indicators as $key => $indicator) {
            // The analyser attaches the main entities the sample depends on and are provided to the
            // indicator to calculate the sample.
            $this->indicators[$key]->add_sample_data($samplesdata);
        }

        // Here we start the memory intensive process that will last until $data var is
        // unset (until the method is finished basically).
        if ($includetarget) {
            $data = $timesplitting->calculate($sampleids, $this->get_samples_origin(), $this->indicators, $ranges,
                $this->analysabletarget);
        } else {
            $data = $timesplitting->calculate($sampleids, $this->get_samples_origin(), $this->indicators, $ranges);
        }

        if (!$data) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('novaliddata', 'analytics');
            $dataset->close_process();
            return $result;
        }

        // Add extra metadata.
        $this->add_model_metadata($data);

        // Write all calculated data to a file.
        $file = $dataset->store($data);

        // Flag the model + analysable + timesplitting as analysed.
        $dataset->close_process();

        // No need to keep track of analysed stuff when evaluating.
        if ($this->options['evaluation'] === false) {
            // Save the samples that have been already analysed so they are not analysed again in future.

            if ($includetarget) {
                $this->save_train_samples($sampleids, $timesplitting, $file);
            } else {
                $this->save_prediction_samples($sampleids, $ranges, $timesplitting);
            }
        }

        $result->status = \core_analytics\model::OK;
        $result->message = get_string('successfullyanalysed', 'analytics');
        $result->file = $file;
        return $result;
    }

    /**
     * Returns the most recent range that can be used to predict.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return array An array with 1 single range (keeping its original index) or an empty array if no range is ready.
     */
    protected function get_most_recent_prediction_range($timesplitting) {

        $ranges = $timesplitting->get_all_ranges();

        // Opposite order as we are interested in the last range that can be used for prediction.
        krsort($ranges);

        // We already provided the analysable to the time splitting method, there is no need to feed it back.
        foreach ($ranges as $rangeindex => $range) {
            if ($timesplitting->ready_to_predict($range)) {
                // We need to maintain the same indexes.
                return array($rangeindex => $range);
            }
        }

        return array();
    }

    /**
     * Filters out samples that have already been used for training.
     *
     * @param int[] $sampleids Passed by reference, samples that were already used for training are removed from it.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return void
     */
    protected function filter_out_train_samples(&$sampleids, $timesplitting) {
        global $DB;

        $params = array('modelid' => $this->modelid, 'analysableid' => $timesplitting->get_analysable()->get_id(),
            'timesplitting' => $timesplitting->get_id());

        $trainingsamples = $DB->get_records('analytics_train_samples', $params);

        // Skip each file trained samples.
        foreach ($trainingsamples as $trainingfile) {

            $usedsamples = json_decode($trainingfile->sampleids, true);

            // Records whose sampleids can not be decoded to a list of samples are ignored.
            if (!empty($usedsamples) && is_array($usedsamples)) {
                // Reset $sampleids to $sampleids minus this file's $usedsamples.
                $sampleids = array_diff_key($sampleids, $usedsamples);
            }
        }
    }

    /**
     * Filters out samples that have already been used for prediction.
     *
     * If all the samples already got predictions for the provided range, the range is removed from $ranges.
     *
     * @throws \coding_exception If more than one range is provided.
     * @param int[] $sampleids Passed by reference, already predicted samples are removed from it.
     * @param array $ranges Passed by reference, it should only contain 1 range.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return void
     */
    protected function filter_out_prediction_samples_and_ranges(&$sampleids, &$ranges, $timesplitting) {
        global $DB;

        if (count($ranges) > 1) {
            throw new \coding_exception('$ranges argument should only contain one range');
        }

        $rangeindex = key($ranges);

        $params = array('modelid' => $this->modelid, 'analysableid' => $timesplitting->get_analysable()->get_id(),
            'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex);
        $predictedrange = $DB->get_record('analytics_predict_samples', $params);

        if (!$predictedrange) {
            // Nothing to filter out.
            return;
        }

        $predictedsamples = json_decode($predictedrange->sampleids, true);
        if (!is_array($predictedsamples)) {
            // The stored value could not be decoded to a list of samples, consider that no samples were predicted
            // instead of passing an invalid value to array_diff_key().
            $predictedsamples = array();
        }

        $missingsamples = array_diff_key($sampleids, $predictedsamples);
        if (count($missingsamples) === 0) {
            // All samples already calculated.
            unset($ranges[$rangeindex]);
            return;
        }

        // Replace the list of samples by the one excluding samples that already got predictions at this range.
        $sampleids = $missingsamples;
    }

    /**
     * Saves samples that have just been used for training.
     *
     * @param int[] $sampleids
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param \stored_file $file
     * @return void
     */
    protected function save_train_samples($sampleids, $timesplitting, $file) {
        global $DB;

        $trainingsamples = new \stdClass();
        $trainingsamples->modelid = $this->modelid;
        $trainingsamples->analysableid = $timesplitting->get_analysable()->get_id();
        $trainingsamples->timesplitting = $timesplitting->get_id();
        $trainingsamples->fileid = $file->get_id();

        $trainingsamples->sampleids = json_encode($sampleids);
        $trainingsamples->timecreated = time();

        $DB->insert_record('analytics_train_samples', $trainingsamples);
    }

    /**
     * Saves samples that have just been used for prediction.
     *
     * @throws \coding_exception If more than one range is provided.
     * @param int[] $sampleids
     * @param array $ranges It should only contain 1 range.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return void
     */
    protected function save_prediction_samples($sampleids, $ranges, $timesplitting) {
        global $DB;

        if (count($ranges) > 1) {
            throw new \coding_exception('$ranges argument should only contain one range');
        }

        $rangeindex = key($ranges);

        $params = array('modelid' => $this->modelid, 'analysableid' => $timesplitting->get_analysable()->get_id(),
            'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex);
        if ($predictionrange = $DB->get_record('analytics_predict_samples', $params)) {
            // Append the new samples used for prediction.
            $prevsamples = json_decode($predictionrange->sampleids, true);
            if (!is_array($prevsamples)) {
                // The stored value could not be decoded, the array union below requires an array.
                $prevsamples = array();
            }
            $predictionrange->sampleids = json_encode($prevsamples + $sampleids);
            $predictionrange->timemodified = time();
            $DB->update_record('analytics_predict_samples', $predictionrange);
        } else {
            $predictionrange = (object)$params;
            $predictionrange->sampleids = json_encode($sampleids);
            $predictionrange->timecreated = time();
            $predictionrange->timemodified = $predictionrange->timecreated;
            $DB->insert_record('analytics_predict_samples', $predictionrange);
        }
    }

    /**
     * Adds target metadata to the dataset.
     *
     * The metadata variable names are appended to the first dataset row and their values to the second one.
     *
     * @param array $data Passed by reference.
     * @return void
     */
    protected function add_model_metadata(&$data) {
        global $CFG;

        $metadata = array(
            'moodleversion' => $CFG->version,
            'targetcolumn' => $this->analysabletarget->get_id()
        );
        if ($this->analysabletarget->is_linear()) {
            $metadata['targettype'] = 'linear';
            $metadata['targetmin'] = $this->analysabletarget::get_min_value();
            $metadata['targetmax'] = $this->analysabletarget::get_max_value();
        } else {
            $metadata['targettype'] = 'discrete';
            $metadata['targetclasses'] = json_encode($this->analysabletarget::get_classes());
        }

        foreach ($metadata as $varname => $value) {
            $data[0][] = $varname;
            $data[1][] = $value;
        }
    }

    /**
     * Returns the list of analysables sorted in processing priority order.
     *
     * It will first return analysables that have never been analysed before
     * and it will continue with the ones we have already seen by timeanalysed DESC
     * order.
     *
     * @param bool $includetarget
     * @return array(0 => \core_analytics\analysable[], 1 => \stdClass[])
     */
    protected function get_sorted_analysables($includetarget) {

        $analysables = $this->get_analysables();

        // Get the list of analysables that have been already processed.
        $processedanalysables = $this->get_processed_analysables($includetarget);

        // We want to start processing analysables we have not yet processed and later continue
        // with analysables that we already processed.
        $unseen = array_diff_key($analysables, $processedanalysables);

        // Var $processed first as we want to respect its timeanalysed DESC order so analysables that
        // have recently been processed are on the bottom of the stack.
        // NOTE(review): with a 'timeanalysed DESC' sorting the most recently analysed records come first
        // in $seen, which does not seem to match the sentence above - confirm the intended order.
        $seen = array_intersect_key($processedanalysables, $analysables);
        array_walk($seen, function(&$value, $analysableid) use ($analysables) {
            // We replace the analytics_used_analysables record by the analysable object.
            $value = $analysables[$analysableid];
        });

        return array($unseen + $seen, $processedanalysables);
    }

    /**
     * Get analysables that have been already processed.
     *
     * @param bool $includetarget
     * @return \stdClass[] analytics_used_analysables records indexed by analysable id, 'primarykey' contains the record id.
     */
    protected function get_processed_analysables($includetarget) {
        global $DB;

        $params = array('modelid' => $this->modelid);
        $params['action'] = ($includetarget) ? 'training' : 'prediction';
        $select = 'modelid = :modelid and action = :action';

        // Weird select fields ordering for performance (analysableid key matching, analysableid is also unique by modelid).
        return $DB->get_records_select('analytics_used_analysables', $select,
            $params, 'timeanalysed DESC', 'analysableid, modelid, action, timeanalysed, id AS primarykey');
    }

    /**
     * Updates the analysable analysis time.
     *
     * The analytics_used_analysables record is updated if the analysable was already processed,
     * otherwise a new record is inserted.
     *
     * @param array $processedanalysables Records returned by get_processed_analysables(), they are not modified.
     * @param int $analysableid
     * @param bool $includetarget
     * @return void
     */
    protected function update_analysable_analysed_time($processedanalysables, $analysableid, $includetarget) {
        global $DB;

        if (!empty($processedanalysables[$analysableid])) {
            // Work on a copy: objects are passed by handle, so changing the record in place would strip
            // 'primarykey' from the caller's list and break any later update of the same analysable.
            $obj = clone $processedanalysables[$analysableid];

            $obj->id = $obj->primarykey;
            unset($obj->primarykey);

            $obj->timeanalysed = time();
            $DB->update_record('analytics_used_analysables', $obj);

        } else {

            $obj = new \stdClass();
            $obj->modelid = $this->modelid;
            $obj->action = ($includetarget) ? 'training' : 'prediction';
            $obj->analysableid = $analysableid;
            $obj->timeanalysed = time();

            $DB->insert_record('analytics_used_analysables', $obj);
        }
    }
}