MDL-57791 analytics: Changes after review
[moodle.git] / analytics / classes / local / analyser / base.php
CommitLineData
369389c9
DM
1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
16
17/**
18 *
19 * @package core_analytics
20 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
21 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
22 */
23
24namespace core_analytics\local\analyser;
25
26defined('MOODLE_INTERNAL') || die();
27
28/**
29 *
30 * @package core_analytics
31 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
32 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
33 */
34abstract class base {
35
36 protected $modelid;
37
38 protected $target;
39 protected $indicators;
40 protected $timesplittings;
41
42 protected $options;
43
44 protected $log;
45
/**
 * Analyser constructor.
 *
 * Stores the model elements this analyser works with, normalises the 'evaluation' option and
 * checks that this analyser provides the sample data required by all the indicators.
 *
 * @throws \core_analytics\requirements_exception If an indicator requires sample data this analyser does not provide
 * @param int $modelid Model id (presumably an integer id; confirm against callers)
 * @param \core_analytics\local\target\base $target
 * @param \core_analytics\local\indicator\base[] $indicators
 * @param array $timesplittings Time splitting method instances
 * @param array $options Analysis options, 'evaluation' and 'reuseprevanalysed' are read by this class
 */
public function __construct($modelid, \core_analytics\local\target\base $target, $indicators, $timesplittings, $options) {

    // A missing or falsy 'evaluation' option is stored as a strict boolean false, other methods
    // compare it using ===.
    if (empty($options['evaluation'])) {
        $options['evaluation'] = false;
    }

    $this->modelid = $modelid;
    $this->target = $target;
    $this->indicators = $indicators;
    $this->timesplittings = $timesplittings;
    $this->options = $options;

    // Fail early if any of the indicators needs sample data this analyser can not provide.
    $this->check_indicators_requirements();

    // The log starts empty, entries are appended through add_log().
    $this->log = array();
}
62
63 /**
64 * This function returns the list of samples that can be calculated.
65 *
66 * @param \core_analytics\analysable $analysable
a40952d3 67 * @return array array[0] = int[] (sampleids) and array[1] = array (samplesdata)
369389c9
DM
68 */
69 abstract protected function get_all_samples(\core_analytics\analysable $analysable);
70
a40952d3
DM
71 /**
72 * get_samples
73 *
74 * @param int[] $sampleids
75 * @return array array[0] = int[] (sampleids) and array[1] = array (samplesdata)
76 */
369389c9
DM
77 abstract public function get_samples($sampleids);
78
a40952d3
DM
79 /**
80 * get_sample_analysable
81 *
82 * @param int $sampleid
83 * @return \core_analytics\analysable
84 */
85 abstract public function get_sample_analysable($sampleid);
86
87 /**
88 * get_samples_origin
89 *
90 * @return string
91 */
369389c9
DM
92 abstract protected function get_samples_origin();
93
94 /**
95 * moodle/analytics:listinsights will be required at this level to access the sample predictions.
96 *
97 * @param int $sampleid
98 * @return \context
99 */
100 abstract public function sample_access_context($sampleid);
101
a40952d3
DM
102 /**
103 * sample_description
104 *
105 * @param int $sampleid
106 * @param int $contextid
107 * @param array $sampledata
108 * @return array array(string, \renderable)
109 */
369389c9
DM
110 abstract public function sample_description($sampleid, $contextid, $sampledata);
111
/**
 * Returns the list of sample data elements this analyser provides to the indicators.
 *
 * The base implementation only provides the samples origin.
 *
 * @return string[]
 */
protected function provided_sample_data() {
    $origin = $this->get_samples_origin();
    return array($origin);
}
115
/**
 * Main analyser method which processes the site analysables.
 *
 * \core_analytics\local\analyser\by_course and \core_analytics\local\analyser\sitewide implement
 * this method returning site courses (by_course) and the whole system (sitewide) as analysables.
 * In most cases extending one of these classes should be enough, so you do not need
 * to reimplement this method.
 *
 * @param bool $includetarget True to get labelled data (target included), false to get unlabelled data
 * @return \stored_file[]
 */
126 abstract public function get_analysable_data($includetarget);
127
/**
 * Returns the analysable data including the target calculations (labelled data).
 *
 * @return \stored_file[]
 */
public function get_labelled_data() {
    $includetarget = true;
    return $this->get_analysable_data($includetarget);
}
131
/**
 * Returns the analysable data without the target calculations (unlabelled data).
 *
 * @return \stored_file[]
 */
public function get_unlabelled_data() {
    $includetarget = false;
    return $this->get_analysable_data($includetarget);
}
135
136 /**
137 * Checks if the analyser satisfies all the model indicators requirements.
138 *
139 * @throws \core_analytics\requirements_exception
140 * @return void
141 */
protected function check_indicators_requirements() {

    foreach ($this->indicators as $indicator) {
        $missing = $this->check_indicator_requirements($indicator);
        if ($missing === true) {
            // This analyser provides everything the indicator needs.
            continue;
        }

        // Anything other than true is the list of sample data elements that are not provided.
        $message = get_class($indicator) . ' indicator requires ' . json_encode($missing) .
            ' sample data which is not provided by ' . get_class($this);
        throw new \core_analytics\requirements_exception($message);
    }
}
152
153 /**
154 * check_indicator_requirements
155 *
156 * @param \core_analytics\local\indicator\base $indicator
157 * @return true|string[] True if all good, missing requirements list otherwise
158 */
public function check_indicator_requirements(\core_analytics\local\indicator\base $indicator) {

    $provided = $this->provided_sample_data();
    $required = $indicator::required_sample_data();

    if (empty($required)) {
        // Nothing to check, the indicator does not depend on any sample data.
        return true;
    }

    // Elements the indicator requires which are not part of what this analyser provides.
    $missing = array_diff($required, $provided);

    return empty($missing) ? true : $missing;
}
176
/**
 * Processes an analysable.
 *
 * This method returns an array of files indexed by time splitting method id. When the analysable
 * is not valid for the target, or no file could be generated, the reason is recorded through
 * add_log() and can be retrieved using get_logs().
 *
 * @param \core_analytics\analysable $analysable
 * @param bool $includetarget
 * @return \stored_file[] Files by time splitting method
 */
public function process_analysable($analysable, $includetarget) {

    // Files generated (or reused from a previous evaluation) indexed by time splitting method id.
    $files = array();

    // Target instances scope is per-analysable (it can't be lower as calculations run once per
    // analysable, not time splitting method nor time range).
    $target = call_user_func(array($this->target, 'instance'));

    // We need to check that the analysable is valid for the target even if we don't include targets
    // as we still need to discard invalid analysables for the target.
    $result = $target->is_valid_analysable($analysable, $includetarget, true);
    if ($result !== true) {
        // Any value other than true is passed to the log message as the reason.
        $a = new \stdClass();
        $a->analysableid = $analysable->get_id();
        $a->result = $result;
        $this->add_log(get_string('analysablenotvalidfortarget', 'analytics', $a));
        return array();
    }

    // Process all provided time splitting methods. Results are indexed by time splitting method id
    // so that errors can be attributed to the time splitting method that produced them.
    $results = array();
    foreach ($this->timesplittings as $timesplitting) {

        // For evaluation purposes we don't need to be that strict about how updated the data is,
        // if this analyser was analysed less than 1 week ago we skip generating a new one. This
        // helps scale the evaluation process as sites with tons of courses may take a lot of time to
        // complete an evaluation.
        if (!empty($this->options['evaluation']) && !empty($this->options['reuseprevanalysed'])) {

            $previousanalysis = \core_analytics\dataset_manager::get_evaluation_analysable_file($this->modelid,
                $analysable->get_id(), $timesplitting->get_id());
            // 1 week is a partly random time interval, no need to worry about DST.
            $boundary = time() - WEEKSECS;
            if ($previousanalysis && $previousanalysis->get_timecreated() > $boundary) {
                // Recover the previous analysed file and avoid generating a new one.

                // Don't bother filling a result object as it is only useful when there are no files generated.
                $files[$timesplitting->get_id()] = $previousanalysis;
                continue;
            }
        }

        // The target instance is only provided when labelled data is requested.
        if ($includetarget) {
            $result = $this->process_time_splitting($timesplitting, $analysable, $target);
        } else {
            $result = $this->process_time_splitting($timesplitting, $analysable);
        }

        if (!empty($result->file)) {
            $files[$timesplitting->get_id()] = $result->file;
        }
        $results[$timesplitting->get_id()] = $result;
    }

    if (empty($files)) {
        // No time splitting method generated a file, log why each of them was discarded.
        $errors = array();
        foreach ($results as $timesplittingid => $result) {
            $errors[] = $timesplittingid . ': ' . $result->message;
        }

        $a = new \stdClass();
        $a->analysableid = $analysable->get_id();
        $a->errors = implode(', ', $errors);
        $this->add_log(get_string('analysablenotused', 'analytics', $a));
    }

    return $files;
}
257
a40952d3
DM
258 /**
259 * add_log
260 *
261 * @param string $string
262 * @return void
263 */
public function add_log($string) {
    // Entries are kept in insertion order, get_logs() returns them.
    array_push($this->log, $string);
}
267
268 /**
269 * get_logs
270 *
271 * @return string[]
272 */
369389c9
DM
public function get_logs() {
    // Everything that was added through add_log() so far.
    $entries = $this->log;
    return $entries;
}
276
/**
 * Processes an analysable using the provided time splitting method.
 *
 * Calculates the indicators (and the target if one is provided) for the analysable samples and
 * stores the calculated data in a file.
 *
 * @param object $timesplitting Time splitting method instance
 * @param \core_analytics\analysable $analysable
 * @param \core_analytics\local\target\base|false $target Target instance for labelled data, false for prediction data
 * @return \stdClass Object with 'status' and 'message' attributes, and 'file' if a dataset was generated
 */
protected function process_time_splitting($timesplitting, $analysable, $target = false) {

    $result = new \stdClass();

    if (!$timesplitting->is_valid_analysable($analysable)) {
        $result->status = \core_analytics\model::ANALYSE_REJECTED_RANGE_PROCESSOR;
        $result->message = get_string('invalidanalysablefortimesplitting', 'analytics',
            $timesplitting->get_name());
        return $result;
    }
    $timesplitting->set_analysable($analysable);

    if (CLI_SCRIPT && !PHPUNIT_TEST) {
        mtrace('Analysing id "' . $analysable->get_id() . '" with "' . $timesplitting->get_name() . '" time splitting method...');
    }

    // What is a sample is defined by the analyser, it can be an enrolment, a course, a user, a question
    // attempt... it is on what we will base indicators calculations.
    list($sampleids, $samplesdata) = $this->get_all_samples($analysable);

    if (count($sampleids) === 0) {
        $result->status = \core_analytics\model::ANALYSE_REJECTED_RANGE_PROCESSOR;
        $result->message = get_string('nodata', 'analytics');
        return $result;
    }

    if ($target) {
        // All ranges are used when we are calculating data for training.
        $ranges = $timesplitting->get_all_ranges();
    } else {
        // Only some ranges can be used for prediction (it depends on the time range where we are right now).
        $ranges = $this->get_prediction_ranges($timesplitting);
    }

    // There is no need to keep track of the evaluated samples and ranges as we always evaluate the whole dataset.
    if ($this->options['evaluation'] === false) {

        if (empty($ranges)) {
            $result->status = \core_analytics\model::ANALYSE_REJECTED_RANGE_PROCESSOR;
            $result->message = get_string('nonewdata', 'analytics');
            return $result;
        }

        // We skip all samples that are already part of a training dataset, even if they have not been used for training yet.
        $sampleids = $this->filter_out_train_samples($sampleids, $timesplitting);

        if (count($sampleids) === 0) {
            $result->status = \core_analytics\model::ANALYSE_REJECTED_RANGE_PROCESSOR;
            $result->message = get_string('nonewdata', 'analytics');
            return $result;
        }

        // TODO We may be interested in limiting $samplesdata contents to $sampleids after filtering out some sampleids.

        // Only when processing data for predictions.
        if ($target === false) {
            // We also filter out ranges that have already been used for predictions.
            $ranges = $this->filter_out_prediction_ranges($ranges, $timesplitting);
        }

        if (count($ranges) === 0) {
            $result->status = \core_analytics\model::ANALYSE_REJECTED_RANGE_PROCESSOR;
            $result->message = get_string('nonewtimeranges', 'analytics');
            return $result;
        }
    }

    $dataset = new \core_analytics\dataset_manager($this->modelid, $analysable->get_id(), $timesplitting->get_id(),
        $this->options['evaluation'], !empty($target));

    // Flag the model + analysable + timesplitting as being analysed (prevent concurrent executions).
    if (!$dataset->init_process()) {
        // If this model + analysable + timesplitting combination is being analysed we skip this process.
        $result->status = \core_analytics\model::NO_DATASET;
        $result->message = get_string('analysisinprogress', 'analytics');
        return $result;
    }

    // From this point the process has been flagged as started. The finally block guarantees that
    // close_process() runs exactly once, both for the early returns below and when any of the
    // calculations throws an exception, so the combination is never left flagged as in progress.
    try {
        // Remove samples the target consider invalid. Note that we use $this->target, $target will be false
        // during prediction, but we still need to discard samples the target considers invalid.
        $this->target->add_sample_data($samplesdata);
        $this->target->filter_out_invalid_samples($sampleids, $analysable, $target);

        if (!$sampleids) {
            $result->status = \core_analytics\model::NO_DATASET;
            $result->message = get_string('novalidsamples', 'analytics');
            return $result;
        }

        foreach ($this->indicators as $key => $indicator) {
            // The analyser attaches the main entities the sample depends on and are provided to the
            // indicator to calculate the sample.
            $this->indicators[$key]->add_sample_data($samplesdata);
        }

        // Provide samples to the target instance (different than $this->target) $target is the new instance we get
        // for each analysis in progress.
        if ($target) {
            $target->add_sample_data($samplesdata);
        }

        // Here we start the memory intensive process that will last until $data var is
        // unset (until the method is finished basically).
        $data = $timesplitting->calculate($sampleids, $this->get_samples_origin(), $this->indicators, $ranges, $target);

        if (!$data) {
            $result->status = \core_analytics\model::ANALYSE_REJECTED_RANGE_PROCESSOR;
            $result->message = get_string('novaliddata', 'analytics');
            return $result;
        }

        // Write all calculated data to a file.
        $file = $dataset->store($data);
    } finally {
        // Flag the model + analysable + timesplitting as analysed.
        $dataset->close_process();
    }

    // No need to keep track of analysed stuff when evaluating.
    if ($this->options['evaluation'] === false) {
        // Save the samples that have been already analysed so they are not analysed again in future.

        if ($target) {
            $this->save_train_samples($sampleids, $timesplitting, $file);
        } else {
            $this->save_prediction_ranges($ranges, $timesplitting);
        }
    }

    $result->status = \core_analytics\model::OK;
    $result->message = get_string('successfullyanalysed', 'analytics');
    $result->file = $file;
    return $result;
}
412
/**
 * Returns the time splitting method ranges that are ready to be used for predictions.
 *
 * The returned array keeps the range indexes used by the time splitting method.
 *
 * @param object $timesplitting Time splitting method instance, the analysable should already be set
 * @return array Ranges indexed by range index
 */
protected function get_prediction_ranges($timesplitting) {

    // We already provided the analysable to the time splitting method, there is no need to feed it back.
    $predictionranges = array();
    foreach ($timesplitting->get_all_ranges() as $rangeindex => $range) {
        if ($timesplitting->ready_to_predict($range)) {
            // We need to maintain the same indexes.
            $predictionranges[$rangeindex] = $range;
        }
    }

    return $predictionranges;
}
428
/**
 * Removes the samples that are already part of a training dataset.
 *
 * Reads the analytics_train_samples records of this model, analysable and time splitting method
 * and discards from $sampleids all the keys stored in them.
 *
 * @param array $sampleids Samples, compared by array key
 * @param object $timesplitting Time splitting method instance, the analysable should already be set
 * @return array The samples that are not part of any stored training dataset
 */
protected function filter_out_train_samples($sampleids, $timesplitting) {
    global $DB;

    $conditions = array(
        'modelid' => $this->modelid,
        'analysableid' => $timesplitting->get_analysable()->get_id(),
        'timesplitting' => $timesplitting->get_id()
    );

    // Each record contains the samples that were included in one training file.
    foreach ($DB->get_records('analytics_train_samples', $conditions) as $record) {

        $alreadyused = json_decode($record->sampleids, true);
        if (empty($alreadyused)) {
            // Nothing to discard for this file.
            continue;
        }

        // Keep only the samples whose keys are not part of this file's samples.
        $sampleids = array_diff_key($sampleids, $alreadyused);
    }

    return $sampleids;
}
450
/**
 * Removes the ranges that have already been used for predictions.
 *
 * Reads the analytics_predict_ranges records of this model, analysable and time splitting method
 * and discards the ranges whose index is stored in them.
 *
 * @param array $ranges Ranges indexed by range index
 * @param object $timesplitting Time splitting method instance, the analysable should already be set
 * @return array The ranges that have not been used for predictions yet
 */
protected function filter_out_prediction_ranges($ranges, $timesplitting) {
    global $DB;

    $conditions = array(
        'modelid' => $this->modelid,
        'analysableid' => $timesplitting->get_analysable()->get_id(),
        'timesplitting' => $timesplitting->get_id()
    );

    foreach ($DB->get_records('analytics_predict_ranges', $conditions) as $record) {
        $index = $record->rangeindex;
        if (empty($ranges[$index])) {
            // This range is not part of the provided list.
            continue;
        }
        unset($ranges[$index]);
    }

    return $ranges;
}
467
/**
 * Stores the samples that have been included in a training dataset file.
 *
 * @param array $sampleids Samples included in the file, stored JSON encoded
 * @param object $timesplitting Time splitting method instance, the analysable should already be set
 * @param \stored_file $file The generated dataset file (presumably a \stored_file; only get_id() is used here)
 * @return mixed The $DB->insert_record() return value
 */
protected function save_train_samples($sampleids, $timesplitting, $file) {
    global $DB;

    $record = (object) array(
        'modelid' => $this->modelid,
        'analysableid' => $timesplitting->get_analysable()->get_id(),
        'timesplitting' => $timesplitting->get_id(),
        'fileid' => $file->get_id(),
        // TODO We just need the keys, we can save some space by removing the values.
        'sampleids' => json_encode($sampleids),
        'timecreated' => time()
    );

    return $DB->insert_record('analytics_train_samples', $record);
}
483
/**
 * Stores the ranges that have been used for predictions.
 *
 * One analytics_predict_ranges record is inserted for each of the provided range indexes.
 *
 * @param array $ranges Ranges indexed by range index, only the indexes are stored
 * @param object $timesplitting Time splitting method instance, the analysable should already be set
 * @return void
 */
protected function save_prediction_ranges($ranges, $timesplitting) {
    global $DB;

    // All the records share these values, only the range index changes.
    $record = (object) array(
        'modelid' => $this->modelid,
        'analysableid' => $timesplitting->get_analysable()->get_id(),
        'timesplitting' => $timesplitting->get_id(),
        'timecreated' => time()
    );

    foreach ($ranges as $index => $ignored) {
        $record->rangeindex = $index;
        $DB->insert_record('analytics_predict_ranges', $record);
    }
}
498}