2 // This file is part of Moodle - http://moodle.org/
4 // Moodle is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
9 // Moodle is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with Moodle. If not, see <http://www.gnu.org/licenses/>.
/**
 * Php predictions processor.
 *
 * @package mlbackend_php
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
25 namespace mlbackend_php;
27 defined('MOODLE_INTERNAL') || die();
29 use Phpml\Preprocessing\Normalizer;
30 use Phpml\CrossValidation\RandomSplit;
31 use Phpml\Dataset\ArrayDataset;
32 use Phpml\ModelManager;
/**
 * PHP predictions processor.
 *
 * Pure-PHP machine learning backend based on the php-ml library. Only binary
 * classification (logistic regression) is implemented; the regression side of
 * the predictor interface throws until it is supported.
 *
 * @package mlbackend_php
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class processor implements \core_analytics\classifier, \core_analytics\regressor {

    /**
     * Size of training / prediction batches.
     *
     * Samples are fed to php-ml in chunks of this size so arbitrarily large
     * datasets do not exhaust the PHP memory limit.
     */
    const BATCH_SIZE = 5000;

    /**
     * Number of train iterations.
     */
    const TRAIN_ITERATIONS = 500;

    /**
     * File name of the serialised model.
     */
    const MODEL_FILENAME = 'model.ser';

    /**
     * Set to true during evaluation when the dataset is too big to be fully
     * loaded into memory (see evaluate_classification()).
     * @var bool
     */
    protected $limitedsize = false;
64 * Checks if the processor is ready to use.
68 public function is_ready() {
69 if (version_compare(phpversion(), '7.0.0') < 0) {
70 return get_string('errorphp7required', 'mlbackend_php');
76 * Train this processor classification model using the provided supervised learning dataset.
78 * @param string $uniqueid
79 * @param \stored_file $dataset
80 * @param string $outputdir
83 public function train_classification($uniqueid, \stored_file $dataset, $outputdir) {
85 // Output directory is already unique to the model.
86 $modelfilepath = $outputdir . DIRECTORY_SEPARATOR . self::MODEL_FILENAME;
88 $modelmanager = new ModelManager();
90 if (file_exists($modelfilepath)) {
91 $classifier = $modelmanager->restoreFromFile($modelfilepath);
93 $classifier = new \Phpml\Classification\Linear\LogisticRegression(self::TRAIN_ITERATIONS, Normalizer::NORM_L2);
96 $fh = $dataset->get_content_file_handle();
98 // The first lines are var names and the second one values.
99 $metadata = $this->extract_metadata($fh);
106 while (($data = fgetcsv($fh)) !== false) {
107 $sampledata = array_map('floatval', $data);
108 $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
109 $targets[] = intval($data[$metadata['nfeatures']]);
111 if (count($samples) === self::BATCH_SIZE) {
112 // Training it batches to avoid running out of memory.
114 $classifier->partialTrain($samples, $targets, array(0, 1));
121 // Train the remaining samples.
123 $classifier->partialTrain($samples, $targets, array(0, 1));
126 $resultobj = new \stdClass();
127 $resultobj->status = \core_analytics\model::OK;
128 $resultobj->info = array();
130 // Store the trained model.
131 $modelmanager->saveToFile($classifier, $modelfilepath);
137 * Classifies the provided dataset samples.
139 * @param string $uniqueid
140 * @param \stored_file $dataset
141 * @param string $outputdir
144 public function classify($uniqueid, \stored_file $dataset, $outputdir) {
146 // Output directory is already unique to the model.
147 $modelfilepath = $outputdir . DIRECTORY_SEPARATOR . self::MODEL_FILENAME;
149 if (!file_exists($modelfilepath)) {
150 throw new \moodle_exception('errorcantloadmodel', 'mlbackend_php', '', $modelfilepath);
153 $modelmanager = new ModelManager();
154 $classifier = $modelmanager->restoreFromFile($modelfilepath);
156 $fh = $dataset->get_content_file_handle();
158 // The first lines are var names and the second one values.
159 $metadata = $this->extract_metadata($fh);
164 $sampleids = array();
166 $predictions = array();
167 while (($data = fgetcsv($fh)) !== false) {
168 $sampledata = array_map('floatval', $data);
169 $sampleids[] = $data[0];
170 $samples[] = array_slice($sampledata, 1, $metadata['nfeatures']);
172 if (count($samples) === self::BATCH_SIZE) {
173 // Prediction it batches to avoid running out of memory.
175 // Append predictions incrementally, we want $sampleids keys in sync with $predictions keys.
176 $newpredictions = $classifier->predict($samples);
177 foreach ($newpredictions as $prediction) {
178 array_push($predictions, $prediction);
185 // Finish the remaining predictions.
187 $predictions = $predictions + $classifier->predict($samples);
190 $resultobj = new \stdClass();
191 $resultobj->status = \core_analytics\model::OK;
192 $resultobj->info = array();
194 foreach ($predictions as $index => $prediction) {
195 $resultobj->predictions[$index] = array($sampleids[$index], $prediction);
202 * Evaluates this processor classification model using the provided supervised learning dataset.
204 * During evaluation we need to shuffle the evaluation dataset samples to detect deviated results,
205 * if the dataset is massive we can not load everything into memory. We know that 2GB is the
206 * minimum memory limit we should have (\core_analytics\model::heavy_duty_mode), if we substract the memory
207 * that we already consumed and the memory that Phpml algorithms will need we should still have at
208 * least 500MB of memory, which should be enough to evaluate a model. In any case this is a robust
209 * solution that will work for all sites but it should minimize memory limit problems. Site admins
210 * can still set $CFG->mlbackend_php_no_evaluation_limits to true to skip this 500MB limit.
212 * @param string $uniqueid
213 * @param float $maxdeviation
214 * @param int $niterations
215 * @param \stored_file $dataset
216 * @param string $outputdir
219 public function evaluate_classification($uniqueid, $maxdeviation, $niterations, \stored_file $dataset, $outputdir) {
220 $fh = $dataset->get_content_file_handle();
222 // The first lines are var names and the second one values.
223 $metadata = $this->extract_metadata($fh);
228 if (empty($CFG->mlbackend_php_no_evaluation_limits)) {
230 $limit = get_real_size('500MB');
232 // Just an approximation, will depend on PHP version, compile options...
233 // Double size + zval struct (6 bytes + 8 bytes + 16 bytes) + array bucket (96 bytes)
234 // https://nikic.github.io/2011/12/12/How-big-are-PHP-arrays-really-Hint-BIG.html.
235 $floatsize = (PHP_INT_SIZE * 2) + 6 + 8 + 16 + 96;
240 while (($data = fgetcsv($fh)) !== false) {
241 $sampledata = array_map('floatval', $data);
243 $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
244 $targets[] = intval($data[$metadata['nfeatures']]);
246 if (empty($CFG->mlbackend_php_no_evaluation_limits)) {
247 // We allow admins to disable evaluation memory usage limits by modifying config.php.
249 // We will have plenty of missing values in the dataset so it should be a conservative approximation.
250 $samplessize = $samplessize + (count($sampledata) * $floatsize);
252 // Stop fetching more samples.
253 if ($samplessize >= $limit) {
254 $this->limitedsize = true;
261 // We need at least 2 samples belonging to each target.
262 $counts = array_count_values($targets);
263 foreach ($counts as $count) {
265 $notenoughdata = true;
268 if (!empty($notenoughdata)) {
269 $resultobj = new \stdClass();
270 $resultobj->status = \core_analytics\model::EVALUATE_NOT_ENOUGH_DATA;
271 $resultobj->score = 0;
272 $resultobj->info = array(get_string('errornotenoughdata', 'mlbackend_php'));
278 // Evaluate the model multiple times to confirm the results are not significantly random due to a short amount of data.
279 for ($i = 0; $i < $niterations; $i++) {
281 $classifier = new \Phpml\Classification\Linear\LogisticRegression(self::TRAIN_ITERATIONS, Normalizer::NORM_L2);
283 // Split up the dataset in classifier and testing.
284 $data = new RandomSplit(new ArrayDataset($samples, $targets), 0.2);
286 $classifier->train($data->getTrainSamples(), $data->getTrainLabels());
288 $predictedlabels = $classifier->predict($data->getTestSamples());
289 $phis[] = $this->get_phi($data->getTestLabels(), $predictedlabels);
292 // Let's fill the results changing the returned status code depending on the phi-related calculated metrics.
293 return $this->get_evaluation_result_object($dataset, $phis, $maxdeviation);
297 * Returns the results objects from all evaluations.
299 * @param \stored_file $dataset
301 * @param float $maxdeviation
304 protected function get_evaluation_result_object(\stored_file $dataset, $phis, $maxdeviation) {
306 // Average phi of all evaluations as final score.
307 if (count($phis) === 1) {
308 $avgphi = reset($phis);
310 $avgphi = \Phpml\Math\Statistic\Mean::arithmetic($phis);
313 // Standard deviation should ideally be calculated against the area under the curve.
314 if (count($phis) === 1) {
317 $modeldev = \Phpml\Math\Statistic\StandardDeviation::population($phis);
320 // Let's fill the results object.
321 $resultobj = new \stdClass();
323 // Zero is ok, now we add other bits if something is not right.
324 $resultobj->status = \core_analytics\model::OK;
325 $resultobj->info = array();
327 // Convert phi to a standard score (from -1 to 1 to a value between 0 and 1).
328 $resultobj->score = ($avgphi + 1) / 2;
330 // If each iteration results varied too much we need more data to confirm that this is a valid model.
331 if ($modeldev > $maxdeviation) {
332 $resultobj->status = $resultobj->status + \core_analytics\model::EVALUATE_NOT_ENOUGH_DATA;
333 $a = new \stdClass();
334 $a->deviation = $modeldev;
335 $a->accepteddeviation = $maxdeviation;
336 $resultobj->info[] = get_string('errornotenoughdatadev', 'mlbackend_php', $a);
339 if ($resultobj->score < \core_analytics\model::MIN_SCORE) {
340 $resultobj->status = $resultobj->status + \core_analytics\model::EVALUATE_LOW_SCORE;
341 $a = new \stdClass();
342 $a->score = $resultobj->score;
343 $a->minscore = \core_analytics\model::MIN_SCORE;
344 $resultobj->info[] = get_string('errorlowscore', 'mlbackend_php', $a);
347 if ($this->limitedsize === true) {
348 $resultobj->info[] = get_string('datasetsizelimited', 'mlbackend_php', display_size($dataset->get_filesize()));
355 * Train this processor regression model using the provided supervised learning dataset.
357 * @throws new \coding_exception
358 * @param string $uniqueid
359 * @param \stored_file $dataset
360 * @param string $outputdir
363 public function train_regression($uniqueid, \stored_file $dataset, $outputdir) {
364 throw new \coding_exception('This predictor does not support regression yet.');
368 * Estimates linear values for the provided dataset samples.
370 * @throws new \coding_exception
371 * @param string $uniqueid
372 * @param \stored_file $dataset
373 * @param mixed $outputdir
376 public function estimate($uniqueid, \stored_file $dataset, $outputdir) {
377 throw new \coding_exception('This predictor does not support regression yet.');
381 * Evaluates this processor regression model using the provided supervised learning dataset.
383 * @throws new \coding_exception
384 * @param string $uniqueid
385 * @param float $maxdeviation
386 * @param int $niterations
387 * @param \stored_file $dataset
388 * @param string $outputdir
391 public function evaluate_regression($uniqueid, $maxdeviation, $niterations, \stored_file $dataset, $outputdir) {
392 throw new \coding_exception('This predictor does not support regression yet.');
396 * Returns the Phi correlation coefficient.
398 * @param array $testlabels
399 * @param array $predictedlabels
402 protected function get_phi($testlabels, $predictedlabels) {
404 // Binary here only as well.
405 $matrix = \Phpml\Metric\ConfusionMatrix::compute($testlabels, $predictedlabels, array(0, 1));
407 $tptn = $matrix[0][0] * $matrix[1][1];
408 $fpfn = $matrix[1][0] * $matrix[0][1];
409 $tpfp = $matrix[0][0] + $matrix[1][0];
410 $tpfn = $matrix[0][0] + $matrix[0][1];
411 $tnfp = $matrix[1][1] + $matrix[1][0];
412 $tnfn = $matrix[1][1] + $matrix[0][1];
413 if ($tpfp === 0 || $tpfn === 0 || $tnfp === 0 || $tnfn === 0) {
416 $phi = ( $tptn - $fpfn ) / sqrt( $tpfp * $tpfn * $tnfp * $tnfn);
423 * Extracts metadata from the dataset file.
425 * The file poiter should be located at the top of the file.
427 * @param resource $fh
430 protected function extract_metadata($fh) {
431 $metadata = fgetcsv($fh);
432 return array_combine($metadata, fgetcsv($fh));