MDL-59988 analytics: Process pending training and prediction files
author    David Monllao <davidm@moodle.com>
Tue, 10 Oct 2017 07:45:21 +0000 (09:45 +0200)
committer David Monllao <davidm@moodle.com>
Fri, 13 Oct 2017 10:29:21 +0000 (12:29 +0200)
analytics/classes/dataset_manager.php
analytics/classes/local/analyser/base.php
analytics/classes/model.php
analytics/tests/dataset_manager_test.php
lib/db/install.xml
lib/db/upgrade.php
version.php

diff --git a/analytics/classes/dataset_manager.php b/analytics/classes/dataset_manager.php
index 4b457e8..b467168 100644
@@ -202,6 +202,61 @@ class dataset_manager {
             '/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/', self::EVALUATION_FILENAME);
     }
 
+    /**
+     * Returns the list of files that could not be used for training or prediction in previous runs.
+     *
+     * @param int $modelid
+     * @param bool $includetarget True for training (labelled) files, false for prediction (unlabelled) files.
+     * @param string[] $timesplittingids
+     * @return \stored_file[][] Pending files indexed by time splitting method id.
+     */
+    public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
+        global $DB;
+
+        $fs = get_file_storage();
+
+        if ($includetarget) {
+            $filearea = self::LABELLED_FILEAREA;
+            $usedfileaction = 'trained';
+        } else {
+            $filearea = self::UNLABELLED_FILEAREA;
+            $usedfileaction = 'predicted';
+        }
+
+        $select = 'modelid = :modelid AND action = :action';
+        $params = array('modelid' => $modelid, 'action' => $usedfileaction);
+        $usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);
+
+        // Very likely that we will only have 1 time splitting method here.
+        $filesbytimesplitting = array();
+        foreach ($timesplittingids as $timesplittingid) {
+
+            $filepath = '/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/';
+            $files = $fs->get_directory_files(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath);
+            foreach ($files as $file) {
+
+                // Discard evaluation files.
+                if ($file->get_filename() === self::EVALUATION_FILENAME) {
+                    continue;
+                }
+
+                // No dirs.
+                if ($file->is_directory()) {
+                    continue;
+                }
+
+                // Already used for training or prediction.
+                if (in_array($file->get_id(), $usedfileids)) {
+                    continue;
+                }
+
+                $filesbytimesplitting[$timesplittingid][] = $file;
+            }
+        }
+
+        return $filesbytimesplitting;
+    }
+
     /**
      * Deletes previous evaluation files of this model.
      *
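Note: the snippet below is only an illustrative sketch of how the new get_pending_files() API could be consumed from a Moodle CLI script; the script itself, the config.php path and the $modelid value are assumptions and not part of this patch.

<?php
// Hypothetical CLI script: list the datasets generated for a model that no
// machine learning backend has consumed yet.
define('CLI_SCRIPT', true);
require(__DIR__ . '/../../config.php'); // Adjust to your dirroot (assumed path).

$modelid = 1; // Assumed model id.
$timesplittingids = array('\core\analytics\time_splitting\quarters');

// true = training (labelled) files, false = prediction (unlabelled) files.
$pending = \core_analytics\dataset_manager::get_pending_files($modelid, true, $timesplittingids);
foreach ($pending as $timesplittingid => $files) {
    foreach ($files as $file) {
        // Each $file is a \stored_file never flagged as 'trained' in analytics_used_files.
        mtrace($timesplittingid . ': pending file id ' . $file->get_id());
    }
}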
diff --git a/analytics/classes/local/analyser/base.php b/analytics/classes/local/analyser/base.php
index e516977..8d792b4 100644
@@ -190,13 +190,13 @@ abstract class base {
         list($analysables, $processedanalysables) = $this->get_sorted_analysables($includetarget);
 
         $inittime = time();
-        foreach ($analysables as $analysable) {
+        foreach ($analysables as $key => $analysable) {
 
             $files = $this->process_analysable($analysable, $includetarget);
 
             // Later we will need to aggregate data by time splitting method.
             foreach ($files as $timesplittingid => $file) {
-                $filesbytimesplitting[$timesplittingid][$analysable->get_id()] = $file;
+                $filesbytimesplitting[$timesplittingid][] = $file;
             }
 
             $this->update_analysable_analysed_time($processedanalysables, $analysable->get_id(), $includetarget);
@@ -208,11 +208,35 @@ abstract class base {
                     break;
                 }
             }
+
+            unset($analysables[$key]);
+        }
+
+        if ($this->options['evaluation'] === false) {
+            // Look for previously generated training and prediction files that could not be used
+            // by the machine learning backends because they were not big enough.
+
+            $pendingfiles = \core_analytics\dataset_manager::get_pending_files($this->modelid, $includetarget,
+                array_keys($filesbytimesplitting));
+            foreach ($pendingfiles as $timesplittingid => $files) {
+                foreach ($files as $file) {
+                    $filesbytimesplitting[$timesplittingid][] = $file;
+                }
+            }
         }
 
         // We join the datasets by time splitting method.
         $timesplittingfiles = $this->merge_analysable_files($filesbytimesplitting, $includetarget);
 
+        if (!empty($pendingfiles)) {
+            // Delete them now; their contents are already part of the newly merged dataset.
+            foreach ($pendingfiles as $timesplittingid => $files) {
+                foreach ($files as $file) {
+                    $file->delete();
+                }
+            }
+        }
+
         return $timesplittingfiles;
     }
 
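Note: the analysable-id keys were dropped from $filesbytimesplitting, which appears to be because files appended from previous runs via get_pending_files() are already merged per time splitting method and are not tied to a single analysable. A minimal sketch of the resulting shape; the variable names are assumptions:

// Assumed shape after this change: per time splitting method, a flat list that
// mixes this run's analysable-level files with leftovers from previous runs.
$filesbytimesplitting = array(
    '\core\analytics\time_splitting\quarters' => array(
        $filefromthisrun,     // Generated by process_analysable() during this run.
        $pendingfromlastrun,  // Returned by dataset_manager::get_pending_files().
    ),
);
// merge_analysable_files() then joins each list into a single dataset and the
// pending originals are deleted so they are not merged again on the next run.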
diff --git a/analytics/classes/model.php b/analytics/classes/model.php
index 2ec50e8..f04eb96 100644
@@ -693,7 +693,7 @@ class model {
         $samplesfile = $samplesdata[$this->model->timesplitting];
 
         // We need to throw an exception if we are trying to predict stuff that was already predicted.
-        $params = array('modelid' => $this->model->id, 'fileid' => $samplesfile->get_id(), 'action' => 'predicted');
+        $params = array('modelid' => $this->model->id, 'action' => 'predicted', 'fileid' => $samplesfile->get_id());
         if ($predicted = $DB->get_record('analytics_used_files', $params)) {
             throw new \moodle_exception('erroralreadypredict', 'analytics', '', $samplesfile->get_id());
         }
diff --git a/analytics/tests/dataset_manager_test.php b/analytics/tests/dataset_manager_test.php
index d36baf5..3856c5f 100644
@@ -34,22 +34,30 @@ defined('MOODLE_INTERNAL') || die();
 class dataset_manager_testcase extends advanced_testcase {
 
     /**
-     * test_create_dataset
+     * setUp
      *
-     * @return
+     * @return null
      */
-    public function test_create_dataset() {
+    public function setUp() {
         $this->resetAfterTest(true);
 
-        $sharedtoprows = array(
+        $this->sharedtoprows = array(
             array('var1', 'var2'),
             array('value1', 'value2'),
             array('header1', 'header2')
         );
+    }
+
+    /**
+     * test_create_dataset
+     *
+     * @return null
+     */
+    public function test_create_dataset() {
 
         $dataset1 = new \core_analytics\dataset_manager(1, 1, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
         $dataset1->init_process();
-        $dataset1data = array_merge($sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
+        $dataset1data = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
         $f1 = $dataset1->store($dataset1data);
         $dataset1->close_process();
 
@@ -63,26 +71,19 @@ class dataset_manager_testcase extends advanced_testcase {
     /**
      * test_merge_datasets
      *
-     * @return
+     * @return null
      */
     public function test_merge_datasets() {
-        $this->resetAfterTest(true);
-
-        $sharedtoprows = array(
-            array('var1', 'var2'),
-            array('value1', 'value2'),
-            array('header1', 'header2')
-        );
 
         $dataset1 = new \core_analytics\dataset_manager(1, 1, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
         $dataset1->init_process();
-        $dataset1data = array_merge($sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
+        $dataset1data = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
         $f1 = $dataset1->store($dataset1data);
         $dataset1->close_process();
 
         $dataset2 = new \core_analytics\dataset_manager(1, 2, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
         $dataset2->init_process();
-        $dataset2data = array_merge($sharedtoprows, array(array('no', 'no', 'no')));
+        $dataset2data = array_merge($this->sharedtoprows, array(array('no', 'no', 'no')));
         $f2 = $dataset2->store($dataset2data);
         $dataset2->close_process();
 
@@ -97,4 +98,70 @@ class dataset_manager_testcase extends advanced_testcase {
         $this->assertContains('value1', $mergedfilecontents);
         $this->assertContains('header1', $mergedfilecontents);
     }
+
+    /**
+     * test_get_pending_files
+     *
+     * @return null
+     */
+    public function test_get_pending_files() {
+        global $DB;
+
+        $this->resetAfterTest();
+
+        $fakemodelid = 123;
+        $timesplittingids = array(
+            '\core\analytics\time_splitting\quarters',
+            '\core\analytics\time_splitting\quarters_accum',
+        );
+
+        // No files.
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids));
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+
+        // We will reuse this analysable file to create training and prediction datasets (analysable level files are
+        // merged into training and prediction files).
+        $analysabledataset = new \core_analytics\dataset_manager($fakemodelid, 1, 'whatever',
+            \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
+        $analysabledataset->init_process();
+        $analysabledatasetdata = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
+        $file = $analysabledataset->store($analysabledatasetdata);
+        $analysabledataset->close_process();
+
+        // Evaluation files are ignored by get_pending_files().
+        $evaluationdataset = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
+            '\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::LABELLED_FILEAREA, true);
+
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids));
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+
+        // Training and prediction files are not mixed up.
+        $trainingfile1 = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
+            '\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
+        $trainingfile2 = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
+            '\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
+
+        $bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids);
+        $this->assertFalse(isset($bytimesplitting['\core\analytics\time_splitting\quarters_accum']));
+        $this->assertCount(2, $bytimesplitting['\core\analytics\time_splitting\quarters']);
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+
+        $predictionfile = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
+            '\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::UNLABELLED_FILEAREA, false);
+        $bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids);
+        $this->assertFalse(isset($bytimesplitting['\core\analytics\time_splitting\quarters_accum']));
+        $this->assertCount(1, $bytimesplitting['\core\analytics\time_splitting\quarters']);
+
+        // Files already used for training or prediction are discarded.
+        $usedfile = (object)['modelid' => $fakemodelid, 'fileid' => $trainingfile1->get_id(), 'action' => 'trained',
+            'time' => time()];
+        $DB->insert_record('analytics_used_files', $usedfile);
+        $bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids);
+        $this->assertCount(1, $bytimesplitting['\core\analytics\time_splitting\quarters']);
+
+        $usedfile->fileid = $predictionfile->get_id();
+        $usedfile->action = 'predicted';
+        $DB->insert_record('analytics_used_files', $usedfile);
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+    }
 }
diff --git a/lib/db/install.xml b/lib/db/install.xml
index e636b71..bbf96b4 100644
         <KEY NAME="fileid" TYPE="foreign" FIELDS="fileid" REFTABLE="files" REFFIELDS="id"/>
       </KEYS>
       <INDEXES>
-        <INDEX NAME="modelidandfileidandaction" UNIQUE="false" FIELDS="modelid, fileid, action" COMMENT="Index on modelid and fileid and action"/>
+        <INDEX NAME="modelidandactionandfileid" UNIQUE="false" FIELDS="modelid, action, fileid" COMMENT="Index on modelid and action and fileid"/>
       </INDEXES>
     </TABLE>
     <TABLE NAME="analytics_indicator_calc" COMMENT="Stored indicator calculations">
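Note on the index column swap: get_pending_files() added above filters analytics_used_files on modelid and action only, so leading the composite index with those two columns presumably lets it serve that lookup directly, while the trailing fileid still covers the per-file check in model.php. For reference, the lookup the reordered index is aimed at:

// Lookup issued by dataset_manager::get_pending_files(); with the index ordered
// (modelid, action, fileid) its leading columns match this WHERE clause.
$select = 'modelid = :modelid AND action = :action';
$params = array('modelid' => $modelid, 'action' => 'trained'); // Or 'predicted'.
$usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);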
diff --git a/lib/db/upgrade.php b/lib/db/upgrade.php
index 5877954..e8b25fc 100644
@@ -2725,5 +2725,29 @@ function xmldb_main_upgrade($oldversion) {
         upgrade_main_savepoint(true, 2017101200.00);
     }
 
+    // Index modification upgrade step.
+    if ($oldversion < 2017101300.01) {
+
+        $table = new xmldb_table('analytics_used_files');
+
+        // Define index modelidandfileidandaction (not unique) to be dropped from analytics_used_files.
+        $index = new xmldb_index('modelidandfileidandaction', XMLDB_INDEX_NOTUNIQUE, array('modelid', 'fileid', 'action'));
+
+        // Conditionally launch drop index modelidandfileidandaction.
+        if ($dbman->index_exists($table, $index)) {
+            $dbman->drop_index($table, $index);
+        }
+
+        // Define index modelidandactionandfileid (not unique) to be added to analytics_used_files.
+        $index = new xmldb_index('modelidandactionandfileid', XMLDB_INDEX_NOTUNIQUE, array('modelid', 'action', 'fileid'));
+
+        // Conditionally launch add index modelidandactionandfileid.
+        if (!$dbman->index_exists($table, $index)) {
+            $dbman->add_index($table, $index);
+        }
+
+        // Main savepoint reached.
+        upgrade_main_savepoint(true, 2017101300.01);
+    }
     return true;
 }
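Note: to double-check the reordered index locally after running the upgrade, a throwaway snippet along these lines can be used from the Moodle dirroot; it only reuses the DDL calls already shown in the upgrade step above, and the file location is an assumption.

<?php
define('CLI_SCRIPT', true);
require(__DIR__ . '/config.php'); // Assumed to run from the Moodle dirroot.

global $DB;
$dbman = $DB->get_manager();
$table = new xmldb_table('analytics_used_files');
$new = new xmldb_index('modelidandactionandfileid', XMLDB_INDEX_NOTUNIQUE, array('modelid', 'action', 'fileid'));
$old = new xmldb_index('modelidandfileidandaction', XMLDB_INDEX_NOTUNIQUE, array('modelid', 'fileid', 'action'));

mtrace('New index present: ' . ($dbman->index_exists($table, $new) ? 'yes' : 'no'));
mtrace('Old index removed: ' . ($dbman->index_exists($table, $old) ? 'no' : 'yes'));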
diff --git a/version.php b/version.php
index f2d9fcc..5cf6c98 100644
@@ -29,7 +29,7 @@
 
 defined('MOODLE_INTERNAL') || die();
 
-$version  = 2017101300.00;              // YYYYMMDD      = weekly release date of this DEV branch.
+$version  = 2017101300.01;              // YYYYMMDD      = weekly release date of this DEV branch.
                                         //         RR    = release increments - 00 in DEV branches.
                                         //           .XX = incremental changes.