'/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/', self::EVALUATION_FILENAME);
}
+    /**
+     * Gets the list of files that couldn't be previously used for training and prediction.
+     *
+     * Depending on $includetarget the labelled (training) or the unlabelled (prediction) file area is
+     * inspected. Evaluation files, directories and files already listed in analytics_used_files for
+     * this model and action are skipped.
+     *
+     * @param int $modelid
+     * @param bool $includetarget True to look for training files, false to look for prediction files.
+     * @param string[] $timesplittingids
+     * @return \stored_file[][] Pending files indexed by time splitting id. Time splitting methods
+     *                          without pending files get no entry.
+     */
+    public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
+        global $DB;
+
+        $fs = get_file_storage();
+
+        if ($includetarget) {
+            $filearea = self::LABELLED_FILEAREA;
+            $usedfileaction = 'trained';
+        } else {
+            $filearea = self::UNLABELLED_FILEAREA;
+            $usedfileaction = 'predicted';
+        }
+
+        $select = 'modelid = :modelid AND action = :action';
+        $params = array('modelid' => $modelid, 'action' => $usedfileaction);
+        $usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);
+
+        // Use the ids as keys so that each membership check below is a key lookup rather than a
+        // linear scan of the whole list for every file.
+        $usedfileids = array_flip($usedfileids);
+
+        // The context does not change between iterations.
+        $contextid = \context_system::instance()->id;
+
+        // Very likely that we will only have 1 time splitting method here.
+        $filesbytimesplitting = array();
+        foreach ($timesplittingids as $timesplittingid) {
+
+            $filepath = '/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/';
+            $files = $fs->get_directory_files($contextid, 'analytics', $filearea, $modelid, $filepath);
+            foreach ($files as $file) {
+
+                // Discard evaluation files.
+                if ($file->get_filename() === self::EVALUATION_FILENAME) {
+                    continue;
+                }
+
+                // No dirs.
+                if ($file->is_directory()) {
+                    continue;
+                }
+
+                // Already used for training or for prediction, depending on $includetarget.
+                if (isset($usedfileids[$file->get_id()])) {
+                    continue;
+                }
+
+                $filesbytimesplitting[$timesplittingid][] = $file;
+            }
+        }
+
+        return $filesbytimesplitting;
+    }
+
/**
* Deletes previous evaluation files of this model.
*
list($analysables, $processedanalysables) = $this->get_sorted_analysables($includetarget);
$inittime = time();
- foreach ($analysables as $analysable) {
+ foreach ($analysables as $key => $analysable) {
$files = $this->process_analysable($analysable, $includetarget);
// Later we will need to aggregate data by time splitting method.
foreach ($files as $timesplittingid => $file) {
- $filesbytimesplitting[$timesplittingid][$analysable->get_id()] = $file;
+ $filesbytimesplitting[$timesplittingid][] = $file;
}
$this->update_analysable_analysed_time($processedanalysables, $analysable->get_id(), $includetarget);
break;
}
}
+
+ unset($analysables[$key]);
+ }
+
+ if ($this->options['evaluation'] === false) {
+ // Look for previous training and prediction files we generated and couldn't be used
+ // by machine learning backends because they weren't big enough.
+
+ $pendingfiles = \core_analytics\dataset_manager::get_pending_files($this->modelid, $includetarget,
+ array_keys($filesbytimesplitting));
+ foreach ($pendingfiles as $timesplittingid => $files) {
+ foreach ($files as $file) {
+ $filesbytimesplitting[$timesplittingid][] = $file;
+ }
+ }
}
// We join the datasets by time splitting method.
$timesplittingfiles = $this->merge_analysable_files($filesbytimesplitting, $includetarget);
+ if (!empty($pendingfiles)) {
+ // We must remove them now as they are already part of another dataset.
+ foreach ($pendingfiles as $timesplittingid => $files) {
+ foreach ($files as $file) {
+ $file->delete();
+ }
+ }
+ }
+
return $timesplittingfiles;
}
class dataset_manager_testcase extends advanced_testcase {
/**
- * test_create_dataset
+ * setUp
+ *
+ * Test fixture shared by the tests of this class: calls resetAfterTest(true) and
+ * stores in $this->sharedtoprows the three rows that the tests prepend to the
+ * data of every dataset they build.
*
- * @return
+ * @return null
*/
- public function test_create_dataset() {
+ public function setUp() {
$this->resetAfterTest(true);
- $sharedtoprows = array(
+ $this->sharedtoprows = array(
array('var1', 'var2'),
array('value1', 'value2'),
array('header1', 'header2')
);
+ }
+
+ /**
+ * test_create_dataset
+ *
+ * @return null
+ */
+ public function test_create_dataset() {
$dataset1 = new \core_analytics\dataset_manager(1, 1, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
$dataset1->init_process();
- $dataset1data = array_merge($sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
+ $dataset1data = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
$f1 = $dataset1->store($dataset1data);
$dataset1->close_process();
/**
* test_merge_datasets
*
- * @return
+ * @return null
*/
public function test_merge_datasets() {
- $this->resetAfterTest(true);
-
- $sharedtoprows = array(
- array('var1', 'var2'),
- array('value1', 'value2'),
- array('header1', 'header2')
- );
$dataset1 = new \core_analytics\dataset_manager(1, 1, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
$dataset1->init_process();
- $dataset1data = array_merge($sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
+ $dataset1data = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
$f1 = $dataset1->store($dataset1data);
$dataset1->close_process();
$dataset2 = new \core_analytics\dataset_manager(1, 2, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
$dataset2->init_process();
- $dataset2data = array_merge($sharedtoprows, array(array('no', 'no', 'no')));
+ $dataset2data = array_merge($this->sharedtoprows, array(array('no', 'no', 'no')));
$f2 = $dataset2->store($dataset2data);
$dataset2->close_process();
$this->assertContains('value1', $mergedfilecontents);
$this->assertContains('header1', $mergedfilecontents);
}
+
+    /**
+     * test_get_pending_files
+     *
+     * Checks that dataset_manager::get_pending_files() returns nothing when there are no files,
+     * ignores evaluation files, keeps training and prediction files apart and discards the files
+     * already recorded in analytics_used_files.
+     *
+     * @return null
+     */
+    public function test_get_pending_files() {
+        global $DB;
+
+        $this->resetAfterTest();
+
+        $fakemodelid = 123;
+        $quarters = '\core\analytics\time_splitting\quarters';
+        $quartersaccum = '\core\analytics\time_splitting\quarters_accum';
+        $timesplittingids = array($quarters, $quartersaccum);
+
+        // No files.
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids));
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+
+        // We will reuse this analysable file to create training and prediction datasets (analysable level files are
+        // merged into training and prediction files).
+        $analysabledataset = new \core_analytics\dataset_manager($fakemodelid, 1, 'whatever',
+            \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
+        $analysabledataset->init_process();
+        $analysabledatasetdata = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
+        $file = $analysabledataset->store($analysabledatasetdata);
+        $analysabledataset->close_process();
+
+        // Evaluation files ignored.
+        \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid, $quarters,
+            \core_analytics\dataset_manager::LABELLED_FILEAREA, true);
+
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids));
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+
+        // Training and prediction files are not mixed up. Two training files are created, only the
+        // first one is flagged as used later on.
+        $trainingfile1 = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid, $quarters,
+            \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
+        \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid, $quarters,
+            \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
+
+        $bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids);
+        $this->assertArrayNotHasKey($quartersaccum, $bytimesplitting);
+        $this->assertCount(2, $bytimesplitting[$quarters]);
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+
+        $predictionfile = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid, $quarters,
+            \core_analytics\dataset_manager::UNLABELLED_FILEAREA, false);
+        $bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids);
+        $this->assertArrayNotHasKey($quartersaccum, $bytimesplitting);
+        $this->assertCount(1, $bytimesplitting[$quarters]);
+
+        // Already used for training and prediction are discarded.
+        $usedfile = (object)['modelid' => $fakemodelid, 'fileid' => $trainingfile1->get_id(), 'action' => 'trained',
+            'time' => time()];
+        $DB->insert_record('analytics_used_files', $usedfile);
+        $bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids);
+        $this->assertCount(1, $bytimesplitting[$quarters]);
+
+        $usedfile->fileid = $predictionfile->get_id();
+        $usedfile->action = 'predicted';
+        $DB->insert_record('analytics_used_files', $usedfile);
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+    }
}