From 325b3bdd8e92a4952e9e3db8da3b0eec69c8d252 Mon Sep 17 00:00:00 2001
From: David Monllao <davidm@moodle.com>
Date: Thu, 7 Sep 2017 14:42:17 +0200
Subject: [PATCH 1/2] MDL-59988 analytics: Files marked as used only if valid

- Basic unit test for minimum machine learning backends requirements
- Warning return messages now include not enough data
- Clear models when the predictions processor is changed
- Refined the name of a couple of constants / methods
---
 .../analytics/classes/output/renderer.php     | 10 +-
 analytics/classes/admin_setting_predictor.php |  9 ++
 analytics/classes/local/target/base.php       |  2 +-
 analytics/classes/local/target/discrete.php   |  5 +-
 analytics/classes/local/target/linear.php     |  2 +-
 analytics/classes/model.php                   | 35 ++++---
 analytics/tests/model_test.php                |  7 +-
 analytics/tests/prediction_test.php           | 94 ++++++++++++++++++-
 lang/en/analytics.php                         |  2 +-
 lib/mlbackend/php/classes/processor.php       | 19 +++-
 lib/mlbackend/python/classes/processor.php    | 14 ++-
 11 files changed, 165 insertions(+), 34 deletions(-)

diff --git a/admin/tool/analytics/classes/output/renderer.php b/admin/tool/analytics/classes/output/renderer.php
index 32bdcfc4482..628b097f264 100644
--- a/admin/tool/analytics/classes/output/renderer.php
+++ b/admin/tool/analytics/classes/output/renderer.php
@@ -159,11 +159,12 @@ class renderer extends plugin_renderer_base {
             if ($trainresults->status == 0) {
                 $output .= $OUTPUT->notification(get_string('trainingprocessfinished', 'tool_analytics'),
                     \core\output\notification::NOTIFY_SUCCESS);
-            } else if ($trainresults->status === \core_analytics\model::NO_DATASET) {
+            } else if ($trainresults->status === \core_analytics\model::NO_DATASET ||
+                    $trainresults->status === \core_analytics\model::NOT_ENOUGH_DATA) {
                 $output .= $OUTPUT->notification(get_string('nodatatotrain', 'tool_analytics'),
                     \core\output\notification::NOTIFY_WARNING);
             } else {
-                $output .= $OUTPUT->notification(get_string('generalerror', 'analytics', $trainresults->status),
+                $output .= $OUTPUT->notification(get_string('generalerror', 'tool_analytics', $trainresults->status),
                     \core\output\notification::NOTIFY_ERROR);
             }
         }
@@ -183,11 +184,12 @@ class renderer extends plugin_renderer_base {
             if ($predictresults->status == 0) {
                 $output .= $OUTPUT->notification(get_string('predictionprocessfinished', 'tool_analytics'),
                     \core\output\notification::NOTIFY_SUCCESS);
-            } else if ($predictresults->status === \core_analytics\model::NO_DATASET) {
+            } else if ($predictresults->status === \core_analytics\model::NO_DATASET ||
+                    $predictresults->status === \core_analytics\model::NOT_ENOUGH_DATA) {
                 $output .= $OUTPUT->notification(get_string('nodatatopredict', 'tool_analytics'),
                     \core\output\notification::NOTIFY_WARNING);
             } else {
-                $output .= $OUTPUT->notification(get_string('generalerror', 'analytics', $predictresults->status),
+                $output .= $OUTPUT->notification(get_string('generalerror', 'tool_analytics', $predictresults->status),
                     \core\output\notification::NOTIFY_ERROR);
             }
         }
diff --git a/analytics/classes/admin_setting_predictor.php b/analytics/classes/admin_setting_predictor.php
index 203ad5542e6..c523697ef71 100644
--- a/analytics/classes/admin_setting_predictor.php
+++ b/analytics/classes/admin_setting_predictor.php
@@ -58,6 +58,15 @@ class admin_setting_predictor extends \admin_setting_configselect {
             return get_string('errorprocessornotready', 'analytics', $isready);
         }
 
+        $currentvalue = get_config('analytics', 'predictionsprocessor');
+        if (!empty($currentvalue) && $currentvalue != str_replace('\\\\', '\\', $data)) {
+            // Clear all models data.
+            $models = \core_analytics\manager::get_all_models();
+            foreach ($models as $model) {
+                $model->clear();
+            }
+        }
+
         return ($this->config_write($this->name, $data) ? '' : get_string('errorsetting', 'admin'));
     }
 }
diff --git a/analytics/classes/local/target/base.php b/analytics/classes/local/target/base.php
index 5b741d27678..3e84120e7ce 100644
--- a/analytics/classes/local/target/base.php
+++ b/analytics/classes/local/target/base.php
@@ -261,7 +261,7 @@ abstract class base extends \core_analytics\calculable {
     }
 
     /**
-     * Should the model callback be triggered?
+     * This method determines if a prediction is interesting for the model or not.
      *
      * @param mixed $predictedvalue
      * @param float $predictionscore
diff --git a/analytics/classes/local/target/discrete.php b/analytics/classes/local/target/discrete.php
index cbd8fe09d76..26044c7ac5d 100644
--- a/analytics/classes/local/target/discrete.php
+++ b/analytics/classes/local/target/discrete.php
@@ -152,7 +152,10 @@ abstract class discrete extends base {
     }
 
     /**
-     * Should the model callback be triggered?
+     * This method determines if a prediction is interesting for the model or not.
+     *
+     * This method internally calls ignored_predicted_classes to skip classes
+     * flagged by the target as not important for users.
      *
      * @param mixed $predictedvalue
      * @param float $predictionscore
diff --git a/analytics/classes/local/target/linear.php b/analytics/classes/local/target/linear.php
index d16ad96075f..f10d8438540 100644
--- a/analytics/classes/local/target/linear.php
+++ b/analytics/classes/local/target/linear.php
@@ -84,7 +84,7 @@ abstract class linear extends base {
     }
 
     /**
-     * Should the model callback be triggered?
+     * This method determines if a prediction is interesting for the model or not.
      *
      * @param mixed $predictedvalue
      * @param float $predictionscore
diff --git a/analytics/classes/model.php b/analytics/classes/model.php
index 44b9e1e8953..2ec50e8335b 100644
--- a/analytics/classes/model.php
+++ b/analytics/classes/model.php
@@ -53,12 +53,12 @@ class model {
     /**
      * Model with low prediction accuracy.
      */
-    const EVALUATE_LOW_SCORE = 4;
+    const LOW_SCORE = 4;
 
     /**
      * Not enough data to evaluate the model properly.
      */
-    const EVALUATE_NOT_ENOUGH_DATA = 8;
+    const NOT_ENOUGH_DATA = 8;
 
     /**
      * Invalid analysable for the time splitting method.
@@ -437,7 +437,7 @@ class model {
                 $this->model->indicators !== $indicatorsstr) {
 
             // Delete generated predictions before changing the model version.
-            $this->clear_model();
+            $this->clear();
 
             // It needs to be reset as the version changes.
             $this->uniqueid = null;
@@ -474,9 +474,9 @@ class model {
 
         \core_analytics\manager::check_can_manage_models();
 
-        $this->clear_model();
+        $this->clear();
 
-        // Method self::clear_model is already clearing the current model version.
+        // Method self::clear is already clearing the current model version.
         $predictor = \core_analytics\manager::get_predictions_processor();
         $predictor->delete_output_dir($this->get_output_dir(array(), true));
 
@@ -633,6 +633,10 @@ class model {
         $result->status = $predictorresult->status;
         $result->info = $predictorresult->info;
 
+        if ($result->status !== self::OK) {
+            return $result;
+        }
+
         $this->flag_file_as_used($samplesfile, 'trained');
 
         // Mark the model as trained if it wasn't.
@@ -717,6 +721,10 @@ class model {
             $result->predictions = $this->format_predictor_predictions($predictorresult);
         }
 
+        if ($result->status !== self::OK) {
+            return $result;
+        }
+
         if ($result->predictions) {
             $samplecontexts = $this->execute_prediction_callbacks($result->predictions, $indicatorcalculations);
         }
@@ -780,15 +788,16 @@ class model {
 
         // Here we will store all predictions' contexts, this will be used to limit which users will see those predictions.
         $samplecontexts = array();
+        $records = array();
 
         foreach ($predictions as $uniquesampleid => $prediction) {
 
+            // The unique sample id contains both the sampleid and the rangeindex.
+            list($sampleid, $rangeindex) = $this->get_time_splitting()->infer_sample_info($uniquesampleid);
+
             if ($this->get_target()->triggers_callback($prediction->prediction, $prediction->predictionscore)) {
 
-                // The unique sample id contains both the sampleid and the rangeindex.
-                list($sampleid, $rangeindex) = $this->get_time_splitting()->infer_sample_info($uniquesampleid);
-
-                // Store the predicted values.
+                // Prepare the record to store the predicted values.
                 list($record, $samplecontext) = $this->prepare_prediction_record($sampleid, $rangeindex, $prediction->prediction,
                     $prediction->predictionscore, json_encode($indicatorcalculations[$uniquesampleid]));
 
@@ -990,7 +999,7 @@ class model {
             }
 
             // Delete generated predictions before changing the model version.
-            $this->clear_model();
+            $this->clear();
 
             // It needs to be reset as the version changes.
             $this->uniqueid = null;
@@ -1268,7 +1277,7 @@ class model {
             $outputdir = rtrim($CFG->dataroot, '/') . DIRECTORY_SEPARATOR . 'models';
         }
 
-        // Append model id
+        // Append model id.
         $outputdir .= DIRECTORY_SEPARATOR . $this->model->id;
         if (!$onlymodelid) {
             // Append version + subdirs.
@@ -1435,9 +1444,11 @@ class model {
      *
      * @return void
      */
-    private function clear_model() {
+    public function clear() {
         global $DB;
 
+        \core_analytics\manager::check_can_manage_models();
+
         // Delete current model version stored stuff.
         $predictor = \core_analytics\manager::get_predictions_processor();
         $predictor->clear_model($this->get_unique_id(), $this->get_output_dir());
diff --git a/analytics/tests/model_test.php b/analytics/tests/model_test.php
index ef2715ab9eb..2685b64bbcb 100644
--- a/analytics/tests/model_test.php
+++ b/analytics/tests/model_test.php
@@ -155,13 +155,10 @@ class analytics_model_testcase extends advanced_testcase {
         $modelversionoutputdir = $this->model->get_output_dir();
         $this->assertTrue(is_dir($modelversionoutputdir));
 
-        // Update to an empty time splitting method to force clear_model execution.
-        $this->model->update(1, false, '');
+        // Clear the model directly; this should delete the model version output directory.
+        $this->model->clear();
         $this->assertFalse(is_dir($modelversionoutputdir));
 
-        // Restore previous time splitting method.
-        $this->model->enable('\core\analytics\time_splitting\no_splitting');
-
         // Check that most of the stuff got deleted.
         $this->assertEquals(1, $DB->count_records('analytics_models', array('id' => $this->modelobj->id)));
         $this->assertEquals(1, $DB->count_records('analytics_models_log', array('modelid' => $this->modelobj->id)));
diff --git a/analytics/tests/prediction_test.php b/analytics/tests/prediction_test.php
index 7f30037797b..f97db57beeb 100644
--- a/analytics/tests/prediction_test.php
+++ b/analytics/tests/prediction_test.php
@@ -273,6 +273,96 @@ class core_analytics_prediction_testcase extends advanced_testcase {
         return $this->add_prediction_processors($cases);
     }
 
+    /**
+     * Test the system classifiers returns.
+     *
+     * This test checks that all mlbackend plugins in the system are able to return proper status codes
+     * even under weird situations.
+     *
+     * @dataProvider provider_ml_classifiers_return
+     * @param string $success Expected outcome: 'yes', 'no' or 'maybe'
+     * @param int $nsamples
+     * @param int[] $classes
+     * @param string $predictionsprocessorclass
+     * @return void
+     */
+    public function test_ml_classifiers_return($success, $nsamples, $classes, $predictionsprocessorclass) {
+        $this->resetAfterTest();
+
+        $predictionsprocessor = \core_analytics\manager::get_predictions_processor($predictionsprocessorclass, false);
+        if ($predictionsprocessor->is_ready() !== true) {
+            $this->markTestSkipped('Skipping ' . $predictionsprocessorclass . ' as the predictor is not ready.');
+        }
+
+        if ($nsamples % count($classes) != 0) {
+            throw new \coding_exception('The number of samples should be divisible by the number of classes');
+        }
+        $samplesperclass = $nsamples / count($classes);
+
+        // Metadata (we pass 2 classes even if $classes only provides samples of 1 class as we want to test
+        // what the backend does in this case).
+        $dataset = "nfeatures,targetclasses,targettype" . PHP_EOL;
+        $dataset .= "3,\"[0,1]\",\"discrete\"" . PHP_EOL;
+
+        // Headers.
+        $dataset .= "feature1,feature2,feature3,target" . PHP_EOL;
+        foreach ($classes as $class) {
+            for ($i = 0; $i < $samplesperclass; $i++) {
+                $dataset .= "1,0,1,$class" . PHP_EOL;
+            }
+        }
+
+        $trainingfile = array(
+            'contextid' => \context_system::instance()->id,
+            'component' => 'analytics',
+            'filearea' => 'labelled',
+            'itemid' => 123,
+            'filepath' => '/',
+            'filename' => 'whocares.csv'
+        );
+        $fs = get_file_storage();
+        $dataset = $fs->create_file_from_string($trainingfile, $dataset);
+
+        // Training should work correctly if at least 1 sample of each class is included.
+        $dir = make_request_directory();
+        $result = $predictionsprocessor->train_classification('whatever', $dataset, $dir);
+
+        switch ($success) {
+            case 'yes':
+                $this->assertEquals(\core_analytics\model::OK, $result->status);
+                break;
+            case 'no':
+                $this->assertNotEquals(\core_analytics\model::OK, $result->status);
+                break;
+            case 'maybe':
+            default:
+                // We just check that an object is returned so we don't have an empty check,
+                // what we really want to check is that an exception was not thrown.
+                $this->assertInstanceOf(\stdClass::class, $result);
+        }
+    }
+
+    /**
+     * test_ml_classifiers_return provider
+     *
+     * We cannot be very specific here as test_ml_classifiers_return only checks that
+     * mlbackend plugins behave as expected and properly control backend errors even
+     * under weird situations.
+     *
+     * @return array
+     */
+    public function provider_ml_classifiers_return() {
+        // Using verbose options as the first argument for readability.
+        $cases = array(
+            '1-samples' => array('maybe', 1, [0]),
+            '2-samples-same-class' => array('maybe', 2, [0]),
+            '2-samples-different-classes' => array('yes', 2, [0, 1]),
+            '4-samples-different-classes' => array('yes', 4, [0, 1])
+        );
+
+        // We need to test all system prediction processors.
+        return $this->add_prediction_processors($cases);
+    }
 
     /**
      * Basic test to check that prediction processors work as expected.
@@ -426,8 +516,8 @@ class core_analytics_prediction_testcase extends advanced_testcase {
                 'expectedresults' => array(
                     // The course duration is too much to be processed by in weekly basis.
                     '\core\analytics\time_splitting\weekly' => \core_analytics\model::NO_DATASET,
-                    '\core\analytics\time_splitting\single_range' => \core_analytics\model::EVALUATE_LOW_SCORE,
-                    '\core\analytics\time_splitting\quarters' => \core_analytics\model::EVALUATE_LOW_SCORE,
+                    '\core\analytics\time_splitting\single_range' => \core_analytics\model::LOW_SCORE,
+                    '\core\analytics\time_splitting\quarters' => \core_analytics\model::LOW_SCORE,
                 )
             ),
             'good' => array(
diff --git a/lang/en/analytics.php b/lang/en/analytics.php
index 40ebdf58a1a..cb0ff33d7a4 100644
--- a/lang/en/analytics.php
+++ b/lang/en/analytics.php
@@ -82,7 +82,7 @@ $string['novalidsamples'] = 'No valid samples available';
 $string['onlycli'] = 'Analytics processes execution via command line only';
 $string['onlycliinfo'] = 'Analytics processes like evaluating models, training machine learning algorithms or getting predictions can take some time, they will run as cron tasks and they can be forced via command line. Disable this setting if you want your site managers to be able to run these processes manually via web interface';
 $string['predictionsprocessor'] = 'Predictions processor';
-$string['predictionsprocessor_help'] = 'Prediction processors are the machine learning backends that process the datasets generated by calculating models\' indicators and targets.';
+$string['predictionsprocessor_help'] = 'A predictions processor is the machine-learning backend that processes the datasets generated by calculating models\' indicators and targets. All trained algorithms and predictions will be deleted if you change to another predictions processor.';
 $string['processingsitecontents'] = 'Processing site contents';
 $string['successfullyanalysed'] = 'Successfully analysed';
 $string['timesplittingmethod'] = 'Time-splitting method';
diff --git a/lib/mlbackend/php/classes/processor.php b/lib/mlbackend/php/classes/processor.php
index 9a84c5cb88f..cfba5933860 100644
--- a/lib/mlbackend/php/classes/processor.php
+++ b/lib/mlbackend/php/classes/processor.php
@@ -129,16 +129,27 @@ class processor implements \core_analytics\classifier, \core_analytics\regressor
             $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
             $targets[] = intval($data[$metadata['nfeatures']]);
 
-            if (count($samples) === self::BATCH_SIZE) {
+            $nsamples = count($samples);
+            if ($nsamples === self::BATCH_SIZE) {
                 // Training it batches to avoid running out of memory.
 
                 $classifier->partialTrain($samples, $targets, array(0, 1));
                 $samples = array();
                 $targets = array();
             }
+            if (empty($morethan1sample) && $nsamples > 1) {
+                $morethan1sample = true;
+            }
         }
         fclose($fh);
 
+        if (empty($morethan1sample)) {
+            $resultobj = new \stdClass();
+            $resultobj->status = \core_analytics\model::NO_DATASET;
+            $resultobj->info = array();
+            return $resultobj;
+        }
+
         // Train the remaining samples.
         if ($samples) {
             $classifier->partialTrain($samples, $targets, array(0, 1));
@@ -288,7 +299,7 @@ class processor implements \core_analytics\classifier, \core_analytics\regressor
         }
         if (!empty($notenoughdata)) {
             $resultobj = new \stdClass();
-            $resultobj->status = \core_analytics\model::EVALUATE_NOT_ENOUGH_DATA;
+            $resultobj->status = \core_analytics\model::NOT_ENOUGH_DATA;
             $resultobj->score = 0;
             $resultobj->info = array(get_string('errornotenoughdata', 'mlbackend_php'));
             return $resultobj;
@@ -350,7 +361,7 @@ class processor implements \core_analytics\classifier, \core_analytics\regressor
 
         // If each iteration results varied too much we need more data to confirm that this is a valid model.
         if ($modeldev > $maxdeviation) {
-            $resultobj->status = $resultobj->status + \core_analytics\model::EVALUATE_NOT_ENOUGH_DATA;
+            $resultobj->status = $resultobj->status + \core_analytics\model::NOT_ENOUGH_DATA;
             $a = new \stdClass();
             $a->deviation = $modeldev;
             $a->accepteddeviation = $maxdeviation;
@@ -358,7 +369,7 @@ class processor implements \core_analytics\classifier, \core_analytics\regressor
         }
 
         if ($resultobj->score < \core_analytics\model::MIN_SCORE) {
-            $resultobj->status = $resultobj->status + \core_analytics\model::EVALUATE_LOW_SCORE;
+            $resultobj->status = $resultobj->status + \core_analytics\model::LOW_SCORE;
             $a = new \stdClass();
             $a->score = $resultobj->score;
             $a->minscore = \core_analytics\model::MIN_SCORE;
diff --git a/lib/mlbackend/python/classes/processor.php b/lib/mlbackend/python/classes/processor.php
index 1ba59c8ff5a..69420b0c58a 100644
--- a/lib/mlbackend/python/classes/processor.php
+++ b/lib/mlbackend/python/classes/processor.php
@@ -38,7 +38,7 @@ class processor implements  \core_analytics\classifier, \core_analytics\regresso
     /**
      * The required version of the python package that performs all calculations.
      */
-    const REQUIRED_PIP_PACKAGE_VERSION = '0.0.2';
+    const REQUIRED_PIP_PACKAGE_VERSION = '0.0.3';
 
     /**
      * The path to the Python bin.
@@ -150,7 +150,11 @@ class processor implements  \core_analytics\classifier, \core_analytics\regresso
         }
 
         if ($exitcode != 0) {
-            throw new \moodle_exception('errorpredictionsprocessor', 'analytics', '', implode(', ', $resultobj->errors));
+            $errors = $resultobj->errors;
+            if (is_array($errors)) {
+                $errors = implode(', ', $errors);
+            }
+            $resultobj->info = array(get_string('errorpredictionsprocessor', 'analytics', $errors));
         }
 
         return $resultobj;
@@ -191,7 +195,11 @@ class processor implements  \core_analytics\classifier, \core_analytics\regresso
         }
 
         if ($exitcode != 0) {
-            throw new \moodle_exception('errorpredictionsprocessor', 'analytics', '', implode(', ', $resultobj->errors));
+            $errors = $resultobj->errors;
+            if (is_array($errors)) {
+                $errors = implode(', ', $errors);
+            }
+            $resultobj->info = array(get_string('errorpredictionsprocessor', 'analytics', $errors));
         }
 
         return $resultobj;

From 2dca1339163b14fdfbae6fa000f45c11d4b64a9e Mon Sep 17 00:00:00 2001
From: David Monllao <davidm@moodle.com>
Date: Tue, 10 Oct 2017 09:45:21 +0200
Subject: [PATCH 2/2] MDL-59988 analytics: Process pending training and
 prediction files

---
 analytics/classes/dataset_manager.php     | 55 +++++++++++++
 analytics/classes/local/analyser/base.php | 28 ++++++-
 analytics/classes/model.php               |  2 +-
 analytics/tests/dataset_manager_test.php  | 97 +++++++++++++++++++----
 lib/db/install.xml                        |  2 +-
 lib/db/upgrade.php                        | 24 ++++++
 version.php                               |  2 +-
 7 files changed, 190 insertions(+), 20 deletions(-)

diff --git a/analytics/classes/dataset_manager.php b/analytics/classes/dataset_manager.php
index 4b457e88682..b4671688cd4 100644
--- a/analytics/classes/dataset_manager.php
+++ b/analytics/classes/dataset_manager.php
@@ -202,6 +202,61 @@ class dataset_manager {
             '/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/', self::EVALUATION_FILENAME);
     }
 
+    /**
+     * Gets the list of files that couldn't be previously used for training and prediction.
+     *
+     * @param int $modelid
+     * @param bool $includetarget
+     * @param string[] $timesplittingids
+     * @return array Pending files indexed by time splitting method id.
+     */
+    public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
+        global $DB;
+
+        $fs = get_file_storage();
+
+        if ($includetarget) {
+            $filearea = self::LABELLED_FILEAREA;
+            $usedfileaction = 'trained';
+        } else {
+            $filearea = self::UNLABELLED_FILEAREA;
+            $usedfileaction = 'predicted';
+        }
+
+        $select = 'modelid = :modelid AND action = :action';
+        $params = array('modelid' => $modelid, 'action' => $usedfileaction);
+        $usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);
+
+        // Very likely that we will only have 1 time splitting method here.
+        $filesbytimesplitting = array();
+        foreach ($timesplittingids as $timesplittingid) {
+
+            $filepath = '/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/';
+            $files = $fs->get_directory_files(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath);
+            foreach ($files as $file) {
+
+                // Discard evaluation files.
+                if ($file->get_filename() === self::EVALUATION_FILENAME) {
+                    continue;
+                }
+
+                // No dirs.
+                if ($file->is_directory()) {
+                    continue;
+                }
+
+                // Already used for training or prediction (depending on $includetarget).
+                if (in_array($file->get_id(), $usedfileids)) {
+                    continue;
+                }
+
+                $filesbytimesplitting[$timesplittingid][] = $file;
+            }
+        }
+
+        return $filesbytimesplitting;
+    }
+
     /**
      * Deletes previous evaluation files of this model.
      *
diff --git a/analytics/classes/local/analyser/base.php b/analytics/classes/local/analyser/base.php
index e516977ef3a..8d792b48ddd 100644
--- a/analytics/classes/local/analyser/base.php
+++ b/analytics/classes/local/analyser/base.php
@@ -190,13 +190,13 @@ abstract class base {
         list($analysables, $processedanalysables) = $this->get_sorted_analysables($includetarget);
 
         $inittime = time();
-        foreach ($analysables as $analysable) {
+        foreach ($analysables as $key => $analysable) {
 
             $files = $this->process_analysable($analysable, $includetarget);
 
             // Later we will need to aggregate data by time splitting method.
             foreach ($files as $timesplittingid => $file) {
-                $filesbytimesplitting[$timesplittingid][$analysable->get_id()] = $file;
+                $filesbytimesplitting[$timesplittingid][] = $file;
             }
 
             $this->update_analysable_analysed_time($processedanalysables, $analysable->get_id(), $includetarget);
@@ -208,11 +208,35 @@ abstract class base {
                     break;
                 }
             }
+
+            unset($analysables[$key]);
+        }
+
+        if ($this->options['evaluation'] === false) {
+            // Look for previous training and prediction files we generated and couldn't be used
+            // by machine learning backends because they weren't big enough.
+
+            $pendingfiles = \core_analytics\dataset_manager::get_pending_files($this->modelid, $includetarget,
+                array_keys($filesbytimesplitting));
+            foreach ($pendingfiles as $timesplittingid => $files) {
+                foreach ($files as $file) {
+                    $filesbytimesplitting[$timesplittingid][] = $file;
+                }
+            }
         }
 
         // We join the datasets by time splitting method.
         $timesplittingfiles = $this->merge_analysable_files($filesbytimesplitting, $includetarget);
 
+        if (!empty($pendingfiles)) {
+            // We must remove them now as they are already part of another dataset.
+            foreach ($pendingfiles as $timesplittingid => $files) {
+                foreach ($files as $file) {
+                    $file->delete();
+                }
+            }
+        }
+
         return $timesplittingfiles;
     }
 
diff --git a/analytics/classes/model.php b/analytics/classes/model.php
index 2ec50e8335b..f04eb96268a 100644
--- a/analytics/classes/model.php
+++ b/analytics/classes/model.php
@@ -693,7 +693,7 @@ class model {
         $samplesfile = $samplesdata[$this->model->timesplitting];
 
         // We need to throw an exception if we are trying to predict stuff that was already predicted.
-        $params = array('modelid' => $this->model->id, 'fileid' => $samplesfile->get_id(), 'action' => 'predicted');
+        $params = array('modelid' => $this->model->id, 'action' => 'predicted', 'fileid' => $samplesfile->get_id());
         if ($predicted = $DB->get_record('analytics_used_files', $params)) {
             throw new \moodle_exception('erroralreadypredict', 'analytics', '', $samplesfile->get_id());
         }
diff --git a/analytics/tests/dataset_manager_test.php b/analytics/tests/dataset_manager_test.php
index d36baf5ede4..3856c5f8b6f 100644
--- a/analytics/tests/dataset_manager_test.php
+++ b/analytics/tests/dataset_manager_test.php
@@ -34,22 +34,30 @@ defined('MOODLE_INTERNAL') || die();
 class dataset_manager_testcase extends advanced_testcase {
 
     /**
-     * test_create_dataset
+     * setUp
      *
-     * @return
+     * @return void
      */
-    public function test_create_dataset() {
+    public function setUp() {
         $this->resetAfterTest(true);
 
-        $sharedtoprows = array(
+        $this->sharedtoprows = array(
             array('var1', 'var2'),
             array('value1', 'value2'),
             array('header1', 'header2')
         );
+    }
+
+    /**
+     * test_create_dataset
+     *
+     * @return void
+     */
+    public function test_create_dataset() {
 
         $dataset1 = new \core_analytics\dataset_manager(1, 1, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
         $dataset1->init_process();
-        $dataset1data = array_merge($sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
+        $dataset1data = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
         $f1 = $dataset1->store($dataset1data);
         $dataset1->close_process();
 
@@ -63,26 +71,19 @@ class dataset_manager_testcase extends advanced_testcase {
     /**
      * test_merge_datasets
      *
-     * @return
+     * @return void
      */
     public function test_merge_datasets() {
-        $this->resetAfterTest(true);
-
-        $sharedtoprows = array(
-            array('var1', 'var2'),
-            array('value1', 'value2'),
-            array('header1', 'header2')
-        );
 
         $dataset1 = new \core_analytics\dataset_manager(1, 1, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
         $dataset1->init_process();
-        $dataset1data = array_merge($sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
+        $dataset1data = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
         $f1 = $dataset1->store($dataset1data);
         $dataset1->close_process();
 
         $dataset2 = new \core_analytics\dataset_manager(1, 2, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
         $dataset2->init_process();
-        $dataset2data = array_merge($sharedtoprows, array(array('no', 'no', 'no')));
+        $dataset2data = array_merge($this->sharedtoprows, array(array('no', 'no', 'no')));
         $f2 = $dataset2->store($dataset2data);
         $dataset2->close_process();
 
@@ -97,4 +98,70 @@ class dataset_manager_testcase extends advanced_testcase {
         $this->assertContains('value1', $mergedfilecontents);
         $this->assertContains('header1', $mergedfilecontents);
     }
+
+    /**
+     * Tests that get_pending_files() skips evaluation files, separates training from prediction files and discards used ones.
+     *
+     * @return null
+     */
+    public function test_get_pending_files() {
+        global $DB;
+
+        $this->resetAfterTest();
+
+        $fakemodelid = 123;
+        $timesplittingids = array(
+            '\core\analytics\time_splitting\quarters',
+            '\core\analytics\time_splitting\quarters_accum',
+        );
+
+        // No files.
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids));
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+
+        // We will reuse this analysable file to create training and prediction datasets (analysable level files are
+        // merged into training and prediction files).
+        $analysabledataset = new \core_analytics\dataset_manager($fakemodelid, 1, 'whatever',
+            \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
+        $analysabledataset->init_process();
+        $analysabledatasetdata = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
+        $file = $analysabledataset->store($analysabledatasetdata);
+        $analysabledataset->close_process();
+
+        // Evaluation files ignored.
+        $evaluationdataset = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
+            '\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::LABELLED_FILEAREA, true);
+
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids));
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+
+        // Training and prediction files are not mixed up.
+        $trainingfile1 = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
+            '\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
+        $trainingfile2 = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
+            '\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
+
+        $bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids);
+        $this->assertFalse(isset($bytimesplitting['\core\analytics\time_splitting\quarters_accum']));
+        $this->assertCount(2, $bytimesplitting['\core\analytics\time_splitting\quarters']);
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+
+        $predictionfile = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
+            '\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::UNLABELLED_FILEAREA, false);
+        $bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids);
+        $this->assertFalse(isset($bytimesplitting['\core\analytics\time_splitting\quarters_accum']));
+        $this->assertCount(1, $bytimesplitting['\core\analytics\time_splitting\quarters']);
+
+        // Files already used for training or prediction are discarded.
+        $usedfile = (object)['modelid' => $fakemodelid, 'fileid' => $trainingfile1->get_id(), 'action' => 'trained',
+            'time' => time()];
+        $DB->insert_record('analytics_used_files', $usedfile);
+        $bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids);
+        $this->assertCount(1, $bytimesplitting['\core\analytics\time_splitting\quarters']);
+
+        $usedfile->fileid = $predictionfile->get_id();
+        $usedfile->action = 'predicted';
+        $DB->insert_record('analytics_used_files', $usedfile);
+        $this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
+    }
 }
diff --git a/lib/db/install.xml b/lib/db/install.xml
index e636b719861..bbf96b4be67 100644
--- a/lib/db/install.xml
+++ b/lib/db/install.xml
@@ -3687,7 +3687,7 @@
         <KEY NAME="fileid" TYPE="foreign" FIELDS="fileid" REFTABLE="files" REFFIELDS="id"/>
       </KEYS>
       <INDEXES>
-        <INDEX NAME="modelidandfileidandaction" UNIQUE="false" FIELDS="modelid, fileid, action" COMMENT="Index on modelid and fileid and action"/>
+        <INDEX NAME="modelidandactionandfileid" UNIQUE="false" FIELDS="modelid, action, fileid" COMMENT="Index on modelid and action and fileid"/>
       </INDEXES>
     </TABLE>
     <TABLE NAME="analytics_indicator_calc" COMMENT="Stored indicator calculations">
diff --git a/lib/db/upgrade.php b/lib/db/upgrade.php
index 5877954f704..e8b25fc1825 100644
--- a/lib/db/upgrade.php
+++ b/lib/db/upgrade.php
@@ -2725,5 +2725,29 @@ function xmldb_main_upgrade($oldversion) {
         upgrade_main_savepoint(true, 2017101200.00);
     }
 
+    // Index modification upgrade step.
+    if ($oldversion < 2017101300.01) {
+
+        $table = new xmldb_table('analytics_used_files');
+
+        // Define index modelidandfileidandaction (not unique) to be dropped from analytics_used_files.
+        $index = new xmldb_index('modelidandfileidandaction', XMLDB_INDEX_NOTUNIQUE, array('modelid', 'fileid', 'action'));
+
+        // Conditionally launch drop index modelidandfileidandaction.
+        if ($dbman->index_exists($table, $index)) {
+            $dbman->drop_index($table, $index);
+        }
+
+        // Define index modelidandactionandfileid (not unique) to be added to analytics_used_files.
+        $index = new xmldb_index('modelidandactionandfileid', XMLDB_INDEX_NOTUNIQUE, array('modelid', 'action', 'fileid'));
+
+        // Conditionally launch add index modelidandactionandfileid.
+        if (!$dbman->index_exists($table, $index)) {
+            $dbman->add_index($table, $index);
+        }
+
+        // Main savepoint reached.
+        upgrade_main_savepoint(true, 2017101300.01);
+    }
     return true;
 }
diff --git a/version.php b/version.php
index f2d9fcc0091..5cf6c98e6ac 100644
--- a/version.php
+++ b/version.php
@@ -29,7 +29,7 @@
 
 defined('MOODLE_INTERNAL') || die();
 
-$version  = 2017101300.00;              // YYYYMMDD      = weekly release date of this DEV branch.
+$version  = 2017101300.01;              // YYYYMMDD      = weekly release date of this DEV branch.
                                         //         RR    = release increments - 00 in DEV branches.
                                         //           .XX = incremental changes.