MDL-59116 analytics: Multiple predictions for each analysable time range

We now predict using only the most recent range available. This means
that if someone upgrades to Moodle 3.4 three quarters of the way through
a course, we will only calculate the latest range; previous ranges were
not displayed anyway once more recent predictions were available.

This commit deletes all previous predictions :) This shouldn't be a
problem in master as we don't provide any guarantee. The alternative
(retrieving sampleids from mdl_files) would have been slow and a waste of
time, and would have required horrible code in an upgrade step (text
fields do not accept defaults, nor can we use NOTNULL).
This commit is contained in:
David Monllao 2017-07-21 17:46:42 +02:00
parent 8146b1f06d
commit 00da1e6010
7 changed files with 190 additions and 66 deletions

View File

@ -376,8 +376,8 @@ abstract class base {
// All ranges are used when we are calculating data for training.
$ranges = $timesplitting->get_all_ranges();
} else {
// Only some ranges can be used for prediction (it depends on the time range where we are right now).
$ranges = $this->get_prediction_ranges($timesplitting);
// The latest range that has not yet been used for prediction (it depends on the time range where we are right now).
$ranges = $this->get_most_recent_prediction_range($timesplitting);
}
// There is no need to keep track of the evaluated samples and ranges as we always evaluate the whole dataset.
@ -385,12 +385,12 @@ abstract class base {
if (empty($ranges)) {
$result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
$result->message = get_string('nonewdata', 'analytics');
$result->message = get_string('noranges', 'analytics');
return $result;
}
// We skip all samples that are already part of a training dataset, even if they have noe been used for training yet.
$sampleids = $this->filter_out_train_samples($sampleids, $timesplitting);
// We skip all samples that are already part of a training dataset, even if they have not been used for prediction.
$this->filter_out_train_samples($sampleids, $timesplitting);
if (count($sampleids) === 0) {
$result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
@ -400,13 +400,19 @@ abstract class base {
// Only when processing data for predictions.
if ($target === false) {
// We also filter out ranges that have already been used for predictions.
$ranges = $this->filter_out_prediction_ranges($ranges, $timesplitting);
// We also filter out samples and ranges that have already been used for predictions.
$this->filter_out_prediction_samples_and_ranges($sampleids, $ranges, $timesplitting);
}
if (count($sampleids) === 0) {
$result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
$result->message = get_string('nonewdata', 'analytics');
return $result;
}
if (count($ranges) === 0) {
$result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
$result->message = get_string('nonewtimeranges', 'analytics');
$result->message = get_string('nonewranges', 'analytics');
return $result;
}
}
@ -469,7 +475,7 @@ abstract class base {
if ($target) {
$this->save_train_samples($sampleids, $timesplitting, $file);
} else {
$this->save_prediction_ranges($ranges, $timesplitting);
$this->save_prediction_samples($sampleids, $ranges, $timesplitting);
}
}
@ -480,25 +486,28 @@ abstract class base {
}
/**
* Returns the ranges of a time splitting that can be used to predict.
* Returns the most recent range that can be used to predict.
*
* @param \core_analytics\local\time_splitting\base $timesplitting
* @return array
*/
protected function get_prediction_ranges($timesplitting) {
protected function get_most_recent_prediction_range($timesplitting) {
$now = time();
$ranges = $timesplitting->get_all_ranges();
// Opposite order as we are interested in the last range that can be used for prediction.
arsort($ranges);
// We already provided the analysable to the time splitting method, there is no need to feed it back.
$predictionranges = array();
foreach ($timesplitting->get_all_ranges() as $rangeindex => $range) {
foreach ($ranges as $rangeindex => $range) {
if ($timesplitting->ready_to_predict($range)) {
// We need to maintain the same indexes.
$predictionranges[$rangeindex] = $range;
return array($rangeindex => $range);
}
}
return $predictionranges;
return array();
}
/**
@ -506,9 +515,8 @@ abstract class base {
*
* @param int[] $sampleids
* @param \core_analytics\local\time_splitting\base $timesplitting
* @return int[]
*/
protected function filter_out_train_samples($sampleids, $timesplitting) {
protected function filter_out_train_samples(&$sampleids, $timesplitting) {
global $DB;
$params = array('modelid' => $this->modelid, 'analysableid' => $timesplitting->get_analysable()->get_id(),
@ -526,32 +534,43 @@ abstract class base {
$sampleids = array_diff_key($sampleids, $usedsamples);
}
}
return $sampleids;
}
/**
* Filters out samples that have already been used for prediction.
*
* @param int[] $sampleids
* @param array $ranges
* @param \core_analytics\local\time_splitting\base $timesplitting
* @return int[]
*/
protected function filter_out_prediction_ranges($ranges, $timesplitting) {
protected function filter_out_prediction_samples_and_ranges(&$sampleids, &$ranges, $timesplitting) {
global $DB;
if (count($ranges) > 1) {
throw new \coding_exception('$ranges argument should only contain one range');
}
$rangeindex = key($ranges);
$params = array('modelid' => $this->modelid, 'analysableid' => $timesplitting->get_analysable()->get_id(),
'timesplitting' => $timesplitting->get_id());
'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex);
$predictedrange = $DB->get_record('analytics_predict_samples', $params);
$predictedranges = $DB->get_records('analytics_predict_ranges', $params);
foreach ($predictedranges as $predictedrange) {
if (!empty($ranges[$predictedrange->rangeindex])) {
unset($ranges[$predictedrange->rangeindex]);
}
if (!$predictedrange) {
// Nothing to filter out.
return;
}
return $ranges;
$predictedrange->sampleids = json_decode($predictedrange->sampleids, true);
$missingsamples = array_diff_key($sampleids, $predictedrange->sampleids);
if (count($missingsamples) === 0) {
// All samples already calculated.
unset($ranges[$rangeindex]);
return;
}
// Replace the list of samples by the one excluding samples that already got predictions at this range.
$sampleids = $missingsamples;
}
/**
@ -560,7 +579,7 @@ abstract class base {
* @param int[] $sampleids
* @param \core_analytics\local\time_splitting\base $timesplitting
* @param \stored_file $file
* @return bool
* @return void
*/
protected function save_train_samples($sampleids, $timesplitting, $file) {
global $DB;
@ -574,28 +593,40 @@ abstract class base {
$trainingsamples->sampleids = json_encode($sampleids);
$trainingsamples->timecreated = time();
return $DB->insert_record('analytics_train_samples', $trainingsamples);
$DB->insert_record('analytics_train_samples', $trainingsamples);
}
/**
* Saves samples that have just been used for prediction.
*
* @param int[] $sampleids
* @param array $ranges
* @param \core_analytics\local\time_splitting\base $timesplitting
* @return void
*/
protected function save_prediction_ranges($ranges, $timesplitting) {
protected function save_prediction_samples($sampleids, $ranges, $timesplitting) {
global $DB;
$predictionrange = new \stdClass();
$predictionrange->modelid = $this->modelid;
$predictionrange->analysableid = $timesplitting->get_analysable()->get_id();
$predictionrange->timesplitting = $timesplitting->get_id();
$predictionrange->timecreated = time();
if (count($ranges) > 1) {
throw new \coding_exception('$ranges argument should only contain one range');
}
foreach ($ranges as $rangeindex => $unused) {
$predictionrange->rangeindex = $rangeindex;
$DB->insert_record('analytics_predict_ranges', $predictionrange);
$rangeindex = key($ranges);
$params = array('modelid' => $this->modelid, 'analysableid' => $timesplitting->get_analysable()->get_id(),
'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex);
if ($predictionrange = $DB->get_record('analytics_predict_samples', $params)) {
// Append the new samples used for prediction.
$prevsamples = json_decode($predictionrange->sampleids, true);
$predictionrange->sampleids = json_encode($prevsamples + $sampleids);
$predictionrange->timemodified = time();
$DB->update_record('analytics_predict_samples', $predictionrange);
} else {
$predictionrange = (object)$params;
$predictionrange->sampleids = json_encode($sampleids);
$predictionrange->timecreated = time();
$predictionrange->timemodified = $predictionrange->timecreated;
$DB->insert_record('analytics_predict_samples', $predictionrange);
}
}
}

View File

@ -1013,7 +1013,7 @@ class model {
*/
public function any_prediction_obtained() {
global $DB;
return $DB->record_exists('analytics_predict_ranges',
return $DB->record_exists('analytics_predict_samples',
array('modelid' => $this->model->id, 'timesplitting' => $this->model->timesplitting));
}
@ -1317,8 +1317,8 @@ class model {
private function clear_model() {
global $DB;
$DB->delete_records('analytics_predict_ranges', array('modelid' => $this->model->id));
$DB->delete_records('analytics_predictions', array('modelid' => $this->model->id));
$DB->delete_records('analytics_predict_samples', array('modelid' => $this->model->id));
$DB->delete_records('analytics_train_samples', array('modelid' => $this->model->id));
$DB->delete_records('analytics_used_files', array('modelid' => $this->model->id));

View File

@ -31,6 +31,8 @@ require_once(__DIR__ . '/fixtures/test_indicator_random.php');
require_once(__DIR__ . '/fixtures/test_target_shortname.php');
require_once(__DIR__ . '/fixtures/test_static_target_shortname.php');
require_once(__DIR__ . '/../../course/lib.php');
/**
* Unit tests for evaluation, training and prediction.
*
@ -81,7 +83,7 @@ class core_analytics_prediction_testcase extends advanced_testcase {
}
// 1 range for each analysable.
$predictedranges = $DB->get_records('analytics_predict_ranges', array('modelid' => $model->get_id()));
$predictedranges = $DB->get_records('analytics_predict_samples', array('modelid' => $model->get_id()));
$this->assertCount(2, $predictedranges);
$this->assertEquals(1, $DB->count_records('analytics_used_files',
array('modelid' => $model->get_id(), 'action' => 'predicted')));
@ -91,7 +93,7 @@ class core_analytics_prediction_testcase extends advanced_testcase {
// No new generated files nor records as there are no new courses available.
$model->predict();
$predictedranges = $DB->get_records('analytics_predict_ranges', array('modelid' => $model->get_id()));
$predictedranges = $DB->get_records('analytics_predict_samples', array('modelid' => $model->get_id()));
$this->assertCount(2, $predictedranges);
$this->assertEquals(1, $DB->count_records('analytics_used_files',
array('modelid' => $model->get_id(), 'action' => 'predicted')));
@ -104,11 +106,11 @@ class core_analytics_prediction_testcase extends advanced_testcase {
*
* @dataProvider provider_ml_training_and_prediction
* @param string $timesplittingid
* @param int $npredictedranges
* @param int $predictedrangeindex
* @param string $predictionsprocessorclass
* @return void
*/
public function test_ml_training_and_prediction($timesplittingid, $npredictedranges, $predictionsprocessorclass) {
public function test_ml_training_and_prediction($timesplittingid, $predictedrangeindex, $predictionsprocessorclass) {
global $DB;
$this->resetAfterTest(true);
@ -176,22 +178,75 @@ class core_analytics_prediction_testcase extends advanced_testcase {
$this->assertEquals($correct[$sampleid], $predictiondata->prediction);
}
// 2 ranges will be predicted.
$predictedranges = $DB->get_records('analytics_predict_ranges', array('modelid' => $model->get_id()));
$this->assertCount($npredictedranges, $predictedranges);
// 1 range will be predicted.
$predictedranges = $DB->get_records('analytics_predict_samples', array('modelid' => $model->get_id()));
$this->assertCount(1, $predictedranges);
foreach ($predictedranges as $predictedrange) {
$this->assertEquals($predictedrangeindex, $predictedrange->rangeindex);
$sampleids = json_decode($predictedrange->sampleids, true);
$this->assertCount(2, $sampleids);
$this->assertContains($course1->id, $sampleids);
$this->assertContains($course2->id, $sampleids);
}
$this->assertEquals(1, $DB->count_records('analytics_used_files',
array('modelid' => $model->get_id(), 'action' => 'predicted')));
// 2 predictions for each range.
$this->assertEquals(2 * $npredictedranges, $DB->count_records('analytics_predictions',
// 2 predictions.
$this->assertEquals(2, $DB->count_records('analytics_predictions',
array('modelid' => $model->get_id())));
// No new generated files nor records as there are no new courses available.
$model->predict();
$predictedranges = $DB->get_records('analytics_predict_ranges', array('modelid' => $model->get_id()));
$this->assertCount($npredictedranges, $predictedranges);
$predictedranges = $DB->get_records('analytics_predict_samples', array('modelid' => $model->get_id()));
$this->assertCount(1, $predictedranges);
foreach ($predictedranges as $predictedrange) {
$this->assertEquals($predictedrangeindex, $predictedrange->rangeindex);
}
$this->assertEquals(1, $DB->count_records('analytics_used_files',
array('modelid' => $model->get_id(), 'action' => 'predicted')));
$this->assertEquals(2 * $npredictedranges, $DB->count_records('analytics_predictions',
$this->assertEquals(2, $DB->count_records('analytics_predictions',
array('modelid' => $model->get_id())));
// New samples that can be used for prediction.
$courseparams = $params + array('shortname' => 'cccccc', 'fullname' => 'cccccc', 'visible' => 0);
$course3 = $this->getDataGenerator()->create_course($courseparams);
$courseparams = $params + array('shortname' => 'dddddd', 'fullname' => 'dddddd', 'visible' => 0);
$course4 = $this->getDataGenerator()->create_course($courseparams);
$result = $model->predict();
$predictedranges = $DB->get_records('analytics_predict_samples', array('modelid' => $model->get_id()));
$this->assertCount(1, $predictedranges);
foreach ($predictedranges as $predictedrange) {
$this->assertEquals($predictedrangeindex, $predictedrange->rangeindex);
$sampleids = json_decode($predictedrange->sampleids, true);
$this->assertCount(4, $sampleids);
$this->assertContains($course1->id, $sampleids);
$this->assertContains($course2->id, $sampleids);
$this->assertContains($course3->id, $sampleids);
$this->assertContains($course4->id, $sampleids);
}
$this->assertEquals(2, $DB->count_records('analytics_used_files',
array('modelid' => $model->get_id(), 'action' => 'predicted')));
$this->assertEquals(4, $DB->count_records('analytics_predictions',
array('modelid' => $model->get_id())));
// New visible course (for training).
$course5 = $this->getDataGenerator()->create_course(array('shortname' => 'aaa', 'fullname' => 'aa'));
$course6 = $this->getDataGenerator()->create_course();
$result = $model->train();
$this->assertEquals(2, $DB->count_records('analytics_used_files',
array('modelid' => $model->get_id(), 'action' => 'trained')));
// Update one of the courses to not visible, it should be used again for prediction.
$course5->visible = 0;
update_course($course5);
$model->predict();
$this->assertEquals(1, $DB->count_records('analytics_predict_samples',
array('modelid' => $model->get_id())));
$this->assertEquals(2, $DB->count_records('analytics_used_files',
array('modelid' => $model->get_id(), 'action' => 'predicted')));
$this->assertEquals(4, $DB->count_records('analytics_predictions',
array('modelid' => $model->get_id())));
set_config('enabled_stores', '', 'tool_log');
@ -205,8 +260,8 @@ class core_analytics_prediction_testcase extends advanced_testcase {
*/
public function provider_ml_training_and_prediction() {
$cases = array(
'no_splitting' => array('\core\analytics\time_splitting\no_splitting', 1),
'quarters' => array('\core\analytics\time_splitting\quarters', 4)
'no_splitting' => array('\core\analytics\time_splitting\no_splitting', 0),
'quarters' => array('\core\analytics\time_splitting\quarters', 3)
);
// We need to test all system prediction processors.

View File

@ -60,15 +60,17 @@ $string['insightinfomessagehtml'] = 'The system generated some insights for you:
$string['invalidtimesplitting'] = 'Model with id {$a} needs a time splitting method before it can be used to train';
$string['invalidanalysablefortimesplitting'] = 'It can not be analysed using {$a} time splitting method';
$string['nocourses'] = 'No courses to analyse';
$string['nodata'] = 'No data available';
$string['modeloutputdir'] = 'Models output directory';
$string['modeloutputdirinfo'] = 'Directory where prediction processors store all evaluation info. Useful for debugging and research.';
$string['noevaluationbasedassumptions'] = 'Models based on assumptions can not be evaluated';
$string['nodata'] = 'No data to analyse';
$string['noinsightsmodel'] = 'This model does not generate insights';
$string['noinsights'] = 'No insights reported';
$string['nonewdata'] = 'No new data available';
$string['nonewranges'] = 'No new predictions yet';
$string['nonewtimeranges'] = 'No new time ranges, nothing to predict';
$string['nopredictionsyet'] = 'No predictions available yet';
$string['noranges'] = 'No predictions yet';
$string['notrainingbasedassumptions'] = 'Models based on assumptions do not need training';
$string['novaliddata'] = 'No valid data available';
$string['novalidsamples'] = 'No valid samples available';

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8" ?>
<XMLDB PATH="lib/db" VERSION="20170502" COMMENT="XMLDB file for core Moodle tables"
<XMLDB PATH="lib/db" VERSION="20170721" COMMENT="XMLDB file for core Moodle tables"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="../../lib/xmldb/xmldb.xsd"
>
@ -3592,7 +3592,7 @@
<FIELD NAME="target" TYPE="char" LENGTH="255" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="indicators" TYPE="text" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="timesplitting" TYPE="char" LENGTH="255" NOTNULL="false" SEQUENCE="false"/>
<FIELD NAME="score" TYPE="number" LENGTH="10" DECIMALS="5" NOTNULL="true" DEFAULT="0" SEQUENCE="false"/>
<FIELD NAME="score" TYPE="number" LENGTH="10" NOTNULL="true" DEFAULT="0" SEQUENCE="false" DECIMALS="5"/>
<FIELD NAME="info" TYPE="text" NOTNULL="false" SEQUENCE="false"/>
<FIELD NAME="dir" TYPE="text" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="timecreated" TYPE="int" LENGTH="10" NOTNULL="true" SEQUENCE="false"/>
@ -3605,7 +3605,6 @@
<INDEX NAME="modelid" UNIQUE="false" FIELDS="modelid" COMMENT="Index on modelid"/>
</INDEXES>
</TABLE>
<TABLE NAME="analytics_predictions" COMMENT="Predictions">
<FIELDS>
<FIELD NAME="id" TYPE="int" LENGTH="10" NOTNULL="true" SEQUENCE="true"/>
@ -3614,7 +3613,7 @@
<FIELD NAME="sampleid" TYPE="int" LENGTH="10" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="rangeindex" TYPE="int" LENGTH="5" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="prediction" TYPE="int" LENGTH="2" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="predictionscore" TYPE="number" LENGTH="10" DECIMALS="5" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="predictionscore" TYPE="number" LENGTH="10" NOTNULL="true" SEQUENCE="false" DECIMALS="5"/>
<FIELD NAME="calculations" TYPE="text" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="timecreated" TYPE="int" LENGTH="10" NOTNULL="true" DEFAULT="0" SEQUENCE="false"/>
</FIELDS>
@ -3642,20 +3641,22 @@
<INDEX NAME="modelidandanalysableidandtimesplitting" UNIQUE="false" FIELDS="modelid, analysableid, timesplitting" COMMENT="Index on modelid and analysableid and timesplitting"/>
</INDEXES>
</TABLE>
<TABLE NAME="analytics_predict_ranges" COMMENT="Time ranges already used for predictions.">
<TABLE NAME="analytics_predict_samples" COMMENT="Samples already used for predictions.">
<FIELDS>
<FIELD NAME="id" TYPE="int" LENGTH="10" NOTNULL="true" SEQUENCE="true"/>
<FIELD NAME="modelid" TYPE="int" LENGTH="10" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="analysableid" TYPE="int" LENGTH="10" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="timesplitting" TYPE="char" LENGTH="255" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="rangeindex" TYPE="int" LENGTH="10" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="sampleids" TYPE="text" NOTNULL="true" SEQUENCE="false"/>
<FIELD NAME="timecreated" TYPE="int" LENGTH="10" NOTNULL="true" DEFAULT="0" SEQUENCE="false"/>
<FIELD NAME="timemodified" TYPE="int" LENGTH="10" NOTNULL="true" DEFAULT="0" SEQUENCE="false"/>
</FIELDS>
<KEYS>
<KEY NAME="primary" TYPE="primary" FIELDS="id"/>
</KEYS>
<INDEXES>
<INDEX NAME="modelidandanalysableidandtimesplitting" UNIQUE="false" FIELDS="modelid, analysableid, timesplitting" COMMENT="Index on modelid and analysableid and timesplitting"/>
<INDEX NAME="modelidandanalysableidandtimesplittingandrangeindex" UNIQUE="false" FIELDS="modelid, analysableid, timesplitting, rangeindex" COMMENT="Index on modelid and analysableid and timesplitting"/>
</INDEXES>
</TABLE>
<TABLE NAME="analytics_used_files" COMMENT="Files that have already been used for training and prediction.">

View File

@ -2255,5 +2255,40 @@ function xmldb_main_upgrade($oldversion) {
upgrade_main_savepoint(true, 2017072700.02);
}
// Upgrade step 2017080400.01: turn analytics_predict_ranges into analytics_predict_samples by
// clearing existing prediction data, adding the sampleids and timemodified fields, and renaming the table.
if ($oldversion < 2017080400.01) {
// Get the table by its previous name.
$table = new xmldb_table('analytics_predict_ranges');
// Only act while the table still exists under its old name; otherwise just record the savepoint below.
if ($dbman->table_exists($table)) {
// We can only accept this because we are in master.
// Existing prediction data is discarded rather than migrated: stored predictions, the record of files
// already used for prediction, and every row of the table that is about to be altered.
$DB->delete_records('analytics_predictions');
$DB->delete_records('analytics_used_files', array('action' => 'predicted'));
$DB->delete_records('analytics_predict_ranges');
// Define field sampleids to be added to analytics_predict_ranges (renamed below to analytics_predict_samples).
// The field is NOT NULL and has no default; presumably this is why the table is emptied first - TODO confirm.
$field = new xmldb_field('sampleids', XMLDB_TYPE_TEXT, null, null, XMLDB_NOTNULL, null, null, 'rangeindex');
// Conditionally launch add field sampleids.
if (!$dbman->field_exists($table, $field)) {
$dbman->add_field($table, $field);
}
// Define field timemodified to be added to analytics_predict_ranges (renamed below to analytics_predict_samples).
$field = new xmldb_field('timemodified', XMLDB_TYPE_INTEGER, '10', null, XMLDB_NOTNULL, null, '0', 'timecreated');
// Conditionally launch add field timemodified.
if (!$dbman->field_exists($table, $field)) {
$dbman->add_field($table, $field);
}
// Rename the table to its new name.
// This runs last, so the field changes above are applied while the table still has its old name.
$dbman->rename_table($table, 'analytics_predict_samples');
}
// Main savepoint reached.
upgrade_main_savepoint(true, 2017080400.01);
}
return true;
}

View File

@ -29,7 +29,7 @@
defined('MOODLE_INTERNAL') || die();
$version = 2017080400.00; // YYYYMMDD = weekly release date of this DEV branch.
$version = 2017080400.01; // YYYYMMDD = weekly release date of this DEV branch.
// RR = release increments - 00 in DEV branches.
// .XX = incremental changes.