moodle/analytics/classes/dataset_manager.php
David Monllao 0690a271c3 MDL-59067 analytics: Store indicator calculations
This was supposed to be split into multiple commits to make it easier to understand
but I failed to do it properly. So this is the list of changes:

- New analytics_indicator_calc db table to store indicators calculations
- Reuse previous calculations during prediction/training; other models'
  previous calculations should also be reused as long as they belong to
  the same sample (sampleid depends on sampleorigin), time range and indicator
- Allow bulk inserting of these calculations as this can hurt database performance
- Block the same analysable from being analysed for training and for prediction at the same time
- Use a new instance of the target and use it for is_valid_* functions
  as using ::is_valid_sample can lead to problems if people
  use it to cache stuff
2017-08-22 22:28:04 +02:00

442 lines
14 KiB
PHP

<?php
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
/**
* Datasets manager.
*
* @package core_analytics
* @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
*/
namespace core_analytics;
defined('MOODLE_INTERNAL') || die();
/**
* Datasets manager.
*
* @package core_analytics
* @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
*/
class dataset_manager {

    /**
     * File area for labelled datasets.
     */
    const LABELLED_FILEAREA = 'labelled';

    /**
     * File area for unlabelled datasets.
     */
    const UNLABELLED_FILEAREA = 'unlabelled';

    /**
     * File area for exported datasets.
     */
    const EXPORT_FILEAREA = 'export';

    /**
     * Evaluation file file name.
     */
    const EVALUATION_FILENAME = 'evaluation.csv';

    /**
     * The model id.
     *
     * @var int
     */
    protected $modelid;

    /**
     * Range processor in use.
     *
     * @var string
     */
    protected $timesplittingid;

    /**
     * The analysable id.
     *
     * @var int
     */
    protected $analysableid;

    /**
     * Whether this is a dataset for evaluation or not.
     *
     * @var bool
     */
    protected $evaluation;

    /**
     * The dataset filearea. Must be one of the self::*_FILEAREA options.
     *
     * @var string
     */
    protected $filearea;

    /**
     * Lock obtained by init_process(); falsy while no lock is held.
     *
     * @var mixed
     */
    protected $lock = null;

    /**
     * Constructor method.
     *
     * @throws \coding_exception If $filearea is not one of the self::*_FILEAREA constants.
     * @param int $modelid
     * @param int $analysableid
     * @param string $timesplittingid
     * @param string $filearea One of the self::*_FILEAREA constants.
     * @param bool $evaluation
     * @return void
     */
    public function __construct($modelid, $analysableid, $timesplittingid, $filearea, $evaluation = false) {

        $validfileareas = array(self::EXPORT_FILEAREA, self::LABELLED_FILEAREA, self::UNLABELLED_FILEAREA);
        if (!in_array($filearea, $validfileareas, true)) {
            throw new \coding_exception('Invalid provided filearea');
        }

        $this->modelid = $modelid;
        $this->analysableid = $analysableid;
        $this->timesplittingid = $timesplittingid;
        $this->evaluation = $evaluation;
        $this->filearea = $filearea;
    }

    /**
     * Mark the analysable as being analysed.
     *
     * Tries to get a lock for this model + analysable + time splitting method combination and
     * keeps it in $this->lock until close_process() is called.
     *
     * @return bool Could we get the lock or not.
     */
    public function init_process() {

        // The key deliberately leaves out the file area and the evaluation flag as we don't want
        // the same analysable to be analysed for training and prediction at the same time.
        $lockkey = 'modelid:' . $this->modelid . '-analysableid:' . $this->analysableid .
            '-timesplitting:' . self::clean_time_splitting_id($this->timesplittingid);

        $lockfactory = \core\lock\lock_config::get_lock_factory('core_analytics');

        // If it is not ready in 10 secs skip this model + analysable + timesplittingmethod combination
        // it will attempt it again during next cron run.
        $this->lock = $lockfactory->get_lock($lockkey, 10);

        return (bool)$this->lock;
    }

    /**
     * Store the dataset in the internal file system.
     *
     * Any file previously stored under the same model, file area and analysable/time splitting
     * path is deleted first. The rows are written as CSV to a temporary file which is then
     * added to the file storage.
     *
     * @throws \moodle_exception If the temporary file can not be opened for writing.
     * @param array $data Rows to store, each row is written with fputcsv().
     * @return \stored_file
     */
    public function store($data) {

        $fs = get_file_storage();

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $this->filearea,
            'itemid' => $this->modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/analysable/' . $this->analysableid . '/' .
                self::clean_time_splitting_id($this->timesplittingid) . '/',
            'filename' => self::get_filename($this->evaluation)
        ];

        // Delete previous and old (we already checked that previous copies are not recent) evaluation files for this analysable.
        // NOTE(review): the fragment starts with the comparison operator as it is passed as the item id test
        // of delete_area_files_select(); confirm against the file storage API.
        $select = " = {$filerecord['itemid']} AND filepath = :filepath";
        $fs->delete_area_files_select($filerecord['contextid'], $filerecord['component'], $filerecord['filearea'],
            $select, array('filepath' => $filerecord['filepath']));

        // Write all this stuff to a tmp file.
        $tmpfilepath = make_request_directory() . DIRECTORY_SEPARATOR . $filerecord['filename'];
        $fh = fopen($tmpfilepath, 'w+');
        if (!$fh) {
            // Release the lock (if any) before giving up so the analysable is not left blocked.
            $this->close_process();
            throw new \moodle_exception('errorcannotwritedataset', 'analytics', '', $tmpfilepath);
        }
        foreach ($data as $line) {
            fputcsv($fh, $line);
        }
        fclose($fh);

        return $fs->create_file_from_pathname($filerecord, $tmpfilepath);
    }

    /**
     * Mark as analysed.
     *
     * Releases the lock obtained by init_process(). It does nothing if no lock is currently held,
     * so it is safe to call it when init_process() was not called or could not get the lock.
     *
     * @return void
     */
    public function close_process() {
        if ($this->lock) {
            $this->lock->release();
            $this->lock = null;
        }
    }

    /**
     * Returns the previous evaluation file.
     *
     * Important to note that this is per modelid + timesplittingid, when dealing with multiple
     * analysables this is the merged file. Do not confuse with self::get_evaluation_analysable_file
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|bool The stored file or a falsy value if it does not exist.
     */
    public static function get_previous_evaluation_file($modelid, $timesplittingid) {
        $fs = get_file_storage();

        // Evaluation data is always labelled.
        return $fs->get_file(\context_system::instance()->id, 'analytics', self::LABELLED_FILEAREA, $modelid,
            '/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/', self::EVALUATION_FILENAME);
    }

    /**
     * Deletes previous evaluation files of this model.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return bool True if there was a file and it was deleted, false if there was no file.
     */
    public static function delete_previous_evaluation_file($modelid, $timesplittingid) {
        $file = self::get_previous_evaluation_file($modelid, $timesplittingid);
        if (!$file) {
            return false;
        }

        $file->delete();
        return true;
    }

    /**
     * Returns this (model + analysable + time splitting) file.
     *
     * @param int $modelid
     * @param int $analysableid
     * @param string $timesplittingid
     * @return \stored_file|bool The stored file or a falsy value if it does not exist.
     */
    public static function get_evaluation_analysable_file($modelid, $analysableid, $timesplittingid) {
        $fs = get_file_storage();

        // Always evaluation.csv and labelled as it is an evaluation file.
        $filearea = self::LABELLED_FILEAREA;
        $filename = self::get_filename(true);
        $filepath = '/analysable/' . $analysableid . '/' . self::clean_time_splitting_id($timesplittingid) . '/';

        return $fs->get_file(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath, $filename);
    }

    /**
     * Merge multiple files into one.
     *
     * Each file is expected to start with three header lines: the var names, the analysable
     * values and the column names. The merged file gets the var names and the columns of the
     * last file, plus the distinct analysable values of all the files joined by '|'.
     *
     * Important! It is the caller responsibility to ensure that the datasets are compatible.
     *
     * @throws \coding_exception If there are no files to merge.
     * @throws \moodle_exception If the temporary merge file can not be opened for writing.
     * @param array $files Objects providing get_content_file_handle() (stored files).
     * @param int $modelid
     * @param string $timesplittingid
     * @param string $filearea
     * @param bool $evaluation
     * @return \stored_file
     */
    public static function merge_datasets(array $files, $modelid, $timesplittingid, $filearea, $evaluation = false) {

        if (empty($files)) {
            // Without files there are no headers to write, the result would be a meaningless file.
            throw new \coding_exception('No dataset files to merge');
        }

        $tmpfilepath = make_request_directory() . DIRECTORY_SEPARATOR . 'tmpfile.csv';

        // Add headers.
        // We could also do this with a single iteration gathering all files headers and appending them to the beginning of the file
        // once all file contents are merged.
        $varnames = '';
        $columns = array();
        $analysablesvalues = array();
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Copy the var names as they are, all files should have the same var names.
            $varnames = fgetcsv($rh);

            $analysablesvalues[] = fgetcsv($rh);

            // Copy the columns as they are, all files should have the same columns.
            $columns = fgetcsv($rh);

            // We only needed the headers in this first pass.
            fclose($rh);
        }

        // Merge analysable values skipping the ones that are the same in all analysables.
        $values = array();
        foreach ($analysablesvalues as $analysablevalues) {
            foreach ($analysablevalues as $varkey => $value) {
                // Sha1 to make it unique.
                $values[$varkey][sha1($value)] = $value;
            }
        }
        foreach ($values as $varkey => $varvalues) {
            $values[$varkey] = implode('|', $varvalues);
        }

        // Start writing to the merge file.
        $wh = fopen($tmpfilepath, 'w');
        if (!$wh) {
            throw new \moodle_exception('errorcannotwritedataset', 'analytics', '', $tmpfilepath);
        }

        fputcsv($wh, $varnames);
        fputcsv($wh, $values);
        fputcsv($wh, $columns);

        // Iterate through all files and add them to the tmp one. We don't want file contents in memory.
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Skip headers.
            fgets($rh);
            fgets($rh);
            fgets($rh);

            // Copy all the following lines. Compare against false so a line with a falsy
            // content (e.g. a final "0" without line break) does not stop the copy.
            while (($line = fgets($rh)) !== false) {
                fwrite($wh, $line);
            }
            fclose($rh);
        }
        fclose($wh);

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $filearea,
            'itemid' => $modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/',
            'filename' => self::get_filename($evaluation)
        ];

        $fs = get_file_storage();

        return $fs->create_file_from_pathname($filerecord, $tmpfilepath);
    }

    /**
     * Exports the model training data.
     *
     * Merges all the labelled files of this model + time splitting method, evaluation files
     * excluded, into a single file in the export file area.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|false False if there is no training data to export.
     */
    public static function export_training_data($modelid, $timesplittingid) {

        $fs = get_file_storage();

        $contextid = \context_system::instance()->id;
        $filepath = '/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/';

        $files = $fs->get_directory_files($contextid, 'analytics', self::LABELLED_FILEAREA, $modelid,
            $filepath, true, false);

        // Discard evaluation files.
        foreach ($files as $key => $file) {
            if ($file->get_filename() === self::EVALUATION_FILENAME) {
                unset($files[$key]);
            }
        }

        if (empty($files)) {
            return false;
        }

        return self::merge_datasets($files, $modelid, $timesplittingid, self::EXPORT_FILEAREA);
    }

    /**
     * Returns the dataset file data structured by sampleids using the indicators and target column names.
     *
     * The first two lines of the file (dataset info) are skipped, the third one contains the
     * column names and each of the following lines is a sample whose first column is its id.
     * Empty values are returned as null and numeric values as floats.
     *
     * @throws \coding_exception If the dataset is not in the unlabelled file area.
     * @param \stored_file $dataset
     * @return array Calculations indexed by sample id, each one an array indexed by column name.
     */
    public static function get_structured_data(\stored_file $dataset) {

        if ($dataset->get_filearea() !== self::UNLABELLED_FILEAREA) {
            throw new \coding_exception('Sorry, only support for unlabelled data');
        }

        $rh = $dataset->get_content_file_handle();

        // Skip dataset info.
        fgets($rh);
        fgets($rh);

        $calculations = array();

        $headers = fgetcsv($rh);
        // Get rid of the sampleid column name.
        array_shift($headers);

        while (($columns = fgetcsv($rh)) !== false) {

            if ($columns === array(null)) {
                // This is how fgetcsv reports a blank line, there is no sample in it.
                continue;
            }

            $uniquesampleid = array_shift($columns);

            // Unfortunately fgetcsv does not respect line's var types.
            $calculations[$uniquesampleid] = array_map(function($value) {

                if ($value === '') {
                    // We really want them as null because converted to float become 0
                    // and we need to treat the values separately.
                    return null;
                } else if (is_numeric($value)) {
                    return floatval($value);
                }
                return $value;
            }, array_combine($headers, $columns));
        }

        fclose($rh);

        return $calculations;
    }

    /**
     * Delete all files of a model.
     *
     * @param int $modelid
     * @return bool
     */
    public static function clear_model_files($modelid) {
        $fs = get_file_storage();
        return $fs->delete_area_files(\context_system::instance()->id, 'analytics', false, $modelid);
    }

    /**
     * Remove all possibly problematic chars from the time splitting method id (id = its full class name).
     *
     * @param string $timesplittingid
     * @return string
     */
    protected static function clean_time_splitting_id($timesplittingid) {
        // Namespace separators become dashes before the remaining chars are cleaned.
        $timesplittingid = str_replace('\\', '-', $timesplittingid);
        return clean_param($timesplittingid, PARAM_ALPHANUMEXT);
    }

    /**
     * Returns the file name to be used.
     *
     * @param bool $evaluation Only a strict true selects the evaluation file name.
     * @return string
     */
    protected static function get_filename($evaluation) {

        if ($evaluation === true) {
            return self::EVALUATION_FILENAME;
        }

        // Incremental time, the lock will make sure we don't have concurrency problems.
        return microtime(true) . '.csv';
    }
}