MDL-70796 count_words: match the count from LibreOffice & MS Word

This commit is contained in:
Tim Hunt 2021-02-04 18:17:35 +00:00
parent 4e398ff3f7
commit 6c7cf1123e
3 changed files with 27 additions and 17 deletions

View File

@ -8378,14 +8378,14 @@ function count_words($string) {
$string = strip_tags($string);
// Decode HTML entities.
$string = html_entity_decode($string);
// Replace underscores (which are classed as word characters) with spaces.
$string = preg_replace('/_/u', ' ', $string);
// Remove any characters that shouldn't be treated as word boundaries.
$string = preg_replace('/[\'"-]/u', '', $string);
// Remove dots and commas from within numbers only.
$string = preg_replace('/([0-9])[.,]([0-9])/u', '$1$2', $string);
return count(preg_split('/\w\b/u', $string)) - 1;
// Now, the word count is the number of blocks of characters separated
// by any sort of space. That seems to be the definition used by all other systems.
// To be precise about what is considered to separate words:
// * Anything that Unicode considers a 'Separator'
// * Anything that Unicode considers a 'Control character'
// * An em- or en- dash.
return count(preg_split('~[\p{Z}\p{Cc}—–]+~u', $string, -1, PREG_SPLIT_NO_EMPTY));
}
/**

View File

@ -3808,22 +3808,27 @@ class core_moodlelib_testcase extends advanced_testcase {
* @return array of test cases.
*/
public function count_words_testcases(): array {
// The counts here should match MS Word and LibreOffice.
return [
[0, ''],
[4, 'one two three four'],
[3, "one two three'four"],
[3, 'one+two threefour'],
[2, 'one"two three-four'],
[4, 'one@two three_four'],
[4, 'one\two three/four'],
[1, "a'b"],
[1, '1+1=2'],
[1, ' one-sided '],
[2, 'one two'],
[1, 'email@example.com'],
[2, 'first\part second/part'],
[4, '<p>one two<br></br>three four</p>'],
[4, '<p>one two<br>three four</p>'],
[4, '<p>one two<br />three four</p>'], // XHTML style.
[4, ' one ... two &nbsp; three...four '],
[4, 'one.2 3,four'],
[3, ' one ... three '],
[1, 'just...one'],
[3, ' one & three '],
[1, 'just&one'],
[2, 'em—dash'],
[2, 'endash'],
[4, '1³ £2 €3.45 $6,789'],
[4, 'one—two ブルース カンベッル'],
[4, 'one…two ブルース … カンベッル'],
[2, 'ブルース カンベッル'], // MS word counts this as 11, but we don't handle that yet.
[4, '<p>one two</p><p>three four</p>'],
[4, '<p>one two</p><p><br/></p><p>three four</p>'],
[4, '<p>one</p><ul><li>two</li><li>three</li></ul><p>four.</p>'],
@ -3832,7 +3837,12 @@ class core_moodlelib_testcase extends advanced_testcase {
[1, '<p>em<strong>phas</strong>is.</p>'],
[1, '<p>em<em>phas</em>is.</p>'],
[2, "one\ntwo"],
[2, "one\rtwo"],
[2, "one\ttwo"],
[2, "one\vtwo"],
[2, "one\ftwo"],
[1, "SO<sub>4</sub><sup>2-</sup>"],
[6, '4+4=8 i.e. O(1) a,b,c,d Im black&blue_really'],
];
}

View File

@ -295,7 +295,7 @@ class qtype_essay_question_test extends advanced_testcase {
public function test_get_validation_error(int $responserequired,
int $minwordlimit, int $maxwordlimit, string $expected): void {
$question = test_question_maker::make_an_essay_question();
$response = ['answer' => 'In this essay, I will be testing a function called check_input_word_count().'];
$response = ['answer' => 'One two three four five six seven eight nine ten eleven twelve thirteen fourteen.'];
$question->responserequired = $responserequired;
$question->minwordlimit = $minwordlimit;
$question->maxwordlimit = $maxwordlimit;