From f25ad012c54f8659301f75134be4bee02a715bc1 Mon Sep 17 00:00:00 2001
From: Petr Skoda
Date: Tue, 20 Jun 2023 09:30:46 +0200
Subject: [PATCH] MDL-78525 core: fix word and character counting
---
lib/moodlelib.php | 16 +++++++++--
lib/tests/moodlelib_test.php | 35 ++++++++++++++++++++---
mod/forum/classes/local/entities/post.php | 4 +--
mod/forum/post.php | 7 ++---
4 files changed, 50 insertions(+), 12 deletions(-)
diff --git a/lib/moodlelib.php b/lib/moodlelib.php
index d9799521629..d00252caba9 100644
--- a/lib/moodlelib.php
+++ b/lib/moodlelib.php
@@ -8381,9 +8381,10 @@ function moodle_setlocale($locale='') {
*
* @category string
* @param string $string The text to be searched for words. May be HTML.
+ * @param int|null $format
* @return int The count of words in the specified string
*/
-function count_words($string) {
+function count_words($string, $format = null) {
// Before stripping tags, add a space after the close tag of anything that is not obviously inline.
// Also, br is a special case because it definitely delimits a word, but has no close tag.
$string = preg_replace('~
@@ -8400,6 +8401,11 @@ function count_words($string) {
|
# Special cases that are not close tags.
)
~x', '$1 ', $string); // Add a space after the close tag.
+ if ($format !== null && $format != FORMAT_PLAIN) {
+ // Match the usual text cleaning before display.
+ // Ideally we should apply multilang filter only here, other filters might add extra text.
+ $string = format_text($string, $format, ['filter' => false, 'noclean' => false, 'para' => false]);
+ }
// Now remove HTML tags.
$string = strip_tags($string);
// Decode HTML entities.
@@ -8421,9 +8427,15 @@ function count_words($string) {
*
* @category string
* @param string $string The text to be searched for letters. May be HTML.
+ * @param int|null $format
* @return int The count of letters in the specified text.
*/
-function count_letters($string) {
+function count_letters($string, $format = null) {
+ if ($format !== null && $format != FORMAT_PLAIN) {
+ // Match the usual text cleaning before display.
+ // Ideally we should apply multilang filter only here, other filters might add extra text.
+ $string = format_text($string, $format, ['filter' => false, 'noclean' => false, 'para' => false]);
+ }
$string = strip_tags($string); // Tags are out now.
$string = html_entity_decode($string, ENT_COMPAT);
$string = preg_replace('/[[:space:]]*/', '', $string); // Whitespace are out now.
diff --git a/lib/tests/moodlelib_test.php b/lib/tests/moodlelib_test.php
index 591b4e35ab9..bc8ac172a8e 100644
--- a/lib/tests/moodlelib_test.php
+++ b/lib/tests/moodlelib_test.php
@@ -3958,9 +3958,11 @@ EOF;
* @dataProvider count_words_testcases
* @param int $expectedcount number of words in $string.
* @param string $string the test string to count the words of.
+ * @param int|null $format
*/
- public function test_count_words(int $expectedcount, string $string): void {
- $this->assertEquals($expectedcount, count_words($string));
+ public function test_count_words(int $expectedcount, string $string, $format = null): void {
+ $this->assertEquals($expectedcount, count_words($string, $format),
+ "'$string' with format '$format' does not match count $expectedcount");
}
/**
@@ -3969,6 +3971,13 @@ EOF;
* @return array of test cases.
*/
public function count_words_testcases(): array {
+ // Copy-pasting example from MDL-64240.
+ $copypasted = <<Snoot is booped
+
+
+EOT;
+
// The counts here should match MS Word and Libre Office.
return [
[0, ''],
@@ -4005,6 +4014,16 @@ EOF;
[1, "SO42-"],
[6, '4+4=8 i.e. O(1) a,b,c,d I’m black&blue_really'],
[1, 'ab'],
+ [1, 'ab', FORMAT_PLAIN],
+ [1, 'ab', FORMAT_HTML],
+ [1, 'ab', FORMAT_MOODLE],
+ [1, 'ab', FORMAT_MARKDOWN],
+ [1, 'aa pokus'],
+ [2, 'aa pokus', FORMAT_HTML],
+ [6, $copypasted],
+ [6, $copypasted, FORMAT_PLAIN],
+ [3, $copypasted, FORMAT_HTML],
+ [3, $copypasted, FORMAT_MOODLE],
];
}
@@ -4014,9 +4033,11 @@ EOF;
* @dataProvider count_letters_testcases
* @param int $expectedcount number of characters in $string.
* @param string $string the test string to count the letters of.
+ * @param int|null $format
*/
- public function test_count_letters(int $expectedcount, string $string): void {
- $this->assertEquals($expectedcount, count_letters($string));
+ public function test_count_letters(int $expectedcount, string $string, $format = null): void {
+ $this->assertEquals($expectedcount, count_letters($string, $format),
+ "'$string' with format '$format' does not match count $expectedcount");
}
/**
@@ -4030,6 +4051,12 @@ EOF;
[1, 'x'],
[1, '&'],
[4, 'frog
'],
+ [4, 'frog
', FORMAT_PLAIN],
+ [4, 'frog
', FORMAT_MOODLE],
+ [4, 'frog
', FORMAT_HTML],
+ [4, 'frog
', FORMAT_MARKDOWN],
+ [2, 'aa pokus'],
+ [7, 'aa pokus', FORMAT_HTML],
];
}
diff --git a/mod/forum/classes/local/entities/post.php b/mod/forum/classes/local/entities/post.php
index 90cd0fc6ba8..119e1d02cd2 100644
--- a/mod/forum/classes/local/entities/post.php
+++ b/mod/forum/classes/local/entities/post.php
@@ -350,8 +350,8 @@ class post {
*/
public static function add_message_counts(\stdClass $record) : void {
if (!empty($record->message)) {
- $record->wordcount = count_words($record->message);
- $record->charcount = count_letters($record->message);
+ $record->wordcount = count_words($record->message, $record->messageformat);
+ $record->charcount = count_letters($record->message, $record->messageformat);
}
}
}
diff --git a/mod/forum/post.php b/mod/forum/post.php
index a3f2925cc0a..9ea0f2aef31 100644
--- a/mod/forum/post.php
+++ b/mod/forum/post.php
@@ -799,10 +799,9 @@ if ($mformpost->is_cancelled()) {
// WARNING: the $fromform->message array has been overwritten, do not use it anymore!
$fromform->messagetrust = trusttext_trusted($modcontext);
- // Clean message text, unless markdown which should be saved as it is, otherwise editing messes things up.
- if ($fromform->messageformat != FORMAT_MARKDOWN) {
- $fromform = trusttext_pre_edit($fromform, 'message', $modcontext);
- }
+ // Do not clean text here, text cleaning can be done only after conversion to HTML.
+ // Word counting now uses text formatting, there is no need to abuse trusttext_pre_edit() here.
+
if ($fromform->edit) {
// Updating a post.
unset($fromform->groupid);