mirror of
https://github.com/moodle/moodle.git
synced 2025-04-25 10:26:17 +02:00
MDL-78525 core: fix word and character counting
This commit is contained in:
parent
272fdb321a
commit
f25ad012c5
@ -8381,9 +8381,10 @@ function moodle_setlocale($locale='') {
|
||||
*
|
||||
* @category string
|
||||
* @param string $string The text to be searched for words. May be HTML.
|
||||
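* @param int|null $format The text format of $string (a FORMAT_* constant); if null or FORMAT_PLAIN, format_text() is not applied.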
|
||||
* @return int The count of words in the specified string
|
||||
*/
|
||||
function count_words($string) {
|
||||
function count_words($string, $format = null) {
|
||||
// Before stripping tags, add a space after the close tag of anything that is not obviously inline.
|
||||
// Also, br is a special case because it definitely delimits a word, but has no close tag.
|
||||
$string = preg_replace('~
|
||||
@ -8400,6 +8401,11 @@ function count_words($string) {
|
||||
<br> | <br\s*/> # Special cases that are not close tags.
|
||||
)
|
||||
~x', '$1 ', $string); // Add a space after the close tag.
|
||||
if ($format !== null && $format != FORMAT_PLAIN) {
|
||||
// Match the usual text cleaning before display.
|
||||
// Ideally we should apply only the multilang filter here; other filters might add extra text.
|
||||
$string = format_text($string, $format, ['filter' => false, 'noclean' => false, 'para' => false]);
|
||||
}
|
||||
// Now remove HTML tags.
|
||||
$string = strip_tags($string);
|
||||
// Decode HTML entities.
|
||||
@ -8421,9 +8427,15 @@ function count_words($string) {
|
||||
*
|
||||
* @category string
|
||||
* @param string $string The text to be searched for letters. May be HTML.
|
||||
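* @param int|null $format The text format of $string (a FORMAT_* constant); if null or FORMAT_PLAIN, format_text() is not applied.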
|
||||
* @return int The count of letters in the specified text.
|
||||
*/
|
||||
function count_letters($string) {
|
||||
function count_letters($string, $format = null) {
|
||||
if ($format !== null && $format != FORMAT_PLAIN) {
|
||||
// Match the usual text cleaning before display.
|
||||
// Ideally we should apply only the multilang filter here; other filters might add extra text.
|
||||
$string = format_text($string, $format, ['filter' => false, 'noclean' => false, 'para' => false]);
|
||||
}
|
||||
$string = strip_tags($string); // Tags are out now.
|
||||
$string = html_entity_decode($string, ENT_COMPAT);
|
||||
$string = preg_replace('/[[:space:]]*/', '', $string); // Whitespace are out now.
|
||||
|
@ -3958,9 +3958,11 @@ EOF;
|
||||
* @dataProvider count_words_testcases
|
||||
* @param int $expectedcount number of words in $string.
|
||||
* @param string $string the test string to count the words of.
|
||||
* @param int|null $format the text format passed to count_words(), or null to omit it.
|
||||
*/
|
||||
public function test_count_words(int $expectedcount, string $string): void {
|
||||
$this->assertEquals($expectedcount, count_words($string));
|
||||
public function test_count_words(int $expectedcount, string $string, $format = null): void {
|
||||
$this->assertEquals($expectedcount, count_words($string, $format),
|
||||
"'$string' with format '$format' does not match count $expectedcount");
|
||||
}
|
||||
|
||||
/**
|
||||
@ -3969,6 +3971,13 @@ EOF;
|
||||
* @return array of test cases.
|
||||
*/
|
||||
public function count_words_testcases(): array {
|
||||
// Copy-pasting example from MDL-64240.
|
||||
$copypasted = <<<EOT
|
||||
<p onclick="alert('boop');">Snoot is booped</p>
|
||||
<script>alert('Boop the snoot');</script>
|
||||
<img alt="Boop the Snoot." src="https://proxy.duckduckgo.com/iu/?u=http%3A%2F%2Fwww.geekfill.com%2Fwp-content%2Fuploads%2F2015%2F08%2FBoop-the-Snoot.jpg&f=1">
|
||||
EOT;
|
||||
|
||||
// The counts here should match MS Word and Libre Office.
|
||||
return [
|
||||
[0, ''],
|
||||
@ -4005,6 +4014,16 @@ EOF;
|
||||
[1, "SO<sub>4</sub><sup>2-</sup>"],
|
||||
[6, '4+4=8 i.e. O(1) a,b,c,d I’m black&blue_really'],
|
||||
[1, '<span>a</span><span>b</span>'],
|
||||
[1, '<span>a</span><span>b</span>', FORMAT_PLAIN],
|
||||
[1, '<span>a</span><span>b</span>', FORMAT_HTML],
|
||||
[1, '<span>a</span><span>b</span>', FORMAT_MOODLE],
|
||||
[1, '<span>a</span><span>b</span>', FORMAT_MARKDOWN],
|
||||
[1, 'aa <argh <bleh>pokus</bleh>'],
|
||||
[2, 'aa <argh <bleh>pokus</bleh>', FORMAT_HTML],
|
||||
[6, $copypasted],
|
||||
[6, $copypasted, FORMAT_PLAIN],
|
||||
[3, $copypasted, FORMAT_HTML],
|
||||
[3, $copypasted, FORMAT_MOODLE],
|
||||
];
|
||||
}
|
||||
|
||||
@ -4014,9 +4033,11 @@ EOF;
|
||||
* @dataProvider count_letters_testcases
|
||||
* @param int $expectedcount number of characters in $string.
|
||||
* @param string $string the test string to count the letters of.
|
||||
* @param int|null $format the text format passed to count_letters(), or null to omit it.
|
||||
*/
|
||||
public function test_count_letters(int $expectedcount, string $string): void {
|
||||
$this->assertEquals($expectedcount, count_letters($string));
|
||||
public function test_count_letters(int $expectedcount, string $string, $format = null): void {
|
||||
$this->assertEquals($expectedcount, count_letters($string, $format),
|
||||
"'$string' with format '$format' does not match count $expectedcount");
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4030,6 +4051,12 @@ EOF;
|
||||
[1, 'x'],
|
||||
[1, '&'],
|
||||
[4, '<p>frog</p>'],
|
||||
[4, '<p>frog</p>', FORMAT_PLAIN],
|
||||
[4, '<p>frog</p>', FORMAT_MOODLE],
|
||||
[4, '<p>frog</p>', FORMAT_HTML],
|
||||
[4, '<p>frog</p>', FORMAT_MARKDOWN],
|
||||
[2, 'aa <argh <bleh>pokus</bleh>'],
|
||||
[7, 'aa <argh <bleh>pokus</bleh>', FORMAT_HTML],
|
||||
];
|
||||
}
|
||||
|
||||
|
@ -350,8 +350,8 @@ class post {
|
||||
*/
|
||||
public static function add_message_counts(\stdClass $record) : void {
|
||||
if (!empty($record->message)) {
|
||||
$record->wordcount = count_words($record->message);
|
||||
$record->charcount = count_letters($record->message);
|
||||
$record->wordcount = count_words($record->message, $record->messageformat);
|
||||
$record->charcount = count_letters($record->message, $record->messageformat);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -799,10 +799,9 @@ if ($mformpost->is_cancelled()) {
|
||||
// WARNING: the $fromform->message array has been overwritten, do not use it anymore!
|
||||
$fromform->messagetrust = trusttext_trusted($modcontext);
|
||||
|
||||
// Clean message text, unless markdown which should be saved as it is, otherwise editing messes things up.
|
||||
if ($fromform->messageformat != FORMAT_MARKDOWN) {
|
||||
$fromform = trusttext_pre_edit($fromform, 'message', $modcontext);
|
||||
}
|
||||
// Do not clean text here; text cleaning can be done only after conversion to HTML.
|
||||
// Word counting now uses text formatting, so there is no need to abuse trusttext_pre_edit() here.
|
||||
|
||||
if ($fromform->edit) {
|
||||
// Updating a post.
|
||||
unset($fromform->groupid);
|
||||
|
Loading…
x
Reference in New Issue
Block a user