mirror of
https://github.com/moodle/moodle.git
synced 2025-05-04 15:27:34 +02:00
MDL-35364 shorten text: don't return invalid HTML.
I also broke the unit tests into more, smaller, named tests, so that when things start failing, more tests are run, and it is clearer what the problem is. In a couple of cases, I adjusted the $ideal lenght in the test. A careful counting of the characters in the test input (added as comments) suggests that the new values make for better tests.
This commit is contained in:
parent
3a8c4380c0
commit
4a7cc24808
@ -9413,13 +9413,13 @@ function shorten_text($text, $ideal=30, $exact = false, $ending='...') {
|
||||
|
||||
global $CFG;
|
||||
|
||||
// if the plain text is shorter than the maximum length, return the whole text
|
||||
// If the plain text is shorter than the maximum length, return the whole text.
|
||||
if (textlib::strlen(preg_replace('/<.*?>/', '', $text)) <= $ideal) {
|
||||
return $text;
|
||||
}
|
||||
|
||||
// Splits on HTML tags. Each open/close/empty tag will be the first thing
|
||||
// and only tag in its 'line'
|
||||
// and only tag in its 'line'.
|
||||
preg_match_all('/(<.+?>)?([^<>]*)/s', $text, $lines, PREG_SET_ORDER);
|
||||
|
||||
$total_length = textlib::strlen($ending);
|
||||
@ -9428,37 +9428,43 @@ function shorten_text($text, $ideal=30, $exact = false, $ending='...') {
|
||||
// This array stores information about open and close tags and their position
|
||||
// in the truncated string. Each item in the array is an object with fields
|
||||
// ->open (true if open), ->tag (tag name in lower case), and ->pos
|
||||
// (byte position in truncated text)
|
||||
// (byte position in truncated text).
|
||||
$tagdetails = array();
|
||||
|
||||
foreach ($lines as $line_matchings) {
|
||||
// if there is any html-tag in this line, handle it and add it (uncounted) to the output
|
||||
// If there is any html-tag in this line, handle it and add it (uncounted) to the output.
|
||||
if (!empty($line_matchings[1])) {
|
||||
// if it's an "empty element" with or without xhtml-conform closing slash (f.e. <br/>)
|
||||
// If it's an "empty element" with or without xhtml-conform closing slash (f.e. <br/>).
|
||||
if (preg_match('/^<(\s*.+?\/\s*|\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(\s.+?)?)>$/is', $line_matchings[1])) {
|
||||
// do nothing
|
||||
// if tag is a closing tag (f.e. </b>)
|
||||
// Do nothing.
|
||||
|
||||
} else if (preg_match('/^<\s*\/([^\s]+?)\s*>$/s', $line_matchings[1], $tag_matchings)) {
|
||||
// record closing tag
|
||||
$tagdetails[] = (object)array('open'=>false,
|
||||
'tag'=>textlib::strtolower($tag_matchings[1]), 'pos'=>textlib::strlen($truncate));
|
||||
// if tag is an opening tag (f.e. <b>)
|
||||
// Record closing tag.
|
||||
$tagdetails[] = (object) array(
|
||||
'open' => false,
|
||||
'tag' => textlib::strtolower($tag_matchings[1]),
|
||||
'pos' => textlib::strlen($truncate),
|
||||
);
|
||||
|
||||
} else if (preg_match('/^<\s*([^\s>!]+).*?>$/s', $line_matchings[1], $tag_matchings)) {
|
||||
// record opening tag
|
||||
$tagdetails[] = (object)array('open'=>true,
|
||||
'tag'=>textlib::strtolower($tag_matchings[1]), 'pos'=>textlib::strlen($truncate));
|
||||
// Record opening tag.
|
||||
$tagdetails[] = (object) array(
|
||||
'open' => true,
|
||||
'tag' => textlib::strtolower($tag_matchings[1]),
|
||||
'pos' => textlib::strlen($truncate),
|
||||
);
|
||||
}
|
||||
// add html-tag to $truncate'd text
|
||||
// Add html-tag to $truncate'd text.
|
||||
$truncate .= $line_matchings[1];
|
||||
}
|
||||
|
||||
// calculate the length of the plain text part of the line; handle entities as one character
|
||||
// Calculate the length of the plain text part of the line; handle entities as one character.
|
||||
$content_length = textlib::strlen(preg_replace('/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i', ' ', $line_matchings[2]));
|
||||
if ($total_length + $content_length > $ideal) {
|
||||
// the number of characters which are left
|
||||
// The number of characters which are left.
|
||||
$left = $ideal - $total_length;
|
||||
$entities_length = 0;
|
||||
// search for html entities
|
||||
// Search for html entities.
|
||||
if (preg_match_all('/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i', $line_matchings[2], $entities, PREG_OFFSET_CAPTURE)) {
|
||||
// calculate the real length of all entities in the legal range
|
||||
foreach ($entities[0] as $entity) {
|
||||
@ -9471,7 +9477,32 @@ function shorten_text($text, $ideal=30, $exact = false, $ending='...') {
|
||||
}
|
||||
}
|
||||
}
|
||||
$truncate .= textlib::substr($line_matchings[2], 0, $left+$entities_length);
|
||||
$breakpos = $left + $entities_length;
|
||||
|
||||
// if the words shouldn't be cut in the middle...
|
||||
if (!$exact) {
|
||||
// ...search the last occurence of a space...
|
||||
for (; $breakpos > 0; $breakpos--) {
|
||||
if ($char = textlib::substr($line_matchings[2], $breakpos, 1)) {
|
||||
if ($char === '.' or $char === ' ') {
|
||||
$breakpos += 1;
|
||||
break;
|
||||
} else if (strlen($char) > 2) { // Chinese/Japanese/Korean text
|
||||
$breakpos += 1; // can be truncated at any UTF-8
|
||||
break; // character boundary.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if ($breakpos == 0) {
|
||||
// This deals with the test_shorten_text_no_spaces case.
|
||||
$breakpos = $left + $entities_length;
|
||||
} else if ($breakpos > $left + $entities_length) {
|
||||
// This deals with the previous for loop breaking on the first char.
|
||||
$breakpos = $left + $entities_length;
|
||||
}
|
||||
|
||||
$truncate .= textlib::substr($line_matchings[2], 0, $breakpos);
|
||||
// maximum length is reached, so get off the loop
|
||||
break;
|
||||
} else {
|
||||
@ -9479,55 +9510,31 @@ function shorten_text($text, $ideal=30, $exact = false, $ending='...') {
|
||||
$total_length += $content_length;
|
||||
}
|
||||
|
||||
// if the maximum length is reached, get off the loop
|
||||
// If the maximum length is reached, get off the loop.
|
||||
if($total_length >= $ideal) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// if the words shouldn't be cut in the middle...
|
||||
if (!$exact) {
|
||||
// ...search the last occurence of a space...
|
||||
for ($k=textlib::strlen($truncate);$k>0;$k--) {
|
||||
if ($char = textlib::substr($truncate, $k, 1)) {
|
||||
if ($char === '.' or $char === ' ') {
|
||||
$breakpos = $k+1;
|
||||
break;
|
||||
} else if (strlen($char) > 2) { // Chinese/Japanese/Korean text
|
||||
$breakpos = $k+1; // can be truncated at any UTF-8
|
||||
break; // character boundary.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (isset($breakpos)) {
|
||||
// ...and cut the text in this position
|
||||
$truncate = textlib::substr($truncate, 0, $breakpos);
|
||||
}
|
||||
}
|
||||
|
||||
// add the defined ending to the text
|
||||
// Add the defined ending to the text.
|
||||
$truncate .= $ending;
|
||||
|
||||
// Now calculate the list of open html tags based on the truncate position
|
||||
// Now calculate the list of open html tags based on the truncate position.
|
||||
$open_tags = array();
|
||||
foreach ($tagdetails as $taginfo) {
|
||||
if(isset($breakpos) && $taginfo->pos >= $breakpos) {
|
||||
// Don't include tags after we made the break!
|
||||
break;
|
||||
}
|
||||
if ($taginfo->open) {
|
||||
// add tag to the beginning of $open_tags list
|
||||
// Add tag to the beginning of $open_tags list.
|
||||
array_unshift($open_tags, $taginfo->tag);
|
||||
} else {
|
||||
$pos = array_search($taginfo->tag, array_reverse($open_tags, true)); // can have multiple exact same open tags, close the last one
|
||||
// Can have multiple exact same open tags, close the last one.
|
||||
$pos = array_search($taginfo->tag, array_reverse($open_tags, true));
|
||||
if ($pos !== false) {
|
||||
unset($open_tags[$pos]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// close all unclosed html-tags
|
||||
// Close all unclosed html-tags.
|
||||
foreach ($open_tags as $tag) {
|
||||
$truncate .= '</' . $tag . '>';
|
||||
}
|
||||
|
@ -1097,62 +1097,95 @@ class moodlelib_testcase extends advanced_testcase {
|
||||
}
|
||||
}
|
||||
|
||||
function test_shorten_text() {
|
||||
function test_shorten_text_no_tags_already_short_enough() {
|
||||
// ......12345678901234567890123456.
|
||||
$text = "short text already no tags";
|
||||
$this->assertEquals($text, shorten_text($text));
|
||||
}
|
||||
|
||||
function test_shorten_text_with_tags_already_short_enough() {
|
||||
// .........123456...7890....12345678.......901234567.
|
||||
$text = "<p>short <b>text</b> already</p><p>with tags</p>";
|
||||
$this->assertEquals($text, shorten_text($text));
|
||||
}
|
||||
|
||||
function test_shorten_text_no_tags_needs_shortening() {
|
||||
// Default truncation is after 30 chars, but allowing 3 for the final '...'.
|
||||
// ......12345678901234567890123456789023456789012345678901234.
|
||||
$text = "long text without any tags blah de blah blah blah what";
|
||||
$this->assertEquals('long text without any tags ...', shorten_text($text));
|
||||
}
|
||||
|
||||
function test_shorten_text_with_tags_needs_shortening() {
|
||||
// .......................................123456789012345678901234567890...
|
||||
$text = "<div class='frog'><p><blockquote>Long text with tags that will ".
|
||||
"be chopped off but <b>should be added back again</b></blockquote></p></div>";
|
||||
$this->assertEquals("<div class='frog'><p><blockquote>Long text with " .
|
||||
"tags that ...</blockquote></p></div>", shorten_text($text));
|
||||
}
|
||||
|
||||
function test_shorten_text_with_entities() {
|
||||
// Remember to allow 3 chars for the final '...'.
|
||||
// ......123456789012345678901234567_____890...
|
||||
$text = "some text which shouldn't break there";
|
||||
$this->assertEquals("some text which shouldn't ...",
|
||||
shorten_text($text, 31));
|
||||
$this->assertEquals("some text which shouldn't ...",
|
||||
$this->assertEquals("some text which shouldn't ...",
|
||||
shorten_text($text, 30));
|
||||
$this->assertEquals("some text which shouldn't ...",
|
||||
shorten_text($text, 29));
|
||||
}
|
||||
|
||||
function test_shorten_text_known_tricky_case() {
|
||||
// This case caused a bug up to 1.9.5
|
||||
// ..........123456789012345678901234567890123456789.....0_____1___2___...
|
||||
$text = "<h3>standard 'break-out' sub groups in TGs?</h3> <<There are several";
|
||||
$this->assertEquals("<h3>standard 'break-out' sub groups in ...</h3>",
|
||||
shorten_text($text, 41));
|
||||
$this->assertEquals("<h3>standard 'break-out' sub groups in TGs?...</h3>",
|
||||
shorten_text($text, 42));
|
||||
$this->assertEquals("<h3>standard 'break-out' sub groups in TGs?</h3> ...",
|
||||
shorten_text($text, 43));
|
||||
}
|
||||
|
||||
$text = "<h1>123456789</h1>";//a string with no convenient breaks
|
||||
function test_shorten_text_no_spaces() {
|
||||
// ..........123456789.
|
||||
$text = "<h1>123456789</h1>"; // A string with no convenient breaks.
|
||||
$this->assertEquals("<h1>12345...</h1>",
|
||||
shorten_text($text, 8));
|
||||
}
|
||||
|
||||
// ==== this must work with UTF-8 too! ======
|
||||
|
||||
// text without tags
|
||||
function test_shorten_text_utf8_european() {
|
||||
// Text without tags.
|
||||
// ......123456789012345678901234567.
|
||||
$text = "Žluťoučký koníček přeskočil";
|
||||
$this->assertEquals($text, shorten_text($text)); // 30 chars by default
|
||||
$this->assertEquals($text, shorten_text($text)); // 30 chars by default.
|
||||
$this->assertEquals("Žluťoučký koníče...", shorten_text($text, 19, true));
|
||||
$this->assertEquals("Žluťoučký ...", shorten_text($text, 19, false));
|
||||
// And try it with 2-less (that are, in bytes, the middle of a sequence)
|
||||
// And try it with 2-less (that are, in bytes, the middle of a sequence).
|
||||
$this->assertEquals("Žluťoučký koní...", shorten_text($text, 17, true));
|
||||
$this->assertEquals("Žluťoučký ...", shorten_text($text, 17, false));
|
||||
|
||||
// .........123456789012345678...901234567....89012345.
|
||||
$text = "<p>Žluťoučký koníček <b>přeskočil</b> potůček</p>";
|
||||
$this->assertEquals($text, shorten_text($text, 60));
|
||||
$this->assertEquals("<p>Žluťoučký koníček ...</p>", shorten_text($text, 21));
|
||||
$this->assertEquals("<p>Žluťoučký koníče...</p>", shorten_text($text, 19, true));
|
||||
$this->assertEquals("<p>Žluťoučký ...</p>", shorten_text($text, 19, false));
|
||||
// And try it with 2-less (that are, in bytes, the middle of a sequence)
|
||||
// And try it with 2 fewer (that are, in bytes, the middle of a sequence).
|
||||
$this->assertEquals("<p>Žluťoučký koní...</p>", shorten_text($text, 17, true));
|
||||
$this->assertEquals("<p>Žluťoučký ...</p>", shorten_text($text, 17, false));
|
||||
// And try over one tag (start/end), it does proper text len
|
||||
// And try over one tag (start/end), it does proper text len.
|
||||
$this->assertEquals("<p>Žluťoučký koníček <b>př...</b></p>", shorten_text($text, 23, true));
|
||||
$this->assertEquals("<p>Žluťoučký koníček <b>přeskočil</b> pot...</p>", shorten_text($text, 34, true));
|
||||
// And in the middle of one tag
|
||||
// And in the middle of one tag.
|
||||
$this->assertEquals("<p>Žluťoučký koníček <b>přeskočil...</b></p>", shorten_text($text, 30, true));
|
||||
}
|
||||
|
||||
function test_shorten_text_utf8_oriental() {
|
||||
// Japanese
|
||||
// text without tags
|
||||
// ......123456789012345678901234.
|
||||
$text = '言語設定言語設定abcdefghijkl';
|
||||
$this->assertEquals($text, shorten_text($text)); // 30 chars by default
|
||||
$this->assertEquals("言語設定言語...", shorten_text($text, 9, true));
|
||||
@ -1161,13 +1194,27 @@ class moodlelib_testcase extends advanced_testcase {
|
||||
$this->assertEquals("言語設定言語設定...", shorten_text($text, 13, false));
|
||||
|
||||
// Chinese
|
||||
// text without tags
|
||||
// ......123456789012345678901234.
|
||||
$text = '简体中文简体中文abcdefghijkl';
|
||||
$this->assertEquals($text, shorten_text($text)); // 30 chars by default
|
||||
$this->assertEquals("简体中文简体...", shorten_text($text, 9, true));
|
||||
$this->assertEquals("简体中文简体...", shorten_text($text, 9, false));
|
||||
$this->assertEquals("简体中文简体中文ab...", shorten_text($text, 13, true));
|
||||
$this->assertEquals("简体中文简体中文...", shorten_text($text, 13, false));
|
||||
}
|
||||
|
||||
function test_shorten_text_multilang() {
|
||||
// This is not necessaryily specific to multilang. The issue is really
|
||||
// tags with attributes, where before we were generating invalid HTML
|
||||
// output like shorten_text('<span id="x" class="y">A</span> B', 1);
|
||||
// returning '<span id="x" ...</span>'. It is just that multilang
|
||||
// requires the sort of HTML that is quite likely to trigger this.
|
||||
// ........................................1...
|
||||
$text = '<span lang="en" class="multilang">A</span>' .
|
||||
'<span lang="fr" class="multilang">B</span>';
|
||||
$this->assertEquals('<span lang="en" class="multilang">...</span>',
|
||||
shorten_text($text, 1));
|
||||
}
|
||||
|
||||
function test_usergetdate() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user