From c2c6af946ce4609144049a5d89480702878d52f8 Mon Sep 17 00:00:00 2001 From: Tim Hunt Date: Fri, 26 Apr 2013 17:51:23 +0100 Subject: [PATCH] MDL-22390 autolink filter: handle URLs in brackets better. This breaks some legitimate URLs like http://en.wikipedia.org/wiki/Slash_(punctuation). This is a necessary trade-off. Many other web systems do not handle that case correctly either. The work-around it so escape the ) as %29. This commit also improves the way the unit tests for this work. It also fixes a couple of other tricky cases that were spotted in the forums while this was being discussed. See the new test cases. --- filter/urltolink/filter.php | 28 ++++++++------ filter/urltolink/tests/filter_test.php | 52 ++++++++++++++++++-------- 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/filter/urltolink/filter.php b/filter/urltolink/filter.php index d7ca920b55d..9d01e8d0eb6 100644 --- a/filter/urltolink/filter.php +++ b/filter/urltolink/filter.php @@ -122,19 +122,25 @@ class filter_urltolink extends moodle_text_filter { $unicoderegexp = @preg_match('/\pL/u', 'a'); // This will fail silently, returning false, } - //todo: MDL-21296 - use of unicode modifiers may cause a timeout - if ($unicoderegexp) { //We can use unicode modifiers - $text = preg_replace('#(?\\1', $text); - $text = preg_replace('#(?\\1', $text); - } else { //We cannot use unicode modifiers - $text = preg_replace('#(?\\1', $text); - $text = preg_replace('#(?\\1', $text); + // TODO MDL-21296 - use of unicode modifiers may cause a timeout + $domainsegment = '(?:[\pLl0-9][\pLl0-9-]*[\pLl0-9]|[\pLl0-9])'; + $numericip = '(?:(?:[0-9]{1,3}\.){3}[0-9]{1,3})'; + $port = '(?::\d*)'; + $pathchar = '(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})'; + $path = "(?:/$pathchar*)*"; + $querystring = '(?:\?(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)'; + $fragment = '(?:\#(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)'; + + $regex = "(?$0', $text); + if (!empty($ignoretags)) { $ignoretags = array_reverse($ignoretags); /// Reversed so "progressive" str_replace() will solve some nesting problems. $text = str_replace(array_keys($ignoretags),$ignoretags,$text); diff --git a/filter/urltolink/tests/filter_test.php b/filter/urltolink/tests/filter_test.php index 95109d17916..70fb3f1d02d 100644 --- a/filter/urltolink/tests/filter_test.php +++ b/filter/urltolink/tests/filter_test.php @@ -44,7 +44,7 @@ class filter_urltolink_testcase extends basic_testcase { '$1www.$2$3', $text); } - function test_convert_urls_into_links() { + function get_convert_urls_into_links_test_cases() { $texts = array ( //just a url 'http://moodle.org - URL' => 'http://moodle.org - URL', @@ -57,13 +57,26 @@ class filter_urltolink_testcase extends basic_testcase { 'URL: https://moodle.org/s/i=1&j=2' => 'URL: https://moodle.org/s/i=1&j=2', //url with port and params 'URL: http://moodle.org:8080/s/i=1' => 'URL: http://moodle.org:8080/s/i=1', - //url in brackets + // URL with complex fragment. + 'Most voted issues: https://tracker.moodle.org/browse/MDL#selectedTab=com.atlassian.jira.plugin.system.project%3Apopularissues-panel' => 'Most voted issues: https://tracker.moodle.org/browse/MDL#selectedTab=com.atlassian.jira.plugin.system.project%3Apopularissues-panel', + // Domain with more parts + 'URL: www.bbc.co.uk.' => 'URL: www.bbc.co.uk.', + // URL in brackets. '(http://moodle.org) - URL' => '(http://moodle.org) - URL', '(www.moodle.org) - URL' => '(www.moodle.org) - URL', - //url in square brackets + // URL in brackets with a path. + '(http://example.com/index.html) - URL' => '(http://example.com/index.html) - URL', + '(www.example.com/index.html) - URL' => '(www.example.com/index.html) - URL', + // URL in brackets with anchor. + '(http://moodle.org/main#anchor) - URL' => '(http://moodle.org/main#anchor) - URL', + '(www.moodle.org/main#anchor) - URL' => '(www.moodle.org/main#anchor) - URL', + // URL in square brackets. '[http://moodle.org] - URL' => '[http://moodle.org] - URL', '[www.moodle.org] - URL' => '[www.moodle.org] - URL', - //url in brackets with anchor + // URL in square brackets with a path. + '[http://example.com/index.html] - URL' => '[http://example.com/index.html] - URL', + '[www.example.com/index.html] - URL' => '[www.example.com/index.html] - URL', + // URL in square brackets with anchor. '[http://moodle.org/main#anchor] - URL' => '[http://moodle.org/main#anchor] - URL', '[www.moodle.org/main#anchor] - URL' => '[www.moodle.org/main#anchor] - URL', //brackets within the url @@ -71,7 +84,8 @@ class filter_urltolink_testcase extends basic_testcase { 'URL: www.cc.org/url_(withpar)_go/?i=2' => 'URL: www.cc.org/url_(withpar)_go/?i=2', 'URL: http://cc.org/url_(with)_(par)_go/?i=2' => 'URL: http://cc.org/url_(with)_(par)_go/?i=2', 'URL: www.cc.org/url_(with)_(par)_go/?i=2' => 'URL: www.cc.org/url_(with)_(par)_go/?i=2', - 'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'http://en.wikipedia.org/wiki/Slash_(punctuation)', + // URL legitimately ending in a bracket. Commented out as part of MDL-22390. See next tests for work-arounds. + // 'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'http://en.wikipedia.org/wiki/Slash_(punctuation)', 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL', 'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL', //escaped brackets in url @@ -94,8 +108,6 @@ class filter_urltolink_testcase extends basic_testcase { 'URL: www.moodle.org?u=1.23' => 'URL: www.moodle.org?u=1.23', //escaped space in url 'URL: www.moodle.org?u=test+param&' => 'URL: www.moodle.org?u=test+param&', - //odd characters in url param - 'URL: www.moodle.org?param=:)' => 'URL: www.moodle.org?param=:)', //multiple urls 'URL: http://moodle.org www.moodle.org' => 'URL: http://moodle.org www.moodle.org', @@ -151,17 +163,25 @@ class filter_urltolink_testcase extends basic_testcase { //''=>'' ); + $data = array(); + foreach ($texts as $text => $correctresult) { + $data[] = array($text, $correctresult); + } + return $data; + } + + /** + * @dataProvider get_convert_urls_into_links_test_cases + */ + function test_convert_urls_into_links($text, $correctresult) { + $testablefilter = new testable_filter_urltolink(); + $testablefilter->convert_urls_into_links($text); + $this->assertEquals($correctresult, $text); + } + + function test_convert_urls_into_links_performance() { $testablefilter = new testable_filter_urltolink(); - foreach ($texts as $text => $correctresult) { - $msg = "Testing text: ". str_replace('%', '%%', $text) . ": %s"; // Escape original '%' so sprintf() wont get confused - - $testablefilter->convert_urls_into_links($text); - - $this->assertEquals($text, $correctresult, $msg); - } - - //performance testing $reps = 1000; $text = file_get_contents(__DIR__ . '/fixtures/sample.txt'); $time_start = microtime(true);