From 062b4110d4324bd437e854383f545e30649c79cb Mon Sep 17 00:00:00 2001
From: David Mudrak
Date: Thu, 7 Oct 2010 09:01:15 +0000
Subject: [PATCH] MDL-24542 New filter urltolink, replacing core convert_urls_into_links() function

---
 filter/urltolink/filter.php | 141 ++++++++++++
 filter/urltolink/filtersettings.php | 33 +++
 filter/urltolink/lang/en/filter_urltolink.php | 31 +++
 filter/urltolink/simpletest/sample.txt | 16 ++
 filter/urltolink/simpletest/testfilter.php | 201 ++++++++++++++++++
 filter/urltolink/version.php | 27 +++
 6 files changed, 449 insertions(+)
 create mode 100644 filter/urltolink/filter.php
 create mode 100644 filter/urltolink/filtersettings.php
 create mode 100644 filter/urltolink/lang/en/filter_urltolink.php
 create mode 100644 filter/urltolink/simpletest/sample.txt
 create mode 100644 filter/urltolink/simpletest/testfilter.php
 create mode 100644 filter/urltolink/version.php

diff --git a/filter/urltolink/filter.php b/filter/urltolink/filter.php
new file mode 100644
index 00000000000..f1ed7ae8392
--- /dev/null
+++ b/filter/urltolink/filter.php
@@ -0,0 +1,141 @@
+.
+
+/**
+ * Filter converting URLs in the text to HTML links
+ *
+ * @package filter
+ * @subpackage urltolink
+ * @copyright 2010 David Mudrak
+ * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
+ */
+
+defined('MOODLE_INTERNAL') || die();
+
+/**
+ * Text filter that turns plain URLs found in the text into HTML links.
+ *
+ * The conversion is applied only to texts whose original format is listed
+ * in the plugin's 'formats' setting.
+ */
+class filter_urltolink extends moodle_text_filter {
+
+    /**
+     * @var object|null global configuration for this filter, null until first loaded
+     *
+     * This might be eventually moved into parent class if we found it
+     * useful for other filters, too.
+     */
+    protected static $globalconfig;
+
+    /**
+     * Apply the filter to the text
+     *
+     * The text is returned unchanged when the 'originalformat' option is
+     * missing (a developer debugging message is raised), when no format is
+     * configured, or when the original format is not one of the configured ones.
+     *
+     * @see filter_manager::apply_filter_chain()
+     * @param string $text to be processed by the filter
+     * @param array $options filter options, 'originalformat' is required
+     * @return string text after processing
+     */
+    public function filter($text, array $options = array()) {
+        if (!isset($options['originalformat'])) {
+            debugging('filter_urltolink requires originalformat option to be provided', DEBUG_DEVELOPER);
+            return $text;
+        }
+
+        $formats = $this->get_global_config('formats');
+        if (is_null($formats) || $formats === '') {
+            // Nothing is configured, so no format is eligible for conversion.
+            return $text;
+        }
+
+        // Compare as strings and strictly: a loose comparison lets integer 0
+        // match a non-numeric list entry on PHP versions before 8.0.
+        if (in_array((string)$options['originalformat'], explode(',', $formats), true)) {
+            $this->convert_urls_into_links($text);
+        }
+        return $text;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    // internal implementation starts here
+    ////////////////////////////////////////////////////////////////////////////
+
+    /**
+     * Returns the global filter setting
+     *
+     * If the $name is provided, returns single value. Otherwise returns all
+     * global settings in object. Returns null if the named setting is not
+     * found.
+     *
+     * @param mixed $name optional config variable name, defaults to null for all
+     * @return string|object|null
+     */
+    protected function get_global_config($name=null) {
+        $this->load_global_config();
+        if (is_null($name)) {
+            return self::$globalconfig;
+        }
+        // isset() rather than array_key_exists(): the configuration is read as
+        // an object on the next line, and array_key_exists() no longer accepts
+        // objects as of PHP 8.0. A property holding null yields null either way.
+        if (isset(self::$globalconfig->{$name})) {
+            return self::$globalconfig->{$name};
+        }
+        return null;
+    }
+
+    /**
+     * Makes sure that the global config is loaded in self::$globalconfig
+     *
+     * The configuration is fetched with get_config() only while the static
+     * property is still null, so it is shared by all instances of the filter.
+     *
+     * @return void
+     */
+    protected function load_global_config() {
+        if (is_null(self::$globalconfig)) {
+            self::$globalconfig = get_config('filter_urltolink');
+        }
+    }
+
+    /**
+     * Given some text this function converts any URLs it finds into HTML links
+     *
+     * @param string $text Passed in by reference. The string to be searched for urls.
+     * @return void
+     */
+    protected function convert_urls_into_links(&$text) {
+        // NOTE(review): the tag lists and regular expressions in this method look
+        // truncated in this copy of the patch (no closing '#' delimiter, empty
+        // closing-tag list) - confirm them against the original commit.
+
+        //I've added img tags to this list of tags to ignore.
+        //See MDL-21168 for more info. A better way to ignore tags whether or not
+        //they are escaped partially or completely would be desirable. For example:
+        //
+        //<a href="blah">
+        //<a href="blah">
+        $filterignoretagsopen = array(']+?>');
+        $filterignoretagsclose = array('');
+        // Start from an empty array so the by-reference argument is never an undefined variable.
+        $ignoretags = array();
+        filter_save_ignore_tags($text,$filterignoretagsopen,$filterignoretagsclose,$ignoretags);
+
+        // Check if we support unicode modifiers in regular expressions. Cache it.
+        // TODO: this check should be a environment requirement in Moodle 2.0, as far as unicode
+        // chars are going to arrive to URLs officially really soon (2010?)
+        // Original RFC regex from: http://www.bytemycode.com/snippets/snippet/796/
+        // Various ideas from: http://alanstorm.com/url_regex_explained
+        // Unicode check, negative assertion and other bits from Moodle.
+        static $unicoderegexp;
+        if (!isset($unicoderegexp)) {
+            $unicoderegexp = @preg_match('/\pL/u', 'a'); // This will fail silently, returning false,
+        }
+
+        //todo: MDL-21296 - use of unicode modifiers may cause a timeout
+        if ($unicoderegexp) { //We can use unicode modifiers
+            $text = preg_replace('#(?\\1', $text);
+            $text = preg_replace('#(?\\1', $text);
+        } else { //We cannot use unicode modifiers
+            $text = preg_replace('#(?\\1', $text);
+            $text = preg_replace('#(?\\1', $text);
+        }
+
+        if (!empty($ignoretags)) {
+            $ignoretags = array_reverse($ignoretags); /// Reversed so "progressive" str_replace() will solve some nesting problems.
+            $text = str_replace(array_keys($ignoretags),$ignoretags,$text);
+        }
+    }
+}
diff --git a/filter/urltolink/filtersettings.php b/filter/urltolink/filtersettings.php
new file mode 100644
index 00000000000..1fa15aa8bff
--- /dev/null
+++ b/filter/urltolink/filtersettings.php
@@ -0,0 +1,33 @@
+.
+ +/** + * Settings for filter_urltolink: registers the 'filter_urltolink/formats' multi-checkbox setting + * + * @package filter + * @subpackage urltolink + * @copyright 2010 David Mudrak + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ + +defined('MOODLE_INTERNAL') || die(); + +if ($ADMIN->fulltree) { + + $settings->add(new admin_setting_configmulticheckbox('filter_urltolink/formats', + get_string('settingformats', 'filter_urltolink'), + get_string('settingformats_desc', 'filter_urltolink'), + array(FORMAT_MOODLE => 1), format_text_menu())); +} diff --git a/filter/urltolink/lang/en/filter_urltolink.php b/filter/urltolink/lang/en/filter_urltolink.php new file mode 100644 index 00000000000..c51afbf527e --- /dev/null +++ b/filter/urltolink/lang/en/filter_urltolink.php @@ -0,0 +1,31 @@ +. + +/** + * Strings for filter_urltolink + * + * @package filter + * @subpackage urltolink + * @copyright 2010 David Mudrak + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ + +defined('MOODLE_INTERNAL') || die(); + +$string['filtername'] = 'Convert URLs into links'; +$string['settingformats'] = 'Apply to formats'; +$string['settingformats_desc'] = 'The filter will be applied only if the original text was inserted in one of the selected formats.'; diff --git a/filter/urltolink/simpletest/sample.txt b/filter/urltolink/simpletest/sample.txt new file mode 100644 index 00000000000..23a45fef8f3 --- /dev/null +++ b/filter/urltolink/simpletest/sample.txt @@ -0,0 +1,16 @@ +http://www.lipsum.com +Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. 
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. +Why do we use it?dummy + +It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like). + +Where does it come from? + +Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32. + +The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham. +Where can I get some? 
+ +There are many variations of passages of Lorem Ipsum available, but the majority have suffered alteration in some form, by injected humour, or randomised words which don't look even slightly believable. If you are going to use a passage of Lorem Ipsum, you need to be sure there isn't anything embarrassing hidden in the middle of text. All the Lorem Ipsum generators on the Internet tend to repeat predefined chunks as necessary, making this the first true generator on the Internet. It uses a dictionary of over 200 Latin words, combined with a handful of model sentence structures, to generate Lorem Ipsum which looks reasonable. The generated Lorem Ipsum is therefore always free from repetition, injected humour, or non-characteristic words etc. +Wikipedia +http://www.lorem-ipsum.info/ diff --git a/filter/urltolink/simpletest/testfilter.php b/filter/urltolink/simpletest/testfilter.php new file mode 100644 index 00000000000..23103785328 --- /dev/null +++ b/filter/urltolink/simpletest/testfilter.php @@ -0,0 +1,201 @@ +. + +/** + * Unit test for the filter_urltolink + * + * @package filter + * @subpackage urltolink + * @copyright 2010 David Mudrak + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ + +defined('MOODLE_INTERNAL') || die(); + +require_once($CFG->dirroot . '/filter/urltolink/filter.php'); // Include the code to test + +/** + * Test subclass that makes all the protected methods we want to test public. + */ +class testable_filter_urltolink extends filter_urltolink { + public function __construct() { + } + public function convert_urls_into_links(&$text) { + parent::convert_urls_into_links($text); + } +} + +/** + * Test cases for filter_urltolink class + */ +class filter_urltolink_test extends UnitTestCase { + + /** + * Helper function that represents the legacy implementation + * of convert_urls_into_links() + */ + protected function old_convert_urls_into_links(&$text) { + /// Make lone URLs into links. 
eg http://moodle.com/ + $text = preg_replace("%([[:space:]]|^|\(|\[)([[:alnum:]]+)://([^[:space:]]*)([[:alnum:]#?/&=])%i", + '$1$2://$3$4', $text); + /// eg www.moodle.com + $text = preg_replace("%([[:space:]]|^|\(|\[)www\.([^[:space:]]*)([[:alnum:]#?/&=])%i", + '$1www.$2$3', $text); + } + + function test_convert_urls_into_links() { + $texts = array ( + //just a url + 'http://moodle.org - URL' => 'http://moodle.org - URL', + 'www.moodle.org - URL' => 'www.moodle.org - URL', + //url with params + 'URL: http://moodle.org/s/i=1&j=2' => 'URL: http://moodle.org/s/i=1&j=2', + //url with escaped params + 'URL: www.moodle.org/s/i=1&j=2' => 'URL: www.moodle.org/s/i=1&j=2', + //https url with params + 'URL: https://moodle.org/s/i=1&j=2' => 'URL: https://moodle.org/s/i=1&j=2', + //url with port and params + 'URL: http://moodle.org:8080/s/i=1' => 'URL: http://moodle.org:8080/s/i=1', + //url in brackets + '(http://moodle.org) - URL' => '(http://moodle.org) - URL', + '(www.moodle.org) - URL' => '(www.moodle.org) - URL', + //url in square brackets + '[http://moodle.org] - URL' => '[http://moodle.org] - URL', + '[www.moodle.org] - URL' => '[www.moodle.org] - URL', + //url in brackets with anchor + '[http://moodle.org/main#anchor] - URL' => '[http://moodle.org/main#anchor] - URL', + '[www.moodle.org/main#anchor] - URL' => '[www.moodle.org/main#anchor] - URL', + //brackets within the url + 'URL: http://cc.org/url_(withpar)_go/?i=2' => 'URL: http://cc.org/url_(withpar)_go/?i=2', + 'URL: www.cc.org/url_(withpar)_go/?i=2' => 'URL: www.cc.org/url_(withpar)_go/?i=2', + 'URL: http://cc.org/url_(with)_(par)_go/?i=2' => 'URL: http://cc.org/url_(with)_(par)_go/?i=2', + 'URL: www.cc.org/url_(with)_(par)_go/?i=2' => 'URL: www.cc.org/url_(with)_(par)_go/?i=2', + 'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'http://en.wikipedia.org/wiki/Slash_(punctuation)', + 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL', + 
'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL', + //escaped brackets in url + 'http://en.wikipedia.org/wiki/Slash_%28punctuation%29'=>'http://en.wikipedia.org/wiki/Slash_%28punctuation%29', + //anchor tag + 'URL: http://moodle.org' => 'URL: http://moodle.org', + 'URL: www.moodle.org' => 'URL: www.moodle.org', + 'URL: http://moodle.org' => 'URL: http://moodle.org', + 'URL: www.moodle.org' => 'URL: www.moodle.org', + //escaped anchor tag. Commented out as part of MDL-21183 + //htmlspecialchars('escaped anchor tag www.moodle.org') => 'escaped anchor tag <a href="http://moodle.org"> www.moodle.org</a>', + //trailing fullstop + 'URL: http://moodle.org/s/i=1&j=2.' => 'URL: http://moodle.org/s/i=1&j=2.', + 'URL: www.moodle.org/s/i=1&j=2.' => 'URL: www.moodle.org/s/i=1&j=2.', + //trailing unmatched bracket + 'URL: http://moodle.org)
' => 'URL: http://moodle.org)
', + //partially escaped html + 'URL:

text www.moodle.org</p> text' => 'URL:

text www.moodle.org</p> text', + //decimal url parameter + 'URL: www.moodle.org?u=1.23' => 'URL: www.moodle.org?u=1.23', + //escaped space in url + 'URL: www.moodle.org?u=test+param&' => 'URL: www.moodle.org?u=test+param&', + //odd characters in url param + 'URL: www.moodle.org?param=:)' => 'URL: www.moodle.org?param=:)', + //multiple urls + 'URL: http://moodle.org www.moodle.org' + => 'URL: http://moodle.org www.moodle.org', + //containing anchor tags including a class parameter and a url to convert + 'URL: http://moodle.org www.moodle.org http://moodle.org' + => 'URL: http://moodle.org www.moodle.org http://moodle.org', + //subdomain + 'http://subdomain.moodle.org - URL' => 'http://subdomain.moodle.org - URL', + //multiple subdomains + 'http://subdomain.subdomain.moodle.org - URL' => 'http://subdomain.subdomain.moodle.org - URL', + //looks almost like a link but isnt + 'This contains http, http:// and www but no actual links.'=>'This contains http, http:// and www but no actual links.', + //no link at all + 'This is a story about moodle.coming to a cinema near you.'=>'This is a story about moodle.coming to a cinema near you.', + //URLs containing utf 8 characters + 'http://Iñtërnâtiônàlizætiøn.com?ô=nëø'=>'http://Iñtërnâtiônàlizætiøn.com?ô=nëø', + 'www.Iñtërnâtiônàlizætiøn.com?ô=nëø'=>'www.Iñtërnâtiônàlizætiøn.com?ô=nëø', + //text containing utf 8 characters outside of a url + 'Iñtërnâtiônàlizætiøn is important to http://moodle.org'=>'Iñtërnâtiônàlizætiøn is important to http://moodle.org', + //too hard to identify without additional regexs + 'moodle.org' => 'moodle.org', + //some text with no link between related html tags + 'no link here' => 'no link here', + //some text with a link between related html tags + 'a link here www.moodle.org' => 'a link here www.moodle.org', + //some text containing a link within unrelated tags + '
This is some text. www.moodle.com then some more text
' => '
This is some text. www.moodle.com then some more text
', + //check we aren't modifying img tags + 'image' => 'image', + 'image' => 'image', + //and another url within one tag + ' ' => ' ', + ' ' => ' ', + '

'=>'', + //partially escaped img tag + 'partially escaped img tag <img src="http://moodle.org/logo/logo-240x60.gif" />' => 'partially escaped img tag <img src="http://moodle.org/logo/logo-240x60.gif" />', + //fully escaped img tag. Commented out as part of MDL-21183 + //htmlspecialchars('fully escaped img tag ') => 'fully escaped img tag <img src="http://moodle.org/logo/logo-240x60.gif" />', + //Double http with www + 'One more link like http://www.moodle.org to test' => 'One more link like http://www.moodle.org to test', + //Encoded URLs in the path + 'URL: http://127.0.0.1/one%28parenthesis%29/path?param=value' => 'URL: http://127.0.0.1/one%28parenthesis%29/path?param=value', + 'URL: www.localhost.com/one%28parenthesis%29/path?param=value' => 'URL: www.localhost.com/one%28parenthesis%29/path?param=value', + //Encoded URLs in the query + 'URL: http://127.0.0.1/path/to?param=value_with%28parenthesis%29¶m2=1' => 'URL: http://127.0.0.1/path/to?param=value_with%28parenthesis%29¶m2=1', + 'URL: www.localhost.com/path/to?param=value_with%28parenthesis%29¶m2=1' => 'URL: www.localhost.com/path/to?param=value_with%28parenthesis%29¶m2=1', + //URLs in Javascript. Commented out as part of MDL-21183 + //'var url="http://moodle.org";'=>'var url="http://moodle.org";', + //'var url = "http://moodle.org";'=>'var url = "http://moodle.org";', + //'var url="www.moodle.org";'=>'var url="www.moodle.org";', + //'var url = "www.moodle.org";'=>'var url = "www.moodle.org";', + //doctype. do we care about this failing? + //''=>'' + ); + + $testablefilter = new testable_filter_urltolink(); + + foreach ($texts as $text => $correctresult) { + $msg = "Testing text: ". str_replace('%', '%%', $text) . ": %s"; // Escape original '%' so sprintf() wont get confused + + $testablefilter->convert_urls_into_links($text); + + $this->assertEqual($text, $correctresult, $msg); + } + + //performance testing + $reps = 1000; + $text = file_get_contents(dirname(__FILE__) . 
'/sample.txt'); + $time_start = microtime(true); + for($i=0;$i<$reps;$i++) { + $testablefilter->convert_urls_into_links($text); + } + $time_end = microtime(true); + $new_time = $time_end - $time_start; + + $time_start = microtime(true); + for($i=0;$i<$reps;$i++) { + $this->old_convert_urls_into_links($text); + } + $time_end = microtime(true); + $old_time = $time_end - $time_start; + + $fast_enough = false; + if( $new_time < $old_time ) { + $fast_enough = true; + } + + $this->assertEqual($fast_enough, true, 'Timing test: ' . $new_time . 'secs (new) < ' . $old_time . 'secs (old)'); + } +} diff --git a/filter/urltolink/version.php b/filter/urltolink/version.php new file mode 100644 index 00000000000..8618552c2b3 --- /dev/null +++ b/filter/urltolink/version.php @@ -0,0 +1,27 @@ +. + +/** + * @package filter + * @subpackage urltolink + * @copyright 2010 David Mudrak + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ + +defined('MOODLE_INTERNAL') || die(); + +$plugin->version = 2010100500;