From 92aabf2b230312a5ddb4d719bf4f47e0ccd2b9af Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@cs.stanford.edu>
Date: Wed, 2 Mar 2016 02:05:54 -0800
Subject: [PATCH] Fix #76: linkify no longer includes dots at end of URL.

Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
---
 NEWS                                        |  2 ++
 library/HTMLPurifier/Injector/Linkify.php   | 11 ++++++++---
 tests/HTMLPurifier/Injector/LinkifyTest.php |  8 ++++++++
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/NEWS b/NEWS
index b8f09cfa..d563c186 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,8 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 ! %CSS.AllowDuplicates permits duplicate CSS properties.
 - alt truncation could result in malformed UTF-8 sequence. Don't
   truncate.  Thanks Brandon Farber for reporting.
+- Linkify regex is smarter, based on Gruber's regex.
+- IDNA supported natively on PHP 5.3 and later.
 
 4.7.0, released 2015-08-04
 # opacity is now considered a "tricky" CSS property rather than a
diff --git a/library/HTMLPurifier/Injector/Linkify.php b/library/HTMLPurifier/Injector/Linkify.php
index 069708c2..74f83eaa 100644
--- a/library/HTMLPurifier/Injector/Linkify.php
+++ b/library/HTMLPurifier/Injector/Linkify.php
@@ -31,9 +31,14 @@ class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
             return;
         }
 
-        // there is/are URL(s). Let's split the string:
-        // Note: this regex is extremely permissive
-        $bits = preg_split('#((?:https?|ftp)://[^\s\'",<>()]+)#Su', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);
+        // there is/are URL(s). Let's split the string.
+        // We use this regex:
+        // https://gist.github.com/gruber/249502
+        // but with @cscott's backtracking fix and with the literal
+        // Unicode quote characters written as \x{...} escapes.
+        $bits = preg_split(
+            '/\\b((?:[a-z][\\w\\-]+:(?:\\/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}\\/)(?:[^\\s()<>]|\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\))+(?:\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'".,<>?\x{00ab}\x{00bb}\x{201c}\x{201d}\x{2018}\x{2019}]))/iu',
+            $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);
 
 
         $token = array();
diff --git a/tests/HTMLPurifier/Injector/LinkifyTest.php b/tests/HTMLPurifier/Injector/LinkifyTest.php
index 1954db6b..8eeac449 100644
--- a/tests/HTMLPurifier/Injector/LinkifyTest.php
+++ b/tests/HTMLPurifier/Injector/LinkifyTest.php
@@ -52,6 +52,14 @@ class HTMLPurifier_Injector_LinkifyTest extends HTMLPurifier_InjectorHarness
         $this->assertResult('<a><span>http://example.com</span></a>');
     }
 
+    public function testRegexIsSmart()
+    {
+        $this->assertResult('http://example.com/foo.', '<a href="http://example.com/foo">http://example.com/foo</a>.');
+        $this->assertResult('“http://example.com/foo”', '“<a href="http://example.com/foo">http://example.com/foo</a>”');
+        $this->assertResult('“http://example.com”', '“<a href="http://example.com">http://example.com</a>”');
+        $this->assertResult('(http://example.com/f(o)o)', '(<a href="http://example.com/f(o)o">http://example.com/f(o)o</a>)');
+    }
+
 }
 
 // vim: et sw=4 sts=4