From 461084d400620c32ef262aab1f837573f47b0951 Mon Sep 17 00:00:00 2001 From: Michael Foster Date: Tue, 30 Jul 2013 16:41:10 -0400 Subject: [PATCH] MySQL's utf8 charset only supports up to 3-byte symbols. Inserting four-byte symbols (U+010000 to U+10FFFF) can be done maliciously to break HTML mark-up. The ideal solution was to convert to MySQL's utf8mb4 charset, but then we would lose support for MySQL < 5.5.3. In this fix, incompatible characters are encoded as HTML numeric character references (e.g. &#65536;) and just stripped from body_nomarkup. --- inc/functions.php | 2 +- post.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inc/functions.php b/inc/functions.php index 2915e1b6..4af7891a 100644 --- a/inc/functions.php +++ b/inc/functions.php @@ -1523,7 +1523,7 @@ function markup(&$body, $track_cites = false) { } function utf8tohtml($utf8) { - return htmlspecialchars($utf8, ENT_NOQUOTES, 'UTF-8'); + return mb_encode_numericentity(htmlspecialchars($utf8, ENT_NOQUOTES, 'UTF-8'), array(0x010000, 0xffffff, 0, 0xffffff), 'UTF-8'); } function ordutf8($string, &$offset) { diff --git a/post.php b/post.php index ad045b66..e9041ce8 100644 --- a/post.php +++ b/post.php @@ -378,7 +378,7 @@ if (isset($_POST['delete'])) { wordfilters($post['body']); - $post['body_nomarkup'] = $post['body']; + $post['body_nomarkup'] = preg_replace('/[\x{010000}-\x{ffffff}]/u', '', $post['body']); if (!($mod && isset($post['raw']) && $post['raw'])) $post['tracked_cites'] = markup($post['body'], true);