From 6f9e221bd62d2d95170e13aad6ad2a45044c5e2c Mon Sep 17 00:00:00 2001 From: Bryan Davis Date: Wed, 11 Nov 2015 20:33:44 -0700 Subject: [PATCH] Attempt to recover from json encoding errors Detect and attempt to recover from json_encode errors triggered by strings containing invalid UTF-8 sequences. Recovery will only be attempted when encoding strings or arrays. If recovery fails then a RuntimeException will be thrown. The recovery process will convert invalid UTF-8 codepoints as though the input string was encoded using the ISO-8859-15 character encoding. This conversion may result in incorrect string output if the original encoding was not ISO-8859-15, but it will be a valid UTF-8 string. Closes #545 --- src/Monolog/Formatter/NormalizerFormatter.php | 105 ++++++++++++++++-- .../Formatter/LogstashFormatterTest.php | 43 +++++++ .../Formatter/NormalizerFormatterTest.php | 95 +++++++++++++++- 3 files changed, 229 insertions(+), 14 deletions(-) diff --git a/src/Monolog/Formatter/NormalizerFormatter.php b/src/Monolog/Formatter/NormalizerFormatter.php index ce55fbc4..65f3acd7 100644 --- a/src/Monolog/Formatter/NormalizerFormatter.php +++ b/src/Monolog/Formatter/NormalizerFormatter.php @@ -138,25 +138,76 @@ class NormalizerFormatter implements FormatterInterface return $data; } + /** + * Return the JSON representation of a value + * + * @param mixed $data + * @param bool $ignoreErrors + * @return string + * @throws \RuntimeException if encoding fails and errors are not ignored + */ protected function toJson($data, $ignoreErrors = false) { // suppress json_encode errors since it's twitchy with some inputs if ($ignoreErrors) { - if (version_compare(PHP_VERSION, '5.4.0', '>=')) { - return @json_encode($data, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE); - } - - return @json_encode($data); + return @$this->jsonEncode($data); } - if (version_compare(PHP_VERSION, '5.4.0', '>=')) { - $json = json_encode($data, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE); - } else { - 
$json = json_encode($data); - } + $json = $this->jsonEncode($data); if ($json === false) { - $this->throwEncodeError(json_last_error(), $data); + $json = $this->handleJsonError(json_last_error(), $data); + } + + return $json; + } + + /** + * @param mixed $data + * @return string JSON encoded data or false on failure + */ + private function jsonEncode($data) + { + if (version_compare(PHP_VERSION, '5.4.0', '>=')) { + return json_encode($data, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE); + } + + return json_encode($data); + } + + /** + * Handle a json_encode failure. + * + * If the failure is due to invalid string encoding, try to clean the + * input and encode again. If the second encoding attempt fails, the + * initial error is not encoding related or the input can't be cleaned then + * raise a descriptive exception. + * + * @param int $code return code of json_last_error function + * @param mixed $data data that was meant to be encoded + * @return string JSON encoded data after error correction + * @throws \RuntimeException if failure can't be corrected + */ + private function handleJsonError($code, $data) + { + if ($code !== JSON_ERROR_UTF8) { + $this->throwEncodeError($code, $data); + } + + if (is_string($data)) { + $this->detectAndCleanUtf8($data); + + } elseif (is_array($data)) { + array_walk_recursive($data, array($this, 'detectAndCleanUtf8')); + + } else { + $this->throwEncodeError($code, $data); + } + + $json = $this->jsonEncode($data); + + if ($json === false) { + $json = $this->throwEncodeError(json_last_error(), $data); } return $json; @@ -190,4 +241,36 @@ class NormalizerFormatter implements FormatterInterface throw new \RuntimeException('JSON encoding failed: '.$msg.'. Encoding: '.var_export($data, true)); } + + /** + * Detect invalid UTF-8 string characters and convert to valid UTF-8. 
+ * + * Valid UTF-8 input will be left unmodified, but strings containing + * invalid UTF-8 codepoints will be reencoded as UTF-8 with an assumed + * original encoding of ISO-8859-15. This conversion may result in + * incorrect output if the actual encoding was not ISO-8859-15, but it + * will be clean UTF-8 output and will not rely on expensive and fragile + * detection algorithms. + * + * Function converts the input in place in the passed variable so that it + * can be used as a callback for array_walk_recursive. + * + * @param mixed &$data Input to check and convert if needed + * @access private + */ + public function detectAndCleanUtf8(&$data) + { + if (is_string($data) && !preg_match('//u', $data)) { + $data = preg_replace_callback( + '/[\x80-\xFF]+/', + function ($m) { return utf8_encode($m[0]); }, + $data + ); + $data = str_replace( + array('¤', '¦', '¨', '´', '¸', '¼', '½', '¾'), + array('€', 'Š', 'š', 'Ž', 'ž', 'Œ', 'œ', 'Ÿ'), + $data + ); + } + } } diff --git a/tests/Monolog/Formatter/LogstashFormatterTest.php b/tests/Monolog/Formatter/LogstashFormatterTest.php index de4a3c2c..4b21cca2 100644 --- a/tests/Monolog/Formatter/LogstashFormatterTest.php +++ b/tests/Monolog/Formatter/LogstashFormatterTest.php @@ -15,6 +15,12 @@ use Monolog\Logger; class LogstashFormatterTest extends \PHPUnit_Framework_TestCase { + public function tearDown() + { + \PHPUnit_Framework_Error_Warning::$enabled = true; + return parent::tearDown(); + } + /** * @covers Monolog\Formatter\LogstashFormatter::format */ @@ -286,4 +292,41 @@ class LogstashFormatterTest extends \PHPUnit_Framework_TestCase $this->assertArrayHasKey('type', $message); $this->assertEquals('app', $message['type']); } + + public function testFormatWithLatin9Data() + { + if (version_compare(PHP_VERSION, '5.5.0', '<')) { + // Ignore the warning that will be emitted by PHP <5.5.0 + \PHPUnit_Framework_Error_Warning::$enabled = false; + } + $formatter = new LogstashFormatter('test', 'hostname'); + $record = array( + 
'level' => Logger::ERROR, + 'level_name' => 'ERROR', + 'channel' => '¯\_(ツ)_/¯', + 'context' => array(), + 'datetime' => new \DateTime("@0"), + 'extra' => array( + 'user_agent' => "\xD6WN; FBCR/OrangeEspa\xF1a; Vers\xE3o/4.0; F\xE4rist", + ), + 'message' => 'log', + ); + + $message = json_decode($formatter->format($record), true); + + $this->assertEquals("1970-01-01T00:00:00.000000+00:00", $message['@timestamp']); + $this->assertEquals('log', $message['@message']); + $this->assertEquals('¯\_(ツ)_/¯', $message['@fields']['channel']); + $this->assertContains('¯\_(ツ)_/¯', $message['@tags']); + $this->assertEquals(Logger::ERROR, $message['@fields']['level']); + $this->assertEquals('test', $message['@type']); + $this->assertEquals('hostname', $message['@source']); + if (version_compare(PHP_VERSION, '5.5.0', '>=')) { + $this->assertEquals('ÖWN; FBCR/OrangeEspaña; Versão/4.0; Färist', $message['@fields']['user_agent']); + } else { + // PHP <5.5 does not return false for an element encoding failure, + // instead it emits a warning (possibly) and nulls the value. 
+ $this->assertEquals(null, $message['@fields']['user_agent']); + } + } } diff --git a/tests/Monolog/Formatter/NormalizerFormatterTest.php b/tests/Monolog/Formatter/NormalizerFormatterTest.php index 447476ad..aa7740ff 100644 --- a/tests/Monolog/Formatter/NormalizerFormatterTest.php +++ b/tests/Monolog/Formatter/NormalizerFormatterTest.php @@ -16,6 +16,12 @@ namespace Monolog\Formatter; */ class NormalizerFormatterTest extends \PHPUnit_Framework_TestCase { + public function tearDown() + { + \PHPUnit_Framework_Error_Warning::$enabled = true; + return parent::tearDown(); + } + public function testFormat() { $formatter = new NormalizerFormatter('Y-m-d'); @@ -188,17 +194,100 @@ class NormalizerFormatterTest extends \PHPUnit_Framework_TestCase */ public function testThrowsOnInvalidEncoding() { + if (version_compare(PHP_VERSION, '5.5.0', '<')) { + // Ignore the warning that will be emitted by PHP <5.5.0 + \PHPUnit_Framework_Error_Warning::$enabled = false; + } $formatter = new NormalizerFormatter(); $reflMethod = new \ReflectionMethod($formatter, 'toJson'); $reflMethod->setAccessible(true); - // send an invalid unicode sequence - $res = $reflMethod->invoke($formatter, array('message' => "\xB1\x31")); + // send an invalid unicode sequence as an object that can't be cleaned + $record = new \stdClass; + $record->message = "\xB1\x31"; + $res = $reflMethod->invoke($formatter, $record); if (PHP_VERSION_ID < 50500 && $res === '{"message":null}') { throw new \RuntimeException('PHP 5.3/5.4 throw a warning and null the value instead of returning false entirely'); } } + public function testConvertsInvalidEncodingAsLatin9() + { + if (version_compare(PHP_VERSION, '5.5.0', '<')) { + // Ignore the warning that will be emitted by PHP <5.5.0 + \PHPUnit_Framework_Error_Warning::$enabled = false; + } + $formatter = new NormalizerFormatter(); + $reflMethod = new \ReflectionMethod($formatter, 'toJson'); + $reflMethod->setAccessible(true); + + $res = $reflMethod->invoke($formatter, 
array('message' => "\xA4\xA6\xA8\xB4\xB8\xBC\xBD\xBE")); + + if (version_compare(PHP_VERSION, '5.5.0', '>=')) { + $this->assertSame('{"message":"€ŠšŽžŒœŸ"}', $res); + } else { + // PHP <5.5 does not return false for an element encoding failure, + // instead it emits a warning (possibly) and nulls the value. + $this->assertSame('{"message":null}', $res); + } + } + + /** + * @param mixed $in Input + * @param mixed $expect Expected output + * @covers Monolog\Formatter\NormalizerFormatter::detectAndCleanUtf8 + * @dataProvider providesDetectAndCleanUtf8 + */ + public function testDetectAndCleanUtf8($in, $expect) + { + $formatter = new NormalizerFormatter(); + $formatter->detectAndCleanUtf8($in); + $this->assertSame($expect, $in); + } + + public function providesDetectAndCleanUtf8() + { + $obj = new \stdClass; + return array( + 'null' => array(null, null), + 'int' => array(123, 123), + 'float' => array(123.45, 123.45), + 'bool false' => array(false, false), + 'bool true' => array(true, true), + 'ascii string' => array('abcdef', 'abcdef'), + 'latin9 string' => array("\xB1\x31\xA4\xA6\xA8\xB4\xB8\xBC\xBD\xBE\xFF", '±1€ŠšŽžŒœŸÿ'), + 'unicode string' => array('¤¦¨´¸¼½¾€ŠšŽžŒœŸ', '¤¦¨´¸¼½¾€ŠšŽžŒœŸ'), + 'empty array' => array(array(), array()), + 'array' => array(array('abcdef'), array('abcdef')), + 'object' => array($obj, $obj), + ); + } + + /** + * @param int $code + * @param string $msg + * @dataProvider providesHandleJsonErrorFailure + */ + public function testHandleJsonErrorFailure($code, $msg) + { + $formatter = new NormalizerFormatter(); + $reflMethod = new \ReflectionMethod($formatter, 'handleJsonError'); + $reflMethod->setAccessible(true); + + $this->setExpectedException('RuntimeException', $msg); + $reflMethod->invoke($formatter, $code, 'faked'); + } + + public function providesHandleJsonErrorFailure() + { + return array( + 'depth' => array(JSON_ERROR_DEPTH, 'Maximum stack depth exceeded'), + 'state' => array(JSON_ERROR_STATE_MISMATCH, 'Underflow or the modes 
mismatch'), + 'ctrl' => array(JSON_ERROR_CTRL_CHAR, 'Unexpected control character found'), + 'default' => array(-1, 'Unknown error'), + ); + } + public function testExceptionTraceWithArgs() { if (defined('HHVM_VERSION')) { @@ -284,4 +373,4 @@ class TestToStringError { throw new \RuntimeException('Could not convert to string'); } -} \ No newline at end of file +}