2019-10-13 07:59:52 +02:00
|
|
|
<?php
|
|
|
|
|
2021-05-09 20:15:43 +00:00
|
|
|
declare (strict_types=1);
|
2019-09-23 16:36:58 +02:00
|
|
|
namespace Rector\Php70;
|
2018-10-08 13:19:10 +08:00
|
|
|
|
2021-05-29 17:17:11 +00:00
|
|
|
use RectorPrefix20210529\Nette\Utils\Strings;
|
2019-09-23 17:58:49 +02:00
|
|
|
use Rector\Php70\Exception\InvalidEregException;
|
2018-10-08 13:19:10 +08:00
|
|
|
/**
 * Converts POSIX extended regular expressions (the syntax of the removed ereg*() functions)
 * into PCRE patterns usable with preg_*() functions.
 *
 * The pattern is walked byte by byte, so every offset in this class is a byte offset.
 *
 * @changelog https://gist.github.com/lifthrasiir/704754/7e486f43e62fd1c9d3669330c251f8ca4a59a3f8
 *
 * @see \Rector\Tests\Php70\EregToPcreTransformerTest
 */
final class EregToPcreTransformer
{
    /**
     * POSIX class name (as written between "[" and "]") => text emitted inside the PCRE character class.
     *
     * @var string[]
     */
    private const CHARACTER_CLASS_MAP = [
        ':alnum:' => '[:alnum:]',
        ':alpha:' => '[:alpha:]',
        ':blank:' => '[:blank:]',
        ':cntrl:' => '[:cntrl:]',
        ':digit:' => '\\d',
        ':graph:' => '[:graph:]',
        ':lower:' => '[:lower:]',
        ':print:' => '[:print:]',
        ':punct:' => '[:punct:]',
        // should include VT: \013 is the octal escape for the vertical tab
        ':space:' => '\\013\\s',
        ':upper:' => '[:upper:]',
        ':xdigit:' => '[:xdigit:]',
    ];

    /**
     * Matches the inside of an interval expression: "n", "n," or "n,m" with n and m in 0..255.
     * Whitespace inside the pattern is ignored because of the "x" modifier.
     *
     * @var string
     * @see https://regex101.com/r/htpXFg/1
     */
    private const BOUND_REGEX = '/^(?<' . self::MINIMAL_NUMBER_PART . '>\\d|[1-9]\\d|1\\d\\d|
                                2[0-4]\\d|25[0-5])
                               (?<comma>,(?<' . self::MAXIMAL_NUMBER_PART . '>\\d|[1-9]\\d|1\\d\\d|
                                2[0-4]\\d|25[0-5])?)?$/x';

    /**
     * @var string
     */
    private const MINIMAL_NUMBER_PART = 'minimal_number';

    /**
     * @var string
     */
    private const MAXIMAL_NUMBER_PART = 'maximal_number';

    /**
     * Converted patterns for case-insensitive requests, keyed by the source ERE.
     *
     * @var string[]
     */
    private $icache = [];

    /**
     * Converted patterns for case-sensitive requests, keyed by the source ERE.
     *
     * @var string[]
     */
    private $cache = [];

    /**
     * @var string
     */
    private $pcreDelimiter = '#';

    /**
     * Change this via services configuration in rector.php if you need it
     * Single type is chosen to prevent every regular with different delimiter.
     */
    public function __construct(string $pcreDelimiter = '#')
    {
        $this->pcreDelimiter = $pcreDelimiter;
    }

    /**
     * Converts the ERE $ereg into a delimited PCRE pattern with the "m" modifier
     * (plus "i" when $isCaseInsensitive is true).
     *
     * Occurrences of the delimiter inside the pattern are escaped during conversion,
     * so no separate quoting pass is needed.
     *
     * @throws \Rector\Php70\Exception\InvalidEregException when $ereg is not a valid/supported ERE
     */
    public function transform(string $ereg, bool $isCaseInsensitive): string
    {
        return $this->ere2pcre($ereg, $isCaseInsensitive);
    }

    /**
     * Converts the ERE $content into a PCRE pattern; results are memoised per case mode.
     *
     * @throws \Rector\Php70\Exception\InvalidEregException on any invalid input
     */
    private function ere2pcre(string $content, bool $ignorecase): string
    {
        if ($ignorecase) {
            if (isset($this->icache[$content])) {
                return $this->icache[$content];
            }
        } elseif (isset($this->cache[$content])) {
            return $this->cache[$content];
        }

        [$pattern, $offset] = $this->_ere2pcre($content, 0);

        // the recursive parser only stops early on a ")" it did not open
        if ($offset !== \strlen($content)) {
            throw new \Rector\Php70\Exception\InvalidEregException('unescaped metacharacter ")"');
        }

        $pcre = $this->pcreDelimiter . $pattern . $this->pcreDelimiter . 'm';

        if ($ignorecase) {
            return $this->icache[$content] = $pcre . 'i';
        }

        return $this->cache[$content] = $pcre;
    }

    /**
     * Recursively converts ERE into PCRE, starting at the position $i.
     *
     * Parsing stops at the end of the input or at an unconsumed ")".
     *
     * @return mixed[] [converted pattern, byte offset where parsing stopped]
     */
    private function _ere2pcre(string $content, int $i): array
    {
        // one entry per "|"-separated branch
        $branches = [''];
        $current = 0;
        $length = \strlen($content);

        while ($i < $length) {
            // atom
            $char = $content[$i];

            if ($char === '(') {
                $i = $this->processBracket($content, $i, $length, $branches, $current);
            } elseif ($char === '[') {
                ++$i;
                $cls = '';

                if ($i < $length && $content[$i] === '^') {
                    $cls .= '^';
                    ++$i;
                }

                if ($i >= $length) {
                    throw new \Rector\Php70\Exception\InvalidEregException('"[" does not have a matching "]"');
                }

                [$cls, $i] = $this->processSquareBracket($content, $i, $length, $cls, true);

                // on success $i points at the closing "]"
                if ($i >= $length) {
                    throw new \Rector\Php70\Exception\InvalidEregException('"[" does not have a matching "]"');
                }

                $branches[$current] .= '[' . $cls . ']';
            } elseif ($char === ')') {
                break;
            } elseif ($char === '*' || $char === '+' || $char === '?') {
                throw new \Rector\Php70\Exception\InvalidEregException('unescaped metacharacter "' . $char . '"');
            } elseif ($char === '{') {
                // strpos() instead of str_contains() keeps the file runnable on PHP 7
                if ($i + 1 < $length && \strpos('0123456789', $content[$i + 1]) !== false) {
                    $branches[$current] .= '\\{';
                } else {
                    throw new \Rector\Php70\Exception\InvalidEregException('unescaped metacharacter "' . $char . '"');
                }
            } elseif ($char === '.') {
                $branches[$current] .= $char;
            } elseif ($char === '^' || $char === '$') {
                // anchors cannot be followed by a repetition, so skip the "piece" step
                $branches[$current] .= $char;
                ++$i;
                continue;
            } elseif ($char === '|') {
                if ($branches[$current] === '') {
                    throw new \Rector\Php70\Exception\InvalidEregException('empty branch');
                }

                $branches[] = '';
                ++$current;
                ++$i;
                continue;
            } elseif ($char === '\\') {
                if (++$i >= $length) {
                    throw new \Rector\Php70\Exception\InvalidEregException('an invalid escape sequence at the end');
                }

                $branches[$current] .= $this->_ere2pcre_escape($content[$i]);
            } else {
                // including ] and } which are allowed as a literal character
                $branches[$current] .= $this->_ere2pcre_escape($char);
            }

            ++$i;
            if ($i >= $length) {
                break;
            }

            // piece after the atom (only ONE of them is possible)
            $char = $content[$i];
            if ($char === '*' || $char === '+' || $char === '?') {
                $branches[$current] .= $char;
                ++$i;
            } elseif ($char === '{') {
                $i = $this->processCurlyBracket($content, $i, $branches, $current);
            }
        }

        if ($branches[$current] === '') {
            throw new \Rector\Php70\Exception\InvalidEregException('empty regular expression or branch');
        }

        return [\implode('|', $branches), $i];
    }

    /**
     * Converts the group opening at offset $i and appends it to $r[$rr].
     *
     * @param mixed[] $r
     * @return int offset of the group's closing ")"
     */
    private function processBracket(string $content, int $i, int $l, array &$r, int $rr): int
    {
        // special case: an empty group "()"
        if ($i + 1 < $l && $content[$i + 1] === ')') {
            $r[$rr] .= '()';

            return $i + 1;
        }

        [$inner, $closing] = $this->_ere2pcre($content, $i + 1);

        if ($closing >= $l || $content[$closing] !== ')') {
            throw new \Rector\Php70\Exception\InvalidEregException('"(" does not have a matching ")"');
        }

        $r[$rr] .= '(' . $inner . ')';

        return $closing;
    }

    /**
     * Converts the members of a bracket expression, starting at offset $i (just after "[" or "[^").
     * The first member is always consumed, even if it is "]".
     *
     * @return mixed[] [character class body, offset of the closing "]" or $l when it is missing]
     */
    private function processSquareBracket(string $s, int $i, int $l, string $cls, bool $start): array
    {
        do {
            if ($s[$i] === '[' && $i + 1 < $l && \strpos('.=:', $s[$i + 1]) !== false) {
                [$cls, $i] = $this->processCharacterClass($s, $i, $cls);
            } else {
                $a = $s[$i];
                ++$i;

                if ($a === '-' && !$start && !($i < $l && $s[$i] === ']')) {
                    throw new \Rector\Php70\Exception\InvalidEregException('"-" is invalid for the start character in the brackets');
                }

                if ($i < $l && $s[$i] === '-') {
                    ++$i;
                    if ($i >= $l) {
                        // pattern ends right after "a-": nothing left to read
                        throw new \Rector\Php70\Exception\InvalidEregException('"[" does not have a matching "]"');
                    }

                    $b = $s[$i];
                    if ($b === ']') {
                        // "a-]": the dash is a literal; keep $i on "]" because the caller consumes it
                        $cls .= $this->_ere2pcre_escape($a) . '\\-';
                        break;
                    }

                    ++$i;
                    if (\ord($a) > \ord($b)) {
                        throw new \Rector\Php70\Exception\InvalidEregException(\sprintf('an invalid character range "%s-%s"', $a, $b));
                    }

                    $cls .= $this->_ere2pcre_escape($a) . '-' . $this->_ere2pcre_escape($b);
                } else {
                    $cls .= $this->_ere2pcre_escape($a);
                }
            }

            $start = false;
        } while ($i < $l && $s[$i] !== ']');

        return [$cls, $i];
    }

    /**
     * Escapes a single byte so it is a literal in the generated PCRE pattern.
     * The configured delimiter is escaped as well, otherwise it would terminate the pattern.
     */
    private function _ere2pcre_escape(string $content): string
    {
        if ($content === "\0") {
            throw new \Rector\Php70\Exception\InvalidEregException('a literal null byte in the regex');
        }

        if ($content === $this->pcreDelimiter || \strpos('\\^$.[]|()?*+{}-/', $content) !== false) {
            return '\\' . $content;
        }

        return $content;
    }

    /**
     * Validates the interval expression opening at offset $i ("{n}", "{n,}" or "{n,m}")
     * and appends it to $r[$rr].
     *
     * @param mixed[] $r
     * @return int offset just after the closing "}"
     */
    private function processCurlyBracket(string $s, int $i, array &$r, int $rr): int
    {
        $closing = \strpos($s, '}', $i);
        if ($closing === false) {
            throw new \Rector\Php70\Exception\InvalidEregException('"{" does not have a matching "}"');
        }

        // byte-wise slice: $i and $closing are byte offsets, so a character-based substring would misalign
        $bound = \substr($s, $i + 1, $closing - ($i + 1));

        if (\preg_match(self::BOUND_REGEX, $bound, $matches) !== 1) {
            throw new \Rector\Php70\Exception\InvalidEregException('an invalid bound');
        }

        $minimum = $matches[self::MINIMAL_NUMBER_PART];

        // preg_match() omits trailing groups that did not participate, hence isset()
        if (isset($matches[self::MAXIMAL_NUMBER_PART])) {
            $maximum = $matches[self::MAXIMAL_NUMBER_PART];
            if ((int) $minimum > (int) $maximum) {
                throw new \Rector\Php70\Exception\InvalidEregException('an invalid bound');
            }

            $r[$rr] .= '{' . $minimum . ',' . $maximum . '}';
        } elseif (isset($matches['comma'])) {
            $r[$rr] .= '{' . $minimum . ',}';
        } else {
            $r[$rr] .= '{' . $minimum . '}';
        }

        return $closing + 1;
    }

    /**
     * Converts the "[:name:]" style element opening at offset $i and appends it to $cls.
     * Only names listed in CHARACTER_CLASS_MAP are supported.
     *
     * @return int[]|string[] [extended character class body, offset just after the element's "]"]
     */
    private function processCharacterClass(string $content, int $i, string $cls): array
    {
        $closing = \strpos($content, ']', $i);
        if ($closing === false) {
            throw new \Rector\Php70\Exception\InvalidEregException('"[" does not have a matching "]"');
        }

        // byte-wise slice, consistent with the byte offsets used everywhere else
        $name = \substr($content, $i + 1, $closing - ($i + 1));

        if (!isset(self::CHARACTER_CLASS_MAP[$name])) {
            throw new \Rector\Php70\Exception\InvalidEregException('an invalid or unsupported character class [' . $name . ']');
        }

        return [$cls . self::CHARACTER_CLASS_MAP[$name], $closing + 1];
    }
}
|