rector/rules/php-70/src/EregToPcreTransformer.php

271 lines
8.9 KiB
PHP
Raw Normal View History

2019-10-13 07:59:52 +02:00
<?php
declare(strict_types=1);
2018-10-08 13:19:10 +08:00
2019-09-23 16:36:58 +02:00
namespace Rector\Php70;
2018-10-08 13:19:10 +08:00
use Nette\Utils\Strings;
use Rector\Php70\Exception\InvalidEregException;
2018-10-08 13:19:10 +08:00
/**
* @author Kang Seonghoon <public+ere2pcre@mearie.org>
* @source https://gist.github.com/lifthrasiir/704754/7e486f43e62fd1c9d3669330c251f8ca4a59a3f8
*
* @see \Rector\Php70\Tests\EregToPcreTransformerTest
2018-10-08 13:19:10 +08:00
*/
final class EregToPcreTransformer
{
2020-02-18 23:09:25 +01:00
/**
 * Maps the POSIX character-class names that may appear inside an ERE bracket
 * expression (the text between the inner "[" and "]", e.g. ":alpha:") to the
 * fragment that is appended to the PCRE character class being built.
 * Names that are not listed here are rejected as invalid or unsupported.
 *
 * @var string[]
 */
private const CCLSMAP = [
':alnum:' => '[:alnum:]',
':alpha:' => '[:alpha:]',
':blank:' => '[:blank:]',
':cntrl:' => '[:cntrl:]',
':digit:' => '\d',
':graph:' => '[:graph:]',
':lower:' => '[:lower:]',
':print:' => '[:print:]',
':punct:' => '[:punct:]',
':space:' => '\013\s', // \013 (octal) is the vertical tab; listed explicitly because the class should include VT
':upper:' => '[:upper:]',
':xdigit:' => '[:xdigit:]',
];
2018-10-08 13:19:10 +08:00
/**
 * Delimiter that transform() looks for in the incoming expression to decide
 * whether the quoting fallback is needed.
 *
 * @var string
 */
private $pcreDelimiter;
/**
 * Change this via services configuration in rector.yaml if you need it.
 * A single delimiter is chosen so that the converted regular expressions
 * do not each end up with a different one.
 */
public function __construct(string $pcreDelimiter = '#')
{
$this->pcreDelimiter = $pcreDelimiter;
}
/**
 * Converts an ereg pattern into a delimited PCRE pattern.
 *
 * When the pattern contains the configured delimiter, it is passed through
 * preg_quote() first (fallback); otherwise it is converted as it is.
 */
public function transform(string $ereg, bool $isCaseInsensitive): string
{
    $containsDelimiter = Strings::contains($ereg, $this->pcreDelimiter);

    // fallback: quote the expression when it collides with the delimiter
    $source = $containsDelimiter ? preg_quote($ereg, '#') : $ereg;

    return $this->ere2pcre($source, $isCaseInsensitive);
}
/**
 * Converts the ERE $s into a PCRE wrapped in "#" delimiters, with the "m"
 * modifier (plus "i" when $ignorecase is set).
 *
 * Results are memoised per input string, separately for the case-sensitive
 * and the case-insensitive variant.
 *
 * @throws InvalidEregException on any invalid input
 */
public function ere2pcre(string $s, bool $ignorecase): string
{
    static $cache = [], $icache = [];

    $known = $ignorecase ? ($icache[$s] ?? null) : ($cache[$s] ?? null);
    if ($known !== null) {
        return $known;
    }

    [$pattern, $end] = $this->_ere2pcre($s, 0);

    // the top-level parse only stops early on a ")" that nothing opened
    if ($end !== strlen($s)) {
        throw new InvalidEregException('unescaped metacharacter ")"');
    }

    if (! $ignorecase) {
        $cache[$s] = '#' . $pattern . '#m';

        return $cache[$s];
    }

    $icache[$s] = '#' . $pattern . '#mi';

    return $icache[$s];
}
/**
 * Recursively converts ERE into PCRE, starting at the position $i.
 *
 * Parsing stops at the end of the string or at a ")" (which is left for the
 * caller to consume). Alternatives separated by "|" are collected as
 * separate branches and joined again on return.
 *
 * @return mixed[] the converted fragment and the position where parsing stopped
 */
private function _ere2pcre(string $s, int $i): array
{
    $branches = [''];
    $current = 0;
    $length = strlen($s);

    while ($i < $length) {
        // atom
        $char = $s[$i];

        switch ($char) {
            case '(':
                if ($i + 1 < $length && $s[$i + 1] === ')') { // special case
                    $branches[$current] .= '()';
                    ++$i;
                    break;
                }

                [$group, $closing] = $this->_ere2pcre($s, $i + 1);
                if ($closing >= $length || $s[$closing] !== ')') {
                    throw new InvalidEregException('"(" does not have a matching ")"');
                }

                $branches[$current] .= '(' . $group . ')';
                $i = $closing;
                break;

            case '[':
                ++$i;
                $cls = '';
                if ($i < $length && $s[$i] === '^') {
                    $cls = '^';
                    ++$i;
                }

                if ($i >= $length) {
                    throw new InvalidEregException('"[" does not have a matching "]"');
                }

                $atClassStart = true;
                [$cls, $i] = $this->processSquareBracket($s, $i, $length, $cls, $atClassStart);
                if ($i >= $length) {
                    throw new InvalidEregException('"[" does not have a matching "]"');
                }

                $branches[$current] .= '[' . $cls . ']';
                break;

            case ')':
                // end of the group that the caller opened
                break 2;

            case '*':
            case '+':
            case '?':
                throw new InvalidEregException('unescaped metacharacter "' . $char . '"');

            case '{':
                if (! ($i + 1 < $length && Strings::contains('0123456789', $s[$i + 1]))) {
                    throw new InvalidEregException('unescaped metacharacter "' . $char . '"');
                }

                $branches[$current] .= '\{';
                break;

            case '.':
                $branches[$current] .= $char;
                break;

            case '^':
            case '$':
                // anchors take no piece, so go straight to the next atom
                $branches[$current] .= $char;
                ++$i;
                continue 2;

            case '|':
                if ($branches[$current] === '') {
                    throw new InvalidEregException('empty branch');
                }

                $branches[] = '';
                ++$current;
                ++$i;
                continue 2;

            case '\\':
                ++$i;
                if ($i >= $length) {
                    throw new InvalidEregException('an invalid escape sequence at the end');
                }

                $branches[$current] .= $this->_ere2pcre_escape($s[$i]);
                break;

            default: // including ] and } which are allowed as a literal character
                $branches[$current] .= $this->_ere2pcre_escape($char);
        }

        ++$i;
        if ($i >= $length) {
            break;
        }

        // piece after the atom (only ONE of them is possible)
        $piece = $s[$i];
        if (in_array($piece, ['*', '+', '?'], true)) {
            $branches[$current] .= $piece;
            ++$i;
        } elseif ($piece === '{') {
            [, $i] = $this->processCurlyBracket($s, $i, $branches, $current);
        }
    }

    if ($branches[$current] === '') {
        throw new InvalidEregException('empty regular expression or branch');
    }

    return [implode('|', $branches), $i];
}
2018-10-31 16:34:37 +01:00
/**
 * Escapes a single literal character for use in the generated PCRE.
 *
 * @param string $c one character taken from the ERE
 * @return string the character, prefixed with a backslash when it is special in PCRE
 *
 * @throws InvalidEregException when $c is a NUL byte
 */
private function _ere2pcre_escape(string $c): string
{
    if ($c === "\0") {
        throw new InvalidEregException('a literal null byte in the regex');
    }

    // "#" is in the list because ere2pcre() wraps the result in "#" delimiters:
    // an unescaped "#" inside the pattern would end it prematurely.
    if (Strings::contains('\^$.[]|()?*+{}-/#', $c)) {
        return '\\' . $c;
    }

    return $c;
}
/**
 * Converts the inside of a bracket expression, starting at position $i
 * (just after "[" and an optional "^").
 *
 * On return $i points at the closing "]", or is >= $l when the bracket
 * expression is not terminated. The values are also written back through
 * the by-reference parameters.
 *
 * @param string $s     the whole ERE
 * @param int    $i     current byte offset in $s
 * @param int    $l     byte length of $s
 * @param string $cls   PCRE character class built so far
 * @param bool   $start true while the first item of the class is being read
 * @return mixed[] the PCRE character class content and the new position
 *
 * @throws InvalidEregException on an unterminated bracket expression, an unsupported
 *                              character class, a misplaced "-" or a reversed range
 */
private function processSquareBracket(string &$s, int &$i, int &$l, string &$cls, bool &$start): array
{
    do {
        if ($s[$i] === '[' && $i + 1 < $l && Strings::contains('.=:', $s[$i + 1])) {
            // "[:name:]", "[.x.]" or "[=x=]": everything up to the next "]" is the class name
            $closing = strpos($s, ']', $i);
            if ($closing === false) {
                throw new InvalidEregException('"[" does not have a matching "]"');
            }

            // $i and $closing are byte offsets, so the byte-based substr() must be used;
            // a character-based substring goes wrong after any multibyte character
            $ccls = substr($s, $i + 1, $closing - ($i + 1));
            if (! isset(self::CCLSMAP[$ccls])) {
                throw new InvalidEregException('an invalid or unsupported character class [' . $ccls . ']');
            }

            $cls .= self::CCLSMAP[$ccls];
            $i = $closing + 1;
        } else {
            $a = $s[$i++];
            if ($a === '-' && ! $start && ! ($i < $l && $s[$i] === ']')) {
                throw new InvalidEregException('"-" is invalid for the start character in the brackets');
            }

            if ($i < $l && $s[$i] === '-') {
                ++$i;
                if ($i >= $l) {
                    // input ends right after the "-", e.g. "[a-"
                    throw new InvalidEregException('"[" does not have a matching "]"');
                }

                $b = $s[$i++];
                if ($b === ']') {
                    // trailing "-" is a literal; step back so that $i stays on the
                    // closing "]", which the caller expects to find there
                    $cls .= $this->_ere2pcre_escape($a) . '\-';
                    --$i;
                    break;
                }

                if (ord($a) > ord($b)) {
                    throw new InvalidEregException(sprintf('an invalid character range %s-%s', $a, $b));
                }

                $cls .= $this->_ere2pcre_escape($a) . '-' . $this->_ere2pcre_escape($b);
            } else {
                $cls .= $this->_ere2pcre_escape($a);
            }
        }

        $start = false;
    } while ($i < $l && $s[$i] !== ']');

    return [$cls, $i];
}
/**
 * Converts a bound ("{n}", "{n,}" or "{n,m}") that starts at position $i and
 * appends it to the current branch $r[$rr].
 *
 * @param string   $s  the whole ERE
 * @param int      $i  byte offset of the opening "{"
 * @param string[] $r  branches built so far (modified in place)
 * @param int      $rr index of the current branch in $r
 * @return int[] the offset just after "{" and the offset just after the closing "}"
 *
 * @throws InvalidEregException when "}" is missing or the bound is not valid
 */
private function processCurlyBracket(string $s, int $i, array &$r, int $rr): array
{
    $closing = strpos($s, '}', $i);
    if ($closing === false) {
        throw new InvalidEregException('"{" does not have a matching "}"');
    }

    $start = $i + 1;

    // $i and $closing are byte offsets, so the byte-based substr() must be used;
    // a character-based substring goes wrong after any multibyte character
    $bound = substr($s, $start, $closing - $start);

    // each number may be 0-255; the "x" modifier makes the line breaks in the pattern insignificant
    $m = Strings::match($bound, '/^(\d|[1-9]\d|1\d\d|
2[0-4]\d|25[0-5])
(,(\d|[1-9]\d|1\d\d|
2[0-4]\d|25[0-5])?)?$/x');
    if (! $m) {
        throw new InvalidEregException('an invalid bound');
    }

    if (isset($m[3])) {
        if ((int) $m[1] > (int) $m[3]) {
            throw new InvalidEregException('an invalid bound');
        }

        $r[$rr] .= '{' . $m[1] . ',' . $m[3] . '}';
    } elseif (isset($m[2])) {
        $r[$rr] .= '{' . $m[1] . ',}';
    } else {
        $r[$rr] .= '{' . $m[1] . '}';
    }

    return [$start, $closing + 1];
}
2018-10-08 13:19:10 +08:00
}