mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-08-07 09:06:36 +02:00
fix: rewrite and improve caching (#3594)
This commit is contained in:
276
lib/contents.php
276
lib/contents.php
@@ -1,101 +1,11 @@
|
||||
<?php
|
||||
|
||||
/**
 * Simple HTTP response value: a body, a status code and a map of headers.
 *
 * Instances are populated once through the constructor and expose their state
 * through getters; send() writes the response to the client.
 */
final class Response
{
    /**
     * Reason phrases indexed by HTTP status code.
     *
     * The keys are written as numeric strings; PHP stores such array keys as
     * integers, so both STATUS_CODES[404] and STATUS_CODES['404'] resolve.
     */
    public const STATUS_CODES = [
        '100' => 'Continue',
        '101' => 'Switching Protocols',
        '200' => 'OK',
        '201' => 'Created',
        '202' => 'Accepted',
        '203' => 'Non-Authoritative Information',
        '204' => 'No Content',
        '205' => 'Reset Content',
        '206' => 'Partial Content',
        '300' => 'Multiple Choices',
        '301' => 'Moved Permanently',
        '302' => 'Found',
        '303' => 'See Other',
        '304' => 'Not Modified',
        '305' => 'Use Proxy',
        '400' => 'Bad Request',
        '401' => 'Unauthorized',
        '402' => 'Payment Required',
        '403' => 'Forbidden',
        '404' => 'Not Found',
        '405' => 'Method Not Allowed',
        '406' => 'Not Acceptable',
        '407' => 'Proxy Authentication Required',
        '408' => 'Request Timeout',
        '409' => 'Conflict',
        '410' => 'Gone',
        '411' => 'Length Required',
        '412' => 'Precondition Failed',
        '413' => 'Request Entity Too Large',
        '414' => 'Request-URI Too Long',
        '415' => 'Unsupported Media Type',
        '416' => 'Requested Range Not Satisfiable',
        '417' => 'Expectation Failed',
        '429' => 'Too Many Requests',
        '500' => 'Internal Server Error',
        '501' => 'Not Implemented',
        '502' => 'Bad Gateway',
        '503' => 'Service Unavailable',
        '504' => 'Gateway Timeout',
        '505' => 'HTTP Version Not Supported'
    ];

    private string $body;
    private int $code;
    private array $headers;

    /**
     * @param string $body    Response payload, printed verbatim by send().
     * @param int    $code    HTTP status code passed to http_response_code().
     * @param array  $headers Map of header name => header value.
     */
    public function __construct(
        string $body = '',
        int $code = 200,
        array $headers = []
    ) {
        $this->body = $body;
        $this->code = $code;
        $this->headers = $headers;
    }

    /**
     * @return string The response payload.
     */
    public function getBody(): string
    {
        return $this->body;
    }

    /**
     * @return int The HTTP status code.
     */
    public function getCode(): int
    {
        return $this->code;
    }

    /**
     * @return array Map of header name => header value, as given to the constructor.
     */
    public function getHeaders(): array
    {
        return $this->headers;
    }

    /**
     * Write the response out: status code first, then one "Name: value"
     * header line per entry, then the body.
     */
    public function send(): void
    {
        http_response_code($this->code);
        foreach ($this->headers as $name => $value) {
            header(sprintf('%s: %s', $name, $value));
        }
        print $this->body;
    }
}
|
||||
|
||||
/**
|
||||
* Fetch data from an http url
|
||||
*
|
||||
* @param array $httpHeaders E.g. ['Content-type: text/plain']
|
||||
* @param array $curlOptions Associative array e.g. [CURLOPT_MAXREDIRS => 3]
|
||||
* @param bool $returnFull Whether to return an array:
|
||||
* [
|
||||
* 'code' => int,
|
||||
* 'header' => array,
|
||||
* 'content' => string,
|
||||
* 'status_lines' => array,
|
||||
* ]
|
||||
|
||||
* @param bool $returnFull Whether to return an array: ['code' => int, 'headers' => array, 'content' => string]
|
||||
* @return string|array
|
||||
*/
|
||||
function getContents(
|
||||
@@ -142,30 +52,35 @@ function getContents(
|
||||
}
|
||||
|
||||
$cache = RssBridge::getCache();
|
||||
$cache->setScope('server');
|
||||
$cache->setKey([$url]);
|
||||
$cacheKey = 'server_' . $url;
|
||||
|
||||
if (!Debug::isEnabled() && $cache->getTime() && $cache->loadData(86400 * 7)) {
|
||||
$config['if_not_modified_since'] = $cache->getTime();
|
||||
/** @var Response $cachedResponse */
|
||||
$cachedResponse = $cache->get($cacheKey);
|
||||
if ($cachedResponse) {
|
||||
// considering popping
|
||||
$cachedLastModified = $cachedResponse->getHeader('last-modified');
|
||||
if ($cachedLastModified) {
|
||||
$cachedLastModified = new \DateTimeImmutable($cachedLastModified);
|
||||
$config['if_not_modified_since'] = $cachedLastModified->getTimestamp();
|
||||
}
|
||||
}
|
||||
|
||||
$response = $httpClient->request($url, $config);
|
||||
|
||||
switch ($response['code']) {
|
||||
switch ($response->getCode()) {
|
||||
case 200:
|
||||
case 201:
|
||||
case 202:
|
||||
if (isset($response['headers']['cache-control'])) {
|
||||
$cachecontrol = $response['headers']['cache-control'];
|
||||
$lastValue = array_pop($cachecontrol);
|
||||
$directives = explode(',', $lastValue);
|
||||
$cacheControl = $response->getHeader('cache-control');
|
||||
if ($cacheControl) {
|
||||
$directives = explode(',', $cacheControl);
|
||||
$directives = array_map('trim', $directives);
|
||||
if (in_array('no-cache', $directives) || in_array('no-store', $directives)) {
|
||||
// Don't cache as instructed by the server
|
||||
break;
|
||||
}
|
||||
}
|
||||
$cache->saveData($response['body']);
|
||||
$cache->set($cacheKey, $response, 86400 * 10);
|
||||
break;
|
||||
case 301:
|
||||
case 302:
|
||||
@@ -174,16 +89,16 @@ function getContents(
|
||||
break;
|
||||
case 304:
|
||||
// Not Modified
|
||||
$response['body'] = $cache->loadData(86400 * 7);
|
||||
$response = $response->withBody($cachedResponse->getBody());
|
||||
break;
|
||||
default:
|
||||
$exceptionMessage = sprintf(
|
||||
'%s resulted in %s %s %s',
|
||||
$url,
|
||||
$response['code'],
|
||||
Response::STATUS_CODES[$response['code']] ?? '',
|
||||
$response->getCode(),
|
||||
$response->getStatusLine(),
|
||||
// If debug, include a part of the response body in the exception message
|
||||
Debug::isEnabled() ? mb_substr($response['body'], 0, 500) : '',
|
||||
Debug::isEnabled() ? mb_substr($response->getBody(), 0, 500) : '',
|
||||
);
|
||||
|
||||
// The following code must be extracted if it grows too much
|
||||
@@ -194,141 +109,21 @@ function getContents(
|
||||
'<title>Security | Glassdoor',
|
||||
];
|
||||
foreach ($cloudflareTitles as $cloudflareTitle) {
|
||||
if (str_contains($response['body'], $cloudflareTitle)) {
|
||||
throw new CloudFlareException($exceptionMessage, $response['code']);
|
||||
if (str_contains($response->getBody(), $cloudflareTitle)) {
|
||||
throw new CloudFlareException($exceptionMessage, $response->getCode());
|
||||
}
|
||||
}
|
||||
throw new HttpException(trim($exceptionMessage), $response['code']);
|
||||
throw new HttpException(trim($exceptionMessage), $response->getCode());
|
||||
}
|
||||
if ($returnFull === true) {
|
||||
// For legacy reasons, use content instead of body
|
||||
$response['content'] = $response['body'];
|
||||
unset($response['body']);
|
||||
return $response;
|
||||
}
|
||||
return $response['body'];
|
||||
}
|
||||
|
||||
/**
 * Contract for the HTTP transport used to fetch remote content.
 */
interface HttpClient
{
    /**
     * Perform an HTTP request against the given URL.
     *
     * @param string $url    The URL to request.
     * @param array  $config Request options; the recognised keys are defined
     *                       by the implementation.
     * @return array The response data as an array; its keys are defined by
     *               the implementation.
     */
    public function request(string $url, array $config = []): array;
}
|
||||
|
||||
/**
 * HttpClient implementation backed by the cURL extension.
 */
final class CurlHttpClient implements HttpClient
{
    /**
     * Perform a request with cURL, following redirects and retrying when the
     * transfer itself fails.
     *
     * Recognised $config keys (defaults in parentheses):
     * - useragent (null): value for CURLOPT_USERAGENT, skipped when falsy
     * - timeout (5): value for CURLOPT_TIMEOUT
     * - headers ([]): map of request header name => value
     * - proxy (null): value for CURLOPT_PROXY, skipped when falsy
     * - curl_options ([]): raw CURLOPT_* => value pairs given to curl_setopt_array()
     * - if_not_modified_since (null): timestamp for a conditional (If-Modified-Since) request
     * - retries (3): number of additional attempts after the first failed transfer
     * - max_filesize (null): abort once more than this many bytes were downloaded
     * - max_redirections (5): value for CURLOPT_MAXREDIRS
     *
     * @param string $url    The URL to request (http and https only).
     * @param array  $config See the list above.
     * @return array With keys 'code' (int), 'status_lines' (list of raw status lines),
     *               'headers' (lower-cased name => list of values) and 'body' (string).
     * @throws \Exception    When cURL rejects an entry of 'curl_options'.
     * @throws HttpException When every attempt failed at the transport level.
     */
    public function request(string $url, array $config = []): array
    {
        $defaults = [
            'useragent' => null,
            'timeout' => 5,
            'headers' => [],
            'proxy' => null,
            'curl_options' => [],
            'if_not_modified_since' => null,
            'retries' => 3,
            'max_filesize' => null,
            'max_redirections' => 5,
        ];
        $config = array_merge($defaults, $config);

        $ch = curl_init($url);
        // try/finally guarantees the handle is released on the exception paths too
        try {
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_MAXREDIRS, $config['max_redirections']);
            curl_setopt($ch, CURLOPT_HEADER, false);
            $httpHeaders = [];
            foreach ($config['headers'] as $name => $value) {
                $httpHeaders[] = sprintf('%s: %s', $name, $value);
            }
            curl_setopt($ch, CURLOPT_HTTPHEADER, $httpHeaders);
            if ($config['useragent']) {
                curl_setopt($ch, CURLOPT_USERAGENT, $config['useragent']);
            }
            curl_setopt($ch, CURLOPT_TIMEOUT, $config['timeout']);
            // Empty string: let cURL advertise every encoding it supports
            curl_setopt($ch, CURLOPT_ENCODING, '');
            curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);

            if ($config['max_filesize']) {
                // This option inspects the Content-Length header
                curl_setopt($ch, CURLOPT_MAXFILESIZE, $config['max_filesize']);
                curl_setopt($ch, CURLOPT_NOPROGRESS, false);
                // This progress function will monitor responses who omit the Content-Length header
                curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, static function ($ch, $downloadSize, $downloaded, $uploadSize, $uploaded) use ($config) {
                    if ($downloaded > $config['max_filesize']) {
                        // Return a non-zero value to abort the transfer
                        return -1;
                    }
                    return 0;
                });
            }

            if ($config['proxy']) {
                curl_setopt($ch, CURLOPT_PROXY, $config['proxy']);
            }
            if (curl_setopt_array($ch, $config['curl_options']) === false) {
                throw new \Exception('Tried to set an illegal curl option');
            }

            if ($config['if_not_modified_since']) {
                curl_setopt($ch, CURLOPT_TIMEVALUE, $config['if_not_modified_since']);
                curl_setopt($ch, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE);
            }

            $responseStatusLines = [];
            $responseHeaders = [];
            // The callback receives one raw header line per call and must
            // return the number of bytes it consumed.
            curl_setopt($ch, CURLOPT_HEADERFUNCTION, static function ($ch, $rawHeader) use (&$responseHeaders, &$responseStatusLines) {
                $len = strlen($rawHeader);
                if ($rawHeader === "\r\n") {
                    return $len;
                }
                if (preg_match('#^HTTP/(2|1\.1|1\.0)#', $rawHeader)) {
                    $responseStatusLines[] = $rawHeader;
                    return $len;
                }
                // Split on the first colon only; the value may itself contain colons
                $header = explode(':', $rawHeader, 2);
                if (count($header) === 1) {
                    return $len;
                }
                $name = mb_strtolower(trim($header[0]));
                $value = trim($header[1]);
                // Repeated headers accumulate as a list under the lower-cased name
                $responseHeaders[$name][] = $value;
                return $len;
            });

            $attempts = 0;
            while (true) {
                $attempts++;
                $data = curl_exec($ch);
                if ($data !== false) {
                    // The network call was successful, so break out of the loop
                    break;
                }
                if ($attempts > $config['retries']) {
                    // Finally give up
                    $curl_error = curl_error($ch);
                    $curl_errno = curl_errno($ch);
                    throw new HttpException(sprintf(
                        'cURL error %s: %s (%s) for %s',
                        $curl_error,
                        $curl_errno,
                        'https://curl.haxx.se/libcurl/c/libcurl-errors.html',
                        $url
                    ));
                }
            }

            $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

            return [
                'code' => $statusCode,
                'status_lines' => $responseStatusLines,
                'headers' => $responseHeaders,
                'body' => $data,
            ];
        } finally {
            curl_close($ch);
        }
    }
}
|
||||
|
||||
/**
|
||||
@@ -391,7 +186,7 @@ function getSimpleHTMLDOM(
|
||||
* _Notice_: Cached contents are forcefully removed after 24 hours (86400 seconds).
|
||||
*
|
||||
* @param string $url The URL.
|
||||
* @param int $timeout Cache duration in seconds.
|
||||
* @param int $ttl Cache duration in seconds.
|
||||
* @param array $header (optional) A list of cURL headers.
|
||||
* For more information follow the links below.
|
||||
* * https://php.net/manual/en/function.curl-setopt.php
|
||||
@@ -416,7 +211,7 @@ function getSimpleHTMLDOM(
|
||||
*/
|
||||
function getSimpleHTMLDOMCached(
|
||||
$url,
|
||||
$timeout = 86400,
|
||||
$ttl = 86400,
|
||||
$header = [],
|
||||
$opts = [],
|
||||
$lowercase = true,
|
||||
@@ -427,14 +222,11 @@ function getSimpleHTMLDOMCached(
|
||||
$defaultSpanText = DEFAULT_SPAN_TEXT
|
||||
) {
|
||||
$cache = RssBridge::getCache();
|
||||
$cache->setScope('pages');
|
||||
$cache->setKey([$url]);
|
||||
$content = $cache->loadData($timeout);
|
||||
if (!$content || Debug::isEnabled()) {
|
||||
$cacheKey = 'pages_' . $url;
|
||||
$content = $cache->get($cacheKey);
|
||||
if (!$content) {
|
||||
$content = getContents($url, $header ?? [], $opts ?? []);
|
||||
$cache->setScope('pages');
|
||||
$cache->setKey([$url]);
|
||||
$cache->saveData($content);
|
||||
$cache->set($cacheKey, $content, $ttl);
|
||||
}
|
||||
return str_get_html(
|
||||
$content,
|
||||
|
Reference in New Issue
Block a user