From b7c04f8587dc35a3fcf2951a63e96cbbe0ab36cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dawid=20Wr=C3=B3bel?= Date: Sat, 17 May 2025 20:18:36 +0200 Subject: [PATCH] Overhaul the usage of libcurl-impersonate (#4535) libcurl-impersonate was not being used properly, as the code was overriding the headers set by it to prevent detection. - update the libcurl-impersonate to an actively managed lexiforest fork - impersonate Chrome 131 - move the defaultHttpHeaders to http.php, where it belongs - only set defaultHttpHeaders if curl-impersonate is not detected - make useragent ini setting optional and disabled by default - add necessary documentation updates --- Dockerfile | 21 ++++++++------ config.default.ini.php | 5 +++- docs/03_For_Hosts/08_Custom_Configuration.md | 4 ++- docs/09_Technical_recommendations/index.md | 2 +- lib/contents.php | 14 +--------- lib/http.php | 29 ++++++++++++++++---- 6 files changed, 45 insertions(+), 30 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1326dba0..fb783344 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,36 +25,39 @@ RUN set -xe && \ # php-zlib is enabled by default with PHP 8.2 in Debian 12 # for downloading libcurl-impersonate curl \ + # for patching libcurl-impersonate + patchelf \ && \ # install curl-impersonate library - curlimpersonate_version=0.6.0 && \ + curlimpersonate_version=1.0.0rc2 && \ { \ { \ [ $(arch) = 'aarch64' ] && \ archive="libcurl-impersonate-v${curlimpersonate_version}.aarch64-linux-gnu.tar.gz" && \ - sha512sum="d04b1eabe71f3af06aa1ce99b39a49c5e1d33b636acedcd9fad163bc58156af5c3eb3f75aa706f335515791f7b9c7a6c40ffdfa47430796483ecef929abd905d" \ + sha512sum="c8add80e7a0430a074edea1a11f73d03044c48e848e164af2d6f362866623e29bede207a50f18f95b1bc5ab3d33f5c31408be60a6da66b74a0d176eebe299116" \ ; } \ || { \ [ $(arch) = 'armv7l' ] && \ archive="libcurl-impersonate-v${curlimpersonate_version}.arm-linux-gnueabihf.tar.gz" && \ - 
sha512sum="05906b4efa1a6ed8f3b716fd83d476b6eea6bfc68e3dbc5212d65a2962dcaa7bd1f938c9096a7535252b11d1d08fb93adccc633585ff8cb8cec5e58bfe969bc9" \ + sha512sum="d0403ca4ad55a8d499b120e5675c7b5a0dc4946af49c933e91fc24455ffe5e122aa21ee95554612ff5d1bd6faea1556e1e1b9c821918e2644cc9bcbddc05747a" \ ; } \ || { \ [ $(arch) = 'x86_64' ] && \ archive="libcurl-impersonate-v${curlimpersonate_version}.x86_64-linux-gnu.tar.gz" && \ - sha512sum="480bbe9452cd9aff2c0daaaf91f1057b3a96385f79011628a9237223757a9b0d090c59cb5982dc54ea0d07191657299ea91ca170a25ced3d7d410fcdff130ace" \ + sha512sum="35cafda2b96df3218a6d8545e0947a899837ede51c90f7ef2980bd2d99dbd67199bc620000df28b186727300b8c7046d506807fb48ee0fbc068dc8ae01986339" \ ; } \ } && \ - curl -LO "https://github.com/lwthiker/curl-impersonate/releases/download/v${curlimpersonate_version}/${archive}" && \ + curl -LO "https://github.com/lexiforest/curl-impersonate/releases/download/v${curlimpersonate_version}/${archive}" && \ echo "$sha512sum $archive" | sha512sum -c - && \ mkdir -p /usr/local/lib/curl-impersonate && \ - tar xaf "$archive" -C /usr/local/lib/curl-impersonate --wildcards 'libcurl-impersonate-ff.so*' && \ + tar xaf "$archive" -C /usr/local/lib/curl-impersonate && \ + patchelf --set-soname libcurl.so.4 /usr/local/lib/curl-impersonate/libcurl-impersonate.so && \ rm "$archive" && \ - apt-get purge --assume-yes curl && \ + apt-get purge --assume-yes curl patchelf && \ rm -rf /var/lib/apt/lists/* -ENV LD_PRELOAD /usr/local/lib/curl-impersonate/libcurl-impersonate-ff.so -ENV CURL_IMPERSONATE ff91esr +ENV LD_PRELOAD /usr/local/lib/curl-impersonate/libcurl-impersonate.so +ENV CURL_IMPERSONATE chrome131 # logs should go to stdout / stderr RUN ln -sfT /dev/stderr /var/log/nginx/error.log; \ diff --git a/config.default.ini.php b/config.default.ini.php index 6d646fd5..ae8b23b7 100644 --- a/config.default.ini.php +++ b/config.default.ini.php @@ -53,7 +53,10 @@ timeout = 5 retries = 1 ; Curl user agent -useragent = "Mozilla/5.0 (Windows NT 
10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0" +; This is already set by curl-impersonate, which is included by default +; in the RSS-Bridge Docker container. Use only if you know what you're doing. +; For reference, see https://github.com/lexiforest/curl-impersonate/tree/main/docs +;useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0" ; Max http response size in MB max_filesize = 20 diff --git a/docs/03_For_Hosts/08_Custom_Configuration.md b/docs/03_For_Hosts/08_Custom_Configuration.md index 6e22f7ee..ebb53cb3 100644 --- a/docs/03_For_Hosts/08_Custom_Configuration.md +++ b/docs/03_For_Hosts/08_Custom_Configuration.md @@ -119,7 +119,9 @@ Default network timeout. ### useragent -Default user agent. +Overrides the user agent value. Note that the default value, together with a set of other detection-preventing options, is set +automatically by [libcurl-impersonate](https://github.com/lexiforest/curl-impersonate), which is used by the default Docker container distributed with RSS-Bridge. Use only if you know what you're doing; otherwise you may stop libcurl-impersonate +from doing its job of impersonating a real browser. ## Authentication diff --git a/docs/09_Technical_recommendations/index.md b/docs/09_Technical_recommendations/index.md index c564418e..92c672ca 100644 --- a/docs/09_Technical_recommendations/index.md +++ b/docs/09_Technical_recommendations/index.md @@ -3,7 +3,7 @@ ## Test a site before building a bridge Some sites make use of anti-bot mechanisms (e.g.: by using JavaScript) in which case they work fine in regular browsers, -but not in the PHP environment. +but not in the PHP environment. The RSS-Bridge Docker container uses libcurl-impersonate by default, which helps mitigate anti-bot mechanisms.
To check if a site works with RSS-Bridge, create a new bridge using the [template](../05_Bridge_API/02_BridgeAbstract.md#template) diff --git a/lib/contents.php b/lib/contents.php index b4d70817..7c9060a9 100644 --- a/lib/contents.php +++ b/lib/contents.php @@ -64,19 +64,7 @@ function getContents( } } - // Snagged from https://github.com/lwthiker/curl-impersonate/blob/main/firefox/curl_ff102 - $defaultHttpHeaders = [ - 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', - 'Accept-Language' => 'en-US,en;q=0.5', - 'Upgrade-Insecure-Requests' => '1', - 'Sec-Fetch-Dest' => 'document', - 'Sec-Fetch-Mode' => 'navigate', - 'Sec-Fetch-Site' => 'none', - 'Sec-Fetch-User' => '?1', - 'TE' => 'trailers', - ]; - - $config['headers'] = array_merge($defaultHttpHeaders, $httpHeadersNormalized); + $config['headers'] = $httpHeadersNormalized; $maxFileSize = Configuration::getConfig('http', 'max_filesize'); if ($maxFileSize) { diff --git a/lib/http.php b/lib/http.php index 15d6ebec..37a1f12e 100644 --- a/lib/http.php +++ b/lib/http.php @@ -66,6 +66,8 @@ final class CurlHttpClient implements HttpClient { public function request(string $url, array $config = []): Response { + $ch = curl_init($url); + $defaults = [ 'useragent' => null, 'timeout' => 5, @@ -77,13 +79,27 @@ final class CurlHttpClient implements HttpClient 'max_filesize' => null, 'max_redirections' => 5, ]; + + // if curl-impersonate is not detected, use some basic defaults + if (curl_version()['ssl_version'] != 'BoringSSL') { + // Snagged from https://github.com/lwthiker/curl-impersonate/blob/main/firefox/curl_ff102 + $defaults['headers'] = [ + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'Accept-Language' => 'en-US,en;q=0.5', + 'Upgrade-Insecure-Requests' => '1', + 'Sec-Fetch-Dest' => 'document', + 'Sec-Fetch-Mode' => 'navigate', + 'Sec-Fetch-Site' => 'none', + 'Sec-Fetch-User' => '?1', + 'TE' => 'trailers', + ]; + 
$defaults['useragent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0'; + + curl_setopt($ch, CURLOPT_HEADER, false); + } + $config = array_merge($defaults, $config); - $ch = curl_init($url); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_MAXREDIRS, $config['max_redirections']); - curl_setopt($ch, CURLOPT_HEADER, false); $httpHeaders = []; foreach ($config['headers'] as $name => $value) { $httpHeaders[] = sprintf('%s: %s', $name, $value); @@ -92,6 +108,9 @@ final class CurlHttpClient implements HttpClient if ($config['useragent']) { curl_setopt($ch, CURLOPT_USERAGENT, $config['useragent']); } + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_MAXREDIRS, $config['max_redirections']); curl_setopt($ch, CURLOPT_TIMEOUT, $config['timeout']); curl_setopt($ch, CURLOPT_ENCODING, ''); curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);