From b505667168225a74c30e7b58ab6a54bc00c3b89e Mon Sep 17 00:00:00 2001 From: Pavel Korytov Date: Wed, 31 Jul 2024 22:57:20 +0300 Subject: [PATCH] [SubstackBridge] Add Substack bridge (#4174) * [SubstackBridge] Add Substack * [SubstackBridge] Add docs * [SubstackBridge] Fix lint * [SubstackBridge] Update description * [SubstackBridge] Update description (x2) --- bridges/SubstackBridge.php | 50 +++++++++++++++++++++++++++++ docs/10_Bridge_Specific/Substack.md | 18 +++++++++++ lib/FeedExpander.php | 4 +-- 3 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 bridges/SubstackBridge.php create mode 100644 docs/10_Bridge_Specific/Substack.md diff --git a/bridges/SubstackBridge.php b/bridges/SubstackBridge.php new file mode 100644 index 00000000..13eea02e --- /dev/null +++ b/bridges/SubstackBridge.php @@ -0,0 +1,50 @@ + [ + 'required' => false, + ] + ]; + + const PARAMETERS = [ + '' => [ + 'url' => [ + 'name' => 'Substack RSS URL', + 'required' => true, + 'type' => 'text', + 'defaultValue' => 'https://newsletter.pragmaticengineer.com/feed', + 'title' => 'Usually https://{blog domain}/feed' + ] + ] + ]; + + /** + * Fetches the feed named by the `url` input and hands it to collectExpandableDatas(). + * + * When the optional `sid` bridge option is configured, the request additionally + * carries the Substack session cookie and browser-like headers, which is what + * makes Substack return the full text of paywalled posts for active subscriptions. + * + * NOTE(review): `url` is user-supplied and the configured session cookie is sent to + * whatever host it names. On a shared instance this looks like it could disclose + * `sid` to a third-party server - confirm, and consider restricting the cookie to + * trusted hosts. + */ + public function collectData() + { + $headers = []; + if ($this->getOption('sid')) { + // PHP_URL_HOST avoids an undefined-key warning when the URL carries no host. + $authority = parse_url($this->getInput('url'), PHP_URL_HOST); + $cookies = [ + 'ab_experiment_sampled=%22false%22', + 'substack.sid=' . $this->getOption('sid'), + 'substack.lli=1', + // The trailing "Z" is a literal, so the timestamp must be taken in UTC rather than the default timezone. + 'intro_popup_last_hidden_at=' . (new DateTime('now', new DateTimeZone('UTC')))->format('Y-m-d\TH:i:s.v\Z') + ]; + $headers = [ + 'Authority: ' . $authority, + 'Cache-Control: max-age=0', + 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36', + 'Cookie: ' . 
implode('; ', $cookies) + ]; + } + $this->collectExpandableDatas($this->getInput('url'), -1, $headers); + } +} diff --git a/docs/10_Bridge_Specific/Substack.md b/docs/10_Bridge_Specific/Substack.md new file mode 100644 index 00000000..7595bbef --- /dev/null +++ b/docs/10_Bridge_Specific/Substack.md @@ -0,0 +1,18 @@ +# SubstackBridge + +[Substack](https://substack.com) provides RSS feeds at the `/feed` path, e.g., https://newsletter.pragmaticengineer.com/feed/. However, these feeds have two problems, addressed by this bridge: +- They use RSS 2.0 with the draft [content extension](https://web.resource.org/rss/1.0/modules/content/), which isn't supported by some readers; +- They don't have the full content for paywalled posts. + +Retrieving the full content is only possible _with an active subscription to the blog_. If you have one, Substack will return the full feed if it's fetched with the right set of cookies. Figuring out whether it's the intended behaviour is left as an exercise for the reader. + +To obtain the session cookie, log in at https://substack.com/, open DevTools, go to Application -> Cookies -> https://substack.com, copy the value of `substack.sid` and paste it into the RSS-Bridge config: + +``` +[SubstackBridge] +sid = "YOUR_SUBSTACK_SID_VALUE" +``` + +Logging in sometimes requires solving a CAPTCHA, hence this step is manual. The cookie lives for three months. + +After you've done this, the bridge should return full feeds for your subscriptions. 
diff --git a/lib/FeedExpander.php b/lib/FeedExpander.php index fe809bc2..ef001af1 100644 --- a/lib/FeedExpander.php +++ b/lib/FeedExpander.php @@ -7,7 +7,7 @@ abstract class FeedExpander extends BridgeAbstract { private array $feed; - public function collectExpandableDatas(string $url, $maxItems = -1) + public function collectExpandableDatas(string $url, $maxItems = -1, $headers = []) { if (!$url) { throw new \Exception('There is no $url for this RSS expander'); @@ -17,7 +17,7 @@ abstract class FeedExpander extends BridgeAbstract $maxItems = 999; } $accept = [MrssFormat::MIME_TYPE, AtomFormat::MIME_TYPE, '*/*']; - $httpHeaders = ['Accept: ' . implode(', ', $accept)]; + $httpHeaders = array_merge(['Accept: ' . implode(', ', $accept)], $headers); $xmlString = getContents($url, $httpHeaders); if ($xmlString === '') { throw new \Exception(sprintf('Unable to parse xml from `%s` because we got the empty string', $url), 10);