From 9efdf24a6ed4c96a03d4aeb8684d5b082627057c Mon Sep 17 00:00:00 2001 From: ORelio Date: Fri, 14 Jul 2023 22:09:45 +0200 Subject: [PATCH] Add CustomBridge (#3457) * Add CustomBridge For advanced users. Create RSS feed using HTML selectors. * [CssSelectorBridge] Refactor, Allow Unexpanded Rename bridge to CssSelectorBridge Allow unexpanded feed, i.e. make feed from home page only (1 request) Refactor bridge to put most of the code into protected functions Makes the code more maintainable and allows inheritance for variants * [CssSelectorBridge] Fix linting --- bridges/CssSelectorBridge.php | 208 ++++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 bridges/CssSelectorBridge.php diff --git a/bridges/CssSelectorBridge.php b/bridges/CssSelectorBridge.php new file mode 100644 index 00000000..ae135113 --- /dev/null +++ b/bridges/CssSelectorBridge.php @@ -0,0 +1,208 @@ + [ + 'name' => 'Site URL: Home page with latest articles', + 'exampleValue' => 'https://example.com/blog/', + 'required' => true + ], + 'url_selector' => [ + 'name' => 'Selector for article links or their parent elements', + 'exampleValue' => 'a.article', + 'required' => true + ], + 'url_pattern' => [ + 'name' => '[Optional] Pattern for site URLs to keep in feed', + 'exampleValue' => 'https://example.com/article/.*', + ], + 'content_selector' => [ + 'name' => '[Optional] Selector to extract each article content', + 'exampleValue' => 'article.content', + ], + 'content_cleanup' => [ + 'name' => '[Optional] Content cleanup: List of items to remove', + 'exampleValue' => 'div.ads, div.comments', + ], + 'title_cleanup' => [ + 'name' => '[Optional] Text to remove from expanded article title', + 'exampleValue' => ' | BlogName', + ], + 'limit' => self::LIMIT + ] + ]; + + private $feedName = ''; + + public function getURI() + { + $url = $this->getInput('home_page'); + if (empty($url)) { + $url = parent::getURI(); + } + return $url; + } + + public function getName() + { + if 
(!empty($this->feedName)) {
+            return $this->feedName;
+        }
+        return parent::getName();
+    }
+
+    /**
+     * Build the feed: fetch the home page, list article links and optionally expand each article.
+     *
+     * Reads the bridge inputs, stores the home page title as feed name, then fills $this->items.
+     * Without a content selector the items only hold the uri and title found on the home page
+     * (single request); with one, every article page is fetched to extract its content.
+     */
+    public function collectData()
+    {
+        $url = $this->getInput('home_page');
+        $url_selector = $this->getInput('url_selector');
+        $url_pattern = $this->getInput('url_pattern');
+        $content_selector = $this->getInput('content_selector');
+        $content_cleanup = $this->getInput('content_cleanup');
+        $title_cleanup = $this->getInput('title_cleanup');
+        // Fall back to 10 entries when no limit was provided
+        $limit = $this->getInput('limit') ?? 10;
+
+        // NOTE(review): defaultLinkTo() presumably makes relative links absolute against $url - confirm
+        $html = defaultLinkTo(getSimpleHTMLDOM($url), $url);
+        $this->feedName = $this->getPageTitle($html, $title_cleanup);
+        $items = $this->htmlFindLinks($html, $url_selector, $url_pattern, $limit);
+
+        if (empty($content_selector)) {
+            // Unexpanded feed: keep the minimal items built from the home page
+            $this->items = $items;
+        } else {
+            foreach ($items as $item) {
+                $this->items[] = $this->expandEntryWithSelector(
+                    $item['uri'],
+                    $content_selector,
+                    $content_cleanup,
+                    $title_cleanup
+                );
+            }
+        }
+    }
+
+    /**
+     * Filter a list of URLs using a pattern and limit
+     *
+     * The pattern is used as a regular expression: it is wrapped in "/" delimiters after
+     * escaping the slashes it contains, and it is not anchored.
+     *
+     * @param array $links List of URLs
+     * @param string $url_pattern Pattern to look for in URLs, empty to keep every URL
+     * @param int $limit Optional maximum amount of URLs to return, 0 or less for no limit
+     * @return array Array of URLs, reindexed from 0
+     */
+    protected function filterUrlList($links, $url_pattern, $limit = 0)
+    {
+        if (!empty($url_pattern)) {
+            $regex = '/' . str_replace('/', '\/', $url_pattern) . '/';
+            // The regex has to be imported with use(): a closure does not see the variables
+            // of the enclosing method, so without it no URL could ever match.
+            $links = array_filter($links, function ($url) use ($regex) {
+                return preg_match($regex, $url) === 1;
+            });
+        }
+
+        if ($limit > 0 && count($links) > $limit) {
+            $links = array_slice($links, 0, $limit);
+        }
+
+        // array_filter() preserves keys: reindex so that callers always receive a plain list
+        return array_values($links);
+    }
+
+    /**
+     * Retrieve title from webpage URL or DOM
+     * @param string|object $page URL or DOM to retrieve title from
+     * @param string $title_cleanup optional string to remove from webpage title, e.g. " | BlogName"
+     * @return string Webpage title, or an empty string when the page has no title element
+     */
+    protected function getPageTitle($page, $title_cleanup = null)
+    {
+        if (is_string($page)) {
+            $page = getSimpleHTMLDOMCached($page);
+        }
+
+        // Do not dereference a missing element when the page has no <title> tag
+        $title_element = $page->find('title', 0);
+        if (empty($title_element)) {
+            return '';
+        }
+
+        $title = html_entity_decode($title_element->plaintext);
+        if (!empty($title)) {
+            // Only call str_replace() when there is something to remove (no null search argument)
+            if (is_string($title_cleanup) && $title_cleanup !== '') {
+                $title = str_replace($title_cleanup, '', $title);
+            }
+            $title = trim($title);
+        }
+        return $title;
+    }
+
+    /**
+     * Retrieve first N links from a webpage DOM satisfying the specified criteria
+     *
+     * When several matches share the same href, a single item is kept with the text of the last one.
+     *
+     * @param object $page DOM to retrieve links from (find() is called on it directly, a URL is not accepted)
+     * @param string $url_selector DOM selector for matching links or their parent element
+     * @param string $url_pattern Optional filter to keep only links matching the pattern
+     * @param int $limit Optional maximum amount of URLs to return
+     * @return array of minimal feed items {'uri': entry_url, 'title', entry_title}
+     */
+    protected function htmlFindLinks($page, $url_selector, $url_pattern = '', $limit = 0)
+    {
+        $links = $page->find($url_selector);
+
+        if (empty($links)) {
+            returnClientError('No results for URL selector');
+        }
+
+        $link_to_title = [];
+        foreach ($links as $link) {
+            if ($link->tag !== 'a') {
+                // The selector matched a parent element: use the first link it contains
+                $link = $link->find('a', 0);
+            }
+            // Skip matches without any usable link instead of dereferencing a missing element
+            if (empty($link)) {
+                continue;
+            }
+            $href = $link->href;
+            if (empty($href)) {
+                continue;
+            }
+            $link_to_title[$href] = $link->plaintext;
+        }
+
+        if (empty($link_to_title)) {
+            returnClientError('No results for URL selector');
+        }
+
+        $links = $this->filterUrlList(array_keys($link_to_title), $url_pattern, $limit);
+
+        if (empty($links)) {
+            returnClientError('No results for URL pattern');
+        }
+
+        $items = [];
+        foreach ($links as $link) {
+            $items[] = [
+                'uri' => $link,
+                'title' => $link_to_title[$link],
+            ];
+        }
+
+        return $items;
+    }
+
+    /**
+     * Retrieve article content from its URL using content selector and return a feed item
+     * @param string $entry_url URL to retrieve article from
+     * @param string $content_selector HTML selector for extracting content, e.g. "article.content"
+     * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
+     * @param string $title_cleanup Optional string to remove from article title, e.g. " | BlogName"
+     * @return array Entry data: uri, title, content
+     */
+    protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null)
+    {
+        if (empty($content_selector)) {
+            returnClientError('Please specify a content selector');
+        }
+
+        $entry_html = getSimpleHTMLDOMCached($entry_url);
+        $article_content = $entry_html->find($content_selector);
+
+        if (empty($article_content)) {
+            returnClientError('Could not find content selector at URL: ' . $entry_url);
+        } else {
+            // Only the first element matching the selector is used as article content
+            $article_content = $article_content[0];
+        }
+
+        if (!empty($content_cleanup)) {
+            // Blank the markup of every element matching the cleanup selector
+            foreach ($article_content->find($content_cleanup) as $item_to_clean) {
+                $item_to_clean->outertext = '';
+            }
+        }
+
+        $article_content = convertLazyLoading($article_content);
+        $article_content = defaultLinkTo($article_content, $entry_url);
+
+        return [
+            'uri' => $entry_url,
+            'title' => $this->getPageTitle($entry_html, $title_cleanup),
+            'content' => $article_content,
+        ];
+    }
+}