From cb4bc57c72af4e90ca85eba0e1850f672d63fb9b Mon Sep 17 00:00:00 2001 From: somini <somini@users.noreply.github.com> Date: Thu, 24 Mar 2022 22:16:02 +0000 Subject: [PATCH] [FolhaDeSaoPauloBridge]: Small improvements (#1724) --- bridges/FolhaDeSaoPauloBridge.php | 46 +++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/bridges/FolhaDeSaoPauloBridge.php b/bridges/FolhaDeSaoPauloBridge.php index 91f1268c..58f6040b 100644 --- a/bridges/FolhaDeSaoPauloBridge.php +++ b/bridges/FolhaDeSaoPauloBridge.php @@ -12,27 +12,43 @@ class FolhaDeSaoPauloBridge extends FeedExpander { 'required' => true, 'title' => 'Select the sub-feed (see https://www1.folha.uol.com.br/feed/)', 'exampleValue' => 'emcimadahora/rss091.xml', - ) + ), + 'amount' => array( + 'name' => 'Amount of items to fetch', + 'type' => 'number', + 'defaultValue' => 15, + ), + 'deep_crawl' => array( + 'name' => 'Deep Crawl', + 'description' => 'Crawl each item "deeply", that is, return the article contents', + 'type' => 'checkbox', + 'defaultValue' => true, + ), ) ); protected function parseItem($item){ $item = parent::parseItem($item); - $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']); - if($articleHTMLContent) { - foreach ($articleHTMLContent->find('div.c-news__body .is-hidden') as $toRemove) { - $toRemove->innertext = ''; + if ($this->getInput('deep_crawl')) { + $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']); + if($articleHTMLContent) { + foreach ($articleHTMLContent->find('div.c-news__body .is-hidden') as $toRemove) { + $toRemove->innertext = ''; + } + $item_content = $articleHTMLContent->find('div.c-news__body', 0); + if ($item_content) { + $text = $item_content->innertext; + $text = strip_tags($text, '<p><b><a><blockquote><figure><figcaption><img><strong><em><ul><li>'); + $item['content'] = $text; + $item['uri'] = explode('*', $item['uri'])[1]; + } + } else { + Debug::log('???: ' . $item['uri']); } - $item_content = $articleHTMLContent->find('div.c-news__body', 0); - if ($item_content) { - $text = $item_content->innertext; - $text = strip_tags($text, '<p><b><a><blockquote><figure><figcaption><img><strong><em>'); - $item['content'] = $text; - $item['uri'] = explode('*', $item['uri'])[1]; - } - } else { - Debug::log('???: ' . $item['uri']); + } + else { + $item['uri'] = explode('*', $item['uri'])[1]; } return $item; @@ -48,6 +64,6 @@ class FolhaDeSaoPauloBridge extends FeedExpander { $feed_url = self::URI . '/' . $this->getInput('feed'); } Debug::log('URL: ' . $feed_url); - $this->collectExpandableDatas($feed_url); + $this->collectExpandableDatas($feed_url, $this->getInput('amount')); } }