From cb4bc57c72af4e90ca85eba0e1850f672d63fb9b Mon Sep 17 00:00:00 2001
From: somini <somini@users.noreply.github.com>
Date: Thu, 24 Mar 2022 22:16:02 +0000
Subject: [PATCH] [FolhaDeSaoPauloBridge]: Small improvements (#1724)

---
 bridges/FolhaDeSaoPauloBridge.php | 46 +++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/bridges/FolhaDeSaoPauloBridge.php b/bridges/FolhaDeSaoPauloBridge.php
index 91f1268c..58f6040b 100644
--- a/bridges/FolhaDeSaoPauloBridge.php
+++ b/bridges/FolhaDeSaoPauloBridge.php
@@ -12,27 +12,43 @@ class FolhaDeSaoPauloBridge extends FeedExpander {
 				'required' => true,
 				'title' => 'Select the sub-feed (see https://www1.folha.uol.com.br/feed/)',
 				'exampleValue' => 'emcimadahora/rss091.xml',
-			)
+			),
+			'amount' => array(
+				'name' => 'Amount of items to fetch',
+				'type' => 'number',
+				'defaultValue' => 15,
+			),
+			'deep_crawl' => array(
+				'name' => 'Deep Crawl',
+				'description' => 'Crawl each item "deeply", that is, return the article contents',
+				'type' => 'checkbox',
+				'defaultValue' => true,
+			),
 		)
 	);
 
 	protected function parseItem($item){
 		$item = parent::parseItem($item);
 
-		$articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
-		if($articleHTMLContent) {
-			foreach ($articleHTMLContent->find('div.c-news__body .is-hidden') as $toRemove) {
-				$toRemove->innertext = '';
+		if ($this->getInput('deep_crawl')) {
+			$articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
+			if($articleHTMLContent) {
+				foreach ($articleHTMLContent->find('div.c-news__body .is-hidden') as $toRemove) {
+					$toRemove->innertext = '';
+				}
+				$item_content = $articleHTMLContent->find('div.c-news__body', 0);
+				if ($item_content) {
+					$text = $item_content->innertext;
+					$text = strip_tags($text, '<p><b><a><blockquote><figure><figcaption><img><strong><em><ul><li>');
+					$item['content'] = $text;
+					$item['uri'] = explode('*', $item['uri'])[1];
+				}
+			} else {
+				Debug::log('???: ' . $item['uri']);
 			}
-			$item_content = $articleHTMLContent->find('div.c-news__body', 0);
-			if ($item_content) {
-				$text = $item_content->innertext;
-				$text = strip_tags($text, '<p><b><a><blockquote><figure><figcaption><img><strong><em>');
-				$item['content'] = $text;
-				$item['uri'] = explode('*', $item['uri'])[1];
-			}
-		} else {
-			Debug::log('???: ' . $item['uri']);
+		}
+		else {
+			$item['uri'] = explode('*', $item['uri'])[1];
 		}
 
 		return $item;
@@ -48,6 +64,6 @@ class FolhaDeSaoPauloBridge extends FeedExpander {
 			$feed_url = self::URI . '/' . $this->getInput('feed');
 		}
 		Debug::log('URL: ' . $feed_url);
-		$this->collectExpandableDatas($feed_url);
+		$this->collectExpandableDatas($feed_url, $this->getInput('amount'));
 	}
 }