mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-01-17 14:18:35 +01:00
[EconomistBridge] Full rework (#2272)
This commit is contained in:
parent
927cb17dbf
commit
1ddce120ae
@ -1,70 +1,125 @@
|
||||
<?php
|
||||
class EconomistBridge extends BridgeAbstract {
|
||||
const NAME = 'The Economist: Latest Updates';
|
||||
const URI = 'https://www.economist.com';
|
||||
const DESCRIPTION = 'Fetches the latest updates from the Economist.';
|
||||
const MAINTAINER = 'thefranke';
|
||||
const CACHE_TIMEOUT = 3600; // 1h
|
||||
class EconomistBridge extends FeedExpander {
|
||||
|
||||
public function getIcon() {
|
||||
return 'https://www.economist.com/sites/default/files/econfinal_favicon.ico';
|
||||
const MAINTAINER = 'bockiii';
|
||||
const NAME = 'Economist Bridge';
|
||||
const URI = 'https://www.economist.com/';
|
||||
const CACHE_TIMEOUT = 3600; //1hour
|
||||
const DESCRIPTION = 'Returns the latest articles for the selected category';
|
||||
|
||||
const PARAMETERS = array(
|
||||
'global' => array(
|
||||
'limit' => array(
|
||||
'name' => 'Feed Item Limit',
|
||||
'required' => true,
|
||||
'type' => 'number',
|
||||
'defaultValue' => 10,
|
||||
'title' => 'Maximum number of returned feed items. Maximum 30, default 10'
|
||||
)
|
||||
),
|
||||
'Topics' => array(
|
||||
'topic' => array(
|
||||
'name' => 'Topics',
|
||||
'type' => 'list',
|
||||
'title' => 'Select a Topic',
|
||||
'defaultValue' => 'latest',
|
||||
'values' => array(
|
||||
'Latest' => 'latest',
|
||||
'The world this week' => 'the-world-this-week',
|
||||
'Letters' => 'letters',
|
||||
'Leaders' => 'leaders',
|
||||
'Briefings' => 'briefing',
|
||||
'Special reports' => 'special-report',
|
||||
'Britain' => 'britain',
|
||||
'Europe' => 'europe',
|
||||
'United States' => 'united-states',
|
||||
'The Americas' => 'the-americas',
|
||||
'Middle East and Africa' => 'middle-east-and-africa',
|
||||
'Asia' => 'asia',
|
||||
'China' => 'china',
|
||||
'International' => 'international',
|
||||
'Business' => 'business',
|
||||
'Finance and economics' => 'finance-and-economics',
|
||||
'Science and technology' => 'science-and-technology',
|
||||
'Books and arts' => 'books-and-arts',
|
||||
'Obituaries' => 'obituary',
|
||||
'Graphic detail' => 'graphic-detail',
|
||||
'Indicators' => 'economic-and-financial-indicators',
|
||||
)
|
||||
)
|
||||
),
|
||||
'Blogs' => array(
|
||||
'blog' => array(
|
||||
'name' => 'Blogs',
|
||||
'type' => 'list',
|
||||
'title' => 'Select a Blog',
|
||||
'values' => array(
|
||||
'Bagehots notebook' => 'bagehots-notebook',
|
||||
'Bartleby' => 'bartleby',
|
||||
'Buttonwoods notebook' => 'buttonwoods-notebook',
|
||||
'Charlemagnes notebook' => 'charlemagnes-notebook',
|
||||
'Democracy in America' => 'democracy-in-america',
|
||||
'Erasmus' => 'erasmus',
|
||||
'Free exchange' => 'free-exchange',
|
||||
'Game theory' => 'game-theory',
|
||||
'Gulliver' => 'gulliver',
|
||||
'Kaffeeklatsch' => 'kaffeeklatsch',
|
||||
'Prospero' => 'prospero',
|
||||
'The Economist Explains' => 'the-economist-explains',
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
public function collectData(){
|
||||
// get if topics or blogs were selected and store the selected category
|
||||
switch ($this->queriedContext) {
|
||||
case 'Topics':
|
||||
$category = $this->getInput('topic');
|
||||
break;
|
||||
case 'Blogs':
|
||||
$category = $this->getInput('blog');
|
||||
break;
|
||||
default:
|
||||
$category = 'latest';
|
||||
}
|
||||
// limit the returned articles to 30 at max
|
||||
if ((int)$this->getInput('limit') <= 30) {
|
||||
$limit = (int)$this->getInput('limit');
|
||||
} else {
|
||||
$limit = 30;
|
||||
}
|
||||
|
||||
$this->collectExpandableDatas('https://www.economist.com/' . $category . '/rss.xml', $limit);
|
||||
}
|
||||
|
||||
public function collectData() {
|
||||
$html = getSimpleHTMLDOM(self::URI . '/latest/')
|
||||
or returnServerError('Could not fetch latest updates form The Economist.');
|
||||
protected function parseItem($feedItem){
|
||||
$item = parent::parseItem($feedItem);
|
||||
|
||||
foreach($html->find('div.teaser') as $element) {
|
||||
$article = getSimpleHTMLDOM($item['uri'])
|
||||
or returnServerError('Could not request Site: ' . $item['title']);
|
||||
// before the article can be added, it needs to be cleaned up, thus, the extra function
|
||||
$item['content'] = $this->cleanContent($article);
|
||||
// only the article lead image is retained
|
||||
$item['enclosures'][] = $article->find('div.article__lead-image', 0)->find('img', 0)->getAttribute('src');
|
||||
|
||||
$a = $element->find('a.headline-link', 0);
|
||||
$href = $a->href;
|
||||
return $item;
|
||||
}
|
||||
|
||||
if (substr($href, 0, 4) != 'http')
|
||||
$href = self::URI . $a->href;
|
||||
|
||||
$full = getSimpleHTMLDOMCached($href);
|
||||
$article = $full->find('article', 0);
|
||||
$header = $article->find('span[itemprop="headline"]', 0);
|
||||
$headerimg = $article->find('div[itemprop="image"]', 0)->find('img', 0);
|
||||
$author = $article->find('p[itemprop="byline"]', 0);
|
||||
$time = $article->find('time', 0);
|
||||
$content = $article->find('div[itemprop="text"]', 0);
|
||||
$section = array( $article->find('strong[itemprop="articleSection"]', 0)->plaintext );
|
||||
|
||||
// Author
|
||||
if ($author)
|
||||
$author = substr($author->innertext, 3, strlen($author));
|
||||
else
|
||||
$author = 'The Economist';
|
||||
|
||||
// Remove newsletter subscription box
|
||||
$newsletter = $content->find('div[class="newsletter-form__message"]', 0);
|
||||
if ($newsletter)
|
||||
$newsletter->outertext = '';
|
||||
|
||||
$newsletterForm = $content->find('form', 0);
|
||||
if ($newsletterForm)
|
||||
$newsletterForm->outertext = '';
|
||||
|
||||
// Remove next and previous article URLs at the bottom
|
||||
$nextprev = $content->find('div[class="blog-post__next-previous-wrapper"]', 0);
|
||||
if ($nextprev)
|
||||
$nextprev->outertext = '';
|
||||
|
||||
$item = array();
|
||||
$item['title'] = $header->innertext;
|
||||
$item['uri'] = $href;
|
||||
$item['timestamp'] = strtotime($time->datetime);
|
||||
$item['author'] = $author;
|
||||
$item['categories'] = $section;
|
||||
|
||||
$item['content'] = '<img style="max-width: 100%" src="'
|
||||
. $headerimg->src . '">' . $content->innertext;
|
||||
|
||||
$this->items[] = $item;
|
||||
|
||||
if (count($this->items) >= 10)
|
||||
break;
|
||||
private function cleanContent($article){
|
||||
// the actual article is in this div
|
||||
$content = $article->find('div.layout-article-body', 0)->innertext;
|
||||
// clean the article content. Remove all div's since the text is in paragraph elements
|
||||
foreach (array(
|
||||
'<div '
|
||||
) as $tag_start) {
|
||||
$content = stripRecursiveHTMLSection($content, 'div', $tag_start);
|
||||
}
|
||||
// now remove embedded iframes. The podcast postings contain these for example
|
||||
$content = preg_replace('/<iframe.*?\/iframe>/i', '', $content);
|
||||
// fix the relative links
|
||||
$content = defaultLinkTo($content, $this->getURI());
|
||||
|
||||
return $content;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user