2018-10-15 18:09:20 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
 * An extension of the previous SexactuBridge to cover the whole GQMagazine.
 * This one takes a page (for example sexe/news or journaliste/maia-mazaurette), which is to be configured,
 * reads all the articles visible on that page, and makes a stream out of them.
 * @author nicolas-delsaux
 *
 */
|
|
|
|
class GQMagazineBridge extends BridgeAbstract
{
    const MAINTAINER = 'Riduidel';

    const NAME = 'GQMagazine';

    // Default site root only: the root actually used at runtime comes from the
    // "domain" input (see getDomain()), so that other sites can be addressed too.
    const URI = 'https://www.gqmagazine.fr';

    const CACHE_TIMEOUT = 7200; // 2h
    const DESCRIPTION = 'GQMagazine section extractor bridge. This bridge allows you get only a specific section.';

    const DEFAULT_DOMAIN = 'www.gqmagazine.fr';

    const PARAMETERS = [ [
        'domain' => [
            'name' => 'Domain to use',
            'required' => true,
            'defaultValue' => self::DEFAULT_DOMAIN
        ],
        'page' => [
            'name' => 'Initial page to load',
            'required' => true,
            'exampleValue' => 'sexe/news'
        ],
        'limit' => self::LIMIT,
    ]];

    // Attribute rewritten in article HTML => attribute name emitted in its place.
    const REPLACED_ATTRIBUTES = [
        'href' => 'href',
        'src' => 'src',
        'data-original' => 'src'
    ];

    // Tags probed, in order, to find the title of an article teaser.
    const POSSIBLE_TITLES = [
        'h2',
        'h3'
    ];

    /**
     * Returns the site root built from the "domain" input.
     *
     * Falls back to DEFAULT_DOMAIN when the input is empty, prepends "https://"
     * when no scheme is given, and strips trailing slashes so that callers can
     * safely append "/path".
     *
     * @return string Site root with scheme and without trailing slash
     */
    private function getDomain()
    {
        $domain = trim((string) $this->getInput('domain'));
        if (empty($domain)) {
            $domain = self::DEFAULT_DOMAIN;
        }
        if (strpos($domain, '://') === false) {
            $domain = 'https://' . $domain;
        }

        return rtrim($domain, '/');
    }

    /**
     * Returns the URI of the configured listing page (domain + "page" input).
     *
     * A leading slash on the page is dropped to avoid a double slash in the result.
     *
     * @return string The listing page URI
     */
    public function getURI()
    {
        return $this->getDomain() . '/' . ltrim((string) $this->getInput('page'), '/');
    }

    /**
     * Looks for a title next to the given link.
     *
     * The link's parent element is searched for the first tag listed in
     * POSSIBLE_TITLES that yields a non-null plain text.
     *
     * @param $link A simplehtmldom element (the article link)
     * @return The title text, or null when no title element was found
     */
    private function findTitleOf($link)
    {
        $container = $link->parent();
        if ($container === null) {
            return null;
        }

        foreach (self::POSSIBLE_TITLES as $tag) {
            $title = $container->find($tag, 0);
            if ($title !== null && $title->plaintext !== null) {
                return $title->plaintext;
            }
        }

        return null;
    }

    /**
     * Turns a link target found on the listing page into an absolute URI.
     *
     * @param $uri The raw href value
     * @return string The absolute URI
     */
    private function toAbsoluteUri($uri)
    {
        $uri = trim((string) $uri);

        // Already absolute. Testing only the first character is not enough:
        // a relative path such as "homme/..." also starts with "h".
        if (preg_match('#^https?://#i', $uri) === 1) {
            return $uri;
        }

        // Protocol-relative ("//host/path"): reuse the scheme of the configured domain.
        if (strpos($uri, '//') === 0) {
            $scheme = parse_url($this->getDomain(), PHP_URL_SCHEME);
            return ($scheme ? $scheme : 'https') . ':' . $uri;
        }

        // Domain-relative ("/path") or plain relative ("path").
        return $this->getDomain() . '/' . ltrim($uri, '/');
    }

    /**
     * Builds the feed items from the configured listing page.
     *
     * Every link inside <main> whose parent element contains an author
     * (span[itemprop=name]) is treated as an article teaser; the full article
     * is then loaded to fill the item content. Stops once "limit" items
     * (10 when the input is not set) have been collected.
     *
     * @throws \Exception When the listing page has no <main> element
     */
    public function collectData()
    {
        $html = getSimpleHTMLDOM($this->getURI());

        // The site does not expose stable class names, so the content is
        // discovered structurally instead of through class selectors.
        $main = $html->find('main', 0);
        if ($main === null) {
            throw new \Exception('Unable to find the <main> element in ' . $this->getURI());
        }

        $limit = $this->getInput('limit') ?? 10;
        foreach ($main->find('a') as $link) {
            if (count($this->items) >= $limit) {
                break;
            }

            $container = $link->parent();
            if ($container === null) {
                continue;
            }

            // Links without an author next to them are not article teasers.
            $author = $container->find('span[itemprop=name]', 0);
            if ($author === null) {
                continue;
            }

            $item = [];
            $item['author'] = $author->plaintext;
            $item['title'] = $this->findTitleOf($link);
            $item['uri'] = $this->toAbsoluteUri($link->href);

            $article = $this->loadFullArticle($item['uri']);
            if ($article) {
                $item['content'] = $this->replaceUriInHtmlElement($article);
            } else {
                $item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
            }

            // The <time> element is optional: only set a timestamp when one
            // is present and its datetime value can be parsed.
            $date = $container->find('time', 0);
            if ($date !== null) {
                $timestamp = strtotime((string) $date->datetime);
                if ($timestamp !== false) {
                    $item['timestamp'] = $timestamp;
                }
            }

            $this->items[] = $item;
        }
    }

    /**
     * Loads the full article and returns the contents
     * @param $uri The article URI
     * @return The first <article> element of the page, or null when there is none
     */
    private function loadFullArticle($uri)
    {
        $html = getSimpleHTMLDOMCached($uri);

        return $html->find('article', 0);
    }

    /**
     * Replaces all domain-relative URIs with absolute ones
     *
     * Attributes listed in REPLACED_ATTRIBUTES whose value starts with a single
     * "/" are prefixed with the configured domain. Protocol-relative values
     * ("//host/...") are left untouched.
     *
     * @param $element A simplehtmldom element
     * @return The $element->innertext with all URIs replaced
     */
    private function replaceUriInHtmlElement($element)
    {
        $returned = $element->innertext;
        $domain = $this->getDomain();

        foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
            // (?!/) keeps protocol-relative URLs out of the match.
            $pattern = '#' . preg_quote($initial, '#') . '="/(?!/)#';
            $replacement = $final . '="' . $domain . '/';

            // A callback is used so the replacement is inserted literally
            // ("$" or "\" in the domain are not read as back-references).
            $replaced = preg_replace_callback(
                $pattern,
                function () use ($replacement) {
                    return $replacement;
                },
                $returned
            );

            // preg_replace_callback() returns null on error: keep the previous text then.
            if ($replaced !== null) {
                $returned = $replaced;
            }
        }

        return $returned;
    }
}
|