1
0
mirror of https://github.com/RSS-Bridge/rss-bridge.git synced 2025-08-31 20:02:03 +02:00

Fix GolemBridge pagination and avoid repeating identical content (especially images) across multiple pages

This commit is contained in:
Micha
2025-08-28 13:24:11 +02:00
committed by Mynacol
parent 32f324dbb5
commit 3f33d0e312

View File

@@ -82,25 +82,35 @@ class GolemBridge extends FeedExpander
// URI without RSS feed reference // URI without RSS feed reference
$item['uri'] = $articlePage->find('head meta[name="twitter:url"]', 0)->content; $item['uri'] = $articlePage->find('head meta[name="twitter:url"]', 0)->content;
$categories = $articlePage->find('div.go-tag-list__tags a.go-tag'); if (!array_key_exists('categories', $item)) {
foreach ($categories as $category) { $categories = $articlePage->find('div.go-tag-list__tags a.go-tag');
$trimmedcategories[] = trim(html_entity_decode($category->plaintext)); foreach ($categories as $category) {
} $trimmedcategories[] = trim(html_entity_decode($category->plaintext));
if (isset($trimmedcategories)) { }
$item['categories'] = array_unique($trimmedcategories); if (isset($trimmedcategories)) {
$item['categories'] = array_unique($trimmedcategories);
}
} }
$item['content'] .= $this->extractContent($articlePage); $item['content'] .= $this->extractContent($articlePage, $item['content']);
// next page // next page
$nextUri = $articlePage->find('link[rel="next"]', 0); $nextUri = $articlePage->find('li.go-pagination__item--next>a', 0);
$uri = $nextUri ? static::URI . $nextUri->href : null; if ($nextUri) {
$nextUri = $nextUri->href;
if (str_starts_with($nextUri, '/')) {
$nextUri = substr($nextUri, 1);
}
$uri = static::URI . $nextUri;
} else {
$uri = null;
}
} }
return $item; return $item;
} }
private function extractContent($page) private function extractContent($page, $prevcontent)
{ {
$item = ''; $item = '';
@@ -150,17 +160,21 @@ class GolemBridge extends FeedExpander
} }
$header = $article->find('header', 0); $header = $article->find('header', 0);
foreach ($header->find('p, figure') as $element) { if (isset($header)) {
$item .= $element; foreach ($header->find('p, figure') as $element) {
} $item .= $element;
}
// full image quality // full image quality
foreach ($article->find('img[data-src-full][src*="."]') as $img) { foreach ($article->find('img[data-src-full][src*="."]') as $img) {
$img->src = $img->getAttribute('data-src-full'); $img->src = $img->getAttribute('data-src-full');
}
} }
foreach ($article->find('div.go-article-header__intro, p, h1, h2, h3, pre, img[src*="."], div[class*="golem_tablediv"], iframe, video') as $element) { foreach ($article->find('div.go-article-header__intro, p, h1, h2, h3, pre, img[src*="."], div[class*="golem_tablediv"], iframe, video') as $element) {
$item .= $element; if (!str_contains($prevcontent, $element)) {
$item .= $element;
}
} }
return $item; return $item;