1
0
mirror of https://github.com/RSS-Bridge/rss-bridge.git synced 2025-10-28 21:55:50 +01:00

feat: introduce convenience function get_sitemap (#4773)

* feat: introduce function get_sitemap

Convenience function to fetch and parse an XML sitemap from a URL

* lint
This commit is contained in:
Dag
2025-10-25 02:08:43 +02:00
committed by GitHub
parent 104308c0b2
commit f604ed842f
5 changed files with 59 additions and 46 deletions

View File

@@ -13,21 +13,11 @@ class CybernewsBridge extends BridgeAbstract
public function collectData()
{
$sitemapUrl = self::URI . '/news-sitemap.xml';
$urls = get_sitemap('https://cybernews.com/news-sitemap.xml');
$sitemapXml = getContents($sitemapUrl);
if (!$sitemapXml) {
throwServerException('Unable to retrieve Cybernews sitemap');
}
$sitemap = simplexml_load_string($sitemapXml, null, LIBXML_NOCDATA | LIBXML_NONET);
if (!$sitemap) {
throwServerException('Unable to parse Cybernews sitemap');
}
foreach ($sitemap->url as $entry) {
$url = trim((string) $entry->loc);
$lastmod = trim((string) $entry->lastmod);
foreach ($urls as $entry) {
$url = $entry['loc'];
$lastmod = $entry['lastmod'];
if (!$url) {
continue;
@@ -41,13 +31,12 @@ class CybernewsBridge extends BridgeAbstract
// continue;
// }
$namespaces = $entry->getNamespaces(true);
$title = '';
if (isset($namespaces['news'])) {
$news = $entry->children($namespaces['news'])->news;
if (isset($entry['news'])) {
$news = $entry['news'];
if ($news) {
$title = trim((string) $news->title);
$title = trim((string) $news['title']);
}
}

View File

@@ -13,36 +13,25 @@ class InvestorsObserverBridge extends BridgeAbstract
public function collectData()
{
$sitemapXml = getContents(self::URI . '/news-sitemap.xml');
$urls = get_sitemap(self::URI . '/news-sitemap.xml');
if (!$sitemapXml) {
throwServerException('Unable to retrieve sitemap');
}
foreach ($urls as $entry) {
$title = null;
$pubDate = null;
$sitemap = simplexml_load_string($sitemapXml, null, LIBXML_NOCDATA | LIBXML_NONET);
if (!$sitemap) {
throwServerException('Unable to parse sitemap');
}
foreach ($sitemap->url as $entry) {
$url = trim((string) $entry->loc);
$lastmod = trim((string) $entry->lastmod);
$url = trim((string) $entry['loc']);
$lastmod = trim((string) $entry['lastmod']);
if (!$url) {
continue;
}
$namespaces = $entry->getNamespaces(true);
$title = '';
$pubDate = null;
if (isset($namespaces['news'])) {
$news = $entry->children($namespaces['news'])->news;
if (isset($entry['news'])) {
$news = $entry['news'];
if ($news) {
$title = trim((string) $news->title);
$pubDate = trim((string) $news->publication_date);
$title = trim((string) $news['title']);
$pubDate = trim((string) $news['publication_date']);
}
}

View File

@@ -12,14 +12,11 @@ class SkyArteBridge extends BridgeAbstract
public function collectData()
{
$doc = new \DOMDocument();
$doc->loadXML(getContents('https://arte.sky.it/sitemap-mostre-eventi.xml'));
$urls = get_sitemap('https://arte.sky.it/sitemap-mostre-eventi.xml');
$count = 0;
foreach ($doc->getElementsByTagName('url') as $url) {
$loc = $url->getElementsByTagName('loc')->item(0)->nodeValue;
$lastmod = $url->getElementsByTagName('lastmod')->item(0)->nodeValue;
foreach ($urls as $url) {
$loc = $url['loc'];
if (!$loc) {
continue;
@@ -36,7 +33,7 @@ class SkyArteBridge extends BridgeAbstract
'title' => $event['title'],
'uri' => $loc,
'uid' => $loc,
'timestamp' => $lastmod,
'timestamp' => $url['lastmod'],
'content' => $event['content'],
'categories' => $event['categories'],
'enclosures' => $event['enclosures'],

View File

@@ -342,3 +342,16 @@ Json::decode($json);
```
[Defined in lib/utils.php](https://github.com/RSS-Bridge/rss-bridge/blob/master/lib/utils.php)
# get_sitemap(string $url): array
Convenience function to fetch the URL entries of an XML sitemap.
```php
$urls = get_sitemap('https://arte.sky.it/sitemap-mostre-eventi.xml');
foreach ($urls as $url) {
$loc = $url['loc'];
$lastmod = $url['lastmod'];
}
```

View File

@@ -1,5 +1,30 @@
<?php
/**
 * Fetch an XML sitemap and return its <url> entries as plain arrays.
 *
 * Every entry carries the keys 'loc', 'lastmod', 'changefreq' and 'priority'.
 * Each value is the trimmed text of the first matching child element, or null
 * when the entry has no such element.
 *
 * Elements in the Google News sitemap namespace are collected under the fixed
 * key 'news', indexed by their local name (e.g. $entry['news']['title']).
 * The key does not depend on the prefix the document happens to bind to that
 * namespace, so callers can always read $entry['news'].
 *
 * @param string $url Location of the sitemap; handed to getContents()
 * @return array<int, array<string, mixed>> One array per <url> element, in document order
 * @throws \Exception When the response is empty or is not well-formed XML
 */
function get_sitemap(string $url): array
{
    $newsNamespace = 'http://www.google.com/schemas/sitemap-news/0.9';

    $xml = getContents($url);
    if (!is_string($xml) || trim($xml) === '') {
        // DOMDocument::loadXML() rejects an empty string, so fail with a clear message first
        throw new \Exception(sprintf('Unable to retrieve sitemap: %s', $url));
    }

    // Collect libxml parse errors internally instead of emitting PHP warnings,
    // and restore the previous setting so other code is unaffected.
    $previousSetting = libxml_use_internal_errors(true);
    try {
        $doc = new \DOMDocument();
        // LIBXML_NONET: never touch the network while parsing a remote document
        $loaded = $doc->loadXML($xml, LIBXML_NONET);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
    }
    if (!$loaded) {
        throw new \Exception(sprintf('Unable to parse sitemap: %s', $url));
    }

    // Trimmed text of the first $tag descendant of $parent, or null when absent
    $firstText = static function (\DOMElement $parent, string $tag): ?string {
        $node = $parent->getElementsByTagName($tag)->item(0);
        if ($node === null) {
            return null;
        }
        return trim((string) $node->nodeValue);
    };

    $urls = [];
    // $entry (not $url) so the function parameter is not overwritten
    foreach ($doc->getElementsByTagName('url') as $entry) {
        $item = [
            'loc'        => $firstText($entry, 'loc'),
            'lastmod'    => $firstText($entry, 'lastmod'),
            'changefreq' => $firstText($entry, 'changefreq'),
            'priority'   => $firstText($entry, 'priority'),
        ];

        // '*' matches every element of the news namespace below this entry,
        // including the <news:news> wrapper itself.
        foreach ($entry->getElementsByTagNameNS($newsNamespace, '*') as $element) {
            $item['news'][$element->localName] = $element->nodeValue;
        }

        $urls[] = $item;
    }

    return $urls;
}
/**
* Fetch data from an http url
*