mirror of
				https://github.com/RSS-Bridge/rss-bridge.git
				synced 2025-10-26 21:11:30 +01:00 
			
		
		
		
	returnClientError => throwClientException; returnServerError => throwServerException. New convenience function: throwRateLimitException. Old functions are kept but deprecated.
		
			
				
	
	
		
			347 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			347 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| 
 | |
/**
 * Bridge that turns a single MoinMoin (compatible) wiki page into a feed.
 *
 * The page body (<div id="page">) is split into sections at a configurable
 * separator element (header, list element or anchor); each section becomes
 * one feed item. Optionally the anchor of a section is followed and the linked
 * page is used as item content.
 */
class MoinMoinBridge extends BridgeAbstract
{
    const MAINTAINER = 'logmanoriginal';
    const NAME = 'MoinMoin Bridge';
    const URI = 'https://moinmo.in';
    const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
    const PARAMETERS = [
        [
            'source' => [
                'name' => 'Source',
                'type' => 'text',
                'required' => true,
                'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
                'exampleValue' => 'https://moinmo.in/MoinMoin'
            ],
            'separator' => [
                'name' => 'Separator',
                'type' => 'list',
                // NOTE(review): 'requied' looks like a typo of 'required'.
                // Confirm how the framework treats that flag on list
                // parameters before renaming or removing the key.
                'requied' => true,
                'title' => 'Defines the separator for splitting content into feeds',
                'defaultValue' => 'h2',
                'values' => [
                    'Header (h1)' => 'h1',
                    'Header (h2)' => 'h2',
                    'Header (h3)' => 'h3',
                    'List element (li)' => 'li',
                    'Anchor (a)' => 'a'
                ]
            ],
            'limit' => [
                'name' => 'Limit',
                'type' => 'number',
                'required' => false,
                'title' => 'Number of items to return (from top)',
                'defaultValue' => -1
            ],
            'content' => [
                'name' => 'Content',
                'type' => 'list',
                'required' => false,
                'title' => 'Defines how feed contents are built',
                'defaultValue' => 'separator',
                'values' => [
                    'By separator' => 'separator',
                    'Follow link (only for anchor)' => 'follow',
                    'None' => 'none'
                ]
            ]
        ]
    ];

    /** Feed title, filled by collectData() from the page's <title> element. */
    private $title = '';

    /**
     * Loads the source page, splits it into sections and appends one item per
     * section (with a non-empty title) to $this->items, honouring 'limit'.
     */
    public function collectData()
    {
        /* MoinMoin uses a rather unpleasant representation of HTML. Instead of
         * using tags like <article/>, <navigation/>, <header/>, etc... it uses
         * <div/>, <span/> and <p/>. Also each line is literally identified via
         * IDs. The only way to distinguish content is via headers, though not
         * in all cases.
         *
         * Example (indented for the sake of readability):
         * ...
         * <span class="anchor" id="line-1"></span>
         * <span class="anchor" id="line-2"></span>
         * <span class="anchor" id="line-3"></span>
         * <span class="anchor" id="line-4"></span>
         * <span class="anchor" id="line-5"></span>
         * <span class="anchor" id="line-6"></span>
         * <span class="anchor" id="line-7"></span>
         * <span class="anchor" id="line-8"></span>
         * <span class="anchor" id="line-9"></span>
         *   <p class="line867">MoinMoin is a Wiki software implemented in
         *     <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
         *   and distributed as Free Software under
         *     <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
         * ...
         */
        $html = getSimpleHTMLDOM($this->getInput('source'));

        // Some anchors link to local sites or local IDs (both don't work well
        // in feeds)
        $html = $this->fixAnchors($html);

        // The <title> element may be missing; fall back to the bridge name only.
        $titleElement = $html->find('title', 0);
        $this->title = (is_null($titleElement) ? '' : $titleElement->innertext . ' | ') . self::NAME;

        // Author and timestamp of the source page are the defaults for every
        // item. They are only replaced for items whose anchor was followed.
        $pageAuthor = $this->findAuthor($html);
        $pageTimestamp = $this->findTimestamp($html);

        $separator = $this->getInput('separator');
        $contentMode = $this->getInput('content');
        $limit = (int)$this->getInput('limit');

        foreach ($this->splitSections($html) as $section) {
            // Skip items with empty title before doing any further (possibly
            // remote) work. A strict comparison keeps titles such as "0".
            $title = strip_tags((string)$section[1]);
            if (trim($title) === '') {
                continue;
            }

            $item = [];
            $item['uri'] = $this->findSectionAnchor($section[0]);

            // Per-item copies, so data of a followed page never leaks into
            // the items that come after it.
            $author = $pageAuthor;
            $timestamp = $pageTimestamp;

            if ($contentMode !== 'none') {
                // We can only follow anchors (use section content otherwise)
                $followed = null;
                if ($contentMode === 'follow' && $separator === 'a') {
                    $followed = $this->followAnchor($item['uri']);
                }

                $followedPage = is_null($followed) ? null : $followed->find('div#page', 0);

                if (is_null($followedPage)) {
                    // Not followed, not loadable or not a wiki page: use the
                    // contents from the current page
                    $item['content'] = $this->cleanArticle($section[2]);
                } else {
                    // Return only actual content
                    $item['content'] = $followedPage->innertext;

                    // Each page could have its own author and timestamp
                    $author = $this->findAuthor($followed);
                    $timestamp = $this->findTimestamp($followed);
                }
            }

            if (!is_null($author)) {
                $item['author'] = $author;
            }
            if (!is_null($timestamp)) {
                $item['timestamp'] = $timestamp;
            }
            $item['title'] = $title;

            $this->items[] = $item;

            if ($limit > 0 && count($this->items) >= $limit) {
                break;
            }
        }
    }

    /**
     * Returns the feed title collected from the page, or the parent's name
     * while no title is available.
     */
    public function getName()
    {
        return $this->title ?: parent::getName();
    }

    /**
     * Returns the configured source URI, or the parent's URI when none is set.
     */
    public function getURI()
    {
        return $this->getInput('source') ?: parent::getURI();
    }

    /**
     * Splits the html into sections.
     *
     * Returns an array with one element per section. Each element consists of:
     * [0] The entire section
     * [1] The section title
     * [2] The section content
     *
     * If the separator does not occur, the whole page is returned as a single
     * section titled with the page's <title> (empty if there is none).
     */
    private function splitSections($html)
    {
        $page = $html->find('div#page', 0);
        $content = is_null($page) ? '' : $page->innertext;

        if (!$content) {
            throwServerException('Unable to find <div id="page"/>!');
        }

        $sections = [];

        // The tag name is followed by a word boundary so that e.g. "a" does
        // not match <abbr>, and the attribute part may be empty so that a
        // bare <li> is recognised as an opening tag on its own.
        $tag = preg_quote((string)$this->getInput('separator'), '/');

        $regex = implode(
            '',
            [
                "\<{$tag}\b[^\>]*\>",
                "(.+?)(?=\<\/{$tag}\>)",
                "\<\/{$tag}\>",
                "(.+?)((?=\<{$tag}\b)|(?=\<div\sid=\"pagebottom\")){1}"
            ]
        );

        preg_match_all(
            '/' . $regex . '/m',
            $content,
            $sections,
            PREG_SET_ORDER
        );

        // Some pages don't use headers, return page as one feed
        if (empty($sections)) {
            $titleElement = $html->find('title', 0);

            return [
                [
                    $content,
                    is_null($titleElement) ? '' : $titleElement->innertext,
                    $content
                ]
            ];
        }

        return $sections;
    }

    /**
     * Returns the anchor for a given section
     *
     * Preference: the separator element's id (as fragment of the source URI),
     * then its href, then the source URI itself.
     */
    private function findSectionAnchor($section)
    {
        $source = $this->getInput('source');
        $separator = $this->getInput('separator');

        $html = str_get_html($section);
        if (!$html) { // Section could not be parsed
            return $source;
        }

        // For IDs
        $anchor = $html->find($separator . '[id=]', 0);
        if (!is_null($anchor)) {
            return $source . '#' . $anchor->id;
        }

        // For actual anchors
        $anchor = $html->find($separator . '[href=]', 0);
        if (!is_null($anchor)) {
            return $anchor->href;
        }

        // Nothing found
        return $source;
    }

    /**
     * Returns the author, or null if the page has none
     *
     * Notice: Some pages don't provide author information
     */
    private function findAuthor($html)
    {
        /* Example:
         * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
         * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
         * [178.162.199.143]">hosted-by</span>)</p>
         */
        $pageinfo = $html->find('[id="pageinfo"]', 0);
        if (is_null($pageinfo)) {
            return null;
        }

        $author = $pageinfo->find('[title=]', 0);
        if (is_null($author)) {
            return null;
        }

        // The title reads "<name> @ <host> [<ip>]"; keep the name part only.
        return trim(explode('@', $author->title)[0]);
    }

    /**
     * Returns the time of last edit as Unix timestamp, or null if it cannot be
     * determined
     *
     * Notice: Some pages don't provide this information
     */
    private function findTimestamp($html)
    {
        // See example of findAuthor()
        $pageinfo = $html->find('[id="pageinfo"]', 0);
        if (is_null($pageinfo)) {
            return null;
        }

        // First run of digits, dashes, colons and whitespace after the first
        // opening parenthesis, e.g. "2017-02-16 15:36:31".
        $matches = [];
        if (!preg_match('/\(\D*?(\d[\d\-\s:]*)/', (string)$pageinfo->innertext, $matches)) {
            return null;
        }

        $timestamp = strtotime(trim($matches[1]));

        return $timestamp === false ? null : $timestamp;
    }

    /**
     * Returns the original HTML with all anchors fixed (makes relative anchors
     * absolute)
     *
     * @param object $html Parsed document whose <a> elements are rewritten in place
     * @param string|null $source URI the document was loaded from (defaults to getURI())
     */
    private function fixAnchors($html, $source = null)
    {
        $source = $source ?: $this->getURI();
        $domain = $this->findDomain($source);
        $scheme = parse_url($source, PHP_URL_SCHEME) ?: 'https';

        // Drop an existing fragment so "#id" is never appended to "...#other".
        $base = explode('#', $source, 2)[0];

        foreach ($html->find('a') as $anchor) {
            if (!isset($anchor->href)) { // e.g. named anchors without target
                continue;
            }

            $href = (string)$anchor->href;

            if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) {
                // Already carries a scheme (http:, https:, mailto:, ...)
                continue;
            }

            if (strpos($href, '//') === 0) { // protocol-relative URI
                $anchor->href = $scheme . ':' . $href;
            } elseif (strpos($href, '/') === 0) { // path relative to the host
                $anchor->href = $domain . $href;
            } else { // an ID, or something like ? or &
                $anchor->href = $base . $href;
            }
        }

        return $html;
    }

    /**
     * Loads the full article of a given anchor (if the anchor is from the same
     * wiki domain)
     *
     * @return object|null The loaded document with fixed anchors, or null if
     * the anchor points to a different host or cannot be loaded
     */
    private function followAnchor($anchor)
    {
        $anchor = (string)$anchor;
        $domain = $this->findDomain($this->getInput('source'));

        // Only URIs on exactly the same scheme and host are followed. The
        // trailing slash prevents "https://host.evil.tld" from passing.
        $sameHost = $domain !== ''
            && (strcasecmp($anchor, $domain) === 0 || stripos($anchor, $domain . '/') === 0);

        if (!$sameHost) {
            return null;
        }

        $html = getSimpleHTMLDOMCached($anchor);
        if (!$html) { // Cannot load article
            return null;
        }

        return $this->fixAnchors($html, $anchor);
    }

    /**
     * Finds the domain for a given URI
     *
     * @return string Scheme and host (e.g. "https://moinmo.in"), or an empty
     * string if the URI contains no http(s) address
     */
    private function findDomain($uri)
    {
        $matches = [];
        if (preg_match('~https?://[^/?#]+~i', (string)$uri, $matches)) {
            return $matches[0];
        }

        return '';
    }

    /**
     * Removes every part of $string that starts with $start and ends with the
     * next $end (both inclusive). An unterminated part is removed up to the
     * end of the string.
     *
     * This function is based on CNETBridge
     */
    private function stripWithDelimiters($string, $start, $end)
    {
        $string = (string)$string;
        $start = (string)$start;
        $end = (string)$end;

        if ($start === '' || $end === '') {
            return $string;
        }

        while (($from = strpos($string, $start)) !== false) {
            $to = strpos($string, $end, $from);

            if ($to === false) {
                // No closing delimiter: everything from $start on is dropped
                $string = substr($string, 0, $from);
                break;
            }

            $string = substr($string, 0, $from) . substr($string, $to + strlen($end));
        }

        return $string;
    }

    /**
     * Returns the article with all <script> elements removed
     *
     * This function is based on CNETBridge
     */
    private function cleanArticle($article_html)
    {
        return $this->stripWithDelimiters($article_html, '<script', '</script>');
    }
}
 |