From d9490c65181a090dfc743cd181caef2cc53ae8ef Mon Sep 17 00:00:00 2001 From: Nick McCarthy Date: Sat, 10 Jun 2023 17:35:04 +0100 Subject: [PATCH] GoogleScholarV2Bridge (#3415) * Added google scholar v2 bridge with more functionality * Corrected Sort By interpretation (this is weird on Googles part) * Remove some debug statements * Merged GoogleScholarBridge and GoogleScholarV2Bridge into GoogleScholarBridge with two contexts. * Left V2 in Bridge Name * Lint * Update GoogleScholarBridge.php * Update GoogleScholarBridge.php * Lint. * ; --- bridges/GoogleScholarBridge.php | 268 +++++++++++++++++++++++++------- 1 file changed, 215 insertions(+), 53 deletions(-) diff --git a/bridges/GoogleScholarBridge.php b/bridges/GoogleScholarBridge.php index 932efb5b..981355dd 100644 --- a/bridges/GoogleScholarBridge.php +++ b/bridges/GoogleScholarBridge.php @@ -2,19 +2,101 @@ class GoogleScholarBridge extends BridgeAbstract { - const NAME = 'Goolge Scholar'; + const NAME = 'Google Scholar v2'; const URI = 'https://scholar.google.com/'; - const DESCRIPTION = 'Follow authors of scientific publications.'; - const MAINTAINER = 'thefranke'; + const DESCRIPTION = 'Search for publications or follow authors on Google Scholar.'; + const MAINTAINER = 'nicholasmccarthy'; const CACHE_TIMEOUT = 86400; // 24h - const PARAMETERS = [[ - 'userId' => [ - 'name' => 'User ID', - 'exampleValue' => 'qc6CJjYAAAAJ', - 'required' => true - ] - ]]; + const PARAMETERS = [ + 'user' => [ + 'userId' => [ + 'name' => 'User ID', + 'exampleValue' => 'qc6CJjYAAAAJ', + 'required' => true + ] + ], + 'query' => [ + 'q' => [ + 'name' => 'Search Query', + 'title' => 'Search Query', + 'required' => true, + 'exampleValue' => 'machine learning' + ], + 'cites' => [ + 'name' => 'Cites', + 'required' => false, + 'default' => '', + 'exampleValue' => '1275980731835430123', + 'title' => 'Parameter defines unique ID for an article to trigger Cited By searches. Usage of cites + will bring up a list of citing documents in Google Scholar. Example value: cites=1275980731835430123. + Usage of cites and q parameters triggers search within citing articles.' + ], + 'language' => [ + 'name' => 'Language', + 'required' => false, + 'default' => '', + 'exampleValue' => 'en', + 'title' => 'Parameter defines the language to use for the Google Scholar search. ' + ], + 'minCitations' => [ + 'name' => 'Minimum Citations', + 'required' => false, + 'type' => 'number', + 'default' => '0', + 'title' => 'Parameter defines the minimum number of citations in order for the results to be included.' + ], + 'sinceYear' => [ + 'name' => 'Since Year', + 'required' => false, + 'type' => 'number', + 'default' => '0', + 'title' => 'Parameter defines the year from which you want the results to be included.' + ], + 'untilYear' => [ + 'name' => 'Until Year', + 'required' => false, + 'type' => 'number', + 'default' => '0', + 'title' => 'Parameter defines the year until which you want the results to be included.' + ], + 'sortBy' => [ + 'name' => 'Sort By Date', + 'type' => 'checkbox', + 'default' => false, + 'title' => 'Parameter defines articles added in the last year, sorted by date. Alternatively sorts + by relevance. This overrides Since-Until Year values.', + ], + 'includePatents' => [ + 'name' => 'Include Patents', + 'type' => 'checkbox', + 'default' => false, + 'title' => 'Include Patents', + ], + 'includeCitations' => [ + 'name' => 'Include Citations', + 'type' => 'checkbox', + 'default' => true, + 'title' => 'Parameter defines whether you would like to include citations or not.', + ], + 'reviewArticles' => [ + 'name' => 'Only Review Articles', + 'type' => 'checkbox', + 'default' => false, + 'title' => 'Parameter defines whether you would like to show only review articles or not (these + articles consist of topic reviews, or discuss the works or authors you have searched for).', + ], + 'numResults' => [ + 'name' => 'Number of Results (max 20)', + 'required' => false, + 'type' => 'number', + 'default' => 10, + 'exampleValue' => 10, + 'title' => 'Number of results to return' + ] + ], + ]; + public function getIcon() { @@ -23,58 +105,138 @@ class GoogleScholarBridge extends BridgeAbstract public function collectData() { - $uri = self::URI . '/citations?hl=en&view_op=list_works&sortby=pubdate&user=' . $this->getInput('userId'); + switch ($this->queriedContext) { + case 'user': + $userId = $this->getInput('userId'); + $uri = self::URI . '/citations?hl=en&view_op=list_works&sortby=pubdate&user=' . $userId; + $html = getSimpleHTMLDOM($uri) or returnServerError('Could not fetch Google Scholar data.'); - $html = getSimpleHTMLDOM($uri) - or returnServerError('Could not fetch Google Scholar data.'); + $publications = $html->find('tr[class="gsc_a_tr"]'); - $publications = $html->find('tr[class="gsc_a_tr"]'); + foreach ($publications as $publication) { + $articleUrl = self::URI . htmlspecialchars_decode($publication->find('a[class="gsc_a_at"]', 0)->href); + $articleTitle = $publication->find('a[class="gsc_a_at"]', 0)->plaintext; - foreach ($publications as $publication) { - $articleUrl = self::URI . htmlspecialchars_decode($publication->find('a[class="gsc_a_at"]', 0)->href); - $articleTitle = $publication->find('a[class="gsc_a_at"]', 0)->plaintext; + # fetch the article itself to extract rest of content + $contentArticle = getSimpleHTMLDOMCached($articleUrl); + $articleEntries = $contentArticle->find('div[class="gs_scl"]'); - # fetch the article itself to extract rest of content - $contentArticle = getSimpleHTMLDOMCached($articleUrl); - $articleEntries = $contentArticle->find('div[class="gs_scl"]'); + $articleDate = ''; + $articleAbstract = ''; + $articleAuthor = ''; + $content = ''; - $articleDate = ''; - $articleAbstract = ''; - $articleAuthor = ''; - $content = ''; + foreach ($articleEntries as $entry) { + $field = $entry->find('div[class="gsc_oci_field"]', 0)->plaintext; + $value = $entry->find('div[class="gsc_oci_value"]', 0)->plaintext; - foreach ($articleEntries as $entry) { - $field = $entry->find('div[class="gsc_oci_field"]', 0)->plaintext; - $value = $entry->find('div[class="gsc_oci_value"]', 0)->plaintext; + if ($field == 'Publication date') { + $articleDate = $value; + } elseif ($field == 'Description') { + $articleAbstract = $value; + } elseif ($field == 'Authors') { + $articleAuthor = $value; + } elseif ($field == 'Scholar articles' || $field == 'Total citations') { + continue; + } else { + $content = $content . $field . ': ' . $value . '

'; + } + } - if ($field == 'Publication date') { - $articleDate = $value; - } else if ($field == 'Description') { - $articleAbstract = $value; - } else if ($field == 'Authors') { - $articleAuthor = $value; - } else if ($field == 'Scholar articles' || $field == 'Total citations') { - continue; - } else { - $content = $content . $field . ': ' . $value . '

'; + $content = $content . $articleAbstract; + + $item = []; + + $item['title'] = $articleTitle; + $item['uri'] = $articleUrl; + $item['timestamp'] = strtotime($articleDate); + $item['author'] = $articleAuthor; + $item['content'] = $content; + + $this->items[] = $item; + + if (count($this->items) >= 10) { + break; + } + } + break; + case 'query': + $query = urlencode($this->getInput('q')); + $cites = $this->getInput('cites'); + $language = $this->getInput('language'); + $sinceYear = $this->getInput('sinceYear'); + $untilYear = $this->getInput('untilYear'); + $minCitations = (int)$this->getInput('minCitations'); + $includeCitations = $this->getInput('includeCitations'); + $includePatents = $this->getInput('includePatents'); + $reviewArticles = $this->getInput('reviewArticles'); + $sortBy = $this->getInput('sortBy'); + $numResults = $this->getInput('numResults'); + + # Build URI + $uri = self::URI . 'scholar?q=' . $query; + $uri .= $sinceYear != 0 ? '&as_ylo=' . $sinceYear : ''; + $uri .= $untilYear != 0 ? '&as_yhi=' . $untilYear : ''; + $uri .= $language != '' ? '&hl=' . $language : ''; + $uri .= $includePatents ? '&as_vis=7' : '&as_vis=0'; + $uri .= $includeCitations ? '&as_vis=0' : ($includePatents ? '&as_vis=1' : ''); + $uri .= $reviewArticles ? '&as_rr=1' : ''; + $uri .= $sortBy ? '&scisbd=1' : ''; + $uri .= $numResults ? '&num=' . $numResults : ''; + + $html = getSimpleHTMLDOM($uri) or returnServerError('Could not fetch Google Scholar data.'); + + $publications = $html->find('div[class="gs_r gs_or gs_scl"]'); + + foreach ($publications as $publication) { + $articleTitleElement = $publication->find('h3[class="gs_rt"]', 0); + $articleUrl = $articleTitleElement->find('a', 0)->href; + $articleTitle = $articleTitleElement->plaintext; + + $articleDateElement = $publication->find('div[class="gs_a"]', 0); + $articleDate = $articleDateElement ? $articleDateElement->plaintext : ''; + + $articleAbstractElement = $publication->find('div[class="gs_rs"]', 0); + $articleAbstract = $articleAbstractElement ? $articleAbstractElement->plaintext : ''; + + $articleAuthorElement = $publication->find('div[class="gs_a"]', 0); + $articleAuthor = $articleAuthorElement ? $articleAuthorElement->plaintext : ''; + + $bottomRowElement = $publication->find('div[class="gs_fl"]', 0); + + $item = [ + 'title' => $articleTitle, + 'uri' => $articleUrl, + 'timestamp' => strtotime($articleDate), + 'author' => $articleAuthor, + 'content' => $articleAbstract + ]; + + switch ($this->queriedContext) { + case 'user': + $this->items[] = $item; + break; + case 'query': + $citedBy = 0; + if ($bottomRowElement) { + $anchorTags = $bottomRowElement->find('a'); + foreach ($anchorTags as $anchorTag) { + if (strpos($anchorTag->plaintext, 'Cited') !== false) { + $parts = explode('Cited by ', $anchorTag->plaintext); + if (isset($parts[1])) { + $citedBy = (int)$parts[1]; + } + break; + } + } + } + if ($citedBy >= $minCitations) { + $this->items[] = $item; + } + break; + } } - } - - $content = $content . $articleAbstract; - - $item = []; - - $item['title'] = $articleTitle; - $item['uri'] = $articleUrl; - $item['timestamp'] = strtotime($articleDate); - $item['author'] = $articleAuthor; - $item['content'] = $content; - - $this->items[] = $item; - - if (count($this->items) >= 10) { break; - } } } }