2020-04-29 19:14:42 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace App\Helper;
|
|
|
|
|
|
|
|
use GuzzleHttp\Exception\RequestException;
|
|
|
|
use Illuminate\Http\Client\ConnectionException;
|
|
|
|
use Illuminate\Support\Facades\Http;
|
|
|
|
use Illuminate\Support\Facades\Log;
|
|
|
|
|
|
|
|
/**
 * Class HtmlMeta
 *
 * Fetches a remote HTML document and extracts its title and description.
 *
 * @package App\Helper
 */
class HtmlMeta
{
    /**
     * Values used when the page cannot be fetched or yields no usable data.
     * Populated by getFromUrl() before any parsing happens.
     *
     * @var array
     */
    protected static $fallback = [];

    /**
     * Whether request failures should be flashed to the user.
     *
     * @var bool
     */
    protected static $flashAlerts = false;

    /**
     * Get the title and description of an URL.
     *
     * Returned array:
     * array [
     *  'success' => bool,
     *  'title' => string,
     *  'description' => string|null,
     * ]
     *
     * If the URL is invalid, the URL itself is returned as the title. If the
     * page can not be fetched, the host of the URL is returned as the title.
     * In both cases 'success' is false.
     *
     * @param string $url
     * @param bool   $flashAlerts Flash a warning if the request fails
     * @return array
     */
    public static function getFromUrl(string $url, bool $flashAlerts = false): array
    {
        self::$flashAlerts = $flashAlerts;

        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return [
                'success' => false,
                'title' => $url,
                'description' => null,
            ];
        }

        self::$fallback = [
            'success' => false,
            // parse_url() returns null when the URL has no host component;
            // fall back to the URL itself so the title is always a string.
            'title' => parse_url($url, PHP_URL_HOST) ?: $url,
            'description' => null,
        ];

        $html = self::getHtmlContent($url);

        if ($html === null) {
            return self::$fallback;
        }

        return self::buildHtmlMeta($html);
    }

    /**
     * Try to get the HTML content of an URL.
     * If a connection or response error occurs, null is returned, otherwise
     * the HTML as a string.
     *
     * Requests time out after 5 seconds. Responses for which the HTTP client
     * reports no success are treated like a failed request, but are neither
     * flashed nor logged.
     *
     * @param string $url
     * @return string|null
     */
    protected static function getHtmlContent(string $url): ?string
    {
        try {
            $response = Http::timeout(5)->get($url);
        } catch (ConnectionException $e) {
            self::reportRequestFailure($url, $e, 'link.added_connection_error');

            return null;
        } catch (RequestException $e) {
            self::reportRequestFailure($url, $e, 'link.added_request_error');

            return null;
        }

        return $response->successful() ? $response->body() : null;
    }

    /**
     * Flashes a warning (only if alerts are enabled) and logs the failure.
     *
     * @param string     $url
     * @param \Throwable $exception
     * @param string     $messageKey Translation key of the flashed message
     * @return void
     */
    protected static function reportRequestFailure(string $url, \Throwable $exception, string $messageKey): void
    {
        if (self::$flashAlerts) {
            flash(trans($messageKey), 'warning');
        }

        Log::warning($url . ': ' . $exception->getMessage());
    }

    /**
     * Returns an array containing the title and description parsed from the
     * given HTML.
     *
     * If a charset meta tag was found and it does not contain UTF-8 as a value,
     * the method tries to convert both values from the given charset into UTF-8.
     * A value that can not be converted becomes null because we most likely
     * can't generate any useful information from it.
     *
     * Otherwise the method checks that title and description are valid UTF-8.
     * If one of them does not pass the check, both are set to null as we will
     * most likely not be able to get any correctly encoded information from
     * the strings without proper encoding information.
     *
     * A missing title is replaced by the fallback title.
     *
     * @param string $html
     * @return array
     */
    protected static function buildHtmlMeta(string $html): array
    {
        $title = self::parseTitle($html);
        $metaTags = self::getMetaTags($html);

        $description = $metaTags['description']
            ?? $metaTags['og:description']
            ?? $metaTags['twitter:description']
            ?? self::$fallback['description']
            ?? null;

        // The charset originates from the remote page, so normalize it before use.
        $charset = trim((string) ($metaTags['charset'] ?? ''));

        if ($charset !== '' && strtolower($charset) !== 'utf-8') {
            $title = self::convertToUtf8($title, $charset);
            $description = self::convertToUtf8($description, $charset);
        } elseif (!self::isValidUtf8($title) || !self::isValidUtf8($description)) {
            $title = null;
            $description = null;
        }

        return [
            'success' => true,
            'title' => $title ?? self::$fallback['title'] ?? null,
            'description' => $description,
        ];
    }

    /**
     * Converts a value from the given charset into UTF-8.
     * Returns null if the value is empty or the conversion fails.
     *
     * @param string|null $value
     * @param string      $charset Source charset as declared by the page
     * @return string|null
     */
    protected static function convertToUtf8(?string $value, string $charset): ?string
    {
        if ($value === null || $value === '') {
            return null;
        }

        // iconv() reports unknown charsets and invalid byte sequences as PHP
        // warnings/notices. Mute them for this call only, so a broken page can
        // not abort the request in setups that turn PHP errors into exceptions.
        set_error_handler(static function (): bool {
            return true;
        });

        try {
            $converted = iconv($charset, 'UTF-8', $value);
        } finally {
            restore_error_handler();
        }

        return ($converted === false || $converted === '') ? null : $converted;
    }

    /**
     * Checks whether a value is valid UTF-8. Null counts as valid because
     * there is nothing that could be wrongly encoded.
     *
     * @param string|null $value
     * @return bool
     */
    protected static function isValidUtf8(?string $value): bool
    {
        return $value === null || mb_check_encoding($value, 'UTF-8');
    }

    /**
     * Parses the meta tags from HTML by using a specific regex.
     * Returns an array of all found meta tags or an empty array if no tags were found.
     *
     * The keys are the trimmed, lower-cased values of the name/property
     * attributes, the values are the content attributes. If a charset meta tag
     * is present, its value is available under the 'charset' key.
     *
     * @param string $html
     * @return array
     */
    protected static function getMetaTags(string $html): array
    {
        $tags = [];
        $pattern = '/<[\s]*meta[\s]*(name|property)="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/i';

        if (preg_match_all($pattern, $html, $out)) {
            foreach ($out[2] as $index => $name) {
                // Normalize the names so that "Description" is found as "description".
                // Later duplicates overwrite earlier ones.
                $tags[strtolower(trim($name))] = $out[3][$index];
            }
        }

        // Accepts double, single or no quotes as well as the self-closing form.
        $pattern = '/<\s*meta\s+charset\s*=\s*["\']?([^>"\'\s\/]+)["\']?\s*\/?\s*>/i';

        if (preg_match($pattern, $html, $out)) {
            $tags['charset'] = $out[1];
        }

        return $tags;
    }

    /**
     * Try to parse the title tag from the HTML by using regex.
     * If a title tag was found, excessive whitespace and newlines are removed from the string.
     *
     * The title tag may carry attributes. Null is returned if no title tag
     * was found or if the title is empty.
     *
     * @param $html
     * @return string|null
     */
    protected static function parseTitle($html): ?string
    {
        if (!preg_match('/<title\b[^>]*>(.*)<\/title>/siU', $html, $titleMatches)) {
            return null;
        }

        // preg_replace() returns null on failure, keep the raw match in that case.
        $title = preg_replace('/\s+/', ' ', $titleMatches[1]) ?? $titleMatches[1];
        $title = trim($title);

        return $title === '' ? null : $title;
    }
}
|