2021-09-29 20:05:38 +00:00
use crate ::{ settings ::structs ::Settings , version ::VERSION , LemmyError } ;
2020-08-01 14:04:42 +00:00
use anyhow ::anyhow ;
2022-01-06 13:13:17 +00:00
use encoding ::{ all ::encodings , DecoderTrap } ;
2020-09-24 17:43:42 +00:00
use percent_encoding ::{ utf8_percent_encode , NON_ALPHANUMERIC } ;
2021-12-06 22:54:34 +00:00
use reqwest_middleware ::ClientWithMiddleware ;
2021-08-19 14:12:49 +00:00
use serde ::{ Deserialize , Serialize } ;
2020-07-01 12:54:29 +00:00
use std ::future ::Future ;
2020-08-01 14:04:42 +00:00
use thiserror ::Error ;
2021-12-20 21:57:36 +00:00
use tracing ::{ error , info } ;
2021-03-02 12:41:48 +00:00
use url ::Url ;
2021-08-19 14:12:49 +00:00
use webpage ::HTML ;
2020-07-01 12:54:29 +00:00
2020-08-01 14:04:42 +00:00
#[ derive(Clone, Debug, Error) ]
#[ error( " Error sending request, {0} " ) ]
2020-07-01 12:54:29 +00:00
struct SendError ( pub String ) ;
2020-08-01 14:04:42 +00:00
#[ derive(Clone, Debug, Error) ]
#[ error( " Error receiving response, {0} " ) ]
2020-07-01 12:54:29 +00:00
pub struct RecvError ( pub String ) ;
2022-01-06 19:10:20 +00:00
#[ tracing::instrument(skip_all) ]
2021-12-06 22:54:34 +00:00
pub async fn retry < F , Fut , T > ( f : F ) -> Result < T , reqwest_middleware ::Error >
2020-07-01 12:54:29 +00:00
where
F : Fn ( ) -> Fut ,
2021-12-06 22:54:34 +00:00
Fut : Future < Output = Result < T , reqwest_middleware ::Error > > ,
2020-07-01 12:54:29 +00:00
{
retry_custom ( | | async { Ok ( ( f ) ( ) . await ) } ) . await
}
2022-01-06 19:10:20 +00:00
#[ tracing::instrument(skip_all) ]
2021-12-06 22:54:34 +00:00
async fn retry_custom < F , Fut , T > ( f : F ) -> Result < T , reqwest_middleware ::Error >
2020-07-01 12:54:29 +00:00
where
F : Fn ( ) -> Fut ,
2021-12-06 22:54:34 +00:00
Fut : Future < Output = Result < Result < T , reqwest_middleware ::Error > , reqwest_middleware ::Error > > ,
2020-07-01 12:54:29 +00:00
{
2021-12-06 22:54:34 +00:00
let mut response : Option < Result < T , reqwest_middleware ::Error > > = None ;
2020-07-01 12:54:29 +00:00
for _ in 0 u8 .. 3 {
match ( f ) ( ) . await ? {
Ok ( t ) = > return Ok ( t ) ,
2021-12-06 22:54:34 +00:00
Err ( reqwest_middleware ::Error ::Reqwest ( e ) ) = > {
2020-08-31 13:48:02 +00:00
if e . is_timeout ( ) {
2021-12-06 22:54:34 +00:00
response = Some ( Err ( reqwest_middleware ::Error ::Reqwest ( e ) ) ) ;
2020-07-01 12:54:29 +00:00
continue ;
}
2021-12-06 22:54:34 +00:00
return Err ( reqwest_middleware ::Error ::Reqwest ( e ) ) ;
}
Err ( otherwise ) = > {
return Err ( otherwise ) ;
2020-07-01 12:54:29 +00:00
}
}
}
2021-03-01 12:56:07 +00:00
response . expect ( " retry http request " )
2020-07-01 12:54:29 +00:00
}
2020-09-24 17:43:42 +00:00
2021-08-19 14:12:49 +00:00
#[ derive(Deserialize, Serialize, Debug, PartialEq, Clone) ]
pub struct SiteMetadata {
2021-08-04 21:13:51 +00:00
pub title : Option < String > ,
pub description : Option < String > ,
2021-08-19 14:12:49 +00:00
image : Option < Url > ,
2021-08-04 21:13:51 +00:00
pub html : Option < String > ,
2020-09-24 17:43:42 +00:00
}
2021-08-19 14:12:49 +00:00
/// Fetches the post link html tags (like title, description, image, etc)
2022-01-06 19:10:20 +00:00
#[ tracing::instrument(skip_all) ]
2021-12-06 22:54:34 +00:00
pub async fn fetch_site_metadata (
client : & ClientWithMiddleware ,
url : & Url ,
) -> Result < SiteMetadata , LemmyError > {
2021-12-20 21:57:36 +00:00
info! ( " Fetching site metadata for url: {} " , url ) ;
2021-12-05 15:03:13 +00:00
let response = client . get ( url . as_str ( ) ) . send ( ) . await ? ;
2020-09-24 17:43:42 +00:00
2021-12-20 21:57:36 +00:00
// Can't use .text() here, because it only checks the content header, not the actual bytes
// https://github.com/LemmyNet/lemmy/issues/1964
let html_bytes = response
. bytes ( )
2021-08-19 14:12:49 +00:00
. await
2021-12-20 21:57:36 +00:00
. map_err ( | e | RecvError ( e . to_string ( ) ) ) ?
. to_vec ( ) ;
2022-01-06 13:13:17 +00:00
let tags = html_to_site_metadata ( & html_bytes ) ? ;
2021-08-19 14:12:49 +00:00
Ok ( tags )
}
2022-01-06 13:13:17 +00:00
fn html_to_site_metadata ( html_bytes : & [ u8 ] ) -> Result < SiteMetadata , LemmyError > {
let html = String ::from_utf8_lossy ( html_bytes ) ;
2021-12-20 21:57:36 +00:00
// Make sure the first line is doctype html
let first_line = html
2022-01-06 13:13:17 +00:00
. trim_start ( )
2021-12-20 21:57:36 +00:00
. lines ( )
. into_iter ( )
. next ( )
. ok_or_else ( | | LemmyError ::from_message ( " No lines in html " ) ) ?
. to_lowercase ( ) ;
if ! first_line . starts_with ( " <!doctype html> " ) {
return Err ( LemmyError ::from_message (
" Site metadata page fetch is not DOCTYPE html " ,
) ) ;
}
2022-01-06 13:13:17 +00:00
let mut page = HTML ::from_string ( html . to_string ( ) , None ) ? ;
// If the web page specifies that it isn't actually UTF-8, re-decode the received bytes with the
// proper encoding. If the specified encoding cannot be found, fall back to the original UTF-8
// version.
if let Some ( charset ) = page . meta . get ( " charset " ) {
if charset . to_lowercase ( ) ! = " utf-8 " {
if let Some ( encoding_ref ) = encodings ( ) . iter ( ) . find ( | e | e . name ( ) = = charset ) {
if let Ok ( html_with_encoding ) = encoding_ref . decode ( html_bytes , DecoderTrap ::Replace ) {
page = HTML ::from_string ( html_with_encoding , None ) ? ;
}
}
}
}
2021-08-19 14:12:49 +00:00
let page_title = page . title ;
let page_description = page . description ;
let og_description = page
. opengraph
. properties
. get ( " description " )
. map ( | t | t . to_string ( ) ) ;
let og_title = page
. opengraph
. properties
. get ( " title " )
. map ( | t | t . to_string ( ) ) ;
let og_image = page
. opengraph
. images
. get ( 0 )
. map ( | ogo | Url ::parse ( & ogo . url ) . ok ( ) )
. flatten ( ) ;
let title = og_title . or ( page_title ) ;
let description = og_description . or ( page_description ) ;
let image = og_image ;
Ok ( SiteMetadata {
title ,
description ,
image ,
html : None ,
} )
2020-09-24 17:43:42 +00:00
}
/// JSON body returned by the pict-rs download endpoint.
#[derive(Debug, Clone, Deserialize)]
pub(crate) struct PictrsResponse {
  /// Files reported by pict-rs for the request.
  files: Vec<PictrsFile>,
  /// Status message; compared against a fixed success value by `fetch_pictrs`.
  msg: String,
}
#[ derive(Deserialize, Debug, Clone) ]
pub ( crate ) struct PictrsFile {
file : String ,
2021-09-17 15:44:20 +00:00
#[ allow(dead_code) ]
2020-09-24 17:43:42 +00:00
delete_token : String ,
}
2022-01-06 19:10:20 +00:00
#[ tracing::instrument(skip_all) ]
2020-09-24 17:43:42 +00:00
pub ( crate ) async fn fetch_pictrs (
2021-12-06 22:54:34 +00:00
client : & ClientWithMiddleware ,
2021-09-22 15:57:09 +00:00
settings : & Settings ,
2021-03-02 12:41:48 +00:00
image_url : & Url ,
2021-08-19 14:12:49 +00:00
) -> Result < PictrsResponse , LemmyError > {
2021-09-22 15:57:09 +00:00
if let Some ( pictrs_url ) = settings . pictrs_url . to_owned ( ) {
2021-08-04 21:13:51 +00:00
is_image_content_type ( client , image_url ) . await ? ;
let fetch_url = format! (
" {}/image/download?url={} " ,
pictrs_url ,
utf8_percent_encode ( image_url . as_str ( ) , NON_ALPHANUMERIC ) // TODO this might not be needed
) ;
2021-12-05 15:03:13 +00:00
let response = client . get ( & fetch_url ) . send ( ) . await ? ;
2021-08-04 21:13:51 +00:00
let response : PictrsResponse = response
. json ( )
. await
. map_err ( | e | RecvError ( e . to_string ( ) ) ) ? ;
if response . msg = = " ok " {
2021-08-19 14:12:49 +00:00
Ok ( response )
2021-08-04 21:13:51 +00:00
} else {
Err ( anyhow! ( " {} " , & response . msg ) . into ( ) )
}
2020-09-24 17:43:42 +00:00
} else {
2021-08-19 14:12:49 +00:00
Err ( anyhow! ( " pictrs_url not set up in config " ) . into ( ) )
2020-09-24 17:43:42 +00:00
}
}
2021-08-19 14:12:49 +00:00
/// Both are options, since the URL might be either an html page, or an image
/// Returns the SiteMetadata, and a Pictrs URL, if there is a picture associated
2022-01-06 19:10:20 +00:00
#[ tracing::instrument(skip_all) ]
2021-08-19 14:12:49 +00:00
pub async fn fetch_site_data (
2021-12-06 22:54:34 +00:00
client : & ClientWithMiddleware ,
2021-09-22 15:57:09 +00:00
settings : & Settings ,
2021-03-02 12:41:48 +00:00
url : Option < & Url > ,
2021-08-19 14:12:49 +00:00
) -> ( Option < SiteMetadata > , Option < Url > ) {
2020-09-24 17:43:42 +00:00
match & url {
Some ( url ) = > {
2021-08-19 14:12:49 +00:00
// Fetch metadata
// Ignore errors, since it may be an image, or not have the data.
// Warning, this may ignore SSL errors
let metadata_option = fetch_site_metadata ( client , url ) . await . ok ( ) ;
2020-09-24 17:43:42 +00:00
// Fetch pictrs thumbnail
2021-08-19 14:12:49 +00:00
let pictrs_hash = match & metadata_option {
Some ( metadata_res ) = > match & metadata_res . image {
// Metadata, with image
// Try to generate a small thumbnail if there's a full sized one from post-links
2021-09-22 15:57:09 +00:00
Some ( metadata_image ) = > fetch_pictrs ( client , settings , metadata_image )
2021-08-19 14:12:49 +00:00
. await
2021-08-04 21:13:51 +00:00
. map ( | r | r . files [ 0 ] . file . to_owned ( ) ) ,
2021-08-19 14:12:49 +00:00
// Metadata, but no image
2021-09-22 15:57:09 +00:00
None = > fetch_pictrs ( client , settings , url )
2021-08-19 14:12:49 +00:00
. await
2021-08-04 21:13:51 +00:00
. map ( | r | r . files [ 0 ] . file . to_owned ( ) ) ,
2020-09-24 17:43:42 +00:00
} ,
2021-08-19 14:12:49 +00:00
// No metadata, try to fetch the URL as an image
2021-09-22 15:57:09 +00:00
None = > fetch_pictrs ( client , settings , url )
2021-08-19 14:12:49 +00:00
. await
2021-08-04 21:13:51 +00:00
. map ( | r | r . files [ 0 ] . file . to_owned ( ) ) ,
2020-09-24 17:43:42 +00:00
} ;
// The full urls are necessary for federation
2021-08-04 21:13:51 +00:00
let pictrs_thumbnail = pictrs_hash
. map ( | p | {
Url ::parse ( & format! (
" {}/pictrs/image/{} " ,
2021-09-22 15:57:09 +00:00
settings . get_protocol_and_hostname ( ) ,
2021-08-04 21:13:51 +00:00
p
) )
. ok ( )
} )
2021-08-19 14:12:49 +00:00
. ok ( )
2021-08-04 21:13:51 +00:00
. flatten ( ) ;
2021-08-19 14:12:49 +00:00
( metadata_option , pictrs_thumbnail )
2020-09-24 17:43:42 +00:00
}
2021-08-19 14:12:49 +00:00
None = > ( None , None ) ,
2020-09-24 17:43:42 +00:00
}
}
2022-01-06 19:10:20 +00:00
#[ tracing::instrument(skip_all) ]
2021-12-06 22:54:34 +00:00
async fn is_image_content_type ( client : & ClientWithMiddleware , url : & Url ) -> Result < ( ) , LemmyError > {
2021-12-05 15:03:13 +00:00
let response = client . get ( url . as_str ( ) ) . send ( ) . await ? ;
2020-09-24 17:43:42 +00:00
if response
. headers ( )
. get ( " Content-Type " )
. ok_or_else ( | | anyhow! ( " No Content-Type header " ) ) ?
. to_str ( ) ?
. starts_with ( " image/ " )
{
Ok ( ( ) )
} else {
Err ( anyhow! ( " Not an image type. " ) . into ( ) )
}
}
2021-09-29 20:05:38 +00:00
/// Builds the `User-Agent` string from the crate `VERSION` and the
/// protocol-and-hostname value reported by `settings`.
pub fn build_user_agent(settings: &Settings) -> String {
  let origin = settings.get_protocol_and_hostname();
  format!(" Lemmy/{}; +{} ", VERSION, origin)
}
2020-09-24 17:43:42 +00:00
#[cfg(test)]
mod tests {
  use super::SiteMetadata;
  use crate::request::{build_user_agent, fetch_site_metadata};
  use crate::settings::structs::Settings;
  use url::Url;

  // These helped with testing
  /// Fetches two live pages and compares the extracted metadata with fixed
  /// expectations. Needs network access and loadable settings.
  #[actix_rt::test]
  async fn test_site_metadata() {
    let settings = Settings::init().unwrap();
    let client = reqwest::Client::builder()
      .user_agent(build_user_agent(&settings))
      .build()
      .unwrap()
      .into();

    // (page to fetch, metadata expected from it), checked in order.
    let cases = vec![
      (
        " https://www.redspark.nu/en/peoples-war/district-leader-of-chand-led-cpn-arrested-in-bhojpur/ ",
        SiteMetadata {
          title: Some(" District Leader Of Chand Led CPN Arrested In Bhojpur - Redspark ".to_string()),
          description: Some(" BHOJPUR: A district leader of the outlawed Netra Bikram Chand alias Biplav-led outfit has been arrested. According to District Police ".to_string()),
          image: Some(Url::parse(" https://www.redspark.nu/wp-content/uploads/2020/03/netra-bikram-chand-attends-program-1272019033653-1000x0-845x653-1.jpg ").unwrap()),
          html: None,
        },
      ),
      (
        " https://www.youtube.com/watch?v=IquO_TcMZIQ ",
        SiteMetadata {
          title: Some(" A Hard Look at Rent and Rent Seeking with Michael Hudson & Pepe Escobar ".to_string()),
          description: Some(" An interactive discussion on wealth inequality and the “Great Game” on the control of natural resources.In this webinar organized jointly by the Henry George... ".to_string()),
          image: Some(Url::parse(" https://i.ytimg.com/vi/IquO_TcMZIQ/maxresdefault.jpg ").unwrap()),
          html: None,
        },
      ),
    ];

    for (page, expected) in cases {
      let page_url = Url::parse(page).unwrap();
      let fetched = fetch_site_metadata(&client, &page_url).await.unwrap();
      assert_eq!(expected, fetched);
    }
  }

  // #[test]
  // fn test_pictshare() {
  //   let res = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpg");
  //   assert!(res.is_ok());
  //   let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
  //   assert!(res_other.is_err());
  // }
}