2021-03-01 17:24:11 +00:00
use crate ::{ settings ::structs ::Settings , LemmyError } ;
2020-08-01 14:04:42 +00:00
use anyhow ::anyhow ;
2020-09-24 17:43:42 +00:00
use log ::error ;
use percent_encoding ::{ utf8_percent_encode , NON_ALPHANUMERIC } ;
use reqwest ::Client ;
2021-08-19 14:12:49 +00:00
use serde ::{ Deserialize , Serialize } ;
2020-07-01 12:54:29 +00:00
use std ::future ::Future ;
2020-08-01 14:04:42 +00:00
use thiserror ::Error ;
2021-03-02 12:41:48 +00:00
use url ::Url ;
2021-08-19 14:12:49 +00:00
use webpage ::HTML ;
2020-07-01 12:54:29 +00:00
2020-08-01 14:04:42 +00:00
#[ derive(Clone, Debug, Error) ]
#[ error( " Error sending request, {0} " ) ]
2020-07-01 12:54:29 +00:00
struct SendError ( pub String ) ;
2020-08-01 14:04:42 +00:00
#[ derive(Clone, Debug, Error) ]
#[ error( " Error receiving response, {0} " ) ]
2020-07-01 12:54:29 +00:00
pub struct RecvError ( pub String ) ;
2021-01-12 16:12:41 +00:00
pub async fn retry < F , Fut , T > ( f : F ) -> Result < T , reqwest ::Error >
2020-07-01 12:54:29 +00:00
where
F : Fn ( ) -> Fut ,
2020-08-31 13:48:02 +00:00
Fut : Future < Output = Result < T , reqwest ::Error > > ,
2020-07-01 12:54:29 +00:00
{
retry_custom ( | | async { Ok ( ( f ) ( ) . await ) } ) . await
}
2021-01-12 16:12:41 +00:00
async fn retry_custom < F , Fut , T > ( f : F ) -> Result < T , reqwest ::Error >
2020-07-01 12:54:29 +00:00
where
F : Fn ( ) -> Fut ,
2021-01-12 16:12:41 +00:00
Fut : Future < Output = Result < Result < T , reqwest ::Error > , reqwest ::Error > > ,
2020-07-01 12:54:29 +00:00
{
2021-01-12 16:12:41 +00:00
let mut response : Option < Result < T , reqwest ::Error > > = None ;
2020-07-01 12:54:29 +00:00
for _ in 0 u8 .. 3 {
match ( f ) ( ) . await ? {
Ok ( t ) = > return Ok ( t ) ,
Err ( e ) = > {
2020-08-31 13:48:02 +00:00
if e . is_timeout ( ) {
2021-01-12 16:12:41 +00:00
response = Some ( Err ( e ) ) ;
2020-07-01 12:54:29 +00:00
continue ;
}
2021-01-12 16:12:41 +00:00
return Err ( e ) ;
2020-07-01 12:54:29 +00:00
}
}
}
2021-03-01 12:56:07 +00:00
response . expect ( " retry http request " )
2020-07-01 12:54:29 +00:00
}
2020-09-24 17:43:42 +00:00
2021-08-19 14:12:49 +00:00
#[ derive(Deserialize, Serialize, Debug, PartialEq, Clone) ]
pub struct SiteMetadata {
2021-08-04 21:13:51 +00:00
pub title : Option < String > ,
pub description : Option < String > ,
2021-08-19 14:12:49 +00:00
image : Option < Url > ,
2021-08-04 21:13:51 +00:00
pub html : Option < String > ,
2020-09-24 17:43:42 +00:00
}
2021-08-19 14:12:49 +00:00
/// Fetches the post link html tags (like title, description, image, etc)
pub async fn fetch_site_metadata ( client : & Client , url : & Url ) -> Result < SiteMetadata , LemmyError > {
let response = retry ( | | client . get ( url . as_str ( ) ) . send ( ) ) . await ? ;
2020-09-24 17:43:42 +00:00
2021-08-19 14:12:49 +00:00
let html = response
. text ( )
. await
. map_err ( | e | RecvError ( e . to_string ( ) ) ) ? ;
2020-09-24 17:43:42 +00:00
2021-08-19 14:12:49 +00:00
let tags = html_to_site_metadata ( & html ) ? ;
Ok ( tags )
}
fn html_to_site_metadata ( html : & str ) -> Result < SiteMetadata , LemmyError > {
let page = HTML ::from_string ( html . to_string ( ) , None ) ? ;
let page_title = page . title ;
let page_description = page . description ;
let og_description = page
. opengraph
. properties
. get ( " description " )
. map ( | t | t . to_string ( ) ) ;
let og_title = page
. opengraph
. properties
. get ( " title " )
. map ( | t | t . to_string ( ) ) ;
let og_image = page
. opengraph
. images
. get ( 0 )
. map ( | ogo | Url ::parse ( & ogo . url ) . ok ( ) )
. flatten ( ) ;
let title = og_title . or ( page_title ) ;
let description = og_description . or ( page_description ) ;
let image = og_image ;
Ok ( SiteMetadata {
title ,
description ,
image ,
html : None ,
} )
2020-09-24 17:43:42 +00:00
}
/// JSON body returned by the pict-rs image download endpoint.
#[derive(Clone, Debug, Deserialize)]
pub(crate) struct PictrsResponse {
    /// Files stored by pict-rs for this request.
    files: Vec<PictrsFile>,
    /// Status message; `fetch_pictrs` checks it against the success marker
    /// and otherwise reports it as the error text.
    msg: String,
}
#[ derive(Deserialize, Debug, Clone) ]
pub ( crate ) struct PictrsFile {
file : String ,
2021-09-17 15:44:20 +00:00
#[ allow(dead_code) ]
2020-09-24 17:43:42 +00:00
delete_token : String ,
}
pub ( crate ) async fn fetch_pictrs (
client : & Client ,
2021-09-22 15:57:09 +00:00
settings : & Settings ,
2021-03-02 12:41:48 +00:00
image_url : & Url ,
2021-08-19 14:12:49 +00:00
) -> Result < PictrsResponse , LemmyError > {
2021-09-22 15:57:09 +00:00
if let Some ( pictrs_url ) = settings . pictrs_url . to_owned ( ) {
2021-08-04 21:13:51 +00:00
is_image_content_type ( client , image_url ) . await ? ;
let fetch_url = format! (
" {}/image/download?url={} " ,
pictrs_url ,
utf8_percent_encode ( image_url . as_str ( ) , NON_ALPHANUMERIC ) // TODO this might not be needed
) ;
let response = retry ( | | client . get ( & fetch_url ) . send ( ) ) . await ? ;
let response : PictrsResponse = response
. json ( )
. await
. map_err ( | e | RecvError ( e . to_string ( ) ) ) ? ;
if response . msg = = " ok " {
2021-08-19 14:12:49 +00:00
Ok ( response )
2021-08-04 21:13:51 +00:00
} else {
Err ( anyhow! ( " {} " , & response . msg ) . into ( ) )
}
2020-09-24 17:43:42 +00:00
} else {
2021-08-19 14:12:49 +00:00
Err ( anyhow! ( " pictrs_url not set up in config " ) . into ( ) )
2020-09-24 17:43:42 +00:00
}
}
2021-08-19 14:12:49 +00:00
/// Both are options, since the URL might be either an html page, or an image
/// Returns the SiteMetadata, and a Pictrs URL, if there is a picture associated
pub async fn fetch_site_data (
2020-09-24 17:43:42 +00:00
client : & Client ,
2021-09-22 15:57:09 +00:00
settings : & Settings ,
2021-03-02 12:41:48 +00:00
url : Option < & Url > ,
2021-08-19 14:12:49 +00:00
) -> ( Option < SiteMetadata > , Option < Url > ) {
2020-09-24 17:43:42 +00:00
match & url {
Some ( url ) = > {
2021-08-19 14:12:49 +00:00
// Fetch metadata
// Ignore errors, since it may be an image, or not have the data.
// Warning, this may ignore SSL errors
let metadata_option = fetch_site_metadata ( client , url ) . await . ok ( ) ;
2020-09-24 17:43:42 +00:00
// Fetch pictrs thumbnail
2021-08-19 14:12:49 +00:00
let pictrs_hash = match & metadata_option {
Some ( metadata_res ) = > match & metadata_res . image {
// Metadata, with image
// Try to generate a small thumbnail if there's a full sized one from post-links
2021-09-22 15:57:09 +00:00
Some ( metadata_image ) = > fetch_pictrs ( client , settings , metadata_image )
2021-08-19 14:12:49 +00:00
. await
2021-08-04 21:13:51 +00:00
. map ( | r | r . files [ 0 ] . file . to_owned ( ) ) ,
2021-08-19 14:12:49 +00:00
// Metadata, but no image
2021-09-22 15:57:09 +00:00
None = > fetch_pictrs ( client , settings , url )
2021-08-19 14:12:49 +00:00
. await
2021-08-04 21:13:51 +00:00
. map ( | r | r . files [ 0 ] . file . to_owned ( ) ) ,
2020-09-24 17:43:42 +00:00
} ,
2021-08-19 14:12:49 +00:00
// No metadata, try to fetch the URL as an image
2021-09-22 15:57:09 +00:00
None = > fetch_pictrs ( client , settings , url )
2021-08-19 14:12:49 +00:00
. await
2021-08-04 21:13:51 +00:00
. map ( | r | r . files [ 0 ] . file . to_owned ( ) ) ,
2020-09-24 17:43:42 +00:00
} ;
// The full urls are necessary for federation
2021-08-04 21:13:51 +00:00
let pictrs_thumbnail = pictrs_hash
. map ( | p | {
Url ::parse ( & format! (
" {}/pictrs/image/{} " ,
2021-09-22 15:57:09 +00:00
settings . get_protocol_and_hostname ( ) ,
2021-08-04 21:13:51 +00:00
p
) )
. ok ( )
} )
2021-08-19 14:12:49 +00:00
. ok ( )
2021-08-04 21:13:51 +00:00
. flatten ( ) ;
2021-08-19 14:12:49 +00:00
( metadata_option , pictrs_thumbnail )
2020-09-24 17:43:42 +00:00
}
2021-08-19 14:12:49 +00:00
None = > ( None , None ) ,
2020-09-24 17:43:42 +00:00
}
}
2021-03-02 12:41:48 +00:00
async fn is_image_content_type ( client : & Client , test : & Url ) -> Result < ( ) , LemmyError > {
let response = retry ( | | client . get ( test . to_owned ( ) ) . send ( ) ) . await ? ;
2020-09-24 17:43:42 +00:00
if response
. headers ( )
. get ( " Content-Type " )
. ok_or_else ( | | anyhow! ( " No Content-Type header " ) ) ?
. to_str ( ) ?
. starts_with ( " image/ " )
{
Ok ( ( ) )
} else {
Err ( anyhow! ( " Not an image type. " ) . into ( ) )
}
}
#[cfg(test)]
mod tests {
    use super::{fetch_site_metadata, SiteMetadata};
    use url::Url;

    // These helped with testing
    //
    // NOTE: this test performs real network requests and compares against
    // the live content of third-party pages, so it fails when offline or
    // when those pages change.
    #[actix_rt::test]
    async fn test_site_metadata() {
        let client = reqwest::Client::default();

        let sample_url = Url::parse("https://www.redspark.nu/en/peoples-war/district-leader-of-chand-led-cpn-arrested-in-bhojpur/").unwrap();
        let sample_res = fetch_site_metadata(&client, &sample_url).await.unwrap();
        assert_eq!(
            SiteMetadata {
                title: Some("District Leader Of Chand Led CPN Arrested In Bhojpur - Redspark".to_string()),
                description: Some("BHOJPUR: A district leader of the outlawed Netra Bikram Chand alias Biplav-led outfit has been arrested. According to District Police".to_string()),
                image: Some(Url::parse("https://www.redspark.nu/wp-content/uploads/2020/03/netra-bikram-chand-attends-program-1272019033653-1000x0-845x653-1.jpg").unwrap()),
                html: None,
            },
            sample_res
        );

        let youtube_url = Url::parse("https://www.youtube.com/watch?v=IquO_TcMZIQ").unwrap();
        let youtube_res = fetch_site_metadata(&client, &youtube_url).await.unwrap();
        assert_eq!(
            SiteMetadata {
                title: Some("A Hard Look at Rent and Rent Seeking with Michael Hudson & Pepe Escobar".to_string()),
                description: Some("An interactive discussion on wealth inequality and the “Great Game” on the control of natural resources.In this webinar organized jointly by the Henry George...".to_string()),
                image: Some(Url::parse("https://i.ytimg.com/vi/IquO_TcMZIQ/maxresdefault.jpg").unwrap()),
                html: None,
            },
            youtube_res
        );
    }

    // #[test]
    // fn test_pictshare() {
    //   let res = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpg");
    //   assert!(res.is_ok());
    //   let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
    //   assert!(res_other.is_err());
    // }
}