Site Metadata: resolve relative URLs for embedded images/videos (#3338)

* Site Metadata: resolve relative URLs for embedded images/videos

* api_common: relax version requirement of `webpage` dependency

With this change we opt into the next (non-breaking) versions of webpage-rs

* cargo +nightly fmt

* Add tests for resolving absolute urls in SiteMetadata
This commit is contained in:
Otto Rottier 2023-06-26 15:07:57 +02:00 committed by GitHub
parent b2a9d4a335
commit 62c8ac1db5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 55 additions and 6 deletions

View file

@ -33,7 +33,7 @@ reqwest-middleware = { workspace = true, optional = true }
regex = { workspace = true } regex = { workspace = true }
rosetta-i18n = { workspace = true, optional = true } rosetta-i18n = { workspace = true, optional = true }
percent-encoding = { workspace = true, optional = true } percent-encoding = { workspace = true, optional = true }
webpage = { version = "1.6.0", default-features = false, features = ["serde"], optional = true } webpage = { version = "1.6", default-features = false, features = ["serde"], optional = true }
encoding = { version = "0.2.33", optional = true } encoding = { version = "0.2.33", optional = true }
anyhow = { workspace = true } anyhow = { workspace = true }
futures = { workspace = true } futures = { workspace = true }

View file

@ -27,12 +27,12 @@ pub async fn fetch_site_metadata(
// https://github.com/LemmyNet/lemmy/issues/1964 // https://github.com/LemmyNet/lemmy/issues/1964
let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec(); let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
let tags = html_to_site_metadata(&html_bytes)?; let tags = html_to_site_metadata(&html_bytes, url)?;
Ok(tags) Ok(tags)
} }
fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError> { fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, LemmyError> {
let html = String::from_utf8_lossy(html_bytes); let html = String::from_utf8_lossy(html_bytes);
// Make sure the first line is doctype html // Make sure the first line is doctype html
@ -81,12 +81,14 @@ fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError>
.opengraph .opengraph
.images .images
.first() .first()
.and_then(|ogo| Url::parse(&ogo.url).ok()); // join also works if the target URL is absolute
.and_then(|ogo| url.join(&ogo.url).ok());
let og_embed_url = page let og_embed_url = page
.opengraph .opengraph
.videos .videos
.first() .first()
.and_then(|v| Url::parse(&v.url).ok()); // join also works if the target URL is absolute
.and_then(|v| url.join(&v.url).ok());
Ok(SiteMetadata { Ok(SiteMetadata {
title: og_title.or(page_title), title: og_title.or(page_title),
@ -266,7 +268,12 @@ pub fn build_user_agent(settings: &Settings) -> String {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::request::{build_user_agent, fetch_site_metadata, SiteMetadata}; use crate::request::{
build_user_agent,
fetch_site_metadata,
html_to_site_metadata,
SiteMetadata,
};
use lemmy_utils::settings::SETTINGS; use lemmy_utils::settings::SETTINGS;
use url::Url; use url::Url;
@ -305,4 +312,46 @@ mod tests {
// let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu"); // let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
// assert!(res_other.is_err()); // assert!(res_other.is_err());
// } // }
#[test]
fn test_resolve_image_url() {
  // Page that lists the opengraph fields; every og:image value below is
  // resolved against this URL.
  let url = Url::parse("https://example.com/one/two.html").unwrap();

  // Pairs of (og:image `content` attribute, URL the metadata is expected to hold).
  let cases = [
    // root relative url
    ("/image.jpg", "https://example.com/image.jpg"),
    // base relative url
    ("image.jpg", "https://example.com/one/image.jpg"),
    // absolute url
    (
      "https://cdn.host.com/image.jpg",
      "https://cdn.host.com/image.jpg",
    ),
    // protocol relative url
    ("//example.com/image.jpg", "https://example.com/image.jpg"),
  ];

  for (content, expected) in cases {
    // Minimal document carrying a single og:image tag.
    let html = format!(
      "<!DOCTYPE html><html><head><meta property='og:image' content='{}'></head><body></body></html>",
      content
    );
    let metadata =
      html_to_site_metadata(html.as_bytes(), &url).expect("Unable to parse metadata");
    let resolved = Url::parse(expected).unwrap();
    assert_eq!(metadata.image, Some(resolved.into()));
  }
}
} }