Site Metadata: resolve relative URLs for embedded images/videos (#3338)
* Site Metadata: resolve relative URLs for embedded images/videos * api_common: relax version requirement of `webpage` dependency With this change we opt into next (non breaking) versions of webpage-rs * cargo +nightly fmt * Add tests for resolving absolute urls in SiteMetadata
This commit is contained in:
parent
b2a9d4a335
commit
62c8ac1db5
2 changed files with 55 additions and 6 deletions
|
@ -33,7 +33,7 @@ reqwest-middleware = { workspace = true, optional = true }
|
|||
regex = { workspace = true }
|
||||
rosetta-i18n = { workspace = true, optional = true }
|
||||
percent-encoding = { workspace = true, optional = true }
|
||||
webpage = { version = "1.6.0", default-features = false, features = ["serde"], optional = true }
|
||||
webpage = { version = "1.6", default-features = false, features = ["serde"], optional = true }
|
||||
encoding = { version = "0.2.33", optional = true }
|
||||
anyhow = { workspace = true }
|
||||
futures = { workspace = true }
|
||||
|
|
|
@ -27,12 +27,12 @@ pub async fn fetch_site_metadata(
|
|||
// https://github.com/LemmyNet/lemmy/issues/1964
|
||||
let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
|
||||
|
||||
let tags = html_to_site_metadata(&html_bytes)?;
|
||||
let tags = html_to_site_metadata(&html_bytes, url)?;
|
||||
|
||||
Ok(tags)
|
||||
}
|
||||
|
||||
fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError> {
|
||||
fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, LemmyError> {
|
||||
let html = String::from_utf8_lossy(html_bytes);
|
||||
|
||||
// Make sure the first line is doctype html
|
||||
|
@ -81,12 +81,14 @@ fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError>
|
|||
.opengraph
|
||||
.images
|
||||
.first()
|
||||
.and_then(|ogo| Url::parse(&ogo.url).ok());
|
||||
// join also works if the target URL is absolute
|
||||
.and_then(|ogo| url.join(&ogo.url).ok());
|
||||
let og_embed_url = page
|
||||
.opengraph
|
||||
.videos
|
||||
.first()
|
||||
.and_then(|v| Url::parse(&v.url).ok());
|
||||
// join also works if the target URL is absolute
|
||||
.and_then(|v| url.join(&v.url).ok());
|
||||
|
||||
Ok(SiteMetadata {
|
||||
title: og_title.or(page_title),
|
||||
|
@ -266,7 +268,12 @@ pub fn build_user_agent(settings: &Settings) -> String {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::request::{build_user_agent, fetch_site_metadata, SiteMetadata};
|
||||
use crate::request::{
|
||||
build_user_agent,
|
||||
fetch_site_metadata,
|
||||
html_to_site_metadata,
|
||||
SiteMetadata,
|
||||
};
|
||||
use lemmy_utils::settings::SETTINGS;
|
||||
use url::Url;
|
||||
|
||||
|
@ -305,4 +312,46 @@ mod tests {
|
|||
// let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
|
||||
// assert!(res_other.is_err());
|
||||
// }
|
||||
|
||||
#[test]
|
||||
fn test_resolve_image_url() {
|
||||
// url that lists the opengraph fields
|
||||
let url = Url::parse("https://example.com/one/two.html").unwrap();
|
||||
|
||||
// root relative url
|
||||
let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='/image.jpg'></head><body></body></html>";
|
||||
let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
|
||||
assert_eq!(
|
||||
metadata.image,
|
||||
Some(Url::parse("https://example.com/image.jpg").unwrap().into())
|
||||
);
|
||||
|
||||
// base relative url
|
||||
let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='image.jpg'></head><body></body></html>";
|
||||
let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
|
||||
assert_eq!(
|
||||
metadata.image,
|
||||
Some(
|
||||
Url::parse("https://example.com/one/image.jpg")
|
||||
.unwrap()
|
||||
.into()
|
||||
)
|
||||
);
|
||||
|
||||
// absolute url
|
||||
let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='https://cdn.host.com/image.jpg'></head><body></body></html>";
|
||||
let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
|
||||
assert_eq!(
|
||||
metadata.image,
|
||||
Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into())
|
||||
);
|
||||
|
||||
// protocol relative url
|
||||
let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='//example.com/image.jpg'></head><body></body></html>";
|
||||
let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
|
||||
assert_eq!(
|
||||
metadata.image,
|
||||
Some(Url::parse("https://example.com/image.jpg").unwrap().into())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue