diff --git a/Cargo.lock b/Cargo.lock index 2ed43cdae..0fe1f6fbf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "Inflector" @@ -966,6 +966,17 @@ dependencies = [ "nom", ] +[[package]] +name = "cfb" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f" +dependencies = [ + "byteorder", + "fnv", + "uuid", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -2681,6 +2692,15 @@ dependencies = [ "serde", ] +[[package]] +name = "infer" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847" +dependencies = [ + "cfb", +] + [[package]] name = "inout" version = "0.1.3" @@ -2860,6 +2880,7 @@ dependencies = [ "enum-map", "futures", "getrandom", + "infer", "jsonwebtoken", "lemmy_db_schema", "lemmy_db_views", @@ -2867,6 +2888,7 @@ dependencies = [ "lemmy_db_views_moderator", "lemmy_utils", "mime", + "mime_guess", "moka", "pretty_assertions", "regex", @@ -3514,9 +3536,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "mime_guess" -version = "2.0.4" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" dependencies = [ "mime", "unicase", diff --git a/crates/api_common/Cargo.toml b/crates/api_common/Cargo.toml index 4eabfe5f9..50fbc3f94 100644 --- a/crates/api_common/Cargo.toml +++ b/crates/api_common/Cargo.toml @@ -64,6 +64,8 @@ actix-web = { workspace = true, optional = true } enum-map = { workspace = true } urlencoding = { workspace = true } mime = { version = "0.3.17", optional = 
true } +mime_guess = "2.0.5" +infer = "0.16.0" webpage = { version = "2.0", default-features = false, features = [ "serde", ], optional = true } diff --git a/crates/api_common/src/request.rs b/crates/api_common/src/request.rs index 8f79d5a92..cd0f0a470 100644 --- a/crates/api_common/src/request.rs +++ b/crates/api_common/src/request.rs @@ -23,7 +23,7 @@ use lemmy_utils::{ REQWEST_TIMEOUT, VERSION, }; -use mime::Mime; +use mime::{Mime, TEXT_HTML}; use reqwest::{ header::{CONTENT_TYPE, RANGE}, Client, @@ -62,38 +62,54 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu .send() .await?; - let content_type: Option = response + let mut content_type: Option = response .headers() .get(CONTENT_TYPE) .and_then(|h| h.to_str().ok()) - .and_then(|h| h.parse().ok()); + .and_then(|h| h.parse().ok()) + // If we don't get a content_type from the response (e.g. if the server is down), + // then try to infer the content_type from the file extension. + .or(mime_guess::from_path(url.path()).first()); let opengraph_data = { - // if the content type is not text/html, we don't need to parse it let is_html = content_type .as_ref() .map(|c| { - (c.type_() == mime::TEXT && c.subtype() == mime::HTML) - || - // application/xhtml+xml is a subset of HTML - (c.type_() == mime::APPLICATION && c.subtype() == "xhtml") + // application/xhtml+xml is a subset of HTML + let application_xhtml: Mime = "application/xhtml+xml".parse::().unwrap_or(TEXT_HTML); + let allowed_mime_types = [TEXT_HTML.essence_str(), application_xhtml.essence_str()]; + allowed_mime_types.contains(&c.essence_str()) }) - .unwrap_or(false); - if !is_html { - Default::default() - } else { + .unwrap_or_default(); + + if is_html { // Can't use .text() here, because it only checks the content header, not the actual bytes // https://github.com/LemmyNet/lemmy/issues/1964 - // So we want to do deep inspection of the actually returned bytes but need to be careful not - // spend too much time parsing binary 
data as HTML - + // So we want to do deep inspection of the actually returned bytes but need to be careful + // not to spend too much time parsing binary data as HTML // only take first bytes regardless of how many bytes the server returns let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?; extract_opengraph_data(&html_bytes, url) .map_err(|e| info!("{e}")) .unwrap_or_default() + } else { + let is_octet_type = content_type + .as_ref() + .map(|c| c.subtype() == "octet-stream") + .unwrap_or_default(); + + // Overwrite the content type if it's an octet type + if is_octet_type { + // Don't need to fetch as much data for this as we do with opengraph + let octet_bytes = collect_bytes_until_limit(response, 512).await?; + content_type = + infer::get(&octet_bytes).map_or(content_type, |t| t.mime_type().parse().ok()); + } + + Default::default() } }; + Ok(LinkMetadata { opengraph_data, content_type: content_type.map(|c| c.to_string()),