From c5e4430c389541158efcd009fca59c8a2b2830d8 Mon Sep 17 00:00:00 2001 From: Dessalines Date: Tue, 3 Dec 2024 23:08:31 -0500 Subject: [PATCH] Some cleanup. --- crates/api_common/src/request.rs | 54 +++++++++++++++++++------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/crates/api_common/src/request.rs b/crates/api_common/src/request.rs index df8bc5bc4..9b651a871 100644 --- a/crates/api_common/src/request.rs +++ b/crates/api_common/src/request.rs @@ -73,34 +73,46 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu // then try to infer the content_type from the file extension. .or(mime_guess::from_path(url.path()).first()); - let opengraph_data = 'ograph: { - if let Some(c) = &content_type { - // application/xhtml+xml is a subset of HTML - let application_xhtml: Mime = "application/xhtml+xml".parse()?; - if c.essence_str() == TEXT_HTML.essence_str() - || c.essence_str() == application_xhtml.essence_str() - { - // Can't use .text() here, because it only checks the content header, not the actual bytes - // https://github.com/LemmyNet/lemmy/issues/1964 - // So we want to do deep inspection of the actually returned bytes but need to be careful - // not spend too much time parsing binary data as HTML + let opengraph_data = { + let is_html = content_type + .as_ref() + .map(|c| { + // application/xhtml+xml is a subset of HTML + let application_xhtml: Mime = "application/xhtml+xml".parse::<Mime>().unwrap_or(TEXT_HTML); + let allowed_mime_types = [TEXT_HTML.essence_str(), application_xhtml.essence_str()]; + allowed_mime_types.contains(&c.essence_str()) + }) + .unwrap_or_default(); - // only take first bytes regardless of how many bytes the server returns - let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?; - break 'ograph extract_opengraph_data(&html_bytes, url) - .map_err(|e| info!("{e}")) - .unwrap_or_default(); - } - // If a server is serving `application/octet-stream`, it's likely a mistake, - // 
so we try to guess the file type from its magic number. - else if c.subtype() == "octet-stream" { + if is_html { + // Can't use .text() here, because it only checks the content header, not the actual bytes + // https://github.com/LemmyNet/lemmy/issues/1964 + // So we want to do deep inspection of the actually returned bytes but need to be careful + // not to spend too much time parsing binary data as HTML + + // only take first bytes regardless of how many bytes the server returns + let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?; + extract_opengraph_data(&html_bytes, url) + .map_err(|e| info!("{e}")) + .unwrap_or_default() + } else { + let is_octet_type = content_type + .as_ref() + .map(|c| c.subtype() == "octet-stream") + .unwrap_or_default(); + + // Overwrite the content type if it's an octet type + if is_octet_type { // Don't need to fetch as much data for this as we do with opengraph let octet_bytes = collect_bytes_until_limit(response, 512).await?; + // content_type = infer::get(&octet_bytes).or(&content_type, |t| + // t.mime_type().parse().ok()); content_type = infer::get(&octet_bytes).map_or(content_type, |t| t.mime_type().parse().ok()); } + + Default::default() } - Default::default() }; Ok(LinkMetadata {