From b60e5114932ffe1658c901e9fb755b89114c42bc Mon Sep 17 00:00:00 2001 From: phiresky Date: Sat, 3 Aug 2024 15:02:12 +0200 Subject: [PATCH] use mime type for determination --- crates/api_common/src/request.rs | 43 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/crates/api_common/src/request.rs b/crates/api_common/src/request.rs index e376e6b93..01927d5d9 100644 --- a/crates/api_common/src/request.rs +++ b/crates/api_common/src/request.rs @@ -8,7 +8,7 @@ use crate::{ use activitypub_federation::config::Data; use chrono::{DateTime, Utc}; use encoding_rs::{Encoding, UTF_8}; -use futures::{StreamExt, TryStreamExt}; +use futures::StreamExt; use lemmy_db_schema::{ newtypes::DbUrl, source::{ @@ -68,24 +68,31 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu .and_then(|h| h.to_str().ok()) .and_then(|h| h.parse().ok()); - // Can't use .text() here, because it only checks the content header, not the actual bytes - // https://github.com/LemmyNet/lemmy/issues/1964 - // So we want to do deep inspection of the actually returned bytes but need to be careful not - // spend too much time parsing binary data as HTML + let opengraph_data = { + // if the content type is not text/html, we don't need to parse it + let is_html = content_type + .as_ref() + .map(|c| { + (c.type_() == mime::TEXT && c.subtype() == mime::HTML) + || + // application/xhtml+xml is a subset of HTML + (c.type_() == mime::APPLICATION && c.subtype() == "xhtml") + }) + .unwrap_or(false); + if !is_html { + Default::default() + } else { + // Can't use .text() here, because it only checks the content header, not the actual bytes + // https://github.com/LemmyNet/lemmy/issues/1964 + // So we want to do deep inspection of the actually returned bytes but need to be careful not + // spend too much time parsing binary data as HTML - // only take first bytes regardless of how many bytes the server returns - let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?; - // https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md#binary-data - // In order to figure out whether a file is binary, the most effective heuristic that balances - // correctness with performance is to simply look for NUL bytes. At that point, the determination - // is simple: a file is considered "binary" if and only if it contains a NUL byte somewhere in its - // contents. - let opengraph_data = if !html_bytes.contains(&0) { - extract_opengraph_data(&html_bytes, url) - .map_err(|e| info!("{e}")) - .unwrap_or_default() - } else { - Default::default() + // only take first bytes regardless of how many bytes the server returns + let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?; + extract_opengraph_data(&html_bytes, url) + .map_err(|e| info!("{e}")) + .unwrap_or_default() + } }; Ok(LinkMetadata { opengraph_data,