Use magic number to determine file type (#5225)

* Revert "Guess image mime type from file extension (fixes #5196) (#5212)" This reverts commit 63ea99d38a. * Use magic numbers to determine file type. * fmt * Don't wrap response in an option * Regen Cargo.lock * Clean-up + guess mime type from extension if server is unresponsive * Move some things about. * Some cleanup. * Removing comment lines. --------- Co-authored-by: Dessalines <tyhou13@gmx.com>
2024-12-22 19:01:32 +00:00 · 2024-12-04 13:29:50 +00:00 · 2024-12-04 13:29:50 +00:00 · ff2bbf41ca
commit ff2bbf41ca
parent f4cd569a98
3 changed files with 57 additions and 27 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -779,6 +779,17 @@ dependencies = [
 "nom",
 ]

+[[package]]
+name = "cfb"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
+dependencies = [
+ "byteorder",
+ "fnv",
+ "uuid",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
@ -2347,6 +2358,15 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "infer"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847"
+dependencies = [
+ "cfb",
+]
+
 [[package]]
 name = "inout"
 version = "0.1.3"
@ -2507,6 +2527,7 @@ dependencies = [
 "encoding_rs",
 "enum-map",
 "futures",
+ "infer",
 "jsonwebtoken",
 "lemmy_db_schema",
 "lemmy_db_views",
@ -2878,7 +2899,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
 dependencies = [
 "cfg-if",
- "windows-targets 0.48.5",
+ "windows-targets 0.52.6",
 ]

 [[package]]
--- a/crates/api_common/Cargo.toml
+++ b/crates/api_common/Cargo.toml
@ -66,6 +66,7 @@ enum-map = { workspace = true }
 urlencoding = { workspace = true }
 mime = { version = "0.3.17", optional = true }
 mime_guess = "2.0.5"
+infer = "0.16.0"
 webpage = { version = "2.0", default-features = false, features = [
  "serde",
 ], optional = true }
--- a/crates/api_common/src/request.rs
+++ b/crates/api_common/src/request.rs
@ -23,6 +23,7 @@ use lemmy_utils::{
  REQWEST_TIMEOUT,
  VERSION,
 };
+use mime::{Mime, TEXT_HTML};
 use reqwest::{
  header::{CONTENT_TYPE, RANGE},
  Client,
@ -63,47 +64,54 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
    .await?
    .error_for_status()?;

-  // In some cases servers send a wrong mime type for images, which prevents thumbnail
-  // generation. To avoid this we also try to guess the mime type from file extension.
-  let content_type = mime_guess::from_path(url.path())
-    .first()
-    // If you can guess that its an image type, then return that first.
-    .filter(|guess| guess.type_() == mime::IMAGE)
-    // Otherwise, get the content type from the headers
-    .or(
-      response
-        .headers()
-        .get(CONTENT_TYPE)
-        .and_then(|h| h.to_str().ok())
-        .and_then(|h| h.parse().ok()),
-    );
+  let mut content_type: Option<Mime> = response
+    .headers()
+    .get(CONTENT_TYPE)
+    .and_then(|h| h.to_str().ok())
+    .and_then(|h| h.parse().ok())
+    // If we don't get a content_type from the response (e.g. if the server is down),
+    // then try to infer the content_type from the file extension.
+    .or(mime_guess::from_path(url.path()).first());

  let opengraph_data = {
-    // if the content type is not text/html, we don't need to parse it
    let is_html = content_type
      .as_ref()
      .map(|c| {
-        (c.type_() == mime::TEXT && c.subtype() == mime::HTML)
-      ||
-      // application/xhtml+xml is a subset of HTML
-      (c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
+        // application/xhtml+xml is a subset of HTML
+        let application_xhtml: Mime = "application/xhtml+xml".parse::<Mime>().unwrap_or(TEXT_HTML);
+        let allowed_mime_types = [TEXT_HTML.essence_str(), application_xhtml.essence_str()];
+        allowed_mime_types.contains(&c.essence_str())
      })
-      .unwrap_or(false);
-    if !is_html {
-      Default::default()
-    } else {
+      .unwrap_or_default();
+
+    if is_html {
      // Can't use .text() here, because it only checks the content header, not the actual bytes
      // https://github.com/LemmyNet/lemmy/issues/1964
-      // So we want to do deep inspection of the actually returned bytes but need to be careful not
-      // spend too much time parsing binary data as HTML
-
+      // So we want to do deep inspection of the actually returned bytes but need to be careful
+      // not to spend too much time parsing binary data as HTML
      // only take first bytes regardless of how many bytes the server returns
      let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
      extract_opengraph_data(&html_bytes, url)
        .map_err(|e| info!("{e}"))
        .unwrap_or_default()
+    } else {
+      let is_octet_type = content_type
+        .as_ref()
+        .map(|c| c.subtype() == "octet-stream")
+        .unwrap_or_default();
+
+      // Overwrite the content type if it's an octet type
+      if is_octet_type {
+        // Don't need to fetch as much data for this as we do with opengraph
+        let octet_bytes = collect_bytes_until_limit(response, 512).await?;
+        content_type =
+          infer::get(&octet_bytes).map_or(content_type, |t| t.mime_type().parse().ok());
+      }
+
+      Default::default()
    }
  };
+
  Ok(LinkMetadata {
    opengraph_data,
    content_type: content_type.map(|c| c.to_string()),