Use magic number to determine file type (#5225)

* Revert "Guess image mime type from file extension (fixes #5196) (#5212)" This reverts commit 63ea99d38a. * Use magic numbers to determine file type. * fmt * Don't wrap response in an option * Regen Cargo.lock * Clean-up + guess mime type from extension if server is unresponsive * Move some things about. * Some cleanup. * Removing comment lines. --------- Co-authored-by: Dessalines <tyhou13@gmx.com>
2024-12-23 03:11:32 +00:00 · 2024-12-04 13:29:50 +00:00 · 2024-12-04 13:29:50 +00:00 · 5769a33a13
commit 5769a33a13
parent cd7759bba4
3 changed files with 58 additions and 18 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4

 [[package]]
 name = "Inflector"
@ -966,6 +966,17 @@ dependencies = [
 "nom",
 ]

+[[package]]
+name = "cfb"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
+dependencies = [
+ "byteorder",
+ "fnv",
+ "uuid",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
@ -2681,6 +2692,15 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "infer"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847"
+dependencies = [
+ "cfb",
+]
+
 [[package]]
 name = "inout"
 version = "0.1.3"
@ -2860,6 +2880,7 @@ dependencies = [
 "enum-map",
 "futures",
 "getrandom",
+ "infer",
 "jsonwebtoken",
 "lemmy_db_schema",
 "lemmy_db_views",
@ -2867,6 +2888,7 @@ dependencies = [
 "lemmy_db_views_moderator",
 "lemmy_utils",
 "mime",
+ "mime_guess",
 "moka",
 "pretty_assertions",
 "regex",
@ -3514,9 +3536,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"

 [[package]]
 name = "mime_guess"
-version = "2.0.4"
+version = "2.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
+checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e"
 dependencies = [
 "mime",
 "unicase",
--- a/crates/api_common/Cargo.toml
+++ b/crates/api_common/Cargo.toml
@ -64,6 +64,8 @@ actix-web = { workspace = true, optional = true }
 enum-map = { workspace = true }
 urlencoding = { workspace = true }
 mime = { version = "0.3.17", optional = true }
+mime_guess = "2.0.5"
+infer = "0.16.0"
 webpage = { version = "2.0", default-features = false, features = [
  "serde",
 ], optional = true }
--- a/crates/api_common/src/request.rs
+++ b/crates/api_common/src/request.rs
@ -23,7 +23,7 @@ use lemmy_utils::{
  REQWEST_TIMEOUT,
  VERSION,
 };
-use mime::Mime;
+use mime::{Mime, TEXT_HTML};
 use reqwest::{
  header::{CONTENT_TYPE, RANGE},
  Client,
@ -62,38 +62,54 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
    .send()
    .await?;

-  let content_type: Option<Mime> = response
+  let mut content_type: Option<Mime> = response
    .headers()
    .get(CONTENT_TYPE)
    .and_then(|h| h.to_str().ok())
-    .and_then(|h| h.parse().ok());
+    .and_then(|h| h.parse().ok())
+    // If we don't get a content_type from the response (e.g. if the server is down),
+    // then try to infer the content_type from the file extension.
+    .or(mime_guess::from_path(url.path()).first());

  let opengraph_data = {
-    // if the content type is not text/html, we don't need to parse it
    let is_html = content_type
      .as_ref()
      .map(|c| {
-        (c.type_() == mime::TEXT && c.subtype() == mime::HTML)
-      ||
        // application/xhtml+xml is a subset of HTML
-      (c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
+        let application_xhtml: Mime = "application/xhtml+xml".parse::<Mime>().unwrap_or(TEXT_HTML);
+        let allowed_mime_types = [TEXT_HTML.essence_str(), application_xhtml.essence_str()];
+        allowed_mime_types.contains(&c.essence_str())
      })
-      .unwrap_or(false);
-    if !is_html {
-      Default::default()
-    } else {
+      .unwrap_or_default();
+
+    if is_html {
      // Can't use .text() here, because it only checks the content header, not the actual bytes
      // https://github.com/LemmyNet/lemmy/issues/1964
-      // So we want to do deep inspection of the actually returned bytes but need to be careful not
-      // spend too much time parsing binary data as HTML
-
+      // So we want to do deep inspection of the actually returned bytes but need to be careful
+      // not to spend too much time parsing binary data as HTML
      // only take first bytes regardless of how many bytes the server returns
      let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
      extract_opengraph_data(&html_bytes, url)
        .map_err(|e| info!("{e}"))
        .unwrap_or_default()
+    } else {
+      let is_octet_type = content_type
+        .as_ref()
+        .map(|c| c.subtype() == "octet-stream")
+        .unwrap_or_default();
+
+      // Overwrite the content type if it's an octet type
+      if is_octet_type {
+        // Don't need to fetch as much data for this as we do with opengraph
+        let octet_bytes = collect_bytes_until_limit(response, 512).await?;
+        content_type =
+          infer::get(&octet_bytes).map_or(content_type, |t| t.mime_type().parse().ok());
+      }
+
+      Default::default()
    }
  };
+
  Ok(LinkMetadata {
    opengraph_data,
    content_type: content_type.map(|c| c.to_string()),