From c5b82d9b120ccf217fcfe38234c12979903e79e9 Mon Sep 17 00:00:00 2001
From: flamingos-cant <flamingos-cant@flamingos-cant.xyz>
Date: Mon, 25 Nov 2024 12:08:46 +0000
Subject: [PATCH] Use magic numbers to determine file type.

---
 Cargo.lock                       | 21 ++++++++++
 crates/api_common/Cargo.toml     |  1 +
 crates/api_common/src/request.rs | 70 ++++++++++++++++++++------------
 3 files changed, 67 insertions(+), 25 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index e0c19410c..decdf74a5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -779,6 +779,17 @@ dependencies = [
  "nom",
 ]
 
+[[package]]
+name = "cfb"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
+dependencies = [
+ "byteorder",
+ "fnv",
+ "uuid",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
@@ -2347,6 +2358,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "infer"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847"
+dependencies = [
+ "cfb",
+]
+
 [[package]]
 name = "inout"
 version = "0.1.3"
@@ -2508,6 +2528,7 @@ dependencies = [
  "enum-map",
  "futures",
  "getrandom",
+ "infer",
  "jsonwebtoken",
  "lemmy_db_schema",
  "lemmy_db_views",
diff --git a/crates/api_common/Cargo.toml b/crates/api_common/Cargo.toml
index 7b987cbb7..c5334edfc 100644
--- a/crates/api_common/Cargo.toml
+++ b/crates/api_common/Cargo.toml
@@ -64,6 +64,7 @@ actix-web = { workspace = true, optional = true }
 enum-map = { workspace = true }
 urlencoding = { workspace = true }
 mime = { version = "0.3.17", optional = true }
+infer = "0.16.0"
 webpage = { version = "2.0", default-features = false, features = [
   "serde",
 ], optional = true }
diff --git a/crates/api_common/src/request.rs b/crates/api_common/src/request.rs
index 36010f760..6ae24b271 100644
--- a/crates/api_common/src/request.rs
+++ b/crates/api_common/src/request.rs
@@ -18,17 +18,14 @@ use lemmy_db_schema::{
   },
 };
 use lemmy_utils::{
-  error::{LemmyError, LemmyErrorExt, LemmyErrorType, LemmyResult},
+  error::{FederationError, LemmyError, LemmyErrorExt, LemmyErrorType, LemmyResult},
   settings::structs::{PictrsImageMode, Settings},
-  REQWEST_TIMEOUT,
-  VERSION,
+  REQWEST_TIMEOUT, VERSION,
 };
 use mime::Mime;
 use reqwest::{
   header::{CONTENT_TYPE, RANGE},
-  Client,
-  ClientBuilder,
-  Response,
+  Client, ClientBuilder, Response,
 };
 use reqwest_middleware::ClientWithMiddleware;
 use serde::{Deserialize, Serialize};
@@ -54,21 +51,40 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
   // We only fetch the first 64kB of data in order to not waste bandwidth especially for large
   // binary files
   let bytes_to_fetch = 64 * 1024;
-  let response = context
-    .client()
-    .get(url.as_str())
-    // we only need the first chunk of data. Note that we do not check for Accept-Range so the
-    // server may ignore this and still respond with the full response
-    .header(RANGE, format!("bytes=0-{}", bytes_to_fetch - 1)) /* -1 because inclusive */
-    .send()
-    .await?
-    .error_for_status()?;
+  let mut response = Some(
+    context
+      .client()
+      .get(url.as_str())
+      // we only need the first chunk of data. Note that we do not check for Accept-Range so the
+      // server may ignore this and still respond with the full response
+      .header(RANGE, format!("bytes=0-{}", bytes_to_fetch - 1)) /* -1 because inclusive */
+      .send()
+      .await?
+      .error_for_status()?,
+  );
 
-  let content_type: Option<Mime> = response
-    .headers()
-    .get(CONTENT_TYPE)
-    .and_then(|h| h.to_str().ok())
-    .and_then(|h| h.parse().ok());
+  let content_type: Option<Mime> = {
+    let mut mime = response.as_ref().and_then(|m| {
+      m.headers()
+        .get(CONTENT_TYPE)
+        .and_then(|h| h.to_str().ok())
+        .and_then(|h| h.parse().ok())
+    });
+
+    // If a server is serving `application/octet-stream`, it's likely a mistake,
+    // so we try to guess the file type from its magic number.
+    if mime
+      .as_ref()
+      .is_some_and(|m: &Mime| m.subtype() == "octet-stream")
+    {
+      // Don't need to fetch as much data for this as we do with opengraph
+      let octet_bytes =
+        collect_bytes_until_limit(response.take().ok_or(FederationError::Unreachable)?, 512)
+          .await?;
+      mime = infer::get(&octet_bytes).map_or(mime, |t| t.mime_type().parse().ok());
+    }
+    mime
+  };
 
   let opengraph_data = {
     // if the content type is not text/html, we don't need to parse it
@@ -76,12 +92,12 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
       .as_ref()
       .map(|c| {
         (c.type_() == mime::TEXT && c.subtype() == mime::HTML)
-      ||
-      // application/xhtml+xml is a subset of HTML
-      (c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
+	  ||
+	// application/xhtml+xml is a subset of HTML
+	  (c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
       })
       .unwrap_or(false);
-    if !is_html {
+    if !is_html || response.is_none() {
       Default::default()
     } else {
       // Can't use .text() here, because it only checks the content header, not the actual bytes
@@ -90,7 +106,11 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
       // spend too much time parsing binary data as HTML
 
       // only take first bytes regardless of how many bytes the server returns
-      let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
+      let html_bytes = collect_bytes_until_limit(
+        response.take().ok_or(FederationError::Unreachable)?,
+        bytes_to_fetch,
+      )
+      .await?;
       extract_opengraph_data(&html_bytes, url)
         .map_err(|e| info!("{e}"))
         .unwrap_or_default()