Use magic numbers to determine file type (#5225)

* Revert "Guess image mime type from file extension (fixes #5196) (#5212)"

This reverts commit 63ea99d38a.

* Use magic numbers to determine file type.

* fmt

* Don't wrap response in an option

* Regen Cargo.lock

* Clean-up + guess mime type from extension if server is unresponsive

* Move some things about.

* Some cleanup.

* Removing comment lines.

---------

Co-authored-by: Dessalines <tyhou13@gmx.com>
This commit is contained in:
flamingos-cant 2024-12-04 13:29:50 +00:00 committed by Felix Ableitner
parent cd7759bba4
commit 5769a33a13
3 changed files with 58 additions and 18 deletions

28
Cargo.lock generated
View file

@ -1,6 +1,6 @@
# This file is automatically @generated by Cargo. # This file is automatically @generated by Cargo.
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3 version = 4
[[package]] [[package]]
name = "Inflector" name = "Inflector"
@ -966,6 +966,17 @@ dependencies = [
"nom", "nom",
] ]
[[package]]
name = "cfb"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
dependencies = [
"byteorder",
"fnv",
"uuid",
]
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.0" version = "1.0.0"
@ -2681,6 +2692,15 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "infer"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847"
dependencies = [
"cfb",
]
[[package]] [[package]]
name = "inout" name = "inout"
version = "0.1.3" version = "0.1.3"
@ -2860,6 +2880,7 @@ dependencies = [
"enum-map", "enum-map",
"futures", "futures",
"getrandom", "getrandom",
"infer",
"jsonwebtoken", "jsonwebtoken",
"lemmy_db_schema", "lemmy_db_schema",
"lemmy_db_views", "lemmy_db_views",
@ -2867,6 +2888,7 @@ dependencies = [
"lemmy_db_views_moderator", "lemmy_db_views_moderator",
"lemmy_utils", "lemmy_utils",
"mime", "mime",
"mime_guess",
"moka", "moka",
"pretty_assertions", "pretty_assertions",
"regex", "regex",
@ -3514,9 +3536,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]] [[package]]
name = "mime_guess" name = "mime_guess"
version = "2.0.4" version = "2.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e"
dependencies = [ dependencies = [
"mime", "mime",
"unicase", "unicase",

View file

@ -64,6 +64,8 @@ actix-web = { workspace = true, optional = true }
enum-map = { workspace = true } enum-map = { workspace = true }
urlencoding = { workspace = true } urlencoding = { workspace = true }
mime = { version = "0.3.17", optional = true } mime = { version = "0.3.17", optional = true }
mime_guess = "2.0.5"
infer = "0.16.0"
webpage = { version = "2.0", default-features = false, features = [ webpage = { version = "2.0", default-features = false, features = [
"serde", "serde",
], optional = true } ], optional = true }

View file

@ -23,7 +23,7 @@ use lemmy_utils::{
REQWEST_TIMEOUT, REQWEST_TIMEOUT,
VERSION, VERSION,
}; };
use mime::Mime; use mime::{Mime, TEXT_HTML};
use reqwest::{ use reqwest::{
header::{CONTENT_TYPE, RANGE}, header::{CONTENT_TYPE, RANGE},
Client, Client,
@ -62,38 +62,54 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
.send() .send()
.await?; .await?;
let content_type: Option<Mime> = response let mut content_type: Option<Mime> = response
.headers() .headers()
.get(CONTENT_TYPE) .get(CONTENT_TYPE)
.and_then(|h| h.to_str().ok()) .and_then(|h| h.to_str().ok())
.and_then(|h| h.parse().ok()); .and_then(|h| h.parse().ok())
// If we don't get a content_type from the response (e.g. if the server is down),
// then try to infer the content_type from the file extension.
.or(mime_guess::from_path(url.path()).first());
let opengraph_data = { let opengraph_data = {
// if the content type is not text/html, we don't need to parse it
let is_html = content_type let is_html = content_type
.as_ref() .as_ref()
.map(|c| { .map(|c| {
(c.type_() == mime::TEXT && c.subtype() == mime::HTML)
||
// application/xhtml+xml is a subset of HTML // application/xhtml+xml is a subset of HTML
(c.type_() == mime::APPLICATION && c.subtype() == "xhtml") let application_xhtml: Mime = "application/xhtml+xml".parse::<Mime>().unwrap_or(TEXT_HTML);
let allowed_mime_types = [TEXT_HTML.essence_str(), application_xhtml.essence_str()];
allowed_mime_types.contains(&c.essence_str())
}) })
.unwrap_or(false); .unwrap_or_default();
if !is_html {
Default::default() if is_html {
} else {
// Can't use .text() here, because it only checks the content header, not the actual bytes // Can't use .text() here, because it only checks the content header, not the actual bytes
// https://github.com/LemmyNet/lemmy/issues/1964 // https://github.com/LemmyNet/lemmy/issues/1964
// So we want to do deep inspection of the actually returned bytes but need to be careful not // So we want to do deep inspection of the actually returned bytes but need to be careful
// spend too much time parsing binary data as HTML // not spend too much time parsing binary data as HTML
// only take first bytes regardless of how many bytes the server returns // only take first bytes regardless of how many bytes the server returns
let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?; let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
extract_opengraph_data(&html_bytes, url) extract_opengraph_data(&html_bytes, url)
.map_err(|e| info!("{e}")) .map_err(|e| info!("{e}"))
.unwrap_or_default() .unwrap_or_default()
} else {
let is_octet_type = content_type
.as_ref()
.map(|c| c.subtype() == "octet-stream")
.unwrap_or_default();
// Overwrite the content type if it's an octet type
if is_octet_type {
// Don't need to fetch as much data for this as we do with opengraph
let octet_bytes = collect_bytes_until_limit(response, 512).await?;
content_type =
infer::get(&octet_bytes).map_or(content_type, |t| t.mime_type().parse().ok());
}
Default::default()
} }
}; };
Ok(LinkMetadata { Ok(LinkMetadata {
opengraph_data, opengraph_data,
content_type: content_type.map(|c| c.to_string()), content_type: content_type.map(|c| c.to_string()),