Use magic numbers to determine file type.

This commit is contained in:
flamingos-cant 2024-11-25 12:08:46 +00:00
parent 878e8b88c4
commit c5b82d9b12
3 changed files with 67 additions and 25 deletions

21
Cargo.lock generated
View file

@ -779,6 +779,17 @@ dependencies = [
"nom", "nom",
] ]
[[package]]
name = "cfb"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
dependencies = [
"byteorder",
"fnv",
"uuid",
]
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.0" version = "1.0.0"
@ -2347,6 +2358,15 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "infer"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847"
dependencies = [
"cfb",
]
[[package]] [[package]]
name = "inout" name = "inout"
version = "0.1.3" version = "0.1.3"
@ -2508,6 +2528,7 @@ dependencies = [
"enum-map", "enum-map",
"futures", "futures",
"getrandom", "getrandom",
"infer",
"jsonwebtoken", "jsonwebtoken",
"lemmy_db_schema", "lemmy_db_schema",
"lemmy_db_views", "lemmy_db_views",

View file

@ -64,6 +64,7 @@ actix-web = { workspace = true, optional = true }
enum-map = { workspace = true } enum-map = { workspace = true }
urlencoding = { workspace = true } urlencoding = { workspace = true }
mime = { version = "0.3.17", optional = true } mime = { version = "0.3.17", optional = true }
infer = "0.16.0"
webpage = { version = "2.0", default-features = false, features = [ webpage = { version = "2.0", default-features = false, features = [
"serde", "serde",
], optional = true } ], optional = true }

View file

@ -18,17 +18,14 @@ use lemmy_db_schema::{
}, },
}; };
use lemmy_utils::{ use lemmy_utils::{
error::{LemmyError, LemmyErrorExt, LemmyErrorType, LemmyResult}, error::{FederationError, LemmyError, LemmyErrorExt, LemmyErrorType, LemmyResult},
settings::structs::{PictrsImageMode, Settings}, settings::structs::{PictrsImageMode, Settings},
REQWEST_TIMEOUT, REQWEST_TIMEOUT, VERSION,
VERSION,
}; };
use mime::Mime; use mime::Mime;
use reqwest::{ use reqwest::{
header::{CONTENT_TYPE, RANGE}, header::{CONTENT_TYPE, RANGE},
Client, Client, ClientBuilder, Response,
ClientBuilder,
Response,
}; };
use reqwest_middleware::ClientWithMiddleware; use reqwest_middleware::ClientWithMiddleware;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@ -54,7 +51,8 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
// We only fetch the first 64kB of data in order to not waste bandwidth especially for large // We only fetch the first 64kB of data in order to not waste bandwidth especially for large
// binary files // binary files
let bytes_to_fetch = 64 * 1024; let bytes_to_fetch = 64 * 1024;
let response = context let mut response = Some(
context
.client() .client()
.get(url.as_str()) .get(url.as_str())
// we only need the first chunk of data. Note that we do not check for Accept-Range so the // we only need the first chunk of data. Note that we do not check for Accept-Range so the
@ -62,13 +60,31 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
.header(RANGE, format!("bytes=0-{}", bytes_to_fetch - 1)) /* -1 because inclusive */ .header(RANGE, format!("bytes=0-{}", bytes_to_fetch - 1)) /* -1 because inclusive */
.send() .send()
.await? .await?
.error_for_status()?; .error_for_status()?,
);
let content_type: Option<Mime> = response let content_type: Option<Mime> = {
.headers() let mut mime = response.as_ref().and_then(|m| {
m.headers()
.get(CONTENT_TYPE) .get(CONTENT_TYPE)
.and_then(|h| h.to_str().ok()) .and_then(|h| h.to_str().ok())
.and_then(|h| h.parse().ok()); .and_then(|h| h.parse().ok())
});
// If a server is serving `application/octet-stream`, it's likely a mistake,
// so we try to guess the file type from its magic number.
if mime
.as_ref()
.is_some_and(|m: &Mime| m.subtype() == "octet-stream")
{
// Don't need to fetch as much data for this as we do with opengraph
let octet_bytes =
collect_bytes_until_limit(response.take().ok_or(FederationError::Unreachable)?, 512)
.await?;
mime = infer::get(&octet_bytes).map_or(mime, |t| t.mime_type().parse().ok());
}
mime
};
let opengraph_data = { let opengraph_data = {
// if the content type is not text/html, we don't need to parse it // if the content type is not text/html, we don't need to parse it
@ -81,7 +97,7 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
(c.type_() == mime::APPLICATION && c.subtype() == "xhtml") (c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
}) })
.unwrap_or(false); .unwrap_or(false);
if !is_html { if !is_html || response.is_none() {
Default::default() Default::default()
} else { } else {
// Can't use .text() here, because it only checks the content header, not the actual bytes // Can't use .text() here, because it only checks the content header, not the actual bytes
@ -90,7 +106,11 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
// spend too much time parsing binary data as HTML // spend too much time parsing binary data as HTML
// only take first bytes regardless of how many bytes the server returns // only take first bytes regardless of how many bytes the server returns
let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?; let html_bytes = collect_bytes_until_limit(
response.take().ok_or(FederationError::Unreachable)?,
bytes_to_fetch,
)
.await?;
extract_opengraph_data(&html_bytes, url) extract_opengraph_data(&html_bytes, url)
.map_err(|e| info!("{e}")) .map_err(|e| info!("{e}"))
.unwrap_or_default() .unwrap_or_default()