mirror of
https://github.com/LemmyNet/lemmy.git
synced 2025-01-09 19:53:54 +00:00
Use magic numbers to determine file type.
This commit is contained in:
parent
878e8b88c4
commit
c5b82d9b12
3 changed files with 67 additions and 25 deletions
21
Cargo.lock
generated
21
Cargo.lock
generated
|
@ -779,6 +779,17 @@ dependencies = [
|
||||||
"nom",
|
"nom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfb"
|
||||||
|
version = "0.7.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"fnv",
|
||||||
|
"uuid",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-if"
|
name = "cfg-if"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
|
@ -2347,6 +2358,15 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "infer"
|
||||||
|
version = "0.16.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847"
|
||||||
|
dependencies = [
|
||||||
|
"cfb",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "inout"
|
name = "inout"
|
||||||
version = "0.1.3"
|
version = "0.1.3"
|
||||||
|
@ -2508,6 +2528,7 @@ dependencies = [
|
||||||
"enum-map",
|
"enum-map",
|
||||||
"futures",
|
"futures",
|
||||||
"getrandom",
|
"getrandom",
|
||||||
|
"infer",
|
||||||
"jsonwebtoken",
|
"jsonwebtoken",
|
||||||
"lemmy_db_schema",
|
"lemmy_db_schema",
|
||||||
"lemmy_db_views",
|
"lemmy_db_views",
|
||||||
|
|
|
@ -64,6 +64,7 @@ actix-web = { workspace = true, optional = true }
|
||||||
enum-map = { workspace = true }
|
enum-map = { workspace = true }
|
||||||
urlencoding = { workspace = true }
|
urlencoding = { workspace = true }
|
||||||
mime = { version = "0.3.17", optional = true }
|
mime = { version = "0.3.17", optional = true }
|
||||||
|
infer = "0.16.0"
|
||||||
webpage = { version = "2.0", default-features = false, features = [
|
webpage = { version = "2.0", default-features = false, features = [
|
||||||
"serde",
|
"serde",
|
||||||
], optional = true }
|
], optional = true }
|
||||||
|
|
|
@ -18,17 +18,14 @@ use lemmy_db_schema::{
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
use lemmy_utils::{
|
use lemmy_utils::{
|
||||||
error::{LemmyError, LemmyErrorExt, LemmyErrorType, LemmyResult},
|
error::{FederationError, LemmyError, LemmyErrorExt, LemmyErrorType, LemmyResult},
|
||||||
settings::structs::{PictrsImageMode, Settings},
|
settings::structs::{PictrsImageMode, Settings},
|
||||||
REQWEST_TIMEOUT,
|
REQWEST_TIMEOUT, VERSION,
|
||||||
VERSION,
|
|
||||||
};
|
};
|
||||||
use mime::Mime;
|
use mime::Mime;
|
||||||
use reqwest::{
|
use reqwest::{
|
||||||
header::{CONTENT_TYPE, RANGE},
|
header::{CONTENT_TYPE, RANGE},
|
||||||
Client,
|
Client, ClientBuilder, Response,
|
||||||
ClientBuilder,
|
|
||||||
Response,
|
|
||||||
};
|
};
|
||||||
use reqwest_middleware::ClientWithMiddleware;
|
use reqwest_middleware::ClientWithMiddleware;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
@ -54,21 +51,40 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
|
||||||
// We only fetch the first 64kB of data in order to not waste bandwidth especially for large
|
// We only fetch the first 64kB of data in order to not waste bandwidth especially for large
|
||||||
// binary files
|
// binary files
|
||||||
let bytes_to_fetch = 64 * 1024;
|
let bytes_to_fetch = 64 * 1024;
|
||||||
let response = context
|
let mut response = Some(
|
||||||
.client()
|
context
|
||||||
.get(url.as_str())
|
.client()
|
||||||
// we only need the first chunk of data. Note that we do not check for Accept-Range so the
|
.get(url.as_str())
|
||||||
// server may ignore this and still respond with the full response
|
// we only need the first chunk of data. Note that we do not check for Accept-Range so the
|
||||||
.header(RANGE, format!("bytes=0-{}", bytes_to_fetch - 1)) /* -1 because inclusive */
|
// server may ignore this and still respond with the full response
|
||||||
.send()
|
.header(RANGE, format!("bytes=0-{}", bytes_to_fetch - 1)) /* -1 because inclusive */
|
||||||
.await?
|
.send()
|
||||||
.error_for_status()?;
|
.await?
|
||||||
|
.error_for_status()?,
|
||||||
|
);
|
||||||
|
|
||||||
let content_type: Option<Mime> = response
|
let content_type: Option<Mime> = {
|
||||||
.headers()
|
let mut mime = response.as_ref().and_then(|m| {
|
||||||
.get(CONTENT_TYPE)
|
m.headers()
|
||||||
.and_then(|h| h.to_str().ok())
|
.get(CONTENT_TYPE)
|
||||||
.and_then(|h| h.parse().ok());
|
.and_then(|h| h.to_str().ok())
|
||||||
|
.and_then(|h| h.parse().ok())
|
||||||
|
});
|
||||||
|
|
||||||
|
// If a server is serving `application/octet-stream`, it's likely a mistake,
|
||||||
|
// so we try to guess the file type from its magic number.
|
||||||
|
if mime
|
||||||
|
.as_ref()
|
||||||
|
.is_some_and(|m: &Mime| m.subtype() == "octet-stream")
|
||||||
|
{
|
||||||
|
// Don't need to fetch as much data for this as we do with opengraph
|
||||||
|
let octet_bytes =
|
||||||
|
collect_bytes_until_limit(response.take().ok_or(FederationError::Unreachable)?, 512)
|
||||||
|
.await?;
|
||||||
|
mime = infer::get(&octet_bytes).map_or(mime, |t| t.mime_type().parse().ok());
|
||||||
|
}
|
||||||
|
mime
|
||||||
|
};
|
||||||
|
|
||||||
let opengraph_data = {
|
let opengraph_data = {
|
||||||
// if the content type is not text/html, we don't need to parse it
|
// if the content type is not text/html, we don't need to parse it
|
||||||
|
@ -76,12 +92,12 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|c| {
|
.map(|c| {
|
||||||
(c.type_() == mime::TEXT && c.subtype() == mime::HTML)
|
(c.type_() == mime::TEXT && c.subtype() == mime::HTML)
|
||||||
||
|
||
|
||||||
// application/xhtml+xml is a subset of HTML
|
// application/xhtml+xml is a subset of HTML
|
||||||
(c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
|
(c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
|
||||||
})
|
})
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
if !is_html {
|
if !is_html || response.is_none() {
|
||||||
Default::default()
|
Default::default()
|
||||||
} else {
|
} else {
|
||||||
// Can't use .text() here, because it only checks the content header, not the actual bytes
|
// Can't use .text() here, because it only checks the content header, not the actual bytes
|
||||||
|
@ -90,7 +106,11 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
|
||||||
// spend too much time parsing binary data as HTML
|
// spend too much time parsing binary data as HTML
|
||||||
|
|
||||||
// only take first bytes regardless of how many bytes the server returns
|
// only take first bytes regardless of how many bytes the server returns
|
||||||
let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
|
let html_bytes = collect_bytes_until_limit(
|
||||||
|
response.take().ok_or(FederationError::Unreachable)?,
|
||||||
|
bytes_to_fetch,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
extract_opengraph_data(&html_bytes, url)
|
extract_opengraph_data(&html_bytes, url)
|
||||||
.map_err(|e| info!("{e}"))
|
.map_err(|e| info!("{e}"))
|
||||||
.unwrap_or_default()
|
.unwrap_or_default()
|
||||||
|
|
Loading…
Reference in a new issue