Use magic number to determine file type (#5225)
* Revert "Guess image mime type from file extension (fixes #5196) (#5212)"
This reverts commit 63ea99d38a.
* Use magic numbers to determine file type.
* fmt
* Don't wrap response in an option
* Regen Cargo.lock
* Clean-up + guess mime type from extension if server is unresponsive
* Move some things about.
* Some cleanup.
* Removing comment lines.
---------
Co-authored-by: Dessalines <tyhou13@gmx.com>
This commit is contained in:
parent
f4cd569a98
commit
ff2bbf41ca
3 changed files with 57 additions and 27 deletions
23
Cargo.lock
generated
23
Cargo.lock
generated
|
@ -779,6 +779,17 @@ dependencies = [
|
||||||
"nom",
|
"nom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfb"
|
||||||
|
version = "0.7.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"fnv",
|
||||||
|
"uuid",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-if"
|
name = "cfg-if"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
|
@ -2347,6 +2358,15 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "infer"
|
||||||
|
version = "0.16.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847"
|
||||||
|
dependencies = [
|
||||||
|
"cfb",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "inout"
|
name = "inout"
|
||||||
version = "0.1.3"
|
version = "0.1.3"
|
||||||
|
@ -2507,6 +2527,7 @@ dependencies = [
|
||||||
"encoding_rs",
|
"encoding_rs",
|
||||||
"enum-map",
|
"enum-map",
|
||||||
"futures",
|
"futures",
|
||||||
|
"infer",
|
||||||
"jsonwebtoken",
|
"jsonwebtoken",
|
||||||
"lemmy_db_schema",
|
"lemmy_db_schema",
|
||||||
"lemmy_db_views",
|
"lemmy_db_views",
|
||||||
|
@ -2878,7 +2899,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
|
checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"windows-targets 0.48.5",
|
"windows-targets 0.52.6",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
@ -66,6 +66,7 @@ enum-map = { workspace = true }
|
||||||
urlencoding = { workspace = true }
|
urlencoding = { workspace = true }
|
||||||
mime = { version = "0.3.17", optional = true }
|
mime = { version = "0.3.17", optional = true }
|
||||||
mime_guess = "2.0.5"
|
mime_guess = "2.0.5"
|
||||||
|
infer = "0.16.0"
|
||||||
webpage = { version = "2.0", default-features = false, features = [
|
webpage = { version = "2.0", default-features = false, features = [
|
||||||
"serde",
|
"serde",
|
||||||
], optional = true }
|
], optional = true }
|
||||||
|
|
|
@ -23,6 +23,7 @@ use lemmy_utils::{
|
||||||
REQWEST_TIMEOUT,
|
REQWEST_TIMEOUT,
|
||||||
VERSION,
|
VERSION,
|
||||||
};
|
};
|
||||||
|
use mime::{Mime, TEXT_HTML};
|
||||||
use reqwest::{
|
use reqwest::{
|
||||||
header::{CONTENT_TYPE, RANGE},
|
header::{CONTENT_TYPE, RANGE},
|
||||||
Client,
|
Client,
|
||||||
|
@ -63,47 +64,54 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
|
||||||
.await?
|
.await?
|
||||||
.error_for_status()?;
|
.error_for_status()?;
|
||||||
|
|
||||||
// In some cases servers send a wrong mime type for images, which prevents thumbnail
|
let mut content_type: Option<Mime> = response
|
||||||
// generation. To avoid this we also try to guess the mime type from file extension.
|
.headers()
|
||||||
let content_type = mime_guess::from_path(url.path())
|
.get(CONTENT_TYPE)
|
||||||
.first()
|
.and_then(|h| h.to_str().ok())
|
||||||
// If you can guess that its an image type, then return that first.
|
.and_then(|h| h.parse().ok())
|
||||||
.filter(|guess| guess.type_() == mime::IMAGE)
|
// If we don't get a content_type from the response (e.g. if the server is down),
|
||||||
// Otherwise, get the content type from the headers
|
// then try to infer the content_type from the file extension.
|
||||||
.or(
|
.or(mime_guess::from_path(url.path()).first());
|
||||||
response
|
|
||||||
.headers()
|
|
||||||
.get(CONTENT_TYPE)
|
|
||||||
.and_then(|h| h.to_str().ok())
|
|
||||||
.and_then(|h| h.parse().ok()),
|
|
||||||
);
|
|
||||||
|
|
||||||
let opengraph_data = {
|
let opengraph_data = {
|
||||||
// if the content type is not text/html, we don't need to parse it
|
|
||||||
let is_html = content_type
|
let is_html = content_type
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|c| {
|
.map(|c| {
|
||||||
(c.type_() == mime::TEXT && c.subtype() == mime::HTML)
|
// application/xhtml+xml is a subset of HTML
|
||||||
||
|
let application_xhtml: Mime = "application/xhtml+xml".parse::<Mime>().unwrap_or(TEXT_HTML);
|
||||||
// application/xhtml+xml is a subset of HTML
|
let allowed_mime_types = [TEXT_HTML.essence_str(), application_xhtml.essence_str()];
|
||||||
(c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
|
allowed_mime_types.contains(&c.essence_str())
|
||||||
})
|
})
|
||||||
.unwrap_or(false);
|
.unwrap_or_default();
|
||||||
if !is_html {
|
|
||||||
Default::default()
|
if is_html {
|
||||||
} else {
|
|
||||||
// Can't use .text() here, because it only checks the content header, not the actual bytes
|
// Can't use .text() here, because it only checks the content header, not the actual bytes
|
||||||
// https://github.com/LemmyNet/lemmy/issues/1964
|
// https://github.com/LemmyNet/lemmy/issues/1964
|
||||||
// So we want to do deep inspection of the actually returned bytes but need to be careful not
|
// So we want to do deep inspection of the actually returned bytes but need to be careful
|
||||||
// spend too much time parsing binary data as HTML
|
// not spend too much time parsing binary data as HTML
|
||||||
|
|
||||||
// only take first bytes regardless of how many bytes the server returns
|
// only take first bytes regardless of how many bytes the server returns
|
||||||
let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
|
let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
|
||||||
extract_opengraph_data(&html_bytes, url)
|
extract_opengraph_data(&html_bytes, url)
|
||||||
.map_err(|e| info!("{e}"))
|
.map_err(|e| info!("{e}"))
|
||||||
.unwrap_or_default()
|
.unwrap_or_default()
|
||||||
|
} else {
|
||||||
|
let is_octet_type = content_type
|
||||||
|
.as_ref()
|
||||||
|
.map(|c| c.subtype() == "octet-stream")
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
// Overwrite the content type if its an octet type
|
||||||
|
if is_octet_type {
|
||||||
|
// Don't need to fetch as much data for this as we do with opengraph
|
||||||
|
let octet_bytes = collect_bytes_until_limit(response, 512).await?;
|
||||||
|
content_type =
|
||||||
|
infer::get(&octet_bytes).map_or(content_type, |t| t.mime_type().parse().ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
Default::default()
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(LinkMetadata {
|
Ok(LinkMetadata {
|
||||||
opengraph_data,
|
opengraph_data,
|
||||||
content_type: content_type.map(|c| c.to_string()),
|
content_type: content_type.map(|c| c.to_string()),
|
||||||
|
|
Loading…
Reference in a new issue