fix: Run extract_opengraph_data only on first 64kB of data and if Content-Type html (#4957)
* fix: Run extract_opengraph_data only on first 64kB of data and if data is not binary. * use mime type for determination * chore: simplify collect function
This commit is contained in:
parent
88fbcea246
commit
606545ccaf
1 changed files with 62 additions and 8 deletions
|
@ -8,6 +8,7 @@ use crate::{
|
||||||
use activitypub_federation::config::Data;
|
use activitypub_federation::config::Data;
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use encoding_rs::{Encoding, UTF_8};
|
use encoding_rs::{Encoding, UTF_8};
|
||||||
|
use futures::StreamExt;
|
||||||
use lemmy_db_schema::{
|
use lemmy_db_schema::{
|
||||||
newtypes::DbUrl,
|
newtypes::DbUrl,
|
||||||
source::{
|
source::{
|
||||||
|
@ -23,7 +24,12 @@ use lemmy_utils::{
|
||||||
VERSION,
|
VERSION,
|
||||||
};
|
};
|
||||||
use mime::Mime;
|
use mime::Mime;
|
||||||
use reqwest::{header::CONTENT_TYPE, Client, ClientBuilder};
|
use reqwest::{
|
||||||
|
header::{CONTENT_TYPE, RANGE},
|
||||||
|
Client,
|
||||||
|
ClientBuilder,
|
||||||
|
Response,
|
||||||
|
};
|
||||||
use reqwest_middleware::ClientWithMiddleware;
|
use reqwest_middleware::ClientWithMiddleware;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
@ -44,7 +50,17 @@ pub fn client_builder(settings: &Settings) -> ClientBuilder {
|
||||||
#[tracing::instrument(skip_all)]
|
#[tracing::instrument(skip_all)]
|
||||||
pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResult<LinkMetadata> {
|
pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResult<LinkMetadata> {
|
||||||
info!("Fetching site metadata for url: {}", url);
|
info!("Fetching site metadata for url: {}", url);
|
||||||
let response = context.client().get(url.as_str()).send().await?;
|
// We only fetch the first 64kB of data in order to not waste bandwidth especially for large
|
||||||
|
// binary files
|
||||||
|
let bytes_to_fetch = 64 * 1024;
|
||||||
|
let response = context
|
||||||
|
.client()
|
||||||
|
.get(url.as_str())
|
||||||
|
// we only need the first chunk of data. Note that we do not check for Accept-Range so the
|
||||||
|
// server may ignore this and still respond with the full response
|
||||||
|
.header(RANGE, format!("bytes=0-{}", bytes_to_fetch - 1)) /* -1 because inclusive */
|
||||||
|
.send()
|
||||||
|
.await?;
|
||||||
|
|
||||||
let content_type: Option<Mime> = response
|
let content_type: Option<Mime> = response
|
||||||
.headers()
|
.headers()
|
||||||
|
@ -52,19 +68,57 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
|
||||||
.and_then(|h| h.to_str().ok())
|
.and_then(|h| h.to_str().ok())
|
||||||
.and_then(|h| h.parse().ok());
|
.and_then(|h| h.parse().ok());
|
||||||
|
|
||||||
// Can't use .text() here, because it only checks the content header, not the actual bytes
|
let opengraph_data = {
|
||||||
// https://github.com/LemmyNet/lemmy/issues/1964
|
// if the content type is not text/html, we don't need to parse it
|
||||||
let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
|
let is_html = content_type
|
||||||
|
.as_ref()
|
||||||
|
.map(|c| {
|
||||||
|
(c.type_() == mime::TEXT && c.subtype() == mime::HTML)
|
||||||
|
||
|
||||||
|
// application/xhtml+xml is a subset of HTML
|
||||||
|
(c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
|
||||||
|
})
|
||||||
|
.unwrap_or(false);
|
||||||
|
if !is_html {
|
||||||
|
Default::default()
|
||||||
|
} else {
|
||||||
|
// Can't use .text() here, because it only checks the content header, not the actual bytes
|
||||||
|
// https://github.com/LemmyNet/lemmy/issues/1964
|
||||||
|
// So we want to do deep inspection of the actually returned bytes but need to be careful not
|
||||||
|
// spend too much time parsing binary data as HTML
|
||||||
|
|
||||||
let opengraph_data = extract_opengraph_data(&html_bytes, url)
|
// only take first bytes regardless of how many bytes the server returns
|
||||||
.map_err(|e| info!("{e}"))
|
let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
|
||||||
.unwrap_or_default();
|
extract_opengraph_data(&html_bytes, url)
|
||||||
|
.map_err(|e| info!("{e}"))
|
||||||
|
.unwrap_or_default()
|
||||||
|
}
|
||||||
|
};
|
||||||
Ok(LinkMetadata {
|
Ok(LinkMetadata {
|
||||||
opengraph_data,
|
opengraph_data,
|
||||||
content_type: content_type.map(|c| c.to_string()),
|
content_type: content_type.map(|c| c.to_string()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn collect_bytes_until_limit(
|
||||||
|
response: Response,
|
||||||
|
requested_bytes: usize,
|
||||||
|
) -> Result<Vec<u8>, LemmyError> {
|
||||||
|
let mut stream = response.bytes_stream();
|
||||||
|
let mut bytes = Vec::with_capacity(requested_bytes);
|
||||||
|
while let Some(chunk) = stream.next().await {
|
||||||
|
let chunk = chunk.map_err(LemmyError::from)?;
|
||||||
|
// we may go over the requested size here but the important part is we don't keep aggregating
|
||||||
|
// more chunks than needed
|
||||||
|
bytes.extend_from_slice(&chunk);
|
||||||
|
if bytes.len() >= requested_bytes {
|
||||||
|
bytes.truncate(requested_bytes);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(bytes)
|
||||||
|
}
|
||||||
|
|
||||||
/// Generates and saves a post thumbnail and metadata.
|
/// Generates and saves a post thumbnail and metadata.
|
||||||
///
|
///
|
||||||
/// Takes a callback to generate a send activity task, so that post can be federated with metadata.
|
/// Takes a callback to generate a send activity task, so that post can be federated with metadata.
|
||||||
|
|
Loading…
Reference in a new issue