Use correct encoding when fetching non-UTF-8 site metadata (#2015)
* Use correct encoding when fetching non-UTF-8 site metadata
* Style fixes
This commit is contained in:
parent
e864685759
commit
661f97a073
2 changed files with 21 additions and 5 deletions
|
@@ -45,3 +45,4 @@ webpage = { version = "1.4.0", default-features = false, features = ["serde"] }
|
||||||
jsonwebtoken = "7.2.0"
|
jsonwebtoken = "7.2.0"
|
||||||
doku = "0.10.2"
|
doku = "0.10.2"
|
||||||
uuid = { version = "0.8.2", features = ["serde", "v4"] }
|
uuid = { version = "0.8.2", features = ["serde", "v4"] }
|
||||||
|
encoding = "0.2.33"
|
|
@@ -1,5 +1,6 @@
|
||||||
use crate::{settings::structs::Settings, version::VERSION, LemmyError};
|
use crate::{settings::structs::Settings, version::VERSION, LemmyError};
|
||||||
use anyhow::anyhow;
|
use anyhow::anyhow;
|
||||||
|
use encoding::{all::encodings, DecoderTrap};
|
||||||
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
|
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
|
||||||
use reqwest_middleware::ClientWithMiddleware;
|
use reqwest_middleware::ClientWithMiddleware;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
@@ -75,16 +76,17 @@ pub async fn fetch_site_metadata(
|
||||||
.map_err(|e| RecvError(e.to_string()))?
|
.map_err(|e| RecvError(e.to_string()))?
|
||||||
.to_vec();
|
.to_vec();
|
||||||
|
|
||||||
let html = String::from_utf8_lossy(&html_bytes);
|
let tags = html_to_site_metadata(&html_bytes)?;
|
||||||
|
|
||||||
let tags = html_to_site_metadata(&html)?;
|
|
||||||
|
|
||||||
Ok(tags)
|
Ok(tags)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn html_to_site_metadata(html: &str) -> Result<SiteMetadata, LemmyError> {
|
fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError> {
|
||||||
|
let html = String::from_utf8_lossy(html_bytes);
|
||||||
|
|
||||||
// Make sure the first line is doctype html
|
// Make sure the first line is doctype html
|
||||||
let first_line = html
|
let first_line = html
|
||||||
|
.trim_start()
|
||||||
.lines()
|
.lines()
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.next()
|
.next()
|
||||||
|
@@ -97,7 +99,20 @@ fn html_to_site_metadata(html: &str) -> Result<SiteMetadata, LemmyError> {
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
let page = HTML::from_string(html.to_string(), None)?;
|
let mut page = HTML::from_string(html.to_string(), None)?;
|
||||||
|
|
||||||
|
// If the web page specifies that it isn't actually UTF-8, re-decode the received bytes with the
|
||||||
|
// proper encoding. If the specified encoding cannot be found, fall back to the original UTF-8
|
||||||
|
// version.
|
||||||
|
if let Some(charset) = page.meta.get("charset") {
|
||||||
|
if charset.to_lowercase() != "utf-8" {
|
||||||
|
if let Some(encoding_ref) = encodings().iter().find(|e| e.name() == charset) {
|
||||||
|
if let Ok(html_with_encoding) = encoding_ref.decode(html_bytes, DecoderTrap::Replace) {
|
||||||
|
page = HTML::from_string(html_with_encoding, None)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let page_title = page.title;
|
let page_title = page.title;
|
||||||
let page_description = page.description;
|
let page_description = page.description;
|
||||||
|
|
Loading…
Reference in a new issue