Trying another tokenizer fix. #1964

This commit is contained in:
Dessalines 2021-12-20 16:57:36 -05:00
parent c883a49a40
commit ad4715c2a3

View file

@@ -5,7 +5,7 @@ use reqwest_middleware::ClientWithMiddleware;
 use serde::{Deserialize, Serialize};
 use std::future::Future;
 use thiserror::Error;
-use tracing::error;
+use tracing::{error, info};
 use url::Url;
 use webpage::HTML;
@@ -64,12 +64,18 @@ pub async fn fetch_site_metadata(
   client: &ClientWithMiddleware,
   url: &Url,
 ) -> Result<SiteMetadata, LemmyError> {
+  info!("Fetching site metadata for url: {}", url);
   let response = client.get(url.as_str()).send().await?;
-  let html = response
-    .text()
+  // Can't use .text() here, because it only checks the content header, not the actual bytes
+  // https://github.com/LemmyNet/lemmy/issues/1964
+  let html_bytes = response
+    .bytes()
     .await
-    .map_err(|e| RecvError(e.to_string()))?;
+    .map_err(|e| RecvError(e.to_string()))?
+    .to_vec();
+  let html = String::from_utf8_lossy(&html_bytes);

   let tags = html_to_site_metadata(&html)?;
@@ -77,6 +83,20 @@ pub async fn fetch_site_metadata(
 }

 fn html_to_site_metadata(html: &str) -> Result<SiteMetadata, LemmyError> {
+  // Make sure the first line is doctype html
+  let first_line = html
+    .lines()
+    .into_iter()
+    .next()
+    .ok_or_else(|| LemmyError::from_message("No lines in html"))?
+    .to_lowercase();
+
+  if !first_line.starts_with("<!doctype html>") {
+    return Err(LemmyError::from_message(
+      "Site metadata page fetch is not DOCTYPE html",
+    ));
+  }
+
   let page = HTML::from_string(html.to_string(), None)?;
   let page_title = page.title;