Auto-detect post language (fixes #2870)

This commit is contained in:
Felix Ableitner 2024-11-21 16:18:05 +01:00
parent 63ea99d38a
commit d566710e96
6 changed files with 1114 additions and 42 deletions

1004
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -120,6 +120,7 @@ reqwest = { version = "0.12.7", default-features = false, features = [
"gzip",
"rustls-tls",
] }
cfg-if = "1"
reqwest-middleware = "0.3.3"
reqwest-tracing = "0.5.3"
clokwerk = "0.4.0"

View file

@ -38,7 +38,7 @@ full = [
"rustls",
"i-love-jesus",
"tuplex",
"diesel-bind-if-some",
"diesel-bind-if-some","lingua"
]
[dependencies]
@ -82,6 +82,8 @@ diesel-bind-if-some = { workspace = true, optional = true }
moka.workspace = true
derive-new.workspace = true
tuplex = { workspace = true, optional = true }
lingua = { version = "1.6.2", optional = true }
cfg-if.workspace =true
[dev-dependencies]
serial_test = { workspace = true }

View file

@ -0,0 +1,84 @@
use crate::{newtypes::LanguageId, source::language::Language, utils::DbPool};
use lemmy_utils::error::LemmyResult;
use lingua::{IsoCode639_1, Language as LinguaLanguage, LanguageDetectorBuilder};
/// Detects the natural language of `input` and resolves it to a [`LanguageId`].
///
/// Detection is currently restricted to English, Spanish and German. `LanguageId(0)` is
/// returned when the detector cannot settle on a language, or when its confidence for the
/// detected language is below `0.4`.
/// NOTE(review): id 0 is presumably the "undetermined" language row — confirm.
///
/// # Errors
///
/// Propagates any error returned by [`Language::read_id_from_code`].
pub async fn detect_language(input: &str, pool: &mut DbPool<'_>) -> LemmyResult<LanguageId> {
  // TODO: should only detect languages which are allowed in community
  let detector = LanguageDetectorBuilder::from_iso_codes_639_1(&[
    IsoCode639_1::EN,
    IsoCode639_1::ES,
    IsoCode639_1::DE,
  ])
  .build();

  let lang: Option<LinguaLanguage> = detector.detect_language_of(input);
  let Some(lang) = lang else {
    return Ok(LanguageId(0));
  };

  // Score the text that was actually submitted. Scoring a fixed sample sentence would make
  // the threshold below depend only on which language was detected, not on how well the
  // input matches it.
  let confidence = detector.compute_language_confidence(input, lang);
  if confidence < 0.4 {
    return Ok(LanguageId(0));
  }

  // The lower-cased ISO 639-1 code is what gets handed to the database lookup.
  let code = lang.iso_code_639_1().to_string().to_lowercase();
  Ok(Language::read_id_from_code(pool, &code).await?)
}
#[cfg(test)]
#[expect(clippy::indexing_slicing)]
mod tests {
  use super::*;
  use crate::utils::build_db_pool_for_tests;
  use pretty_assertions::assert_eq;
  use serial_test::serial;

  /// Runs `detect_language` over a fixed table of sample texts and checks the
  /// `LanguageId` returned for each one, in order.
  #[tokio::test]
  #[serial]
  async fn test_detect_language() -> LemmyResult<()> {
    let pool = &build_db_pool_for_tests();
    let pool = &mut pool.into();

    // (expected id, sample text) pairs; evaluated top to bottom.
    let samples = [
      // some easy comments
      (
        LanguageId(37),
        "I don't think it's supposed to be taken seriously. It's just a throwaway meme.
",
      ),
      (
        LanguageId(39),
        "Oh! Mencion casual de la mejor pelicula navideña… Die hard!
",
      ),
      (
        LanguageId(32),
        "Die Forderung finde ich nutzlos.
",
      ),
      // different languages
      (
        LanguageId(0),
        "Die Forderung finde ich nutzlos. It's just a throwaway meme.
",
      ),
    ];

    for (expected, text) in samples {
      assert_eq!(expected, detect_language(text, pool).await?);
    }

    Ok(())
  }
}

View file

@ -1,42 +1,37 @@
#![recursion_limit = "256"]
use cfg_if::cfg_if;
#[cfg(feature = "full")]
#[macro_use]
extern crate diesel;
#[cfg(feature = "full")]
#[macro_use]
extern crate diesel_derive_newtype;
#[cfg(feature = "full")]
#[macro_use]
extern crate diesel_derive_enum;
// this is used in tests
#[cfg(feature = "full")]
#[macro_use]
extern crate diesel_migrations;
#[cfg(feature = "full")]
#[macro_use]
extern crate async_trait;
pub mod aggregates;
#[cfg(feature = "full")]
pub mod impls;
pub mod newtypes;
pub mod sensitive;
#[cfg(feature = "full")]
#[rustfmt::skip]
pub mod schema;
#[cfg(feature = "full")]
pub mod aliases {
cfg_if! {
if #[cfg(feature = "full")] {
#[macro_use]
extern crate diesel;
#[macro_use]
extern crate diesel_derive_newtype;
#[macro_use]
extern crate diesel_derive_enum;
// this is used in tests
#[macro_use]
extern crate diesel_migrations;
#[macro_use]
extern crate async_trait;
pub mod impls;
#[rustfmt::skip]
pub mod schema;
pub mod detect_language;
/// Table aliases declared with `diesel::alias!`.
///
/// Declares one alias of `community_actions` (`creator_community_actions`) and two
/// aliases of `person` (`person1`, `person2`).
// NOTE(review): presumably these exist so a query can reference the same table more
// than once — confirm against the call sites.
pub mod aliases {
use crate::schema::{community_actions, person};
// Each entry reads `table as alias_name: AliasType`.
diesel::alias!(
community_actions as creator_community_actions: CreatorCommunityActions,
person as person1: Person1,
person as person2: Person2,
);
}
}
}
pub mod aggregates;
pub mod newtypes;
pub mod sensitive;
pub mod source;
#[cfg(feature = "full")]
pub mod traits;

View file

@ -83,7 +83,7 @@ lettre = { version = "0.11.10", default-features = false, features = [
markdown-it = { version = "0.6.1", optional = true }
ts-rs = { workspace = true, optional = true }
enum-map = { workspace = true, optional = true }
cfg-if = "1"
cfg-if.workspace = true
clearurls = { version = "0.0.4", features = ["linkify"] }
markdown-it-block-spoiler = "1.0.0"
markdown-it-sub = "1.0.0"