Added html2md crate to parse comment html from pleroma (fixes #1461)

This commit is contained in:
Felix Ableitner 2021-10-20 17:48:10 +02:00
parent 153ec0d7aa
commit c514f56158
3 changed files with 84 additions and 5 deletions

71
Cargo.lock generated
View file

@ -614,6 +614,12 @@ version = "1.0.71"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79c2681d6594606957bbb8631c4b90a7fcaaa72cdb714743a437b156d6a7eedd"
[[package]]
name = "cesu8"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
[[package]]
name = "cfg-if"
version = "1.0.0"
@ -667,6 +673,16 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
[[package]]
name = "combine"
version = "4.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a909e4d93292cd8e9c42e189f61681eff9d67b6541f96b8a1a737f23737bd001"
dependencies = [
"bytes",
"memchr",
]
[[package]]
name = "comrak"
version = "0.12.1"
@ -1371,6 +1387,20 @@ version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a164bb2ceaeff4f42542bdb847c41517c78a60f5649671b2a07312b6e117549"
[[package]]
name = "html2md"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61f6bf799d9770725ec13d66f4af9344e96285dc14d8e71e0fe02d272690667f"
dependencies = [
"html5ever 0.25.1",
"jni",
"lazy_static",
"markup5ever_rcdom",
"percent-encoding",
"regex",
]
[[package]]
name = "html5ever"
version = "0.22.5"
@ -1601,6 +1631,26 @@ version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
[[package]]
name = "jni"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6df18c2e3db7e453d3c6ac5b3e9d5182664d28788126d39b91f2d1e22b017ec"
dependencies = [
"cesu8",
"combine",
"jni-sys",
"log",
"thiserror",
"walkdir",
]
[[package]]
name = "jni-sys"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130"
[[package]]
name = "jpeg-decoder"
version = "0.1.22"
@ -1768,6 +1818,7 @@ dependencies = [
"chrono",
"diesel",
"futures",
"html2md",
"http",
"http-signature-normalization-actix",
"itertools",
@ -3104,6 +3155,15 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "schannel"
version = "0.1.19"
@ -4033,6 +4093,17 @@ version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
[[package]]
name = "walkdir"
version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
dependencies = [
"same-file",
"winapi",
"winapi-util",
]
[[package]]
name = "want"
version = "0.3.0"

View file

@ -49,6 +49,7 @@ anyhow = "1.0.44"
thiserror = "1.0.29"
background-jobs = "0.9.0"
reqwest = { version = "0.11.4", features = ["json"] }
html2md = "0.2.13"
[dev-dependencies]
serial_test = "0.5.1"

View file

@ -16,6 +16,7 @@ use activitystreams::{
};
use anyhow::{anyhow, Context};
use chrono::{DateTime, FixedOffset};
use html2md::parse_html;
use lemmy_api_common::blocking;
use lemmy_apub_lib::{
traits::{ApubObject, FromApub, ToApub},
@ -284,12 +285,11 @@ impl FromApub for ApubComment {
}
let content = if let SourceCompat::Lemmy(source) = &note.source {
&source.content
source.content.clone()
} else {
// TODO: convert from html to markdown
&note.content
parse_html(&note.content)
};
let content_slurs_removed = remove_slurs(content, &context.settings().slur_regex());
let content_slurs_removed = remove_slurs(&content, &context.settings().slur_regex());
let form = CommentForm {
creator_id: creator.id,
@ -373,8 +373,15 @@ mod tests {
.unwrap();
assert_eq!(comment.ap_id.clone().into_inner(), pleroma_url);
assert_eq!(comment.content.len(), 179);
assert_eq!(comment.content.len(), 64);
assert!(!comment.local);
assert_eq!(request_counter, 0);
}
/// Checks that converting remote HTML to markdown keeps formatting but not scripts.
///
/// The input pairs an empty `<script>` element with a bold `hello`; the expected
/// output is exactly the markdown bold text, so the script element must contribute
/// nothing to the converted string.
#[actix_rt::test]
#[serial]
async fn test_html_to_markdown_sanitize() {
  // Pass the literal directly: it is already a `&str`, so an extra `&` would
  // only create a `&&str` that relies on auto-deref to compile.
  let parsed = parse_html("<script></script><b>hello</b>");
  assert_eq!(parsed, "**hello**");
}
}