From 0da2fd3d93a85c6dceb18e391783ab7bfa3bab36 Mon Sep 17 00:00:00 2001 From: Felix Ableitner Date: Mon, 21 Sep 2020 17:02:46 +0200 Subject: [PATCH] Sanitize HTML from federated data (#ref 647) --- Cargo.lock | 197 +++++++++++++++++++++++++++++++++++- Cargo.toml | 1 + src/apub/comment.rs | 3 +- src/apub/community.rs | 7 +- src/apub/post.rs | 11 +- src/apub/private_message.rs | 14 +-- src/apub/user.rs | 7 +- 7 files changed, 221 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c8c79d873..c7bb8ff59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -389,6 +389,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "ammonia" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89eac85170f4b3fb3dc5e442c1cfb036cb8eecf9dbbd431a161ffad15d90ea3b" +dependencies = [ + "html5ever", + "lazy_static", + "maplit", + "markup5ever_rcdom", + "matches", + "tendril", + "url", +] + [[package]] name = "ansi_term" version = "0.11.0" @@ -1325,6 +1340,16 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" +[[package]] +name = "futf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.5" @@ -1539,6 +1564,20 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "html5ever" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "http" version = "0.2.1" @@ -1835,6 +1874,7 @@ dependencies = [ "actix-rt", "actix-web", "actix-web-actors", + "ammonia", "anyhow", "async-trait", "awc", @@ 
-2026,12 +2066,47 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d947cbb889ed21c2a84be6ffbaebf5b4e0f4340638cba0444907e38b56be084" +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "maplit" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "markup5ever" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" +dependencies = [ + "log", + "phf", + "phf_codegen", + "serde 1.0.116", + "serde_derive", + "serde_json", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -2183,6 +2258,12 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + [[package]] name = "nom" version = "4.2.3" @@ -2417,6 +2498,44 @@ dependencies = [ "sha-1 0.8.2", ] +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared", + "rand 0.7.3", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "0.4.23" @@ -2482,6 +2601,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "proc-macro-hack" version = "0.5.18" @@ -2576,7 +2701,7 @@ dependencies = [ "rand_isaac", "rand_jitter", "rand_os", - "rand_pcg", + "rand_pcg 0.1.2", "rand_xorshift", "winapi 0.3.9", ] @@ -2592,6 +2717,7 @@ dependencies = [ "rand_chacha 0.2.2", "rand_core 0.5.1", "rand_hc 0.2.0", + "rand_pcg 0.2.1", ] [[package]] @@ -2700,6 +2826,15 @@ dependencies = [ "rand_core 0.4.2", ] +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xorshift" version = "0.1.1" @@ -3095,6 +3230,12 @@ dependencies = [ "num-traits 0.2.12", ] +[[package]] +name = "siphasher" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" + [[package]] name = "slab" version = "0.4.2" @@ -3189,6 +3330,31 @@ version = "0.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0" +[[package]] +name = "string_cache" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2940c75beb4e3bf3a494cef919a747a2cb81e52571e212bfbd185074add7208a" +dependencies = [ + "lazy_static", + "new_debug_unreachable", + "phf_shared", + "precomputed-hash", + "serde 1.0.116", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "strsim" version = "0.8.0" @@ -3244,6 +3410,17 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "tendril" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "termcolor" version = "1.1.0" @@ -3576,6 +3753,12 @@ dependencies = [ "serde 1.0.116", ] +[[package]] +name = "utf-8" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" + [[package]] name = "uuid" version = "0.7.4" @@ -3842,3 +4025,15 @@ name = "xdg" version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d089681aa106a86fade1b0128fb5daf07d5867a509ab036d99988dec80429a57" + +[[package]] +name = "xml5ever" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b1b52e6e8614d4a58b8e70cf51ec0cc21b256ad8206708bcff8139b5bbd6a59" +dependencies = [ + "log", + "mac", + "markup5ever", + "time 0.1.44", +] diff --git a/Cargo.toml b/Cargo.toml index f04866847..ff17f849b 100644 --- 
a/Cargo.toml +++ b/Cargo.toml @@ -59,6 +59,7 @@ anyhow = "1.0" thiserror = "1.0" background-jobs = " 0.8" reqwest = { version = "0.10", features = ["json"] } +ammonia = "3.1.0" [dev-dependencies.cargo-husky] version = "1" diff --git a/src/apub/comment.rs b/src/apub/comment.rs index fc19ec334..9f5ca8f0e 100644 --- a/src/apub/comment.rs +++ b/src/apub/comment.rs @@ -38,6 +38,7 @@ use activitystreams::{ public, }; use actix_web::{body::Body, web, web::Path, HttpResponse}; +use ammonia::clean_text; use anyhow::Context; use itertools::Itertools; use lemmy_db::{ @@ -183,7 +184,7 @@ impl FromApub for CommentForm { creator_id: creator.id, post_id: post.id, parent_id, - content: content_slurs_removed, + content: clean_text(&content_slurs_removed), removed: None, read: None, published: note.published().map(|u| u.to_owned().naive_local()), diff --git a/src/apub/community.rs b/src/apub/community.rs index 54b2957ed..07d1b8471 100644 --- a/src/apub/community.rs +++ b/src/apub/community.rs @@ -36,6 +36,7 @@ use activitystreams::{ }; use activitystreams_ext::Ext2; use actix_web::{body::Body, web, HttpResponse}; +use ammonia::clean_text; use anyhow::Context; use itertools::Itertools; use lemmy_db::{ @@ -376,7 +377,7 @@ impl FromApub for CommunityForm { .content() .map(|s| s.as_single_xsd_string()) .flatten() - .map(|s| s.to_string()); + .map(|s| clean_text(s)); check_slurs(&name)?; check_slurs(&title)?; check_slurs_opt(&description)?; @@ -408,8 +409,8 @@ impl FromApub for CommunityForm { }; Ok(CommunityForm { - name, - title, + name: clean_text(&name), + title: clean_text(&title), description, category_id: group.ext_one.category.identifier.parse::<i32>()?, creator_id: creator.id, diff --git a/src/apub/post.rs b/src/apub/post.rs index 85e00fb23..983246d95 100644 --- a/src/apub/post.rs +++ b/src/apub/post.rs @@ -34,6 +34,7 @@ use activitystreams::{ }; use activitystreams_ext::Ext1; use actix_web::{body::Body, web, HttpResponse}; +use ammonia::{clean, clean_text}; use anyhow::Context; use 
lemmy_db::{ community::Community, @@ -168,17 +169,17 @@ fn extract_embed_from_apub( .flatten() .map(|s| s.as_xsd_string()) .flatten() - .map(|s| s.to_string()); + .map(|s| clean_text(s)); let description = preview_page .summary() .map(|s| s.as_single_xsd_string()) .flatten() - .map(|s| s.to_string()); + .map(|s| clean_text(s)); let html = preview_page .content() .map(|c| c.as_single_xsd_string()) .flatten() - .map(|s| s.to_string()); + .map(|s| clean(s)); Ok(EmbedType { title, description, @@ -262,11 +263,11 @@ impl FromApub for PostForm { .as_ref() .map(|c| c.as_single_xsd_string()) .flatten() - .map(|s| s.to_string()); + .map(|s| clean_text(s)); check_slurs(&name)?; let body_slurs_removed = body.map(|b| remove_slurs(&b)); Ok(PostForm { - name, + name: clean_text(&name), url, body: body_slurs_removed, creator_id: creator.id, diff --git a/src/apub/private_message.rs b/src/apub/private_message.rs index 9e0549eeb..4b3e7575d 100644 --- a/src/apub/private_message.rs +++ b/src/apub/private_message.rs @@ -26,6 +26,7 @@ use activitystreams::{ object::{kind::NoteType, Note, Tombstone}, prelude::*, }; +use ammonia::clean_text; use anyhow::Context; use lemmy_db::{ private_message::{PrivateMessage, PrivateMessageForm}, @@ -96,16 +97,17 @@ impl FromApub for PrivateMessageForm { let recipient = get_or_fetch_and_upsert_user(&recipient_actor_id, context).await?; let ap_id = note.id_unchecked().context(location_info!())?.to_string(); check_is_apub_id_valid(&Url::parse(&ap_id)?)?; + let content = note + .content() + .context(location_info!())? + .as_single_xsd_string() + .context(location_info!())? + .to_string(); Ok(PrivateMessageForm { creator_id: creator.id, recipient_id: recipient.id, - content: note - .content() - .context(location_info!())? - .as_single_xsd_string() - .context(location_info!())? 
- .to_string(), + content: clean_text(&content), published: note.published().map(|u| u.to_owned().naive_local()), updated: note.updated().map(|u| u.to_owned().naive_local()), deleted: None, diff --git a/src/apub/user.rs b/src/apub/user.rs index fb650632e..80ff5c68d 100644 --- a/src/apub/user.rs +++ b/src/apub/user.rs @@ -26,6 +26,7 @@ use activitystreams::{ }; use activitystreams_ext::Ext1; use actix_web::{body::Body, web, HttpResponse}; +use ammonia::clean_text; use anyhow::Context; use lemmy_db::{ naive_now, @@ -242,19 +243,19 @@ impl FromApub for UserForm { .as_xsd_string() .context(location_info!())? .to_string(); - let preferred_username = person.inner.preferred_username().map(|u| u.to_string()); + let preferred_username = person.inner.preferred_username().map(|u| clean_text(u)); let bio = person .inner .summary() .map(|s| s.as_single_xsd_string()) .flatten() - .map(|s| s.to_string()); + .map(|s| clean_text(s)); check_slurs(&name)?; check_slurs_opt(&preferred_username)?; check_slurs_opt(&bio)?; Ok(UserForm { - name, + name: clean_text(&name), preferred_username, password_encrypted: "".to_string(), admin: false,