From f76322e3f5e6792df569b01d501238a5efc683f0 Mon Sep 17 00:00:00 2001 From: Richard Schwab Date: Fri, 27 Dec 2024 18:53:23 +0100 Subject: [PATCH] Reduce false positives in URL blocklist to reduce scunthorpe problem by matching at word boundaries. (#5282) This addresses an issue brought up on matrix where blocking rt.com resulted in links to deviantart.com getting blocked. --- crates/api_common/src/utils.rs | 4 +- crates/api_crud/src/site/update.rs | 2 + crates/utils/src/utils/markdown/mod.rs | 62 +++++++++++++++++--------- 3 files changed, 45 insertions(+), 23 deletions(-) diff --git a/crates/api_common/src/utils.rs b/crates/api_common/src/utils.rs index 80f559edb..ee232c93a 100644 --- a/crates/api_common/src/utils.rs +++ b/crates/api_common/src/utils.rs @@ -552,7 +552,9 @@ pub async fn get_url_blocklist(context: &LemmyContext) -> LemmyResult<RegexSet> let urls = LocalSiteUrlBlocklist::get_all(&mut context.pool()).await?; // The urls are already validated on saving, so just escape them. - let regexes = urls.iter().map(|url| escape(&url.url)); + // If this regex creation changes it must be synced with + // lemmy_utils::utils::markdown::create_url_blocklist_test_regex_set. + let regexes = urls.iter().map(|url| format!(r"\b{}\b", escape(&url.url))); let set = RegexSet::new(regexes)?; Ok(set) diff --git a/crates/api_crud/src/site/update.rs b/crates/api_crud/src/site/update.rs index d2585ea43..8b0dfe0c5 100644 --- a/crates/api_crud/src/site/update.rs +++ b/crates/api_crud/src/site/update.rs @@ -151,6 +151,8 @@ pub async fn update_site( .ok(); if let Some(url_blocklist) = data.blocked_urls.clone() { + // If this validation changes it must be synced with + // lemmy_utils::utils::markdown::create_url_blocklist_test_regex_set. 
let parsed_urls = check_urls_are_valid(&url_blocklist)?; LocalSiteUrlBlocklist::replace(&mut context.pool(), parsed_urls).await?; } diff --git a/crates/utils/src/utils/markdown/mod.rs b/crates/utils/src/utils/markdown/mod.rs index ba509596e..25ac0ffd6 100644 --- a/crates/utils/src/utils/markdown/mod.rs +++ b/crates/utils/src/utils/markdown/mod.rs @@ -47,8 +47,10 @@ pub fn markdown_check_for_blocked_urls(text: &str, blocklist: &RegexSet) -> Lemm mod tests { use super::*; + use crate::utils::validation::check_urls_are_valid; use image_links::markdown_rewrite_image_links; use pretty_assertions::assert_eq; + use regex::escape; #[test] fn test_basic_markdown() { @@ -191,9 +193,20 @@ mod tests { }); } + // This replicates the logic when saving url blocklist patterns and querying them. + // Refer to lemmy_api_crud::site::update::update_site and + // lemmy_api_common::utils::get_url_blocklist(). + fn create_url_blocklist_test_regex_set(patterns: Vec<&str>) -> LemmyResult<RegexSet> { + let url_blocklist = patterns.iter().map(|&s| s.to_string()).collect(); + let valid_urls = check_urls_are_valid(&url_blocklist)?; + let regexes = valid_urls.iter().map(|p| format!(r"\b{}\b", escape(p))); + let set = RegexSet::new(regexes)?; + Ok(set) + } + #[test] fn test_url_blocking() -> LemmyResult<()> { - let set = RegexSet::new(vec![r"(https://)?example\.com/?"])?; + let set = create_url_blocklist_test_regex_set(vec!["example.com/"])?; assert!( markdown_check_for_blocked_urls(&String::from("[](https://example.com)"), &set).is_err() @@ -221,37 +234,42 @@ mod tests { ) .is_err()); - let set = RegexSet::new(vec![r"(https://)?example\.com/spam\.jpg"])?; - assert!(markdown_check_for_blocked_urls( - &String::from("![](https://example.com/spam.jpg)"), - &set - ) - .is_err()); + let set = create_url_blocklist_test_regex_set(vec!["example.com/spam.jpg"])?; + assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg)", &set).is_err()); + 
assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg1)", &set).is_ok()); + // TODO: the following should not be matched, scunthorpe problem. + assert!( + markdown_check_for_blocked_urls("![](https://example.com/spam.jpg.html)", &set).is_err() + ); - let set = RegexSet::new(vec![ - r"(https://)?quo\.example\.com/?", - r"(https://)?foo\.example\.com/?", - r"(https://)?bar\.example\.com/?", + let set = create_url_blocklist_test_regex_set(vec![ + r"quo.example.com/", + r"foo.example.com/", + r"bar.example.com/", ])?; - assert!( - markdown_check_for_blocked_urls(&String::from("https://baz.example.com"), &set).is_ok() - ); + assert!(markdown_check_for_blocked_urls("https://baz.example.com", &set).is_ok()); - assert!( - markdown_check_for_blocked_urls(&String::from("https://bar.example.com"), &set).is_err() - ); + assert!(markdown_check_for_blocked_urls("https://bar.example.com", &set).is_err()); - let set = RegexSet::new(vec![r"(https://)?example\.com/banned_page"])?; + let set = create_url_blocklist_test_regex_set(vec!["example.com/banned_page"])?; - assert!( - markdown_check_for_blocked_urls(&String::from("https://example.com/page"), &set).is_ok() - ); + assert!(markdown_check_for_blocked_urls("https://example.com/page", &set).is_ok()); - let set = RegexSet::new(vec![r"(https://)?ex\.mple\.com/?"])?; + let set = create_url_blocklist_test_regex_set(vec!["ex.mple.com/"])?; assert!(markdown_check_for_blocked_urls("example.com", &set).is_ok()); + let set = create_url_blocklist_test_regex_set(vec!["rt.com/"])?; + + assert!(markdown_check_for_blocked_urls("deviantart.com", &set).is_ok()); + assert!(markdown_check_for_blocked_urls("art.com.example.com", &set).is_ok()); + assert!(markdown_check_for_blocked_urls("https://rt.com/abc", &set).is_err()); + assert!(markdown_check_for_blocked_urls("go to rt.com.", &set).is_err()); + assert!(markdown_check_for_blocked_urls("check out rt.computer", &set).is_ok()); + // TODO: the following should not be matched, 
scunthorpe problem. + assert!(markdown_check_for_blocked_urls("rt.com.example.com", &set).is_err()); + Ok(()) }