Removing scheme from block urls. Fixes #4656 (#4659)

* Removing scheme from block urls. Fixes #4656 * Fix comment. * Fixing domain checking. * Removing pointless URL building in url blocklist regex. * Remove trailing /
2024-12-23 03:11:32 +00:00 · 2024-04-23 23:15:20 -04:00 · 2024-04-23 23:15:20 -04:00 · 66e06b3952
commit 66e06b3952
parent 6b9d9dfaa5
2 changed files with 48 additions and 38 deletions
--- a/crates/api_common/src/utils.rs
+++ b/crates/api_common/src/utils.rs
@ -536,25 +536,8 @@ pub async fn get_url_blocklist(context: &LemmyContext) -> LemmyResult<RegexSet>
      .try_get_with::<_, LemmyError>((), async {
        let urls = LocalSiteUrlBlocklist::get_all(&mut context.pool()).await?;

-        let regexes = urls.iter().map(|url| {
-          let url = &url.url;
-          let parsed = Url::parse(url).expect("Coundln't parse URL.");
-          if url.ends_with('/') {
-            format!(
-              "({}://)?{}{}?",
-              parsed.scheme(),
-              escape(parsed.domain().expect("No domain.")),
-              escape(parsed.path())
-            )
-          } else {
-            format!(
-              "({}://)?{}{}",
-              parsed.scheme(),
-              escape(parsed.domain().expect("No domain.")),
-              escape(parsed.path())
-            )
-          }
-        });
+        // The urls are already validated on saving, so just escape them.
+        let regexes = urls.iter().map(|url| escape(&url.url));

        let set = RegexSet::new(regexes)?;
        Ok(set)
--- a/crates/utils/src/utils/validation.rs
+++ b/crates/utils/src/utils/validation.rs
@ -309,21 +309,44 @@ pub fn is_url_blocked(url: &Option<Url>, blocklist: &RegexSet) -> LemmyResult<()
  Ok(())
 }

+/// Check that urls are valid, and also remove the scheme, and uniques
 pub fn check_urls_are_valid(urls: &Vec<String>) -> LemmyResult<Vec<String>> {
  let mut parsed_urls = vec![];
  for url in urls {
-    let url = Url::parse(url).or_else(|e| {
-      if e == ParseError::RelativeUrlWithoutBase {
-        Url::parse(&format!("https://{url}"))
-      } else {
-        Err(e)
-      }
-    })?;
-
-    parsed_urls.push(url.to_string());
+    parsed_urls.push(build_url_str_without_scheme(url)?);
  }

-  Ok(parsed_urls)
+  let unique_urls = parsed_urls.into_iter().unique().collect();
+  Ok(unique_urls)
+}
+
+pub fn build_url_str_without_scheme(url_str: &str) -> LemmyResult<String> {
+  // Parse and check for errors
+  let mut url = Url::parse(url_str).or_else(|e| {
+    if e == ParseError::RelativeUrlWithoutBase {
+      Url::parse(&format!("http://{url_str}"))
+    } else {
+      Err(e)
+    }
+  })?;
+
+  // Set the scheme to http, then remove the http:// part
+  url
+    .set_scheme("http")
+    .map_err(|_| LemmyErrorType::InvalidUrl)?;
+
+  let mut out = url
+    .to_string()
+    .get(7..)
+    .ok_or(LemmyErrorType::InvalidUrl)?
+    .to_string();
+
+  // Remove trailing / if necessary
+  if out.ends_with('/') {
+    out.pop();
+  }
+
+  Ok(out)
 }

 #[cfg(test)]
@ -600,17 +623,21 @@ mod tests {

  #[test]
  fn test_url_parsed() {
+    // Make sure the scheme is removed, and uniques also
    assert_eq!(
-      vec![String::from("https://example.com/")],
-      check_urls_are_valid(&vec![String::from("example.com")]).unwrap()
+      &check_urls_are_valid(&vec![
+        "example.com".to_string(),
+        "http://example.com".to_string(),
+        "https://example.com".to_string(),
+        "https://example.com/test?q=test2&q2=test3#test4".to_string(),
+      ])
+      .unwrap(),
+      &vec![
+        "example.com".to_string(),
+        "example.com/test?q=test2&q2=test3#test4".to_string()
+      ],
    );

-    assert!(check_urls_are_valid(&vec![
-      String::from("example.com"),
-      String::from("https://example.blog")
-    ])
-    .is_ok());
-
-    assert!(check_urls_are_valid(&vec![String::from("https://example .com"),]).is_err());
+    assert!(check_urls_are_valid(&vec!["https://example .com".to_string()]).is_err());
  }
 }