Fix crash while rewriting markdown (fixes #5380) (#5395)

* Special characters in markdown links (ref #5380) * fix (fixes #5380) * fix test
2025-02-10 11:04:49 +00:00 · 2025-02-06 13:00:36 +00:00 · 2025-02-06 13:00:36 +00:00 · 865f0734ba
commit 865f0734ba
parent 20fa90a23c
2 changed files with 55 additions and 63 deletions
--- a/crates/utils/src/utils/markdown/image_links.rs
+++ b/crates/utils/src/utils/markdown/image_links.rs
@ -1,6 +1,15 @@
-use super::{link_rule::Link, MARKDOWN_PARSER};
-use crate::settings::SETTINGS;
-use markdown_it::{plugins::cmark::inline::image::Image, NodeValue};
+use super::link_rule::Link;
+use crate::{settings::SETTINGS, utils::markdown::link_rule};
+use markdown_it::{
+  parser::linkfmt::LinkFormatter,
+  plugins::cmark::{
+    block::fence,
+    inline::{image, image::Image},
+  },
+  MarkdownIt,
+  NodeValue,
+};
+use std::sync::LazyLock;
 use url::Url;
 use urlencoding::encode;

@ -55,7 +64,18 @@ pub fn markdown_find_links(src: &str) -> Vec<(usize, usize)> {

 // Walk the syntax tree to find positions of image or link urls
 fn find_urls<T: NodeValue + UrlAndTitle>(src: &str) -> Vec<(usize, usize)> {
-  let ast = MARKDOWN_PARSER.parse(src);
+  // Use separate markdown parser here, with most features disabled for faster parsing,
+  // and a dummy link formatter which doesnt normalize links.
+  static PARSER: LazyLock<MarkdownIt> = LazyLock::new(|| {
+    let mut p = MarkdownIt::new();
+    p.link_formatter = Box::new(NoopLinkFormatter {});
+    image::add(&mut p);
+    fence::add(&mut p);
+    link_rule::add(&mut p);
+    p
+  });
+
+  let ast = PARSER.parse(src);
  let mut links_offsets = vec![];
  ast.walk(|node, _depth| {
    if let Some(image) = node.cast::<T>() {
@ -94,6 +114,25 @@ impl UrlAndTitle for Link {
  }
 }

+/// markdown-it normalizes links by default, which breaks the link rewriting. So we use a dummy
+/// formatter here which does nothing. Note this isnt actually used to render the markdown.
+#[derive(Debug)]
+struct NoopLinkFormatter;
+
+impl LinkFormatter for NoopLinkFormatter {
+  fn validate_link(&self, _url: &str) -> Option<()> {
+    Some(())
+  }
+
+  fn normalize_link(&self, url: &str) -> String {
+    url.to_owned()
+  }
+
+  fn normalize_link_text(&self, url: &str) -> String {
+    url.to_owned()
+  }
+}
+
 #[cfg(test)]
 mod tests {

@ -107,6 +146,12 @@ mod tests {

    let links = find_urls::<Image>("![test](https://example.com)");
    assert_eq!(vec![(8, 27)], links);
+
+    let links = find_urls::<Image>("![ითხოვს](https://example.com/ითხოვს)");
+    assert_eq!(vec![(22, 60)], links);
+
+    let links = find_urls::<Image>("![test](https://example.com/%C3%A4%C3%B6%C3%BC.jpg)");
+    assert_eq!(vec![(8, 50)], links);
  }

  #[test]
@ -152,7 +197,12 @@ mod tests {
          "custom emoji support",
          r#"![party-blob](https://www.hexbear.net/pictrs/image/83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
          r#"![party-blob](https://lemmy-alpha/api/v4/image/proxy?url=https%3A%2F%2Fwww.hexbear.net%2Fpictrs%2Fimage%2F83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#
-        )
+        ),
+        (
+          "image with special chars",
+          "ითხოვს ![ითხოვს](http://example.com/ითხოვს%C3%A4.jpg)",
+          "ითხოვს ![ითხოვს](https://lemmy-alpha/api/v4/image/proxy?url=http%3A%2F%2Fexample.com%2F%E1%83%98%E1%83%97%E1%83%AE%E1%83%9D%E1%83%95%E1%83%A1%25C3%25A4.jpg)",
+        ),
      ];

    tests.iter().for_each(|&(msg, input, expected)| {
--- a/crates/utils/src/utils/markdown/mod.rs
+++ b/crates/utils/src/utils/markdown/mod.rs
@ -49,7 +49,6 @@ mod tests {

  use super::*;
  use crate::utils::validation::check_urls_are_valid;
-  use image_links::markdown_rewrite_image_links;
  use pretty_assertions::assert_eq;
  use regex::escape;

@ -148,63 +147,6 @@ mod tests {
    });
  }

-  #[test]
-  fn test_markdown_proxy_images() {
-    let tests: Vec<_> =
-      vec![
-        (
-          "remote image proxied",
-          "![link](http://example.com/image.jpg)",
-          "![link](https://lemmy-alpha/api/v4/image/proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
-        ),
-        (
-          "local image unproxied",
-          "![link](http://lemmy-alpha/image.jpg)",
-          "![link](http://lemmy-alpha/image.jpg)",
-        ),
-        (
-          "multiple image links",
-          "![link](http://example.com/image1.jpg) ![link](http://example.com/image2.jpg)",
-          "![link](https://lemmy-alpha/api/v4/image/proxy?url=http%3A%2F%2Fexample.com%2Fimage1.jpg) ![link](https://lemmy-alpha/api/v4/image/proxy?url=http%3A%2F%2Fexample.com%2Fimage2.jpg)",
-        ),
-        (
-          "empty link handled",
-          "![image]()",
-          "![image]()"
-        ),
-        (
-          "empty label handled",
-          "![](http://example.com/image.jpg)",
-          "![](https://lemmy-alpha/api/v4/image/proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)"
-        ),
-        (
-          "invalid image link removed",
-          "![image](http-not-a-link)",
-          "![image]()"
-        ),
-        (
-          "label with nested markdown handled",
-          "![a *b* c](http://example.com/image.jpg)",
-          "![a *b* c](https://lemmy-alpha/api/v4/image/proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)"
-        ),
-        (
-          "custom emoji support",
-          r#"![party-blob](https://www.hexbear.net/pictrs/image/83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
-          r#"![party-blob](https://lemmy-alpha/api/v4/image/proxy?url=https%3A%2F%2Fwww.hexbear.net%2Fpictrs%2Fimage%2F83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#
-        )
-      ];
-
-    tests.iter().for_each(|&(msg, input, expected)| {
-      let result = markdown_rewrite_image_links(input.to_string());
-
-      assert_eq!(
-        result.0, expected,
-        "Testing {}, with original input '{}'",
-        msg, input
-      );
-    });
-  }
-
  // This replicates the logic when saving url blocklist patterns and querying them.
  // Refer to lemmy_api_crud::site::update::update_site and
  // lemmy_api_common::utils::get_url_blocklist().