From 7c2445efe73de5b701ecb7d0b90167c1a4e29a6a Mon Sep 17 00:00:00 2001 From: Nutomic Date: Thu, 3 Oct 2024 14:24:05 +0200 Subject: [PATCH] Resolve links to remote posts into local URL (#5057) * move code to new file * rewrite markdown links (fixes #2987) * add missing file * add helper fn * also convert post.url * simplify search.rs * clippy * also rewrite user/community links in markdown * Call from apub handlers, cleanup * no network requests in test * clippy * fix tests * serial * test * no mut * add api test * fix api test --- Cargo.lock | 54 ++--- api_tests/src/post.spec.ts | 26 +++ crates/api_common/src/utils.rs | 2 +- crates/apub/assets/lemmy/objects/group.json | 4 +- crates/apub/assets/pleroma/objects/note.json | 2 +- .../apub/assets/pleroma/objects/person.json | 2 +- crates/apub/src/api/resolve_object.rs | 22 +- crates/apub/src/api/user_settings_backup.rs | 4 +- crates/apub/src/fetcher/markdown_links.rs | 191 ++++++++++++++++++ crates/apub/src/fetcher/mod.rs | 1 + crates/apub/src/fetcher/search.rs | 61 ++---- crates/apub/src/objects/comment.rs | 4 +- crates/apub/src/objects/community.rs | 4 +- crates/apub/src/objects/instance.rs | 2 + crates/apub/src/objects/person.rs | 4 +- crates/apub/src/objects/post.rs | 15 +- crates/apub/src/objects/private_message.rs | 2 + crates/db_schema/src/impls/instance.rs | 5 + .../utils/src/utils/markdown/image_links.rs | 168 +++++++++++++++ crates/utils/src/utils/markdown/mod.rs | 72 +------ 20 files changed, 483 insertions(+), 162 deletions(-) create mode 100644 crates/apub/src/fetcher/markdown_links.rs create mode 100644 crates/utils/src/utils/markdown/image_links.rs diff --git a/Cargo.lock b/Cargo.lock index 0079a6cb2b..a799d9d6f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -42,7 +42,7 @@ dependencies = [ "pin-project-lite", "rand", "regex", - "reqwest 0.12.8", + "reqwest 0.12.7", "reqwest-middleware", "rsa", "serde", @@ -490,9 +490,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.83" +version = "0.1.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" dependencies = [ "proc-macro2", "quote", @@ -839,9 +839,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.18" +version = "4.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0956a43b323ac1afaffc053ed5c4b7c1f1800bacd1683c353aabbb752515dd3" +checksum = "3e5a21b8495e732f1b3c364c9949b201ca7bae518c502c80256c96ad79eaf6ac" dependencies = [ "clap_builder", "clap_derive", @@ -849,9 +849,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.18" +version = "4.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d72166dd41634086d5803a47eb71ae740e61d84709c36f3c34110173db3961b" +checksum = "8cf2dd12af7a047ad9d6da2b6b249759a22a7abc0f474c1dae1777afa4b21a73" dependencies = [ "anstream", "anstyle", @@ -861,9 +861,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.18" +version = "4.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -1989,7 +1989,7 @@ dependencies = [ "base64 0.22.1", "http-signature-normalization", "httpdate", - "reqwest 0.12.8", + "reqwest 0.12.7", "reqwest-middleware", "sha2", "thiserror", @@ -2514,7 +2514,7 @@ dependencies = [ "moka", "pretty_assertions", "regex", - "reqwest 0.12.8", + "reqwest 0.12.7", "reqwest-middleware", "rosetta-i18n", "serde", @@ -2579,7 +2579,7 @@ dependencies = [ "lemmy_utils", "moka", "pretty_assertions", - "reqwest 0.12.8", + "reqwest 0.12.7", "serde", "serde_json", "serde_with", @@ -2718,7 +2718,7 @@ dependencies = [ "lemmy_utils", "mockall", "moka", - "reqwest 0.12.8", + "reqwest 0.12.7", "serde_json", "serial_test", "test-context", @@ -2745,7 +2745,7 @@ dependencies = [ "lemmy_db_views", "lemmy_db_views_actor", "lemmy_utils", - "reqwest 0.12.8", + "reqwest 0.12.7", "reqwest-middleware", "rss", "serde", @@ -2778,7 +2778,7 @@ dependencies = [ "lemmy_utils", "pretty_assertions", "prometheus", - "reqwest 0.12.8", + "reqwest 0.12.7", "reqwest-middleware", "reqwest-tracing", "rustls 0.23.13", @@ -2811,7 +2811,7 @@ dependencies = [ "markdown-it", "pretty_assertions", "regex", - "reqwest 0.12.8", + "reqwest 0.12.7", "reqwest-middleware", "rosetta-build", "rosetta-i18n", @@ -3197,9 +3197,9 @@ dependencies = [ [[package]] name = "mutually_exclusive_features" -version = "0.1.0" +version = "0.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94e1e6445d314f972ff7395df2de295fe51b71821694f0b0e1e79c4f12c8577" +checksum = "6d02c0b00610773bb7fc61d85e13d86c7858cbdf00e1a120bfc41bc055dbaa0e" [[package]] name = "never" @@ -3966,9 +3966,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.8" +version = "0.12.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" +checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" dependencies = [ "async-compression", "base64 0.22.1", @@ -4019,7 +4019,7 @@ dependencies = [ "anyhow", "async-trait", "http 1.1.0", - "reqwest 0.12.8", + "reqwest 0.12.7", "serde", "thiserror", "tower-service", @@ -4036,7 +4036,7 @@ dependencies = [ "getrandom", "http 1.1.0", "matchit", - "reqwest 0.12.8", + "reqwest 0.12.7", "reqwest-middleware", "tracing", ] @@ -4508,9 +4508,9 @@ checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" [[package]] name = "sitemap-rs" -version = "0.2.2" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c4c6ab96128064ba085256d34e205153555b3803094d76e24d406c76f85a2c9" +checksum = "88cc73a9aac975541c9054e74ceae8d8ee85edc89a322404c275c1d100fffa51" dependencies = [ "chrono", "xml-builder", @@ -5106,9 +5106,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.13" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15bc0cd5f72e837e310f4d978a90abf202a7f7d8ef3272246bae381d0086d3bf" +checksum = "284586dc201db407be8c9d721abad1b3a6dacbbce5cccecd4fd15a37db95ab0d" dependencies = [ "actix-web", "mutually_exclusive_features", @@ -5579,7 +5579,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/api_tests/src/post.spec.ts b/api_tests/src/post.spec.ts index ee9cc441df..59e3d774e2 100644 --- a/api_tests/src/post.spec.ts +++ b/api_tests/src/post.spec.ts @@ -794,3 +794,29 @@ test("Fetch post with redirect", async () => { let gammaPost2 = await gamma.resolveObject(form); expect(gammaPost2.post).toBeDefined(); }); + +test("Rewrite markdown links", async () => { + const community = (await resolveBetaCommunity(beta)).community!; + + // create a post + let postRes1 = await createPost(beta, community.community.id); + + // link to this post in markdown + let postRes2 = await createPost( + beta, + community.community.id, + "https://example.com/", + `[link](${postRes1.post_view.post.ap_id})`, + ); + console.log(postRes2.post_view.post.body); + expect(postRes2.post_view.post).toBeDefined(); + + // fetch both posts from another instance + const alphaPost1 = await resolvePost(alpha, postRes1.post_view.post); + const alphaPost2 = await resolvePost(alpha, postRes2.post_view.post); + + // remote markdown link is replaced with local link + expect(alphaPost2.post?.post.body).toBe( + `[link](http://lemmy-alpha:8541/post/${alphaPost1.post?.post.id})`, + ); +}); diff --git a/crates/api_common/src/utils.rs b/crates/api_common/src/utils.rs index b55cd92bba..e255e36c22 100644 --- a/crates/api_common/src/utils.rs +++ b/crates/api_common/src/utils.rs @@ -52,7 +52,7 @@ use lemmy_utils::{ rate_limit::{ActionType, BucketConfig}, settings::structs::{PictrsImageMode, Settings}, utils::{ - markdown::{markdown_check_for_blocked_urls, markdown_rewrite_image_links}, + markdown::{image_links::markdown_rewrite_image_links, markdown_check_for_blocked_urls}, slurs::{build_slur_regex, remove_slurs}, validation::clean_urls_in_text, }, diff --git a/crates/apub/assets/lemmy/objects/group.json b/crates/apub/assets/lemmy/objects/group.json index 1b848a8661..bd6e440655 100644 --- a/crates/apub/assets/lemmy/objects/group.json +++ b/crates/apub/assets/lemmy/objects/group.json @@ -3,9 +3,9 @@ "type": "Group", "preferredUsername": "tenforward", "name": "Ten Forward", - "summary": "

Lounge and recreation facility

\n
\n

Welcome to the Enterprise!.

\n", + "summary": "

Lounge and recreation facility

\n
\n

Welcome to the Enterprise!.

\n", "source": { - "content": "Lounge and recreation facility\n\n---\n\nWelcome to the [Enterprise](https://memory-alpha.fandom.com/wiki/USS_Enterprise_(NCC-1701-D))!.", + "content": "Lounge and recreation facility\n\n---\n\nWelcome to the Enterprise!", "mediaType": "text/markdown" }, "sensitive": false, diff --git a/crates/apub/assets/pleroma/objects/note.json b/crates/apub/assets/pleroma/objects/note.json index ff4b20d257..af61ff46e2 100644 --- a/crates/apub/assets/pleroma/objects/note.json +++ b/crates/apub/assets/pleroma/objects/note.json @@ -10,7 +10,7 @@ "attachment": [], "attributedTo": "https://queer.hacktivis.me/users/lanodan", "cc": ["https://www.w3.org/ns/activitystreams#Public"], - "content": "@popolon Have what?", + "content": "Have what?", "context": "https://queer.hacktivis.me/contexts/34cba3d2-2f35-4169-aeff-56af9bfeb753", "conversation": "https://queer.hacktivis.me/contexts/34cba3d2-2f35-4169-aeff-56af9bfeb753", "id": "https://queer.hacktivis.me/objects/8d4973f4-53de-49cd-8c27-df160e16a9c2", diff --git a/crates/apub/assets/pleroma/objects/person.json b/crates/apub/assets/pleroma/objects/person.json index bc9008babf..fff9a2cbaf 100644 --- a/crates/apub/assets/pleroma/objects/person.json +++ b/crates/apub/assets/pleroma/objects/person.json @@ -41,7 +41,7 @@ "owner": "https://queer.hacktivis.me/users/lanodan", "publicKeyPem": "-----BEGIN PUBLIC KEY-----\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAsWOgdjSMc010qvxC3njI\nXJlFWMJ5gJ8QXCW/PajYdsHPM6d+jxBNJ6zp9/tIRa2m7bWHTSkuHQ7QthOpt6vu\n+dAWpKRLS607SPLItn/qUcyXvgN+H8shfyhMxvkVs9jXdtlBsLUVE7UNpN0dxzqe\nI79QWbf7o4amgaIWGRYB+OYMnIxKt+GzIkivZdSVSYjfxNnBYkMCeUxm5EpPIxKS\nP5bBHAVRRambD5NUmyKILuC60/rYuc/C+vmgpY2HCWFS2q6o34dPr9enwL6t4b3m\nS1t/EJHk9rGaaDqSGkDEfyQI83/7SDebWKuETMKKFLZi1vMgQIFuOYCIhN6bIiZm\npQIDAQAB\n-----END PUBLIC KEY-----\n\n" }, - "summary": "---
Website: https://hacktivis.me/
Lang: Français(natif), English(fluent), LSF(🤏~👌), русский (еле-еле),
Politics: Anarchist as in DIY/DIWO, freedom of association, anti-authoritarian, anti-identitarianism

Pronouns: meh, pick any, have fun
Timezone: Let's say Mars, I have a non-24h cycle
```
🦊🦄⚧🂡ⓥ :anarchy: 👿🐧 :gentoo:
Pleroma maintainer (mostly backend)
BadWolf developer
Gentoo contributor

Dayjob: yogoko.fr

That person which uses HJKL in games

Just because computer bad: X5O!P%@AP[4\\PZX54(P^)7CC)7}$EICAR-STANDARD-ANTIVIRUS-TEST-FILE!$H+H*

banner from: https://soc.flyingcube.tech/objects/56f79be2-9013-4559-9826-f7dc392417db
Federation-bots: #nobot", + "summary": "---Lang: Français(natif), English(fluent), LSF(🤏~👌), русский (еле-еле),
Politics: Anarchist as in DIY/DIWO, freedom of association, anti-authoritarian, anti-identitarianism

Pronouns: meh, pick any, have fun
Timezone: Let's say Mars, I have a non-24h cycle
```
🦊🦄⚧🂡ⓥ :anarchy: 👿🐧 :gentoo:
Pleroma maintainer (mostly backend)
BadWolf developer
Gentoo contributor

Dayjob: yogoko.fr

That person which uses HJKL in games

Just because computer bad: X5O!P%@AP[4\\PZX54(P^)7CC)7}$EICAR-STANDARD-ANTIVIRUS-TEST-FILE!$H+H*

banner from: https://soc.flyingcube.tech/objects/56f79be2-9013-4559-9826-f7dc392417db
Federation-bots: #nobot", "tag": [ { "icon": { diff --git a/crates/apub/src/api/resolve_object.rs b/crates/apub/src/api/resolve_object.rs index b3061d1cea..81329a97b9 100644 --- a/crates/apub/src/api/resolve_object.rs +++ b/crates/apub/src/api/resolve_object.rs @@ -1,4 +1,5 @@ use crate::fetcher::{ + post_or_comment::PostOrComment, search::{search_query_to_object_id, search_query_to_object_id_local, SearchableObjects}, user_or_community::UserOrCommunity, }; @@ -46,21 +47,22 @@ async fn convert_response( local_user_view: Option, pool: &mut DbPool<'_>, ) -> LemmyResult> { - use SearchableObjects::*; let removed_or_deleted; let mut res = ResolveObjectResponse::default(); let local_user = local_user_view.map(|l| l.local_user); match object { - Post(p) => { - removed_or_deleted = p.deleted || p.removed; - res.post = Some(PostView::read(pool, p.id, local_user.as_ref(), false).await?) - } - Comment(c) => { - removed_or_deleted = c.deleted || c.removed; - res.comment = Some(CommentView::read(pool, c.id, local_user.as_ref()).await?) - } - PersonOrCommunity(p) => match *p { + SearchableObjects::PostOrComment(pc) => match *pc { + PostOrComment::Post(p) => { + removed_or_deleted = p.deleted || p.removed; + res.post = Some(PostView::read(pool, p.id, local_user.as_ref(), false).await?) + } + PostOrComment::Comment(c) => { + removed_or_deleted = c.deleted || c.removed; + res.comment = Some(CommentView::read(pool, c.id, local_user.as_ref()).await?) + } + }, + SearchableObjects::PersonOrCommunity(pc) => match *pc { UserOrCommunity::User(u) => { removed_or_deleted = u.deleted; res.person = Some(PersonView::read(pool, u.id).await?) diff --git a/crates/apub/src/api/user_settings_backup.rs b/crates/apub/src/api/user_settings_backup.rs index 6a524b71df..36a6907a04 100644 --- a/crates/apub/src/api/user_settings_backup.rs +++ b/crates/apub/src/api/user_settings_backup.rs @@ -313,7 +313,7 @@ where #[cfg(test)] #[expect(clippy::indexing_slicing)] -mod tests { +pub(crate) mod tests { use crate::api::user_settings_backup::{export_settings, import_settings}; use activitypub_federation::config::Data; @@ -336,7 +336,7 @@ mod tests { use std::time::Duration; use tokio::time::sleep; - async fn create_user( + pub(crate) async fn create_user( name: String, bio: Option, context: &Data, diff --git a/crates/apub/src/fetcher/markdown_links.rs b/crates/apub/src/fetcher/markdown_links.rs new file mode 100644 index 0000000000..1689585896 --- /dev/null +++ b/crates/apub/src/fetcher/markdown_links.rs @@ -0,0 +1,191 @@ +use super::{search::SearchableObjects, user_or_community::UserOrCommunity}; +use crate::fetcher::post_or_comment::PostOrComment; +use activitypub_federation::{config::Data, fetch::object_id::ObjectId}; +use lemmy_api_common::{ + context::LemmyContext, + utils::{generate_local_apub_endpoint, EndpointType}, +}; +use lemmy_db_schema::{newtypes::InstanceId, source::instance::Instance}; +use lemmy_utils::{ + error::LemmyResult, + utils::markdown::image_links::{markdown_find_links, markdown_handle_title}, +}; +use url::Url; + +pub async fn markdown_rewrite_remote_links_opt( + src: Option, + context: &Data, +) -> Option { + match src { + Some(t) => Some(markdown_rewrite_remote_links(t, context).await), + None => None, + } +} + +/// Goes through all remote markdown links and attempts to resolve them as Activitypub objects. +/// If successful, the link is rewritten to a local link, so it can be viewed without leaving the +/// local instance. +/// +/// As it relies on ObjectId::dereference, it can only be used for incoming federated objects, not +/// for the API. +pub async fn markdown_rewrite_remote_links( + mut src: String, + context: &Data, +) -> String { + let links_offsets = markdown_find_links(&src); + + // Go through the collected links in reverse order + for (start, end) in links_offsets.into_iter().rev() { + let (url, extra) = markdown_handle_title(&src, start, end); + + if let Some(local_url) = to_local_url(url, context).await { + let mut local_url = local_url.to_string(); + // restore title + if let Some(extra) = extra { + local_url = format!("{local_url} {extra}"); + } + src.replace_range(start..end, local_url.as_str()); + } + } + + src +} + +pub(crate) async fn to_local_url(url: &str, context: &Data) -> Option { + let local_domain = &context.settings().get_protocol_and_hostname(); + let object_id = ObjectId::::parse(url).ok()?; + if object_id.inner().domain() == Some(local_domain) { + return None; + } + let dereferenced = object_id.dereference(context).await.ok()?; + match dereferenced { + SearchableObjects::PostOrComment(pc) => match *pc { + PostOrComment::Post(post) => { + generate_local_apub_endpoint(EndpointType::Post, &post.id.to_string(), local_domain) + } + PostOrComment::Comment(comment) => { + generate_local_apub_endpoint(EndpointType::Comment, &comment.id.to_string(), local_domain) + } + } + .ok() + .map(Into::into), + SearchableObjects::PersonOrCommunity(pc) => match *pc { + UserOrCommunity::User(user) => { + format_actor_url(&user.name, "u", user.instance_id, context).await + } + UserOrCommunity::Community(community) => { + format_actor_url(&community.name, "c", community.instance_id, context).await + } + } + .ok(), + } +} + +async fn format_actor_url( + name: &str, + kind: &str, + instance_id: InstanceId, + context: &LemmyContext, +) -> LemmyResult { + let local_protocol_and_hostname = context.settings().get_protocol_and_hostname(); + let local_hostname = &context.settings().hostname; + let instance = Instance::read(&mut context.pool(), instance_id).await?; + let url = if &instance.domain != local_hostname { + format!( + "{local_protocol_and_hostname}/{kind}/{name}@{}", + instance.domain + ) + } else { + format!("{local_protocol_and_hostname}/{kind}/{name}") + }; + Ok(Url::parse(&url)?) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::api::user_settings_backup::tests::create_user; + use lemmy_db_schema::{ + source::{ + community::{Community, CommunityInsertForm}, + post::{Post, PostInsertForm}, + }, + traits::Crud, + }; + use pretty_assertions::assert_eq; + use serial_test::serial; + + #[serial] + #[tokio::test] + async fn test_markdown_rewrite_remote_links() -> LemmyResult<()> { + let context = LemmyContext::init_test_context().await; + let instance = Instance::read_or_create(&mut context.pool(), "example.com".to_string()).await?; + let community = Community::create( + &mut context.pool(), + &CommunityInsertForm::new( + instance.id, + "my_community".to_string(), + "My Community".to_string(), + "pubkey".to_string(), + ), + ) + .await?; + let user = create_user("john".to_string(), None, &context).await?; + + // insert a remote post which is already fetched + let post_form = PostInsertForm { + ap_id: Some(Url::parse("https://example.com/post/123")?.into()), + ..PostInsertForm::new("My post".to_string(), user.person.id, community.id) + }; + let post = Post::create(&mut context.pool(), &post_form).await?; + let markdown_local_post_url = format!("[link](https://lemmy-alpha/post/{})", post.id); + + let tests: Vec<_> = vec![ + ( + "rewrite remote post link", + format!("[link]({})", post.ap_id), + markdown_local_post_url.as_ref(), + ), + ( + "rewrite community link", + format!("[link]({})", community.actor_id), + "[link](https://lemmy-alpha/c/my_community@example.com)", + ), + ( + "dont rewrite local post link", + "[link](https://lemmy-alpha/post/2)".to_string(), + "[link](https://lemmy-alpha/post/2)", + ), + ( + "dont rewrite local community link", + "[link](https://lemmy-alpha/c/test)".to_string(), + "[link](https://lemmy-alpha/c/test)", + ), + ( + "dont rewrite non-fediverse link", + "[link](https://example.com/)".to_string(), + "[link](https://example.com/)", + ), + ( + "dont rewrite invalid url", + "[link](example-com)".to_string(), + "[link](example-com)", + ), + ]; + + let context = LemmyContext::init_test_context().await; + for (msg, input, expected) in &tests { + let result = markdown_rewrite_remote_links(input.to_string(), &context).await; + + assert_eq!( + &result, expected, + "Testing {}, with original input '{}'", + msg, input + ); + } + + Instance::delete(&mut context.pool(), instance.id).await?; + + Ok(()) + } +} diff --git a/crates/apub/src/fetcher/mod.rs b/crates/apub/src/fetcher/mod.rs index 68fc07d305..29202004f3 100644 --- a/crates/apub/src/fetcher/mod.rs +++ b/crates/apub/src/fetcher/mod.rs @@ -10,6 +10,7 @@ use lemmy_db_schema::traits::ApubActor; use lemmy_db_views::structs::LocalUserView; use lemmy_utils::error::{LemmyError, LemmyResult}; +pub(crate) mod markdown_links; pub mod post_or_comment; pub mod search; pub mod site_or_community_or_user; diff --git a/crates/apub/src/fetcher/search.rs b/crates/apub/src/fetcher/search.rs index 76c2848206..e8c0291066 100644 --- a/crates/apub/src/fetcher/search.rs +++ b/crates/apub/src/fetcher/search.rs @@ -1,8 +1,5 @@ -use crate::{ - fetcher::user_or_community::{PersonOrGroup, UserOrCommunity}, - objects::{comment::ApubComment, community::ApubCommunity, person::ApubPerson, post::ApubPost}, - protocol::objects::{note::Note, page::Page}, -}; +use super::post_or_comment::{PageOrNote, PostOrComment}; +use crate::fetcher::user_or_community::{PersonOrGroup, UserOrCommunity}; use activitypub_federation::{ config::Data, fetch::{object_id::ObjectId, webfinger::webfinger_resolve_actor}, @@ -54,16 +51,14 @@ pub(crate) async fn search_query_to_object_id_local( /// The types of ActivityPub objects that can be fetched directly by searching for their ID. #[derive(Debug)] pub(crate) enum SearchableObjects { - Post(ApubPost), - Comment(ApubComment), + PostOrComment(Box), PersonOrCommunity(Box), } #[derive(Deserialize)] #[serde(untagged)] pub(crate) enum SearchableKinds { - Page(Box), - Note(Note), + PageOrNote(Box), PersonOrGroup(Box), } @@ -75,8 +70,7 @@ impl Object for SearchableObjects { fn last_refreshed_at(&self) -> Option> { match self { - SearchableObjects::Post(p) => p.last_refreshed_at(), - SearchableObjects::Comment(c) => c.last_refreshed_at(), + SearchableObjects::PostOrComment(p) => p.last_refreshed_at(), SearchableObjects::PersonOrCommunity(p) => p.last_refreshed_at(), } } @@ -95,13 +89,9 @@ impl Object for SearchableObjects { if let Some(uc) = uc { return Ok(Some(SearchableObjects::PersonOrCommunity(Box::new(uc)))); } - let p = ApubPost::read_from_id(object_id.clone(), context).await?; - if let Some(p) = p { - return Ok(Some(SearchableObjects::Post(p))); - } - let c = ApubComment::read_from_id(object_id, context).await?; - if let Some(c) = c { - return Ok(Some(SearchableObjects::Comment(c))); + let pc = PostOrComment::read_from_id(object_id.clone(), context).await?; + if let Some(pc) = pc { + return Ok(Some(SearchableObjects::PostOrComment(Box::new(pc)))); } Ok(None) } @@ -109,25 +99,16 @@ impl Object for SearchableObjects { #[tracing::instrument(skip_all)] async fn delete(self, data: &Data) -> LemmyResult<()> { match self { - SearchableObjects::Post(p) => p.delete(data).await, - SearchableObjects::Comment(c) => c.delete(data).await, - SearchableObjects::PersonOrCommunity(pc) => match *pc { - UserOrCommunity::User(p) => p.delete(data).await, - UserOrCommunity::Community(c) => c.delete(data).await, - }, + SearchableObjects::PostOrComment(pc) => pc.delete(data).await, + SearchableObjects::PersonOrCommunity(pc) => pc.delete(data).await, } } async fn into_json(self, data: &Data) -> LemmyResult { + use SearchableObjects::*; Ok(match self { - SearchableObjects::Post(p) => SearchableKinds::Page(Box::new(p.into_json(data).await?)), - SearchableObjects::Comment(c) => SearchableKinds::Note(c.into_json(data).await?), - SearchableObjects::PersonOrCommunity(pc) => { - SearchableKinds::PersonOrGroup(Box::new(match *pc { - UserOrCommunity::User(p) => PersonOrGroup::Person(p.into_json(data).await?), - UserOrCommunity::Community(c) => PersonOrGroup::Group(c.into_json(data).await?), - })) - } + PostOrComment(pc) => SearchableKinds::PageOrNote(Box::new(pc.into_json(data).await?)), + PersonOrCommunity(pc) => SearchableKinds::PersonOrGroup(Box::new(pc.into_json(data).await?)), }) } @@ -137,24 +118,20 @@ impl Object for SearchableObjects { expected_domain: &Url, data: &Data, ) -> LemmyResult<()> { + use SearchableKinds::*; match apub { - SearchableKinds::Page(a) => ApubPost::verify(a, expected_domain, data).await, - SearchableKinds::Note(a) => ApubComment::verify(a, expected_domain, data).await, - SearchableKinds::PersonOrGroup(pg) => match pg.as_ref() { - PersonOrGroup::Person(a) => ApubPerson::verify(a, expected_domain, data).await, - PersonOrGroup::Group(a) => ApubCommunity::verify(a, expected_domain, data).await, - }, + PageOrNote(pn) => PostOrComment::verify(pn, expected_domain, data).await, + PersonOrGroup(pg) => UserOrCommunity::verify(pg, expected_domain, data).await, } } #[tracing::instrument(skip_all)] async fn from_json(apub: Self::Kind, context: &Data) -> LemmyResult { - use SearchableKinds as SAT; + use SearchableKinds::*; use SearchableObjects as SO; Ok(match apub { - SAT::Page(p) => SO::Post(ApubPost::from_json(*p, context).await?), - SAT::Note(n) => SO::Comment(ApubComment::from_json(n, context).await?), - SAT::PersonOrGroup(pg) => { + PageOrNote(pg) => SO::PostOrComment(Box::new(PostOrComment::from_json(*pg, context).await?)), + PersonOrGroup(pg) => { SO::PersonOrCommunity(Box::new(UserOrCommunity::from_json(*pg, context).await?)) } }) diff --git a/crates/apub/src/objects/comment.rs b/crates/apub/src/objects/comment.rs index dd02850912..b6b411ec64 100644 --- a/crates/apub/src/objects/comment.rs +++ b/crates/apub/src/objects/comment.rs @@ -1,6 +1,7 @@ use crate::{ activities::{verify_is_public, verify_person_in_community}, check_apub_id_valid_with_strictness, + fetcher::markdown_links::markdown_rewrite_remote_links, mentions::collect_non_local_mentions, objects::{read_from_string_or_source, verify_is_remote_object}, protocol::{ @@ -181,6 +182,7 @@ impl Object for ApubComment { let slur_regex = &local_site_opt_to_slur_regex(&local_site); let url_blocklist = get_url_blocklist(context).await?; let content = process_markdown(&content, slur_regex, &url_blocklist, context).await?; + let content = markdown_rewrite_remote_links(content, context).await; let language_id = Some( LanguageTag::to_language_id_single(note.language.unwrap_or_default(), &mut context.pool()) .await?, @@ -298,7 +300,7 @@ pub(crate) mod tests { let comment = ApubComment::from_json(json, &context).await?; assert_eq!(comment.ap_id, pleroma_url.into()); - assert_eq!(comment.content.len(), 64); + assert_eq!(comment.content.len(), 10); assert!(!comment.local); assert_eq!(context.request_count(), 1); diff --git a/crates/apub/src/objects/community.rs b/crates/apub/src/objects/community.rs index d7e2490a7c..2c60b87b34 100644 --- a/crates/apub/src/objects/community.rs +++ b/crates/apub/src/objects/community.rs @@ -1,6 +1,7 @@ use crate::{ activities::GetActorType, check_apub_id_valid, + fetcher::markdown_links::markdown_rewrite_remote_links_opt, local_site_data_cached, objects::{instance::fetch_instance_actor_for_object, read_from_string_or_source_opt}, protocol::{ @@ -148,6 +149,7 @@ impl Object for ApubCommunity { let description = read_from_string_or_source_opt(&group.summary, &None, &group.source); let description = process_markdown_opt(&description, slur_regex, &url_blocklist, context).await?; + let description = markdown_rewrite_remote_links_opt(description, context).await; let icon = proxy_image_link_opt_apub(group.icon.map(|i| i.url), context).await?; let banner = proxy_image_link_opt_apub(group.image.map(|i| i.url), context).await?; @@ -296,7 +298,7 @@ pub(crate) mod tests { assert!(!community.local); assert_eq!( community.description.as_ref().map(std::string::String::len), - Some(132) + Some(63) ); Community::delete(&mut context.pool(), community.id).await?; diff --git a/crates/apub/src/objects/instance.rs b/crates/apub/src/objects/instance.rs index c67a223e0f..6ee0a41dcb 100644 --- a/crates/apub/src/objects/instance.rs +++ b/crates/apub/src/objects/instance.rs @@ -2,6 +2,7 @@ use super::verify_is_remote_object; use crate::{ activities::GetActorType, check_apub_id_valid_with_strictness, + fetcher::markdown_links::markdown_rewrite_remote_links_opt, local_site_data_cached, objects::read_from_string_or_source_opt, protocol::{ @@ -151,6 +152,7 @@ impl Object for ApubSite { let url_blocklist = get_url_blocklist(context).await?; let sidebar = read_from_string_or_source_opt(&apub.content, &None, &apub.source); let sidebar = process_markdown_opt(&sidebar, slur_regex, &url_blocklist, context).await?; + let sidebar = markdown_rewrite_remote_links_opt(sidebar, context).await; let icon = proxy_image_link_opt_apub(apub.icon.map(|i| i.url), context).await?; let banner = proxy_image_link_opt_apub(apub.image.map(|i| i.url), context).await?; diff --git a/crates/apub/src/objects/person.rs b/crates/apub/src/objects/person.rs index 61ff04622e..406a77d944 100644 --- a/crates/apub/src/objects/person.rs +++ b/crates/apub/src/objects/person.rs @@ -2,6 +2,7 @@ use super::verify_is_remote_object; use crate::{ activities::GetActorType, check_apub_id_valid_with_strictness, + fetcher::markdown_links::markdown_rewrite_remote_links_opt, local_site_data_cached, objects::{instance::fetch_instance_actor_for_object, read_from_string_or_source_opt}, protocol::{ @@ -156,6 +157,7 @@ impl Object for ApubPerson { let url_blocklist = get_url_blocklist(context).await?; let bio = read_from_string_or_source_opt(&person.summary, &None, &person.source); let bio = process_markdown_opt(&bio, slur_regex, &url_blocklist, context).await?; + let bio = markdown_rewrite_remote_links_opt(bio, context).await; let avatar = proxy_image_link_opt_apub(person.icon.map(|i| i.url), context).await?; let banner = proxy_image_link_opt_apub(person.image.map(|i| i.url), context).await?; @@ -277,7 +279,7 @@ pub(crate) mod tests { assert_eq!(person.name, "lanodan"); assert!(!person.local); assert_eq!(context.request_count(), 0); - assert_eq!(person.bio.as_ref().map(std::string::String::len), Some(873)); + assert_eq!(person.bio.as_ref().map(std::string::String::len), Some(812)); cleanup((person, site), &context).await?; Ok(()) diff --git a/crates/apub/src/objects/post.rs b/crates/apub/src/objects/post.rs index 44fd4d56a7..ee88cf3ec7 100644 --- a/crates/apub/src/objects/post.rs +++ b/crates/apub/src/objects/post.rs @@ -1,6 +1,7 @@ use crate::{ activities::{verify_is_public, verify_person_in_community}, check_apub_id_valid_with_strictness, + fetcher::markdown_links::{markdown_rewrite_remote_links_opt, to_local_url}, local_site_data_cached, objects::{read_from_string_or_source_opt, verify_is_remote_object}, protocol::{ @@ -226,10 +227,13 @@ impl Object for ApubPost { let url_blocklist = get_url_blocklist(context).await?; - if let Some(url) = &url { - is_url_blocked(url, &url_blocklist)?; - is_valid_url(url)?; - } + let url = if let Some(url) = url { + is_url_blocked(&url, &url_blocklist)?; + is_valid_url(&url)?; + to_local_url(url.as_str(), context).await.or(Some(url)) + } else { + None + }; let alt_text = first_attachment.cloned().and_then(Attachment::alt_text); @@ -237,6 +241,7 @@ impl Object for ApubPost { let body = read_from_string_or_source_opt(&page.content, &page.media_type, &page.source); let body = process_markdown_opt(&body, slur_regex, &url_blocklist, context).await?; + let body = markdown_rewrite_remote_links_opt(body, context).await; let language_id = Some( LanguageTag::to_language_id_single(page.language.unwrap_or_default(), &mut context.pool()) .await?, @@ -303,7 +308,7 @@ mod tests { assert_eq!(post.body.as_ref().map(std::string::String::len), Some(45)); assert!(!post.locked); assert!(!post.featured_community); - assert_eq!(context.request_count(), 0); + assert_eq!(context.request_count(), 1); Post::delete(&mut context.pool(), post.id).await?; Person::delete(&mut context.pool(), person.id).await?; diff --git a/crates/apub/src/objects/private_message.rs b/crates/apub/src/objects/private_message.rs index d3ca340db2..3ed5b35722 100644 --- a/crates/apub/src/objects/private_message.rs +++ b/crates/apub/src/objects/private_message.rs @@ -1,6 +1,7 @@ use super::verify_is_remote_object; use crate::{ check_apub_id_valid_with_strictness, + fetcher::markdown_links::markdown_rewrite_remote_links, objects::read_from_string_or_source, protocol::{ objects::chat_message::{ChatMessage, ChatMessageType}, @@ -134,6 +135,7 @@ impl Object for ApubPrivateMessage { let url_blocklist = get_url_blocklist(context).await?; let content = read_from_string_or_source(¬e.content, &None, ¬e.source); let content = process_markdown(&content, slur_regex, &url_blocklist, context).await?; + let content = markdown_rewrite_remote_links(content, context).await; let form = PrivateMessageInsertForm { creator_id: creator.id, diff --git a/crates/db_schema/src/impls/instance.rs b/crates/db_schema/src/impls/instance.rs index adde5482ba..6c72b5e189 100644 --- a/crates/db_schema/src/impls/instance.rs +++ b/crates/db_schema/src/impls/instance.rs @@ -67,6 +67,11 @@ impl Instance { } } } + pub async fn read(pool: &mut DbPool<'_>, instance_id: InstanceId) -> Result { + let conn = &mut get_conn(pool).await?; + instance::table.find(instance_id).first(conn).await + } + pub async fn update( pool: &mut DbPool<'_>, instance_id: InstanceId, diff --git a/crates/utils/src/utils/markdown/image_links.rs b/crates/utils/src/utils/markdown/image_links.rs new file mode 100644 index 0000000000..a21bb6f416 --- /dev/null +++ b/crates/utils/src/utils/markdown/image_links.rs @@ -0,0 +1,168 @@ +use super::{link_rule::Link, MARKDOWN_PARSER}; +use crate::settings::SETTINGS; +use markdown_it::{plugins::cmark::inline::image::Image, NodeValue}; +use url::Url; +use urlencoding::encode; + +/// Rewrites all links to remote domains in markdown, so they go through `/api/v3/image_proxy`. +pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec) { + let links_offsets = find_urls::(&src); + + let mut links = vec![]; + // Go through the collected links in reverse order + for (start, end) in links_offsets.into_iter().rev() { + let (url, extra) = markdown_handle_title(&src, start, end); + match Url::parse(url) { + Ok(parsed) => { + links.push(parsed.clone()); + // If link points to remote domain, replace with proxied link + if parsed.domain() != Some(&SETTINGS.hostname) { + let mut proxied = format!( + "{}/api/v3/image_proxy?url={}", + SETTINGS.get_protocol_and_hostname(), + encode(url), + ); + // restore custom emoji format + if let Some(extra) = extra { + proxied = format!("{proxied} {extra}"); + } + src.replace_range(start..end, &proxied); + } + } + Err(_) => { + // If its not a valid url, replace with empty text + src.replace_range(start..end, ""); + } + } + } + + (src, links) +} + +pub fn markdown_handle_title(src: &str, start: usize, end: usize) -> (&str, Option<&str>) { + let content = src.get(start..end).unwrap_or_default(); + // necessary for custom emojis which look like `![name](url "title")` + let (url, extra) = if content.contains(' ') { + let split = content.split_once(' ').expect("split is valid"); + (split.0, Some(split.1)) + } else { + (content, None) + }; + (url, extra) +} + +pub fn markdown_find_links(src: &str) -> Vec<(usize, usize)> { + find_urls::(src) +} + +// Walk the syntax tree to find positions of image or link urls +fn find_urls(src: &str) -> Vec<(usize, usize)> { + let ast = MARKDOWN_PARSER.parse(src); + let mut links_offsets = vec![]; + ast.walk(|node, _depth| { + if let Some(image) = node.cast::() { + let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets(); + let start_offset = node_offsets.1 - image.url_len() - 1 - image.title_len(); + let end_offset = node_offsets.1 - 1; + + links_offsets.push((start_offset, end_offset)); + } + }); + links_offsets +} + +pub trait UrlAndTitle { + fn url_len(&self) -> usize; + fn title_len(&self) -> usize; +} + +impl UrlAndTitle for Image { + fn url_len(&self) -> usize { + self.url.len() + } + + fn title_len(&self) -> usize { + self.title.as_ref().map(|t| t.len() + 3).unwrap_or_default() + } +} +impl UrlAndTitle for Link { + fn url_len(&self) -> usize { + self.url.len() + } + fn title_len(&self) -> usize { + self.title.as_ref().map(|t| t.len() + 3).unwrap_or_default() + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn test_find_links() { + let links = markdown_find_links("[test](https://example.com)"); + assert_eq!(vec![(7, 26)], links); + + let links = find_urls::("![test](https://example.com)"); + assert_eq!(vec![(8, 27)], links); + } + + #[test] + fn test_markdown_proxy_images() { + let tests: Vec<_> = + vec![ + ( + "remote image proxied", + "![link](http://example.com/image.jpg)", + "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)", + ), + ( + "local image unproxied", + "![link](http://lemmy-alpha/image.jpg)", + "![link](http://lemmy-alpha/image.jpg)", + ), + ( + "multiple image links", + "![link](http://example.com/image1.jpg) ![link](http://example.com/image2.jpg)", + "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage1.jpg) ![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage2.jpg)", + ), + ( + "empty link handled", + "![image]()", + "![image]()" + ), + ( + "empty label handled", + "![](http://example.com/image.jpg)", + "![](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)" + ), + ( + "invalid image link removed", + "![image](http-not-a-link)", + "![image]()" + ), + ( + "label with nested markdown handled", + "![a *b* c](http://example.com/image.jpg)", + "![a *b* c](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)" + ), + ( + "custom emoji support", + r#"![party-blob](https://www.hexbear.net/pictrs/image/83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#, + r#"![party-blob](https://lemmy-alpha/api/v3/image_proxy?url=https%3A%2F%2Fwww.hexbear.net%2Fpictrs%2Fimage%2F83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"# + ) + ]; + + tests.iter().for_each(|&(msg, input, expected)| { + let result = markdown_rewrite_image_links(input.to_string()); + + assert_eq!( + result.0, expected, + "Testing {}, with original input '{}'", + msg, input + ); + }); + } +} diff --git a/crates/utils/src/utils/markdown/mod.rs b/crates/utils/src/utils/markdown/mod.rs index e2d6040c46..a6de096e38 100644 --- a/crates/utils/src/utils/markdown/mod.rs +++ b/crates/utils/src/utils/markdown/mod.rs @@ -1,10 +1,9 @@ -use crate::{error::LemmyResult, settings::SETTINGS, LemmyErrorType}; -use markdown_it::{plugins::cmark::inline::image::Image, MarkdownIt}; +use crate::{error::LemmyResult, LemmyErrorType}; +use markdown_it::MarkdownIt; use regex::RegexSet; use std::sync::LazyLock; -use url::Url; -use urlencoding::encode; +pub mod image_links; mod link_rule; mod spoiler_rule; @@ -35,70 +34,6 @@ pub fn markdown_to_html(text: &str) -> String { MARKDOWN_PARSER.parse(text).xrender() } -/// Rewrites all links to remote domains in markdown, so they go through `/api/v3/image_proxy`. -pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec) { - let ast = MARKDOWN_PARSER.parse(&src); - let mut links_offsets = vec![]; - - // Walk the syntax tree to find positions of image links - ast.walk(|node, _depth| { - if let Some(image) = node.cast::() { - // srcmap is always present for image - // https://github.com/markdown-it-rust/markdown-it/issues/36#issuecomment-1777844387 - let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets(); - // necessary for custom emojis which look like `![name](url "title")` - let start_offset = node_offsets.1 - - image.url.len() - - 1 - - image - .title - .as_ref() - .map(|t| t.len() + 3) - .unwrap_or_default(); - let end_offset = node_offsets.1 - 1; - - links_offsets.push((start_offset, end_offset)); - } - }); - - let mut links = vec![]; - // Go through the collected links in reverse order - while let Some((start, end)) = links_offsets.pop() { - let content = src.get(start..end).unwrap_or_default(); - // necessary for custom emojis which look like `![name](url "title")` - let (url, extra) = if content.contains(' ') { - let split = content.split_once(' ').expect("split is valid"); - (split.0, Some(split.1)) - } else { - (content, None) - }; - match Url::parse(url) { - Ok(parsed) => { - links.push(parsed.clone()); - // If link points to remote domain, replace with proxied link - if parsed.domain() != Some(&SETTINGS.hostname) { - let mut proxied = format!( - "{}/api/v3/image_proxy?url={}", - SETTINGS.get_protocol_and_hostname(), - encode(url), - ); - // restore custom emoji format - if let Some(extra) = extra { - proxied = format!("{proxied} {extra}"); - } - src.replace_range(start..end, &proxied); - } - } - Err(_) => { - // If its not a valid url, replace with empty text - src.replace_range(start..end, ""); - } - } - } - - (src, links) -} - pub fn markdown_check_for_blocked_urls(text: &str, blocklist: &RegexSet) -> LemmyResult<()> { if blocklist.is_match(text) { Err(LemmyErrorType::BlockedUrl)? @@ -110,6 +45,7 @@ pub fn markdown_check_for_blocked_urls(text: &str, blocklist: &RegexSet) -> Lemm mod tests { use super::*; + use image_links::markdown_rewrite_image_links; use pretty_assertions::assert_eq; #[test]