// src/html.rs - HTML stripping utility for Owncast emoji and markup.
//
// The crate root (src/main.rs) pulls this module in with `mod html;`.

use std::borrow::Cow;

/// Extracts alt text from `<img>` tags (for Owncast emoji) and strips all other HTML.
///
/// The input is scanned once, left to right:
///
/// * `<img ...>` is replaced by the value of its `alt="..."` attribute, if it
///   has one. The alt text is copied verbatim (it is not entity-decoded).
/// * Every other `<...>` tag is removed. A `<` that is never closed by `>`
///   swallows the rest of the input.
/// * The character references `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`,
///   `&nbsp;` (decoded to a plain space) and numeric references such as
///   `&#39;` or `&#x27;` are decoded. A reference that runs to the very end of
///   the input is decoded even without its closing `;`.
/// * Any other `&` is kept as literal text, and scanning resumes immediately
///   after it, so an unrecognised reference such as `&copy;` or a bare
///   ampersand in `Tom & Jerry; ok` passes through unchanged.
///
/// Returns the resulting plain text as a new `String`.
pub fn strip_html(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(pos) = rest.find(|c: char| c == '<' || c == '&') {
        // Everything before the marker is ordinary text.
        result.push_str(&rest[..pos]);

        let at_marker = &rest[pos..];
        rest = if at_marker.starts_with('<') {
            consume_tag(&at_marker[1..], &mut result)
        } else {
            consume_entity(at_marker, &mut result)
        };
    }

    result.push_str(rest);
    result
}

/// Handles one tag. `after_lt` is the input just past a `<`.
///
/// Appends the `alt` text to `out` when the tag is an `<img>`, and returns the
/// input that follows the closing `>` (empty if the tag is never closed).
fn consume_tag<'a>(after_lt: &'a str, out: &mut String) -> &'a str {
    let (tag, remaining) = after_lt.split_once('>').unwrap_or((after_lt, ""));

    // Require whitespace after the name so that e.g. `<imgx ...>` is not an image.
    let is_img = tag
        .strip_prefix("img")
        .map_or(false, |attrs| attrs.starts_with(char::is_whitespace));
    if is_img {
        if let Some(alt) = extract_attr(tag, "alt") {
            out.push_str(&alt);
        }
    }

    remaining
}

/// Handles one `&`. `at_amp` is the input starting at the ampersand itself.
///
/// Appends either the decoded character(s) or a literal `&` to `out`, and
/// returns the input that still has to be scanned.
fn consume_entity<'a>(at_amp: &'a str, out: &mut String) -> &'a str {
    let after = &at_amp[1..];

    // A reference body is a run of ASCII alphanumerics / `#`. Stopping at the
    // first other character keeps a stray `&` from swallowing following text
    // (or tags) up to some unrelated `;` later in the message.
    let name_len = after
        .find(|c: char| !(c.is_ascii_alphanumeric() || c == '#'))
        .unwrap_or(after.len());
    let tail = &after[name_len..];
    let terminated = tail.is_empty() || tail.starts_with(';');

    let decoded = if name_len > 0 && terminated {
        // `&` is one byte and the body is ASCII, so this slice is `&` + body.
        decode_entity(&at_amp[..=name_len])
    } else {
        None
    };

    match decoded {
        Some(text) => {
            out.push_str(&text);
            tail.strip_prefix(';').unwrap_or(tail)
        }
        None => {
            // Not a reference we know: keep the ampersand and rescan what follows.
            out.push('&');
            after
        }
    }
}

/// Returns the value of the double-quoted attribute `attr_name` inside `tag`.
///
/// `tag` is the text between `<` and `>`. The attribute name must be preceded
/// by whitespace, so looking up `alt` does not match `data-alt="..."`. This is
/// a simple textual scan, not a full attribute parser: single-quoted and
/// unquoted values are not recognised. Returns `None` when the attribute is
/// absent or its closing quote is missing.
fn extract_attr(tag: &str, attr_name: &str) -> Option<String> {
    let pattern = format!("{}=\"", attr_name);
    let name_start = tag
        .match_indices(pattern.as_str())
        .map(|(index, _)| index)
        .find(|&index| tag[..index].ends_with(char::is_whitespace))?;

    let value = &tag[name_start + pattern.len()..];
    let end = value.find('"')?;
    Some(value[..end].to_string())
}

/// Decodes a single character reference given as `&` plus its body, without
/// the trailing `;` (for example `"&amp"` or `"&#39"`).
///
/// Returns `None` when the reference is not recognised, so the caller can
/// leave the original text untouched.
fn decode_entity(entity: &str) -> Option<Cow<'static, str>> {
    let named = match entity {
        "&amp" => "&",
        "&lt" => "<",
        "&gt" => ">",
        "&quot" => "\"",
        "&apos" => "'",
        "&nbsp" => " ",
        _ => return decode_numeric_entity(entity).map(Cow::Owned),
    };
    Some(Cow::Borrowed(named))
}

/// Decodes a decimal (`&#39`) or hexadecimal (`&#x27`) character reference.
///
/// Returns `None` for malformed numbers and invalid code points. Control
/// characters are also rejected so that a reference cannot introduce a line
/// break or NUL into the plain-text output.
fn decode_numeric_entity(entity: &str) -> Option<String> {
    let digits = entity.strip_prefix("&#")?;
    let code = match digits.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex) => u32::from_str_radix(hex, 16).ok()?,
        None => digits.parse::<u32>().ok()?,
    };
    char::from_u32(code)
        .filter(|c| !c.is_control())
        .map(|c| c.to_string())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_plain_text_unchanged() {
        assert_eq!(strip_html("hello world"), "hello world");
    }

    #[test]
    fn test_strips_basic_tags() {
        assert_eq!(strip_html("<b>bold</b> text"), "bold text");
    }

    #[test]
    fn test_emoji_img_to_alt_text() {
        let input = r#"hello <img class="emoji" alt=":beerparrot:" src="/img/emoji/beerparrot.gif"> world"#;
        assert_eq!(strip_html(input), "hello :beerparrot: world");
    }

    #[test]
    fn test_multiple_emoji() {
        let input = r#"<img alt=":a:" src="a.png"><img alt=":b:" src="b.png">"#;
        assert_eq!(strip_html(input), ":a::b:");
    }

    #[test]
    fn test_strips_links() {
        let input = r#"check <a href="https://example.com">this link</a>"#;
        assert_eq!(strip_html(input), "check this link");
    }

    #[test]
    fn test_decodes_html_entities() {
        assert_eq!(strip_html("a &amp; b &lt; c"), "a & b < c");
    }
}