--- /dev/null
+using System.Collections.Frozen;
+using System.Linq;
+using System.Text;
+using System.Text.Unicode;
+
+namespace Content.Shared.Chat.V2.Moderation;
+
+/// <summary>
+/// A basic censor. Not bullet-proof.
+/// </summary>
+public sealed class SimpleCensor : IChatCensor
+{
+ // Common substitution symbols are replaced with one of the characters they commonly substitute.
+ private bool _shouldSanitizeLeetspeak;
+ private FrozenDictionary<char, char> _leetspeakReplacements = FrozenDictionary<char, char>.Empty;
+
+ // Special characters are replaced with spaces.
+ private bool _shouldSanitizeSpecialCharacters;
+ private HashSet<char> _specialCharacterReplacements = [];
+
+ // Censored words are removed unless they're a false positive (e.g. Scunthorpe)
+ private string[] _censoredWords = Array.Empty<string>();
+ private string[] _falsePositives = Array.Empty<string>();
+
+ // False negatives are censored words that contain a false positives.
+ private string[] _falseNegatives = Array.Empty<string>();
+
+ // What unicode ranges are allowed? If this array is empty, don't filter by range.
+ private UnicodeRange[] _allowedUnicodeRanges= Array.Empty<UnicodeRange>();
+
+ /// <summary>
+ /// Censors the input string.
+ /// </summary>
+ /// <param name="input">The input string</param>
+ /// <param name="output">The output string</param>
+ /// <param name="replaceWith">The character to replace with</param>
+ /// <returns>If output is valid</returns>
+ public bool Censor(string input, out string output, char replaceWith = '*')
+ {
+ output = Censor(input, replaceWith);
+
+ return !string.Equals(input, output);
+ }
+
+ public string Censor(string input, char replaceWith = '*')
+ {
+ // We flat-out ban anything not in the allowed unicode ranges, stripping them
+ input = SanitizeOutBlockedUnicode(input);
+
+ var originalInput = input.ToCharArray();
+
+ input = SanitizeInput(input);
+
+ var censored = input.ToList();
+
+ // Remove false negatives
+ input = CheckProfanity(input, censored, _falseNegatives, replaceWith);
+
+ // Get false positives
+ var falsePositives = FindFalsePositives(censored, replaceWith);
+
+ // Remove censored words
+ CheckProfanity(input, censored, _censoredWords, replaceWith);
+
+ // Reconstruct
+ // Reconstruct false positives
+ for (var i = 0; i < falsePositives.Length; i++)
+ {
+ if (falsePositives[i] != replaceWith)
+ {
+ censored[i] = falsePositives[i];
+ }
+ }
+
+ for (var i = 0; i < originalInput.Length; i++)
+ {
+ if (originalInput[i] == ' ')
+ {
+ censored.Insert(i, ' ');
+
+ continue;
+ }
+
+ if (_shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(originalInput[i]))
+ {
+ censored.Insert(i, originalInput[i]);
+
+ continue;
+ }
+
+ if (_shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters)
+ {
+ // detect "()"
+ if (originalInput[i] == '(' && i != originalInput.Length - 1 && originalInput[i+1] == ')')
+ {
+ // censored has now had "o" replaced with "o) so both strings line up again..."
+ censored.Insert(i+1, censored[i] != replaceWith ? ')' : replaceWith);
+ }
+ }
+
+ if (censored[i] != replaceWith)
+ {
+ censored[i] = originalInput[i];
+ }
+ }
+
+ // SO says this is fast...
+ return string.Concat(censored);
+ }
+
+ /// <summary>
+ /// Adds a l33tsp34k sanitization rule
+ /// </summary>
+ /// <returns>The censor for further configuration</returns>
+ public SimpleCensor WithSanitizeLeetSpeak()
+ {
+ _shouldSanitizeLeetspeak = true;
+
+ return BuildCharacterReplacements();
+ }
+
+ /// <summary>
+ /// Adds a l33tsp34k sanitization rule
+ /// </summary>
+ /// <returns>The censor for further configuration</returns>
+ public SimpleCensor WithSanitizeSpecialCharacters()
+ {
+ _shouldSanitizeSpecialCharacters = true;
+
+ return BuildCharacterReplacements();
+ }
+
+ public SimpleCensor WithRanges(UnicodeRange[] ranges)
+ {
+ _allowedUnicodeRanges = ranges;
+
+ return this;
+ }
+
+ public SimpleCensor WithCustomDictionary(string[] naughtyWords)
+ {
+ _censoredWords = naughtyWords;
+
+ return this;
+ }
+
+ public SimpleCensor WithFalsePositives(string[] falsePositives)
+ {
+ _falsePositives = falsePositives;
+
+ return this;
+ }
+
+ public SimpleCensor WithFalseNegatives(string[] falseNegatives)
+ {
+ _falseNegatives = falseNegatives;
+
+ return this;
+ }
+
+ public SimpleCensor WithLeetspeakReplacements(Dictionary<char, char> replacements)
+ {
+ _leetspeakReplacements = replacements.ToFrozenDictionary();
+
+ return this;
+ }
+
+ public SimpleCensor WithSpecialCharacterReplacements(Dictionary<char, char> replacements)
+ {
+ _leetspeakReplacements = replacements.ToFrozenDictionary();
+
+ return this;
+ }
+
+ private string CheckProfanity(string input, List<char> censored, string[] words, char replaceWith = '*')
+ {
+ foreach (var word in words)
+ {
+ var wordLength = word.Length;
+ var endOfFoundWord = 0;
+ var foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+
+ while(foundIndex > -1)
+ {
+ endOfFoundWord = foundIndex + wordLength;
+
+ for (var i = 0; i < wordLength; i++)
+ {
+ censored[foundIndex+i] = replaceWith;
+ }
+
+ foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+ }
+ }
+
+ return input;
+ }
+
+ private char[] FindFalsePositives(List<char> chars, char replaceWith = '*')
+ {
+ var input = string.Concat(chars);
+
+ var output = Enumerable.Repeat(replaceWith, input.Length).ToArray();
+ var inputAsARr = input.ToArray();
+
+ foreach (var word in _falsePositives)
+ {
+ var wordLength = word.Length;
+ var endOfFoundWord = 0;
+ var foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+
+ while(foundIndex > -1)
+ {
+ endOfFoundWord = foundIndex + wordLength;
+
+ for (var i = foundIndex; i < endOfFoundWord; i++)
+ {
+ output[i] = inputAsARr[i];
+ }
+
+ foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+ }
+ }
+
+ return output;
+ }
+
+ private string SanitizeInput(string input)
+ {
+ // "()" is a broad enough trick to beat censors that we we should check for it broadly.
+ if (_shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters)
+ {
+ input = input.Replace("()", "o");
+ }
+
+ var sb = new StringBuilder();
+
+ // ReSharper disable once ForeachCanBePartlyConvertedToQueryUsingAnotherGetEnumerator
+ foreach (var character in input)
+ {
+ if (character == ' ' || _shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(character))
+ {
+ continue;
+ }
+
+ if (_shouldSanitizeLeetspeak && _leetspeakReplacements.TryGetValue(character, out var leetRepl))
+ {
+ sb.Append(leetRepl);
+
+ continue;
+ }
+
+ sb.Append(character);
+ }
+
+ return sb.ToString();
+ }
+
+ /// <summary>
+ /// Returns a string with all characters not in ISO-8851-1 replaced with question marks
+ /// </summary>
+ private string SanitizeOutBlockedUnicode(string input)
+ {
+ if (_allowedUnicodeRanges.Length <= 0)
+ {
+ return input;
+ }
+
+ var sb = new StringBuilder();
+
+ foreach (var symbol in input.EnumerateRunes())
+ {
+ // ReSharper disable once LoopCanBeConvertedToQuery
+ foreach (var range in _allowedUnicodeRanges)
+ {
+ if (symbol.Value < range.FirstCodePoint || symbol.Value >= range.FirstCodePoint + range.Length)
+ continue;
+
+ sb.Append(symbol);
+
+ break;
+ }
+ }
+
+ return sb.ToString();
+ }
+
+ private SimpleCensor BuildCharacterReplacements()
+ {
+ if (_shouldSanitizeSpecialCharacters)
+ {
+ _specialCharacterReplacements =
+ [
+ '-',
+ '_',
+ '|',
+ '.',
+ ',',
+ '(',
+ ')',
+ '<',
+ '>',
+ '"',
+ '`',
+ '~',
+ '*',
+ '&',
+ '%',
+ '$',
+ '#',
+ '@',
+ '!',
+ '?',
+ '+'
+ ];
+ }
+
+ if (_shouldSanitizeLeetspeak)
+ {
+ _leetspeakReplacements = new Dictionary<char, char>
+ {
+ ['4'] = 'a',
+ ['$'] = 's',
+ ['!'] = 'i',
+ ['+'] = 't',
+ ['#'] = 'h',
+ ['@'] = 'a',
+ ['0'] = 'o',
+ ['1'] = 'i', // also obviously can be l; gamer-words need i's more though.
+ ['7'] = 'l',
+ ['3'] = 'e',
+ ['5'] = 's',
+ ['9'] = 'g',
+ ['<'] = 'c'
+ }.ToFrozenDictionary();
+ }
+
+ return this;
+ }
+}
--- /dev/null
+using System.Text.Unicode;
+using Content.Shared.Chat.V2.Moderation;
+using NUnit.Framework;
+
+namespace Content.Tests.Shared.Chat.V2.Moderation;
+
+public sealed class SimpleCensorTests
+{
+ [Test]
+ public void CanCensorASingleWord()
+ {
+ var sut = new SimpleCensor().WithCustomDictionary(["amogus"]);
+ var output = sut.Censor("hello amogus");
+
+ Assert.That(output, Is.EqualTo("hello ******"));
+ }
+
+ // Basics - use custom dictionary
+
+ [Test]
+ public void CanCensorMultipleWordInstances()
+ {
+ var sut= new SimpleCensor().WithCustomDictionary(["amogus"]);
+ var output = sut.Censor("amogus hello amogus");
+
+ Assert.That(output, Is.EqualTo("****** hello ******"));
+ }
+
+ [Test]
+ public void CanCensorMultipleWords()
+ {
+ var sut= new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+ var output = sut.Censor("amogus hello sus");
+
+ Assert.That(output, Is.EqualTo("****** hello ***"));
+ }
+
+ [Test]
+ public void CanUseDifferentCensorSymbols()
+ {
+ var sut= new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+ var output = sut.Censor("amogus hello sus", '#');
+
+ Assert.That(output, Is.EqualTo("###### hello ###"));
+ }
+
+ [Test]
+ public void CanCatchCapitalizedWords()
+ {
+ var sut= new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+ var output = sut.Censor("AMOGUS hello SUS");
+
+ Assert.That(output, Is.EqualTo("****** hello ***"));
+ }
+
+ [Test]
+ public void CanCatchWordsWithSomeCaptialsInThem()
+ {
+ var sut= new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+ var output = sut.Censor("AmoGuS hello SuS");
+
+ Assert.That(output, Is.EqualTo("****** hello ***"));
+ }
+
+ [Test]
+ public void CanCatchWordsHiddenInsideOtherWords()
+ {
+ var sut= new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+ var output = sut.Censor("helamoguslo suspicious");
+
+ Assert.That(output, Is.EqualTo("hel******lo ***picious"));
+ }
+
+ // Sanitizing Leetspeak
+
+ [Test]
+ public void CanSanitizeLeetspeak()
+ {
+ var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeLeetSpeak();
+ var output = sut.Censor("am0gu5 hello 5u5");
+
+ Assert.That(output, Is.EqualTo("****** hello ***"));
+ }
+
+ [Test]
+ public void SanitizingLeetspeakOnlyOccursWhenTheWordIsBlocked()
+ {
+ var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeLeetSpeak();
+ var output = sut.Censor("he110");
+
+ Assert.That(output, Is.EqualTo("he110"));
+ }
+
+ [Test]
+ public void CanCatchLeetspeakReplacementsWithMoreThanOneLetter()
+ {
+ var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeLeetSpeak();
+ var output = sut.Censor("am()gu5 hello 5u5");
+
+ Assert.That(output, Is.EqualTo("******* hello ***"));
+ }
+
+ // Sanitizing special characters
+
+ [Test]
+ public void DoesNotSanitizeOutUncensoredSpecialCharacters()
+ {
+ var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeSpecialCharacters();
+ var output = sut.Censor("amogus!hello!sus");
+
+ Assert.That(output, Is.EqualTo("******!hello!***"));
+ }
+
+ [Test]
+ public void DoesSanitizeOutCensoredSpecialCharacters()
+ {
+ var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithSanitizeSpecialCharacters();
+ var output = sut.Censor("amo!gus hello s?us");
+
+ Assert.That(output, Is.EqualTo("***!*** hello *?**"));
+ }
+
+ // Unicode ranges
+
+ [Test]
+ public void SanitizesOutNonLatinCharaters()
+ {
+ var sut = new SimpleCensor().WithRanges([UnicodeRanges.BasicLatin, UnicodeRanges.Latin1Supplement]);
+ var output = sut.Censor("amogus Україна sus 日本");
+
+ Assert.That(output, Is.EqualTo("amogus sus "));
+ }
+
+ [Test]
+ public void SanitizesOutNonLatinOrCyrillicCharaters()
+ {
+ var sut = new SimpleCensor().WithRanges([UnicodeRanges.BasicLatin, UnicodeRanges.Latin1Supplement, UnicodeRanges.Cyrillic]);
+ var output = sut.Censor("amogus Україна sus 日本");
+
+ Assert.That(output, Is.EqualTo("amogus Україна sus "));
+ }
+
+ // False positives
+ [Test]
+ public void CanHandleFalsePositives()
+ {
+ var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithFalsePositives(["amogusus"]);
+ var output = sut.Censor("amogusus hello amogus hello sus");
+
+ Assert.That(output, Is.EqualTo("amogusus hello ****** hello ***"));
+ }
+
+ // False negatives
+ [Test]
+ public void CanHandleFalseNegatives()
+ {
+ var sut = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]).WithFalsePositives(["amogusus"]).WithFalseNegatives(["susamogusus"]);
+ var output = sut.Censor("susamogusus hello amogus hello sus amogusus");
+
+ Assert.That(output, Is.EqualTo("*********** hello ****** hello *** ********"));
+ }
+}