]> git.smokeofanarchy.ru Git - space-station-14.git/commitdiff
SS14-17313 Chatfactor: Chat Censorship Systems (#25908)
author: Hannah Giovanna Dawson <karakkaraz@gmail.com>
Mon, 25 Mar 2024 23:50:20 +0000 (23:50 +0000)
committer: GitHub <noreply@github.com>
Mon, 25 Mar 2024 23:50:20 +0000 (10:50 +1100)
* SS14-17313 Chat Censorship Systems

Adds some systems to manage chat censorship:

1. No-op: does nothing
2. SimpleCensor: a regex-free censor with a variety of rules to use
3. RegexCensor: a censor that uses regex.

This exposes a singleton backed by a builder pattern (ChatCensor) that
is set up, probably during the code init phase, and then globally available
for your censorship needs.

* Migrate to Shared

* Add a reset function to the builder.

* Resolve PJB's feedback; add unit tests

Content.Shared/Chat/V2/Moderation/ChatCensor.cs [new file with mode: 0644]
Content.Shared/Chat/V2/Moderation/RegexCensor.cs [new file with mode: 0644]
Content.Shared/Chat/V2/Moderation/SimpleCensor.cs [new file with mode: 0644]
Content.Tests/Shared/Chat/V2/Moderation/SimpleCensor.cs [new file with mode: 0644]

diff --git a/Content.Shared/Chat/V2/Moderation/ChatCensor.cs b/Content.Shared/Chat/V2/Moderation/ChatCensor.cs
new file mode 100644 (file)
index 0000000..b5d6aa0
--- /dev/null
@@ -0,0 +1,59 @@
+using System.Linq;
+
+namespace Content.Shared.Chat.V2.Moderation;
+
+/// <summary>
+/// Contract for a chat censor: something that takes a message and produces a (possibly rewritten) copy of it.
+/// </summary>
+public interface IChatCensor
+{
+    /// <summary>
+    /// Censors <paramref name="input"/>.
+    /// </summary>
+    /// <param name="input">The text to inspect.</param>
+    /// <param name="output">Receives the resulting text.</param>
+    /// <param name="replaceWith">The character used for masking; defaults to '*'.</param>
+    /// <returns>
+    /// NOTE(review): RegexCensor and SimpleCensor return true when the output differs from the input;
+    /// confirm that "true means something was changed" is the intended contract for all implementations.
+    /// </returns>
+    public bool Censor(string input, out string output, char replaceWith = '*');
+}
+
+/// <summary>
+/// A censor made of other censors. Each inner censor is applied in enumeration order to the text
+/// produced by the censors before it, so the final output reflects every censor's changes.
+/// </summary>
+public sealed class CompoundChatCensor(IEnumerable<IChatCensor> censors) : IChatCensor
+{
+    /// <summary>
+    /// Runs every inner censor over the message, chaining their results.
+    /// </summary>
+    /// <param name="input">The text to censor.</param>
+    /// <param name="output">
+    /// The text after all censors have run. Equal to <paramref name="input"/> when no censor changed anything
+    /// (including when there are no censors at all).
+    /// </param>
+    /// <param name="replaceWith">The masking character forwarded to every inner censor.</param>
+    /// <returns>True if at least one inner censor reported a change.</returns>
+    public bool Censor(string input, out string output, char replaceWith = '*')
+    {
+        var censored = false;
+
+        // Start from the untouched input so that an empty censor list is a no-op.
+        output = input;
+
+        foreach (var censor in censors)
+        {
+            // Feed each censor the text produced so far rather than the raw input; otherwise only the
+            // last censor's work (or none of it) would survive.
+            if (!censor.Censor(output, out var next, replaceWith))
+            {
+                continue;
+            }
+
+            censored = true;
+            output = next;
+        }
+
+        return censored;
+    }
+}
+
+/// <summary>
+/// Accumulates censors and turns them into a single compound censor on demand.
+/// </summary>
+public sealed class ChatCensorFactory
+{
+    private List<IChatCensor> _censors = new();
+
+    /// <summary>
+    /// Registers a censor to be included in subsequently built compound censors.
+    /// </summary>
+    /// <param name="censor">The censor to append to the current set.</param>
+    public void With(IChatCensor censor) => _censors.Add(censor);
+
+    /// <summary>
+    /// Creates a compound censor from a snapshot of everything registered so far.
+    /// </summary>
+    /// <returns>A censor wrapping a copy of the currently registered censors.</returns>
+    public IChatCensor Build()
+    {
+        // Copy to an array so later With/Reset calls do not affect censors that were already built.
+        var snapshot = _censors.ToArray();
+
+        return new CompoundChatCensor(snapshot);
+    }
+
+    /// <summary>
+    /// Discards every registered censor so the next build starts from a clean slate.
+    /// </summary>
+    /// <returns>True if at least one censor had been registered before the reset.</returns>
+    public bool Reset()
+    {
+        var hadCensors = _censors.Count != 0;
+
+        _censors = new List<IChatCensor>();
+
+        return hadCensors;
+    }
+}
diff --git a/Content.Shared/Chat/V2/Moderation/RegexCensor.cs b/Content.Shared/Chat/V2/Moderation/RegexCensor.cs
new file mode 100644 (file)
index 0000000..cd47bf0
--- /dev/null
@@ -0,0 +1,15 @@
+using System.Text.RegularExpressions;
+
+namespace Content.Shared.Chat.V2.Moderation;
+
+/// <summary>
+/// A censor driven by a caller-supplied regular expression.
+/// </summary>
+public sealed class RegexCensor(Regex censorInstruction) : IChatCensor
+{
+    private readonly Regex _censorInstruction = censorInstruction;
+
+    /// <summary>
+    /// Replaces every match of the configured expression in <paramref name="input"/>.
+    /// </summary>
+    /// <param name="input">The text to censor.</param>
+    /// <param name="output">The text with each match substituted.</param>
+    /// <param name="replaceWith">
+    /// The character each match is replaced with. The whole match becomes this one character,
+    /// regardless of the match's length.
+    /// </param>
+    /// <returns>True if the substitution changed the text.</returns>
+    public bool Censor(string input, out string output, char replaceWith = '*')
+    {
+        var replacement = replaceWith.ToString();
+        output = _censorInstruction.Replace(input, replacement);
+
+        return input != output;
+    }
+}
diff --git a/Content.Shared/Chat/V2/Moderation/SimpleCensor.cs b/Content.Shared/Chat/V2/Moderation/SimpleCensor.cs
new file mode 100644 (file)
index 0000000..a6bb70d
--- /dev/null
@@ -0,0 +1,340 @@
+using System.Collections.Frozen;
+using System.Linq;
+using System.Text;
+using System.Text.Unicode;
+
+namespace Content.Shared.Chat.V2.Moderation;
+
+/// <summary>
+/// A basic, regex-free censor. Not bullet-proof.
+/// Matching is done on a normalised copy of the message (spaces removed, optional leetspeak and
+/// special-character handling); the result is then mapped back onto the original characters so that
+/// only the censored portions are masked.
+/// </summary>
+public sealed class SimpleCensor : IChatCensor
+{
+    // Common substitution symbols are replaced with one of the characters they commonly substitute.
+    private bool _shouldSanitizeLeetspeak;
+    private FrozenDictionary<char, char> _leetspeakReplacements = FrozenDictionary<char, char>.Empty;
+
+    // Special characters are dropped while matching and put back when the output is reconstructed.
+    private bool _shouldSanitizeSpecialCharacters;
+    private HashSet<char> _specialCharacterReplacements = [];
+
+    // Censored words are removed unless they're a false positive (e.g. Scunthorpe)
+    private string[] _censoredWords = Array.Empty<string>();
+    private string[] _falsePositives = Array.Empty<string>();
+
+    // False negatives are censored words that contain a false positive.
+    private string[] _falseNegatives = Array.Empty<string>();
+
+    // What unicode ranges are allowed? If this array is empty, don't filter by range.
+    private UnicodeRange[] _allowedUnicodeRanges = Array.Empty<UnicodeRange>();
+
+    /// <summary>
+    /// Censors the input string.
+    /// </summary>
+    /// <param name="input">The input string</param>
+    /// <param name="output">The censored string</param>
+    /// <param name="replaceWith">The character to replace with</param>
+    /// <returns>
+    /// True if the output differs from the input, i.e. something was masked or stripped
+    /// (stripping characters outside the allowed unicode ranges counts as a change).
+    /// </returns>
+    public bool Censor(string input, out string output, char replaceWith = '*')
+    {
+        output = Censor(input, replaceWith);
+
+        return !string.Equals(input, output);
+    }
+
+    /// <summary>
+    /// Censors the input string and returns the result directly.
+    /// </summary>
+    /// <param name="input">The input string</param>
+    /// <param name="replaceWith">The character each censored character is replaced with</param>
+    /// <returns>
+    /// The input with characters outside the allowed unicode ranges removed and every character that is
+    /// part of a censored word replaced by <paramref name="replaceWith"/>.
+    /// </returns>
+    public string Censor(string input, char replaceWith = '*')
+    {
+        // We flat-out ban anything not in the allowed unicode ranges, stripping them
+        input = SanitizeOutBlockedUnicode(input);
+
+        // Kept so the uncensored characters (and their original casing/symbols) can be restored at the end.
+        var originalInput = input.ToCharArray();
+
+        input = SanitizeInput(input);
+
+        // Working buffer: starts as the sanitized text and gets masked in place.
+        var censored = input.ToList();
+
+        // Remove false negatives
+        input = CheckProfanity(input, censored, _falseNegatives, replaceWith);
+
+        // Get false positives. This runs on the buffer *after* false negatives were masked, so a false
+        // positive that sits inside a false negative is not rescued.
+        var falsePositives = FindFalsePositives(censored, replaceWith);
+
+        // Remove censored words
+        CheckProfanity(input, censored, _censoredWords, replaceWith);
+
+        // Reconstruct false positives
+        for (var i = 0; i < falsePositives.Length; i++)
+        {
+            if (falsePositives[i] != replaceWith)
+            {
+                censored[i] = falsePositives[i];
+            }
+        }
+
+        // SanitizeInput folds "()" into a single 'o' whenever either sanitizer is enabled.
+        var foldsParentheses = _shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters;
+
+        // Walk the original text and re-align the buffer with it: re-insert what SanitizeInput dropped
+        // and restore the original character wherever nothing was masked.
+        for (var i = 0; i < originalInput.Length; i++)
+        {
+            var original = originalInput[i];
+
+            if (original == ' ')
+            {
+                censored.Insert(i, ' ');
+
+                continue;
+            }
+
+            // "()" must be handled before the special-character check: both parentheses are special
+            // characters by default, and re-inserting them as such would leave the folded 'o' behind
+            // as an extra character.
+            if (foldsParentheses && original == '(' && i + 1 < originalInput.Length && originalInput[i + 1] == ')')
+            {
+                var masked = censored[i] == replaceWith;
+
+                // The single folded slot becomes two slots again so both strings line up.
+                censored.Insert(i + 1, masked ? replaceWith : ')');
+
+                if (!masked)
+                {
+                    censored[i] = '(';
+                }
+
+                // The ')' has been dealt with; skip it.
+                i++;
+
+                continue;
+            }
+
+            if (_shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(original))
+            {
+                censored.Insert(i, original);
+
+                continue;
+            }
+
+            if (censored[i] != replaceWith)
+            {
+                censored[i] = original;
+            }
+        }
+
+        // SO says this is fast...
+        return string.Concat(censored);
+    }
+
+    /// <summary>
+    /// Adds a l33tsp34k sanitization rule. Enabling it (re)loads the default replacement tables, so call
+    /// <see cref="WithLeetspeakReplacements"/> afterwards if a custom table is wanted.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithSanitizeLeetSpeak()
+    {
+        _shouldSanitizeLeetspeak = true;
+
+        return BuildCharacterReplacements();
+    }
+
+    /// <summary>
+    /// Adds a special-character sanitization rule: special characters are ignored while matching and
+    /// restored in the output. Enabling it (re)loads the default replacement tables, so call
+    /// <see cref="WithSpecialCharacterReplacements"/> afterwards if a custom set is wanted.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithSanitizeSpecialCharacters()
+    {
+        _shouldSanitizeSpecialCharacters = true;
+
+        return BuildCharacterReplacements();
+    }
+
+    /// <summary>
+    /// Restricts messages to the given unicode ranges; anything outside them is stripped.
+    /// An empty array disables range filtering.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithRanges(UnicodeRange[] ranges)
+    {
+        _allowedUnicodeRanges = ranges;
+
+        return this;
+    }
+
+    /// <summary>
+    /// Sets the words to censor. Matching is case-insensitive and also finds words inside other words.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithCustomDictionary(string[] naughtyWords)
+    {
+        _censoredWords = naughtyWords;
+
+        return this;
+    }
+
+    /// <summary>
+    /// Sets the words that must be left alone even though they contain a censored word.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithFalsePositives(string[] falsePositives)
+    {
+        _falsePositives = falsePositives;
+
+        return this;
+    }
+
+    /// <summary>
+    /// Sets the words that must be censored even though they contain a false positive.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithFalseNegatives(string[] falseNegatives)
+    {
+        _falseNegatives = falseNegatives;
+
+        return this;
+    }
+
+    /// <summary>
+    /// Replaces the leetspeak table (symbol -> letter it stands in for). This does not enable leetspeak
+    /// sanitization by itself.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithLeetspeakReplacements(Dictionary<char, char> replacements)
+    {
+        _leetspeakReplacements = replacements.ToFrozenDictionary();
+
+        return this;
+    }
+
+    /// <summary>
+    /// Replaces the set of special characters. Only the dictionary's keys are used: special characters
+    /// are dropped while matching and restored afterwards, so the mapped values have no effect.
+    /// This does not enable special-character sanitization by itself.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    public SimpleCensor WithSpecialCharacterReplacements(Dictionary<char, char> replacements)
+    {
+        // Previously this overwrote the leetspeak table instead of configuring the special characters.
+        _specialCharacterReplacements = new HashSet<char>(replacements.Keys);
+
+        return this;
+    }
+
+    /// <summary>
+    /// Masks every case-insensitive occurrence of each word in <paramref name="censored"/>.
+    /// </summary>
+    /// <param name="input">The sanitized text to search; must line up index-for-index with <paramref name="censored"/>.</param>
+    /// <param name="censored">The buffer that receives the masking characters.</param>
+    /// <param name="words">The words to mask.</param>
+    /// <param name="replaceWith">The masking character.</param>
+    /// <returns><paramref name="input"/>, unchanged.</returns>
+    private string CheckProfanity(string input, List<char> censored, string[] words, char replaceWith = '*')
+    {
+        foreach (var word in words)
+        {
+            // An empty word matches at every index without advancing the search, which would loop forever.
+            if (string.IsNullOrEmpty(word))
+            {
+                continue;
+            }
+
+            var wordLength = word.Length;
+            var endOfFoundWord = 0;
+            var foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+
+            while (foundIndex > -1)
+            {
+                endOfFoundWord = foundIndex + wordLength;
+
+                for (var i = 0; i < wordLength; i++)
+                {
+                    censored[foundIndex + i] = replaceWith;
+                }
+
+                foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+            }
+        }
+
+        return input;
+    }
+
+    /// <summary>
+    /// Builds a mask of the same length as <paramref name="chars"/> that contains the original characters
+    /// wherever a false positive occurs and <paramref name="replaceWith"/> everywhere else.
+    /// </summary>
+    private char[] FindFalsePositives(List<char> chars, char replaceWith = '*')
+    {
+        var input = string.Concat(chars);
+
+        var output = Enumerable.Repeat(replaceWith, input.Length).ToArray();
+        var inputChars = input.ToArray();
+
+        foreach (var word in _falsePositives)
+        {
+            // An empty word matches at every index without advancing the search, which would loop forever.
+            if (string.IsNullOrEmpty(word))
+            {
+                continue;
+            }
+
+            var wordLength = word.Length;
+            var endOfFoundWord = 0;
+            var foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+
+            while (foundIndex > -1)
+            {
+                endOfFoundWord = foundIndex + wordLength;
+
+                for (var i = foundIndex; i < endOfFoundWord; i++)
+                {
+                    output[i] = inputChars[i];
+                }
+
+                foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
+            }
+        }
+
+        return output;
+    }
+
+    /// <summary>
+    /// Produces the normalised text that words are matched against: spaces are removed, and depending on
+    /// configuration "()" is folded into 'o', special characters are removed and leetspeak symbols are
+    /// mapped to letters.
+    /// </summary>
+    private string SanitizeInput(string input)
+    {
+        // "()" is a broad enough trick to beat censors that we should check for it broadly.
+        if (_shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters)
+        {
+            input = input.Replace("()", "o");
+        }
+
+        var sb = new StringBuilder();
+
+        // ReSharper disable once ForeachCanBePartlyConvertedToQueryUsingAnotherGetEnumerator
+        foreach (var character in input)
+        {
+            // Special characters win over leetspeak: a symbol present in both tables is dropped, not mapped.
+            if (character == ' ' || _shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(character))
+            {
+                continue;
+            }
+
+            if (_shouldSanitizeLeetspeak && _leetspeakReplacements.TryGetValue(character, out var leetRepl))
+            {
+                sb.Append(leetRepl);
+
+                continue;
+            }
+
+            sb.Append(character);
+        }
+
+        return sb.ToString();
+    }
+
+    /// <summary>
+    /// Returns the input with every character outside the allowed unicode ranges removed.
+    /// If no ranges are configured the input is returned unchanged.
+    /// </summary>
+    private string SanitizeOutBlockedUnicode(string input)
+    {
+        if (_allowedUnicodeRanges.Length <= 0)
+        {
+            return input;
+        }
+
+        var sb = new StringBuilder();
+
+        foreach (var symbol in input.EnumerateRunes())
+        {
+            // ReSharper disable once LoopCanBeConvertedToQuery
+            foreach (var range in _allowedUnicodeRanges)
+            {
+                if (symbol.Value < range.FirstCodePoint || symbol.Value >= range.FirstCodePoint + range.Length)
+                    continue;
+
+                sb.Append(symbol);
+
+                // One matching range is enough; don't append the same rune twice for overlapping ranges.
+                break;
+            }
+        }
+
+        return sb.ToString();
+    }
+
+    /// <summary>
+    /// Loads the default special-character set and/or leetspeak table for whichever sanitizers are
+    /// currently enabled, overwriting any custom tables previously supplied for them.
+    /// </summary>
+    /// <returns>The censor for further configuration</returns>
+    private SimpleCensor BuildCharacterReplacements()
+    {
+        if (_shouldSanitizeSpecialCharacters)
+        {
+            _specialCharacterReplacements =
+            [
+                '-',
+                '_',
+                '|',
+                '.',
+                ',',
+                '(',
+                ')',
+                '<',
+                '>',
+                '"',
+                '`',
+                '~',
+                '*',
+                '&',
+                '%',
+                '$',
+                '#',
+                '@',
+                '!',
+                '?',
+                '+'
+            ];
+        }
+
+        if (_shouldSanitizeLeetspeak)
+        {
+            _leetspeakReplacements = new Dictionary<char, char>
+            {
+                ['4'] = 'a',
+                ['$'] = 's',
+                ['!'] = 'i',
+                ['+'] = 't',
+                ['#'] = 'h',
+                ['@'] = 'a',
+                ['0'] = 'o',
+                ['1'] = 'i', // also obviously can be l; gamer-words need i's more though.
+                ['7'] = 'l',
+                ['3'] = 'e',
+                ['5'] = 's',
+                ['9'] = 'g',
+                ['<'] = 'c'
+            }.ToFrozenDictionary();
+        }
+
+        return this;
+    }
+}
diff --git a/Content.Tests/Shared/Chat/V2/Moderation/SimpleCensor.cs b/Content.Tests/Shared/Chat/V2/Moderation/SimpleCensor.cs
new file mode 100644 (file)
index 0000000..09870af
--- /dev/null
@@ -0,0 +1,162 @@
+using System.Text.Unicode;
+using Content.Shared.Chat.V2.Moderation;
+using NUnit.Framework;
+
+namespace Content.Tests.Shared.Chat.V2.Moderation;
+
+/// <summary>
+/// Unit tests for <see cref="SimpleCensor"/>, grouped by the feature being exercised.
+/// </summary>
+public sealed class SimpleCensorTests
+{
+    // Basics - custom dictionary only
+
+    [Test]
+    public void CanCensorASingleWord()
+    {
+        var censor = new SimpleCensor().WithCustomDictionary(["amogus"]);
+
+        var result = censor.Censor("hello amogus");
+
+        Assert.That(result, Is.EqualTo("hello ******"));
+    }
+
+    [Test]
+    public void CanCensorMultipleWordInstances()
+    {
+        var censor = new SimpleCensor().WithCustomDictionary(["amogus"]);
+
+        var result = censor.Censor("amogus hello amogus");
+
+        Assert.That(result, Is.EqualTo("****** hello ******"));
+    }
+
+    [Test]
+    public void CanCensorMultipleWords()
+    {
+        var censor = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+
+        var result = censor.Censor("amogus hello sus");
+
+        Assert.That(result, Is.EqualTo("****** hello ***"));
+    }
+
+    [Test]
+    public void CanUseDifferentCensorSymbols()
+    {
+        var censor = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+
+        var result = censor.Censor("amogus hello sus", '#');
+
+        Assert.That(result, Is.EqualTo("###### hello ###"));
+    }
+
+    [Test]
+    public void CanCatchCapitalizedWords()
+    {
+        var censor = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+
+        var result = censor.Censor("AMOGUS hello SUS");
+
+        Assert.That(result, Is.EqualTo("****** hello ***"));
+    }
+
+    [Test]
+    public void CanCatchWordsWithSomeCaptialsInThem()
+    {
+        var censor = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+
+        var result = censor.Censor("AmoGuS hello SuS");
+
+        Assert.That(result, Is.EqualTo("****** hello ***"));
+    }
+
+    [Test]
+    public void CanCatchWordsHiddenInsideOtherWords()
+    {
+        var censor = new SimpleCensor().WithCustomDictionary(["amogus", "sus"]);
+
+        var result = censor.Censor("helamoguslo suspicious");
+
+        Assert.That(result, Is.EqualTo("hel******lo ***picious"));
+    }
+
+    // Leetspeak sanitization
+
+    [Test]
+    public void CanSanitizeLeetspeak()
+    {
+        var censor = new SimpleCensor()
+            .WithCustomDictionary(["amogus", "sus"])
+            .WithSanitizeLeetSpeak();
+
+        var result = censor.Censor("am0gu5 hello 5u5");
+
+        Assert.That(result, Is.EqualTo("****** hello ***"));
+    }
+
+    [Test]
+    public void SanitizingLeetspeakOnlyOccursWhenTheWordIsBlocked()
+    {
+        var censor = new SimpleCensor()
+            .WithCustomDictionary(["amogus", "sus"])
+            .WithSanitizeLeetSpeak();
+
+        // Nothing here is in the dictionary, so the digits must come back untouched.
+        var result = censor.Censor("he110");
+
+        Assert.That(result, Is.EqualTo("he110"));
+    }
+
+    [Test]
+    public void CanCatchLeetspeakReplacementsWithMoreThanOneLetter()
+    {
+        var censor = new SimpleCensor()
+            .WithCustomDictionary(["amogus", "sus"])
+            .WithSanitizeLeetSpeak();
+
+        // "()" stands in for 'o'; both of its characters are masked, hence seven stars.
+        var result = censor.Censor("am()gu5 hello 5u5");
+
+        Assert.That(result, Is.EqualTo("******* hello ***"));
+    }
+
+    // Special-character sanitization
+
+    [Test]
+    public void DoesNotSanitizeOutUncensoredSpecialCharacters()
+    {
+        var censor = new SimpleCensor()
+            .WithCustomDictionary(["amogus", "sus"])
+            .WithSanitizeSpecialCharacters();
+
+        var result = censor.Censor("amogus!hello!sus");
+
+        Assert.That(result, Is.EqualTo("******!hello!***"));
+    }
+
+    [Test]
+    public void DoesSanitizeOutCensoredSpecialCharacters()
+    {
+        var censor = new SimpleCensor()
+            .WithCustomDictionary(["amogus", "sus"])
+            .WithSanitizeSpecialCharacters();
+
+        var result = censor.Censor("amo!gus hello s?us");
+
+        Assert.That(result, Is.EqualTo("***!*** hello *?**"));
+    }
+
+    // Unicode range filtering
+
+    [Test]
+    public void SanitizesOutNonLatinCharaters()
+    {
+        var censor = new SimpleCensor()
+            .WithRanges([UnicodeRanges.BasicLatin, UnicodeRanges.Latin1Supplement]);
+
+        var result = censor.Censor("amogus Україна sus 日本");
+
+        Assert.That(result, Is.EqualTo("amogus  sus "));
+    }
+
+    [Test]
+    public void SanitizesOutNonLatinOrCyrillicCharaters()
+    {
+        var censor = new SimpleCensor()
+            .WithRanges([UnicodeRanges.BasicLatin, UnicodeRanges.Latin1Supplement, UnicodeRanges.Cyrillic]);
+
+        var result = censor.Censor("amogus Україна sus 日本");
+
+        Assert.That(result, Is.EqualTo("amogus Україна sus "));
+    }
+
+    // False positives: words that contain a censored word but must be left alone
+
+    [Test]
+    public void CanHandleFalsePositives()
+    {
+        var censor = new SimpleCensor()
+            .WithCustomDictionary(["amogus", "sus"])
+            .WithFalsePositives(["amogusus"]);
+
+        var result = censor.Censor("amogusus hello amogus hello sus");
+
+        Assert.That(result, Is.EqualTo("amogusus hello ****** hello ***"));
+    }
+
+    // False negatives: words that contain a false positive but must still be censored
+
+    [Test]
+    public void CanHandleFalseNegatives()
+    {
+        var censor = new SimpleCensor()
+            .WithCustomDictionary(["amogus", "sus"])
+            .WithFalsePositives(["amogusus"])
+            .WithFalseNegatives(["susamogusus"]);
+
+        // Spaces are ignored while matching, so the trailing "sus amogusus" also reads as the
+        // false negative "susamogusus" and is masked in full.
+        var result = censor.Censor("susamogusus hello amogus hello sus amogusus");
+
+        Assert.That(result, Is.EqualTo("*********** hello ****** hello *** ********"));
+    }
+}