NVIDIA · jmartin-tech · Sep 19, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/garak/detectors/base.py b/garak/detectors/base.py
@@ -162,10 +162,13 @@ def detect(self, attempt: garak.attempt.Attempt) -> List[float]:
 class StringDetector(Detector):
     """Subclass of Detector using list of substrings as detection triggers"""
 
+    DEFAULT_PARAMS = Detector.DEFAULT_PARAMS | {
+        "matchtype": "str",  # "str" or "word"
+    }
+
     def __init__(self, substrings, config_root=_config):
         super().__init__(config_root=config_root)
         self.substrings = substrings
-        self.matchtype = "str"  # str or word
 
     def detect(
         self, attempt: garak.attempt.Attempt, case_sensitive=False

diff --git a/garak/detectors/mitigation.py b/garak/detectors/mitigation.py
@@ -17,8 +17,11 @@ class MitigationBypass(StringDetector):
     mitigation messages listed in the ShareGPT dataset used in training
     the Vicuna models."""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "str",  # NOTE(review): the removed class attribute said "word"; "str" is what the removed StringDetector.__init__ assignment set -- confirm this is intended
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     doc_uri = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered"  # with updates
     tags = ["avid-effect:security:S0403", "avid-effect:performance:P0204"]
 

diff --git a/garak/detectors/riskywords.py b/garak/detectors/riskywords.py
@@ -40,8 +40,11 @@
 class SurgeProfanitySexual(StringDetector):
     """Surge AI list: sexual anatomy / sexual acts"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = ["avid-effect:performance:P0401"]
 
     def __init__(self, config_root=_config):
@@ -52,8 +55,11 @@ def __init__(self, config_root=_config):
 class SurgeProfanityBodilyFluids(StringDetector):
     """Surge AI list: bodily fluids / excrement"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = ["avid-effect:performance:P0401"]
 
     def __init__(self, config_root=_config):
@@ -64,8 +70,11 @@ def __init__(self, config_root=_config):
 class SurgeProfanitySexualOrientationGender(StringDetector):
     """Surge AI list: sexual orientation / gender"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = [
         "avid-effect:ethics:E0101",
         "avid-effect:ethics:E0301",
@@ -80,8 +89,11 @@ def __init__(self, config_root=_config):
 class SurgeProfanityRacialEthnic(StringDetector):
     """Surge AI list: racial / ethnic"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = [
         "avid-effect:ethics:E0101",
         "avid-effect:ethics:E0301",
@@ -96,8 +108,11 @@ def __init__(self, config_root=_config):
 class SurgeProfanityMentalDisability(StringDetector):
     """Surge AI list: mental disability"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = [
         "avid-effect:ethics:E0101",
         "avid-effect:ethics:E0301",
@@ -112,8 +127,11 @@ def __init__(self, config_root=_config):
 class SurgeProfanityPhysicalDisability(StringDetector):
     """Surge AI list: physical disability"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = [
         "avid-effect:ethics:E0101",
         "avid-effect:ethics:E0301",
@@ -128,8 +146,11 @@ def __init__(self, config_root=_config):
 class SurgeProfanityPhysicalAttributes(StringDetector):
     """Surge AI list: physical attributes"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]
 
     def __init__(self, config_root=_config):
@@ -140,8 +161,11 @@ def __init__(self, config_root=_config):
 class SurgeProfanityAnimalReferences(StringDetector):
     """Surge AI list: animal references"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]
 
     def __init__(self, config_root=_config):
@@ -152,8 +176,11 @@ def __init__(self, config_root=_config):
 class SurgeProfanityReligious(StringDetector):
     """Surge AI list: religious offense"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]
 
     def __init__(self, config_root=_config):
@@ -164,8 +191,11 @@ def __init__(self, config_root=_config):
 class SurgeProfanityPolitical(StringDetector):
     """Surge AI list: political"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     bcp47 = "en"
-    matchtype = "word"
     tags = [
         "avid-effect:ethics:E0101",
         "avid-effect:ethics:E0301",
@@ -181,9 +211,12 @@ def __init__(self, config_root=_config):
 class OfcomOffensiveGeneral(StringDetector):
     """UK OFCOM potentially offensive list: general"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     term = "general"
     bcp47 = "en"
-    matchtype = "word"
     tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]
 
     def __init__(self, config_root=_config):
@@ -194,9 +227,12 @@ def __init__(self, config_root=_config):
 class OfcomOffensiveSexual(StringDetector):
     """UK OFCOM potentially offensive list: sexual"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     term = "sexual"
     bcp47 = "en"
-    matchtype = "word"
     tags = ["avid-effect:performance:P0401"]
 
     def __init__(self, config_root=_config):
@@ -207,9 +243,12 @@ def __init__(self, config_root=_config):
 class OfcomOffensiveDiscriminatory(StringDetector):
     """UK OFCOM potentially offensive list: discriminatory"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     term = "discriminatory"
     bcp47 = "en"
-    matchtype = "word"
     tags = [
         "avid-effect:ethics:E0101",
         "avid-effect:ethics:E0301",
@@ -224,9 +263,12 @@ def __init__(self, config_root=_config):
 class OfcomOffensiveSexGender(StringDetector):
     """UK OFCOM potentially offensive list: sex & gender"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     term = "sexgender"
     bcp47 = "en"
-    matchtype = "word"
     tags = [
         "avid-effect:ethics:E0101",
         "avid-effect:ethics:E0301",
@@ -241,9 +283,12 @@ def __init__(self, config_root=_config):
 class OfcomOffensiveMentalHealth(StringDetector):
     """UK OFCOM potentially offensive list: mental health"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     term = "mentalhealth"
     bcp47 = "en"
-    matchtype = "word"
     tags = [
         "avid-effect:ethics:E0101",
         "avid-effect:ethics:E0301",
@@ -258,9 +303,12 @@ def __init__(self, config_root=_config):
 class OfcomOffensiveRaceEthnic(StringDetector):
     """UK OFCOM potentially offensive list: race / ethnic"""
 
+    DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
+        "matchtype": "word",
+    }
+
     term = "raceethnic"
     bcp47 = "en"
-    matchtype = "word"
     tags = [
         "avid-effect:ethics:E0101",
         "avid-effect:ethics:E0301",

diff --git a/tests/detectors/test_detectors_riskywords.py b/tests/detectors/test_detectors_riskywords.py
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+import garak._plugins
+import garak.attempt
+import garak.detectors.base
+
+SURGE_DETECTORS = [  # plugin names of the active detectors.riskywords.Surge* detectors
+    classname
+    for (classname, active) in garak._plugins.enumerate_plugins("detectors")
+    if classname.startswith("detectors.riskywords.Surge") and active is True
+]
+
+
+@pytest.mark.parametrize("classname", SURGE_DETECTORS)  # one test case per collected Surge detector
+def test_surge_word_match(classname):
+    d = garak._plugins.load_plugin(classname)  # load the detector by its plugin name
+    assert isinstance(d, garak.detectors.base.StringDetector)
+    assert d.matchtype == "word"  # each Surge detector must resolve matchtype to "word"
+    a = garak.attempt.Attempt(prompt="test prompt")
+    test_strings = [  # NOTE(review): presumably text that only embeds listed terms inside longer words -- confirm against the Surge lists
+        "",
+        "an assault occurred",
+        "assault",
+        "Mr. Cummings",
+        "AFEG is a guideline for fire safety",
+        "chinks in the armour",
+        "penistone",
+        "scunthorpe",
+        "throttle retardation",
+        "crowing",
+    ]
+    a.outputs = test_strings
+    results = d.detect(a)
+    assert results == [0.0] * len(test_strings)  # expect a 0.0 score for every output