diff --git a/garak/detectors/base.py b/garak/detectors/base.py index 2c2b275dd..703db8264 100644 --- a/garak/detectors/base.py +++ b/garak/detectors/base.py @@ -162,10 +162,13 @@ def detect(self, attempt: garak.attempt.Attempt) -> List[float]: class StringDetector(Detector): """Subclass of Detector using list of substrings as detection triggers""" + DEFAULT_PARAMS = Detector.DEFAULT_PARAMS | { + "matchtype": "str", # "str" or "word" + } + def __init__(self, substrings, config_root=_config): super().__init__(config_root=config_root) self.substrings = substrings - self.matchtype = "str" # str or word def detect( self, attempt: garak.attempt.Attempt, case_sensitive=False diff --git a/garak/detectors/mitigation.py b/garak/detectors/mitigation.py index a6f9c00dd..8702e8bc8 100644 --- a/garak/detectors/mitigation.py +++ b/garak/detectors/mitigation.py @@ -17,8 +17,11 @@ class MitigationBypass(StringDetector): mitigation messages listed in the ShareGPT dataset used in training the Vicuna models.""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" doc_uri = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered" # with updates tags = ["avid-effect:security:S0403", "avid-effect:performance:P0204"] diff --git a/garak/detectors/riskywords.py b/garak/detectors/riskywords.py index 741522054..fe325a824 100644 --- a/garak/detectors/riskywords.py +++ b/garak/detectors/riskywords.py @@ -40,8 +40,11 @@ class SurgeProfanitySexual(StringDetector): """Surge AI list: sexual anatomy / sexual acts""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = ["avid-effect:performance:P0401"] def __init__(self, config_root=_config): @@ -52,8 +55,11 @@ def __init__(self, config_root=_config): class SurgeProfanityBodilyFluids(StringDetector): """Surge AI list: bodily fluids / excrement""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + 
"matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = ["avid-effect:performance:P0401"] def __init__(self, config_root=_config): @@ -64,8 +70,11 @@ def __init__(self, config_root=_config): class SurgeProfanitySexualOrientationGender(StringDetector): """Surge AI list: sexual orientation / gender""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = [ "avid-effect:ethics:E0101", "avid-effect:ethics:E0301", @@ -80,8 +89,11 @@ def __init__(self, config_root=_config): class SurgeProfanityRacialEthnic(StringDetector): """Surge AI list: racial / ethnic""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = [ "avid-effect:ethics:E0101", "avid-effect:ethics:E0301", @@ -96,8 +108,11 @@ def __init__(self, config_root=_config): class SurgeProfanityMentalDisability(StringDetector): """Surge AI list: mental disability""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = [ "avid-effect:ethics:E0101", "avid-effect:ethics:E0301", @@ -112,8 +127,11 @@ def __init__(self, config_root=_config): class SurgeProfanityPhysicalDisability(StringDetector): """Surge AI list: physical disability""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = [ "avid-effect:ethics:E0101", "avid-effect:ethics:E0301", @@ -128,8 +146,11 @@ def __init__(self, config_root=_config): class SurgeProfanityPhysicalAttributes(StringDetector): """Surge AI list: physical attributes""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"] def __init__(self, config_root=_config): @@ -140,8 +161,11 @@ def __init__(self, config_root=_config): class SurgeProfanityAnimalReferences(StringDetector): 
"""Surge AI list: animal references""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"] def __init__(self, config_root=_config): @@ -152,8 +176,11 @@ def __init__(self, config_root=_config): class SurgeProfanityReligious(StringDetector): """Surge AI list: religious offense""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"] def __init__(self, config_root=_config): @@ -164,8 +191,11 @@ def __init__(self, config_root=_config): class SurgeProfanityPolitical(StringDetector): """Surge AI list: political""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + bcp47 = "en" - matchtype = "word" tags = [ "avid-effect:ethics:E0101", "avid-effect:ethics:E0301", @@ -181,9 +211,12 @@ def __init__(self, config_root=_config): class OfcomOffensiveGeneral(StringDetector): """UK OFCOM potentially offensive list: general""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + term = "general" bcp47 = "en" - matchtype = "word" tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"] def __init__(self, config_root=_config): @@ -194,9 +227,12 @@ def __init__(self, config_root=_config): class OfcomOffensiveSexual(StringDetector): """UK OFCOM potentially offensive list: sexual""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + term = "sexual" bcp47 = "en" - matchtype = "word" tags = ["avid-effect:performance:P0401"] def __init__(self, config_root=_config): @@ -207,9 +243,12 @@ def __init__(self, config_root=_config): class OfcomOffensiveDiscriminatory(StringDetector): """UK OFCOM potentially offensive list: discriminatory""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + term = 
"discriminatory" bcp47 = "en" - matchtype = "word" tags = [ "avid-effect:ethics:E0101", "avid-effect:ethics:E0301", @@ -224,9 +263,12 @@ def __init__(self, config_root=_config): class OfcomOffensiveSexGender(StringDetector): """UK OFCOM potentially offensive list: sex & gender""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + term = "sexgender" bcp47 = "en" - matchtype = "word" tags = [ "avid-effect:ethics:E0101", "avid-effect:ethics:E0301", @@ -241,9 +283,12 @@ def __init__(self, config_root=_config): class OfcomOffensiveMentalHealth(StringDetector): """UK OFCOM potentially offensive list: mental health""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + term = "mentalhealth" bcp47 = "en" - matchtype = "word" tags = [ "avid-effect:ethics:E0101", "avid-effect:ethics:E0301", @@ -258,9 +303,12 @@ def __init__(self, config_root=_config): class OfcomOffensiveRaceEthnic(StringDetector): """UK OFCOM potentially offensive list: race / ethnic""" + DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | { + "matchtype": "word", + } + term = "raceethnic" bcp47 = "en" - matchtype = "word" tags = [ "avid-effect:ethics:E0101", "avid-effect:ethics:E0301", diff --git a/tests/detectors/test_detectors_riskywords.py b/tests/detectors/test_detectors_riskywords.py new file mode 100644 index 000000000..409dfbc34 --- /dev/null +++ b/tests/detectors/test_detectors_riskywords.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import garak._plugins +import garak.attempt +import garak.detectors.base + +SURGE_DETECTORS = [ + classname + for (classname, active) in garak._plugins.enumerate_plugins("detectors") + if classname.startswith("detectors.riskywords.Surge") and active is True +] + + +@pytest.mark.parametrize("classname", SURGE_DETECTORS) +def test_surge_word_match(classname): + d = garak._plugins.load_plugin(classname) + assert isinstance(d, garak.detectors.base.StringDetector) + assert d.matchtype == "word" + a = garak.attempt.Attempt(prompt="test prompt") + test_strings = [ + "", + "an assault occurred", + "assault", + "Mr. Cummings", + "AFEG is a guideline for fire safety", + "chinks in the armour", + "penistone", + "scunthorpe", + "throttle retardation", + "crowing", + ] + a.outputs = test_strings + results = d.detect(a) + assert results == [0.0] * len(test_strings)