Skip to content

Commit

Permalink
bug: set match type correctly in riskywords detectors (#918)
Browse files Browse the repository at this point in the history
* move riskyword string detectors over to configurable
* fix the match type in riskywords
* add tests for surge detectors
  • Loading branch information
jmartin-tech authored Sep 19, 2024
2 parents 2593777 + 5ffc870 commit 89143c8
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 18 deletions.
5 changes: 4 additions & 1 deletion garak/detectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,13 @@ def detect(self, attempt: garak.attempt.Attempt) -> List[float]:
class StringDetector(Detector):
"""Subclass of Detector using list of substrings as detection triggers"""

DEFAULT_PARAMS = Detector.DEFAULT_PARAMS | {
"matchtype": "str", # "str" or "word"
}

def __init__(self, substrings, config_root=_config):
super().__init__(config_root=config_root)
self.substrings = substrings
self.matchtype = "str" # str or word

def detect(
self, attempt: garak.attempt.Attempt, case_sensitive=False
Expand Down
5 changes: 4 additions & 1 deletion garak/detectors/mitigation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@ class MitigationBypass(StringDetector):
mitigation messages listed in the ShareGPT dataset used in training
the Vicuna models."""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "str",
}

bcp47 = "en"
matchtype = "word"
doc_uri = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered" # with updates
tags = ["avid-effect:security:S0403", "avid-effect:performance:P0204"]

Expand Down
80 changes: 64 additions & 16 deletions garak/detectors/riskywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,11 @@
class SurgeProfanitySexual(StringDetector):
"""Surge AI list: sexual anatomy / sexual acts"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -52,8 +55,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityBodilyFluids(StringDetector):
"""Surge AI list: bodily fluids / excrement"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -64,8 +70,11 @@ def __init__(self, config_root=_config):
class SurgeProfanitySexualOrientationGender(StringDetector):
"""Surge AI list: sexual orientation / gender"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -80,8 +89,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityRacialEthnic(StringDetector):
"""Surge AI list: racial / ethnic"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -96,8 +108,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityMentalDisability(StringDetector):
"""Surge AI list: mental disability"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -112,8 +127,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityPhysicalDisability(StringDetector):
"""Surge AI list: physical disability"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -128,8 +146,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityPhysicalAttributes(StringDetector):
"""Surge AI list: physical attributes"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -140,8 +161,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityAnimalReferences(StringDetector):
"""Surge AI list: animal references"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -152,8 +176,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityReligious(StringDetector):
"""Surge AI list: religious offense"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -164,8 +191,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityPolitical(StringDetector):
"""Surge AI list: political"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -181,9 +211,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveGeneral(StringDetector):
"""UK OFCOM potentially offensive list: general"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "general"
bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -194,9 +227,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveSexual(StringDetector):
"""UK OFCOM potentially offensive list: sexual"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "sexual"
bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -207,9 +243,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveDiscriminatory(StringDetector):
"""UK OFCOM potentially offensive list: discriminatory"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "discriminatory"
bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -224,9 +263,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveSexGender(StringDetector):
"""UK OFCOM potentially offensive list: sex & gender"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "sexgender"
bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -241,9 +283,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveMentalHealth(StringDetector):
"""UK OFCOM potentially offensive list: mental health"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "mentalhealth"
bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -258,9 +303,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveRaceEthnic(StringDetector):
"""UK OFCOM potentially offensive list: race / ethnic"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "raceethnic"
bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand Down
37 changes: 37 additions & 0 deletions tests/detectors/test_detectors_riskywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import pytest

import garak._plugins
import garak.attempt
import garak.detectors.base

# Names of every active detector plugin whose name starts with
# "detectors.riskywords.Surge"; used to parametrize the test below.
SURGE_DETECTORS = [
    plugin_name
    for (plugin_name, is_active) in garak._plugins.enumerate_plugins("detectors")
    if plugin_name.startswith("detectors.riskywords.Surge") and is_active is True
]


@pytest.mark.parametrize("classname", SURGE_DETECTORS)
def test_surge_word_match(classname):
    """Check that a Surge detector is word-matching and scores benign text as 0.0.

    The detector named by ``classname`` is loaded through the plugin loader,
    verified to be a ``StringDetector`` whose ``matchtype`` is ``"word"``, and
    then run against a fixed list of outputs that must all score ``0.0``.
    """
    detector = garak._plugins.load_plugin(classname)
    assert isinstance(detector, garak.detectors.base.StringDetector)
    assert detector.matchtype == "word"

    # Outputs that must not trigger any Surge detector.
    # NOTE(review): several look like they embed a listed term inside a longer
    # word (the classic "Scunthorpe problem") -- confirm against the Surge
    # word lists.
    benign_outputs = [
        "",
        "an assault occurred",
        "assault",
        "Mr. Cummings",
        "AFEG is a guideline for fire safety",
        "chinks in the armour",
        "penistone",
        "scunthorpe",
        "throttle retardation",
        "crowing",
    ]

    attempt = garak.attempt.Attempt(prompt="test prompt")
    attempt.outputs = benign_outputs

    scores = detector.detect(attempt)
    # One score per output, each exactly 0.0 (no hit).
    assert scores == [0.0] * len(benign_outputs)

0 comments on commit 89143c8

Please sign in to comment.