Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bug: set match type correctly in riskywords detectors #918

Merged
merged 2 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion garak/detectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,13 @@ def detect(self, attempt: garak.attempt.Attempt) -> List[float]:
class StringDetector(Detector):
"""Subclass of Detector using list of substrings as detection triggers"""

DEFAULT_PARAMS = Detector.DEFAULT_PARAMS | {
"matchtype": "str", # "str" or "word"
}

def __init__(self, substrings, config_root=_config):
super().__init__(config_root=config_root)
self.substrings = substrings
self.matchtype = "str" # str or word

def detect(
self, attempt: garak.attempt.Attempt, case_sensitive=False
Expand Down
5 changes: 4 additions & 1 deletion garak/detectors/mitigation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@ class MitigationBypass(StringDetector):
mitigation messages listed in the ShareGPT dataset used in training
the Vicuna models."""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "str",
}

bcp47 = "en"
matchtype = "word"
doc_uri = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered" # with updates
tags = ["avid-effect:security:S0403", "avid-effect:performance:P0204"]

Expand Down
80 changes: 64 additions & 16 deletions garak/detectors/riskywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,11 @@
class SurgeProfanitySexual(StringDetector):
"""Surge AI list: sexual anatomy / sexual acts"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -52,8 +55,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityBodilyFluids(StringDetector):
"""Surge AI list: bodily fluids / excrement"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -64,8 +70,11 @@ def __init__(self, config_root=_config):
class SurgeProfanitySexualOrientationGender(StringDetector):
"""Surge AI list: sexual orientation / gender"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -80,8 +89,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityRacialEthnic(StringDetector):
"""Surge AI list: racial / ethnic"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -96,8 +108,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityMentalDisability(StringDetector):
"""Surge AI list: mental disability"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -112,8 +127,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityPhysicalDisability(StringDetector):
"""Surge AI list: physical disability"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -128,8 +146,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityPhysicalAttributes(StringDetector):
"""Surge AI list: physical attributes"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -140,8 +161,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityAnimalReferences(StringDetector):
"""Surge AI list: animal references"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -152,8 +176,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityReligious(StringDetector):
"""Surge AI list: religious offense"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -164,8 +191,11 @@ def __init__(self, config_root=_config):
class SurgeProfanityPolitical(StringDetector):
"""Surge AI list: political"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -181,9 +211,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveGeneral(StringDetector):
"""UK OFCOM potentially offensive list: general"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "general"
bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -194,9 +227,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveSexual(StringDetector):
"""UK OFCOM potentially offensive list: sexual"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "sexual"
bcp47 = "en"
matchtype = "word"
tags = ["avid-effect:performance:P0401"]

def __init__(self, config_root=_config):
Expand All @@ -207,9 +243,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveDiscriminatory(StringDetector):
"""UK OFCOM potentially offensive list: discriminatory"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "discriminatory"
bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -224,9 +263,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveSexGender(StringDetector):
"""UK OFCOM potentially offensive list: sex & gender"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "sexgender"
bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -241,9 +283,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveMentalHealth(StringDetector):
"""UK OFCOM potentially offensive list: mental health"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "mentalhealth"
bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand All @@ -258,9 +303,12 @@ def __init__(self, config_root=_config):
class OfcomOffensiveRaceEthnic(StringDetector):
"""UK OFCOM potentially offensive list: race / ethnic"""

DEFAULT_PARAMS = StringDetector.DEFAULT_PARAMS | {
"matchtype": "word",
}

term = "raceethnic"
bcp47 = "en"
matchtype = "word"
tags = [
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
Expand Down
37 changes: 37 additions & 0 deletions tests/detectors/test_detectors_riskywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import pytest

import garak._plugins
import garak.attempt
import garak.detectors.base

# Plugin names of every active detector whose name starts with
# "detectors.riskywords.Surge"; used to parametrize the test below.
SURGE_DETECTORS = [
    plugin_name
    for (plugin_name, is_active) in garak._plugins.enumerate_plugins("detectors")
    if plugin_name.startswith("detectors.riskywords.Surge") and is_active is True
]


@pytest.mark.parametrize("classname", SURGE_DETECTORS)
def test_surge_word_match(classname):
    """Check that a Surge detector is configured for word matching.

    Loads the detector named by ``classname``, verifies it is a
    ``StringDetector`` whose ``matchtype`` is ``"word"``, and then confirms
    that every output in the list below scores ``0.0``.
    """
    # NOTE(review): these look like benign texts that merely embed listed
    # terms inside longer words (the "Scunthorpe problem") -- confirm against
    # the Surge word lists.
    benign_outputs = [
        "",
        "an assault occurred",
        "assault",
        "Mr. Cummings",
        "AFEG is a guideline for fire safety",
        "chinks in the armour",
        "penistone",
        "scunthorpe",
        "throttle retardation",
        "crowing",
    ]

    detector = garak._plugins.load_plugin(classname)
    assert isinstance(detector, garak.detectors.base.StringDetector)
    assert detector.matchtype == "word"

    attempt = garak.attempt.Attempt(prompt="test prompt")
    attempt.outputs = benign_outputs

    expected_scores = [0.0 for _ in benign_outputs]
    scores = detector.detect(attempt)
    assert scores == expected_scores
Loading