diff --git a/garak/buffs/base.py b/garak/buffs/base.py
index c90ac735b..4f7478f34 100644
--- a/garak/buffs/base.py
+++ b/garak/buffs/base.py
@@ -33,14 +33,17 @@ class Buff:
     active = True
 
     def __init__(self) -> None:
+        module = self.__class__.__module__.replace("garak.buffs.", "")
         print(
-            f"🦾 loading {Style.BRIGHT}{Fore.LIGHTGREEN_EX}buff: {Style.RESET_ALL}{self.__class__.__name__}"
+            f"🦾 loading {Style.BRIGHT}{Fore.LIGHTGREEN_EX}buff: {Style.RESET_ALL}{module}.{self.__class__.__name__}"
         )
         logging.info(f"buff init: {self}")
 
     def _derive_new_attempt(
         self, source_attempt: garak.attempt.Attempt, seq=-1
     ) -> garak.attempt.Attempt:
+        if seq == -1:
+            seq = source_attempt.seq
         new_attempt = garak.attempt.Attempt(
             status=source_attempt.status,
             prompt=source_attempt.prompt,
@@ -64,6 +67,7 @@ def _derive_new_attempt(
     def transform(
         self, attempt: garak.attempt.Attempt
     ) -> Iterable[garak.attempt.Attempt]:
+        """attempt copying is handled elsewhere. isn't that nice"""
         yield attempt
 
     def buff(
diff --git a/garak/buffs/paraphrase.py b/garak/buffs/paraphrase.py
new file mode 100644
index 000000000..6e436c17f
--- /dev/null
+++ b/garak/buffs/paraphrase.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+""" Buff that paraphrases a prompt. """
+
+from collections.abc import Iterable
+import copy
+
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+
+import garak.attempt
+from garak.buffs.base import Buff
+
+
+class PegasusT5(Buff):
+    """Paraphrasing buff using Pegasus model"""
+
+    bcp47 = "en"
+    uri = "https://huggingface.co/tuner007/pegasus_paraphrase"
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.para_model_name = "tuner007/pegasus_paraphrase"  # https://huggingface.co/tuner007/pegasus_paraphrase
+        self.torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.max_length = 60
+        self.temperature = 1.5
+        self.num_return_sequences = 6
+        self.num_beams = self.num_return_sequences
+        self.tokenizer = PegasusTokenizer.from_pretrained(self.para_model_name)
+        self.para_model = PegasusForConditionalGeneration.from_pretrained(
+            self.para_model_name
+        ).to(self.torch_device)
+
+    def _get_response(self, input_text):
+        """Generate paraphrases of input_text with the Pegasus model"""
+        batch = self.tokenizer(
+            [input_text],
+            truncation=True,
+            padding="longest",
+            max_length=self.max_length,
+            return_tensors="pt",
+        ).to(self.torch_device)
+        translated = self.para_model.generate(
+            **batch,
+            max_length=self.max_length,
+            num_beams=self.num_beams,
+            num_return_sequences=self.num_return_sequences,
+            temperature=self.temperature,
+        )
+        tgt_text = self.tokenizer.batch_decode(translated, skip_special_tokens=True)
+        return tgt_text
+
+    def transform(
+        self, attempt: garak.attempt.Attempt
+    ) -> Iterable[garak.attempt.Attempt]:
+        """Yield a derived attempt, then one attempt per unique paraphrase"""
+        yield self._derive_new_attempt(attempt)
+        paraphrases = self._get_response(attempt.prompt)
+        for paraphrase in set(paraphrases):
+            paraphrased_attempt = self._derive_new_attempt(attempt)
+            paraphrased_attempt.prompt = paraphrase
+            yield paraphrased_attempt
+
+
+class HumarinT5(Buff):
+    """CPU-friendly paraphrase buff based on Humarin's T5 paraphraser"""
+
+    bcp47 = "en"
+    uri = "https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base"
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.para_model_name = "humarin/chatgpt_paraphraser_on_T5_base"
+        self.torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.num_beams = 5
+        self.num_beam_groups = 5
+        self.num_return_sequences = 5
+        self.repetition_penalty = 10.0
+        self.diversity_penalty = 3.0
+        self.no_repeat_ngram_size = 2
+        # self.temperature = 0.7
+        self.max_length = 128
+        self.tokenizer = AutoTokenizer.from_pretrained(self.para_model_name)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.para_model_name).to(
+            self.torch_device
+        )
+
+    def _get_response(self, input_text):
+        """Generate paraphrases of input_text with the Humarin T5 model"""
+        # generate() needs its inputs on the same device as the model
+        input_ids = self.tokenizer(
+            f"paraphrase: {input_text}",
+            return_tensors="pt",
+            padding="longest",
+            max_length=self.max_length,
+            truncation=True,
+        ).input_ids.to(self.torch_device)
+
+        outputs = self.model.generate(
+            input_ids,
+            # temperature=self.temperature,
+            repetition_penalty=self.repetition_penalty,
+            num_return_sequences=self.num_return_sequences,
+            no_repeat_ngram_size=self.no_repeat_ngram_size,
+            num_beams=self.num_beams,
+            num_beam_groups=self.num_beam_groups,
+            max_length=self.max_length,
+            diversity_penalty=self.diversity_penalty,
+            # do_sample = False,
+        )
+
+        res = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+        return res
+
+    def transform(
+        self, attempt: garak.attempt.Attempt
+    ) -> Iterable[garak.attempt.Attempt]:
+        """Yield a derived attempt, then one attempt per unique paraphrase"""
+        yield self._derive_new_attempt(attempt)
+        paraphrases = self._get_response(attempt.prompt)
+        for paraphrase in set(paraphrases):
+            paraphrased_attempt = self._derive_new_attempt(attempt)
+            paraphrased_attempt.prompt = paraphrase
+            yield paraphrased_attempt
diff --git a/garak/command.py b/garak/command.py
index 0e2b26db3..1ef9ac4d8 100644
--- a/garak/command.py
+++ b/garak/command.py
@@ -158,11 +158,12 @@ def plugin_info(plugin_name):
         )
 
 
-# set config vars - debug, threshold
-# load generator
-# probe
+# TODO set config vars - debug, threshold
+# TODO load generator
+# TODO set probe config string
 
 
+# do a run
 def probewise_run(generator, probe_names, evaluator, buffs):
     import garak.harnesses.probewise
 
diff --git a/garak/garak.site.yaml b/garak/garak.site.yaml
deleted file mode 100644
index 070271206..000000000
--- a/garak/garak.site.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-run:
-  eval_threshold: 0.777
diff --git a/garak/harnesses/base.py b/garak/harnesses/base.py
index 053a8a30b..59a594d4d 100644
--- a/garak/harnesses/base.py
+++ b/garak/harnesses/base.py
@@ -43,16 +43,19 @@ def _load_buffs(self, buffs: List) -> None:
         If one wants to use buffs directly with this harness without subclassing,
         then call this method instance directly."""
 
-        _config.buffs = []
+        _config.buffs = []  # maybe put this in transient / session, eh
         for buff in buffs:
+            err_msg = None
             try:
                 _config.buffs.append(_plugins.load_plugin(buff))
-                logging.debug(f"loaded {buff}")
+                logging.debug("loaded %s", buff)
+            except ValueError as ve:
+                err_msg = f"❌🦾 buff load error:❌ {ve}"
             except Exception as e:
-                msg = f"failed to load buff {buff}"
-                print(msg)
-                logging.warning(f"{msg}: {e}")
-                continue
+                err_msg = f"❌🦾 failed to load buff {buff}:❌ {e}"
+            if err_msg is not None:
+                print(err_msg)
+                logging.warning(err_msg)
 
     def run(self, model, probes, detectors, evaluator, announce_probe=True) -> None:
         """Core harness method
diff --git a/pyproject.toml b/pyproject.toml
index 3a740dfcb..a07978bbf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,9 @@ dependencies = [
   "langchain>=0.0.300",
   "nemollm>=0.3.0",
   "octoai-sdk",
-  "cmd2"
+  "cmd2",
+  "torch>=2.1.0",
+  "sentencepiece>=0.1.99"
 ]
 
 [project.urls]
diff --git a/requirements.txt b/requirements.txt
index f030d4a7c..3ea694eac 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,5 @@ langchain>=0.0.300
 nemollm>=0.3.0
 octoai-sdk
 cmd2
+torch>=2.1.0
+sentencepiece>=0.1.99