From dafb06fc372f61522e79958ed0230c603bf23199 Mon Sep 17 00:00:00 2001
From: Leon Derczynski
Date: Wed, 29 Nov 2023 08:26:25 +0100
Subject: [PATCH] paraphrase buffs (#333)

* add pegasus paraphrase (it's heavy though, let's find another)
* add reqs for paraphrasing
* clarify a few command comments
* tidy up buff load failure messages
* rm dev file
* get a cpu-friendly buff running
* state module on buff load
---
 garak/buffs/base.py       |   6 +-
 garak/buffs/paraphrase.py | 124 ++++++++++++++++++++++++++++++++++++++
 garak/command.py          |   7 ++-
 garak/garak.site.yaml     |   3 -
 garak/harnesses/base.py   |  17 ++++--
 pyproject.toml            |   4 +-
 requirements.txt          |   2 +
 7 files changed, 149 insertions(+), 14 deletions(-)
 create mode 100644 garak/buffs/paraphrase.py
 delete mode 100644 garak/garak.site.yaml

diff --git a/garak/buffs/base.py b/garak/buffs/base.py
index c90ac735b..4f7478f34 100644
--- a/garak/buffs/base.py
+++ b/garak/buffs/base.py
@@ -33,14 +33,17 @@ class Buff:
     active = True
 
     def __init__(self) -> None:
+        module = self.__class__.__module__.replace("garak.buffs.", "")
         print(
-            f"🦾 loading {Style.BRIGHT}{Fore.LIGHTGREEN_EX}buff: {Style.RESET_ALL}{self.__class__.__name__}"
+            f"🦾 loading {Style.BRIGHT}{Fore.LIGHTGREEN_EX}buff: {Style.RESET_ALL}{module}.{self.__class__.__name__}"
         )
         logging.info(f"buff init: {self}")
 
     def _derive_new_attempt(
         self, source_attempt: garak.attempt.Attempt, seq=-1
     ) -> garak.attempt.Attempt:
+        if seq == -1:
+            seq = source_attempt.seq
         new_attempt = garak.attempt.Attempt(
             status=source_attempt.status,
             prompt=source_attempt.prompt,
@@ -64,6 +67,7 @@ def _derive_new_attempt(
     def transform(
         self, attempt: garak.attempt.Attempt
     ) -> Iterable[garak.attempt.Attempt]:
+        """attempt copying is handled elsewhere. isn't that nice"""
         yield attempt
 
     def buff(
diff --git a/garak/buffs/paraphrase.py b/garak/buffs/paraphrase.py
new file mode 100644
index 000000000..6e436c17f
--- /dev/null
+++ b/garak/buffs/paraphrase.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+""" Buff that paraphrases a prompt. """
+
+from collections.abc import Iterable
+import copy
+
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+
+import garak.attempt
+from garak.buffs.base import Buff
+
+
+class PegasusT5(Buff):
+    """Paraphrasing buff using Pegasus model"""
+
+    bcp47 = "en"
+    uri = "https://huggingface.co/tuner007/pegasus_paraphrase"
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.para_model_name = "tuner007/pegasus_paraphrase"  # https://huggingface.co/tuner007/pegasus_paraphrase
+        self.torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.max_length = 60
+        self.temperature = 1.5
+        self.num_return_sequences = 6
+        self.num_beams = self.num_return_sequences
+        self.tokenizer = PegasusTokenizer.from_pretrained(self.para_model_name)
+        self.para_model = PegasusForConditionalGeneration.from_pretrained(
+            self.para_model_name
+        ).to(self.torch_device)
+
+    def _get_response(self, input_text):
+        batch = self.tokenizer(
+            [input_text],
+            truncation=True,
+            padding="longest",
+            max_length=self.max_length,
+            return_tensors="pt",
+        ).to(self.torch_device)
+        translated = self.para_model.generate(
+            **batch,
+            max_length=self.max_length,
+            num_beams=self.num_beams,
+            num_return_sequences=self.num_return_sequences,
+            temperature=self.temperature,
+        )
+        tgt_text = self.tokenizer.batch_decode(translated, skip_special_tokens=True)
+        return tgt_text
+
+    def transform(
+        self, attempt: garak.attempt.Attempt
+    ) -> Iterable[garak.attempt.Attempt]:
+        yield self._derive_new_attempt(attempt)
+        paraphrases = self._get_response(attempt.prompt)
+        for paraphrase in set(paraphrases):
+            paraphrased_attempt = self._derive_new_attempt(attempt)
+            paraphrased_attempt.prompt = paraphrase
+            yield paraphrased_attempt
+
+
+class HumarinT5(Buff):
+    """CPU-friendly paraphrase buff based on Humarin's T5 paraphraser"""
+
+    bcp47 = "en"
+    uri = "https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base"
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.para_model_name = "humarin/chatgpt_paraphraser_on_T5_base"
+        self.torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.num_beams = 5
+        self.num_beam_groups = 5
+        self.num_return_sequences = 5
+        self.repetition_penalty = 10.0
+        self.diversity_penalty = 3.0
+        self.no_repeat_ngram_size = 2
+        # self.temperature = 0.7
+        self.max_length = 128
+        self.tokenizer = AutoTokenizer.from_pretrained(self.para_model_name)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.para_model_name).to(
+            self.torch_device
+        )
+
+    def _get_response(self, input_text):
+        input_ids = self.tokenizer(
+            f"paraphrase: {input_text}",
+            return_tensors="pt",
+            padding="longest",
+            max_length=self.max_length,
+            truncation=True,
+        ).input_ids
+
+        outputs = self.model.generate(
+            input_ids,
+            # temperature=self.temperature,
+            repetition_penalty=self.repetition_penalty,
+            num_return_sequences=self.num_return_sequences,
+            no_repeat_ngram_size=self.no_repeat_ngram_size,
+            num_beams=self.num_beams,
+            num_beam_groups=self.num_beam_groups,
+            max_length=self.max_length,
+            diversity_penalty=self.diversity_penalty,
+            # do_sample = False,
+        )
+
+        res = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+        return res
+
+    def transform(
+        self, attempt: garak.attempt.Attempt
+    ) -> Iterable[garak.attempt.Attempt]:
+        yield self._derive_new_attempt(attempt)
+        paraphrases = self._get_response(attempt.prompt)
+        for paraphrase in set(paraphrases):
+            paraphrased_attempt = self._derive_new_attempt(attempt)
+            paraphrased_attempt.prompt = paraphrase
+            yield paraphrased_attempt
diff --git a/garak/command.py b/garak/command.py
index 0e2b26db3..1ef9ac4d8 100644
--- a/garak/command.py
+++ b/garak/command.py
@@ -158,11 +158,12 @@ def plugin_info(plugin_name):
     )
 
 
-# set config vars - debug, threshold
-# load generator
-# probe
+# TODO set config vars - debug, threshold
+# TODO load generator
+# TODO set probe config string
+# do a run
 def probewise_run(generator, probe_names, evaluator, buffs):
     import garak.harnesses.probewise
diff --git a/garak/garak.site.yaml b/garak/garak.site.yaml
deleted file mode 100644
index 070271206..000000000
--- a/garak/garak.site.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-run:
-  eval_threshold: 0.777
diff --git a/garak/harnesses/base.py b/garak/harnesses/base.py
index 053a8a30b..59a594d4d 100644
--- a/garak/harnesses/base.py
+++ b/garak/harnesses/base.py
@@ -43,16 +43,21 @@ def _load_buffs(self, buffs: List) -> None:
         If one wants to use buffs directly with this harness without subclassing,
         then call this method instance directly."""
 
-        _config.buffs = []
+        _config.buffs = []  # maybe put this in transient / session, eh
         for buff in buffs:
+            err_msg = None
             try:
                 _config.buffs.append(_plugins.load_plugin(buff))
-                logging.debug(f"loaded {buff}")
+                logging.debug("loaded %s", buff)
+            except ValueError as ve:
+                err_msg = f"❌🦾 buff load error:❌ {ve}"
             except Exception as e:
-                msg = f"failed to load buff {buff}"
-                print(msg)
-                logging.warning(f"{msg}: {e}")
-                continue
+                err_msg = f"❌🦾 failed to load buff {buff}:❌ {e}"
+            finally:
+                if err_msg is not None:
+                    print(err_msg)
+                    logging.warning(err_msg)
+                    continue
 
     def run(self, model, probes, detectors, evaluator, announce_probe=True) -> None:
         """Core harness method
diff --git a/pyproject.toml b/pyproject.toml
index 3a740dfcb..a07978bbf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,9 @@ dependencies = [
   "langchain>=0.0.300",
   "nemollm>=0.3.0",
   "octoai-sdk",
-  "cmd2"
+  "cmd2",
+  "torch>=2.1.0",
+  "sentencepiece>=0.1.99"
 ]
 
 [project.urls]
diff --git a/requirements.txt b/requirements.txt
index f030d4a7c..3ea694eac 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,5 @@ langchain>=0.0.300
 nemollm>=0.3.0
 octoai-sdk
 cmd2
+torch>=2.1.0
+sentencepiece>=0.1.99
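
Reviewer note: the sketch below is not part of the patch; it is a minimal local smoke test for the new CPU-friendly buff. It assumes the Attempt constructor accepts a prompt keyword (as used by _derive_new_attempt above) and that the humarin/chatgpt_paraphraser_on_T5_base weights can be downloaded from Hugging Face; the prompt string is just an example.

    # rough smoke test for the new paraphrase buff (illustrative only, not in this patch)
    import garak.attempt
    from garak.buffs.paraphrase import HumarinT5

    buff = HumarinT5()  # fetches the Humarin T5 paraphraser on first run
    attempt = garak.attempt.Attempt(prompt="Write a short poem about model safety.")
    for new_attempt in buff.transform(attempt):
        # the first yield is a copy of the source attempt, then up to 5 paraphrases
        print(new_attempt.prompt)

The same loop should work with PegasusT5, but per the commit message that model is noticeably heavier, which is why the Humarin buff was added.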