diff --git a/garak/probes/transferableattacks.py b/garak/probes/transferableattacks.py deleted file mode 100644 index 9f708d2af..000000000 --- a/garak/probes/transferableattacks.py +++ /dev/null @@ -1,9 +0,0 @@ -"""transferableattacks - -Creates universal, transferable adversarial attacks using the llm-attacks framework. -(code: https://github.com/llm-attacks/llm-attacks/tree/main, paper: https://arxiv.org/abs/2307.15043) -Code is adapted from the llm-attacks repo to garak probes -""" - -from garak.attempt import Attempt -from garak.probes.base import Probe diff --git a/garak/resources/promptinject/llm-attacks.py b/garak/resources/promptinject/llm-attacks.py deleted file mode 100644 index dbeef951a..000000000 --- a/garak/resources/promptinject/llm-attacks.py +++ /dev/null @@ -1,129 +0,0 @@ -# MIT License -# -# Copyright (c) 2023 Andy Zou -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -"""A main script to run attack for LLMs.""" - - -import time -import importlib -import numpy as np -import torch.multiprocessing as mp -from absl import app -from ml_collections import config_flags - -from llm_attacks import get_goals_and_targets, get_workers - -_CONFIG = config_flags.DEFINE_config_file("config") - - -# Function to import module at the runtime -def dynamic_import(module): - return importlib.import_module(module) - - -def main(_): - mp.set_start_method("spawn") - - params = _CONFIG.value - - attack_lib = dynamic_import(f"llm_attacks.{params.attack}") - - print(params) - - train_goals, train_targets, test_goals, test_targets = get_goals_and_targets(params) - - process_fn = lambda s: s.replace("Sure, h", "H") - process_fn2 = lambda s: s.replace("Sure, here is", "Sure, here's") - train_targets = [ - process_fn(t) if np.random.random() < 0.5 else process_fn2(t) - for t in train_targets - ] - test_targets = [ - process_fn(t) if np.random.random() < 0.5 else process_fn2(t) - for t in test_targets - ] - - workers, test_workers = get_workers(params) - - managers = { - "AP": attack_lib.AttackPrompt, - "PM": attack_lib.PromptManager, - "MPA": attack_lib.MultiPromptAttack, - } - - timestamp = time.strftime("%Y%m%d-%H:%M:%S") - if params.transfer: - attack = attack_lib.ProgressiveMultiPromptAttack( - train_goals, - train_targets, - workers, - progressive_models=params.progressive_models, - progressive_goals=params.progressive_goals, - control_init=params.control_init, - logfile=f"{params.result_prefix}_{timestamp}.json", - managers=managers, - test_goals=test_goals, - test_targets=test_targets, - test_workers=test_workers, - mpa_deterministic=params.gbda_deterministic, - mpa_lr=params.lr, - mpa_batch_size=params.batch_size, - mpa_n_steps=params.n_steps, - ) - else: - attack = attack_lib.IndividualPromptAttack( - train_goals, - train_targets, - workers, - control_init=params.control_init, - logfile=f"{params.result_prefix}_{timestamp}.json", - managers=managers, - test_goals=getattr(params, "test_goals", []), - test_targets=getattr(params, "test_targets", []), - test_workers=test_workers, - mpa_deterministic=params.gbda_deterministic, - mpa_lr=params.lr, - mpa_batch_size=params.batch_size, - mpa_n_steps=params.n_steps, - ) - attack.run( - n_steps=params.n_steps, - batch_size=params.batch_size, - topk=params.topk, - temp=params.temp, - target_weight=params.target_weight, - control_weight=params.control_weight, - test_steps=getattr(params, "test_steps", 1), - anneal=params.anneal, - incr_control=params.incr_control, - stop_on_success=params.stop_on_success, - verbose=params.verbose, - filter_cand=params.filter_cand, - allow_non_ascii=params.allow_non_ascii, - ) - - for worker in workers + test_workers: - worker.stop() - - -if __name__ == "__main__": - app.run(main)