

Although CleverHans is likely to work on many other machine configurations, we currently test it with Python 3.6, Jax 0.2, PyTorch 1.7, and Tensorflow 2.4 on Ubuntu 18.04 LTS (Bionic Beaver).

torch可使用的攻击方法:CW、FGSM、PGD 等等



整个库使用 TensorFlow 加速计算图(graph)运算;对应环境:Python 3.5 和 TensorFlow {1.8, 1.12}


使用EagerPy框架,它能够编写与框架无关(framework-agnostic)的代码,这些代码可以与 PyTorch、TensorFlow、Jax 和 NumPy 实现原生地适配。

  1. 适配:PyTorch 1.4.0、TensorFlow 2.1.0、JAX 0.1.57、NumPy 1.18.1

  2. 可使用攻击类型:

  3. 使用foolbox


    • 安装:

      python3 -m pip install foolbox

      foolbox ==3.3.1

    • 将pytorch模型转化为Foolbox模型:

      torch.nn.Module to fb.PyTorchModel. 本例中使用ResNet-18

      preprocessing:模型期望的输入预处理参数(沿 axis 指定的通道轴先减去 mean、再除以 std),这里的数值对应 torchvision 预训练 ResNet-18 所使用的 ImageNet 归一化。


      # PyTorch ResNet18: wrap a pretrained torchvision model as a Foolbox model.
      import foolbox as fb
      import torch
      import torchvision

      model = torchvision.models.resnet18(pretrained=True).eval()
      # Foolbox applies this normalization itself before calling the model:
      # subtract `mean` and divide by `std` along `axis` (-3 is presumably the
      # channel axis of (N, C, H, W) inputs -- confirm against the Foolbox docs).
      preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
      # Value range of the (un-normalized) inputs handed to the Foolbox model.
      bounds = (0, 1)
      fmodel = fb.PyTorchModel(model, bounds=bounds, preprocessing=preprocessing)
    • 转换边界:


      fmodel = fmodel.transform_bounds((0, 1))
    • 数据集





      有images 和 labels 就行了呗

      images, labels = fb.utils.samples(fmodel, dataset='imagenet', batchsize=16)
    • 攻击模型:

    • foolbox/foolbox/attacks/

      # Public attack classes re-exported by the foolbox.attacks package.
      from .base import Attack  # noqa: F401

      # FixedEpsilonAttack subclasses
      from .contrast import L2ContrastReductionAttack  # noqa: F401
      from .virtual_adversarial_attack import VirtualAdversarialAttack  # noqa: F401
      from .ddn import DDNAttack  # noqa: F401
      from .projected_gradient_descent import (  # noqa: F401
          L1ProjectedGradientDescentAttack,
          L2ProjectedGradientDescentAttack,
          LinfProjectedGradientDescentAttack,
      )
      from .basic_iterative_method import (  # noqa: F401
          L1BasicIterativeAttack,
          L2BasicIterativeAttack,
          LinfBasicIterativeAttack,
      )
      from .fast_gradient_method import (  # noqa: F401
          L1FastGradientAttack,
          L2FastGradientAttack,
          LinfFastGradientAttack,
      )
      from .additive_noise import (  # noqa: F401
          L2AdditiveGaussianNoiseAttack,
          L2AdditiveUniformNoiseAttack,
          L2ClippingAwareAdditiveGaussianNoiseAttack,
          L2ClippingAwareAdditiveUniformNoiseAttack,
          LinfAdditiveUniformNoiseAttack,
          L2RepeatedAdditiveGaussianNoiseAttack,
          L2RepeatedAdditiveUniformNoiseAttack,
          L2ClippingAwareRepeatedAdditiveGaussianNoiseAttack,
          L2ClippingAwareRepeatedAdditiveUniformNoiseAttack,
          LinfRepeatedAdditiveUniformNoiseAttack,
      )
      from .sparse_l1_descent_attack import SparseL1DescentAttack  # noqa: F401

      # MinimizationAttack subclasses
      from .inversion import InversionAttack  # noqa: F401
      from .contrast_min import (  # noqa: F401
          BinarySearchContrastReductionAttack,
          LinearSearchContrastReductionAttack,
      )
      from .carlini_wagner import L2CarliniWagnerAttack  # noqa: F401
      from .newtonfool import NewtonFoolAttack  # noqa: F401
      from .ead import EADAttack  # noqa: F401
      from .blur import GaussianBlurAttack  # noqa: F401
      from .spatial_attack import SpatialAttack  # noqa: F401
      from .deepfool import L2DeepFoolAttack, LinfDeepFoolAttack  # noqa: F401
      from .saltandpepper import SaltAndPepperNoiseAttack  # noqa: F401
      from .blended_noise import LinearSearchBlendedUniformNoiseAttack  # noqa: F401
      from .binarization import BinarizationRefinementAttack  # noqa: F401
      from .dataset_attack import DatasetAttack  # noqa: F401
      from .boundary_attack import BoundaryAttack  # noqa: F401
      from .hop_skip_jump import HopSkipJump  # noqa: F401
      from .brendel_bethge import (  # noqa: F401
          L0BrendelBethgeAttack,
          L1BrendelBethgeAttack,
          L2BrendelBethgeAttack,
          LinfinityBrendelBethgeAttack,
      )
      from .fast_minimum_norm import (  # noqa: F401
          L0FMNAttack,
          L1FMNAttack,
          L2FMNAttack,
          LInfFMNAttack,
      )
      from .gen_attack import GenAttack  # noqa: F401

      # from .blended_noise import LinearSearchBlendedUniformNoiseAttack  # noqa: F401
      # from .brendel_bethge import (  # noqa: F401
      #     L0BrendelBethgeAttack,
      #     L1BrendelBethgeAttack,
      #     L2BrendelBethgeAttack,
      #     LinfinityBrendelBethgeAttack,
      # )
      # from .additive_noise import L2AdditiveGaussianNoiseAttack  # noqa: F401
      # from .additive_noise import L2AdditiveUniformNoiseAttack  # noqa: F401
      # from .additive_noise import LinfAdditiveUniformNoiseAttack  # noqa: F401
      # from .additive_noise import L2RepeatedAdditiveGaussianNoiseAttack  # noqa: F401
      # from .additive_noise import L2RepeatedAdditiveUniformNoiseAttack  # noqa: F401
      # from .additive_noise import LinfRepeatedAdditiveUniformNoiseAttack  # noqa: F401
      # from .saltandpepper import SaltAndPepperNoiseAttack  # noqa: F401

      # Short aliases for the most commonly used gradient-based attacks.
      FGM = L2FastGradientAttack
      FGSM = LinfFastGradientAttack
      L1PGD = L1ProjectedGradientDescentAttack
      L2PGD = L2ProjectedGradientDescentAttack
      LinfPGD = LinfProjectedGradientDescentAttack
      PGD = LinfPGD



      fb.utils.accuracy(fmodel, images, labels)


      attack = fb.attacks.LinfDeepFoolAttack()


      raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=0.03)


      • 原始的对抗样本(raw):其内容取决于具体的攻击方法,我们无法对这个输出做出任何保证。

      • 裁剪后的对抗样本(clipped):保证其扰动大小不超过 ε,因此是你实际想要查看(可视化)的对抗样本。请注意,其中一些样本可能并没有真正改变预测类别。要知道哪些样本真正是对抗样本,应该查看第三个张量。

      • 第三个张量(is_adv)为每个样本给出一个布尔值,表明哪些样本是真正的对抗样本:既被错误分类,又位于干净样本周围的 ε 球内。

      • 如何使用这些张量一会就会更清楚了。

    • 多个ε


      import numpy as np
      epsilons = np.linspace(0.0, 0.005, num=20)
      raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=epsilons)


    • 稳健的准确率


      robust_accuracy = 1 - is_adv.float32().mean(axis=-1)


      import matplotlib.pyplot as plt
      plt.plot(epsilons, robust_accuracy.numpy())
    • 例子


        import torchvision.models as models
        import eagerpy as ep
        from foolbox import PyTorchModel, accuracy, samples
        from foolbox.attacks import LinfPGD


        def main() -> None:
            """Run LinfPGD on a pretrained ResNet-18 and print clean and robust accuracy."""
            # instantiate a model (could also be a TensorFlow or JAX model)
            model = models.resnet18(pretrained=True).eval()
            preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
            fmodel = PyTorchModel(model, bounds=(0, 1), preprocessing=preprocessing)

            # get data and test the model
            # wrapping the tensors with ep.astensors is optional, but it allows
            # us to work with EagerPy tensors in the following
            images, labels = ep.astensors(*samples(fmodel, dataset="imagenet", batchsize=16))
            clean_acc = accuracy(fmodel, images, labels)
            print(f"clean accuracy:  {clean_acc * 100:.1f} %")

            # apply the attack
            attack = LinfPGD()
            epsilons = [
                0.0,
                0.0002,
                0.0005,
                0.0008,
                0.001,
                0.0015,
                0.002,
                0.003,
                0.01,
                0.1,
                0.3,
                0.5,
                1.0,
            ]
            raw_advs, clipped_advs, success = attack(fmodel, images, labels, epsilons=epsilons)

            # calculate and report the robust accuracy (the accuracy of the model when
            # it is attacked)
            robust_accuracy = 1 - success.float32().mean(axis=-1)
            print("robust accuracy for perturbations with")
            for eps, acc in zip(epsilons, robust_accuracy):
                print(f"  Linf norm ≤ {eps:<6}: {acc.item() * 100:4.1f} %")

            # we can also manually check this
            # we will use the clipped advs instead of the raw advs, otherwise
            # we would need to check if the perturbation sizes are actually
            # within the specified epsilon bound
            print()
            print("we can also manually check this:")
            print()
            print("robust accuracy for perturbations with")
            for eps, advs_ in zip(epsilons, clipped_advs):
                acc2 = accuracy(fmodel, advs_, labels)
                print(f"  Linf norm ≤ {eps:<6}: {acc2 * 100:4.1f} %")
                print("    perturbation sizes:")
                perturbation_sizes = (advs_ - images).norms.linf(axis=(1, 2, 3)).numpy()
                print("    ", str(perturbation_sizes).replace("\n", "\n" + "    "))
                # once the accuracy has dropped to zero, larger epsilons add nothing
                if acc2 == 0:
                    break


        if __name__ == "__main__":
            main()

        #!/usr/bin/env python3
        # Runs several Linf attacks on a pretrained ResNet-18 and reports the
        # per-attack and worst-case (best attack per sample) robust accuracy.
        import torchvision.models as models
        import eagerpy as ep
        from foolbox import PyTorchModel, accuracy, samples
        import foolbox.attacks as fa
        import numpy as np


        if __name__ == "__main__":
            # instantiate a model (could also be a TensorFlow or JAX model)
            model = models.resnet18(pretrained=True).eval()
            preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
            fmodel = PyTorchModel(model, bounds=(0, 1), preprocessing=preprocessing)

            # get data and test the model
            # wrapping the tensors with ep.astensors is optional, but it allows
            # us to work with EagerPy tensors in the following
            images, labels = ep.astensors(*samples(fmodel, dataset="imagenet", batchsize=16))
            clean_acc = accuracy(fmodel, images, labels)
            print(f"clean accuracy:  {clean_acc * 100:.1f} %")
            print("")

            attacks = [
                fa.FGSM(),
                fa.LinfPGD(),
                fa.LinfBasicIterativeAttack(),
                fa.LinfAdditiveUniformNoiseAttack(),
                fa.LinfDeepFoolAttack(),
            ]

            epsilons = [
                0.0,
                0.0005,
                0.001,
                0.0015,
                0.002,
                0.003,
                0.005,
                0.01,
                0.02,
                0.03,
                0.1,
                0.3,
                0.5,
                1.0,
            ]
            print("epsilons")
            print(epsilons)
            print("")

            # success flags indexed as [attack, epsilon, sample]; the builtin `bool`
            # is used because the `np.bool` alias was removed in NumPy 1.24
            attack_success = np.zeros((len(attacks), len(epsilons), len(images)), dtype=bool)
            for i, attack in enumerate(attacks):
                _, _, success = attack(fmodel, images, labels, epsilons=epsilons)
                assert success.shape == (len(epsilons), len(images))
                success_ = success.numpy()
                assert success_.dtype == bool
                attack_success[i] = success_
                print(attack)
                print("  ", 1.0 - success_.mean(axis=-1).round(2))

            # calculate and report the robust accuracy (the accuracy of the model when
            # it is attacked) using the best attack per sample
            robust_accuracy = 1.0 - attack_success.max(axis=0).mean(axis=-1)
            print("")
            print("-" * 79)
            print("")
            print("worst case (best attack per-sample)")
            print("  ", robust_accuracy.round(2))
            print("")

            print("robust accuracy for perturbations with")
            for eps, acc in zip(epsilons, robust_accuracy):
                print(f"  Linf norm ≤ {eps:<6}: {acc.item() * 100:4.1f} %")

        #!/usr/bin/env python3
        """
        The spatial attack is a very special attack because it tries to find adversarial
        perturbations using a set of translations and rotations rather than in an Lp ball.
        It therefore has a slightly different interface.
        """
        import torchvision.models as models
        import eagerpy as ep
        from foolbox import PyTorchModel, accuracy, samples
        import foolbox.attacks as fa


        def main() -> None:
            """Run SpatialAttack on a pretrained ResNet-18 and print its success rate."""
            # instantiate a model (could also be a TensorFlow or JAX model)
            model = models.resnet18(pretrained=True).eval()
            preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
            fmodel = PyTorchModel(model, bounds=(0, 1), preprocessing=preprocessing)

            # get data and test the model
            # wrapping the tensors with ep.astensors is optional, but it allows
            # us to work with EagerPy tensors in the following
            images, labels = ep.astensors(*samples(fmodel, dataset="imagenet", batchsize=16))
            clean_acc = accuracy(fmodel, images, labels) * 100
            print(f"clean accuracy:  {clean_acc:.1f} %")

            # the attack tries a combination of specified rotations and translations to an image
            # stops early if adversarial shifts and translations for all images are found
            attack = fa.SpatialAttack(
                max_translation=6,  # 6px so x in [x-6, x+6] and y in [y-6, y+6]
                num_translations=6,  # number of translations in x, y.
                max_rotation=20,  # +- rotation in degrees
                num_rotations=5,  # number of rotations
                # max total iterations = num_rotations * num_translations**2
            )

            # report the success rate of the attack (percentage of samples that could
            # be adversarially perturbed) and the robust accuracy (the remaining accuracy
            # of the model when it is attacked)
            _, _, success = attack(fmodel, images, labels)
            suc = success.float32().mean().item() * 100
            print(
                f"attack success:  {suc:.1f} %"
                " (for the specified rotation and translation bounds)"
            )
            print(
                f"robust accuracy: {100 - suc:.1f} %"
                " (for the specified rotation and translation bounds)"
            )


        if __name__ == "__main__":
            main()

        #!/usr/bin/env python3
        # mypy: no-disallow-untyped-defs
        import torchvision.models as models
        import eagerpy as ep
        from foolbox import PyTorchModel, accuracy, samples
        from foolbox.attacks import LinfPGD
        from foolbox.attacks.base import get_criterion


        def main() -> None:
            """Run LinfPGD using gradients taken from a second (substitute) model.

            The loss value comes from the attacked model while the gradient is
            computed on `model2`; clean and robust accuracy are printed.
            """
            # instantiate a model (could also be a TensorFlow or JAX model)
            model = models.resnet18(pretrained=True).eval()
            preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
            fmodel = PyTorchModel(model, bounds=(0, 1), preprocessing=preprocessing)

            # get data and test the model
            # wrapping the tensors with ep.astensors is optional, but it allows
            # us to work with EagerPy tensors in the following
            images, labels = ep.astensors(*samples(fmodel, dataset="imagenet", batchsize=16))
            clean_acc = accuracy(fmodel, images, labels)
            print(f"clean accuracy:  {clean_acc * 100:.1f} %")

            # replace the gradient with the gradient from another model
            # demo: we just reuse the same model; a different model could be used instead
            model2 = fmodel

            # TODO: this is still a bit annoying because we need
            # to overwrite run to get the labels
            class Attack(LinfPGD):
                def value_and_grad(self, loss_fn, x):
                    # loss value from the attacked model, gradient from model2
                    val1 = loss_fn(x)
                    loss_fn2 = self.get_loss_fn(model2, self.labels)
                    _, grad2 = ep.value_and_grad(loss_fn2, x)
                    return val1, grad2

                def run(self, model, inputs, criterion, *, epsilon, **kwargs):
                    # remember the labels so value_and_grad can build model2's loss
                    criterion_ = get_criterion(criterion)
                    self.labels = criterion_.labels
                    return super().run(model, inputs, criterion_, epsilon=epsilon, **kwargs)

            # apply the attack
            attack = Attack()
            epsilons = [
                0.0,
                0.0002,
                0.0005,
                0.0008,
                0.001,
                0.0015,
                0.002,
                0.003,
                0.01,
                0.1,
                0.3,
                0.5,
                1.0,
            ]
            raw_advs, clipped_advs, success = attack(fmodel, images, labels, epsilons=epsilons)

            # calculate and report the robust accuracy (the accuracy of the model when
            # it is attacked)
            robust_accuracy = 1 - success.float32().mean(axis=-1)
            print("robust accuracy for perturbations with")
            for eps, acc in zip(epsilons, robust_accuracy):
                print(f"  Linf norm ≤ {eps:<6}: {acc.item() * 100:4.1f} %")

            # we can also manually check this
            # we will use the clipped advs instead of the raw advs, otherwise
            # we would need to check if the perturbation sizes are actually
            # within the specified epsilon bound
            print()
            print("we can also manually check this:")
            print()
            print("robust accuracy for perturbations with")
            for eps, advs_ in zip(epsilons, clipped_advs):
                acc2 = accuracy(fmodel, advs_, labels)
                print(f"  Linf norm ≤ {eps:<6}: {acc2 * 100:4.1f} %")
                print("    perturbation sizes:")
                perturbation_sizes = (advs_ - images).norms.linf(axis=(1, 2, 3)).numpy()
                print("    ", str(perturbation_sizes).replace("\n", "\n" + "    "))
                # once the accuracy has dropped to zero, larger epsilons add nothing
                if acc2 == 0:
                    break


        if __name__ == "__main__":
            main()


