bat.attacks.square_attack

import os
import gc
import numpy as np
from tqdm import tqdm
import concurrent.futures

SCALE = 255               # pixel scale: images are assumed to lie in [0, 255]
PREPROCESS = lambda x: x  # hook for model-specific preprocessing (identity by default)

class SquareAttack():

    def __init__(self, classifier):
        """
        Create a `SquareAttack` instance.
        - classifier: model to attack
        """
        self.min_val, self.max_val = 0.0, 1.0 * SCALE
        self.classifier = classifier

    def p_selection(self, p_init, it, n_iters):
        """ Piece-wise constant schedule for p (the fraction of pixels changed on every iteration). """
        it = int(it / n_iters * 10000)

        if 10 < it <= 50:
            p = p_init / 2
        elif 50 < it <= 200:
            p = p_init / 4
        elif 200 < it <= 500:
            p = p_init / 8
        elif 500 < it <= 1000:
            p = p_init / 16
        elif 1000 < it <= 2000:
            p = p_init / 32
        elif 2000 < it <= 4000:
            p = p_init / 64
        elif 4000 < it <= 6000:
            p = p_init / 128
        elif 6000 < it <= 8000:
            p = p_init / 256
        elif 8000 < it <= 10000:
            p = p_init / 512
        else:
            p = p_init

        return p

    def model_loss(self, y, logits, targeted=False, loss_type='margin_loss'):
        """ Implements the margin loss (difference between the correct and 2nd best class). """

        def softmax(x):
            e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
            return e_x / e_x.sum(axis=1, keepdims=True)

        y = np.array(y, dtype=bool)  # one-hot labels, used below as a boolean mask
        if loss_type == 'margin_loss':
            preds_correct_class = (logits * y).sum(1, keepdims=True)
            diff = preds_correct_class - logits  # difference between the correct class and all other classes
            diff[y] = np.inf  # to exclude zeros coming from f_correct - f_correct
            margin = diff.min(1, keepdims=True)
            loss = margin * -1 if targeted else margin
        elif loss_type == 'cross_entropy':
            probs = softmax(logits)
            loss = -np.log(probs[y])
            loss = loss * -1 if not targeted else loss
        else:
            raise ValueError('Wrong loss.')

        return loss.flatten()

    def init(self, x, y, epsilon, targeted, loss_type):
        """ Initialize the attack with random vertical stripes of magnitude epsilon. """
        x_adv = []
        for xi in x:
            h, w, c = xi.shape
            # [1, w, c], i.e. vertical stripes work best for untargeted attacks
            init_delta = np.random.choice([-epsilon, epsilon], size=[1, w, c])
            x_adv.append(np.clip(xi + init_delta, self.min_val, self.max_val))

        logits = self.classifier.predict(PREPROCESS(x_adv))

        n_queries = np.ones(len(x))  # ones because we have already used 1 query

        assert len(logits) == len(y)

        loss_min = self.model_loss(y, logits, targeted, loss_type=loss_type)
        margin_min = self.model_loss(y, logits, targeted, loss_type='margin_loss')

        return x_adv, margin_min, loss_min, n_queries

    def step(self, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type):
        """ Horizontal: one step of the attack, applied to every image that is not yet fooled. """

        idx_to_fool = margin_min > 0
        idx_to_fool = [i for i, v in enumerate(idx_to_fool) if v]

        if len(idx_to_fool) == 0:
            return x_adv, margin_min, loss_min, n_queries

        x_curr = []
        x_adv_curr = []
        y_curr = []
        for idx in idx_to_fool:
            x_curr.append(x[idx])
            x_adv_curr.append(x_adv[idx])
            y_curr.append(y[idx])

        loss_min_curr, margin_min_curr = loss_min[idx_to_fool], margin_min[idx_to_fool]
        deltas = [(xa - xc) for xa, xc in zip(x_adv_curr, x_curr)]

        p = self.p_selection(p_init, i_iter, n_iters)

        for i_img in range(len(x_adv_curr)):
            h, w, c = x_curr[i_img].shape
            n_features = c * h * w

            s = int(round(np.sqrt(p * n_features / c)))
            s = min(max(s, 1), h - 1)  # at least c x 1 x 1 window is taken and at most c x h-1 x h-1
            center_h = np.random.randint(0, h - s)
            center_w = np.random.randint(0, w - s)

            x_curr_window = x_curr[i_img][center_h:center_h + s, center_w:center_w + s, :]
            x_best_curr_window = x_adv_curr[i_img][center_h:center_h + s, center_w:center_w + s, :]
            # prevent trying out a delta if it doesn't change x_curr (e.g. an overlapping patch)
            while np.sum(np.abs(np.clip(x_curr_window + deltas[i_img][center_h:center_h + s, center_w:center_w + s, :], self.min_val, self.max_val) - x_best_curr_window) < 10 ** -7) == c * s * s:
                deltas[i_img][center_h:center_h + s, center_w:center_w + s, :] = np.random.choice([-eps, eps], size=[1, 1, c])

        x_new = [np.clip(xc + d, self.min_val, self.max_val) for xc, d in zip(x_curr, deltas)]

        logits = self.classifier.predict(PREPROCESS(x_new))

        assert len(logits) == len(y_curr)

        loss = self.model_loss(y_curr, logits, targeted, loss_type=loss_type)
        margin = self.model_loss(y_curr, logits, targeted, loss_type='margin_loss')

        idx_improved = loss < loss_min_curr
        loss_min[idx_to_fool] = idx_improved * loss + ~idx_improved * loss_min_curr
        margin_min[idx_to_fool] = idx_improved * margin + ~idx_improved * margin_min_curr

        idx_improved = np.reshape(idx_improved, [-1, *[1] * 3])

        for i in range(len(idx_improved)):
            x_adv[idx_to_fool[i]] = idx_improved[i] * x_new[i] + ~idx_improved[i] * x_adv_curr[i]

        n_queries[idx_to_fool] += 1

        return x_adv, margin_min, loss_min, n_queries

    def batch(self, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type, concurrency):
        """ Vertical: multiple concurrent steps of the attack, merged into a single update. """

        idx_to_fool = margin_min > 0
        idx_to_fool = [i for i, v in enumerate(idx_to_fool) if v]

        if len(idx_to_fool) == 0:
            return x_adv, margin_min, loss_min, n_queries

        x_curr = [x[idx] for idx in idx_to_fool]
        x_adv_curr = [x_adv[idx] for idx in idx_to_fool]
        y_curr = [y[idx] for idx in idx_to_fool]

        loss_min_curr, margin_min_curr = loss_min[idx_to_fool], margin_min[idx_to_fool]

        deltas = [(xa - xc) for xa, xc in zip(x_adv_curr, x_curr)]

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_task = {executor.submit(self.step, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type): j for j in range(0, concurrency)}
            for future in concurrent.futures.as_completed(future_to_task):
                j = future_to_task[future]
                try:
                    x_adv, _, loss_min_temp, _ = future.result()
                    idx_improved = loss_min_temp[idx_to_fool] < loss_min_curr
                    idx_improved = [i for i, v in enumerate(idx_improved) if v]

                    # accumulate the deltas of every concurrent step that improved the loss
                    for idx in idx_improved:
                        deltas[idx] = deltas[idx] + x_adv[idx_to_fool[idx]] - x_adv_curr[idx]

                except Exception as e:
                    print('Task %r generated an exception: %s' % (j, e))

        x_new = [np.clip(xc + d, self.min_val, self.max_val) for xc, d in zip(x_curr, deltas)]

        logits = self.classifier.predict(PREPROCESS(x_new))

        assert len(logits) == len(y_curr)

        loss = self.model_loss(y_curr, logits, targeted, loss_type=loss_type)
        margin = self.model_loss(y_curr, logits, targeted, loss_type='margin_loss')

        idx_improved = loss < loss_min_curr
        loss_min[idx_to_fool] = idx_improved * loss + ~idx_improved * loss_min_curr
        margin_min[idx_to_fool] = idx_improved * margin + ~idx_improved * margin_min_curr

        idx_improved = np.reshape(idx_improved, [-1, *[1] * 3])

        for i in range(len(idx_improved)):
            x_adv[idx_to_fool[i]] = idx_improved[i] * x_new[i] + ~idx_improved[i] * x_adv_curr[i]

        n_queries[idx_to_fool] += (concurrency + 1)

        return x_adv, margin_min, loss_min, n_queries

    def attack(self, x, y, targeted, epsilon=0.05, max_it=1000, p_init=0.05, loss_type='margin_loss', concurrency=1):
        """ The Linf square attack """
        n_targets = 0
        if type(x) == list:
            n_targets = len(x)
        elif type(x) == np.ndarray:
            n_targets = x.shape[0]
        else:
            raise ValueError('Input type not supported...')

        assert n_targets > 0
        assert len(x) == len(y)

        y_label = np.argmax(y, axis=1)

        logits_clean = self.classifier.predict(PREPROCESS(x))

        corr_classified = [(logits_clean[i].argmax() == y_label[i]) for i in range(len(x))]

        # important to check that the model was restored correctly and the clean accuracy is high
        print('Clean accuracy: {:.2%}'.format(np.mean(corr_classified)))

        if np.mean(corr_classified) == 0:
            print('No clean examples classified correctly. Aborting...')
            n_queries = np.ones(len(x))  # ones because we have already used 1 query
            return x, n_queries

        if n_targets > 1:
            # Horizontally Distributed Attack: one step over many images per iteration
            pbar = tqdm(range(0, max_it - 1), desc="Distributed Square Attack (Horizontal)")
        else:
            # Vertically Distributed Attack: `concurrency` concurrent steps on one image per iteration
            pbar = tqdm(range(0, max_it - 1, concurrency), desc="Distributed Square Attack (Vertical)")

        np.random.seed(0)  # important to leave it here as well

        # Only attack the correctly classified examples
        y = y[corr_classified]
        if type(x) == list:
            idx_corr_classified = [i for i, v in enumerate(corr_classified) if v]
            x = [xi for i, xi in enumerate(x) if i in idx_corr_classified]
        elif type(x) == np.ndarray:
            x = x[corr_classified]

        x_adv, margin_min, loss_min, n_queries = self.init(x, y, epsilon * SCALE, targeted, loss_type)

        acc, acc_curr, mean_nq, mean_nq_ae, avg_margin_min = self.evaluate(margin_min, n_queries, 0, np.sum(corr_classified))

        if acc == 0:
            print('\nSuccessfully found adversarial examples for all examples after initialization')
            return x_adv, n_queries

        # Main loop
        for i_iter in pbar:

            if n_targets > 1:
                # Horizontally Distributed Attack
                x_adv, margin_min, loss_min, n_queries = self.step(x, y, x_adv, margin_min, loss_min, n_queries, i_iter, max_it, p_init, epsilon * SCALE, targeted, loss_type)
            else:
                # Vertically Distributed Attack
                x_adv, margin_min, loss_min, n_queries = self.batch(x, y, x_adv, margin_min, loss_min, n_queries, i_iter, max_it, p_init, epsilon * SCALE, targeted, loss_type, concurrency=concurrency)

            acc, acc_curr, mean_nq, mean_nq_ae, avg_margin_min = self.evaluate(margin_min, n_queries, i_iter, np.sum(corr_classified))

            pbar.set_postfix({'Total Queries': n_queries.sum(), 'Average Margin': avg_margin_min, 'Attack Success Rate': 1 - acc, 'Avg Queries': mean_nq_ae})

            if acc == 0:
                print('\nSuccessfully found adversarial examples for all examples')
                break

            gc.collect()

        return x_adv, n_queries

    def evaluate(self, margin_min, n_queries, i_iter, n_ex_total):
        """ Compute running statistics: robust accuracy, mean query counts and mean margin. """
        if len(margin_min) > 0 and len(n_queries) > 0:
            acc = (margin_min > 0.0).sum() / n_ex_total
            acc_curr = (margin_min > 0.0).mean()
            mean_nq, mean_nq_ae = np.mean(n_queries), -1 if (margin_min <= 0).sum() == 0 else np.mean(n_queries[margin_min <= 0])
            avg_margin_min = np.mean(margin_min)

            return acc, acc_curr, mean_nq, mean_nq_ae, avg_margin_min

        else:
            return -1, -1, -1, -1, -1

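Putting the pieces together, a minimal usage sketch (not part of the module): RandomLogitClassifier is a purely illustrative stand-in for any wrapper exposing predict(batch) -> logits, and all shapes and numbers are assumptions.

import numpy as np
from bat.attacks.square_attack import SquareAttack

class RandomLogitClassifier:
    """Illustrative stand-in: maps a batch of HxWxC images to logits."""
    def __init__(self, n_classes=10, seed=0):
        rng = np.random.RandomState(seed)
        self.w = rng.randn(32 * 32 * 3, n_classes) * 1e-3

    def predict(self, x):
        # the attack passes either an ndarray or a Python list of HWC images
        batch = np.stack([np.asarray(xi, dtype=np.float64) for xi in x])
        return batch.reshape(len(batch), -1) @ self.w

x = np.random.randint(0, 256, size=(4, 32, 32, 3)).astype(np.float64)  # images on the [0, 255] scale
y = np.eye(10)[np.random.randint(0, 10, size=4)]                       # one-hot labels

attack = SquareAttack(RandomLogitClassifier())
# With a random classifier the clean accuracy may be 0, in which case attack() aborts early.
x_adv, n_queries = attack.attack(x, y, targeted=False, epsilon=0.05, max_it=100)
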
SquareAttack(classifier)

Create a `SquareAttack` instance.

  • classifier: model to attack
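
The only requirement on classifier is a predict method that maps a (possibly preprocessed) batch of images to a 2-D array of logits. A hypothetical adapter might look like the sketch below; ModelWrapper and my_model are illustrative names, not part of this module.

import numpy as np

class ModelWrapper:
    """Hypothetical adapter: SquareAttack only needs predict(batch) -> logits."""
    def __init__(self, model):
        self.model = model  # e.g. a framework model that returns logits when called

    def predict(self, x):
        # SquareAttack may pass a Python list of HWC images, so stack it into one array
        batch = np.stack([np.asarray(xi, dtype=np.float32) for xi in x])
        return np.asarray(self.model(batch))  # assumes calling the model yields logits

# attack = SquareAttack(ModelWrapper(my_model))
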
def p_selection(self, p_init, it, n_iters):

Piece-wise constant schedule for p (the fraction of pixels changed on every iteration).
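
For intuition, a short illustrative sketch of the schedule (p_init = 0.05, n_iters = 10000; the printed values follow directly from the branches above):

from bat.attacks.square_attack import SquareAttack

attack = SquareAttack(classifier=None)  # p_selection never touches the classifier
for it in [0, 20, 100, 1000, 5000, 9000]:
    print(it, attack.p_selection(p_init=0.05, it=it, n_iters=10000))
# 0 -> 0.05, 20 -> 0.025, 100 -> 0.0125, 1000 -> 0.003125, 5000 -> 0.000390625, 9000 -> 9.765625e-05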

def model_loss(self, y, logits, targeted=False, loss_type='margin_loss'):

Implements the margin loss (difference between the correct and 2nd best class).
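
A toy illustration with made-up logits and one-hot labels: the margin stays positive while the correct class still has the highest logit and turns negative once the example is misclassified, which is the success criterion (margin_min <= 0) used throughout the attack.

import numpy as np
from bat.attacks.square_attack import SquareAttack

logits = np.array([[2.0, 1.0, 0.0],    # example 1: class 0 still wins   -> margin +1.0
                   [0.5, 3.0, 1.0]])   # example 2: class 0 already lost -> margin -2.5
y = np.eye(3)[[0, 0]]                  # one-hot labels, both examples labelled class 0

attack = SquareAttack(classifier=None)  # model_loss never touches the classifier
print(attack.model_loss(y, logits, targeted=False, loss_type='margin_loss'))  # [ 1.  -2.5]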

def init(self, x, y, epsilon, targeted, loss_type):

Initialize the attack: perturb each input with random vertical stripes of magnitude epsilon, query the classifier once, and return the initial adversarial examples together with their loss, margin and query counts.

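The initial perturbation is a vertical-stripe pattern: one random +/- epsilon value per column and channel, broadcast over the full height of the image. A standalone sketch of the same idea (shapes are illustrative):

import numpy as np

h, w, c = 32, 32, 3
eps = 0.05 * 255                                        # epsilon on the [0, 255] scale, as used by init()
xi = np.random.randint(0, 256, size=(h, w, c)).astype(np.float64)

stripe = np.random.choice([-eps, eps], size=[1, w, c])  # one +/- eps value per column and channel
x_init = np.clip(xi + stripe, 0.0, 255.0)               # broadcasts along the height axis -> vertical stripes
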
def step(self, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type):

Horizontal: one step of the attack, applied to every image that is not yet fooled.
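
The core of each step is sampling a square window whose side s is chosen so that roughly a fraction p of the pixels changes, then overwriting that window's delta with a single random +/- eps colour. A standalone sketch of the geometry (shapes and values are illustrative):

import numpy as np

h, w, c = 32, 32, 3
p, eps = 0.05, 0.05 * 255

n_features = c * h * w
s = int(round(np.sqrt(p * n_features / c)))   # side length of the square window
s = min(max(s, 1), h - 1)

center_h = np.random.randint(0, h - s)        # top-left corner of the window
center_w = np.random.randint(0, w - s)

delta = np.zeros((h, w, c))
delta[center_h:center_h + s, center_w:center_w + s, :] = np.random.choice([-eps, eps], size=[1, 1, c])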

def batch(self, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type, concurrency):

Vertical: multiple steps of the attack run concurrently on the same batch; the deltas of the improving steps are merged into a single update.

def attack(self, x, y, targeted, epsilon=0.05, max_it=1000, p_init=0.05, loss_type='margin_loss', concurrency=1):

The Linf square attack. With several inputs it runs horizontally (one square step per not-yet-fooled image and iteration); with a single input it runs vertically (`concurrency` concurrent steps per iteration, merged into one update).
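
Sketch of the two ways the method is typically called; attack is a SquareAttack instance and x_batch, x_single, y_onehot are assumed, illustrative variable names.

import numpy as np

# Horizontal (several images): one square step per not-yet-fooled image and iteration.
x_adv, n_queries = attack.attack(x_batch, y_onehot, targeted=False, epsilon=0.05, max_it=1000)

# Vertical (a single image): `concurrency` candidate steps are evaluated per iteration.
x_adv, n_queries = attack.attack(x_single[np.newaxis], y_onehot[:1], targeted=False,
                                 epsilon=0.05, max_it=1000, concurrency=4)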

def evaluate(self, margin_min, n_queries, i_iter, n_ex_total):

Compute running statistics of the attack: robust accuracy (the fraction of examples not yet fooled), the mean number of queries over all examples and over the successful ones, and the mean margin.