bat.attacks.square_attack
import os
import gc
import numpy as np
from tqdm import tqdm
import concurrent.futures
SCALE = 255
def PREPROCESS(x)

    PREPROCESS = lambda x: x
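SCALE fixes the pixel range the attack works in (0-255), and PREPROCESS is applied to every batch just before it is sent to the classifier. A minimal sketch of swapping the hook for a model that expects inputs normalized to [0, 1]; the normalization choice here is an assumption, not part of the library:

import numpy as np
import bat.attacks.square_attack as sq

# Assumed: the wrapped model wants float batches in [0, 1] and all images in a
# batch share one shape. The attack itself keeps perturbing on the 0-255 scale;
# only the input handed to the classifier is rescaled.
sq.PREPROCESS = lambda x: np.asarray(x, dtype=np.float32) / sq.SCALE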
class SquareAttack:
SquareAttack(classifier)
    def __init__(self, classifier):
        """
        Create a `SquareAttack` instance.
        - classifier: model to attack
        """
        self.min_val, self.max_val = 0.0, 1.0 * SCALE
        self.classifier = classifier
Create a `SquareAttack` instance.

- classifier: model to attack
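A minimal construction sketch. DummyClassifier is a hypothetical stand-in; any object exposing predict(batch) -> logits for HWC images on the 0-255 scale should work:

import numpy as np
from bat.attacks.square_attack import SquareAttack

class DummyClassifier:
    """Hypothetical stand-in: returns random logits for a batch of images."""
    def __init__(self, n_classes=10):
        self.n_classes = n_classes

    def predict(self, x):
        return np.random.randn(len(x), self.n_classes)

attack = SquareAttack(DummyClassifier())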
def p_selection(self, p_init, it, n_iters)
    def p_selection(self, p_init, it, n_iters):
        """ Piece-wise constant schedule for p (the fraction of pixels changed on every iteration). """
        it = int(it / n_iters * 10000)

        if 10 < it <= 50:
            p = p_init / 2
        elif 50 < it <= 200:
            p = p_init / 4
        elif 200 < it <= 500:
            p = p_init / 8
        elif 500 < it <= 1000:
            p = p_init / 16
        elif 1000 < it <= 2000:
            p = p_init / 32
        elif 2000 < it <= 4000:
            p = p_init / 64
        elif 4000 < it <= 6000:
            p = p_init / 128
        elif 6000 < it <= 8000:
            p = p_init / 256
        elif 8000 < it <= 10000:
            p = p_init / 512
        else:
            p = p_init

        return p
Piece-wise constant schedule for p (the fraction of pixels changed on every iteration).
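For intuition, a small sketch of how the schedule decays p over a run; the numbers are illustrative and assume the default p_init = 0.05:

from bat.attacks.square_attack import SquareAttack

# p_selection uses no classifier state, so a placeholder instance is enough here.
sched = SquareAttack(classifier=None)
for it in [0, 20, 100, 300, 999]:
    # the iteration index is rescaled internally to a 10,000-iteration budget
    print(it, sched.p_selection(p_init=0.05, it=it, n_iters=1000))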
def model_loss(self, y, logits, targeted=False, loss_type='margin_loss')
    def model_loss(self, y, logits, targeted=False, loss_type='margin_loss'):
        """ Implements the margin loss (difference between the correct and 2nd best class). """

        def softmax(x):
            e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
            return e_x / e_x.sum(axis=1, keepdims=True)

        y = np.array(y, dtype=bool)  # one-hot labels as a boolean mask

        if loss_type == 'margin_loss':
            preds_correct_class = (logits * y).sum(1, keepdims=True)
            diff = preds_correct_class - logits  # difference between the correct class and all other classes
            diff[y] = np.inf  # to exclude zeros coming from f_correct - f_correct
            margin = diff.min(1, keepdims=True)
            loss = margin * -1 if targeted else margin
        elif loss_type == 'cross_entropy':
            probs = softmax(logits)
            loss = -np.log(probs[y])
            loss = loss * -1 if not targeted else loss
        else:
            raise ValueError('Wrong loss.')

        return loss.flatten()
Implements the margin loss (difference between the correct and 2nd best class).
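A toy sketch of the margin loss on hand-written logits (hypothetical numbers, untargeted case, one-hot labels used as a boolean mask as in the listing above): the margin stays positive while the true class still wins and turns negative once another class overtakes it.

import numpy as np
from bat.attacks.square_attack import SquareAttack

attack = SquareAttack(classifier=None)        # model_loss needs no classifier state
logits = np.array([[2.0, 0.5, -1.0],          # true class ahead by 1.5  -> margin  1.5
                   [0.2, 0.9, -0.3]])         # class 1 ahead by 0.7     -> margin -0.7
y = np.array([[1, 0, 0],
              [1, 0, 0]])
print(attack.model_loss(y, logits, targeted=False, loss_type='margin_loss'))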
def init(self, x, y, epsilon, targeted, loss_type)
    def init(self, x, y, epsilon, targeted, loss_type):
        # Initialize the attack
        x_adv = []
        for i, xi in enumerate(x):
            h, w, c = xi.shape
            # [1, w, c], i.e. vertical stripes work best for untargeted attacks
            init_delta = np.random.choice([-epsilon, epsilon], size=[len(x), 1, w, c])
            x_adv.append(np.clip(xi + init_delta[i], self.min_val, self.max_val))

        logits = self.classifier.predict(PREPROCESS(x_adv))

        n_queries = np.ones(len(x))  # ones because we have already used 1 query

        assert len(logits) == len(y)

        loss_min = self.model_loss(y, logits, targeted, loss_type=loss_type)
        margin_min = self.model_loss(y, logits, targeted, loss_type='margin_loss')

        return x_adv, margin_min, loss_min, n_queries
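The initialization draws one ±epsilon value per (column, channel) pair and broadcasts it down the rows, which is what the "vertical stripes" comment refers to. A standalone sketch of that pattern on a hypothetical 4x4 RGB image with epsilon = 8 on the 0-255 scale:

import numpy as np

h, w, c, eps = 4, 4, 3, 8.0
xi = np.zeros((h, w, c))                                # placeholder image
stripe = np.random.choice([-eps, eps], size=(1, w, c))  # one value per column and channel
x_init = np.clip(xi + stripe, 0.0, 255.0)               # broadcasts over rows -> vertical stripes
print(x_init[:, :, 0])                                  # every column is constant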
def step(self, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type)
    def step(self, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type):
        """ Horizontal: one step of the attack (one random square per not-yet-fooled image). """

        idx_to_fool = margin_min > 0
        idx_to_fool = [i for i, v in enumerate(idx_to_fool) if v]

        if len(idx_to_fool) == 0:
            return x_adv, margin_min, loss_min, n_queries

        x_curr = [x[idx] for idx in idx_to_fool]
        x_adv_curr = [x_adv[idx] for idx in idx_to_fool]
        y_curr = [y[idx] for idx in idx_to_fool]

        loss_min_curr, margin_min_curr = loss_min[idx_to_fool], margin_min[idx_to_fool]
        deltas = [(xa - xc) for xa, xc in zip(x_adv_curr, x_curr)]

        p = self.p_selection(p_init, i_iter, n_iters)

        for i_img in range(len(x_adv_curr)):
            h, w, c = x_curr[i_img].shape
            n_features = c * h * w

            s = int(round(np.sqrt(p * n_features / c)))
            s = min(max(s, 1), h - 1)  # at least a c x 1 x 1 window and at most c x (h-1) x (h-1)
            center_h = np.random.randint(0, h - s)
            center_w = np.random.randint(0, w - s)

            x_curr_window = x_curr[i_img][center_h:center_h + s, center_w:center_w + s, :]
            x_best_curr_window = x_adv_curr[i_img][center_h:center_h + s, center_w:center_w + s, :]
            # prevent trying out a delta if it doesn't change x_curr (e.g. an overlapping patch)
            while np.sum(np.abs(np.clip(x_curr_window + deltas[i_img][center_h:center_h + s, center_w:center_w + s, :], self.min_val, self.max_val) - x_best_curr_window) < 10 ** -7) == c * s * s:
                deltas[i_img][center_h:center_h + s, center_w:center_w + s, :] = np.random.choice([-eps, eps], size=[1, 1, c])

        x_new = [np.clip(xc + d, self.min_val, self.max_val) for xc, d in zip(x_curr, deltas)]

        logits = self.classifier.predict(PREPROCESS(x_new))

        assert len(logits) == len(y_curr)

        loss = self.model_loss(y_curr, logits, targeted, loss_type=loss_type)
        margin = self.model_loss(y_curr, logits, targeted, loss_type='margin_loss')

        idx_improved = loss < loss_min_curr
        loss_min[idx_to_fool] = idx_improved * loss + ~idx_improved * loss_min_curr
        margin_min[idx_to_fool] = idx_improved * margin + ~idx_improved * margin_min_curr

        idx_improved = np.reshape(idx_improved, [-1, *[1] * 3])

        for i in range(len(idx_improved)):
            x_adv[idx_to_fool[i]] = idx_improved[i] * x_new[i] + ~idx_improved[i] * x_adv_curr[i]

        n_queries[idx_to_fool] += 1

        return x_adv, margin_min, loss_min, n_queries
Horizontal: one step of the attack (one random square per not-yet-fooled image).
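The side length of the square grows with p and with the image size; a short illustrative computation of s for a hypothetical 32x32x3 image, mirroring the formula in step():

import numpy as np

h, w, c = 32, 32, 3
n_features = c * h * w
for p in [0.05, 0.05 / 4, 0.05 / 32]:
    s = int(round(np.sqrt(p * n_features / c)))
    s = min(max(s, 1), h - 1)
    print(p, s)   # roughly 7, 4 and 1 pixels per side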
def batch(self, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type, concurrency)
    def batch(self, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type, concurrency):
        """ Vertical: multiple concurrent steps of the attack, merged into one candidate. """

        idx_to_fool = margin_min > 0
        idx_to_fool = [i for i, v in enumerate(idx_to_fool) if v]

        if len(idx_to_fool) == 0:
            return x_adv, margin_min, loss_min, n_queries

        x_curr = [x[idx] for idx in idx_to_fool]
        x_adv_curr = [x_adv[idx] for idx in idx_to_fool]
        y_curr = [y[idx] for idx in idx_to_fool]

        loss_min_curr, margin_min_curr = loss_min[idx_to_fool], margin_min[idx_to_fool]

        deltas = [(xa - xc) for xa, xc in zip(x_adv_curr, x_curr)]

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_task = {executor.submit(self.step, x, y, x_adv, margin_min, loss_min, n_queries, i_iter, n_iters, p_init, eps, targeted, loss_type): j for j in range(concurrency)}
            for future in concurrent.futures.as_completed(future_to_task):
                j = future_to_task[future]
                try:
                    x_adv, _, loss_min_temp, _ = future.result()
                    idx_improved = loss_min_temp[idx_to_fool] < loss_min_curr
                    idx_improved = [i for i, v in enumerate(idx_improved) if v]

                    # accumulate the deltas of every step that improved the loss
                    for idx in idx_improved:
                        deltas[idx] = deltas[idx] + x_adv[idx_to_fool[idx]] - x_adv_curr[idx]
                except Exception as e:
                    print('Task %r generated an exception: %s' % (j, e))

        x_new = [np.clip(xc + d, self.min_val, self.max_val) for xc, d in zip(x_curr, deltas)]

        logits = self.classifier.predict(PREPROCESS(x_new))

        assert len(logits) == len(y_curr)

        loss = self.model_loss(y_curr, logits, targeted, loss_type=loss_type)
        margin = self.model_loss(y_curr, logits, targeted, loss_type='margin_loss')

        idx_improved = loss < loss_min_curr
        loss_min[idx_to_fool] = idx_improved * loss + ~idx_improved * loss_min_curr
        margin_min[idx_to_fool] = idx_improved * margin + ~idx_improved * margin_min_curr

        idx_improved = np.reshape(idx_improved, [-1, *[1] * 3])

        for i in range(len(idx_improved)):
            x_adv[idx_to_fool[i]] = idx_improved[i] * x_new[i] + ~idx_improved[i] * x_adv_curr[i]

        n_queries[idx_to_fool] += (concurrency + 1)

        return x_adv, margin_min, loss_min, n_queries
Vertical: multiple concurrent steps of the attack, merged into one candidate.
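Each call to batch() submits concurrency independent step() calls to a thread pool, adds up the deltas of every worker that improved the loss, and spends one more query on the merged candidate, so n_queries grows by concurrency + 1 per call. A toy sketch of the merge with two hypothetical improving workers:

import numpy as np

x     = np.zeros((2, 2, 1))                          # clean image (toy)
x_adv = x.copy()                                     # current best adversarial image
delta = x_adv - x                                    # running perturbation
d_a   = np.array([[[8.0], [0.0]], [[0.0], [0.0]]])   # improvement found by worker A
d_b   = np.array([[[0.0], [0.0]], [[0.0], [-8.0]]])  # improvement found by worker B
delta = delta + d_a + d_b                            # batch() accumulates improving deltas
x_new = np.clip(x + delta, 0.0, 255.0)               # one extra query evaluates the merge
print(x_new[..., 0])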
def attack(self, x, y, targeted, epsilon=0.05, max_it=1000, p_init=0.05, loss_type='margin_loss', concurrency=1)
    def attack(self, x, y, targeted, epsilon=0.05, max_it=1000, p_init=0.05, loss_type='margin_loss', concurrency=1):
        """ The L-inf Square Attack: runs the horizontal (multi-image) or the vertical (single-image, concurrent) variant depending on the number of inputs. """
        if type(x) == list:
            n_targets = len(x)
        elif type(x) == np.ndarray:
            n_targets = x.shape[0]
        else:
            raise ValueError('Input type not supported...')

        assert n_targets > 0
        assert len(x) == len(y)

        y_label = np.argmax(y, axis=1)

        logits_clean = self.classifier.predict(PREPROCESS(x))

        corr_classified = [(logits_clean[i].argmax() == y_label[i]) for i in range(len(x))]

        # important to check that the model was restored correctly and the clean accuracy is high
        print('Clean accuracy: {:.2%}'.format(np.mean(corr_classified)))

        if np.mean(corr_classified) == 0:
            print('No clean examples classified correctly. Aborting...')
            n_queries = np.ones(len(x))  # ones because we have already used 1 query
            return x, n_queries
        else:
            if n_targets > 1:
                # Horizontally distributed attack
                pbar = tqdm(range(0, max_it - 1), desc="Distributed Square Attack (Horizontal)")
            else:
                # Vertically distributed attack
                pbar = tqdm(range(0, max_it - 1, concurrency), desc="Distributed Square Attack (Vertical)")

            np.random.seed(0)  # important to leave it here as well

            # Only attack the correctly classified examples
            y = y[corr_classified]
            if type(x) == list:
                idx_corr_classified = [i for i, v in enumerate(corr_classified) if v]
                x = [xi for i, xi in enumerate(x) if i in idx_corr_classified]
            elif type(x) == np.ndarray:
                x = x[corr_classified]

            x_adv, margin_min, loss_min, n_queries = self.init(x, y, epsilon * SCALE, targeted, loss_type)

            acc, acc_curr, mean_nq, mean_nq_ae, avg_margin_min = self.evaluate(margin_min, n_queries, 0, np.sum(corr_classified))

            if acc == 0:
                print('\nSuccessfully found adversarial examples for all examples after initialization')
                return x_adv, n_queries

            # Main loop
            for i_iter in pbar:
                if n_targets > 1:
                    # Horizontally distributed attack: one square per image per iteration
                    x_adv, margin_min, loss_min, n_queries = self.step(x, y, x_adv, margin_min, loss_min, n_queries, i_iter, max_it, p_init, epsilon * SCALE, targeted, loss_type)
                else:
                    # Vertically distributed attack: `concurrency` squares per iteration
                    x_adv, margin_min, loss_min, n_queries = self.batch(x, y, x_adv, margin_min, loss_min, n_queries, i_iter, max_it, p_init, epsilon * SCALE, targeted, loss_type, concurrency=concurrency)

                acc, acc_curr, mean_nq, mean_nq_ae, avg_margin_min = self.evaluate(margin_min, n_queries, i_iter, np.sum(corr_classified))

                pbar.set_postfix({'Total Queries': n_queries.sum(), 'Average Margin': avg_margin_min, 'Attack Success Rate': 1 - acc, 'Avg Queries': mean_nq_ae})

                if acc == 0:
                    print('\nSuccessfully found adversarial examples for all examples')
                    break

                gc.collect()

            return x_adv, n_queries
The L-inf Square Attack: runs the horizontal (multi-image) or the vertical (single-image, concurrent) variant depending on the number of inputs.
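A hedged end-to-end sketch. DummyClassifier, the image shapes, and the label encoding are assumptions for illustration; the only real requirement is a classifier object whose predict(batch) returns logits for HWC images on the 0-255 scale:

import numpy as np
from bat.attacks.square_attack import SquareAttack

class DummyClassifier:
    """Hypothetical wrapper: any object with predict(batch) -> logits works."""
    def predict(self, x):
        rng = np.random.RandomState(0)
        return rng.randn(len(x), 10)

x = np.random.randint(0, 256, size=(4, 32, 32, 3)).astype(np.float32)  # pixels in [0, 255]
y = np.eye(10)[np.random.randint(0, 10, size=4)]                       # one-hot labels

attack = SquareAttack(DummyClassifier())
x_adv, n_queries = attack.attack(x, y, targeted=False, epsilon=0.05, max_it=50)
print(n_queries)

With more than one input the horizontal mode is used automatically; passing a single image together with concurrency > 1 switches to the vertical, thread-pooled mode.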
def evaluate(self, margin_min, n_queries, i_iter, n_ex_total)
    def evaluate(self, margin_min, n_queries, i_iter, n_ex_total):
        if len(margin_min) > 0 and len(n_queries) > 0:
            acc = (margin_min > 0.0).sum() / n_ex_total
            acc_curr = (margin_min > 0.0).mean()
            mean_nq = np.mean(n_queries)
            mean_nq_ae = -1 if (margin_min <= 0).sum() == 0 else np.mean(n_queries[margin_min <= 0])
            avg_margin_min = np.mean(margin_min)

            return acc, acc_curr, mean_nq, mean_nq_ae, avg_margin_min
        else:
            # keep the arity consistent with the branch above (5 values)
            return -1, -1, -1, -1, -1
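The returned tuple drives the progress reporting in attack(): acc is the fraction of originally correct examples that are still unfooled, and mean_nq_ae averages the query counts over the examples that have been fooled (or -1 if none have). A small sketch with hypothetical numbers:

import numpy as np
from bat.attacks.square_attack import SquareAttack

ev = SquareAttack(classifier=None)        # evaluate() uses no classifier state
margin_min = np.array([1.2, -0.3, 0.0])   # one example still unfooled, two fooled
n_queries  = np.array([120.0, 35.0, 60.0])
acc, acc_curr, mean_nq, mean_nq_ae, avg_margin = ev.evaluate(margin_min, n_queries, i_iter=10, n_ex_total=3)
print(acc, mean_nq_ae)                    # 1/3 still robust; 47.5 average queries over fooled examples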