"""Bias Correction (BiC) plugin — source code for ``avalanche.training.plugins.bic``."""

from collections import defaultdict
from copy import deepcopy
from typing import (
    Dict,
    List,
    Literal,
    Optional,
    Sequence,
    Set,
    SupportsInt,
    Union,
)

import torch
from torch import Tensor
from torch.nn import Module
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader

from avalanche.benchmarks.utils import AvalancheDataset
from avalanche.benchmarks.utils.data_loader import ReplayDataLoader
from avalanche.benchmarks.utils.utils import concat_datasets
from avalanche.core import SupervisedPlugin
from avalanche.models.bic_model import BiasLayer
from avalanche.models.dynamic_modules import MultiTaskModule
from avalanche.training.storage_policy import (
    ExemplarsBuffer,
    ExperienceBalancedBuffer,
    ReservoirSamplingBuffer,
)
from avalanche.training.templates import SupervisedTemplate

class BiCPlugin(SupervisedPlugin):
    """
    Bias Correction (BiC) plugin.

    Technique introduced in:
    "Wu, Yue, et al. "Large scale incremental learning." Proceedings
    of the IEEE/CVF Conference on Computer Vision and Pattern
    Recognition. 2019"

    Implementation based on FACIL, as in:
    https://github.com/mmasana/FACIL
    """

    def __init__(
        self,
        mem_size: int = 2000,
        batch_size: Optional[int] = None,
        batch_size_mem: Optional[int] = None,
        task_balanced_dataloader: bool = False,
        storage_policy: Optional["ExemplarsBuffer"] = None,
        val_percentage: float = 0.1,
        T: int = 2,
        stage_2_epochs: int = 200,
        lamb: float = -1,
        lr: float = 0.1,
        num_workers: Union[int, Literal["as_strategy"]] = "as_strategy",
        verbose: bool = False,
    ):
        """
        :param mem_size: replay buffer size.
        :param batch_size: the size of the data batch. If set to `None`, it
            will be set equal to the strategy's batch size.
        :param batch_size_mem: the size of the memory batch. If
            `task_balanced_dataloader` is set to True, it must be greater
            than or equal to the number of tasks. If its value is set to
            `None` (the default value), it will be automatically set equal
            to the data batch size.
        :param task_balanced_dataloader: if True, buffer data loaders will
            be task-balanced, otherwise it will create a single dataloader
            for the buffer samples.
        :param storage_policy: The policy that controls how to add new
            exemplars in memory.
        :param val_percentage: hyperparameter used to set the percentage of
            exemplars in the val set.
        :param T: hyperparameter used to set the temperature used in
            stage 1.
        :param stage_2_epochs: hyperparameter used to set the amount of
            epochs of stage 2.
        :param lamb: hyperparameter used to balance the distilling loss and
            the classification loss.
        :param lr: hyperparameter used as a learning rate for the second
            phase of training.
        :param num_workers: number of workers using during stage 2 data
            loading. Defaults to "as_strategy", which means that the number
            of workers will be the same as the one used by the strategy.
        :param verbose: if True, prints additional info regarding the
            stage 2 stage.
        """
        # Replay (Phase 1)
        super().__init__()
        self.mem_size = mem_size
        self.batch_size = batch_size
        self.batch_size_mem = batch_size_mem
        self.task_balanced_dataloader = task_balanced_dataloader

        if storage_policy is not None:  # Use other storage policy
            self.storage_policy = storage_policy
            assert storage_policy.max_size == self.mem_size
        else:  # Default
            self.storage_policy = ExperienceBalancedBuffer(
                max_size=self.mem_size, adaptive_size=True
            )

        # Train Bias (Phase 2)
        self.val_percentage = val_percentage
        self.stage_2_epochs = stage_2_epochs
        self.T = T
        self.lamb = lamb
        # BUGFIX: the original read `self.mem_size = mem_size = lr`, which
        # silently overwrote the replay-buffer size with the learning rate
        # and never stored `lr`. Keep the two hyperparameters separate.
        self.lr = lr
        self.num_workers: Union[int, Literal["as_strategy"]] = num_workers

        self.seen_classes: Set[int] = set()
        self.class_to_tasks: Dict[int, int] = {}
        self.bias_layer: Optional[BiasLayer] = None
        self.model_old: Optional[Module] = None
        self.val_buffer: Dict[int, ReservoirSamplingBuffer] = {}

        # True until the first experience has been fully trained; used to
        # skip bias correction during eval of the very first experience.
        self.is_first_experience: bool = True

        self.verbose: bool = verbose

    def before_training(self, strategy: "SupervisedTemplate", *args, **kwargs):
        # BiC maintains a single bias layer over one shared output head, so
        # multi-head (task-incremental) models are not supported.
        assert not isinstance(
            strategy.model, MultiTaskModule
        ), "BiC only supported for Class Incremental Learning (single head)"

    def before_train_dataset_adaptation(
        self, strategy: "SupervisedTemplate", **kwargs
    ):
        """Split the incoming experience: a class-balanced validation slice
        is moved into ``val_buffer`` (used later by stage 2), the remainder
        replaces the experience's training dataset."""
        assert strategy.experience is not None
        new_data: AvalancheDataset = strategy.experience.dataset
        task_id = strategy.clock.train_exp_counter

        # Group sample indices by class label.
        cl_idxs: Dict[int, List[int]] = defaultdict(list)
        targets: Sequence[SupportsInt] = getattr(new_data, "targets")
        for idx, target in enumerate(targets):
            # Conversion to int may fix issues when target
            # is a single-element torch.tensor
            target = int(target)
            cl_idxs[target].append(idx)

        # Remember which experience each class was first seen in.
        for c in cl_idxs.keys():
            self.class_to_tasks[c] = task_id

        self.seen_classes.update(cl_idxs.keys())
        lens = self.get_group_lengths(len(self.seen_classes))
        class_to_len = {}
        for class_id, ll in zip(self.seen_classes, lens):
            class_to_len[class_id] = ll

        train_data = []
        for class_id in cl_idxs.keys():
            ll = class_to_len[class_id]
            # First `ll` samples of the class go to the validation buffer...
            new_data_c = new_data.subset(cl_idxs[class_id][:ll])
            if class_id in self.val_buffer:
                old_buffer_c = self.val_buffer[class_id]
                old_buffer_c.update_from_dataset(new_data_c)
                old_buffer_c.resize(strategy, ll)
            else:
                new_buffer = ReservoirSamplingBuffer(ll)
                new_buffer.update_from_dataset(new_data_c)
                self.val_buffer[class_id] = new_buffer

            # ...the rest remains available for training.
            train_data.append(new_data.subset(cl_idxs[class_id][ll:]))

        # Resize all per-class buffers to the new balanced lengths.
        for class_id, class_buf in self.val_buffer.items():
            class_buf.resize(strategy, class_to_len[class_id])

        strategy.experience.dataset = concat_datasets(train_data)

    def before_training_exp(
        self,
        strategy: "SupervisedTemplate",
        num_workers: int = 0,
        shuffle: bool = True,
        **kwargs
    ):
        """
        Dataloader to build batches containing examples from both memories
        and the training dataset.
        """
        assert strategy.adapted_dataset is not None

        # During the distillation phase this layer is not trained and is
        # only used to correct the bias of the classes encountered in the
        # previous experience. It will be unlocked in the bias correction
        # phase.
        if self.bias_layer is not None:
            for param in self.bias_layer.parameters():
                param.requires_grad = False

        if len(self.storage_policy.buffer) == 0:
            # First experience: we don't use the buffer, no need to change
            # the dataloader.
            return

        batch_size = self.batch_size
        if batch_size is None:
            batch_size = strategy.train_mb_size

        batch_size_mem = self.batch_size_mem
        if batch_size_mem is None:
            batch_size_mem = strategy.train_mb_size

        strategy.dataloader = ReplayDataLoader(
            strategy.adapted_dataset,
            self.storage_policy.buffer,
            oversample_small_tasks=True,
            batch_size=batch_size,
            batch_size_mem=batch_size_mem,
            task_balanced_dataloader=self.task_balanced_dataloader,
            num_workers=num_workers,
            shuffle=shuffle,
        )

    def after_eval_forward(self, strategy, **kwargs):
        # BUGFIX: the `return` statement had been fused into a comment by
        # the scrape; without it the bias forward would also run on the
        # first experience.
        if self.is_first_experience:
            return

        strategy.mb_output = self.bias_forward(strategy.mb_output)

    def bias_forward(self, input_data: Tensor) -> Tensor:
        """Apply the current bias-correction layer, or pass logits through
        unchanged when no layer has been created yet."""
        if self.bias_layer is None:
            return input_data

        return self.bias_layer(input_data)

    def before_backward(self, strategy, **kwargs):
        # Distillation
        if self.model_old is not None:
            # That is, from the second experience onwards
            distillation_loss = self.make_distillation_loss(strategy)

            # Count the number of already seen classes (i.e., classes from
            # previous experiences).
            initial_classes, previous_classes, current_classes = (
                self._classes_groups(strategy)
            )

            # Make old_classes and all_classes
            old_clss: Set[int] = set(initial_classes) | set(previous_classes)
            all_clss: Set[int] = old_clss | set(current_classes)

            if self.lamb == -1:
                # Auto-balance: weight distillation by the fraction of
                # already-seen classes.
                lamb = len(old_clss) / len(all_clss)
                strategy.loss = (
                    1.0 - lamb
                ) * strategy.loss + lamb * distillation_loss
            else:
                strategy.loss = strategy.loss + self.lamb * distillation_loss

    def after_training_exp(self, strategy, **kwargs):
        self.is_first_experience = False

        # Make sure that the old_model is frozen (including batch norm
        # layers): requires_grad=False is not sufficient to freeze BN
        # layers, we also need eval().
        self.model_old = None
        self.model_old = deepcopy(strategy.model)
        self.model_old.eval()
        for param in self.model_old.parameters():
            param.requires_grad = False

        task_id = strategy.clock.train_exp_counter
        self.storage_policy.update(strategy, **kwargs)
        if task_id > 0:
            num_workers = (
                int(kwargs.get("num_workers", 0))
                if self.num_workers == "as_strategy"
                else self.num_workers
            )
            persistent_workers = (
                False
                if num_workers == 0
                else kwargs.get("persistent_workers", False)
            )

            self.bias_correction_step(
                strategy,
                persistent_workers=persistent_workers,
                num_workers=num_workers,
            )

    def cross_entropy(self, new_outputs, old_outputs):
        """Calculates cross-entropy with temperature scaling.

        An earlier implementation used log_softmax/softmax divided by
        ``self.T`` and multiplied the final loss by T^2, which is not
        correct; the current form is more aligned with how it's done in the
        original implementation.

        NOTE(review): the temperature is hardcoded to 2 here (``self.T`` is
        unused) and the softmax runs over dim=0, the batch dimension —
        confirm against the reference implementation before changing.
        """
        dis_logits_soft = torch.nn.functional.softmax(old_outputs / 2, dim=0)
        loss_distill = torch.nn.functional.cross_entropy(
            new_outputs / 2, dis_logits_soft
        )
        return loss_distill

    def get_group_lengths(self, num_groups):
        """Compute groups lengths given the number of groups `num_groups`."""
        max_size = int(self.val_percentage * self.mem_size)
        lengths = [max_size // num_groups for _ in range(num_groups)]
        # Distribute the remaining size among the first `rem` experiences.
        rem = max_size - sum(lengths)
        for i in range(rem):
            lengths[i] += 1

        return lengths

    def make_distillation_loss(self, strategy):
        """Distillation loss between the frozen old model and the current
        minibatch output, restricted to the logits of already-seen classes."""
        assert self.model_old is not None
        initial_classes, previous_classes, current_classes = (
            self._classes_groups(strategy)
        )

        # Forward current minibatch through the old model.
        with torch.no_grad():
            out_old: Tensor = self.model_old(strategy.mb_x)

        if len(initial_classes) == 0:
            # We are in the second experience, no need to correct the bias.
            pass
        else:
            # We are in the third experience or later: bias_forward applies
            # the bias correction to the old model's output only for the
            # classes found in previous_classes (bias correction is not
            # applied to initial_classes or current_classes).
            assert self.bias_layer is not None
            assert set(self.bias_layer.clss.tolist()) == set(previous_classes)
            with torch.no_grad():
                out_old = self.bias_forward(out_old)

        # To compute the distillation loss, we need the output of the new
        # model without the bias correction. During train, the output of
        # the new model does not undergo bias correction, so we can use
        # mb_output directly.
        out_new: Tensor = strategy.mb_output

        # Union of initial_classes and previous_classes: needed to select
        # the logits of all the old classes.
        old_clss: List[int] = sorted(set(initial_classes) | set(previous_classes))

        # Distillation loss on the logits of the old classes.
        return self.cross_entropy(out_new[:, old_clss], out_old[:, old_clss])

    def bias_correction_step(
        self,
        strategy: SupervisedTemplate,
        persistent_workers: bool = False,
        num_workers: int = 0,
    ):
        """Stage 2 of BiC: train the alpha/beta parameters of a fresh
        :class:`BiasLayer` on the held-out validation buffer while the base
        model stays frozen."""
        # --- Prepare the models ---
        # Freeze the base model, only train the new bias layer.
        # Note: we use torch.no_grad for this. In this way, we don't need
        # to store the status of each requires_grad, which is useful when
        # we have multiple parameters with different requires_grad status.
        strategy.model.eval()

        # Create the bias layer of the current experience.
        targets = getattr(strategy.adapted_dataset, "targets")
        self.bias_layer = BiasLayer(targets.uniques)
        # BUGFIX: move the bias layer onto the strategy device; otherwise
        # its parameters stay on CPU while `outputs` below lives on
        # `strategy.device`.
        self.bias_layer.to(strategy.device)
        self.bias_layer.train()
        for param in self.bias_layer.parameters():
            param.requires_grad = True

        # BUGFIX: the scraped original dropped the `lr=` argument of the
        # optimizer; stage 2 trains with the `lr` hyperparameter.
        bic_optimizer = torch.optim.SGD(
            self.bias_layer.parameters(), lr=self.lr, momentum=0.9
        )

        # Typing note: verbose here is actually correct.
        # The PyTorch type stubs for MultiStepLR are broken in some versions.
        scheduler = MultiStepLR(
            bic_optimizer, milestones=[50, 100, 150], gamma=0.1, verbose=False
        )  # type: ignore

        # --- Prepare the dataloader for the validation set ---
        list_subsets: List[AvalancheDataset] = []
        for _, class_buf in self.val_buffer.items():
            list_subsets.append(class_buf.buffer)

        stage_set = concat_datasets(list_subsets)
        stage_loader = DataLoader(
            stage_set,
            batch_size=strategy.train_mb_size,
            shuffle=True,
            num_workers=num_workers,
            persistent_workers=persistent_workers,
        )

        # Loop epochs
        for e in range(self.stage_2_epochs):
            total, t_acc, t_loss = 0, 0, 0
            for inputs in stage_loader:
                x = inputs[0].to(strategy.device)
                y_real = inputs[1].to(strategy.device)

                # Base model is frozen: no gradient needed for its forward.
                with torch.no_grad():
                    outputs = strategy.model(x)

                outputs = self.bias_layer(outputs)

                loss = torch.nn.functional.cross_entropy(outputs, y_real)

                _, preds = torch.max(outputs, 1)
                # BUGFIX: the scraped original truncated this line; the
                # accuracy accumulator counts correct predictions.
                t_acc += torch.sum(preds == y_real.data)
                t_loss += loss.item() * x.size(0)
                total += x.size(0)

                # Hand-made L2 loss
                # loss += 0.1 * ((self.bias_layer.beta.sum() ** 2) / 2)

                bic_optimizer.zero_grad()
                loss.backward()
                bic_optimizer.step()

            scheduler.step()
            # Report progress four times over the stage-2 schedule.
            if self.verbose and (self.stage_2_epochs // 4) > 0:
                if (e + 1) % (self.stage_2_epochs // 4) == 0:
                    print(
                        "| E {:3d} | Train: loss={:.3f}, S2 acc={:5.1f}% |".format(
                            e + 1, t_loss / total, 100 * t_acc / total
                        )
                    )

        # Freeze the bias layer.
        self.bias_layer.eval()
        for param in self.bias_layer.parameters():
            param.requires_grad = False

        if self.verbose:
            print(
                "Bias correction done: alpha={}, beta={}".format(
                    self.bias_layer.alpha.item(), self.bias_layer.beta.item()
                )
            )

    def _classes_groups(self, strategy: SupervisedTemplate):
        """Split seen classes into three sorted groups relative to the
        current experience:

        - "initial" classes: seen in experiences [0, current_experience-2]
        - "previous" classes: seen in current_experience-1
        - "current" classes: seen in current_experience
        """
        current_experience: int = strategy.experience.current_experience

        initial_classes: Set[int] = (
            set()
        )  # pre_initial_cl in the original implementation
        previous_classes: Set[int] = set()  # pre_new_cl in the original impl.
        current_classes: Set[int] = set()  # new_cl in the original impl.
        # Note: pre_initial_cl + pre_new_cl is "initial_cl" in the original
        # implementation.

        for cls, exp_id in self.class_to_tasks.items():
            assert exp_id >= 0
            assert exp_id <= current_experience

            if exp_id < current_experience - 1:
                initial_classes.add(cls)
            elif exp_id == current_experience - 1:
                previous_classes.add(cls)
            else:
                current_classes.add(cls)

        return (
            sorted(initial_classes),
            sorted(previous_classes),
            sorted(current_classes),
        )
__all__ = [ "BiCPlugin", ]