import torch
from .optimizer import Optimizer, required


class SGD(Optimizer):
    r"""Implements stochastic gradient descent (optionally with momentum).

    Nesterov momentum is based on the formula from
    `On the importance of initialization and momentum in deep learning`__.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

    __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf

    .. note::
        The implementation of SGD with Momentum/Nesterov subtly differs from
        Sutskever et. al. and implementations in some other frameworks.

        Considering the specific case of Momentum, the update can be written as

        .. math::
            \begin{aligned}
                v_{t+1} & = \mu * v_{t} + g_{t+1}, \\
                p_{t+1} & = p_{t} - \text{lr} * v_{t+1},
            \end{aligned}

        where :math:`p`, :math:`g`, :math:`v` and :math:`\mu` denote the
        parameters, gradient, velocity, and momentum respectively.

        This is in contrast to Sutskever et. al. and
        other frameworks which employ an update of the form

        .. math::
            \begin{aligned}
                v_{t+1} & = \mu * v_{t} + \text{lr} * g_{t+1}, \\
                p_{t+1} & = p_{t} - v_{t+1}.
            \end{aligned}

        The Nesterov version is analogously modified.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(SGD, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad
                if weight_decay != 0:
                    d_p = d_p.add(p, alpha=weight_decay)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.add_(d_p, alpha=-group['lr'])

        return loss
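To make the PyTorch formulation concrete, here is a minimal sketch (not part of the PyTorch source; the parameter w and the constant toy gradient are made up) that replays v ← μ·v + g, p ← p − lr·v by hand and checks it against torch.optim.SGD:

import torch

# One parameter with a fixed, hypothetical gradient.
w = torch.tensor([1.0, 2.0], requires_grad=True)
opt = torch.optim.SGD([w], lr=0.1, momentum=0.9)

v = torch.zeros_like(w)          # velocity buffer, v_0 = 0
expected = w.detach().clone()

for step in range(3):
    g = torch.tensor([0.5, -0.5])   # pretend gradient g_{t+1}
    w.grad = g.clone()
    opt.step()

    # PyTorch form: v <- mu*v + g ; p <- p - lr*v
    # (the first step clones g into the buffer, i.e. v_1 = g_1)
    v = 0.9 * v + g if step > 0 else g.clone()
    expected -= 0.1 * v
    assert torch.allclose(w.detach(), expected)

Note that lr multiplies the velocity only at the final update, which is exactly the difference from the Sutskever et al. form described in the docstring above.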
import torch
from . import functional as F
from .optimizer import Optimizer


class Adagrad(Optimizer):
    """Implements Adagrad algorithm.

    It has been proposed in `Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0,
                 initial_accumulator_value=0, eps=1e-10):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,
                        initial_accumulator_value=initial_accumulator_value)
        super(Adagrad, self).__init__(params, defaults)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['sum'] = torch.full_like(p, initial_accumulator_value,
                                               memory_format=torch.preserve_format)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['sum'].share_memory_()

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            state_sums = []
            state_steps = []

            for p in group['params']:
                if p.grad is not None:
                    params_with_grad.append(p)
                    grads.append(p.grad)
                    state = self.state[p]
                    state_sums.append(state['sum'])
                    # update the steps for each param group update
                    state['step'] += 1
                    # record the step after step update
                    state_steps.append(state['step'])

            F.adagrad(params_with_grad,
                      grads,
                      state_sums,
                      state_steps,
                      group['lr'],
                      group['weight_decay'],
                      group['lr_decay'],
                      group['eps'])

        return loss
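As a sanity check on what F.adagrad computes per parameter, a small hand-rolled replay (the tensor w and the toy gradient are hypothetical; lr_decay is kept at 0 so the effective learning rate stays constant):

import torch

w = torch.tensor([1.0, -1.0], requires_grad=True)
opt = torch.optim.Adagrad([w], lr=0.1, lr_decay=0.0, eps=1e-10)

state_sum = torch.zeros_like(w)   # running sum of squared gradients
expected = w.detach().clone()

for step in range(1, 4):
    g = torch.tensor([0.3, 0.7])
    w.grad = g.clone()
    opt.step()

    # effective lr is lr / (1 + (step - 1) * lr_decay); with lr_decay = 0
    # this is just lr, and the update is p -= lr * g / (sqrt(sum) + eps)
    state_sum += g * g
    expected -= 0.1 * g / (state_sum.sqrt() + 1e-10)
    assert torch.allclose(w.detach(), expected)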
class torch.optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
Parameters:
params (iterable) – the model parameters for the optimizer to update.
lr (float) – learning rate; this corresponds to the global step size in the unified update framework (default: 0.01).
momentum (float, optional) – momentum factor (default: 0).
alpha (float, optional) – smoothing constant (default: 0.99).
centered (bool, optional) – if True, compute the centered RMSprop, in which the gradient is normalized by an estimate of its variance; that is, the denominator uses the estimated variance of the gradient rather than its raw second moment.
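A quick usage sketch before the source listing (the Linear model and the stand-in loss below are placeholders, not part of the original text):

import torch

# Hypothetical model, just to have parameters to optimize.
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99,
                                momentum=0.9, centered=True)

optimizer.zero_grad()
loss = model(torch.randn(8, 4)).pow(2).mean()   # stand-in loss
loss.backward()
optimizer.step()

The full implementation is reproduced below.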
import torch
from .optimizer import Optimizer


class RMSprop(Optimizer):
    r"""Implements RMSprop algorithm.

    Proposed by G. Hinton in his
    `course <https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.

    The centered version first appears in `Generating Sequences
    With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.

    The implementation here takes the square root of the gradient average before
    adding epsilon (note that TensorFlow interchanges these two operations). The effective
    learning rate is thus :math:`\alpha/(\sqrt{v} + \epsilon)` where :math:`\alpha`
    is the scheduled learning rate and :math:`v` is the weighted moving average
    of the squared gradient.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        momentum (float, optional): momentum factor (default: 0)
        alpha (float, optional): smoothing constant (default: 0.99)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        centered (bool, optional) : if ``True``, compute the centered RMSProp,
            the gradient is normalized by an estimation of its variance
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
    """

    def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0,
                 momentum=0, centered=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= momentum:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= alpha:
            raise ValueError("Invalid alpha value: {}".format(alpha))

        defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps,
                        centered=centered, weight_decay=weight_decay)
        super(RMSprop, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RMSprop, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('momentum', 0)
            group.setdefault('centered', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('RMSprop does not support sparse gradients')
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if group['momentum'] > 0:
                        state['momentum_buffer'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if group['centered']:
                        state['grad_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                square_avg = state['square_avg']
                alpha = group['alpha']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])

                square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)

                if group['centered']:
                    grad_avg = state['grad_avg']
                    grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha)
                    avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).sqrt_().add_(group['eps'])
                else:
                    avg = square_avg.sqrt().add_(group['eps'])

                if group['momentum'] > 0:
                    buf = state['momentum_buffer']
                    buf.mul_(group['momentum']).addcdiv_(grad, avg)
                    p.add_(buf, alpha=-group['lr'])
                else:
                    p.addcdiv_(grad, avg, value=-group['lr'])

        return loss
Here grad = p.grad retrieves each parameter's gradient, i.e. the g_t in Eq. (1).
If weight_decay is used, it is equivalent to adding the term (λ/2)·‖θ‖² to the objective (with λ = weight_decay), so the gradient picks up an extra λ·θ; hence grad = grad.add(p, alpha=group['weight_decay']).
square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha) corresponds to Eq. (10): it computes the current step's exponentially weighted average of the squared gradient, v_t = α·v_{t-1} + (1 − α)·g_t².
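Putting Eq. (10) and the parameter update together, here is a minimal replay of plain RMSprop (momentum=0, centered=False, no weight decay); w and the toy gradient are hypothetical:

import torch

w = torch.tensor([1.0, 2.0], requires_grad=True)
opt = torch.optim.RMSprop([w], lr=0.01, alpha=0.99, eps=1e-8)

square_avg = torch.zeros_like(w)
expected = w.detach().clone()

for _ in range(3):
    g = torch.tensor([0.4, -0.2])
    w.grad = g.clone()
    opt.step()

    square_avg = 0.99 * square_avg + 0.01 * g * g       # Eq. (10)
    expected -= 0.01 * g / (square_avg.sqrt() + 1e-8)   # p -= lr * g / (sqrt(v) + eps)
    assert torch.allclose(w.detach(), expected)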
import math
import torch
from .optimizer import Optimizer


class Adam(Optimizer):
    r"""Implements Adam algorithm.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.addcdiv_(exp_avg, denom, value=-step_size)

        return loss
Here, as before, grad = p.grad retrieves each parameter's gradient, i.e. the g_t in Eq. (1).
If weight_decay is used, it is equivalent to adding (λ/2)·‖θ‖² to the objective (with λ = weight_decay), so the gradient picks up an extra λ·θ; hence grad = grad.add(p, alpha=group['weight_decay']).
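The same kind of hand replay works for Adam, including the bias corrections (no weight decay, no AMSGrad; w and the toy gradient are hypothetical):

import math
import torch

w = torch.tensor([1.0, -2.0], requires_grad=True)
opt = torch.optim.Adam([w], lr=1e-3, betas=(0.9, 0.999), eps=1e-8)

m = torch.zeros_like(w)   # exp_avg
v = torch.zeros_like(w)   # exp_avg_sq
expected = w.detach().clone()

for t in range(1, 4):
    g = torch.tensor([0.1, 0.3])
    w.grad = g.clone()
    opt.step()

    m = 0.9 * m + 0.1 * g              # first moment EMA
    v = 0.999 * v + 0.001 * g * g      # second moment EMA
    denom = v.sqrt() / math.sqrt(1 - 0.999 ** t) + 1e-8
    expected -= (1e-3 / (1 - 0.9 ** t)) * m / denom
    assert torch.allclose(w.detach(), expected)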
Just adding the square of the weights to the loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that would interact with the m and v moving averages in strange ways. Instead we want to decay the weights in a manner that does not touch m and v; this decoupled decay is what would be equivalent to adding the square of the weights to the loss under plain (non-momentum) SGD. AdamW therefore applies the weight decay directly to the parameters, separately from the gradient-based update.
import math
import torch
from .optimizer import Optimizer


class AdamW(Optimizer):
    r"""Implements AdamW algorithm.

    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient (default: 1e-2)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=1e-2, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                # Perform decoupled weight decay
                p.mul_(1 - group['lr'] * group['weight_decay'])

                # Perform optimization step
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.addcdiv_(exp_avg, denom, value=-step_size)

        return loss
The difference from Adam lies in how weight decay is applied:
In Adam, using weight_decay is equivalent to adding (λ/2)·‖θ‖² to the objective, so the gradient picks up an extra λ·θ; hence grad = grad.add(p, alpha=group['weight_decay']), and the decay term flows into the moving averages m and v.
In AdamW, the parameter is instead decayed directly, before the Adam update, via p.mul_(1 - group['lr'] * group['weight_decay']), so the moving averages only ever see the raw gradient.
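The two-line contrast, written out with hypothetical scalars lr, wd and tensors w, g (a schematic sketch, not the library code):

import torch

lr, wd = 1e-3, 1e-2
w = torch.tensor([1.0, -1.0])
g = torch.tensor([0.2, 0.4])

# Adam with weight_decay: the penalty enters the gradient, so it also flows
# into the moving averages m and v (classic L2 regularization).
g_adam = g + wd * w           # grad = grad.add(p, alpha=group['weight_decay'])

# AdamW: the parameter is shrunk directly before the Adam update, and the
# moving averages only ever see the raw gradient (decoupled weight decay).
w_adamw = w * (1 - lr * wd)   # p.mul_(1 - group['lr'] * group['weight_decay'])

print(g_adam, w_adamw)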