Optimization Algorithms: Adam
Adam
The cost function of a machine learning algorithm can usually be decomposed into a sum of per-example cost functions. The negative conditional log-likelihood of the training data can be written as
\[J(\theta)=\mathbb{E}_{x,y \sim \hat p_{data}} L(x, y, \theta)=\frac {1} {m} \sum_{i=1}^m L(x^{(i)},y^{(i)},\theta) \]where \(L\) is the per-example loss \(L(x, y, \theta) = -\log p(y \mid x;\theta)\).
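As a minimal illustration of this decomposition (a hypothetical logistic-regression model; the data and names below are made up for the example), the objective can be computed as the mean of per-example negative log-likelihoods:

import numpy as np

# hypothetical logistic-regression example: J(theta) as the mean of per-example losses
np.random.seed(0)
m = 8
x = np.random.randn(3, m)                  # 3 features, m examples (columns)
y = np.random.randint(0, 2, size=m)        # binary labels
theta = np.random.randn(3)

def per_example_nll(x_i, y_i, theta):
    p = 1. / (1. + np.exp(-theta @ x_i))   # p(y = 1 | x; theta)
    return -(y_i * np.log(p) + (1. - y_i) * np.log(1. - p))

J = np.mean([per_example_nll(x[:, i], y[i], theta) for i in range(m)])
print(J)                                   # scalar value of the objective on this toy data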
For these additive cost functions, gradient descent requires computing
\[\nabla_{\theta} J(\theta) = \frac {1} {m} \sum_{i=1}^m \nabla_{\theta} L(x^{(i)},y^{(i)},\theta) \]The Adam algorithm uses a momentum variable \(v_t\) together with the variable \(s_t\) from RMSProp, an exponentially weighted moving average of the element-wise squared mini-batch stochastic gradient, and initializes every element of both to \(0\) at time step \(0\). Given the hyperparameter \(0 \le \beta_1 < 1\) (the authors of the algorithm suggest \(0.9\)), the momentum variable \(v_t\) at time step \(t\) is the exponentially weighted moving average of the mini-batch stochastic gradient \(g_t\):
\[v_t \leftarrow \beta_1 \cdot v_{t-1} + (1-\beta_1)\cdot g_t \]As in RMSProp, given the hyperparameter \(0 \le \beta_2 < 1\) (the authors suggest \(0.999\)), the element-wise square of the mini-batch stochastic gradient, \(g_t \odot g_t\), is averaged with exponential weighting to obtain \(s_t\):
\[s_t \leftarrow \beta_2 \cdot s_{t-1} + (1-\beta_2) \cdot g_t \odot g_t \]Because every element of \(v_0\) and \(s_0\) is initialized to \(0\), at time step \(t\) we have \(v_t = (1 - \beta_1) \sum^t_{i=1} \beta_1^{t-i} g_i\). Summing the weights of the past mini-batch stochastic gradients gives \((1 - \beta_1) \sum^t_{i=1} \beta_1^{t-i} = 1 - \beta_1^t\). Note that when \(t\) is small, this sum of weights is small. For example, when \(\beta_1 = 0.9\) we get \(v_1 = 0.1 g_1\). To remove this effect, for any time step \(t\) we can divide \(v_t\) by \(1 - \beta_1^t\), so that the weights of the past mini-batch stochastic gradients sum to \(1\). This is called bias correction. In the Adam algorithm, both \(v_t\) and \(s_t\) are bias-corrected:
\[\begin{split} & \hat v_t \leftarrow \frac {v_t} {1-\beta_1^t}, \\ & \hat s_t \leftarrow \frac {s_t} {1-\beta_2^t}. \end{split} \]Next, Adam uses the bias-corrected variables \(\hat v_t\) and \(\hat s_t\) to rescale the learning rate of every element of the model parameters with element-wise operations:
\[g_t' \leftarrow \frac {\epsilon \hat v_t} {\sqrt{\hat s_t}+\delta}, \]where \(\epsilon\) is the learning rate and \(\delta\) is a small constant added for numerical stability, e.g. \(10^{-8}\). As in AdaGrad, RMSProp, and AdaDelta, every element of the objective function's variables gets its own learning rate. Finally, \(g_t'\) is used to update the variables:
\[\theta_t \leftarrow \theta_{t-1} - g_t' \]
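A minimal NumPy sketch of one Adam update, assuming the caller supplies a gradient function grad(theta) (the function names and the toy objective below are illustrative, not part of the original implementation):

import numpy as np

def adam_step(theta, grad, v, s, t, lr=0.001, beta1=0.9, beta2=0.999, delta=1e-8):
    """One Adam update; v and s are the running moment estimates, t is the 1-based time step."""
    g = grad(theta)
    v = beta1 * v + (1 - beta1) * g              # momentum (first moment), v_t
    s = beta2 * s + (1 - beta2) * g * g          # element-wise second moment, s_t
    v_hat = v / (1 - beta1 ** t)                 # bias correction
    s_hat = s / (1 - beta2 ** t)
    theta = theta - lr * v_hat / (np.sqrt(s_hat) + delta)
    return theta, v, s

# toy usage: minimize f(theta) = ||theta||^2, whose gradient is 2 * theta
theta = np.array([1.0, -2.0, 3.0])
v = np.zeros_like(theta)
s = np.zeros_like(theta)
for t in range(1, 201):
    theta, v, s = adam_step(theta, lambda th: 2.0 * th, v, s, t, lr=0.1)
print(theta)   # theta has moved from (1, -2, 3) toward the zero vector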
Test Cases
We again use the Logistic and Linear models; they are not described again here (see the SGD article).
The example consists of three files, optimizers.py, nn.py, and test.py, together with the shared base module base.py shown first.
"""
file name: base.py
"""
import numpy as np
class Optimizer(object):
def __init__(self, lr=0.01, delta=1e-6):
self.lr = lr
self.delta = delta
class OptimizerWeights(object):
def __init__(self, lr=0.01, delta=1e-6):
self.lr = lr
self.delta = delta
self.hyp_t = 1
def __call__(self, *args, **kwargs):
return None
def init_parameters(self, inputs_shape):
pass
# Module 基类
class Module(object):
def __init__(self, weight=None, bias=None):
self.weight = weight
self.bias = bias
self.train = True
self.y_pred = None
self.y_true = None
self.loss_diff = np.zeros((1, 1))
self._loss_pro = 0.
self._loss_now = 0.
self._weight_diff = 1.
self._bias_diff = np.zeros((1, 1))
self.optimizer_weights_update = None
def __call__(self, *args, **kwargs):
inputs_shape = []
for arg in args:
inputs_shape.append(arg.shape)
for _, arg in kwargs:
inputs_shape.append(arg.shape)
self.args = args
self.kwargs = kwargs
if len(inputs_shape) == 0:
self.build(inputs_shape)
elif len(inputs_shape) == 1:
self.build(inputs_shape[0])
else:
self.build(inputs_shape)
if self.optimizer_weights_update:
self.optimizer_weights_update.init_parameters(inputs_shape[0])
if hasattr(self, 'forward'):
forward = getattr(self, 'forward')
self.y_pred = forward(*args, **kwargs)
self.diff_parameters(*args, **kwargs)
return self.y_pred
def loss(self, *args, **kwargs):
return 0.
def build(self, inputs_shape):
if len(inputs_shape) == 0:
pass
else:
if self.weight is None:
self.weight = np.zeros(*inputs_shape[:-1])[:, np.newaxis]
if self.bias is None:
self.bias = np.zeros((1, 1))
def diff_parameters(self, *args, **kwargs) -> None:
pass
def backprop(self):
wb_diff = [np.matmul(self._weight_diff, self.loss_diff), self._bias_diff * self.loss_diff]
wb_diff = self.optimizer_weights_update(wb_diff)
self.weight -= wb_diff[0]
self.bias -= wb_diff[1]
return True
def set_optimizer_weights_update(self, weights_update):
self.optimizer_weights_update = weights_update
def set_hyp_t(self, hyp_t):
if self.optimizer_weights_update:
self.optimizer_weights_update.hyp_t = hyp_t
The optimizers.py file contains the following:
"""
file name: optimizers.py
"""
import .base import OptimizerWeights, Optimizer
# SGD ...
class AdamWeights(OptimizerWeights):
def __init__(self, lr=0.01, delta=1e-6, beta1=0.9, beta2=0.999):
super(AdamWeights, self).__init__(lr=lr, delta=delta)
self.beta1 = beta1
self.beta2 = beta2
self.vector = None
self.steepest = None
def __call__(self, wb_diff, hyp_t=None):
if not hyp_t:
hyp_t = self.hyp_t
v_bias_corr, s_bias_corr = self._update_parameters(wb_diff, hyp_t)
g_adam_diff = [self.lr * v_corr / (np.sqrt(s_corr) + self.delta) for v_corr, s_corr in zip(v_bias_corr, s_bias_corr)]
return g_adam_diff
def init_parameters(self, inputs_shape):
self.vector = [np.zeros((inputs_shape[0], 1)), np.zeros((1, 1))]
self.steepest = [np.zeros((inputs_shape[0], 1)), np.zeros((1, 1))]
pass
def _update_parameters(self, wb_diff, hyp_t):
self.vector = [self.beta1 * v + (1 - self.beta1) * wb_diff[i] for i, v in enumerate(self.vector)]
self.steepest = [self.beta1 * s + (1 - self.beta1) * wb_diff[i] ** 2 for i, s in enumerate(self.steepest)]
v_bias_corr = [v / (1 - self.beta1 ** hyp_t) for v in self.vector]
s_bias_corr = [s / (1 - self.beta2 ** hyp_t) for s in self.steepest]
return v_bias_corr, s_bias_corr
class Adam(Optimizer):
def __init__(self, lr=0.01, delta=1e-6, beta1=0.9, beta2=0.999):
super(Adam, self).__init__(lr=lr, delta=delta)
self.beta1 = beta1
self.beta2 = beta2
def __call__(self, x, y, model, batch_size=1, epochs=10, threshold=0.01):
model.set_optimizer_weights_update(AdamWeights(self.lr, self.delta, self.beta1, self.beta2))
num_record = x.shape[-1]
if num_record < batch_size:
batch_size = num_record
bool_break = False
for i in range(epochs):
loss_mean = 0.
model.set_hyp_t(i + 1)
for j in range(num_record):
y_pred = model(x[..., j:j + 1])
y_true = y[..., j]
sgd_loss = model.loss(y_pred, y_true)
if (j + 1) % batch_size == 0:
if np.abs(loss_mean) < threshold or loss_mean == np.NAN:
bool_break = True
break
loss_mean = 0.
loss_mean = (loss_mean * j + sgd_loss) / (j + 1)
model.backprop()
if bool_break:
break
return model
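As a quick standalone check of AdamWeights (a hypothetical snippet, assuming the corrected optimizers.py above is importable in your package layout), the very first step with hyp_t = 1 should return roughly the learning rate for unit gradients, since bias correction makes the corrected moments equal to the gradient and its square:

import numpy as np
# from optimizers import AdamWeights   # hypothetical import, depends on your layout

adam_w = AdamWeights(lr=0.1, delta=1e-6, beta1=0.9, beta2=0.999)
adam_w.init_parameters((3, 1))                  # weight state (3, 1), bias state (1, 1)
grads = [np.ones((3, 1)), np.ones((1, 1))]      # pretend gradients for weight and bias
step = adam_w(grads, hyp_t=1)                   # at t = 1, v_hat = g and s_hat = g * g
print(step[0])                                  # every element is about lr = 0.1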
"""
file name: nn.py
"""
from .base import Module
# Logistic
class Logistic(Module):
def __init__(self, w=None, b=None):
super(Logistic, self).__init__(w, b)
def forward(self, x):
return 1. / (1. + np.exp(np.matmul(self.weight.T, x) + self.bias))
def loss(self, y_pred, y_true, delta=1e-16):
self._loss_pro = self._loss_now
y_pred = np.minimum(np.maximum(y_pred, delta), 1. - delta)
self._loss_now = -(y_true * np.log(y_pred) +
(1. - y_true) * np.log(1. - y_pred))
self.loss_diff = -(y_true / y_pred - (1. - y_true) / (1. - y_pred))
return self._loss_now
def diff_parameters(self, x):
g_param_diff = -2. * self.y_pred * (1. - self.y_pred)
self._weight_diff = g_param_diff * x
self._bias_diff = g_param_diff
pass
# Linear
class Linear(Module):
def __init__(self, w=None, b=None):
super(Linear, self).__init__(w, b)
def forward(self, x):
return np.matmul(self.weight.T, x) + self.bias
def loss(self, y_pred, y_true):
self._loss_pro = self._loss_now
self._loss_now = np.sum((y_pred - y_true) ** 2)
self.loss_diff = 2. * (y_pred - y_true)
return self._loss_now
def diff_parameters(self, x):
self._weight_diff = x
self._bias_diff = 1.
pass
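As a sanity check on how loss_diff and _weight_diff compose in Module.backprop, here is a hypothetical finite-difference comparison for the Linear module (it pokes at the underscore-prefixed internals and assumes base.py and nn.py above are importable):

import numpy as np
# from nn import Linear   # hypothetical import, depends on your layout

np.random.seed(0)
x = np.random.randn(4, 1)
y_true = np.array([[2.0]])

lin = Linear(w=np.random.randn(4, 1), b=np.zeros((1, 1)))
lin(x)                         # forward pass; also fills _weight_diff and _bias_diff
lin.loss(lin.y_pred, y_true)   # fills loss_diff = dL/dy

analytic = np.matmul(lin._weight_diff, lin.loss_diff)   # dL/dw as used in backprop

eps = 1e-6
numeric = np.zeros_like(lin.weight)
for i in range(lin.weight.shape[0]):
    w_plus = lin.weight.copy(); w_plus[i] += eps
    w_minus = lin.weight.copy(); w_minus[i] -= eps
    loss_plus = np.sum((np.matmul(w_plus.T, x) + lin.bias - y_true) ** 2)
    loss_minus = np.sum((np.matmul(w_minus.T, x) + lin.bias - y_true) ** 2)
    numeric[i] = (loss_plus - loss_minus) / (2 * eps)

print(np.max(np.abs(analytic - numeric)))   # should be close to 0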
The test file:
"""
file name: test.py
"""
import numpy as np
import .nn
from .optimizers import SGD, Adam
def Sigmod(x, w, b):
return 1. / (1. + np.exp(np.matmul(w.T, x) + b))
def Linear(x, w, b):
return np.matmul(w.T, x) + b
def test_Optimizer_Logistic(x, w, b, Optimizer):
y_true = Sigmod(x, w, b)
rand_y = np.random.randn(len(y_true))
rand_y = 0.01 * rand_y / np.max(np.abs(rand_y))
y_true = Sigmod(x, w, b) + rand_y > 0.5
model = nn.Logistic()
sgd_model = Optimizer(x, y_true, model, batch_size=256,
epochs=10000, threshold=.5)
y_pred = np.float32(Sigmod(x, sgd_model.weight, sgd_model.bias) > 0.5)
print('error_rate: ', np.sum(np.abs(y_pred - y_true)) / len(y_true))
def test_Optimizer_Linear(x, w, b, Optimizer):
y_true = Linear(x, w, b)
rand_y = np.random.randn(len(y_true))
y_true += 0.01 * rand_y / np.max(np.abs(rand_y))
model = nn.Linear()
sgd_model = Optimizer(x, y_true, model, batch_size=256,
epochs=10000, threshold=.005)
y_pred = Linear(x, sgd_model.weight, sgd_model.bias)
print('MSE: ', np.sum((y_pred - y_true) ** 2) / len(y_true))
def create_optimizer(optimizer='sgd', lr=0.01, delta=1e-6, **kwargs):
if optimizer == 'adam':
opt = Adam(lr=lr, delta=delta)
else:
opt = SGD(lr=lr, delta=delta)
return opt
def test_Optimizer(model='logistic', optimizer='sgd'):
"""
Args:
model: 'logistic', 'linear'
optimizer: 'sgd', 'adam'
"""
w = np.array([1.8, -2.5, 3.1, -2.3, .6, 2.1, -1.1])
b = 0.1
# Data
x = np.random.randn(len(w), 1024)
if model == 'logistic':
opt_logistic = create_optimizer('adam', lr=0.0001)
test_Optimizer_Logistic(x, w, b, opt_logistic)
elif model == 'linear':
opt_linear = create_optimizer('adam', lr=0.001)
test_Optimizer_Linear(x, w, b, opt_linear)
if __name__ == '__main__':
# fun: logistic linear
# optimizer: sdg adam
test_Optimizer('logistic')
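The entry point can also be invoked for the other combinations (hypothetical invocations, assuming the package layout above; the SGD class comes from the earlier SGD article):

test_Optimizer('linear', 'adam')     # Adam on the Linear regression case
test_Optimizer('logistic', 'sgd')    # SGD baseline on the Logistic case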