PyTorch LSTM Source Code Walkthrough and a Custom Bidirectional LSTM Operator

1. Theory

For the theory behind LSTM, the following references are helpful.

Paper

  • Long Short-Term Memory Based Recurrent Neural Network Architectures for Large Vocabulary Speech Recognition

Explanations

  • Understanding LSTM Networks
  • 人人都能看懂的LSTM (an LSTM primer in Chinese)

PyTorch LSTM operator

  • LSTM documentation

The forward computation of LSTMCell is as follows:
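The gate updates below follow the torch.nn.LSTMCell documentation; \sigma is the sigmoid function and \odot the element-wise product:

\begin{aligned}
i_t &= \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
f_t &= \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
g_t &= \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
o_t &= \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
c_t &= f_t \odot c_{t-1} + i_t \odot g_t \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}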

2. Source Code

In the Python code, all that is visible is the call to _VF.lstm:

# https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py
# line 688
if batch_sizes is None:
    result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
                      self.dropout, self.training, self.bidirectional, self.batch_first)
else:
    result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias, self.num_layers,
                      self.dropout, self.training, self.bidirectional)
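For reference, a minimal call to the Python-level module shows the shapes that come back from this dispatch (the sizes below are illustrative and match the 512/256 configuration used in section 3):

import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=512, hidden_size=256, num_layers=1,
              bidirectional=True, batch_first=True)
x = torch.randn(4, 26, 512)        # (batch, seq_len, input_size)
output, (h_n, c_n) = rnn(x)
print(output.shape)                # torch.Size([4, 26, 512])  -- 2 * hidden_size (bidirectional)
print(h_n.shape)                   # torch.Size([2, 4, 256])   -- (num_layers * num_directions, batch, hidden_size)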

Turning to the C++ code. The logic is fairly clear: the call goes through torch::lstm, _lstm_impl, _rnn_impl, and apply_layer_stack, and the per-timestep computation is ultimately implemented in LSTMCell.

// https://github.com/pytorch/pytorch/blob/49777e67303f608987ec0948c7fd8f46f6d3ca83/torch/csrc/api/src/nn/modules/rnn.cpp
// line 275
std::tie(output, hidden_state, cell_state) = torch::lstm(
    input,
    {state[0], state[1]},
    flat_weights_,
    options.with_bias(),
    options.layers(),
    options.dropout(),
    this->is_training(),
    options.bidirectional(),
    options.batch_first());

// https://github.com/pytorch/pytorch/blob/1a93b96815b5c87c92e060a6dca51be93d712d09/aten/src/ATen/native/RNN.cpp
// line 855
std::tuple<Tensor, Tensor, Tensor> lstm(
      const Tensor& _input, TensorList hx,
      TensorList _params, bool has_biases,
      int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
  TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
  if (at::cudnn_is_acceptable(_input)) {
    Tensor output, hy, cy;
    lstm_cudnn_stub(_input.type().device_type(), output, hy, cy, _input, hx, _params, has_biases,
                    num_layers, dropout_p, train, bidirectional, batch_first);
    return std::make_tuple(output, hy, cy);
  }
  if (use_miopen(_input, dropout_p)) {
    Tensor output, hy, cy;
    lstm_miopen_stub(_input.type().device_type(), output, hy, cy, _input, hx, _params, has_biases,
                     num_layers, dropout_p, train, bidirectional, batch_first);
    return std::make_tuple(output, hy, cy);
  }
  check_device(_input, _params, hx);
  auto input = batch_first ? _input.transpose(0, 1) : _input;
  auto params = gather_params(_params, has_biases);
  auto results = _lstm_impl<FullLayer, FullBidirectionalLayer>(
      input, params, hx[0], hx[1], num_layers, dropout_p, train, bidirectional);
  if (batch_first) {
    std::get<0>(results) = std::get<0>(results).transpose(0, 1);
  }
  return results;
}

// line 679
template<template<typename,typename> class LayerT, template<typename,typename> class BidirLayerT,
         typename cell_params, typename io_type>
std::tuple<io_type, Tensor, Tensor> _lstm_impl(
      const io_type& input,
      const std::vector<cell_params>& params, const Tensor& hx, const Tensor& cx,
      int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
  // It's much more useful for us to work on lists of pairs of hx and cx for each layer, so we need
  // to transpose a pair of those tensors.
  auto layer_hx = hx.unbind(0);
  auto layer_cx = cx.unbind(0);
  int64_t total_layers = layer_hx.size();
  std::vector<typename LSTMCell<cell_params>::hidden_type> hiddens;
  hiddens.reserve(total_layers);
  for (int64_t i = 0; i < total_layers; ++i) {
    hiddens.emplace_back(std::move(layer_hx[i]), std::move(layer_cx[i]));
  }
  auto result = _rnn_impl<LSTMCell<cell_params>, LayerT, BidirLayerT>(
      input, params, hiddens, num_layers, dropout_p, train, bidirectional);
  // Now, we need to reverse the transposed we performed above.
  std::vector<Tensor> hy, cy;
  hy.reserve(total_layers); cy.reserve(total_layers);
  for (auto & hidden : result.final_hidden) {
    hy.push_back(std::move(std::get<0>(hidden)));
    cy.push_back(std::move(std::get<1>(hidden)));
  }
  return std::make_tuple(result.outputs, at::stack(hy, 0), at::stack(cy, 0));
}

// line 652
template<typename CellType, template<typename,typename> class LayerT,
         template<typename,typename> class BidirLayerT, typename cell_params, typename io_type>
LayerOutput<io_type, std::vector<typename CellType::hidden_type>> _rnn_impl(
      const io_type& input,
      const std::vector<cell_params>& params,
      const std::vector<typename CellType::hidden_type>& hiddens,
      int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
  using hidden_type = typename CellType::hidden_type;
  CellType cell;
  if (bidirectional) {
    using BidirLayer = BidirLayerT<hidden_type, cell_params>;
    auto bidir_result = apply_layer_stack(BidirLayer{cell}, input, pair_vec(hiddens), pair_vec(params),
                                          num_layers, dropout_p, train);
    return {bidir_result.outputs, unpair_vec(std::move(bidir_result.final_hidden))};
  } else {
    return apply_layer_stack(LayerT<hidden_type,cell_params>{cell}, input, hiddens, params,
                             num_layers, dropout_p, train);
  }
}

// line 623
template<typename io_type, typename hidden_type, typename weight_type>
LayerOutput<io_type, std::vector<hidden_type>>
apply_layer_stack(const Layer<io_type, hidden_type, weight_type>& layer, const io_type& input,
                  const std::vector<hidden_type>& hiddens, const std::vector<weight_type>& weights,
                  int64_t num_layers, double dropout_p, bool train) {
  TORCH_CHECK(num_layers == hiddens.size(), "Expected more hidden states in stacked_rnn");
  TORCH_CHECK(num_layers == weights.size(), "Expected more weights in stacked_rnn");
  auto layer_input = input;
  auto hidden_it = hiddens.begin();
  auto weight_it = weights.begin();
  std::vector<hidden_type> final_hiddens;
  for (int64_t l = 0; l < num_layers; ++l) {
    auto layer_output = layer(layer_input, *(hidden_it++), *(weight_it++));
    final_hiddens.push_back(layer_output.final_hidden);
    layer_input = layer_output.outputs;
    if (dropout_p != 0 && train && l < num_layers - 1) {
      layer_input = dropout(layer_input, dropout_p);
    }
  }
  return {layer_input, final_hiddens};
}
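The stacking logic above is simple enough to paraphrase in Python. The sketch below is illustrative only and not PyTorch code; layer stands for any callable returning (outputs, final_hidden):

import torch
import torch.nn.functional as F

def apply_layer_stack(layer, layer_input, hiddens, weights, num_layers, dropout_p, train):
    # Feed each layer's output into the next; dropout is applied only between layers and only in training.
    final_hiddens = []
    for l in range(num_layers):
        outputs, final_hidden = layer(layer_input, hiddens[l], weights[l])
        final_hiddens.append(final_hidden)
        layer_input = outputs
        if dropout_p != 0 and train and l < num_layers - 1:
            layer_input = F.dropout(layer_input, dropout_p)
    return layer_input, final_hiddens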
// line
template <typename dir_hidden_type, typename cell_params>
struct FullBidirectionalLayer: Layer<Tensor, pair_of<dir_hidden_type>, pair_of<cell_params>> {
  using hidden_type = pair_of<dir_hidden_type>;
  using param_type = pair_of<cell_params>;
  using output_type = typename Layer<Tensor, hidden_type, param_type>::output_type;

  FullBidirectionalLayer(Cell<dir_hidden_type, cell_params>& cell): layer_(cell) {};

  output_type operator()(
      const Tensor& input,
      const hidden_type& input_hidden,
      const param_type& params) const override {
    std::vector<Tensor> step_inputs;
    if (input.device().is_cpu()) {
      auto input_w = params.first.linear_ih(input);
      step_inputs = input_w.unbind(0);
      auto fw_result = layer_(step_inputs, input_hidden.first, params.first, true);
      auto fw_output = at::stack(fw_result.outputs, 0);
      input_w = params.second.linear_ih(input);
      step_inputs = input_w.unbind(0);
      auto rev_step_inputs = reverse(std::move(step_inputs));
      auto rev_result =
          layer_(rev_step_inputs, input_hidden.second, params.second, true);
      std::reverse(rev_result.outputs.begin(), rev_result.outputs.end());
      auto rev_output = at::stack(rev_result.outputs, 0);
      return {at::cat({fw_output, rev_output}, fw_output.dim() - 1),
              std::make_pair(fw_result.final_hidden, rev_result.final_hidden)};
    }
    step_inputs = input.unbind(0);
    auto fw_result = layer_(step_inputs, input_hidden.first, params.first);
    auto fw_output = at::stack(fw_result.outputs, 0);
    auto rev_step_inputs = reverse(std::move(step_inputs));
    auto rev_result =
        layer_(rev_step_inputs, input_hidden.second, params.second);
    std::reverse(rev_result.outputs.begin(), rev_result.outputs.end());
    auto rev_output = at::stack(rev_result.outputs, 0);
    return {at::cat({fw_output, rev_output}, fw_output.dim() - 1),
            std::make_pair(fw_result.final_hidden, rev_result.final_hidden)};
  }

  std::vector<Tensor> reverse(std::vector<Tensor>&& x) const {
    std::reverse(x.begin(), x.end());
    return std::move(x);
  }

  FullLayer<dir_hidden_type, cell_params> layer_;
};

// line 370
template<typename hidden_type, typename cell_params>
struct FullLayer : Layer<Tensor, hidden_type, cell_params> {
  using output_type =
      typename Layer<Tensor, hidden_type, cell_params>::output_type;
  using unstacked_output_type = LayerOutput<std::vector<Tensor>, hidden_type>;

  FullLayer(Cell<hidden_type, cell_params>& cell): cell_(cell) {};

  unstacked_output_type operator()(
      const std::vector<Tensor>& step_inputs,
      const hidden_type& input_hidden,
      const cell_params& params,
      bool pre_compute_input = false) const {
    std::vector<Tensor> step_outputs;
    auto hidden = input_hidden;
    for (const auto& input : step_inputs) {
      hidden = cell_(input, hidden, params, pre_compute_input);
      step_outputs.emplace_back(hidden_as_output(hidden));
    }
    return {step_outputs, hidden};
  }

  output_type operator()(
      const Tensor& inputs,
      const hidden_type& input_hidden,
      const cell_params& params) const override {
    if (inputs.device().is_cpu()) {
      const auto inputs_w = params.linear_ih(inputs);
      auto unstacked_output =
          (*this)(inputs_w.unbind(0), input_hidden, params, true);
      return {at::stack(unstacked_output.outputs, 0),
              unstacked_output.final_hidden};
    }
    auto unstacked_output = (*this)(inputs.unbind(0), input_hidden, params);
    return {at::stack(unstacked_output.outputs, 0),
            unstacked_output.final_hidden};
  }

  Cell<hidden_type, cell_params>& cell_;
};

// line 273
template <typename cell_params>
struct LSTMCell : Cell<std::tuple<Tensor, Tensor>, cell_params> {
  using hidden_type = std::tuple<Tensor, Tensor>;

  hidden_type operator()(
      const Tensor& input,
      const hidden_type& hidden,
      const cell_params& params,
      bool pre_compute_input = false) const override {
    const auto& hx = std::get<0>(hidden);
    const auto& cx = std::get<1>(hidden);
    if (input.is_cuda()) {
      TORCH_CHECK(!pre_compute_input);
      auto igates = params.matmul_ih(input);
      auto hgates = params.matmul_hh(hx);
      auto result = at::_thnn_fused_lstm_cell(igates, hgates, cx, params.b_ih, params.b_hh);
      // Slice off the workspace argument (it's needed only for AD).
      return std::make_tuple(std::get<0>(result), std::get<1>(result));
    }
    const auto gates = params.linear_hh(hx).add_(
        pre_compute_input ? input : params.linear_ih(input));
    auto chunked_gates = gates.chunk(4, 1);
    auto ingate = chunked_gates[0].sigmoid_();
    auto forgetgate = chunked_gates[1].sigmoid_();
    auto cellgate = chunked_gates[2].tanh_();
    auto outgate = chunked_gates[3].sigmoid_();
    auto cy = (forgetgate * cx).add_(ingate * cellgate);
    auto hy = outgate * cy.tanh();
    return std::make_tuple(hy, cy);
  }
};

3. Implementing a Bidirectional LSTM Ourselves

Tests confirm that the custom_bilstm below produces the same output as lstm (pay attention to how the weights are initialized); a verification snippet follows the code.

class lstm(nn.Module):
    def __init__(self):
        super(lstm, self).__init__()
        self.rnn = nn.LSTM(512, 256, bidirectional=True, batch_first=True)

    def forward(self, input):
        self.rnn.flatten_parameters()
        recurrent, _ = self.rnn(input)
        return recurrent


# Implement a bidirectional LSTM with two nn.LSTMCell modules
class custom_bilstm(nn.Module):
    def __init__(self):
        super(custom_bilstm, self).__init__()
        self.rnn = nn.LSTMCell(512, 256)    # forward direction
        self.rnn1 = nn.LSTMCell(512, 256)   # backward direction

    def forward(self, input):
        # forward pass over the sequence (the squeeze/unsqueeze below assume batch size 1)
        recurrent, f_cx = self.rnn(input[:, 0, :])
        fwd = [recurrent]
        for i in range(1, input.shape[1]):
            recurrent, f_cx = self.rnn(input[:, i, :], (recurrent, f_cx))
            fwd.append(recurrent)
        forward = torch.stack(fwd, dim=0).squeeze(1)

        # backward pass over the time-reversed sequence
        input_reverse = torch.flip(input, dims=[1])
        recurrent_b, b_cx = self.rnn1(input_reverse[:, 0, :])
        bwd = [recurrent_b]
        for i in range(1, input_reverse.shape[1]):
            recurrent_b, b_cx = self.rnn1(input_reverse[:, i, :], (recurrent_b, b_cx))
            bwd.append(recurrent_b)
        backward = torch.stack(bwd, dim=0).squeeze(1)
        backward_reverse = torch.flip(backward, dims=[0])

        # concatenate forward and backward features per timestep, like nn.LSTM(bidirectional=True)
        return torch.cat((forward, backward_reverse), -1).unsqueeze(0)
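One way to verify the equivalence is to copy the nn.LSTM parameters into the two cells and compare outputs. The sketch below assumes a batch size of 1 and the two module definitions above; nn.LSTM stores the forward-direction parameters as weight_ih_l0 / weight_hh_l0 / bias_ih_l0 / bias_hh_l0 and the reverse direction with a _reverse suffix, while nn.LSTMCell uses weight_ih / weight_hh / bias_ih / bias_hh.

import torch

m1, m2 = lstm(), custom_bilstm()

with torch.no_grad():
    # forward-direction cell takes the l0 parameters
    m2.rnn.weight_ih.copy_(m1.rnn.weight_ih_l0)
    m2.rnn.weight_hh.copy_(m1.rnn.weight_hh_l0)
    m2.rnn.bias_ih.copy_(m1.rnn.bias_ih_l0)
    m2.rnn.bias_hh.copy_(m1.rnn.bias_hh_l0)
    # backward-direction cell takes the l0_reverse parameters
    m2.rnn1.weight_ih.copy_(m1.rnn.weight_ih_l0_reverse)
    m2.rnn1.weight_hh.copy_(m1.rnn.weight_hh_l0_reverse)
    m2.rnn1.bias_ih.copy_(m1.rnn.bias_ih_l0_reverse)
    m2.rnn1.bias_hh.copy_(m1.rnn.bias_hh_l0_reverse)

x = torch.randn(1, 26, 512)                       # (batch=1, seq_len, input_size)
print(torch.allclose(m1(x), m2(x), atol=1e-6))    # expected: True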
