A C++ implementation of the LSTM algorithm that I wrote myself, with explanations. It uses Adam optimization, which speeds up convergence dramatically: compared with my earlier, unoptimized version, it converges roughly 350 times faster. If you have questions, email fan1974815@126.com or ask below and I will do my best to answer.
Straight to the code, in three parts: the header file (.h), the source file (.cpp), and how to call it. Built on the VS2015 platform.
1. Header file
#pragma once
#include <iostream>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
#include <map>
#include <assert.h>
#include "DataPreProc.h"
#include <string>
using namespace std;

class AdamLSTM
{
public:
AdamLSTM();
~AdamLSTM();
#define innode 1            // number of input nodes (holds the feature vector)
#define hidenode 128        // number of hidden nodes
#define outnode 10          // number of output nodes; each time step's result is a one-hot vector
#define alpha 0.0005        // learning rate
#define timesteps 3         // number of time steps
#define epochs 25000
#define trainErrThreshold 0.9
#define beta1 0.8
#define beta2 0.999
#define eps 1e-8

#define random(x) rand()%(x)
#define randval(high) ( (double)rand() / RAND_MAX * high )
#define uniform_plus_minus_one ( (double)( 2.0 * rand() ) / ((double)RAND_MAX + 1.0) - 1.0 ) // uniform random value in [-1, 1)

#define CHINESE

double W_I[innode][hidenode];       // weights from the input to the hidden-layer input gate
double W_I_m[innode][hidenode];     // first moment (Adam) of W_I
double W_I_v[innode][hidenode];     // second moment (Adam) of W_I
double U_I[hidenode][hidenode];     // weights from the previous hidden output to this layer's input gate
double U_I_m[hidenode][hidenode];   // first moment of U_I
double U_I_v[hidenode][hidenode];   // second moment of U_I
double B_I[hidenode][1];            // input-gate bias
double B_I_m[hidenode][1];          // first moment of B_I
double B_I_v[hidenode][1];          // second moment of B_I

double W_F[innode][hidenode];       // weights from the input to the hidden-layer forget gate
double W_F_m[innode][hidenode];     // first moment of W_F
double W_F_v[innode][hidenode];     // second moment of W_F
double U_F[hidenode][hidenode];     // weights from the previous hidden layer to this layer's forget gate
double U_F_m[hidenode][hidenode];   // first moment of U_F
double U_F_v[hidenode][hidenode];   // second moment of U_F
double B_F[hidenode][1];            // forget-gate bias
double B_F_m[hidenode][1];          // first moment of B_F
double B_F_v[hidenode][1];          // second moment of B_F

double W_O[innode][hidenode];       // weights from the input to the current hidden layer's output gate
double W_O_m[innode][hidenode];     // first moment of W_O
double W_O_v[innode][hidenode];     // second moment of W_O
double U_O[hidenode][hidenode];     // weights from the previous hidden layer to the current output gate
double U_O_m[hidenode][hidenode];   // first moment of U_O
double U_O_v[hidenode][hidenode];   // second moment of U_O
double B_O[hidenode][1];            // output-gate bias
double B_O_m[hidenode][1];          // first moment of B_O
double B_O_v[hidenode][1];          // second moment of B_O

double W_G[innode][hidenode];       // weights that produce the new memory (candidate) from the input
double W_G_m[innode][hidenode];     // first moment of W_G
double W_G_v[innode][hidenode];     // second moment of W_G
double U_G[hidenode][hidenode];     // weights that produce the new memory from the previous hidden output
double U_G_m[hidenode][hidenode];   // first moment of U_G
double U_G_v[hidenode][hidenode];   // second moment of U_G
double B_G[hidenode][1];            // bias used to produce the new memory
double B_G_m[hidenode][1];          // first moment of B_G
double B_G_v[hidenode][1];          // second moment of B_G

double W_out[hidenode][outnode];    // weights from the hidden layer to the output layer
double W_out_m[hidenode][outnode];  // first moment of W_out
double W_out_v[hidenode][outnode];  // second moment of W_out
double B_out[outnode][1];           // output-layer bias
double B_out_m[outnode][1];         // first moment of B_out
double B_out_v[outnode][1];         // second moment of B_out

double dsigmoid(double y);
double sigmoid(double x);
double dtanh(double y);             // derivative of tanh; note it takes the pre-activation value (tanh is applied inside)
void winit(double w[], int n);      // initialize weights with uniform random values
void winit_withiZero(double w[], int n); // initialize weights/moments with zeros
void train();
vector<string> encode2Truth(map<string, int> &dsMap, vector<double *> &predictedM);
vector<string> encode2Sample(map<string, int> &dsMap, vector<int> &sample);
vector<int> sample2Encode(map<string, int> &dsMap, vector<string> sample);
void BubbleSort(double *p, int length, int *ind_diff);
void predict();

};
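Note: DataPreProc.h is not included in this post. From the way it is used below, it only has to provide a loadDataset() that maps every token of a space-separated string to a 1-based index, and a split() helper. The following is only my minimal stand-in consistent with that usage, not the author's original file:

#pragma once
#include <map>
#include <string>
#include <vector>

class DataPreProc
{
public:
// Map each space-separated token (here, one Chinese character) to a 1-based index, in order of first appearance.
std::map<std::string, int> loadDataset(const char *s)
{
std::map<std::string, int> ds;
std::vector<std::string> toks = split(s, " ");
for (size_t i = 0; i < toks.size(); i++)
{
if (ds.count(toks[i]) == 0)
{
int idx = (int)ds.size() + 1;
ds[toks[i]] = idx;
}
}
return ds;
}
// Split str on delim, skipping empty pieces.
std::vector<std::string> split(const std::string &str, const std::string &delim)
{
std::vector<std::string> parts;
size_t start = 0, pos;
while ((pos = str.find(delim, start)) != std::string::npos)
{
if (pos > start) parts.push_back(str.substr(start, pos - start));
start = pos + delim.size();
}
if (start < str.size()) parts.push_back(str.substr(start));
return parts;
}
};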

2. Source file
/* Uses Adam optimization. Formula reference: https://machinelearningmastery.com/adam-optimization-from-scratch/
*/
#include "stdafx.h"
#include "AdamLSTM.h"
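// Illustrative only (my addition; nothing below calls it): the Adam step that train()
// applies element by element to every weight and bias, written out once for a single
// scalar parameter. m and v are the first/second moment accumulators and t is the
// 1-based step count (train() uses epoch + 1). The update is '+=' because the deltas
// computed in train() have the form (truth - prediction) * f'(...), i.e. they already
// point in the direction that reduces the error.
static inline void adamStepScalar(double &w, double &m, double &v, double gradient, int t)
{
m = beta1 * m + (1.0 - beta1) * gradient;            // m(t) = beta1*m(t-1) + (1-beta1)*g(t)
v = beta2 * v + (1.0 - beta2) * gradient * gradient; // v(t) = beta2*v(t-1) + (1-beta2)*g(t)^2
double mhat = m / (1.0 - pow(beta1, t));             // bias-corrected first moment
double vhat = v / (1.0 - pow(beta2, t));             // bias-corrected second moment
w += alpha * mhat / (sqrt(vhat) + eps);              // parameter step
}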

AdamLSTM::AdamLSTM()
{
winit((double*)W_I, hidenode * innode);
winit((double*)U_I, hidenode * hidenode);
winit((double*)B_I, hidenode * 1);
winit((double*)W_F, hidenode * innode);
winit((double*)U_F, hidenode * hidenode);
winit((double*)B_F, hidenode * 1);
winit((double*)W_O, hidenode * innode);
winit((double*)U_O, hidenode * hidenode);
winit((double*)B_O, hidenode * 1);
winit((double*)W_G, hidenode * innode);
winit((double*)U_G, hidenode * hidenode);
winit((double*)B_G, hidenode * 1);
winit((double*)W_out, hidenode * outnode);
winit((double*)B_out, outnode * 1);

winit_withiZero((double*)W_I_m, hidenode * innode);
winit_withiZero((double*)W_I_v, hidenode * innode);
winit_withiZero((double*)U_I_m, hidenode * hidenode);
winit_withiZero((double*)U_I_v, hidenode * hidenode);
winit_withiZero((double*)B_I_m, hidenode * 1);
winit_withiZero((double*)B_I_v, hidenode * 1);
winit_withiZero((double*)W_F_m, hidenode * innode);
winit_withiZero((double*)W_F_v, hidenode * innode);
winit_withiZero((double*)U_F_m, hidenode * hidenode);
winit_withiZero((double*)U_F_v, hidenode * hidenode);
winit_withiZero((double*)B_F_m, hidenode * 1);
winit_withiZero((double*)B_F_v, hidenode * 1);
winit_withiZero((double*)W_O_m, hidenode * innode);
winit_withiZero((double*)W_O_v, hidenode * innode);
winit_withiZero((double*)U_O_m, hidenode * hidenode);
winit_withiZero((double*)U_O_v, hidenode * hidenode);
winit_withiZero((double*)B_O_m, hidenode * 1);
winit_withiZero((double*)B_O_v, hidenode * 1);
winit_withiZero((double*)W_G_m, hidenode * innode);
winit_withiZero((double*)W_G_v, hidenode * innode);
winit_withiZero((double*)U_G_m, hidenode * hidenode);
winit_withiZero((double*)U_G_v, hidenode * hidenode);
winit_withiZero((double*)B_G_m, hidenode * 1);
winit_withiZero((double*)B_G_v, hidenode * 1);
winit_withiZero((double*)W_out_m, hidenode * outnode);
winit_withiZero((double*)W_out_v, hidenode * outnode);
winit_withiZero((double*)B_out_m, outnode * 1);
winit_withiZero((double*)B_out_v, outnode * 1);

}

AdamLSTM::~AdamLSTM()
{
}

double AdamLSTM::dsigmoid(double y)
{
return y * (1.0 - y);
}

double AdamLSTM::sigmoid(double x)
{
return 1.0 / (1.0 + exp(-x));
}

double AdamLSTM::dtanh(double y)
{
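// note: unlike dsigmoid, this takes the pre-activation value and applies tanh itself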
y = tanh(y);
return 1.0 - y * y;
}

void AdamLSTM::winit(double w[], int n)
{
for (int i = 0; i<n; i++)
{
w[i] = uniform_plus_minus_one; // uniform random value in [-1, 1)
/* cout << "w" << i << "=" << w[i] << endl; */
}
}

void AdamLSTM::winit_withiZero(double w[], int n)
{
for (int i = 0; i<n; i++)
{
w[i] = 0.0;
/* cout << "w" << i << "=" << w[i] << endl; */
}
}

void AdamLSTM::train()
{
//char s[] = "十 四 五 规 划 和 二 零 三 五 年 远 景 目 标 纲 要 明 确 实 施 积 极 应 对 人 口 老 龄 化 国 家 战 略 制 定 人 口 长 期 发 展 战 略 优 化 生 育 政 策";
//string ss = "十四五规划和二零三五年远景目标纲要明确实施积极应对人口老龄化国家战略制定人口长期发展战略优化生育政策";
//char s[] = "锄 禾 日 当 午 汗 滴 和 下 土 谁 知 盘 中 餐 粒 粒 皆 辛 苦";
//string ss = "锄禾日当午汗滴和下土谁知盘中餐粒粒皆辛苦";

char s[] = "壹 贰 叁 肆 伍 陆 柒 捌 玖 拾";
string ss = "壹贰叁肆伍陆柒捌玖拾";
int epoch = 0, i, j, k, m, p;
// intermediate values
vector<double*> I_vector;      // input gate
vector<double*> F_vector;      // forget gate
vector<double*> O_vector;      // output gate
vector<double*> G_vector;      // new memory (candidate)
vector<double*> S_vector;      // cell state
vector<double*> h_vector;      // hidden output
vector<double*> y_delta_vector;// partial derivative of the error w.r.t. the output layer
vector<double *> predictV;
vector<double*> truth_steps;   // ground truth for every time step of the current sample; cleared per sample
DataPreProc dpp;
map<string, int> dsMap = dpp.loadDataset(s);
vector<int> curSample;             // encoded input for each time step of the current sample; cleared per sample
vector<int> curTimestep_truth_10;  // decimal ground truth for each time step; cleared before each sample
int offset = 0;

#ifdef CHINESE
int end_offset = (timesteps + 0) * 2; // each Chinese character takes 2 bytes; when samples are drawn at random, add 0 rather than 1 here, because the random bound already excludes the maximum
#else
int end_offset = (timesteps + 1);
#endif // CHINESE
int randomMax = ss.size() - end_offset; // upper bound when drawing a random sample offset
cout << "randomMax" << randomMax << endl;

double e = 10000000.0;  // error
/* for (epoch = 0; epoch < epochs; epoch++) */ // alternative: train for a fixed number of epochs
while (e > trainErrThreshold)
{
// 1. Reset and initialize
double * x = new double[innode];
if ((offset + end_offset) >= ss.size())
{
offset = 0;
}
e = 0.0;
// (Accumulating the per-time-step weight deltas and updating once per sample is not done here yet.)
// fetch the current sample
curSample.clear();
curTimestep_truth_10.clear();

#ifdef CHINESE
for (size_t i = 0; i < timesteps * 2; i = i + 2) // read this sample's timesteps inputs; each Chinese character occupies 2 bytes, hence the step of 2
{
curSample.push_back(dsMap[ss.substr(offset + i, 2)]);
curTimestep_truth_10.push_back(dsMap[ss.substr(offset + i + 2, 2)]);
//cout << curSample.back() << endl;
//cout << curTimestep_truth_10.back() << endl;
}
#else
for (size_t i = 0; i < timesteps; i = i + 1) // read this sample's timesteps inputs, one character at a time
{
curSample.push_back(dsMap[ss.substr(offset + i, 1)]);
curTimestep_truth_10.push_back(dsMap[ss.substr(offset + i + 1, 1)]);
//cout << curSample.back() << endl;
//cout << curTimestep_truth_10.back() << endl;
}
#endif // CHINESE
// one-hot encode the ground truth
truth_steps.clear();
for (size_t i = 0; i < timesteps; i++)
{
double * LableD = new double[outnode] {0};
for (size_t j = 0; j < dsMap.size(); j++)
{
if ((j + 1) == curTimestep_truth_10[i])
{
LableD[j] = 1.0;
/* cout << "LableD:" << LableD[j] << endl; */
}
else
{
LableD[j] = 0.0;
/* cout << "LableD:" << LableD[j] << endl; */
}
}
/* cout << "当前时间点真值:" << endl << curTimestep_truth_10[i] << endl; */
truth_steps.push_back(LableD); // the array must be newly allocated each time, otherwise every step would reference the same data
/* cout << "truth_steps=" << i << endl << truth_steps.back() << endl; */
}
// forward pass: run through all the time steps
for (p = 0; p < timesteps; p++) // p indexes the time steps
{
x[0] = (float)curSample[p];
/* cout << "x[0]" << p << "=" << x[0] << endl; */
double *in_gate = new double[hidenode] {0};     // input gate after sigmoid
double *out_gate = new double[hidenode] {0};    // output gate after sigmoid
double *forget_gate = new double[hidenode] {0}; // forget gate after sigmoid
double *g_gate = new double[hidenode] {0};      // new memory (candidate) after tanh
double *state = new double[hidenode] {0};       // cell state
double *h = new double[hidenode] {0};           // hidden-layer output
double *output = new double[outnode] {0};       // raw output-layer sums for this time step
double *y_pre = new double[outnode] {0};        // prediction for this time step
double * truthLabel = NULL;                     // will point at truth_steps[p]; no separate allocation needed
double * y_delta = new double[outnode] {0};     // output-layer delta for this time step
if (p == 0)
{
// there is no previous hidden layer at time 0, so initialize an all-zero one
double *S = new double[hidenode] {0}; // cell state
double *h = new double[hidenode] {0}; // hidden output
for (size_t i = 0; i < hidenode; i++)
{
S[i] = 0.0;
h[i] = 0.0;
}
S_vector.push_back(S);
h_vector.push_back(h);
}
for (size_t j = 0; j < hidenode; j++)
{
double inGateValue = 0;
double forgetGateValue = 0;
double outGateValue = 0;
double gGateValue = 0;
// contribution of the current time step's input
for (size_t k = 0; k < innode; k++)
{
forgetGateValue += x[k] * W_F[k][j];

inGateValue += x[k] * W_I[k][j];
gGateValue += x[k] * W_G[k][j];
outGateValue += x[k] * W_O[k][j];
}
// previous state
double * h_pre = h_vector.back();
double * state_pre = S_vector.back();
for (size_t i = 0; i < hidenode; i++)
{
forgetGateValue += h_pre[i] * U_F[i][j];
inGateValue += h_pre[i] * U_I[i][j];
gGateValue += h_pre[i] * U_G[i][j];
outGateValue += h_pre[i] * U_O[i][j];
}
// biases
forgetGateValue += B_F[j][0] * 1.0;
inGateValue += B_I[j][0] * 1.0;
gGateValue += B_G[j][0] * 1.0;
outGateValue += B_O[j][0] * 1.0;
in_gate[j] = sigmoid(inGateValue);
out_gate[j] = sigmoid(outGateValue);
forget_gate[j] = sigmoid(forgetGateValue);
g_gate[j] = tanh(gGateValue); // the candidate uses tanh here
double s_pre = state_pre[j];
state[j] = forget_gate[j] * s_pre + g_gate[j] * in_gate[j]; // cell state
h[j] = out_gate[j] * tanh(state[j]); // hidden output
/* cout << "h[j]=" << h[j] << endl; */
}
truthLabel = truth_steps[p];
/* for (size_t m = 0; m < outnode; m++)
{
cout << "truthLabel" << m << "=" << truthLabel[m] << endl;
} */
for (k = 0; k < outnode; k++) // output nodes
{
// propagate the hidden layer to the output layer; the output-layer weights and sigmoid are my own addition, a plain LSTM only produces the hidden output h
for (j = 0; j < hidenode; j++)
{
double tmp = h[j] * W_out[j][k];
output[k] += tmp;
//cout << "h" << j << "=" << h[j] << endl;
//cout << "W_out" << j << k << "=" << W_out[j][k] << endl;
//cout << "output" << j << "=" << output[k] << endl;
}
output[k] += B_out[k][0] * 1.0;
y_pre[k] = sigmoid(output[k]); // output of each output-layer unit
/* cout << "y_pre" << k << "=" << y_pre[k] << endl; */
}
predictV.push_back(y_pre);
I_vector.push_back(in_gate);
F_vector.push_back(forget_gate);
O_vector.push_back(out_gate);
S_vector.push_back(state);
G_vector.push_back(g_gate);
h_vector.push_back(h);
// save the partial derivative of the error with respect to the output layer
for (size_t k = 0; k < outnode; k++)
{
y_delta[k] = (truthLabel[k] - output[k]) * dsigmoid(y_pre[k]);
e += fabs(truthLabel[k] - output[k]); // error
/* cout << "output" << k << "=" << output[k] << ";truthLabel" << k << "=" << truthLabel[k] << endl; */
}
y_delta_vector.push_back(y_delta);
}

// error backpropagation
// hidden-layer delta, computed from the error of the current time step's output layer
double h_delta[hidenode]{ 0 }; // delta of the current hidden output (excluding the output layer); it sums two parts: the error fed back from the next time step's hidden layer h(t+1), and the output-layer error y_delta
double *y_deltaB = new double[outnode] { 0 };
double *O_delta = new double[hidenode] {0};
double *I_delta = new double[hidenode] {0};
double *F_delta = new double[hidenode] {0};
double *G_delta = new double[hidenode] {0};
double *state_delta = new double[hidenode] {0};
// deltas of the following (future) time step's hidden layer
double *O_future_delta = new double[hidenode] {0};
double *I_future_delta = new double[hidenode] {0};
double *F_future_delta = new double[hidenode] {0};
double *G_future_delta = new double[hidenode] {0};
double *state_future_delta = new double[hidenode] {0};
double *forget_gate_future = new double[hidenode] {0}; // forget-gate node values of the following time step (values, not deltas)
// (at the last time step the future deltas must be zero, which the zero-initialization above already guarantees)
// walk backwards from the last time step
for (p = timesteps - 1; p >= 0; p--)
{
x[0] = (float)curSample[p];
// current hidden layer: fetch the node values of every gate
double *in_gate = I_vector[p];     // input-gate node values at this time step
double *out_gate = O_vector[p];    // output gate
double *forget_gate = F_vector[p]; // forget gate
double *g_gate = G_vector[p];      // new memory
double *state = S_vector[p + 1];   // cell state (S_vector holds an extra all-zero entry for t = -1, hence p + 1)
double *h = h_vector[p + 1];       // hidden output (same extra entry, hence p + 1)
// previous hidden layer
double *h_pre = h_vector[p];
double *state_pre = S_vector[p];
y_deltaB = y_delta_vector[p];
// for every hidden unit, compute the error terms and update the weights
double mhat = 0.0, vhat = 0.0;
double gradient = 0.0;
for (j = 0; j < hidenode; j++)
{
h_delta[j] = 0.0;
for (k = 0; k < outnode; k++)
{
h_delta[j] += y_deltaB[k] * W_out[j][k];
}
for (k = 0; k < hidenode; k++) // the current h also feeds the next time step's cell, so its deltas are accumulated too
{
h_delta[j] += I_future_delta[k] * U_I[j][k];
h_delta[j] += F_future_delta[k] * U_F[j][k];
h_delta[j] += O_future_delta[k] * U_O[j][k];
h_delta[j] += G_future_delta[k] * U_G[j][k];
}
// per-gate, per-neuron errors of the hidden layer
O_delta[j] = h_delta[j] * tanh(state[j]) * dsigmoid(out_gate[j]);
state_delta[j] = h_delta[j] * out_gate[j] * dtanh(state[j]) + state_future_delta[j] * forget_gate_future[j];
F_delta[j] = state_delta[j] * state_pre[j] * dsigmoid(forget_gate[j]);
I_delta[j] = state_delta[j] * g_gate[j] * dsigmoid(in_gate[j]);
G_delta[j] = state_delta[j] * in_gate[j] * dtanh(g_gate[j]); // should this use dtanh? (g_gate is already a tanh value)
// update the weights between the previous time step's hidden layer and the current hidden layer
for (k = 0; k < hidenode; k++)
{
gradient = I_delta[j] * h_pre[k]; // gradient g(t)
U_I_m[k][j] = beta1 * U_I_m[k][j] + (1.0 - beta1) * gradient; // first moment: m(t) = beta1 * m(t-1) + (1 - beta1) * g(t)
U_I_v[k][j] = beta2 * U_I_v[k][j] + (1.0 - beta2) * pow(gradient, 2); // second moment: v(t) = beta2 * v(t-1) + (1 - beta2) * g(t)^2
mhat = U_I_m[k][j] / (1.0 - pow(beta1, (epoch + 1))); // mhat(t) = m(t) / (1 - beta1^t)
vhat = U_I_v[k][j] / (1.0 - pow(beta2, (epoch + 1))); // vhat(t) = v(t) / (1 - beta2^t)
U_I[k][j] += alpha * mhat / (sqrt(vhat) + eps); // x(t) = x(t-1) + alpha * mhat(t) / (sqrt(vhat(t)) + eps)
//U_I[k][j] += alpha * I_delta[j] * h_pre[k];

gradient = F_delta[j] * h_pre[k];
U_F_m[k][j] = beta1 * U_F_m[k][j] + (1.0 - beta1) * gradient;
U_F_v[k][j] = beta2 * U_F_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
mhat = U_F_m[k][j] / (1.0 - pow(beta1, (epoch + 1)));
vhat = U_F_v[k][j] / (1.0 - pow(beta2, (epoch + 1)));
U_F[k][j] += alpha * mhat / (sqrt(vhat) + eps);

gradient = O_delta[j] * h_pre[k];
U_O_m[k][j] = beta1 * U_O_m[k][j] + (1.0 - beta1) * gradient;
U_O_v[k][j] = beta2 * U_O_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
mhat = U_O_m[k][j] / (1.0 - pow(beta1, (epoch + 1)));
vhat = U_O_v[k][j] / (1.0 - pow(beta2, (epoch + 1)));
U_O[k][j] += alpha * mhat / (sqrt(vhat) + eps);

gradient = G_delta[j] * h_pre[k];
U_G_m[k][j] = beta1 * U_G_m[k][j] + (1.0 - beta1) * gradient;
U_G_v[k][j] = beta2 * U_G_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
mhat = U_G_m[k][j] / (1.0 - pow(beta1, (epoch + 1)));
vhat = U_G_v[k][j] / (1.0 - pow(beta2, (epoch + 1)));
U_G[k][j] += alpha * mhat / (sqrt(vhat) + eps);
}
// update the weights between the current input layer and the current hidden layer
for (k = 0; k < innode; k++)
{
gradient = I_delta[j] * x[k];
W_I_m[k][j] = beta1 * W_I_m[k][j] + (1.0 - beta1) * gradient;
W_I_v[k][j] = beta2 * W_I_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
mhat = W_I_m[k][j] / (1.0 - pow(beta1, (epoch + 1)));
vhat = W_I_v[k][j] / (1.0 - pow(beta2, (epoch + 1)));
W_I[k][j] += alpha * mhat / (sqrt(vhat) + eps);

gradient = F_delta[j] * x[k];
W_F_m[k][j] = beta1 * W_F_m[k][j] + (1.0 - beta1) * gradient;
W_F_v[k][j] = beta2 * W_F_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
mhat = W_F_m[k][j] / (1.0 - pow(beta1, (epoch + 1)));
vhat = W_F_v[k][j] / (1.0 - pow(beta2, (epoch + 1)));
W_F[k][j] += alpha * mhat / (sqrt(vhat) + eps);

gradient = O_delta[j] * x[k];
W_O_m[k][j] = beta1 * W_O_m[k][j] + (1.0 - beta1) * gradient;
W_O_v[k][j] = beta2 * W_O_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
mhat = W_O_m[k][j] / (1.0 - pow(beta1, (epoch + 1)));
vhat = W_O_v[k][j] / (1.0 - pow(beta2, (epoch + 1)));
W_O[k][j] += alpha * mhat / (sqrt(vhat) + eps);

gradient = G_delta[j] * x[k];
W_G_m[k][j] = beta1 * W_G_m[k][j] + (1.0 - beta1) * gradient;
W_G_v[k][j] = beta2 * W_G_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
mhat = W_G_m[k][j] / (1.0 - pow(beta1, (epoch + 1)));
vhat = W_G_v[k][j] / (1.0 - pow(beta2, (epoch + 1)));
W_G[k][j] += alpha * mhat / (sqrt(vhat) + eps);
}
// update the biases
gradient = I_delta[j];
B_I_m[j][0] = beta1 * B_I_m[j][0] + (1.0 - beta1) * gradient;
B_I_v[j][0] = beta2 * B_I_v[j][0] + (1.0 - beta2) * pow(gradient, 2);
mhat = B_I_m[j][0] / (1.0 - pow(beta1, (epoch + 1)));
vhat = B_I_v[j][0] / (1.0 - pow(beta2, (epoch + 1)));
B_I[j][0] += alpha * mhat / (sqrt(vhat) + eps);

gradient = O_delta[j];
B_O_m[j][0] = beta1 * B_O_m[j][0] + (1.0 - beta1) * gradient;
B_O_v[j][0] = beta2 * B_O_v[j][0] + (1.0 - beta2) * pow(gradient, 2);
mhat = B_O_m[j][0] / (1.0 - pow(beta1, (epoch + 1)));
vhat = B_O_v[j][0] / (1.0 - pow(beta2, (epoch + 1)));
B_O[j][0] += alpha * mhat / (sqrt(vhat) + eps);

gradient = F_delta[j];
B_F_m[j][0] = beta1 * B_F_m[j][0] + (1.0 - beta1) * gradient;
B_F_v[j][0] = beta2 * B_F_v[j][0] + (1.0 - beta2) * pow(gradient, 2);
mhat = B_F_m[j][0] / (1.0 - pow(beta1, (epoch + 1)));
vhat = B_F_v[j][0] / (1.0 - pow(beta2, (epoch + 1)));
B_F[j][0] += alpha * mhat / (sqrt(vhat) + eps);

gradient = G_delta[j];
B_G_m[j][0] = beta1 * B_G_m[j][0] + (1.0 - beta1) * gradient;
B_G_v[j][0] = beta2 * B_G_v[j][0] + (1.0 - beta2) * pow(gradient, 2);
mhat = B_G_m[j][0] / (1.0 - pow(beta1, (epoch + 1)));
vhat = B_G_v[j][0] / (1.0 - pow(beta2, (epoch + 1)));
B_G[j][0] += alpha * mhat / (sqrt(vhat) + eps);
}
for (k = 0; k < outnode; k++) // for every output unit, update its weights; a plain LSTM has no such layer, only the h output
{
// update the weights between the hidden layer and the output layer
for (j = 0; j < hidenode; j++)
{
gradient = y_deltaB[k] * h[j];
W_out_m[j][k] = beta1 * W_out_m[j][k] + (1.0 - beta1) * gradient;
W_out_v[j][k] = beta2 * W_out_v[j][k] + (1.0 - beta2) * pow(gradient, 2);
mhat = W_out_m[j][k] / (1.0 - pow(beta1, (epoch + 1)));
vhat = W_out_v[j][k] / (1.0 - pow(beta2, (epoch + 1)));
W_out[j][k] += alpha * mhat / (sqrt(vhat) + eps);
}
gradient = y_deltaB[k];
B_out_m[k][0] = beta1 * B_out_m[k][0] + (1.0 - beta1) * gradient;
B_out_v[k][0] = beta2 * B_out_v[k][0] + (1.0 - beta2) * pow(gradient, 2);
mhat = B_out_m[k][0] / (1.0 - pow(beta1, (epoch + 1)));
vhat = B_out_v[k][0] / (1.0 - pow(beta2, (epoch + 1)));
B_out[k][0] += alpha * mhat / (sqrt(vhat) + eps);
}
if (p == timesteps - 1) // the initial future-delta buffers are no longer needed and are released here
{
delete []O_future_delta;
delete []F_future_delta;
delete []I_future_delta;
delete []G_future_delta;
delete []state_future_delta;
delete []forget_gate_future;
O_future_delta = NULL;
F_future_delta = NULL;
I_future_delta = NULL;
G_future_delta = NULL;
state_future_delta = NULL;
forget_gate_future = NULL;
}
O_future_delta = O_delta;
F_future_delta = F_delta;
I_future_delta = I_delta;
G_future_delta = G_delta;
state_future_delta = state_delta;
forget_gate_future = forget_gate;
}
delete []O_future_delta;
delete []F_future_delta;
delete []I_future_delta;
delete []G_future_delta;
delete []state_future_delta;
delete []forget_gate_future;
O_future_delta = NULL;
F_future_delta = NULL;
I_future_delta = NULL;
G_future_delta = NULL;
state_future_delta = NULL;
forget_gate_future = NULL;
delete []y_deltaB;
y_deltaB = NULL;
if (epoch % 1000 == 0)
{
cout << "第  " << epoch << "  epoch:" << endl;
cout << "样本数据:" << endl;
vector<string> vsamp = encode2Sample(dsMap, curSample);
for (k = 0; k < timesteps; k++)
cout << " " << vsamp[k];
cout << endl;
cout << "error: " << e << endl;
cout << "pred: ";
vector<string> vpre = encode2Truth(dsMap, predictV);
for (k = 0; k < timesteps; k++)
cout << " " << vpre[k];
cout << endl;
vector<string> vtru = encode2Truth(dsMap, truth_steps);
cout << "true: ";
for (k = 0; k < timesteps; k++)
cout << " " << vtru[k];
cout << endl;
cout << endl;
}
// release the memory allocated with new
for (i = 0; i < predictV.size(); i++)
{
delete []predictV[i];
predictV[i] = NULL;
}
for (i = 0; i < I_vector.size(); i++)
{
delete []I_vector[i];
I_vector[i] = NULL;
}
for (i = 0; i < O_vector.size(); i++)
{
delete []O_vector[i];
O_vector[i] = NULL;
}
for (i = 0; i < G_vector.size(); i++)
{
delete []G_vector[i];
G_vector[i] = NULL;
}
for (i = 0; i < S_vector.size(); i++)
{
delete []S_vector[i];
S_vector[i] = NULL;
}
for (i = 0; i < h_vector.size(); i++)
{
delete []h_vector[i];
h_vector[i] = NULL;
}
for (i = 0; i < truth_steps.size(); i++)
{
delete []truth_steps[i];
truth_steps[i] = NULL;
}
predictV.clear();
truth_steps.clear();
I_vector.clear();
F_vector.clear();
O_vector.clear();
G_vector.clear();
S_vector.clear();
h_vector.clear();
y_delta_vector.clear();
delete []x;
x = NULL;
// draw a new random sample
offset = random(randomMax);

#ifdef CHINESE
while (offset % 2 != 0) // the offset must be a multiple of 2; with an odd offset the 2-byte Chinese characters are read out of alignment and training never converges
{
offset = random(randomMax);
}
#endif // CHINESE

/* cout << "当前偏移:" << offset << endl; */
epoch++;
}

}

vector<string> AdamLSTM::encode2Truth(map<string, int>& dsMap, vector<double*>& predictedM)
{
vector<string> sRes;
sRes.clear();
int maxInd = 0;
for (size_t i = 0; i < predictedM.size(); i++)
{
double * pre = predictedM[i];
for (size_t j = 0; j < outnode; j++)
{
pre[j] = fabs(pre[j]);
/* cout << "当前节点 " << j << " 预测的值:" << pre[j] << endl; */
}
int ind[outnode] = { 0 };
BubbleSort(pre, outnode, ind);
maxInd = ind[outnode - 1];
//cout << "当前时间点 " << i << " 最大值索引:" << maxInd << endl;
for (map<string, int>::iterator it = dsMap.begin(); it != dsMap.end(); it++)
{
if ((*it).second == (maxInd + 1))
{
sRes.push_back((*it).first);
}
}
}
return sRes;
}

vector<string> AdamLSTM::encode2Sample(map<string, int>& dsMap, vector<int>& sample)
{
vector<string> sRes;
sRes.clear();
for (size_t i = 0; i < sample.size(); i++)
{
for (map<string, int>::iterator it = dsMap.begin(); it != dsMap.end(); it++)
{
if (it->second == sample[i])
{
sRes.push_back(it->first);
}
}
}
return sRes;
}

vector<int> AdamLSTM::sample2Encode(map<string, int>& dsMap, vector<string> sample)
{
vector<int> res;
for (size_t i = 0; i < sample.size(); i++)
{
for (map<string, int>::iterator it = dsMap.begin(); it != dsMap.end(); it++)
{
if (it->first == sample[i])
{
res.push_back(it->second);
}
}
}
return res;
}

void AdamLSTM::BubbleSort(double * p, int length, int * ind_diff)
{
for (int m = 0; m < length; m++)
{
ind_diff[m] = m;
}

for (int i = 0; i < length; i++)
{
for (int j = 0; j < length - i - 1; j++)
{
if (p[j] > p[j + 1])
{
double temp = p[j];
p[j] = p[j + 1];
p[j + 1] = temp;
int ind_temp = ind_diff[j];
ind_diff[j] = ind_diff[j + 1];
ind_diff[j + 1] = ind_temp;
}
}
}

}
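// Note: BubbleSort is only used by encode2Truth to locate the index of the largest
// prediction (it reads ind[outnode - 1]); a linear arg-max would give the same index
// without reordering the prediction array. An illustrative alternative (my sketch,
// not called anywhere else):
static int argMax(const double *p, int length)
{
int best = 0;
for (int i = 1; i < length; i++)
{
if (p[i] > p[best]) best = i;
}
return best;
}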

void AdamLSTM::predict()
{
DataPreProc ddp;

char dataset[] = "壹 贰 叁 肆 伍 陆 柒 捌 玖 拾";
map<std::string, int> ds = ddp.loadDataset(dataset);
// read in the sample data
char s[200] = { NULL };
cout << "输入样本数据,不超过3个汉字且在训练数据集范围内(空格隔离):" << endl;
cin.getline(s, 200);
string ss(s);
vector<string> re = ddp.split(ss, " ");
vector<int> input = sample2Encode(ds, re);
vector<double*> predictV;      // predictions
vector<double*> I_vector;      // input gate
vector<double*> F_vector;      // forget gate
vector<double*> O_vector;      // output gate
vector<double*> G_vector;      // new memory (candidate)
vector<double*> S_vector;      // cell state
vector<double*> h_vector;      // hidden output
double *x = new double[innode];
// there is no previous hidden layer at time 0, so initialize an all-zero one
double *S = new double[hidenode] {0};     // cell state
double *h = new double[hidenode] {0};     // hidden output
vector<std::string> result(re.begin(), re.begin() + timesteps);
int cnt = 0;
while (cnt < 5)
{
/* forward pass: run through all the time steps */
for (int p = 0; p < timesteps; p++) // p indexes the time steps
{
if (p == 0)
{
for (size_t i = 0; i < hidenode; i++)
{
S[i] = 0.0;
h[i] = 0.0;
}
S_vector.push_back(S);
h_vector.push_back(h);
}
x[0] = (float)input[p];
//cout << "x[0]" << p << "=" << x[0] << endl;
double *in_gate = new double[hidenode] {0};     // input gate after sigmoid
double *out_gate = new double[hidenode] {0};    // output gate after sigmoid
double *forget_gate = new double[hidenode] {0}; // forget gate after sigmoid
double *g_gate = new double[hidenode] {0};      // new memory (candidate) after tanh
double *state = new double[hidenode] {0};       // cell state
double *h = new double[hidenode] {0};           // hidden-layer output
double *output = new double[outnode] {0};       // raw output-layer sums for this time step
double *y_pre = new double[outnode] {0};        // prediction for this time step
// previous state
double * h_pre = h_vector.back();
double * state_pre = S_vector.back();
for (size_t j = 0; j < hidenode; j++)
{
double inGateValue = 0;
double forgetGateValue = 0;
double outGateValue = 0;
double gGateValue = 0;
// contribution of the current time step's input
for (size_t k = 0; k < innode; k++)
{
forgetGateValue += x[k] * W_F[k][j];
inGateValue += x[k] * W_I[k][j];
gGateValue += x[k] * W_G[k][j];
outGateValue += x[k] * W_O[k][j];
}
for (size_t i = 0; i < hidenode; i++)
{
forgetGateValue += h_pre[i] * U_F[i][j];
inGateValue += h_pre[i] * U_I[i][j];
gGateValue += h_pre[i] * U_G[i][j];
outGateValue += h_pre[i] * U_O[i][j];
}
// biases
forgetGateValue += B_F[j][0] * 1.0;
inGateValue += B_I[j][0] * 1.0;
gGateValue += B_G[j][0] * 1.0;
outGateValue += B_O[j][0] * 1.0;
in_gate[j] = sigmoid(inGateValue);
out_gate[j] = sigmoid(outGateValue);
forget_gate[j] = sigmoid(forgetGateValue);
g_gate[j] = tanh(gGateValue); // the candidate uses tanh here
double s_pre = state_pre[j];
state[j] = forget_gate[j] * s_pre + g_gate[j] * in_gate[j]; // cell state
h[j] = out_gate[j] * tanh(state[j]); // hidden output
/* cout << "h[j]=" << h[j] << endl; */
}
for (int k = 0; k < outnode; k++) // output nodes
{
// propagate the hidden layer to the output layer; the output-layer weights and sigmoid are my own addition, a plain LSTM only produces the hidden output h
for (int j = 0; j < hidenode; j++)
{
double tmp = h[j] * W_out[j][k];
output[k] += tmp;
}
output[k] += B_out[k][0] * 1.0;
y_pre[k] = sigmoid(output[k]); // output of each output-layer unit
/* cout << "y_pre" << k << "=" << y_pre[k] << endl; */
}
predictV.push_back(y_pre);
S_vector.push_back(state);
h_vector.push_back(h);
}
vector<string> vpre = encode2Truth(ds, predictV);
predictV.clear();
S_vector.clear();
h_vector.clear();
//cout << "预测值:" << vpre[timesteps - 1] << endl;
result.push_back(vpre[timesteps - 1]);
cout << "预测" << cnt << ":";
for (size_t i = 0; i < result.size(); i++)
{
cout << result[i];
}
cout << endl;
// update the input: drop the oldest character and append the freshly predicted one
re.erase(re.begin());
re.push_back(vpre[timesteps - 1]);
input.clear();
input = sample2Encode(ds, re);
//for (size_t i = 0; i < input.size(); i++)
//{
// cout << "input value:" << input[i] << endl;
//}
//cout << "re" << re.back() << endl;
// go on to predict the next character
cnt++;
}
result.clear();

}

3. How to call it
int main()
{

//ActvAndDrt aad;
srand(time(NULL));
AdamLSTM adamLstm;
adamLstm.train();
cout << "训练结束" << endl;
adamLstm.predict();
system("pause");
return 0;

}
