word2vec.c that can be run directly under Windows, with detailed comments. The annotated code comes first, followed by the sample input data and the run results (vocab.txt and output.txt).

【code】

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>
#include <errno.h>   // errno, used by the posix_memalign replacement below
#include <malloc.h>  // _aligned_malloc (Windows)
#include <stdint.h>  // intptr_t

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

// Windows has no posix_memalign, so emulate it with _aligned_malloc
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)

const int vocab_hash_size = 30;  // Maximum 30 * 0.7 = 21 words in the vocabulary
                                 // (the original word2vec uses 30000000, i.e. 21M words)

typedef float real;  // Precision of float numbers

// Basic data structure for each word
struct vocab_word {
    long long cn;  // word count, tallied from the training set or read from a vocabulary file
    int *point;    // indices of the nodes on the path from the Huffman tree root to this word
    // word: the literal string of the word
    // code: the word's Huffman code
    // codelen: the length of that Huffman code
    char *word, *code, codelen;
};

char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
// Vocabulary table; a word's index in this array is also called its index in the vocabulary
struct vocab_word *vocab;

int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 1, num_threads = 1, min_reduce = 1;

// Word hash table, indexed by each word's hash value, which is computed from its ASCII
// characters; vocab_hash[hash] stores the word's index in the vocabulary
int *vocab_hash;

// vocab_max_size is a helper variable: whenever the vocabulary outgrows it, the table is grown by 1000 entries at once
// vocab_size is the number of distinct words in the training set, i.e. the vocabulary size
// layer1_size is the dimensionality of the word vectors
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 10;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
// syn0 holds the word vector of every word in the vocabulary
// syn1 holds the vector of every non-leaf node of the Huffman tree
// syn1neg holds the auxiliary vector of every word, used by negative sampling
// expTable is the precomputed sigmoid function table
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;

int hs = 0, negative = 5;

const int table_size = 1e2;  // 1e8 in the original word2vec; reduced here so the demo output stays readable
int *table;

// Build the "energy" (unigram power) distribution table over words, used by negative sampling
void InitUnigramTable() {
    int a, i;
    long long train_words_pow = 0;
    real d1, power = 0.75;
    // allocate the table: table_size entries
    table = (int *) malloc(table_size * sizeof(int));
    // walk the vocabulary and accumulate the total energy from the word counts
    for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
    i = 0;
    // d1: fraction of the total energy covered by the words visited so far
    d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
    printf("\ntable_size:%d", table_size);
    printf("\ntrain_words_pow:%lld,d1:%f\n", train_words_pow, d1);
    // a: index into the energy table
    // i: index into the vocabulary
    for (a = 0; a < table_size; a++) {
        // word i occupies slot a of the table
        table[a] = i;
        // The table reflects each word's share of the distribution: the more energy a word
        // has, the more table slots it occupies. If the covered fraction d1 has fallen behind
        // the current position a / table_size, advance i and extend d1; otherwise keep i so
        // that a high-energy word keeps occupying slots
        if (a / (real) table_size > d1) {
            i++;
            d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
        }
        // if the vocabulary is exhausted before the table is full, fill the rest with the last word
        if (i >= vocab_size) i = vocab_size - 1;
    }
    // debug: dump the whole table
    for (a = 0; a < table_size; a++) {
        printf("\t%d", table[a]);
        if ((a + 1) % 10 == 0) printf("\n");
    }
}

// Read a single word from a file, using space ' ', tab '\t' and newline '\n' as word boundaries.
// Words longer than MAX_STRING are truncated.
// The end of each line is emitted as the special word </s>
void ReadWord(char *word, FILE *fin) {
    int a = 0, ch;
    while (!feof(fin)) {
        ch = fgetc(fin);
        if (ch == 13) continue;  // skip carriage returns
        if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
            if (a > 0) {
                if (ch == '\n') ungetc(ch, fin);  // push the newline back so it becomes </s> on the next call
                break;
            }
            if (ch == '\n') {
                strcpy(word, (char *) "</s>");
                return;
            } else continue;
        }
        word[a] = ch;
        a++;
        if (a >= MAX_STRING - 1) a--;  // Truncate too long words
    }
    word[a] = 0;
}

// Return a word's hash value, computed from its characters;
// different words may collide on the same hash value
int GetWordHash(char *word) {
    unsigned long long a, hash = 0;
    for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
    hash = hash % vocab_hash_size;
    return hash;
}

// Return a word's position in the vocabulary, or -1 if it is not there.
// First compute the word's hash, then look that slot up in the hash table:
// - if the slot holds -1, the word has no index, i.e. it is not in the vocabulary: return -1
// - if the slot points to a different word, a hash collision occurred: probe onward with open addressing
int SearchVocab(char *word) {
    unsigned int hash = GetWordHash(word);
    while (1) {
        if (vocab_hash[hash] == -1) return -1;
        if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
        hash = (hash + 1) % vocab_hash_size;
    }
    return -1;
}

// Read a word from a file and return its position in the vocabulary;
// a convenience wrapper around the two functions above
int ReadWordIndex(FILE *fin) {
    char word[MAX_STRING];
    ReadWord(word, fin);
    if (feof(fin)) return -1;
    return SearchVocab(word);
}

// Build a vocab_word structure for a word and append it to the vocabulary.
// The count is initialized to 0 and the hash is computed with GetWordHash.
// Returns the word's position in the vocabulary
int AddWordToVocab(char *word) {
    unsigned int hash, length = strlen(word) + 1;
    if (length > MAX_STRING) length = MAX_STRING;
    vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
    strcpy(vocab[vocab_size].word, word);
    vocab[vocab_size].cn = 0;
    vocab_size++;
    // whenever the vocabulary is about to exceed its capacity, grow it by 1000 entries at once
    if (vocab_size + 2 >= vocab_max_size) {
        vocab_max_size += 1000;
        vocab = (struct vocab_word *) realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
    }
    hash = GetWordHash(word);
    // if the hash collides with another word, probe for a free slot with open addressing
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    // store the word's vocabulary position in the free slot we found
    vocab_hash[hash] = vocab_size - 1;
    return vocab_size - 1;
}

// Comparator: sort by word count, descending
int VocabCompare(const void *a, const void *b) {
    return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
}

// Sort the vocabulary entries by count, descending
void SortVocab() {
    int a, size;
    unsigned int hash;
    // sort the vocabulary, keeping </s> in the first position
    qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
    // reset the hash table
    for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
    size = vocab_size;
    train_words = 0;
    for (a = 0; a < size; a++) {
        // remove words occurring fewer than min_count times; rehash the rest and update the hash table
        if ((vocab[a].cn < min_count) && (a != 0)) {
            vocab_size--;
            free(vocab[a].word);
        } else {
            // compute the hash
            hash = GetWordHash(vocab[a].word);
            // resolve collisions
            while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
            vocab_hash[hash] = a;
            // accumulate the total word count
            train_words += vocab[a].cn;
        }
    }
    // low-frequency words were removed, so shrink the vocabulary's memory accordingly
    vocab = (struct vocab_word *) realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
    // pre-allocate space for building the Huffman tree
    for (a = 0; a < vocab_size; a++) {
        vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
        vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
    }
}

// Remove words occurring no more than min_reduce times from the vocabulary;
// min_reduce is incremented each time this function runs
void ReduceVocab() {
    int a, b = 0;
    unsigned int hash;
    for (a = 0; a < vocab_size; a++)
        if (vocab[a].cn > min_reduce) {
            vocab[b].cn = vocab[a].cn;
            vocab[b].word = vocab[a].word;
            b++;
        } else free(vocab[a].word);
    vocab_size = b;
    // reset the hash table
    for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
    // rebuild the hash table
    for (a = 0; a < vocab_size; a++) {
        // compute the hash
        hash = GetWordHash(vocab[a].word);
        // resolve collisions
        while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
        vocab_hash[hash] = a;
    }
    fflush(stdout);
    min_reduce++;
}

// Build the Huffman tree from the collected word counts.
// By the Huffman property, the more frequent a word, the shorter its path in the tree,
// i.e. the shorter its binary code
void CreateBinaryTree() {
    long long a, b, i, min1i, min2i, pos1, pos2;
    // scratch space for one word's path to the root
    long long point[MAX_CODE_LENGTH];
    // scratch space for one word's Huffman code
    char code[MAX_CODE_LENGTH];
    // A Huffman tree with n leaves has 2n-1 nodes in total.
    // The first vocab_size entries of count are the leaves, initialized with the word counts;
    // the next vocab_size entries are the internal (merged) nodes still to be created,
    // initialized with the large value 1e15
    long long *count = (long long *) calloc(vocab_size * 2 + 1, sizeof(long long));
    // binary records each node's binary code (0/1) relative to its parent
    long long *binary = (long long *) calloc(vocab_size * 2 + 1, sizeof(long long));
    // parent_node records each node's parent
    long long *parent_node = (long long *) calloc(vocab_size * 2 + 1, sizeof(long long));
    // initialize count
    for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
    for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
    // Huffman tree construction; assumes the vocabulary is already sorted by count, descending.
    // pos1 and pos2 track the current lowest- and second-lowest-count candidates
    // (initially the last two vocabulary entries). </s> is part of the tree as well
    pos1 = vocab_size - 1;
    pos2 = vocab_size;
    // at most vocab_size-1 merges, each adding one node, complete the tree
    for (a = 0; a < vocab_size - 1; a++) {
        // Find the two nodes with the smallest counts, min1i and min2i;
        // they may be leaves or previously merged internal nodes
        if (pos1 >= 0) {
            // if count[pos1] is smaller, move pos1 left; otherwise move pos2 right
            if (count[pos1] < count[pos2]) { min1i = pos1; pos1--; }
            else { min1i = pos2; pos2++; }
        } else { min1i = pos2; pos2++; }
        if (pos1 >= 0) {
            // if count[pos1] is smaller, move pos1 left; otherwise move pos2 right
            if (count[pos1] < count[pos2]) { min2i = pos1; pos1--; }
            else { min2i = pos2; pos2++; }
        } else { min2i = pos2; pos2++; }
        // the merged node's count (sum of the smallest and second-smallest) goes into the upper half of count
        count[vocab_size + a] = count[min1i] + count[min2i];
        // record the parent of min1i and min2i
        parent_node[min1i] = vocab_size + a;
        parent_node[min2i] = vocab_size + a;
        // of the two merged nodes, the second-smallest (min2i) is coded 1; the smallest (min1i) keeps 0
        binary[min2i] = 1;
    }
    // debug: node index, count, binary code and parent of every node
    for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%d\t", ii);
    printf("\n");
    for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%lld\t", count[ii]);
    printf("\n");
    for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%lld\t", binary[ii]);
    printf("\n");
    for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%lld\t", parent_node[ii]);
    printf("\n");
    printf("\n");
    // Assign a Huffman code to every word (i.e. every leaf of the tree);
    // since all words get a code, loop vocab_size times
    for (a = 0; a < vocab_size; a++) {
        b = a;
        i = 0;
        while (1) {
            // climb towards the root, appending the stored 0/1 edge labels to code
            code[i] = binary[b];
            // and the node indices along the path to point
            point[i] = b;
            // i is the current code length, the depth from the leaf to the current node
            i++;
            b = parent_node[b];
            // the tree has vocab_size*2-1 nodes, so node vocab_size*2-2 is the root
            if (b == vocab_size * 2 - 2) break;
        }
        // update the word's entry in the vocabulary;
        // codelen is the leaf's depth, i.e. the length of the Huffman code
        vocab[a].codelen = i;
        // The node indices stored in point are renumbered by subtracting vocab_size, counting
        // only internal nodes; the root's index becomes (vocab_size*2-2) - vocab_size = vocab_size - 2
        vocab[a].point[0] = vocab_size - 2;
        // code and point must run from the root down to the leaf, so reverse what was collected
        for (b = 0; b < i; b++) {
            vocab[a].code[i - b - 1] = code[b];
            vocab[a].point[i - b] = point[b] - vocab_size;
        }
    }
    // debug: each word's code length, code and path
    printf("vocab_size:%lld\n", vocab_size);
    for (b = 0; b < vocab_size; b++) {
        struct vocab_word temp = vocab[b];
        printf("%s\t", temp.word);
        int codeLen = temp.codelen;
        printf("%d\t(\t", codeLen);
        for (int a = 0; a < codeLen; a++) printf("%d\t", temp.code[a]);
        printf(")\t\t\t\t\t\t");
        printf("point:(\t");
        for (int a = 0; a < codeLen; a++) printf("%d\t", temp.point[a]);
        printf(")\n");
    }
    free(count);
    free(binary);
    free(parent_node);
}

// Scan the training file, collect all words and build the vocabulary and hash table
void LearnVocabFromTrainFile() {
    char word[MAX_STRING];
    FILE *fin;
    long long a, i;
    // initialize the hash table
    for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
    // open the training file
    fin = fopen(train_file, "rb");
    if (fin == NULL) {
        printf("ERROR: training data file not found!\n");
        exit(1);
    }
    // initialize the vocabulary size
    vocab_size = 0;
    // put </s> at the very front of the vocabulary
    AddWordToVocab((char *) "</s>");
    // process the training file
    while (1) {
        // read one word from the file
        ReadWord(word, fin);
        if (feof(fin)) break;
        // bump the total word count and report progress
        train_words++;
        if ((debug_mode > 1) && (train_words % 100000 == 0)) {
            printf("%lldK%c", train_words / 1000, 13);
            fflush(stdout);
        }
        // look the word up in the vocabulary
        i = SearchVocab(word);
        // if it is not there yet, add it, create its hash entry and set its count to 1;
        // otherwise increment its count
        if (i == -1) {
            a = AddWordToVocab(word);
            vocab[a].cn = 1;
        } else vocab[i].cn++;
        // if the vocabulary grows past the limit, prune it once, dropping the lowest-frequency words
        if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
    }
    // sort the vocabulary, drop words below min_count, and report the vocabulary size and total word count
    SortVocab();
    if (debug_mode > 0) {
        printf("Vocab size: %lld\n", vocab_size);
        printf("Words in train file: %lld\n", train_words);
    }
    // record the training file size and close the handle
    file_size = ftell(fin);
    fclose(fin);
}

// Write the words and their counts to a file
void SaveVocab() {
    long long i;
    FILE *fo = fopen(save_vocab_file, "wb");
    for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
    fclose(fo);
}

// Read words from a vocabulary file and build the vocabulary and hash table.
// A vocabulary file contains no duplicates, so unlike LearnVocabFromTrainFile
// there is no duplicate-word check here
void ReadVocab() {
    long long a, i = 0;
    char c;
    char word[MAX_STRING];
    // open the vocabulary file
    FILE *fin = fopen(read_vocab_file, "rb");
    if (fin == NULL) {
        printf("Vocabulary file not found\n");
        exit(1);
    }
    // initialize the hash table
    for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
    vocab_size = 0;
    // process the vocabulary file
    while (1) {
        // read one word
        ReadWord(word, fin);
        if (feof(fin)) break;
        // add it to the vocabulary and hash table, then read its count from the file
        a = AddWordToVocab(word);
        fscanf(fin, "%lld%c", &vocab[a].cn, &c);
        i++;
    }
    // sort the vocabulary, drop words below min_count, and report the vocabulary size and total word count
    SortVocab();
    if (debug_mode > 0) {
        printf("Vocab size: %lld\n", vocab_size);
        printf("Words in train file: %lld\n", train_words);
    }
    // open the training file and seek to its end to get its size
    fin = fopen(train_file, "rb");
    if (fin == NULL) {
        printf("ERROR: training data file not found!\n");
        exit(1);
    }
    fseek(fin, 0, SEEK_END);
    file_size = ftell(fin);
    // close the handle
    fclose(fin);
}

// Initialize the network
void InitNet() {
    long long a, b;
    unsigned long long next_random = 1;
    // syn0 holds the word vectors; allocate a 128-byte-aligned block of
    // vocab_size * layer1_size reals via posix_memalign
    // (layer1_size is the vector dimensionality)
    a = posix_memalign((void **) &syn0, 128, (long long) vocab_size * layer1_size * sizeof(real));
    if (syn0 == NULL) {
        printf("Memory allocation failed\n");
        exit(1);
    }
    // hierarchical softmax
    if (hs) {
        // syn1 holds the vectors of the Huffman tree's internal nodes; allocate it here
        a = posix_memalign((void **) &syn1, 128, (long long) vocab_size * layer1_size * sizeof(real));
        if (syn1 == NULL) {
            printf("Memory allocation failed\n");
            exit(1);
        }
        // initialize syn1 with zeros
        for (a = 0; a < vocab_size; a++)
            for (b = 0; b < layer1_size; b++) syn1[a * layer1_size + b] = 0;
    }
    // with negative sampling, allocate syn1neg, the per-word auxiliary vectors
    if (negative > 0) {
        a = posix_memalign((void **) &syn1neg, 128, (long long) vocab_size * layer1_size * sizeof(real));
        if (syn1neg == NULL) {
            printf("Memory allocation failed\n");
            exit(1);
        }
        // initialize syn1neg with zeros
        for (a = 0; a < vocab_size; a++)
            for (b = 0; b < layer1_size; b++) syn1neg[a * layer1_size + b] = 0;
    }
    for (a = 0; a < vocab_size; a++)
        for (b = 0; b < layer1_size; b++) {
            next_random = next_random * (unsigned long long) 25214903917 + 11;
            // initialize each dimension of syn0 with a random value in [-0.5, 0.5] / layer1_size
            syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real) 65536) - 0.5) / layer1_size;
        }
    // build the Huffman tree
    CreateBinaryTree();
}

// Thread worker: the core of the training algorithm.
// By the time this runs, the vocabulary has been sorted and the Huffman tree
// and every word's Huffman code have been computed
void *TrainModelThread(void *id) {
    long long a, b, d;
    // cw: number of context words in the window (excluding the center word)
    long long cw;
    // word: vocabulary index of the current word while reading a sentence
    // last_word: the context word currently being scanned within the window
    // sentence_length: length of the sentence being processed
    // sentence_position: position of the current word within that sentence
    long long word, last_word, sentence_length = 0, sentence_position = 0;
    // word_count: amount of text this thread has trained on so far
    // last_word_count: word_count at the previous progress report
    long long word_count = 0, last_word_count = 0;
    // sen: the sentence currently read from the file, as vocabulary indices
    long long sen[MAX_SENTENCE_LENGTH + 1];
    // l1: start of the current word's vector within syn0 (skip-gram)
    // l2: start of the internal-node vector in syn1, or of the sample vector in syn1neg
    // target: the current sample in negative sampling
    // label: the current sample's label in negative sampling
    long long l1, l2, c, target, label, local_iter = iter;
    // next_random: state of the random number generator
    unsigned long long next_random = (long long) id;
    real f, g;
    clock_t now;
    // neu1: the input vector; in CBOW the sum of the context word vectors,
    // in skip-gram the center word's vector
    real *neu1 = (real *) calloc(layer1_size, sizeof(real));
    // neu1e: the accumulated error term
    real *neu1e = (real *) calloc(layer1_size, sizeof(real));
    FILE *fi = fopen(train_file, "rb");
    // Each thread handles one slice of the text; seek to this thread's start position.
    // file_size was obtained earlier in LearnVocabFromTrainFile or ReadVocab
    fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
    // main loop
    while (1) {
        // report progress roughly every 10 trained words (10000 in the original word2vec)
        if (word_count - last_word_count > 10) {
            // word_count_actual is the total number of words processed across all threads
            word_count_actual += word_count - last_word_count;
            last_word_count = word_count;
            if ((debug_mode > 1)) {
                now = clock();
                // Report:
                // the current learning rate alpha;
                // the overall progress (words trained / (iterations * total words + 1));
                // the words processed per thread per second
                printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ",
                       13, alpha, word_count_actual / (real) (iter * train_words + 1) * 100,
                       word_count_actual / ((real) (now - start + 1) / (real) CLOCKS_PER_SEC * 1000));
                fflush(stdout);
            }
            // decay the learning rate from its starting value as the trained word count grows...
            alpha = starting_alpha * (1 - word_count_actual / (real) (iter * train_words + 1));
            // ...but never let it drop below starting_alpha * 0.0001
            if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
        }
        // fetch one sentence from the training data; sentences are separated by newlines
        if (sentence_length == 0) {
            while (1) {
                // read a word and store its vocabulary index in word
                word = ReadWordIndex(fi);
                if (feof(fi)) break;
                if (word == -1) continue;
                word_count++;
                // index 0 is </s>, i.e. a newline was read: the sentence ends here
                if (word == 0) break;
                // Randomly subsample frequent words: dropping some of them makes the vectors of
                // rare words more accurate and speeds up training; it can be seen as a smoothing method
                if (sample > 0) {
                    real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
                    next_random = next_random * (unsigned long long) 25214903917 + 11;
                    // discard the frequent word with probability 1 - ran
                    if (ran < (next_random & 0xFFFF) / (real) 65536) continue;
                }
                sen[sentence_length] = word;
                sentence_length++;
                // truncate sentences that exceed the maximum length
                if (sentence_length >= MAX_SENTENCE_LENGTH) break;
            }
            // start at the head of the sentence
            sentence_position = 0;
        }
        // If this thread has processed more words than its share, start a new iteration;
        // once the iteration limit is reached, stop
        if (feof(fi) || (word_count > train_words / num_threads)) {
            word_count_actual += word_count - last_word_count;
            local_iter--;
            if (local_iter == 0) break;
            word_count = 0;
            last_word_count = 0;
            sentence_length = 0;
            fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
            continue;
        }
        // take the current word
        word = sen[sentence_position];
        if (word == -1) continue;
        // reset the input vector
        for (c = 0; c < layer1_size; c++) neu1[c] = 0;
        // reset the accumulated error
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // draw a random b in [0, window-1] that shrinks the effective |context(w)| window
        // (presumably to speed up training)
        next_random = next_random * (unsigned long long) 25214903917 + 11;
        b = next_random % window;

        /******** CBOW model: the vectors of the words around the center word predict the center word ********/
        if (cbow) {
            cw = 0;
            // the window is [sentence_position - window + b, sentence_position + window - b],
            // i.e. 2*window - 2*b + 1 words long
            for (a = b; a < window * 2 + 1 - b; a++)
                if (a != window) {  // skip the center word: it is the prediction target, we only collect context
                    c = sentence_position - window + a;
                    if (c < 0) continue;
                    if (c >= sentence_length) continue;
                    // sen holds the vocabulary index of each word in the sentence
                    last_word = sen[c];
                    if (last_word == -1) continue;
                    // sum up the context word vectors
                    for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
                    // count the valid words actually present in the window
                    cw++;
                }
            if (cw) {
                // average the summed vectors
                for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
                // Hierarchical softmax: walk the Huffman path from the root to the
                // current word's leaf, visiting every internal node on the way
                if (hs)
                    for (d = 0; d < vocab[word].codelen; d++) {
                        f = 0;
                        // l2: start of the current internal node's vector within syn1
                        l2 = vocab[word].point[d] * layer1_size;
                        // f: dot product of the input vector neu1 and the internal node's vector
                        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
                        // skip if f falls outside the sigmoid table's range...
                        if (f <= -MAX_EXP) continue;
                        else if (f >= MAX_EXP) continue;
                        // ...otherwise apply the sigmoid via the table
                        else f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
                        // g is the gradient times the learning rate.
                        // The larger the rate, the harder a misclassification is punished and the
                        // larger the correction to the internal node's vector.
                        // NOTE: word2vec defines nodes with Huffman code 1 as the negative class and
                        // code 0 as the positive class, i.e. a node's label = 1 - d
                        g = (1 - vocab[word].code[d] - f) * alpha;
                        // accumulate the error from g and the internal node's vector
                        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
                        // Update the internal node's vector from g and the input vector.
                        // Intuition: if vocab[word].code[d] is 1 (negative class), the label is 1-1=0;
                        // the sigmoid output lies in (0,1), above the label, so the node vector must shrink.
                        // g = (label - f)*alpha is then negative and does exactly that when applied to the
                        // node vector, scaled by how far the sigmoid value deviates from the label
                        for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
                    }
                // negative sampling: iterate over all samples (1 positive + negative negatives)
                if (negative > 0)
                    for (d = 0; d < negative + 1; d++) {
                        if (d == 0) {
                            // the first pass handles the target word, i.e. the positive sample
                            target = word;
                            label = 1;
                        } else {
                            // draw a negative sample from the unigram table
                            next_random = next_random * (unsigned long long) 25214903917 + 11;
                            target = table[(next_random >> 16) % table_size];
                            if (target == 0) target = next_random % (vocab_size - 1) + 1;
                            if (target == word) continue;
                            label = 0;
                        }
                        // with negative sampling every word owns an auxiliary vector in syn1neg;
                        // l2 is the start of the target word's vector there
                        l2 = target * layer1_size;
                        f = 0;
                        // f: dot product of the input vector neu1 and the auxiliary vector
                        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
                        if (f > MAX_EXP) g = (label - 1) * alpha;
                        else if (f < -MAX_EXP) g = (label - 0) * alpha;
                        // g = (label - f) * alpha
                        else g = (label - expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
                        // accumulate the error from g and the auxiliary vector
                        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
                        // update the auxiliary vector from g and the input vector
                        for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
                    }
                // propagate the accumulated error to the vector of every word in context(w)
                for (a = b; a < window * 2 + 1 - b; a++)
                    if (a != window) {
                        c = sentence_position - window + a;
                        if (c < 0) continue;
                        if (c >= sentence_length) continue;
                        last_word = sen[c];
                        if (last_word == -1) continue;
                        for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
                    }
            }
        }
        /******** skip-gram model: the center word predicts its context ********/
        else {
            // every word of Context(w) has to be predicted, so loop over the whole
            // 2*window - 2*b + 1 window, skipping the center word
            for (a = b; a < window * 2 + 1 - b; a++)
                if (a != window) {
                    c = sentence_position - window + a;
                    if (c < 0) continue;
                    if (c >= sentence_length) continue;
                    // last_word is the context word currently being predicted
                    last_word = sen[c];
                    if (last_word == -1) continue;
                    // l1: start of that word's vector within syn0
                    l1 = last_word * layer1_size;
                    // reset the accumulated error
                    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
                    // Hierarchical softmax: walk the Huffman path from the root to the
                    // current word's leaf, visiting every internal node on the way
                    if (hs)
                        for (d = 0; d < vocab[word].codelen; d++) {
                            f = 0;
                            l2 = vocab[word].point[d] * layer1_size;
                            // NOTE: this exploits model symmetry, p(u|w) = p(w|u), where w is the center
                            // word and u each word of context(w): although skip-gram nominally predicts
                            // the context from the center word, training still predicts the center word
                            // from the context. Unlike CBOW, u here is a single word vector rather than
                            // a window sum. The rest mirrors the CBOW hs branch
                            for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
                            if (f <= -MAX_EXP) continue;
                            else if (f >= MAX_EXP) continue;
                            else f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
                            g = (1 - vocab[word].code[d] - f) * alpha;
                            for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
                            for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
                        }
                    // negative sampling: iterate over all samples (1 positive + negative negatives);
                    // mirrors the CBOW ns branch and relies on the same model symmetry
                    if (negative > 0)
                        for (d = 0; d < negative + 1; d++) {
                            if (d == 0) {
                                target = word;
                                label = 1;
                            } else {
                                next_random = next_random * (unsigned long long) 25214903917 + 11;
                                target = table[(next_random >> 16) % table_size];
                                if (target == 0) target = next_random % (vocab_size - 1) + 1;
                                if (target == word) continue;
                                label = 0;
                            }
                            l2 = target * layer1_size;
                            f = 0;
                            for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
                            if (f > MAX_EXP) g = (label - 1) * alpha;
                            else if (f < -MAX_EXP) g = (label - 0) * alpha;
                            else g = (label - expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
                            for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
                            for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
                        }
                    // apply the accumulated error to the context word's vector
                    for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
                }
        }
        // one word trained; advance to the next position in the sentence
        sentence_position++;
        // when the sentence is done, reset its length and loop back to read the next one
        if (sentence_position >= sentence_length) {
            sentence_length = 0;
            continue;
        }
    }
    fclose(fi);
    free(neu1);
    free(neu1e);
    pthread_exit(NULL);
}

// The complete model training pipeline
void TrainModel() {
    long a, b, c, d;
    FILE *fo;
    // create the thread handles, num_threads of them
    pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
    printf("Starting training using file %s\n", train_file);
    // remember the initial learning rate
    starting_alpha = alpha;
    // Load the vocabulary and hash table from a vocabulary file if one was given,
    // otherwise build them from the training file
    printf("read_vocab_file:%d\t", read_vocab_file[0]);  // debug: first byte (0 means unset)
    if (read_vocab_file[0] != 0) ReadVocab();
    else LearnVocabFromTrainFile();
    // optionally dump the words and their counts to a file
    if (save_vocab_file[0] != 0) SaveVocab();
    if (output_file[0] == 0) return;
    // initialize the network
    InitNet();
    // negative sampling needs the unigram table
    if (negative > 0) InitUnigramTable();
    // start the clock
    start = clock();
    // launch the training threads
    for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *) (intptr_t) a);
    for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
    fo = fopen(output_file, "wb");
    // if classes is 0, write all word vectors to the output file
    if (classes == 0) {
        fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
        for (a = 0; a < vocab_size; a++) {
            fprintf(fo, "%s ", vocab[a].word);
            if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
            else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
            fprintf(fo, "\n");
        }
    }
    // If classes is nonzero, run K-means on the word vectors and write out word classes instead;
    // classes is the number of clusters to produce
    else {
        // clcn: number of clusters
        // iter: number of K-means iterations
        // closeid: cluster currently closest to a given word
        int clcn = classes, iter = 10, closeid;
        // centcn: number of words in each cluster
        int *centcn = (int *) malloc(classes * sizeof(int));
        // cl: cluster assignment of each word
        int *cl = (int *) calloc(vocab_size, sizeof(int));
        // x: dot product of a word vector and a cluster center; the larger, the closer
        // closev: the largest dot product seen so far, i.e. the smallest distance
        real closev, x;
        // cent: the cluster center vectors
        real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
        // assign every word to a cluster, round-robin
        for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
        // run iter iterations in total
        for (a = 0; a < iter; a++) {
            // zero the center vectors
            for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
            // initialize each cluster's word count to 1
            for (b = 0; b < clcn; b++) centcn[b] = 1;
            // sum the vectors currently assigned to each cluster and count the cluster members
            for (c = 0; c < vocab_size; c++) {
                for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
                centcn[cl[c]]++;
            }
            for (b = 0; b < clcn; b++) {
                closev = 0;
                for (c = 0; c < layer1_size; c++) {
                    // average to get each cluster's center vector
                    cent[layer1_size * b + c] /= centcn[b];
                    // closev accumulates the squared 2-norm of the center
                    closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
                }
                // take the square root: closev is now the center's 2-norm
                closev = sqrt(closev);
                // normalize the center with it
                for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
            }
            // reassign every word in the vocabulary to its nearest cluster
            for (c = 0; c < vocab_size; c++) {
                closev = -10;
                closeid = 0;
                for (d = 0; d < clcn; d++) {
                    x = 0;
                    // dot product of the word vector and the normalized cluster center
                    for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
                    // the larger the dot product, the closer the two points:
                    // pick the cluster whose center has the largest dot product with this word
                    if (x > closev) {
                        closev = x;
                        closeid = d;
                    }
                }
                cl[c] = closeid;
            }
        }
        // after several iterations the words settle into their proper clusters;
        // write the K-means result to the file
        for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
        free(centcn);
        free(cent);
        free(cl);
    }
    fclose(fo);
}

// Locate a command line argument; print a hint when its value is missing
int ArgPos(char *str, int argc, char **argv) {
    int a;
    for (a = 1; a < argc; a++)
        if (!strcmp(str, argv[a])) {
            if (a == argc - 1) {
                printf("Argument missing for %s\n", str);
                exit(1);
            }
            return a;
        }
    return -1;
}

void prepare() {
    int i;
    vocab = (struct vocab_word *) calloc(vocab_max_size, sizeof(struct vocab_word));
    vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
    printf("%d", vocab_hash[0]);  // debug: prints 0, the first hash slot after calloc
    expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
    for (i = 0; i < EXP_TABLE_SIZE; i++) {
        expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);  // Precompute the exp() table
        expTable[i] = expTable[i] / (expTable[i] + 1);                     // Precompute f(x) = x / (x + 1)
    }
}

int main(int argc, char **argv) {
    int i;
    prepare();
    strcpy(train_file, "record/input.txt");
    strcpy(save_vocab_file, "record/vocab.txt");
    strcpy(output_file, "record/output.txt");
    /** The original command line handling, disabled in favor of the hardcoded paths above:
    argc = 2;
    if (argc == 1) {
        printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
        printf("Options:\n");
        printf("Parameters for training:\n");
        printf("\t-train <file>\n");
        printf("\t\tUse text data from <file> to train the model\n");
        printf("\t-output <file>\n");
        printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
        printf("\t-size <int>\n");
        printf("\t\tSet size of word vectors; default is 100\n");
        printf("\t-window <int>\n");
        printf("\t\tSet max skip length between words; default is 5\n");
        printf("\t-sample <float>\n");
        printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
        printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
        printf("\t-hs <int>\n");
        printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
        printf("\t-negative <int>\n");
        printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
        printf("\t-threads <int>\n");
        printf("\t\tUse <int> threads (default 12)\n");
        printf("\t-iter <int>\n");
        printf("\t\tRun more training iterations (default 5)\n");
        printf("\t-min-count <int>\n");
        printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
        printf("\t-alpha <float>\n");
        printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
        printf("\t-classes <int>\n");
        printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
        printf("\t-debug <int>\n");
        printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
        printf("\t-binary <int>\n");
        printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
        printf("\t-save-vocab <file>\n");
        printf("\t\tThe vocabulary will be saved to <file>\n");
        printf("\t-read-vocab <file>\n");
        printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
        printf("\t-cbow <int>\n");
        printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
        printf("\nExamples:\n");
        printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
        return 0;
    }
    output_file[0] = 0;
    save_vocab_file[0] = 0;
    read_vocab_file[0] = 0;
    if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
    if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
    if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
    if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
    if (cbow) alpha = 0.05;
    if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
    if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
    if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
    if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
    **/
    // Note: the following allocations repeat what prepare() already did; harmless here, but redundant
    vocab = (struct vocab_word *) calloc(vocab_max_size, sizeof(struct vocab_word));
    vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
    expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
    for (i = 0; i < EXP_TABLE_SIZE; i++) {
        expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);  // Precompute the exp() table
        expTable[i] = expTable[i] / (expTable[i] + 1);                     // Precompute f(x) = x / (x + 1)
    }
    TrainModel();
    return 0;
}
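
To try it, note the hardcoded paths in main(): the program reads record/input.txt and writes record/vocab.txt and record/output.txt, so a record/ directory containing input.txt must exist next to the executable. Assuming a MinGW-w64 gcc on Windows (on Linux, drop the _aligned_malloc macro and the Windows-only includes, since posix_memalign is native there), a build along these lines should work:

    gcc word2vec.c -o word2vec -lm -lpthread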

Input data

bb cc
bb
dd ee
bb
cc ac
bb cc ee
bb cc
ac bb
ee xx
bb
ac cc
ee bb

vocab.txt (written by SaveVocab: one word and its count per line, sorted by frequency)

</s> 12
bb 8
cc 5
ee 4
ac 3
xx 1
dd 1
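
A quick sanity check against the input above: there are 12 input lines, matching </s> 12 (ReadWord emits one </s> per newline); bb occurs in lines 1, 2, 4, 6, 7, 8, 10 and 12, giving 8; cc in lines 1, 5, 6, 7 and 11, giving 5; ee in lines 3, 6, 9 and 12, giving 4; ac in lines 5, 8 and 11, giving 3; and xx and dd once each.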

output.txt (the trained vectors: a header line with the vocabulary size and vector dimensionality, then one word per line followed by its vector)

7 10
</s> 0.040027 0.044194 -0.038303 -0.032780 0.013666 0.030211 0.009409 0.002113 -0.036035 0.022185
bb -0.043564 0.012495 -0.007513 -0.009572 -0.033157 -0.018822 0.025793 0.030254 0.029691 0.015974
cc 0.015448 -0.038026 -0.040958 0.049696 0.038013 0.030901 -0.006039 0.040157 -0.004950 0.007347
ee -0.001492 -0.029832 0.013123 -0.013374 -0.038254 0.047542 0.043793 -0.010951 -0.002261 0.005092
ac -0.036377 -0.040071 0.045547 0.000630 -0.025824 -0.030421 -0.030765 0.016969 0.002014 0.013310
xx -0.042136 -0.038078 -0.001300 0.011436 0.025497 -0.031700 0.040796 0.009270 0.011197 -0.006084
dd 0.029865 -0.022878 -0.020975 0.021584 -0.007532 0.010307 0.018045 -0.040886 -0.019830 0.029137
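
Once output.txt exists, the vectors can be consumed by any downstream tool. As a minimal sketch (not part of the original post; the file path and the example words bb and cc are assumptions matching the demo above), the following reads the text-format output and prints the cosine similarity of two words:

【code】

/* Minimal sketch: load record/output.txt written by the program above
   and print the cosine similarity of two words. Compile with: gcc sim.c -o sim -lm */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

int main(void) {
    FILE *f = fopen("record/output.txt", "r");
    if (!f) { printf("output.txt not found\n"); return 1; }
    long long words, size;
    fscanf(f, "%lld %lld", &words, &size);  // header: vocabulary size, vector dimensionality
    float *vec = malloc(words * size * sizeof(float));
    char (*name)[100] = malloc(words * 100);
    for (long long a = 0; a < words; a++) {
        fscanf(f, "%99s", name[a]);
        for (long long b = 0; b < size; b++) fscanf(f, "%f", &vec[a * size + b]);
    }
    fclose(f);
    // locate the two example words (bb and cc from the demo input)
    long long i = -1, j = -1;
    for (long long a = 0; a < words; a++) {
        if (!strcmp(name[a], "bb")) i = a;
        if (!strcmp(name[a], "cc")) j = a;
    }
    if (i < 0 || j < 0) { printf("word not found\n"); return 1; }
    // cosine similarity: dot product divided by the product of the 2-norms
    float dot = 0, ni = 0, nj = 0;
    for (long long b = 0; b < size; b++) {
        dot += vec[i * size + b] * vec[j * size + b];
        ni  += vec[i * size + b] * vec[i * size + b];
        nj  += vec[j * size + b] * vec[j * size + b];
    }
    printf("cos(bb, cc) = %f\n", dot / (sqrt(ni) * sqrt(nj)));
    free(vec);
    free(name);
    return 0;
}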
