word2vec.c with detailed comments, runnable directly in a Windows environment
The full source is below; the sample input data and the resulting vocab.txt and output.txt follow the code.
【code】
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>
#include <errno.h>   // errno, used by the posix_memalign replacement below
#include <malloc.h>  // _aligned_malloc on Windows
#include <stdint.h>  // intptr_t, used when passing the thread id

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

// Windows has no posix_memalign; emulate it with _aligned_malloc
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)

const int vocab_hash_size = 30;  // maximum 30 * 0.7 = 21 words in the vocabulary (30M in the original; shrunk for this demo)

typedef float real;  // precision of floating-point numbers
// Basic data structure for each word in the vocabulary
struct vocab_word {
    long long cn;  // word count, tallied from the training set or read from a vocabulary file
    int *point;    // path from the Huffman tree root to this word, stored as the index of each node on the path
    // word: the literal string; code: its Huffman code; codelen: the length of that code
    char *word, *code, codelen;
};

char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
// Vocabulary table; the index into this array is a word's position in the vocabulary,
// also called the word's index
struct vocab_word *vocab;
int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 1, num_threads = 1, min_reduce = 1;
// Word hash table, indexed by each word's hash value, which is computed from its ASCII bytes;
// vocab_hash[hash] stores that word's index in the vocabulary
int *vocab_hash;
// vocab_max_size is a helper: whenever the vocabulary outgrows it, the table grows by 1000 entries at once
// vocab_size is the number of distinct words in the training set, i.e., the size of the vocabulary
// layer1_size is the dimensionality of the word vectors
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 10;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
// syn0 holds the word vector of every word in the vocabulary
// syn1 holds the vector of every non-leaf node of the Huffman tree
// syn1neg holds the auxiliary vector of every word, used by negative sampling
// expTable is the precomputed sigmoid lookup table
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;
int hs = 0, negative = 5;
const int table_size = 1e2;  // 1e8 in the original word2vec; reduced to 100 for this demo
int *table;
// Build each word's "energy" distribution table, used by negative sampling
void InitUnigramTable() {
    int a, i;
    long long train_words_pow = 0;
    real d1, power = 0.75;
    // Allocate the energy table, table_size entries in total
    table = (int *) malloc(table_size * sizeof(int));
    // Walk the vocabulary and accumulate the total energy from the word counts
    for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
    i = 0;
    // d1: fraction of the total energy covered by the words visited so far
    d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
    printf("\ntable_size:%d", table_size);
    printf("\ntrain_words_pow:%lld,d1:%f\n", train_words_pow, d1);
    // a: index into the energy table; i: index into the vocabulary
    for (a = 0; a < table_size; a++) {
        // word i occupies slot a of the table
        table[a] = i;
        // The table reflects each word's energy share: the more energy a word has, the more
        // slots it occupies. Once slot a passes word i's cumulative share d1, advance i and
        // extend d1; otherwise keep i so the high-energy word claims more slots
        if (a / (real) table_size > d1) {
            i++;
            d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
        }
        // If the vocabulary is exhausted before the table is full, fill the rest with the last word
        if (i >= vocab_size) i = vocab_size - 1;
    }
    for (a = 0; a < table_size; a++) {
        printf("\t%d", table[a]);
        if ((a + 1) % 10 == 0) printf("\n");
    }
}
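/*
 * A rough worked example (my numbers, assuming the sample input at the end of
 * this post): with counts </s>:12 bb:8 cc:5 ee:4 ac:3 xx:1 dd:1, the 0.75-power
 * weights are about 6.45, 4.76, 3.34, 2.83, 2.28, 1.00, 1.00 (total ~21.7), so
 * of the 100 table slots roughly 30 go to </s>, 22 to bb, 15 to cc, 13 to ee,
 * 11 to ac, and 5 each to xx and dd. Drawing a slot uniformly at random then
 * samples negatives with probability proportional to cn^0.75.
 */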
// Read a single word from a file into word; space ' ', tab '\t' and newline '\n' delimit words
// Words longer than MAX_STRING are truncated
// A </s> token is emitted at the end of every line
void ReadWord(char *word, FILE *fin) {
    int a = 0, ch;
    while (!feof(fin)) {
        ch = fgetc(fin);
        if (ch == 13) continue;  // skip carriage returns
        if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
            if (a > 0) {
                if (ch == '\n') ungetc(ch, fin);  // push the newline back so it becomes </s>
                break;
            }
            if (ch == '\n') {
                strcpy(word, (char *) "</s>");
                return;
            } else continue;
        }
        word[a] = ch;
        a++;
        if (a >= MAX_STRING - 1) a--;  // truncate words that are too long
    }
    word[a] = 0;
}
// Return a word's hash, computed from its literal bytes; distinct words may collide
int GetWordHash(char *word) {
    unsigned long long a, hash = 0;
    for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
    hash = hash % vocab_hash_size;
    return hash;
}
// Return a word's position in the vocabulary, or -1 if it is absent
// First compute the word's hash, then look up that slot in the word hash table:
// if it holds -1, the word has no index, i.e., it is not in the vocabulary, so return -1;
// if the slot points at a different word, a hash collision occurred, so probe onward
// for the word by open addressing
int SearchVocab(char *word) {
    unsigned int hash = GetWordHash(word);
    while (1) {
        if (vocab_hash[hash] == -1) return -1;
        if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
        hash = (hash + 1) % vocab_hash_size;
    }
    return -1;
}
// Read one word from a file and return its position in the vocabulary;
// a thin wrapper around the two functions above
int ReadWordIndex(FILE *fin) {
    char word[MAX_STRING];
    ReadWord(word, fin);
    if (feof(fin)) return -1;
    return SearchVocab(word);
}
// Build a vocab_word structure for a word and append it to the vocabulary
// The count is initialized to 0 and the hash computed with the function above
// Returns the word's position in the vocabulary
int AddWordToVocab(char *word) {
    unsigned int hash, length = strlen(word) + 1;
    if (length > MAX_STRING) length = MAX_STRING;
    vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
    strcpy(vocab[vocab_size].word, word);
    vocab[vocab_size].cn = 0;
    vocab_size++;
    // Whenever the vocabulary is about to outgrow its buffer, grow it by 1000 entries at once
    if (vocab_size + 2 >= vocab_max_size) {
        vocab_max_size += 1000;
        vocab = (struct vocab_word *) realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
    }
    hash = GetWordHash(word);
    // If the hash collides with another word, resolve it by open addressing
    // (probe for an empty hash slot for this word)
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    // Store the word's vocabulary position in the empty slot found
    vocab_hash[hash] = vocab_size - 1;
    return vocab_size - 1;
}
// Comparator: sort by count, descending
int VocabCompare(const void *a, const void *b) {
    return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
}
// Sort the vocabulary entries by count, descending
void SortVocab() {
    int a, size;
    unsigned int hash;
    // Sort the vocabulary, keeping </s> in the first position
    qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
    // Reset the hash table
    for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
    size = vocab_size;
    train_words = 0;
    for (a = 0; a < size; a++) {
        // Drop words whose count is below min_count; for the rest,
        // recompute the hash and refill the hash table
        if ((vocab[a].cn < min_count) && (a != 0)) {
            vocab_size--;
            free(vocab[a].word);
        } else {
            hash = GetWordHash(vocab[a].word);
            while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;  // resolve collisions
            vocab_hash[hash] = a;
            train_words += vocab[a].cn;  // accumulate the total word count
        }
    }
    // Low-count words were removed, so shrink the vocabulary's memory accordingly
    vocab = (struct vocab_word *) realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
    // Pre-allocate the space needed to build the Huffman tree
    for (a = 0; a < vocab_size; a++) {
        vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
        vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
    }
}
// Remove words whose count is not greater than min_reduce from the vocabulary;
// each call automatically increments min_reduce by one
void ReduceVocab() {
    int a, b = 0;
    unsigned int hash;
    for (a = 0; a < vocab_size; a++)
        if (vocab[a].cn > min_reduce) {
            vocab[b].cn = vocab[a].cn;
            vocab[b].word = vocab[a].word;
            b++;
        } else free(vocab[a].word);
    vocab_size = b;
    // Reset the hash table
    for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
    // Rebuild the hash table
    for (a = 0; a < vocab_size; a++) {
        hash = GetWordHash(vocab[a].word);
        while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;  // resolve collisions
        vocab_hash[hash] = a;
    }
    fflush(stdout);
    min_reduce++;
}
// Build the Huffman binary tree from the tallied word counts
// By the Huffman property, the more frequent a word, the shorter its path
// through the tree, i.e., the shorter its binary code
void CreateBinaryTree() {
    long long a, b, i, min1i, min2i, pos1, pos2;
    // Scratch buffer for one word's path to the root of the Huffman tree
    long long point[MAX_CODE_LENGTH];
    // Scratch buffer for one word's Huffman code
    char code[MAX_CODE_LENGTH];
    // Memory allocation: a Huffman tree with n leaves has 2n-1 nodes in total.
    // The first vocab_size entries of count are the leaves, initialized to the word counts;
    // the next vocab_size entries are the interior (merged) nodes still to be created,
    // initialized to the large value 1e15
    long long *count = (long long *) calloc(vocab_size * 2 + 1, sizeof(long long));
    // binary records each node's bit (0/1) relative to its parent
    long long *binary = (long long *) calloc(vocab_size * 2 + 1, sizeof(long long));
    // parent_node records each node's parent
    long long *parent_node = (long long *) calloc(vocab_size * 2 + 1, sizeof(long long));
    // Initialize the count array
    for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
    for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
    // Huffman construction; assumes the vocabulary is already sorted by count, descending.
    // pos1 and pos2 track the second-lowest- and lowest-count candidates
    // (initially the last two vocabulary entries); </s> is part of the tree too
    pos1 = vocab_size - 1;
    pos2 = vocab_size;
    // At most vocab_size-1 iterations, each adding one node, complete the tree
    for (a = 0; a < vocab_size - 1; a++) {
        // Compare the current pos1 and pos2 and record in min1i, min2i the indices of the
        // lowest- and second-lowest-count nodes; either may be a leaf or a merged interior node
        if (pos1 >= 0) {
            // If count[pos1] is smaller, move pos1 left; otherwise move pos2 right
            if (count[pos1] < count[pos2]) {
                min1i = pos1; pos1--;
            } else {
                min1i = pos2; pos2++;
            }
        } else {
            min1i = pos2; pos2++;
        }
        if (pos1 >= 0) {
            if (count[pos1] < count[pos2]) {
                min2i = pos1; pos1--;
            } else {
                min2i = pos2; pos2++;
            }
        } else {
            min2i = pos2; pos2++;
        }
        // Store the merged node's count (count[min1i] + count[min2i]) in the back half of count
        count[vocab_size + a] = count[min1i] + count[min2i];
        // Record min1i's and min2i's parent
        parent_node[min1i] = vocab_size + a;
        parent_node[min2i] = vocab_size + a;
        // Of each node's two children, the lower-count one gets bit 1 (the higher-count one gets 0)
        binary[min2i] = 1;
    }
    // Debug output: dump the count, binary and parent_node arrays
    for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%d\t", ii);
    printf("\n");
    for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%lld\t", count[ii]);
    printf("\n");
    for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%lld\t", binary[ii]);
    printf("\n");
    for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%lld\t", parent_node[ii]);
    printf("\n\n");
    // Walk the finished Huffman tree and assign each word (leaf) its code;
    // loop vocab_size times, once per word
    for (a = 0; a < vocab_size; a++) {
        b = a;
        i = 0;
        while (1) {
            // Climb toward the root, appending each node's bit from binary to code
            code[i] = binary[b];
            // and its index to point
            point[i] = b;
            i++;  // current code length, i.e., depth from the leaf to this node
            b = parent_node[b];
            // The tree has vocab_size*2-1 nodes, so index vocab_size*2-2 is the root
            if (b == vocab_size * 2 - 2) break;
        }
        // Update this word's vocabulary entry
        vocab[a].codelen = i;  // code length = depth from the leaf to the root
        // Interior-node indices stored in the path have vocab_size subtracted, i.e., they are
        // numbered among interior nodes only (leaves excluded), so the root becomes
        // (vocab_size*2-2) - vocab_size = vocab_size - 2
        vocab[a].point[0] = vocab_size - 2;
        // Codes and paths should run from the root down to the leaf, so reverse code and point
        for (b = 0; b < i; b++) {
            vocab[a].code[i - b - 1] = code[b];
            vocab[a].point[i - b] = point[b] - vocab_size;
        }
    }
    // Debug output: each word's code and path
    printf("vocab_size:%lld\n", vocab_size);
    for (b = 0; b < vocab_size; b++) {
        struct vocab_word temp = vocab[b];
        printf("%s\t", temp.word);
        int codeLen = temp.codelen;
        printf("%d\t(\t", codeLen);
        for (int a = 0; a < codeLen; a++) printf("%d\t", temp.code[a]);
        printf(")\t\t\t\t\t\t");
        printf("point:(\t");
        for (int a = 0; a < codeLen; a++) printf("%d\t", temp.point[a]);
        printf(")\n");
    }
    free(count);
    free(binary);
    free(parent_node);
}
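/*
 * Hand-traced example (mine, using the sample input at the end of this post,
 * so treat the exact bits as illustrative): with sorted counts 12, 8, 5, 4, 3,
 * 1, 1 the merges are 1+1=2, 2+3=5, 4+5=9, 5+8=13, 9+12=21, 13+21=34 (root),
 * giving code lengths 2, 2, 2, 3, 4, 5, 5, e.g. </s>=11, bb=01, cc=00, ee=100,
 * ac=1011, xx=10101, dd=10100. The Kraft sum 3/4 + 1/8 + 1/16 + 2/32 = 1
 * confirms a complete prefix code, with the most frequent words shortest.
 */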
// Read every word from the training file and build the vocabulary and hash table
void LearnVocabFromTrainFile() {
    char word[MAX_STRING];
    FILE *fin;
    long long a, i;
    // Initialize the hash table
    for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
    // Open the training file
    fin = fopen(train_file, "rb");
    if (fin == NULL) {
        printf("ERROR: training data file not found!\n");
        exit(1);
    }
    // Initialize the vocabulary size
    vocab_size = 0;
    // Put </s> at the very front of the vocabulary
    AddWordToVocab((char *) "</s>");
    // Process the training file
    while (1) {
        // Read one word from the file
        ReadWord(word, fin);
        if (feof(fin)) break;
        // Bump the total word count and report training progress
        train_words++;
        if ((debug_mode > 1) && (train_words % 100000 == 0)) {
            printf("%lldK%c", train_words / 1000, 13);
            fflush(stdout);
        }
        // Look the word up in the vocabulary
        i = SearchVocab(word);
        // If absent, add it (creating its hash slot and setting its count to 1);
        // otherwise bump its count
        if (i == -1) {
            a = AddWordToVocab(word);
            vocab[a].cn = 1;
        } else vocab[i].cn++;
        // If the vocabulary exceeds its limit, prune it once,
        // dropping the currently lowest-count words
        if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
    }
    // Sort the vocabulary, drop words below the min_count threshold,
    // and report the vocabulary size and total word count
    SortVocab();
    if (debug_mode > 0) {
        printf("Vocab size: %lld\n", vocab_size);
        printf("Words in train file: %lld\n", train_words);
    }
    // Record the training file's size and close the handle
    file_size = ftell(fin);
    fclose(fin);
}
// Write the words and their counts to a file
void SaveVocab() {
    long long i;
    FILE *fo = fopen(save_vocab_file, "wb");
    for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
    fclose(fo);
}
// Read words from a vocabulary file and build the vocabulary and hash table
// A vocabulary file contains no duplicate words, so unlike LearnVocabFromTrainFile
// there is no duplicate check
void ReadVocab() {
    long long a, i = 0;
    char c;
    char word[MAX_STRING];
    // Open the vocabulary file
    FILE *fin = fopen(read_vocab_file, "rb");
    if (fin == NULL) {
        printf("Vocabulary file not found\n");
        exit(1);
    }
    // Initialize the hash table
    for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
    vocab_size = 0;
    // Process the vocabulary file
    while (1) {
        // Read one word from the file
        ReadWord(word, fin);
        if (feof(fin)) break;
        // Add it to the vocabulary (creating its hash slot) and take its count from the file
        a = AddWordToVocab(word);
        fscanf(fin, "%lld%c", &vocab[a].cn, &c);
        i++;
    }
    // Sort the vocabulary, drop words below the min_count threshold,
    // and report the vocabulary size and total word count
    SortVocab();
    if (debug_mode > 0) {
        printf("Vocab size: %lld\n", vocab_size);
        printf("Words in train file: %lld\n", train_words);
    }
    // Open the training file and seek to its end to record its size
    fin = fopen(train_file, "rb");
    if (fin == NULL) {
        printf("ERROR: training data file not found!\n");
        exit(1);
    }
    fseek(fin, 0, SEEK_END);
    file_size = ftell(fin);
    // Close the handle
    fclose(fin);
}
// Initialize the network
void InitNet() {
    long long a, b;
    unsigned long long next_random = 1;
    // syn0 holds the word vectors; allocate vocab_size * layer1_size reals,
    // 128-byte aligned, via posix_memalign (layer1_size is the vector dimensionality)
    a = posix_memalign((void **) &syn0, 128, (long long) vocab_size * layer1_size * sizeof(real));
    if (syn0 == NULL) {
        printf("Memory allocation failed\n");
        exit(1);
    }
    // Hierarchical softmax
    if (hs) {
        // syn1 holds the vectors of the Huffman tree's non-leaf nodes; allocate it
        a = posix_memalign((void **) &syn1, 128, (long long) vocab_size * layer1_size * sizeof(real));
        if (syn1 == NULL) {
            printf("Memory allocation failed\n");
            exit(1);
        }
        // Initialize syn1 to 0
        for (a = 0; a < vocab_size; a++)
            for (b = 0; b < layer1_size; b++) syn1[a * layer1_size + b] = 0;
    }
    // If negative sampling is used, allocate syn1neg,
    // the per-word auxiliary vectors, and zero it
    if (negative > 0) {
        a = posix_memalign((void **) &syn1neg, 128, (long long) vocab_size * layer1_size * sizeof(real));
        if (syn1neg == NULL) {
            printf("Memory allocation failed\n");
            exit(1);
        }
        // Initialize syn1neg to 0
        for (a = 0; a < vocab_size; a++)
            for (b = 0; b < layer1_size; b++) syn1neg[a * layer1_size + b] = 0;
    }
    // Initialize each component of the word vectors syn0 to a random value
    // in the range [-0.5, 0.5]/layer1_size
    for (a = 0; a < vocab_size; a++)
        for (b = 0; b < layer1_size; b++) {
            next_random = next_random * (unsigned long long) 25214903917 + 11;
            syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real) 65536) - 0.5) / layer1_size;
        }
    // Build the Huffman binary tree
    CreateBinaryTree();
}
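/*
 * Side note (mine): next_random = next_random * 25214903917 + 11 is a linear
 * congruential generator (the multiplier is the one java.util.Random uses).
 * Taking the low 16 bits and dividing by 65536 yields a value in [0, 1), so
 * with layer1_size = 10 each component of syn0 starts in [-0.05, 0.05).
 */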
// The thread function: the core of the training algorithm.
// It assumes vocabulary sorting, Huffman tree construction and Huffman code
// assignment have all completed before it runs.
void *TrainModelThread(void *id) {
    long long a, b, d;
    // cw: number of words in the window, the center word excluded
    long long cw;
    // word: while extracting a sentence, the current word's index in the vocabulary
    // last_word: window-scanning helper, the context word currently being scanned
    // sentence_length: length of the sentence being processed
    // sentence_position: position of the current word within that sentence
    long long word, last_word, sentence_length = 0, sentence_position = 0;
    // word_count: how much of the corpus this thread has trained on so far
    // last_word_count: the value of word_count at the previous progress report
    long long word_count = 0, last_word_count = 0;
    // sen: the sentence currently read from the file, as vocabulary indices
    long long sen[MAX_SENTENCE_LENGTH + 1];
    // l1: in skip-gram, the offset into syn0 of the current word's vector
    // l2: the offset into syn1 or syn1neg of the interior-node or negative-sampling vector
    // target: the current sample in negative sampling; label: that sample's label
    long long l1, l2, c, target, label, local_iter = iter;
    // next_random: helper state for generating random numbers
    unsigned long long next_random = (long long) id;
    real f, g;
    clock_t now;
    // neu1: the input vector; in CBOW the sum of the context(w) word vectors,
    // in skip-gram the center word's vector
    real *neu1 = (real *) calloc(layer1_size, sizeof(real));
    // neu1e: the accumulated error term
    real *neu1e = (real *) calloc(layer1_size, sizeof(real));
    FILE *fi = fopen(train_file, "rb");
    // Each thread works on its own slice of the text; seek to this thread's start position
    // (file_size was recorded earlier by LearnVocabFromTrainFile or ReadVocab)
    fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
    // Main loop
    while (1) {
        // Report progress every ~10 trained words (10000 in the original word2vec)
        if (word_count - last_word_count > 10) {
            // word_count_actual is the total number of words processed across all threads
            word_count_actual += word_count - last_word_count;
            last_word_count = word_count;
            if ((debug_mode > 1)) {
                now = clock();
                // Report the current learning rate alpha; the overall progress
                // (words trained / (iterations * corpus word count + 1));
                // and the per-thread words-per-second throughput
                printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
                       word_count_actual / (real) (iter * train_words + 1) * 100,
                       word_count_actual / ((real) (now - start + 1) / (real) CLOCKS_PER_SEC * 1000));
                fflush(stdout);
            }
            // Starting from the initial learning rate, decay the current rate as the number of
            // trained words grows (adaptive learning-rate adjustment),
            // but never let it fall below starting_alpha * 0.0001
            alpha = starting_alpha * (1 - word_count_actual / (real) (iter * train_words + 1));
            if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
        }
        // Pull one sentence from the training data; sentences are separated by newlines
        if (sentence_length == 0) {
            while (1) {
                // Read one word and store its vocabulary index in word
                word = ReadWordIndex(fi);
                if (feof(fi)) break;
                if (word == -1) continue;
                word_count++;
                // Index 0 is </s>: a newline was read, so the sentence ends
                if (word == 0) break;
                // Randomly subsample frequent words: discarding some of them makes the vectors
                // of rare words more accurate and speeds up training;
                // it can be viewed as a smoothing step
                if (sample > 0) {
                    real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
                    next_random = next_random * (unsigned long long) 25214903917 + 11;
                    // Keep the word with probability ran, i.e., drop it with probability 1-ran
                    if (ran < (next_random & 0xFFFF) / (real) 65536) continue;
                }
                sen[sentence_length] = word;
                sentence_length++;
                // Truncate sentences that exceed the maximum length
                if (sentence_length >= MAX_SENTENCE_LENGTH) break;
            }
            // Start at the head of the sentence
            sentence_position = 0;
        }
        // If this thread has processed more words than its share, start a new iteration;
        // once the iteration budget is used up, stop
        if (feof(fi) || (word_count > train_words / num_threads)) {
            word_count_actual += word_count - last_word_count;
            local_iter--;
            if (local_iter == 0) break;
            word_count = 0;
            last_word_count = 0;
            sentence_length = 0;
            fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
            continue;
        }
        // Fetch the current word
        word = sen[sentence_position];
        if (word == -1) continue;
        // Reset the input vector
        for (c = 0; c < layer1_size; c++) neu1[c] = 0;
        // Reset the accumulated error
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // Draw a random number b in [0, window-1] that shrinks the effective context(w) window
        // (presumably to speed up training; it also makes nearer context words count more often)
        next_random = next_random * (unsigned long long) 25214903917 + 11;
        b = next_random % window;
        /******** CBOW model: the word vectors of the window around a word predict the center word itself ********/
        if (cbow) {
            cw = 0;
            // The window is [sentence_position - window + b, sentence_position + window - b],
            // so its total width is 2*window - 2*b + 1
            for (a = b; a < window * 2 + 1 - b; a++)
                if (a != window) {  // skip the center word: it is the prediction target, only context is gathered
                    c = sentence_position - window + a;
                    if (c < 0) continue;
                    if (c >= sentence_length) continue;
                    // sen holds each sentence word's index in the vocabulary
                    last_word = sen[c];
                    if (last_word == -1) continue;
                    // Sum the word vectors in the window
                    for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
                    // Count the valid words actually inside the window
                    cw++;
                }
            if (cw) {
                // Average the summed vectors
                for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
                // Hierarchical softmax: follow the Huffman path from the root to this word's leaf,
                // visiting every interior node along the way
                if (hs) for (d = 0; d < vocab[word].codelen; d++) {
                    f = 0;
                    // l2: offset in syn1 of the current interior node's vector
                    l2 = vocab[word].point[d] * layer1_size;
                    // f: inner product of the input vector neu1 and the interior node's vector
                    for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
                    // Skip f if it falls outside the sigmoid table's range;
                    // otherwise apply the sigmoid via the table
                    if (f <= -MAX_EXP) continue;
                    else if (f >= MAX_EXP) continue;
                    else f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
                    // g is the gradient times the learning rate: the larger the rate, the larger
                    // the penalty for a misclassification and the larger the correction to the
                    // interior vector. NOTE: word2vec defines nodes with Huffman bit 1 as the
                    // negative class and bit 0 as the positive class, i.e., a node's label is 1 - code
                    g = (1 - vocab[word].code[d] - f) * alpha;
                    // Accumulate the error from g and the interior node's vector
                    for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
                    // Update the interior node's vector from g and the input vector.
                    // Intuition: if vocab[word].code[d] is 1 (negative class), the label is 1-1=0;
                    // the sigmoid output lies in (0,1), above the label, so the node vector must
                    // shrink, and indeed g = (label - f)*alpha is then negative, shrinking it by
                    // an amount proportional to how far the sigmoid output strays from the label
                    for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
                }
                // Negative sampling: iterate over all samples
                // (1 positive sample plus negative negative samples)
                if (negative > 0) for (d = 0; d < negative + 1; d++) {
                    if (d == 0) {
                        // The first pass handles the target word itself, the positive sample
                        target = word;
                        label = 1;
                    } else {
                        // Draw a negative sample from the energy table
                        next_random = next_random * (unsigned long long) 25214903917 + 11;
                        target = table[(next_random >> 16) % table_size];
                        if (target == 0) target = next_random % (vocab_size - 1) + 1;
                        if (target == word) continue;
                        label = 0;
                    }
                    // With negative sampling each word owns an auxiliary vector in syn1neg;
                    // l2 is the offset of the target word's auxiliary vector
                    l2 = target * layer1_size;
                    f = 0;
                    // f: inner product of the input vector neu1 and the auxiliary vector
                    for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
                    // g = (label - sigmoid(f)) * alpha, clamped at the table's edges
                    if (f > MAX_EXP) g = (label - 1) * alpha;
                    else if (f < -MAX_EXP) g = (label - 0) * alpha;
                    else g = (label - expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
                    // Accumulate the error from g and the auxiliary vector
                    for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
                    // Update the auxiliary vector from g and the input vector
                    for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
                }
                // Propagate the accumulated error to every word vector in context(w)
                for (a = b; a < window * 2 + 1 - b; a++)
                    if (a != window) {
                        c = sentence_position - window + a;
                        if (c < 0) continue;
                        if (c >= sentence_length) continue;
                        last_word = sen[c];
                        if (last_word == -1) continue;
                        for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
                    }
            }
        }
        /******** skip-gram model: the center word predicts its context ********/
        else {
            // Every word in context(w) must be predicted, so loop over the whole
            // 2*window - 2*b + 1 window, skipping the center word
            for (a = b; a < window * 2 + 1 - b; a++)
                if (a != window) {
                    c = sentence_position - window + a;
                    if (c < 0) continue;
                    if (c >= sentence_length) continue;
                    // last_word: the context word currently being predicted
                    last_word = sen[c];
                    if (last_word == -1) continue;
                    // l1: offset in syn0 of that word's vector
                    l1 = last_word * layer1_size;
                    // Reset the accumulated error
                    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
                    // Hierarchical softmax: follow the Huffman path from the root to this word's leaf
                    if (hs) for (d = 0; d < vocab[word].codelen; d++) {
                        f = 0;
                        l2 = vocab[word].point[d] * layer1_size;
                        // NOTE: this exploits the model symmetry p(u|w) = p(w|u), where w is the
                        // center word and u any word in context(w): although skip-gram nominally
                        // predicts the context from the center, training actually predicts the
                        // center from the context. Unlike CBOW, u is a single word's vector here
                        // rather than the window sum; otherwise the flow matches CBOW's hs branch
                        for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
                        if (f <= -MAX_EXP) continue;
                        else if (f >= MAX_EXP) continue;
                        else f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
                        g = (1 - vocab[word].code[d] - f) * alpha;
                        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
                        for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
                    }
                    // Negative sampling: 1 positive sample plus negative negative samples;
                    // the flow matches CBOW's ns branch and also relies on the model symmetry
                    if (negative > 0) for (d = 0; d < negative + 1; d++) {
                        if (d == 0) {
                            target = word;
                            label = 1;
                        } else {
                            next_random = next_random * (unsigned long long) 25214903917 + 11;
                            target = table[(next_random >> 16) % table_size];
                            if (target == 0) target = next_random % (vocab_size - 1) + 1;
                            if (target == word) continue;
                            label = 0;
                        }
                        l2 = target * layer1_size;
                        f = 0;
                        for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
                        if (f > MAX_EXP) g = (label - 1) * alpha;
                        else if (f < -MAX_EXP) g = (label - 0) * alpha;
                        else g = (label - expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
                        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
                        for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
                    }
                    // Apply the accumulated error to the context word's vector
                    for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
                }
        }
        // One word trained; move one position forward in the sentence
        sentence_position++;
        // When a sentence is finished, reset its length and loop back
        // to read the next sentence word by word
        if (sentence_position >= sentence_length) {
            sentence_length = 0;
            continue;
        }
    }
    fclose(fi);
    free(neu1);
    free(neu1e);
    pthread_exit(NULL);
}
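/*
 * A worked number for the subsampling step above (mine, based on the sample
 * input): ran = 1/sqrt(r) + 1/r with r = cn / (sample * train_words). Here
 * train_words = 34 and sample = 1e-3, so for bb (cn = 8) r ~ 235 and
 * ran ~ 0.07: bb is kept only ~7% of the time. On a toy corpus this small,
 * subsampling discards almost everything; realistic corpora (or sample = 0
 * to disable it) are needed for meaningful vectors.
 */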
// The complete model-training pipeline
void TrainModel() {
    long a, b, c, d;
    FILE *fo;
    // Create num_threads worker threads
    pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
    printf("Starting training using file %s\n", train_file);
    // Remember the initial learning rate
    starting_alpha = alpha;
    // If a vocabulary file was given, build the vocabulary and hash table from it;
    // otherwise build them from the training file
    printf("read_vocab_file:%d\t", read_vocab_file[0]);
    if (read_vocab_file[0] != 0) ReadVocab();
    else LearnVocabFromTrainFile();
    // Optionally write the vocabulary words and counts to a file
    if (save_vocab_file[0] != 0) SaveVocab();
    if (output_file[0] == 0) return;
    // Initialize the training network
    InitNet();
    // Negative sampling needs the energy table
    if (negative > 0) InitUnigramTable();
    // Start the clock and launch the training threads
    start = clock();
    for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *) (intptr_t) a);
    for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
    fo = fopen(output_file, "wb");
    // classes == 0: write every word vector to the output file
    if (classes == 0) {
        fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
        for (a = 0; a < vocab_size; a++) {
            fprintf(fo, "%s ", vocab[a].word);
            if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
            else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
            fprintf(fo, "\n");
        }
    }
    // classes != 0: run K-means on the word vectors and write word classes instead;
    // classes is the number of clusters to produce
    else {
        // clcn: number of clusters; iter: number of K-means iterations;
        // closeid: scratch variable, the cluster currently closest to a word
        int clcn = classes, iter = 10, closeid;
        // centcn: number of words in each cluster
        int *centcn = (int *) malloc(classes * sizeof(int));
        // cl: the cluster each word belongs to
        int *cl = (int *) calloc(vocab_size, sizeof(int));
        // x: inner product of a word vector and a cluster centroid (larger means closer);
        // closev: the largest such inner product, i.e., the nearest cluster
        real closev, x;
        // cent: the centroid vector of each cluster
        real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
        // Assign every word to a cluster arbitrarily to start
        for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
        // Iterate iter times in total
        for (a = 0; a < iter; a++) {
            // Zero the centroid vectors
            for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
            // Initialize each cluster's word count to 1
            for (b = 0; b < clcn; b++) centcn[b] = 1;
            // Sum the vectors of the words currently assigned to each cluster
            // and count each cluster's members
            for (c = 0; c < vocab_size; c++) {
                for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
                centcn[cl[c]]++;
            }
            for (b = 0; b < clcn; b++) {
                closev = 0;
                for (c = 0; c < layer1_size; c++) {
                    // Average to get each cluster's centroid
                    cent[layer1_size * b + c] /= centcn[b];
                    // closev accumulates the squared 2-norm of the centroid
                    closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
                }
                // Take the square root to get the centroid's 2-norm,
                // then normalize the centroid by it
                closev = sqrt(closev);
                for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
            }
            // Walk the vocabulary and reassign every word to its nearest cluster
            for (c = 0; c < vocab_size; c++) {
                closev = -10;
                closeid = 0;
                for (d = 0; d < clcn; d++) {
                    x = 0;
                    // Inner product of the word vector and the normalized centroid;
                    // the larger the product, the closer the two points, so pick the
                    // cluster whose centroid scores highest against this word
                    for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
                    if (x > closev) {
                        closev = x;
                        closeid = d;
                    }
                }
                cl[c] = closeid;
            }
        }
        // After several iterations the assignments converge toward the right clusters;
        // write the K-means result to the output file
        for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
        free(centcn);
        free(cent);
        free(cl);
    }
    fclose(fo);
}
// Locate a command-line option; print a message and exit if its value is missing
int ArgPos(char *str, int argc, char **argv) {
    int a;
    for (a = 1; a < argc; a++)
        if (!strcmp(str, argv[a])) {
            if (a == argc - 1) {
                printf("Argument missing for %s\n", str);
                exit(1);
            }
            return a;
        }
    return -1;
}

void prepare() {
    int i;
    vocab = (struct vocab_word *) calloc(vocab_max_size, sizeof(struct vocab_word));
    vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
    printf("%d", vocab_hash[0]);
    expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
    for (i = 0; i < EXP_TABLE_SIZE; i++) {
        expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);  // precompute the exp() table
        expTable[i] = expTable[i] / (expTable[i] + 1);                     // precompute sigmoid as x / (x + 1)
    }
}
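/*
 * How the table is used during training (reference note, mine): for
 * f in (-MAX_EXP, MAX_EXP), sigmoid(f) is approximated by
 * expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))], where the
 * constant folds to the integer 83. For example, f = 0 maps to slot
 * (0 + 6) * 83 = 498, and expTable[498] = sigmoid((498/1000*2 - 1) * 6)
 * = sigmoid(-0.024) ~ 0.494, close to the exact value 0.5.
 */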
int main(int argc, char **argv) {
    int i;
    prepare();
    // File paths are hard-coded here; the original command-line parsing is kept below, commented out
    strcpy(train_file, "record/input.txt");
    strcpy(save_vocab_file, "record/vocab.txt");
    strcpy(output_file, "record/output.txt");
    /**
    argc = 2;
    if (argc == 1) {
        printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
        printf("Options:\n");
        printf("Parameters for training:\n");
        printf("\t-train <file>\n");
        printf("\t\tUse text data from <file> to train the model\n");
        printf("\t-output <file>\n");
        printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
        printf("\t-size <int>\n");
        printf("\t\tSet size of word vectors; default is 100\n");
        printf("\t-window <int>\n");
        printf("\t\tSet max skip length between words; default is 5\n");
        printf("\t-sample <float>\n");
        printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
        printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
        printf("\t-hs <int>\n");
        printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
        printf("\t-negative <int>\n");
        printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
        printf("\t-threads <int>\n");
        printf("\t\tUse <int> threads (default 12)\n");
        printf("\t-iter <int>\n");
        printf("\t\tRun more training iterations (default 5)\n");
        printf("\t-min-count <int>\n");
        printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
        printf("\t-alpha <float>\n");
        printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
        printf("\t-classes <int>\n");
        printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
        printf("\t-debug <int>\n");
        printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
        printf("\t-binary <int>\n");
        printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
        printf("\t-save-vocab <file>\n");
        printf("\t\tThe vocabulary will be saved to <file>\n");
        printf("\t-read-vocab <file>\n");
        printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
        printf("\t-cbow <int>\n");
        printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
        printf("\nExamples:\n");
        printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
        return 0;
    }
    output_file[0] = 0;
    save_vocab_file[0] = 0;
    read_vocab_file[0] = 0;
    if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
    if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
    if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
    if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
    if (cbow) alpha = 0.05;
    if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
    if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
    if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
    if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
    **/
    // NOTE: these allocations repeat what prepare() already did (leaking the first set);
    // left in place to match the original post
    vocab = (struct vocab_word *) calloc(vocab_max_size, sizeof(struct vocab_word));
    vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
    expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
    for (i = 0; i < EXP_TABLE_SIZE; i++) {
        expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);  // precompute the exp() table
        expTable[i] = expTable[i] / (expTable[i] + 1);                     // precompute sigmoid as x / (x + 1)
    }
    TrainModel();
    return 0;
}
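A note on building (my setup assumption, not stated in the original post): with a MinGW-w64 GCC toolchain, whose winpthreads library provides pthread.h and whose C runtime declares _aligned_malloc in <malloc.h>, the file should compile with something like

    gcc word2vec.c -o word2vec.exe -O2 -lm -lpthread

Create the record/ directory with input.txt inside it before running, since the file paths are hard-coded in main().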
Input data (record/input.txt)
bb cc
bb
dd ee
bb
cc ac
bb cc ee
bb cc
ac bb
ee xx
bb
ac cc
ee bb
vocab.txt
</s> 12
bb 8
cc 5
ee 4
ac 3
xx 1
dd 1
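As a sanity check, these counts match the input: bb occurs 8 times, cc 5, ee 4, ac 3, and xx and dd once each, while </s> is counted once per line (12 lines).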
output.txt
7 10
</s> 0.040027 0.044194 -0.038303 -0.032780 0.013666 0.030211 0.009409 0.002113 -0.036035 0.022185
bb -0.043564 0.012495 -0.007513 -0.009572 -0.033157 -0.018822 0.025793 0.030254 0.029691 0.015974
cc 0.015448 -0.038026 -0.040958 0.049696 0.038013 0.030901 -0.006039 0.040157 -0.004950 0.007347
ee -0.001492 -0.029832 0.013123 -0.013374 -0.038254 0.047542 0.043793 -0.010951 -0.002261 0.005092
ac -0.036377 -0.040071 0.045547 0.000630 -0.025824 -0.030421 -0.030765 0.016969 0.002014 0.013310
xx -0.042136 -0.038078 -0.001300 0.011436 0.025497 -0.031700 0.040796 0.009270 0.011197 -0.006084
dd 0.029865 -0.022878 -0.020975 0.021584 -0.007532 0.010307 0.018045 -0.040886 -0.019830 0.029137
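The first line of output.txt holds vocab_size and layer1_size (7 words, 10 dimensions); each following line is a word and its vector. As a usage example, here is a minimal standalone reader (my sketch, not part of the original post; the hard-coded path record/output.txt mirrors main() above) that loads this text format and prints the cosine similarity of two words:
【code】
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#define MAX_W 100

int main(int argc, char **argv) {
    if (argc < 3) { printf("usage: %s word1 word2\n", argv[0]); return 1; }
    // Assumed path, matching the hard-coded output_file in the trainer above
    FILE *f = fopen("record/output.txt", "r");
    if (f == NULL) { printf("output file not found\n"); return 1; }
    long long words, dims, i, j;
    fscanf(f, "%lld %lld", &words, &dims);  // header: vocab_size layer1_size
    char word[MAX_W];
    double *v1 = NULL, *v2 = NULL;
    double *vec = (double *) malloc(dims * sizeof(double));
    // Scan every line; keep the vectors of the two requested words
    for (i = 0; i < words; i++) {
        fscanf(f, "%99s", word);
        for (j = 0; j < dims; j++) fscanf(f, "%lf", &vec[j]);
        if (!strcmp(word, argv[1])) { v1 = (double *) malloc(dims * sizeof(double)); memcpy(v1, vec, dims * sizeof(double)); }
        if (!strcmp(word, argv[2])) { v2 = (double *) malloc(dims * sizeof(double)); memcpy(v2, vec, dims * sizeof(double)); }
    }
    fclose(f);
    if (v1 == NULL || v2 == NULL) { printf("word not found\n"); return 1; }
    // Cosine similarity: dot product over the product of the 2-norms
    double dot = 0, n1 = 0, n2 = 0;
    for (j = 0; j < dims; j++) { dot += v1[j] * v2[j]; n1 += v1[j] * v1[j]; n2 += v2[j] * v2[j]; }
    printf("cos(%s, %s) = %f\n", argv[1], argv[2], dot / (sqrt(n1) * sqrt(n2)));
    free(vec); free(v1); free(v2);
    return 0;
}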