使用 NLTK 对文本进行清洗，索引工具

# Characters that filter_line keeps when cleaning English text.
EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz '  # space is included in whitelist

# Punctuation characters. NOTE(review): not referenced by any code visible
# in this file -- confirm whether it is still needed.
EN_BLACKLIST = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\''

# Path of the raw text corpus read by process_data.
FILENAME = 'data/chat.txt'

# Inclusive bounds on the number of space-separated tokens allowed in a
# question ('minq'/'maxq') and in an answer ('mina'/'maxa').
limit = {
    'maxq': 20,
    'minq': 0,
    'maxa': 20,
    'mina': 3,
}

# Placeholder token used for words that are missing from the vocabulary.
UNK = 'unk'
import itertools
import pickle
import random
import sys
from collections import defaultdict

import nltk
import numpy as np

# Maximum number of most-frequent words kept when the vocabulary is built.
VOCAB_SIZE = 6000


def ddefault():
    """Return the constant 1.

    NOTE(review): presumably meant as a default factory for a
    ``defaultdict``; nothing in the visible code calls it -- confirm.
    """
    return 1


# read_lines: read lines from file; return [list of lines]
def read_lines(filename):
    """Read a text file and return its lines without newline characters.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of strings, one per line. Blank lines inside the file are
        kept as empty strings; only the empty string produced by a trailing
        newline is dropped.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(filename) as f:
        lines = f.read().split('\n')
    # Drop only the artefact of a trailing newline. Slicing off the last
    # element unconditionally would lose a final line that has no newline.
    if lines and lines[-1] == '':
        lines.pop()
    return lines


# split_line: split sentences in one line into multiple lines;
# return [list of lines]
def split_line(line):
    """Split one line into pieces at every '.' character.

    Args:
        line: The text to split.

    Returns:
        The list produced by ``line.split('.')``; the separators are removed
        and empty pieces are kept.
    """
    return line.split('.')


# filter_line: remove anything that isn't in the vocabulary;
# return str(pure ta/en)
def filter_line(line, whitelist):
    """Remove every character of *line* that is not in *whitelist*.

    Args:
        line: The text to clean.
        whitelist: Container of allowed characters (tested with ``in``).

    Returns:
        A new string made of the allowed characters, in their original order.
    """
    return ''.join(ch for ch in line if ch in whitelist)


# index_: read list of words, create index-to-word and word-to-index
# dictionaries; return tuple( idx2w, w2idx, freq_dist )
def index_(tokenized_sentences, vocab_size):
    """Build the vocabulary and the index/word lookup tables.

    Args:
        tokenized_sentences: Iterable of token lists.
        vocab_size: Maximum number of most-frequent words to keep.

    Returns:
        A tuple ``(index2word, word2index, freq_dist)``:
        ``index2word`` is a list whose position 0 is ``'_'``, position 1 is
        ``UNK`` and the rest are the most common words in frequency order;
        ``word2index`` maps each word to its position in ``index2word``;
        ``freq_dist`` is the ``nltk.FreqDist`` over all tokens.
    """
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word: '_' takes index 0 (the value zero_pad pads with), UNK index 1
    index2word = ['_'] + [UNK] + [word for word, _count in vocab]
    # word2index: if a word occurs twice in index2word the later index wins
    word2index = {w: i for i, w in enumerate(index2word)}
    return index2word, word2index, freq_dist


# filter_data: filter too long and too short sequences;
# return tuple( filtered_q, filtered_a )
def filter_data(sequences, limits=None):
    """Drop question/answer pairs whose token counts fall outside the limits.

    ``sequences`` is read as alternating lines: even positions are
    questions and the following odd positions are their answers. Length is
    the number of pieces produced by splitting on a single space.

    Args:
        sequences: List of strings, question and answer alternating.
        limits: Optional dict with keys ``'minq'``, ``'maxq'``, ``'mina'``
            and ``'maxa'`` (inclusive bounds). Defaults to the module-level
            ``limit``.

    Returns:
        A tuple ``(filtered_q, filtered_a)`` of equally long lists.
    """
    bounds = limit if limits is None else limits
    filtered_q, filtered_a = [], []
    raw_data_len = len(sequences) // 2

    # zip() over the two strides ignores a trailing question that has no
    # answer, instead of raising IndexError on sequences[i + 1].
    for question, answer in zip(sequences[0::2], sequences[1::2]):
        qlen = len(question.split(' '))
        alen = len(answer.split(' '))
        if (bounds['minq'] <= qlen <= bounds['maxq']
                and bounds['mina'] <= alen <= bounds['maxa']):
            filtered_q.append(question)
            filtered_a.append(answer)

    # print the fraction of the original data, filtered
    filt_data_len = len(filtered_q)
    if raw_data_len:
        filtered = int((raw_data_len - filt_data_len) * 100 / raw_data_len)
    else:
        # No pairs at all: nothing was filtered; avoid dividing by zero.
        filtered = 0
    print(str(filtered) + '% filtered from original data')

    return filtered_q, filtered_a


# zero_pad: create the final dataset:
#   - convert list of items to arrays of indices
#   - add zero padding
# return ( array_q([indices]), array_a([indices]) )
def zero_pad(qtokenized, atokenized, w2idx):
    """Convert tokenized questions and answers into zero-padded index arrays.

    Args:
        qtokenized: List of token lists, one per question.
        atokenized: List of token lists, one per answer; indexed in step
            with ``qtokenized``.
        w2idx: Mapping from word to integer index, passed to ``pad_seq``.

    Returns:
        A tuple ``(idx_q, idx_a)`` of ``np.int32`` arrays with shapes
        ``(len(qtokenized), limit['maxq'])`` and
        ``(len(qtokenized), limit['maxa'])``.
    """
    # num of rows
    data_len = len(qtokenized)

    # numpy arrays to store indices
    idx_q = np.zeros([data_len, limit['maxq']], dtype=np.int32)
    idx_a = np.zeros([data_len, limit['maxa']], dtype=np.int32)

    for i in range(data_len):
        q_indices = pad_seq(qtokenized[i], w2idx, limit['maxq'])
        a_indices = pad_seq(atokenized[i], w2idx, limit['maxa'])

        # Row assignment requires each padded list to match the row width.
        idx_q[i] = np.array(q_indices)
        idx_a[i] = np.array(a_indices)

    return idx_q, idx_a


# pad_seq: replace words with indices in a sequence;
# replace with unknown if word not in lookup; return [list of indices]
def pad_seq(seq, lookup, maxlen):
    """Map the words of *seq* to indices and right-pad the result with zeros.

    Args:
        seq: List of words.
        lookup: Mapping from word to index; must contain ``UNK`` whenever
            *seq* holds a word that is not in the mapping.
        maxlen: Target length of the returned list.

    Returns:
        A list of indices followed by zeros up to *maxlen*. A sequence
        longer than *maxlen* is returned unpadded and untruncated.
    """
    indices = [
        lookup[word] if word in lookup else lookup[UNK]
        for word in seq
    ]
    return indices + [0] * (maxlen - len(seq))


def _print_pair_samples(questions, answers, positions=(60, 61)):
    """Print sample question/answer pairs, skipping positions out of range."""
    for pos in positions:
        if pos < len(questions) and pos < len(answers):
            print('\nq : {0} ; a : {1}'.format(questions[pos], answers[pos]))


def process_data():
    """Run the preprocessing pipeline and write the results to disk.

    Reads ``FILENAME``, lower-cases and whitelist-filters every line,
    drops pairs outside the length limits, tokenizes on spaces, builds the
    vocabulary and saves ``idx_q.npy``, ``idx_a.npy`` and ``metadata.pkl``
    in the current working directory.
    """
    print('\n>> Read lines from file')
    lines = read_lines(filename=FILENAME)

    # change to lower case (just for en)
    lines = [line.lower() for line in lines]

    print('\n:: Sample from read(p) lines')
    print(lines[121:125])

    # filter out unnecessary characters
    print('\n>> Filter lines')
    lines = [filter_line(line, EN_WHITELIST) for line in lines]
    print(lines[121:125])

    # filter out too long or too short sequences
    print('\n>> 2nd layer of filtering')
    qlines, alines = filter_data(lines)
    # Samples are skipped on small corpora rather than raising IndexError.
    _print_pair_samples(qlines, alines)

    # convert list of [lines of text] into list of [list of words]
    print('\n>> Segment lines into words')
    qtokenized = [wordlist.split(' ') for wordlist in qlines]
    atokenized = [wordlist.split(' ') for wordlist in alines]
    print('\n:: Sample from segmented list of words')
    _print_pair_samples(qtokenized, atokenized)

    # indexing -> idx2w, w2idx
    print('\n >> Index words')
    idx2w, w2idx, freq_dist = index_(qtokenized + atokenized,
                                     vocab_size=VOCAB_SIZE)

    print('\n >> Zero Padding')
    idx_q, idx_a = zero_pad(qtokenized, atokenized, w2idx)

    print('\n >> Save numpy arrays to disk')
    # save them
    np.save('idx_q.npy', idx_q)
    np.save('idx_a.npy', idx_a)

    # let us now save the necessary dictionaries
    metadata = {
        'w2idx': w2idx,
        'idx2w': idx2w,
        'limit': limit,
        'freq_dist': freq_dist,
    }

    # write to disk : data control dictionaries
    with open('metadata.pkl', 'wb') as f:
        pickle.dump(metadata, f)


def load_data(PATH=''):
    """Load the artefacts written by ``process_data``.

    Args:
        PATH: Prefix prepended verbatim to each file name; include a
            trailing path separator when pointing at a directory.

    Returns:
        A tuple ``(metadata, idx_q, idx_a)``: the unpickled metadata dict
        and the two arrays loaded from ``idx_q.npy`` and ``idx_a.npy``.
    """
    # read data control dictionaries
    # SECURITY: pickle.load can execute arbitrary code; only load
    # metadata.pkl files from a trusted source.
    with open(PATH + 'metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)

    # read numpy arrays (names match the return statement; the previous
    # idx_ta/idx_en names left idx_q/idx_a undefined and raised NameError)
    idx_q = np.load(PATH + 'idx_q.npy')
    idx_a = np.load(PATH + 'idx_a.npy')
    return metadata, idx_q, idx_a


if __name__ == '__main__':
    process_data()

使用 NLTK 对文本进行清洗，索引工具相关推荐

python nlp_【NLP】Python NLTK获取文本语料和词汇资源
作者:白宁超 2016年11月7日13:15:24 摘要:NLTK是由宾夕法尼亚大学计算机和信息科学使用python语言实现的一种自然语言工具包,其收集的大量公开数据集.模型上提供了全面.易用的接口, ...
Google开源word2vec，文本相似度计算工具
Google开源word2vec,文本相似度计算工具谷歌已经使用Deep Learning技术开发了许多新方法来解析语言,目前,谷歌开源了一款基于Deep Learning的学习工具--word2v ...
在线文本字符串批量替换工具
在线文本字符串批量替换工具在线文本字符串批量替换工具工具支持将2个数据项随机混合批量生成数据项列表,分别输入2项数据后,设置生成数量和数据项分隔符,点击生成按钮即可完成数据随机混合拼接,支持导出到 ...
在线文本按列截取工具
在线文本按列截取工具在线文本按列截取工具本工具支持按列截取文本行列表,如截取所有文本行第1至6位,则将开始截取位置设置为1,结束截取长度设置为6,如需从后面截取,请使用负数的开始截取位置. 若文本 ...
在线文本中英文数字清除工具
在线文本中英文数字清除工具在线文本中英文数字清除工具工具支持清除文本中的所有中文,英文或数字,请根据实际需要勾选清除的文本类型,数据实时处理更新.工具支持清除文本中的所有中文,英文或数字,请根据实 ...
在线文本中插入符号工具
在线文本中插入符号工具在线文本中插入符号工具工具能够在文本中任意两个字符之间都插入指定的符号,非常简单就可以完成绚丽个性的文本段落. https://tooltt.com/txt-symbol/
文本前后空格去除工具
文本前后空格去除工具文本前后空格去除工具文本前后空格去除,文本前后空格去除,文本前后空格去除,文本前后空格去除 https://tooltt.com/txt-trim/
图像文本检测的标注工具_如何检测图像中的文本
图像文本检测的标注工具 Images are a great way to communicate without text but oftentimes images are used/abused ...
结巴分词关键词相似度_中文文本相似度计算工具集
[磐创AI导读]:前两篇文章中我们介绍了一些机器学习不错的项目合集和深度学习入门资源合集,本篇文章将对中文文本相似度计算工具做一次汇总.喜欢我们文章的小伙伴,欢迎大家点击上方蓝字关注我们的公众号:磐创 ...

使用 NLTK 对文本进行清洗，索引工具

使用 NLTK 对文本进行清洗，索引工具相关推荐

最新文章

热门文章