python 新词发现

wordseg.py :

# coding=utf-8
"""
基于信息熵和互信息非监督中文分词
Reference: http://www.matrix67.com/blog/archives/5044
"""
import re
import math
import json


def entropyOfList(ls):
    """Return the Shannon entropy (natural logarithm) of the items in ``ls``.

    Every distinct item is counted and the result is
    ``sum(-p[i] * log(p[i]))`` where ``p[i]`` is the relative frequency of the
    i-th distinct item. It is used to compute the left/right neighbor entropy
    of a candidate word.

    @param ls list of hashable items (here: single neighbor characters)
    @return the entropy; 0 for an empty list
    """
    counts = {}
    for item in ls:
        counts[item] = counts.get(item, 0) + 1
    total = float(len(ls))
    # With an empty list there are no counts, so `total` is never divided by.
    return sum(-c / total * math.log(c / total) for c in counts.values())


def genSubparts(string):
    """Split a string into every possible (prefix, suffix) pair.

    Given "abcd" this returns [("a", "bcd"), ("ab", "cd"), ("abc", "d")].
    A string of length 0 or 1 yields an empty list.
    """
    return [(string[:cut], string[cut:]) for cut in range(1, len(string))]


def indexOfSortedSuffix(doc, max_word_len):
    """List the (start, end) indexes of every candidate word in ``doc``.

    All substrings of length 1..max_word_len are enumerated; the index pairs
    are returned sorted by the substring they denote (stable, so equal
    substrings keep their order of appearance).

    @param doc the document string
    @param max_word_len maximum length of a candidate word
    @return list of (start, end) tuples usable as ``doc[start:end]``
    """
    length = len(doc)
    indexes = [
        (start, end)
        for start in range(length)
        for end in range(start + 1, min(start + 1 + max_word_len, length + 1))
    ]
    return sorted(indexes, key=lambda span: doc[span[0]:span[1]])


class WordInfo:
    """Statistics of one candidate word.

    Holds the occurrence count, relative frequency, left/right neighbors and
    the aggregation score of the word.

    NOTE: ``left`` and ``right`` are lists of neighbor characters while
    occurrences are being collected with ``update``; ``compute`` replaces both
    with their entropy (a number). ``compute`` must therefore be called only
    once per instance.
    """

    def __init__(self, text):
        super().__init__()
        self.text = text
        self.num = 0
        self.freq = 0.0
        self.left = []
        self.right = []
        self.aggregation = 0

    def update(self, left, right):
        """Record one more occurrence of this word and its neighbors.

        @param left a single character on the left side of this word
                    (empty string when there is none)
        @param right as left is, but on the right side
        """
        self.num += 1
        if left:
            self.left.append(left)
        if right:
            self.right.append(right)

    def compute(self, length):
        """Compute frequency and neighbor entropies of this word.

        @param length length of the document the words were collected from
        """
        self.freq = float(self.num) / length
        # From here on `left`/`right` hold entropies, not neighbor lists.
        self.left = entropyOfList(self.left)
        self.right = entropyOfList(self.right)

    def computeAggregation(self, words_dict):
        """Compute the aggregation (cohesion) score of this word.

        The score is the minimum, over all two-part splits of the word, of
        ``freq(word) / (freq(part1) * freq(part2))``. Words that cannot be
        split (length < 2) keep their current score.

        @param words_dict dict mapping every candidate word text to its
                          WordInfo (with ``freq`` already computed)
        """
        parts = genSubparts(self.text)
        if parts:
            self.aggregation = min(
                self.freq / (words_dict[head].freq * words_dict[tail].freq)
                for head, tail in parts
            )


class WordSegment:
    """Main class for Chinese word segmentation.

    1. Generate words from a long enough document
    2. Do the segmentation work with the document
    """

    # if a word is combination of other shorter words, then treat it as a long word
    L = 0
    # if a word is combination of other shorter words, then treat it as the set of shortest words
    S = 1
    # if a word contains other shorter words, then return all possible results
    ALL = 2

    # Runs of whitespace, digits, ASCII letters and ASCII/CJK punctuation;
    # each run is collapsed into a single space before candidates are built.
    # Compiled once instead of on every genWords() call.
    _SEPARATOR_PATTERN = re.compile('[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z，。《》、？：；“”‘’｛｝【】（）…￥！—┄－]+')

    def __init__(self, doc, max_word_len=5, min_num=3, min_freq=0.00005, min_entropy=2.0, min_aggregation=50):
        """Collect candidate words from ``doc`` and select the accepted ones.

        @param doc the training document
        @param max_word_len maximum length of a candidate word
        @param min_num a word must occur more than this many times
        @param min_freq a word's relative frequency must exceed this value
        @param min_entropy both neighbor entropies must exceed this value
        @param min_aggregation the aggregation score must exceed this value
        """
        super().__init__()
        self.max_word_len = max_word_len
        self.min_num = min_num
        self.min_freq = min_freq
        self.min_entropy = min_entropy
        self.min_aggregation = min_aggregation
        self.word_infos = self.genWords(doc)

        # Result information, i.e., average data of all candidate words.
        infos = self.word_infos
        self.avg_len = self._mean([len(w.text) for w in infos])
        self.avg_freq = self._mean([w.freq for w in infos])
        self.avg_left_entropy = self._mean([w.left for w in infos])
        self.avg_right_entropy = self._mean([w.right for w in infos])
        self.avg_aggregation = self._mean([w.aggregation for w in infos])

        # Keep only the multi-character candidates exceeding every threshold.
        self.word_with_freq = [
            (w.text, w.freq, w.num) for w in infos if self._accepts(w)
        ]
        self.words = [entry[0] for entry in self.word_with_freq]

        # All candidates ranked by the custom score of sort_word().
        self.word_with_user_sort = sorted(infos, key=self.sort_word, reverse=True)
        self.words_with_user_sort = [
            (w.text, self.sort_word(w)) for w in self.word_with_user_sort
        ]

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or 0.0 when it is empty."""
        # An empty document produces no candidates; avoid ZeroDivisionError.
        return sum(values) / float(len(values)) if values else 0.0

    def _accepts(self, info):
        """Tell whether a candidate word exceeds all configured thresholds."""
        return (
            len(info.text) > 1
            and info.aggregation > self.min_aggregation
            and info.num > self.min_num
            and info.freq > self.min_freq
            and info.left > self.min_entropy
            and info.right > self.min_entropy
        )

    def sort_word(self, word):
        """Score a word as its smaller neighbor entropy times its aggregation."""
        return min(word.left, word.right) * (word.aggregation)

    def genWords(self, doc):
        """Generate all candidate words with their frequency/entropy/aggregation.

        @param doc the document used for words generation
        @return list of WordInfo sorted by descending frequency
        """
        doc = self._SEPARATOR_PATTERN.sub(' ', doc)
        word_cands = {}

        # compute frequency and neighbors
        for start, end in indexOfSortedSuffix(doc, self.max_word_len):
            word = doc[start:end]
            info = word_cands.get(word)
            if info is None:
                info = word_cands[word] = WordInfo(word)
            # For start == 0 the slice doc[-1:0] is empty: no left neighbor.
            info.update(doc[start - 1:start], doc[end:end + 1])

        # compute probability and entropy
        length = len(doc)
        for info in word_cands.values():
            info.compute(length)

        values = sorted(word_cands.values(), key=lambda info: len(info.text))
        for info in values:
            # Single characters and candidates spanning a separator get no
            # aggregation score (it stays 0).
            if len(info.text) == 1 or ' ' in info.text:
                continue
            info.computeAggregation(word_cands)
        return sorted(values, key=lambda info: info.freq, reverse=True)

    def segSentence(self, sentence, method=ALL):
        """Segment a sentence with the words generated from a document.

        @param sentence the sentence to be handled
        @param method segmentation method: L (prefer the longest word),
                      S (prefer the shortest word) or ALL (emit every word
                      starting at a position, then advance by the shortest)
        @return list of segments
        """
        # Set membership is O(1); `self.words` itself stays a list.
        vocabulary = set(self.words)
        total = len(sentence)
        max_len = self.max_word_len

        if method == self.L or method == self.S:
            # Length 1 is always tried last as the fallback, which also
            # guarantees progress when max_word_len < 1.
            if method == self.L:
                lengths = list(range(max_len, 1, -1)) + [1]
            else:
                lengths = list(range(2, max_len + 1)) + [1]
        else:
            lengths = list(range(2, max_len + 1))

        i = 0
        res = []
        while i < total:
            if method == self.L or method == self.S:
                for j in lengths:
                    # Ignore lengths running past the end of the sentence.
                    if j == 1 or (i + j <= total and sentence[i:i + j] in vocabulary):
                        res.append(sentence[i:i + j])
                        i += j
                        break
            else:
                to_inc = 1
                for j in lengths:
                    if i + j <= total and sentence[i:i + j] in vocabulary:
                        res.append(sentence[i:i + j])
                        # Advance by the shortest matching word.
                        if to_inc == 1:
                            to_inc = j
                if to_inc == 1:
                    res.append(sentence[i])
                i += to_inc
        return res

main.py

# Driver script: learn candidate words from the first line of text2.txt and
# print the corpus statistics plus the ranked candidates.
from wordseg import WordSegment

# Only the first line of the (GBK-encoded) file is used as the document.
# The context manager closes the file even if decoding fails.
with open('text2.txt', encoding='gbk', mode='r') as file:
    content = file.readline().strip()
if not content:
    # Fail with a clear message instead of an error deep in the library.
    raise SystemExit('text2.txt has no text on its first line')

ws = WordSegment(content, max_word_len=5, min_aggregation=1, min_entropy=0.5, min_num=1)
print('average len: ', ws.avg_len)
print('average frequency: ', ws.avg_freq)
print('average left entropy: ', ws.avg_left_entropy)
print('average right entropy: ', ws.avg_right_entropy)
print('average aggregation: ', ws.avg_aggregation)
print(ws.words_with_user_sort)
print(len(ws.word_with_user_sort))
# Keep multi-character candidates with a positive score. The length check is
# on the word text; the (text, score) tuple itself always has length 2.
res = [text for text, score in ws.words_with_user_sort if len(text) > 1 and score > 0]
print(res)

python 新词发现相关推荐

Python自然语言处理相,新词发现，主题模型，隐马尔模型词性标注，Word2Vec，情感分析...
向AI转型的程序员都关注了这个号 机器学习AI算法工程公众号:datayx 代码环境:python --version 3.5.2 tensorflow keras 代码 ...
python | 高效统计语言模型kenlm：新词发现、分词、智能纠错
之前看到苏神[重新写了之前的新词发现算法:更快更好的新词发现]中提到了kenlm,之前也自己玩过,没在意,现在遇到一些大规模的文本问题,模块确实好用,前几天还遇到几个差点"弃疗"的 ...
python函数封装计算n_python | 高效使用统计语言模型kenlm：新词发现、分词、智能纠错等...
py-kenlm-model python | 高效使用统计语言模型kenlm:新词发现.分词.智能纠错等之前看到苏神[重新写了之前的新词发现算法:更快更好的新词发现]中提到了kenlm,之前也自己 ...
python实现词语填空_python简单实现新词发现
基于新信息熵的新词发现原理<互联网时代的社会语言学:基于SNS的文本数据挖掘>这篇文章已经讲得非常清楚了,在这里主要是通过代码复现这篇文章. 实现的模块主要分为四个部分:从文章中提取所有可 ...
Python代码发现链表中的环并输出环中的第一个元素
Python代码发现链表中的环并输出环中的第一个元素 # Python代码发现链表中的环并输出环中的第一个元素 # Find first node of loop in a linked list # ...
无监督构建词库：更快更好的新词发现算法
作者丨苏剑林单位丨追一科技研究方向丨NLP,神经网络个人主页丨kexue.fm 新词发现是 NLP 的基础任务之一,主要是希望通过无监督发掘一些语言特征(主要是统计特征),来判断一批语料中哪些字 ...
新词发现：中文新词识别技术简介
一.前言新词识别,也可称为未登录词识别,严格来说,新词是指随时代发展而新出现或旧词新用的词,如:给力.山寨等:而未登录词是在词典中未存在的词,但实际使用中,两者并没有严格的区分,下文均以新词指代. ...
互信息和左右熵的新词发现（笔记）
推荐:http://spaces.ac.cn/archives/3491/ http://www.matrix67.com/blog/archives/5044 http://www.hankcs.c ...
切切切词！新词发现算法TopWORDS的原理及实现｜实在智能AI+RPA学院
切切切词!新词发现算法TopWORDS的原理及实现|实在智能AI+RPA学院一.介绍 TopWORDS [参考文献1]是发表在PNAS的一种新词发现算法,它在没有任何先验知识的条件下,快速地从大规模 ...
NLP：自然语言处理技术之词语级别相关术语解释(如上位词/WordNet)、基于词汇层面的词法分析六大任务(分词/词性标注/词干提取-词形还原/新词发现/形态分析/拼写校正)的简介及其应用
NLP:自然语言处理技术之词语级别相关术语解释(如上位词/WordNet).基于词汇层面的词法分析(Lexical Analysis)六大任务(分词/词性标注/词干提取-词形还原/新词发现/形态分析/ ...

python 新词发现

wordseg.py :

main.py

python 新词发现相关推荐

最新文章

热门文章