wordseg.py :

Reference: http://www.matrix67.com/blog/archives/5044
import re
import math
import json


def entropyOfList(ls):
    """Return the Shannon entropy (natural logarithm) of the items in ``ls``.

    Each distinct item is counted and the result is
    ``sum(-p[i] * log(p[i]))`` over the empirical distribution. It is used
    for the left/right neighbour entropy of a candidate word.

    @param ls list of hashable items (e.g. neighbouring characters)
    @return the entropy; 0 for an empty list
    """
    counts = {}
    for item in ls:
        counts[item] = counts.get(item, 0) + 1
    total = float(len(ls))
    return sum([-c / total * math.log(c / total) for c in counts.values()])


def genSubparts(string):
    """Partition a string into all possible two parts.

    e.g. given "abcd", generate [("a", "bcd"), ("ab", "cd"), ("abc", "d")].
    For a string of length 1 (or empty), return an empty list.
    """
    return [(string[:i], string[i:]) for i in range(1, len(string))]


def indexOfSortedSuffix(doc, max_word_len):
    """Return (start, end) index pairs of every candidate word in ``doc``.

    Every substring of length 1..``max_word_len`` is a candidate. The pairs
    are returned sorted by the text of the substring they delimit.

    @param doc the document string
    @param max_word_len maximum length of a candidate word
    """
    length = len(doc)
    indexes = [
        (i, j)
        for i in range(length)
        for j in range(i + 1, min(i + 1 + max_word_len, length + 1))
    ]
    return sorted(indexes, key=lambda i_j: doc[i_j[0]:i_j[1]])


class WordInfo(object):
    """Store information of each word, including its frequency, left
    neighbors and right neighbors.

    Note: ``left`` and ``right`` hold lists of neighbouring characters until
    ``compute`` is called; ``compute`` replaces them with entropy values.
    """

    def __init__(self, text):
        super(WordInfo, self).__init__()
        self.text = text
        self.num = 0
        self.freq = 0.0
        self.left = []
        self.right = []
        self.aggregation = 0

    def update(self, left, right):
        """Increase frequency of this word, then append left/right neighbors.

        @param left a single character on the left side of this word
                    (an empty string is ignored)
        @param right as left is, but on the right side
        """
        self.num += 1
        if left:
            self.left.append(left)
        if right:
            self.right.append(right)

    def compute(self, length):
        """Compute frequency and entropy of this word.

        After this call ``left`` and ``right`` are entropy numbers rather
        than neighbour lists, so ``update`` must not be called again.

        @param length length of the document for training to get words
        """
        self.freq = float(self.num) / length
        self.left = entropyOfList(self.left)
        self.right = entropyOfList(self.right)

    def computeAggregation(self, words_dict):
        """Compute the aggregation (cohesion) score of this word.

        The score is the minimum, over every two-part split of the text, of
        ``freq(word) / (freq(part1) * freq(part2))``. Single-character words
        have no split and keep their current aggregation.

        @param words_dict frequency dict of all candidate words
        """
        parts = genSubparts(self.text)
        if parts:
            self.aggregation = min(
                self.freq / (words_dict[head].freq * words_dict[tail].freq)
                for head, tail in parts
            )


class WordSegment(object):
    """Main class for Chinese word segmentation.

    1. Generate words from a long enough document
    2. Do the segmentation work with the document
    """

    # if a word is combination of other shorter words, then treat it as a long word
    L = 0
    # if a word is combination of other shorter words, then treat it as the set of shortest words
    S = 1
    # if a word contains other shorter words, then return all possible results
    ALL = 2

    def __init__(self, doc, max_word_len=5, min_num=3, min_freq=0.00005,
                 min_entropy=2.0, min_aggregation=50):
        """Extract candidate words from ``doc`` and keep those passing the thresholds.

        All thresholds are exclusive lower bounds (a word must be strictly
        greater than each of them).

        @param doc the training document
        @param max_word_len maximum length of a candidate word
        @param min_num threshold on the occurrence count
        @param min_freq threshold on the relative frequency
        @param min_entropy threshold on both left and right neighbour entropy
        @param min_aggregation threshold on the aggregation score
        """
        super(WordSegment, self).__init__()
        self.max_word_len = max_word_len
        self.min_num = min_num
        self.min_freq = min_freq
        self.min_entropy = min_entropy
        self.min_aggregation = min_aggregation
        self.word_infos = self.genWords(doc)

        # Result information, i.e., average data of all words.
        infos = self.word_infos
        self.avg_len = self._mean([len(w.text) for w in infos])
        self.avg_freq = self._mean([w.freq for w in infos])
        self.avg_left_entropy = self._mean([w.left for w in infos])
        self.avg_right_entropy = self._mean([w.right for w in infos])
        self.avg_aggregation = self._mean([w.aggregation for w in infos])

        # Filter out the results that satisfy all the requirements.
        self.word_with_freq = [
            (w.text, w.freq, w.num) for w in infos if self._passes_thresholds(w)
        ]
        self.words = [entry[0] for entry in self.word_with_freq]

        # Every candidate (unfiltered), ranked by the user score of sort_word.
        self.word_with_user_sort = sorted(infos, key=self.sort_word, reverse=True)
        self.words_with_user_sort = [
            (w.text, self.sort_word(w)) for w in self.word_with_user_sort
        ]

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or 0.0 when it is empty."""
        # An empty document yields no candidates; avoid dividing by zero.
        if not values:
            return 0.0
        return sum(values) / float(len(values))

    def _passes_thresholds(self, info):
        """Tell whether a multi-character candidate exceeds every threshold."""
        return (
            len(info.text) > 1
            and info.aggregation > self.min_aggregation
            and info.num > self.min_num
            and info.freq > self.min_freq
            and info.left > self.min_entropy
            and info.right > self.min_entropy
        )

    def sort_word(self, word):
        """Return the ranking score: min(left, right entropy) * aggregation."""
        return min(word.left, word.right) * (word.aggregation)

    def genWords(self, doc):
        """Generate all candidate words with their frequency/entropy/aggregation
        information.

        @param doc the document used for words generation
        @return list of WordInfo sorted by descending frequency
        """
        # Runs of whitespace, digits, ASCII letters and punctuation are
        # collapsed to a single space before candidates are collected.
        pattern = re.compile('[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
        doc = re.sub(pattern, ' ', doc)
        suffix_indexes = indexOfSortedSuffix(doc, self.max_word_len)

        # Compute frequency and neighbors.
        word_cands = {}
        for start, end in suffix_indexes:
            word = doc[start:end]
            if word not in word_cands:
                word_cands[word] = WordInfo(word)
            # At the document edges these slices are empty strings, which
            # WordInfo.update ignores.
            word_cands[word].update(doc[start - 1:start], doc[end:end + 1])

        # Compute probability and entropy.
        length = len(doc)
        for info in word_cands.values():
            info.compute(length)

        values = sorted(word_cands.values(), key=lambda x: len(x.text))
        for info in values:
            # Single characters and candidates spanning a separator get no
            # aggregation score.
            if len(info.text) == 1 or ' ' in info.text:
                continue
            info.computeAggregation(word_cands)
        return sorted(values, key=lambda v: v.freq, reverse=True)

    def segSentence(self, sentence, method=ALL):
        """Segment a sentence with the words generated from a document.

        @param sentence the sentence to be handled
        @param method segmentation method: ``L`` prefers the longest known
                      word at each position, ``S`` the shortest known word
                      of length >= 2, and any other value (``ALL``) emits
                      every known word starting at each position
        @return list of segments
        """
        # Build the lookup set once per call; self.words is a list, so
        # testing each slice against it directly would be a linear scan.
        vocab = set(self.words)
        total = len(sentence)
        res = []
        i = 0
        while i < total:
            if method == self.L or method == self.S:
                if method == self.L:
                    j_range = list(range(self.max_word_len, 0, -1))
                else:
                    j_range = list(range(2, self.max_word_len + 1)) + [1]
                for j in j_range:
                    # Length 1 is the fallback, so the loop always advances.
                    if j == 1 or sentence[i:i + j] in vocab:
                        res.append(sentence[i:i + j])
                        i += j
                        break
            else:
                to_inc = 1
                for j in range(2, self.max_word_len + 1):
                    if i + j <= total and sentence[i:i + j] in vocab:
                        res.append(sentence[i:i + j])
                        # Advance by the shortest word matched here.
                        if to_inc == 1:
                            to_inc = j
                if to_inc == 1:
                    res.append(sentence[i])
                i += to_inc
        return res


from wordseg import *
# Example driver: build a WordSegment from a document, print the average
# statistics of all candidate words, then collect the positively scored ones.
# NOTE(review): `file` is not defined in the visible source — presumably an
# open text-file handle is expected here; confirm against the full script.
# `ls` is not used by any of the visible lines below.
ls=[x.strip() for x in file.readlines()]
# NOTE(review): `content` is not defined in the visible source either —
# presumably the document text (possibly derived from `ls`); confirm.
ws = WordSegment(content, max_word_len=5, min_aggregation=1, min_entropy=0.5,min_num=1)
print('average len: ', ws.avg_len)
print('average frequency: ', ws.avg_freq)
print('average left entropy: ', ws.avg_left_entropy)
print('average right entropy: ', ws.avg_right_entropy)
print('average aggregation: ', ws.avg_aggregation)
# words_with_user_sort holds (text, score) pairs, so len(x)>1 is true for
# every entry; the effective filter is score > 0. Only the text is kept.
res=[x[0] for x in ws.words_with_user_sort if len(x)>1 and x[1]>0]

