Python比较文本相似度的7种方法（详细）

1词袋模型

import jieba
from gensim import corpora
from gensim import models
from gensim import similarities
#from corpora.corpus import Corpus
# 1. Tokenisation
# 1.1 Tokenise the historical (reference) documents.
# NOTE(review): `location_list` is not defined in this snippet; it is used
# here as an iterable of document strings -- confirm against the caller.
# jieba.cut(doc) can be swapped in for jieba.cut_for_search(doc) to try the
# other tokenisation mode the author experimented with.
all_location_list = [list(jieba.cut_for_search(doc)) for doc in location_list]

# 1.2 Tokenise the test document.
doc_test = "A市A市经济学院体育学院"
doc_test_list = list(jieba.cut_for_search(doc_test))

# 2. Build the corpus
# 2.1 Build the dictionary (the bag of words).
dictionary = corpora.Dictionary(all_location_list)

# 2.2 Convert every historical document, and the test document, into a
# bag-of-words vector of (token_id, count) pairs.
corpus = [dictionary.doc2bow(doc) for doc in all_location_list]
doc_test_vec = dictionary.doc2bow(doc_test_list)

# 3. Similarity analysis
# 3.1 Fit a TF-IDF model on the corpus and weight the test document with it.
tfidf = models.TfidfModel(corpus)
doc_test_tfidf = tfidf[doc_test_vec]

# 3.2 Index the TF-IDF-weighted corpus and score the test document against
# every historical document.
index = similarities.SparseMatrixSimilarity(
    tfidf[corpus], num_features=len(dictionary.token2id)
)
sim = index[doc_test_tfidf]

# 3.3 Rank the documents by similarity, most similar first.  The result is
# printed so it is visible when this runs as a script, not only in a notebook.
ranked = sorted(enumerate(sim), key=lambda item: -item[1])
print(ranked)

2TF-IDF

import jieba
from gensim import corpora, models, similarities

# NOTE(review): `location_list` and `biaoge2` are not defined in this snippet.
# `location_list` is used as a sequence of document strings and `biaoge2` as a
# DataFrame with a '问题ID' column -- confirm against the caller.
all_location_list = [list(jieba.cut_for_search(doc)) for doc in location_list]

# Build the corpus: dictionary first, then one bag-of-words vector per document.
dictionary = corpora.Dictionary(all_location_list)
corpus = [dictionary.doc2bow(doc) for doc in all_location_list]
# Fit a TF-IDF model on the corpus.
tfidf = models.TfidfModel(corpus)
# Number of features = number of distinct tokens in the dictionary.
featureNUM = len(dictionary.token2id.keys())
# Transform the whole corpus with TF-IDF and index it for similarity queries.
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=featureNUM)

# dictionary.doc2bow(doc) turns a tokenised document into a sparse vector such
# as [(0, 1), (1, 1)]: the tokens with ids 0 and 1 each occur once and every
# other token is absent.
doc_test = 'A市A市魅力之城商铺无排烟管道，小区'
doc_test_list = list(jieba.cut_for_search(doc_test))
# Bag-of-words vector of the test document.
new_vec = dictionary.doc2bow(doc_test_list)
# Similarity of the TF-IDF-weighted test document to every indexed document.
sim = index[tfidf[new_vec]]
print(sim)

# Group documents under a shared '问题ID': each document in turn is compared
# with the whole corpus, and every row that has no ID yet and a non-zero
# similarity to it inherits its ID.
# NOTE(review): "no ID yet" is a plain truthiness test, so 0/None/'' count as
# missing while NaN counts as present -- confirm how missing IDs are stored.
for i, doc_test in enumerate(location_list):
    if not biaoge2.loc[i, '问题ID']:
        # No ID yet: allocate the next unused one.
        p = biaoge2['问题ID'].max() + 1
        biaoge2.loc[i, '问题ID'] = p
    w1_ID = biaoge2.loc[i, '问题ID']

    doc_test_list = list(jieba.cut_for_search(doc_test))
    new_vec = dictionary.doc2bow(doc_test_list)
    sim = index[tfidf[new_vec]]

    for j in range(len(biaoge2)):
        # Rows that already carry an ID are left alone.
        if not biaoge2.loc[j, '问题ID'] and sim[j]:
            biaoge2.loc[j, '问题ID'] = w1_ID

3余弦相似度

import jieba
import re
import numpy as np
import os
import pandas as pd #os.chdir(r'C:\Users\Lenovo\Desktop\01040730kg73')
# NOTE(review): the working directory is a machine-specific absolute path;
# adjust it (or pass a full path to read_excel) before running elsewhere.
os.chdir(r'C:\Users\Administrator\Desktop\示例数据')
data4 = pd.read_excel('4.xlsx')
data4_message = data4['详情']
data4_answer = data4['意见']
message_list = list(data4_message)
# Data cleaning (desensitisation)
def qingli(s):
    """Strip digits, whitespace, punctuation and asterisks from every entry.

    Args:
        s: An object with an ``apply`` method (the callers pass a pandas
            Series); each element is converted with ``str()`` before cleaning.

    Returns:
        The result of ``s.apply`` -- one cleaned string per element.
    """
    punctuation = """，！？｡＂#＄％＆＇（）＊＋－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏"""
    # The original masked digits, whitespace and punctuation with '*' in three
    # passes and then deleted every '*'.  Deleting all four classes in a single
    # pass produces the same strings.
    unwanted = re.compile(r"[0-9\s*" + re.escape(punctuation) + "]")
    return s.apply(lambda x: unwanted.sub('', str(x)))
# Count (below): takes one message, counts its keywords and their frequencies,
# and returns them as a list.
def Count(infile):
    """Tokenise one message with jieba and count each token's occurrences.

    Args:
        infile: The text to tokenise; it is passed straight to ``jieba.lcut``.

    Returns:
        A list of ``(token, count)`` tuples sorted by count, highest first.
        Tokens with equal counts keep the order in which they first appeared.
    """
    counts = {}
    for word in jieba.lcut(infile):
        if word != "":
            counts[word] = counts.get(word, 0) + 1
    # sorted() is stable, so ties stay in first-seen (dict insertion) order.
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)
# Merge the keywords of two documents.
def MergeWord(T1, T2):
    """Return the combined vocabulary of two term-frequency lists.

    Args:
        T1: Sequence of entries whose first item is a word (e.g. the
            ``(word, count)`` tuples returned by ``Count``).
        T2: Same shape as ``T1``.

    Returns:
        A list with every word of ``T1`` in order, followed by the words of
        ``T2`` that have not been seen yet, in order.
    """
    merged = [entry[0] for entry in T1]
    # A set gives O(1) membership tests; the list keeps the output order.
    seen = set(merged)
    for entry in T2:
        word = entry[0]
        if word not in seen:
            seen.add(word)
            merged.append(word)
    return merged
# Build the document vector.
def CalVector(T1, MergeWord):
    """Project a term-frequency list onto a vocabulary.

    Args:
        T1: Sequence of ``(word, frequency)`` entries.
        MergeWord: Ordered list of vocabulary words.

    Returns:
        A list as long as ``MergeWord`` holding, at each word's first position
        in the vocabulary, that word's frequency from ``T1`` (0 if absent).
    """
    # Map each vocabulary word to the first index at which it appears.
    first_position = {}
    for idx, vocab_word in enumerate(MergeWord):
        first_position.setdefault(vocab_word, idx)

    TF1 = [0] * len(MergeWord)
    for entry in T1:
        idx = first_position.get(entry[0])
        if idx is not None:
            TF1[idx] = entry[1]
    return TF1


def CalConDis(v1, v2, lengthVector):
    """Print and return the cosine similarity of two vectors.

    Args:
        v1: First vector (indexable, numeric).
        v2: Second vector (indexable, numeric).
        lengthVector: Number of leading components to use from each vector.

    Returns:
        The cosine similarity as a float.  If either vector has zero length
        (for example, a message that was empty after cleaning) the similarity
        is reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    # math.sqrt replaces np.math.sqrt: the np.math alias is gone in NumPy 2.0.
    import math

    positions = range(lengthVector)
    dot = sum(v1[i] * v2[i] for i in positions)
    norm1 = math.sqrt(sum(v1[i] * v1[i] for i in positions))
    norm2 = math.sqrt(sum(v2[i] * v2[i] for i in positions))
    denominator = norm1 * norm2
    similarity = float(dot) / denominator if denominator else 0.0
    print('留言和回复的相似度 = ' + format(similarity, ".3f"))
    return similarity


# Clean both columns once up front; the cleaned values do not change between
# iterations, so there is no need to redo this for every row.
D_message = qingli(data4_message)
D_answer = qingli(data4_answer)
for i in range(len(data4_message)):
    # Term frequencies of the message and of its reply.
    T_message = Count(D_message[i])
    T_answer = Count(D_answer[i])
    # Shared vocabulary of the two texts.
    mergeword = MergeWord(T_message, T_answer)
    # Vectorise both texts over that vocabulary.
    V_message = CalVector(T_message, mergeword)
    V_answer = CalVector(T_answer, mergeword)
    # Cosine similarity: the closer to 1, the more similar the two texts.
    print('第'+str(i)+'条')
    CalConDis(V_message, V_answer, len(V_message))

4Python自带比较相似度函数

import difflib
def string_similar(s1, s2):
    """Return difflib's quick similarity estimate for two sequences.

    The value comes from ``SequenceMatcher.quick_ratio()``, which the difflib
    documentation describes as an upper bound on ``ratio()``.
    """
    matcher = difflib.SequenceMatcher(None, s1, s2)
    return matcher.quick_ratio()
# Print the difflib similarity of every message to its reply.
for i in range(len(data4_message)):
    s1 = data4_message[i]
    s2 = data4_answer[i]
    print(string_similar(s1, s2))

5word2vec

import re


def qingli(s):
    """Strip digits, whitespace, punctuation and asterisks from every entry.

    Earlier experiments (stripping URLs/@mentions and masking letters) were
    left disabled by the author and are not applied here.

    Args:
        s: An object with an ``apply`` method (the callers pass a pandas
            Series); each element is converted with ``str()`` before cleaning.

    Returns:
        The result of ``s.apply`` -- one cleaned string per element.
    """
    punctuation = """，！？｡＂#＄％＆＇（）＊＋－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏"""
    # The original overwrote the whitespace-stripped result with one built from
    # the pre-whitespace data, so whitespace was never removed.  All unwanted
    # character classes are now deleted together in one pass.
    unwanted = re.compile(r"[0-9\s*" + re.escape(punctuation) + "]")
    return s.apply(lambda x: unwanted.sub('', str(x)))


# NOTE(review): `data4_all_message` is not defined anywhere in the visible
# code; `data4_message` may have been intended -- confirm.
data4_message_qingli = qingli(data4_all_message)
data4_answer_qingli = qingli(data4_answer)
# `+` joins the cleaned message and the cleaned reply of each row into one
# string (assuming both are pandas Series of str -- confirm).
data4_all_message_qingli = data4_message_qingli + data4_answer_qingli

def stopwordslist(filepath):
    """Read a GB18030-encoded file and return its lines, stripped, as a list.

    Args:
        filepath: Path of the stop-word file; each line becomes one entry.

    Returns:
        A list with one stripped string per line of the file.
    """
    # `with` closes the file even if decoding fails part-way through.
    with open(filepath, 'r', encoding='GB18030') as handle:
        return [line.strip() for line in handle]


stopwords = stopwordslist("stopword.txt")


def preprocess_text_unsupervised(content_lines, sentences):
    """Tokenise each line and append its filtered token list to ``sentences``.

    Tokens of length 1 and tokens found in the module-level ``stopwords`` are
    dropped.  A line that cannot be tokenised is printed and skipped.

    Args:
        content_lines: Iterable of text lines.
        sentences: List that is extended in place, one token list per line.
    """
    # Set membership is O(1); the stop-word list is scanned only once.
    stopword_set = set(stopwords)
    for line in content_lines:
        try:
            segs = [
                word
                for word in jieba.cut(line)
                if len(word) > 1 and word not in stopword_set
            ]
        except Exception:
            # Deliberate best effort: report the offending line and carry on.
            print(line)
            continue
        sentences.append(segs)
# Build the unsupervised training data: one filtered token list per input line.
sentences = []
preprocess_text_unsupervised(data4_all_message_qingli, sentences)
# Notebook-style display of the result; this has no effect in a plain script.
sentences

# Train a skip-gram (sg=1) Word2Vec model on the tokenised sentences.
# * `models` (from `from gensim import ... models ...`) is used because the
#   bare name `gensim` is never imported in this file.
# * The vector dimension is left at gensim's default of 100: the keyword is
#   `size` before gensim 4 and `vector_size` from gensim 4 on, so omitting it
#   keeps the call valid on both while producing the same 100-d vectors.
model = models.Word2Vec(sentences, min_count=1, sg=1, window=5)
# Look up neighbours through `model.wv`; calling most_similar on the model
# itself no longer works in gensim 4.
print(model.wv.most_similar(['管理']))

# Stop words must be removed for this to give meaningful results!
def vector_similarity(s1, s2):
    """Return the cosine similarity of two sentences' mean word vectors.

    Each sentence is tokenised with jieba; single-character tokens, stop words
    and duplicate tokens are dropped, and the remaining tokens' Word2Vec
    vectors (from the module-level ``model``) are averaged.

    Args:
        s1: First sentence.
        s2: Second sentence.

    Returns:
        The cosine similarity of the two averaged vectors, or 0.0 when either
        sentence has no usable token (nothing left after filtering, or no
        token known to the model).
    """
    vectors = model.wv  # model[word] is not available in gensim 4
    stopword_set = set(stopwords)

    def sentence_vector(s):
        # The author also tried jieba.analyse.extract_tags(s, allowPOS=...)
        # here instead of plain tokenisation.
        words = {word for word in jieba.lcut(s) if len(word) > 1} - stopword_set
        # Skip tokens the model has never seen instead of raising KeyError;
        # sorting makes the summation order (and so the result) deterministic.
        known = sorted(word for word in words if word in vectors)
        v = np.zeros(vectors.vector_size)
        if not known:
            return v
        for word in known:
            v += vectors[word]
        v /= len(known)
        return v

    v1, v2 = sentence_vector(s1), sentence_vector(s2)
    # `norm` was never imported in the original; use numpy's directly.
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator


s1 = data4_message_qingli[1]
s2 = data4_answer_qingli[1]
s3 = '您好，由于本人爱人身份证过期，回I6市办了临时身份证，正式身份证要1个月后才能拿到，现在又办不了加急，医院不给办出生证明，必须要正式身份证才给办理，但是小孩刚出生，因黄旦太高住院花了不少钱，急着办落地险，希望能报销一部分，现在医院不给办出生证明无法办理新生儿落地险，等正式身份证拿到，已然过了办理落地险的时间，我很疑惑，临时身份证效力等同正式身份证，信息一样可以手动录入，为什么就是不给办理？'
print(vector_similarity(s1, s2))

6JS距离

import string
from io import StringIO
from math import log
import numpy as np


def KLD(p, q):
    """Return the Kullback-Leibler divergence of *p* from *q*, in bits.

    Every entry of both sequences must be strictly positive, because the
    base-2 logarithm of each entry is taken.
    """
    return sum(_p * log(_p, 2) - _p * log(_q, 2) for _p, _q in zip(p, q))


def JSD_core(p, q):
    """Return the Jensen-Shannon divergence of two distributions, in bits.

    Args:
        p: Sequence of probabilities.
        q: Sequence of probabilities, aligned position by position with ``p``.

    Returns:
        ``0.5 * KLD(p, M) + 0.5 * KLD(q, M)`` where ``M`` is the midpoint of
        ``p`` and ``q``.

    Raises:
        ValueError: If there is no position where ``p`` or ``q`` is non-zero.
    """
    # Drop positions where both probabilities are zero.  (The original used a
    # tuple-parameter lambda here, which is a syntax error on Python 3.)
    pairs = [(x, y) for x, y in zip(p, q) if x != 0 or y != 0]
    if not pairs:
        raise ValueError(
            "JSD_core needs at least one position where p or q is non-zero"
        )
    # Nudge every value by machine epsilon so log() never receives 0.
    eps = float(np.spacing(1))
    p = [x + eps for x, _ in pairs]
    q = [y + eps for _, y in pairs]
    M = [0.5 * (x + y) + eps for x, y in pairs]
    return 0.5 * KLD(p, M) + 0.5 * KLD(q, M)


def reg(x, alphabet=string.ascii_lowercase):
    """Return how many times each symbol of *alphabet* occurs in *x*.

    By default only the lowercase ASCII letters are counted.
    """
    return [x.count(symbol) for symbol in alphabet]


def rate(y, alphabet=string.ascii_lowercase):
    """Return the relative frequency of each *alphabet* symbol in *y*.

    Frequencies are rounded to 4 decimal places.  If *y* contains none of the
    symbols, a list of zeros is returned instead of dividing by zero.
    """
    counts = reg(y, alphabet)
    total = sum(counts)
    if total == 0:
        return [0.0] * len(counts)
    return [round(count * 1.0 / total, 4) for count in counts]
# Compare the letter distributions of the second message and its reply.
# NOTE(review): rate() as defined in this file counts only the letters a-z, so
# text without ASCII lowercase letters yields no usable distribution -- confirm
# this is the intended input.
# For a quick ASCII-only check, try s1='ahaebssa' and s2='awohwsess' instead.
s1, s2 = data4_message[1], data4_answer[1]
print(JSD_core(*map(rate, (s1, s2))))

import numpy as np
import scipy.stats
# Example distributions.
p = np.asarray([0.65, 0.25, 0.07, 0.03])
q = np.array([0.6, 0.25, 0.1, 0.05])
q2 = np.array([0.1, 0.2, 0.3, 0.4])


def JS_divergence(p, q):
    """Return the Jensen-Shannon divergence of two 1-D distributions.

    Args:
        p: Non-negative weights (array or plain sequence).
        q: Non-negative weights of the same length as ``p``.

    Returns:
        ``0.5 * KL(p || M) + 0.5 * KL(q || M)`` with ``M = (p + q) / 2``,
        computed with ``scipy.stats.entropy`` (natural logarithm by default).
    """
    # Accept plain lists too; `(p + q) / 2` on lists would concatenate them.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # Normalise before forming the midpoint.  entropy() rescales each of its
    # arguments separately, so for unnormalised inputs with different totals
    # the midpoint of the raw weights is not the midpoint of the distributions.
    p = p / p.sum()
    q = q / q.sum()
    M = (p + q) / 2
    return 0.5 * scipy.stats.entropy(p, M) + 0.5 * scipy.stats.entropy(q, M)


print(JS_divergence(p, q))   # about 0.003094
print(JS_divergence(p, q2))  # about 0.247192
print(JS_divergence(p, p))   # 0.0

7simtext（参考#https://www.colabug.com/2020/0419/7278348/amp/）

# simtext similarity:
# simtext reports four text-similarity measures for a pair of documents:
#     Sim_Cosine   cosine similarity
#     Sim_Jaccard  Jaccard similarity
#     Sim_MinEdit  minimum edit distance
#     Sim_Simple   the "track changes" measure from Microsoft Word
from simtext import similarity

# Print simtext's result for every message/reply pair.
for i in range(len(data4_message)):
    text1 = data4_message[i]
    text2 = data4_answer[i]
    sim = similarity()
    res = sim.compute(text1, text2)
    print('第'+str(i)+'条')
    print(res)

Python比较文本相似度的7种方法（详细）相关推荐

中文文本关键词抽取的三种方法-python
利用Python实现中文文本关键词抽取的三种方法转自github 文本关键词抽取,是对文本信息进行高度凝练的一种有效手段,通过3-5个词语准确概括文本的主题,帮助读者快速理解文本信息.目前,用于文本 ...
python sklearn.neural_network.MLPClassifier() 神经网络改变模型复杂度的四种方法
MLPClassifier() 改变模型复杂度的四种方法调整神经网络每一个隐藏层上的节点数调节神经网络隐藏层的层数调节activation的方式通过调整alpha值来改变模型正则化的程度(增大 ...
基于Python实现中文文本关键词抽取的三种方法课程报告+项目源码及数据
资源下载地址:https://download.csdn.net/download/sheziqiong/85737856 资源下载地址:https://download.csdn.net/downl ...
python使用教程cmd啥意思-对python中执行DOS命令的3种方法总结
1. 使用os.system("cmd") 特点是执行的时候程序会打出cmd在Linux上执行的信息. import os os.system("ls") 2. ...
python运行命令_对python中执行DOS命令的3种方法总结
1. 使用os.system("cmd") 特点是执行的时候程序会打出cmd在Linux上执行的信息. import os os.system("ls") 2. ...
python运行方法_对python中执行DOS命令的3种方法总结
1. 使用os.system("cmd") 特点是执行的时候程序会打出cmd在Linux上执行的信息. import os os.system("ls") 2. ...
python 行情数据,拼多多股票：Python获取股票行情数据的一种方法
Python获取股票行情数据的一种方法拼多多股票本号帮大家找了一个可免费获取股票行情数据的接口. Tushare社区目前主要维护新版本:tushare pro,数据更稳定拼多多股票质量更高,可获取 ...
python csv库,Python 中导入csv数据的三种方法
Python 中导入csv数据的三种方法,具体内容如下所示: 1.通过标准的Python库导入CSV文件: Python提供了一个标准的类库CSV文件.这个类库中的reader()函数用来导入CSV文 ...
python打开文件不存在-Python判断文件是否存在的三种方法
原标题:Python判断文件是否存在的三种方法通常在读写文件之前,需要判断文件或目录是否存在,不然某些处理方法可能会使程序出错.所以最好在做任何操作之前,先判断文件是否存在. 这里将介绍三种判断文件 ...

Python比较文本相似度的7种方法（详细）

Python比较文本相似度的7种方法（详细）相关推荐

最新文章

热门文章