






运行环境:PyCharm,Python 3.8



文件命名为 cosin_computer.py

import math
import re
from collections import Counter

# Matches every character that is not an ASCII letter; compiled once so the
# per-token cleanup does not look the pattern up again on each call.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')


def _word_counts(text: str) -> Counter:
    """Return the frequency of each normalised word in *text*.

    The text is split on the single space character only. Every token then
    has all non-ASCII-letter characters removed and is lower-cased; tokens
    that end up empty are discarded. Because tabs, newlines, digits and
    punctuation are removed rather than treated as separators, ``"a-b"`` and
    ``"a\\nb"`` each count as the single word ``"ab"``.
    """
    cleaned = (_NON_LETTERS.sub('', token).lower() for token in text.split(' '))
    return Counter(word for word in cleaned if word)


def compute_cosine(text_a: str, text_b: str) -> float:
    """Return the cosine similarity of two texts' word-frequency vectors.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        The similarity rounded to two decimal places, or ``0.0`` when either
        text contains no usable word (its vector has zero length).
    """
    counts_a = _word_counts(text_a)
    counts_b = _word_counts(text_b)

    # Only words present in both texts contribute to the dot product.
    shared_words = counts_a.keys() & counts_b.keys()
    dot_product = sum(counts_a[word] * counts_b[word] for word in shared_words)

    norm_a = math.sqrt(sum(n * n for n in counts_a.values()))
    norm_b = math.sqrt(sum(n * n for n in counts_b.values()))

    try:
        return round(float(dot_product) / (norm_a * norm_b), 2)
    except ZeroDivisionError:
        # At least one text produced an empty vector.
        return 0.0


主函数 main.py

import os
from Py20201027_TextProcess.cosin_computer import compute_cosine #根据自己的情况做调整
# Load the citation export and read every line into memory; the context
# manager closes the handle as soon as the lines have been read.
with open('E:/F01_Researches/R01_ProductServiceSystem/PSS09-会议文献/CIRP Conference on Industrial Product-Service Systems/11th_ScienceDirect_citations.txt','r',encoding='UTF-8') as file:
    lines = file.readlines()
line_count = len(lines)

# Preview: print every 11th line starting at index 1, stopping after the
# first index above 100.
# NOTE(review): the stride of 11 presumably matches the number of lines per
# citation record, with the title on the record's second line - confirm
# against the export format.
for i in range(1, line_count, 11):
    print(i, '-->', lines[i])
    if i > 100:
        break

file_root = 'E:/F01_Researches/R01_ProductServiceSystem/PSS09-会议文献/CIRP Conference on Industrial Product-Service Systems'

# List the files to match the titles against.
files = os.listdir(os.path.join(file_root, '11th'))

# File-name preprocessing: split on '-', drop the last segment, and join the
# remaining segments with spaces.
new_files = [' '.join(name.split('-')[:-1]) for name in files]

# Line preprocessing: replace '-' with a space so hyphenated compounds are
# tokenised the same way as the file names above.
new_lines = [line.replace('-', ' ') for line in lines]

# Similarity search: for every title line, score it against each
# preprocessed file name and report the best match.
# NOTE: if the directory is empty, files[max_str_index] below raises
# IndexError.
for count, i in enumerate(range(1, line_count, 11), start=1):
    print('第', count, '-->', lines[i])  # show the title being matched
    max_sim = 0.
    max_str_index = 0
    print('\t\tSim: ', end='\t')
    for j, candidate in enumerate(new_files):
        # Compare the preprocessed line, not the raw one, so both sides of
        # the comparison have had their hyphens turned into spaces.
        sim = compute_cosine(new_lines[i], candidate)
        print(sim, end='\t')
        if sim > max_sim:
            max_sim = sim
            max_str_index = j
    print('\n\t\t相似度最高的为:', files[max_str_index], '\t最大相似度为:', max_sim)



