1、在pom.xml中添加分词器与word读取依赖

<!-- IK Chinese word segmenter -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>
<!-- Lucene highlighter (required alongside the IK analyzer) -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>4.7.2</version>
</dependency>
<!-- Apache POI scratchpad: HWPF support for reading .doc files -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14-beta1</version>
</dependency>

2、jsp 使用

<button type="button" style="width: 8%;outline: none;margin-left: 1.5%;margin-bottom: 15px" onclick="ExerciseCheck()" class="btn btn-primary">作业查重</button>
<script type="text/javascript">
    /**
     * Runs the plagiarism check for the selected course/chapter/section and
     * shows the suspect list in the popup window.
     */
    function ExerciseCheck() {
        var CourseID = $("#ds_course").val();
        var ChapterID = $("#ds_cnumber").val();
        var MinChapterID = $("#ds_snumber").val();
        $.ajax({
            type: "POST",
            url: "/exercise/ExerciseRecheck",
            dataType: "json",
            data: {CourseID: CourseID, ChapterID: ChapterID, MinChapterID: MinChapterID},
            async: false,
            success: function (data) {
                // Overwrite instead of append: the original added one more
                // export link on every click.
                $("#listExport").html(
                    '<a style="width: 100%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px" href="/excel/ListExports?CourseID=' +
                    encodeURIComponent(CourseID) + '&ChapterID=' + encodeURIComponent(ChapterID) +
                    '&MinChapterID=' + encodeURIComponent(MinChapterID) +
                    '" class="btn btn-primary">导出名单</a>');
                openCheckWin();
                var view = $("#checkView");
                view.empty();
                if (data == null || data.length === 0) {
                    view.append('<li class="list-group-item" style="height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;"><span>暂无内容</span></li>');
                    return;
                }
                // dataType is "json": jQuery already parsed the response, so
                // the original eval(data) was redundant (and unsafe).
                $.each(data, function (index, item) {
                    view.append('<li class="list-group-item" style="height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;">' +
                        '<span>学号:' + item.detectionUserID + '&nbsp;&nbsp;姓名:' + item.detectionUserName + '</span></li>');
                });
            }
        });
    }

    function openCheckWin() {
        document.getElementById("CheckWin").style.display = "block";
    }

    // Fix: the 关闭 button below calls closeCheckWin(), which the original
    // page never defined.
    function closeCheckWin() {
        document.getElementById("CheckWin").style.display = "none";
    }
</script>
<div class="floatingWin" style="border-radius: 5px;margin-left: 28%;width: 40%;display: none;position: absolute;background: #FFFFFF;height: 450px;z-index: 9999" id="CheckWin">
    <div id="listExport" style="width: 13%;float: left;margin-left: 1.5%"></div>
    <button type="button" style="width: 14%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px" onclick="closeCheckWin()" class="btn btn-primary">关闭</button>
    <div class="form-group">
        <span class="text-muted" style="margin-left: 1.5%">疑似抄袭名单</span>
        <ul class="list-group" id="checkView" style="overflow: auto"></ul>
    </div>
</div>

3、controller

@ResponseBody
@RequestMapping("/ExerciseRecheck")
public List<ExerciseCheck> ExerciseRecheck(String CourseID, String ChapterID, String MinChapterID,
                                           HttpServletRequest request) throws Exception {
    // All submissions for the requested course/chapter/section.
    List<Exercise> exercises = exerciseService.QuerySectionExercise(CourseID, ChapterID, MinChapterID);
    List<ExerciseCheck> exerciseChecks = new ArrayList<ExerciseCheck>();
    if (exercises == null || exercises.size() < 2) {
        System.out.println("作业数小于2无法查重!");
        return exerciseChecks;
    }
    // Read and normalize every document ONCE.  The original re-parsed each
    // .doc from disk inside the inner loop, i.e. O(n^2) file reads.
    String[] texts = new String[exercises.size()];
    for (int i = 0; i < exercises.size(); i++) {
        texts[i] = WordRead.readWord(exercises.get(i).getChapterExercise(), request).replaceAll("\r|\n", "");
    }
    // Compare every ordered pair; both directions are reported so that each
    // student shows up on the "detection" side, matching original behavior.
    for (int i = 0; i < exercises.size(); i++) {
        for (int j = 0; j < exercises.size(); j++) {
            if (i == j) {
                continue;
            }
            double similarity = CosineSimilarAlgorithm.cosSimilarityByString(texts[i], texts[j]);
            if (similarity > 0.6) {
                ExerciseCheck ec = new ExerciseCheck();
                ec.setDetectionUserID(exercises.get(i).getUserID());
                ec.setDetectionUserName(exercises.get(i).getUserName());
                ec.setMeasuredUserID(exercises.get(j).getUserID());
                ec.setMeasuredUserName(exercises.get(j).getUserName());
                ec.setSimilarity(String.valueOf(similarity));
                exerciseChecks.add(ec); // original's add(l, ec); l++ is just an append
            }
        }
    }
    return exerciseChecks;
}

4、读取word文件内容

 package com.graduation.util;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import java.util.UUID;

import javax.servlet.http.HttpServletRequest;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

/**
 * Reads uploaded .doc exercise files via Apache POI HWPF and extracts their
 * text; picture metadata and table cells are printed for debugging.
 *
 * Fixes vs. the original: the JDK-internal com.sun ByteInputStream was
 * replaced with java.io.ByteArrayInputStream, streams are closed, and the
 * directory-creation branch in copyPic2Disk is no longer unreachable.
 */
public class WordRead {

    /**
     * Extracts the full text of an uploaded Word (.doc) file.
     *
     * @param filename file name WITHOUT the ".doc" suffix
     * @param request  used to resolve the webapp root directory on disk
     * @return the document text from {@link Range#text()}
     * @throws Exception if the file is missing or is not a valid .doc
     */
    public static String readWord(String filename, HttpServletRequest request) throws Exception {
        String path = request.getServletContext().getRealPath("");
        System.out.println(path);
        // Uploaded exercises live under <webapp>/static/exercises/<name>.doc
        String BASE_PATH = path + File.separator + "static" + File.separator + "exercises" + File.separator;
        filename = filename + ".doc";
        File file = new File(BASE_PATH + filename);
        System.out.println(BASE_PATH + filename);
        FileInputStream in = new FileInputStream(file);
        String text;
        try {
            HWPFDocument doc = new HWPFDocument(in);
            Range range = doc.getRange();
            text = range.text();
            // Dump picture metadata; the start offset matters when converting
            // the .doc into other formats.
            List<Picture> pics = doc.getPicturesTable().getAllPictures();
            for (Picture pic : pics) {
                System.out.printf("开始位置%d\t图片大小度%d,高%d,\t图片类型%s\r\n",
                        pic.getStartOffset(), pic.getWidth(), pic.getHeight(), pic.getMimeType());
            }
            copyPic2Disk(pics, new File(BASE_PATH));
            // Walk every table within the range and print its cell text.
            TableIterator tableIter = new TableIterator(range);
            while (tableIter.hasNext()) {
                Table table = tableIter.next();
                // message typo fixed: 结束为止 -> 结束位置
                System.out.printf("开始位置%d,结束位置%d\r\n", table.getStartOffset(), table.getEndOffset());
                for (int j = 0; j < table.numRows(); j++) {
                    TableRow row = table.getRow(j);
                    for (int k = 0; k < row.numCells(); k++) {
                        TableCell cell = row.getCell(k);
                        System.out.println(cell.text().trim());
                    }
                }
            }
        } finally {
            in.close(); // the original leaked this stream
        }
        return text;
    }

    /**
     * Writes an image byte array to a randomly named .jpg under {@code path}.
     *
     * @param imgByte raw image bytes
     * @param path    target directory path (must end with a separator)
     * @throws Exception on I/O failure
     */
    public static void copyByteToFile(byte[] imgByte, String path) throws Exception {
        // java.io.ByteArrayInputStream replaces the non-portable
        // com.sun.xml.internal...ByteInputStream used before.
        InputStream in = new ByteArrayInputStream(imgByte);
        String fileName = UUID.randomUUID().toString().substring(0, 6);
        OutputStream out = new FileOutputStream(new File(path + fileName + ".jpg"));
        try {
            byte[] buff = new byte[1024];
            int len;
            while ((len = in.read(buff)) > 0) {
                out.write(buff, 0, len);
            }
            out.flush();
        } finally {
            out.close(); // original never closed these on failure
            in.close();
        }
    }

    /**
     * Saves every picture of a document to {@code path} via POI's own
     * {@link Picture#writeImageContent}.
     *
     * @param pics pictures extracted from the document (may be null/empty)
     * @param path target directory; created when missing
     */
    public static void copyPic2Disk(List<Picture> pics, File path) {
        if (pics == null || pics.size() <= 0) {
            return;
        }
        // Create the directory BEFORE the isDirectory check: the original
        // threw "路径填写不正确" for any non-existent path, so its mkdirs()
        // branch could never run.
        if (!path.exists()) {
            path.mkdirs();
        }
        if (!path.isDirectory()) {
            throw new RuntimeException("路径填写不正确");
        }
        try {
            for (Picture pic : pics) {
                FileOutputStream out = new FileOutputStream(new File(path, pic.suggestFullFileName()));
                try {
                    pic.writeImageContent(out);
                } finally {
                    out.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

5、CosineSimilarAlgorithm 获取两个文件相似性

 package com.graduation.util;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Cosine similarity between two texts (or two files), using IK word counts
 * produced by {@link TfIdfAlgorithm}.
 *
 * Fix vs. the original: calculateCos paired the i-th entry of one word map
 * with the i-th entry of the other (by insertion order), so counts of
 * UNRELATED words were multiplied together.  Vectors are now matched by
 * word, which is what the cosine formula requires.
 */
public class CosineSimilarAlgorithm {

    /**
     * Cosine similarity of the word-count vectors of two files.
     *
     * @param firstFile  path to the first file (a directory per wordSegCount)
     * @param secondFile path to the second file
     * @return similarity in [0, 1]; 0 on any error
     */
    public static Double cosSimilarityByFile(String firstFile, String secondFile) {
        try {
            Map<String, Map<String, Integer>> firstTfMap = TfIdfAlgorithm.wordSegCount(firstFile);
            Map<String, Map<String, Integer>> secondTfMap = TfIdfAlgorithm.wordSegCount(secondFile);
            if (firstTfMap == null || firstTfMap.size() == 0) {
                throw new IllegalArgumentException("firstFile not found or firstFile is empty! ");
            }
            if (secondTfMap == null || secondTfMap.size() == 0) {
                throw new IllegalArgumentException("secondFile not found or secondFile is empty! ");
            }
            return calculateCos(firstTfMap.get(firstFile), secondTfMap.get(secondFile));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0d;
    }

    /**
     * Cosine similarity of the word-count vectors of two strings.
     *
     * @param first  first text
     * @param second second text
     * @return similarity in [0, 1]; 0 on any error
     */
    public static Double cosSimilarityByString(String first, String second) {
        try {
            Map<String, Integer> firstTfMap = TfIdfAlgorithm.segStr(first);
            Map<String, Integer> secondTfMap = TfIdfAlgorithm.segStr(second);
            return calculateCos(firstTfMap, secondTfMap);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0d;
    }

    /**
     * cos = (A·B) / (|A| * |B|), where the i-th component of each vector is
     * the count of the SAME word in both maps (0 when absent).
     *
     * @param first  word -> count of the first text
     * @param second word -> count of the second text
     * @return cosine of the two count vectors; 0 when either is empty
     */
    private static Double calculateCos(Map<String, Integer> first, Map<String, Integer> second) {
        if (first == null || second == null || first.isEmpty() || second.isEmpty()) {
            return 0d;
        }
        double vectorProduct = 0.00;      // A·B over the shared vocabulary
        double vectorFirstModulo = 0.00;  // |A|^2
        double vectorSecondModulo = 0.00; // |B|^2
        for (Map.Entry<String, Integer> entry : first.entrySet()) {
            double v = entry.getValue().doubleValue();
            vectorFirstModulo += v * v;
            Integer match = second.get(entry.getKey());
            if (match != null) {
                vectorProduct += v * match.doubleValue();
            }
        }
        for (Integer value : second.values()) {
            vectorSecondModulo += value.doubleValue() * value.doubleValue();
        }
        return vectorProduct / (Math.sqrt(vectorFirstModulo) * Math.sqrt(vectorSecondModulo));
    }

    public static void main(String[] args) {
        Double result = cosSimilarityByString("三网融合又可被称为“数位汇流”,是将电信网、计算机互联网和有线电视网三者互联互通,融合发展,从而为用户提供语音、数据和广播电视等服务, 伴随着通信行业加快发展,传统的三网融合已逐渐成为当前互联网发展的趋势。", "三网融合是指电信网、广播电视网、互联网在向宽带通信网、数字电视网、下一代互联网演进过程中,三大网络通过技术改造,其技术功能趋于一致,业务范围趋于相同,网络互联互通、资源共享,能为用户提供语音、数据和广播电视等多种服务。三合并不意味着三大网络的物理合一,而主要是指高层业务应用的融合。三网融合应用广泛,遍及智能交通、环境保护、政府工作、公共安全、平安家居等多个领域。以后的手机可以看电视、上网,电视可以打电话、上网,电脑也可以打电话、看电视。三者之间相互交叉,形成你中有我、我中有你的格局。");
        System.out.println(result);
    }
}

6、TfIdfAlgorithm 统计单词的TF-IDF

package com.graduation.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * TF-IDF statistics over a directory of UTF-8 text files, tokenized with the
 * IK analyzer.  Results are cached in static maps, so this class is NOT
 * thread-safe and is intended for one corpus per run.
 *
 * Fix vs. the original: idf() now divides the DOCUMENT count n by docs(w, D)
 * as its own documentation states; it previously divided by the vocabulary
 * size, which is not idf.
 */
public class TfIdfAlgorithm {

    /** Absolute paths of all files found under the corpus directory. */
    private static List<String> fileList = new ArrayList<String>();

    /** key: file path, value: word -> tf of that file. */
    private static Map<String, Map<String, Double>> allTfMap = new HashMap<String, Map<String, Double>>();

    /** key: file path, value: word -> raw occurrence count. */
    private static Map<String, Map<String, Integer>> allSegsMap = new HashMap<String, Map<String, Integer>>();

    /** key: word, value: idf = log(n / (docs(w, D) + 1)). */
    private static Map<String, Double> idfMap = new HashMap<String, Double>();

    /** key: word, value: number of documents containing that word. */
    private static Map<String, Integer> containWordOfAllDocNumberMap = new HashMap<String, Integer>();

    /** key: file path, value: word -> tf-idf of that file. */
    private static Map<String, Map<String, Double>> tfIdfMap = new HashMap<String, Map<String, Double>>();

    /**
     * Collects all regular files under {@code filepath}, recursively.
     *
     * @param filepath corpus directory
     * @return absolute paths of every file found
     */
    private static List<String> readDirs(String filepath) throws FileNotFoundException, IOException {
        // Reset first: the original kept appending across calls, so scanning
        // a second corpus would also re-process all files of the first one.
        fileList.clear();
        collectFiles(new File(filepath));
        return fileList;
    }

    /** Recursive helper for readDirs: adds files, descends into directories. */
    private static void collectFiles(File file) {
        if (!file.isDirectory()) {
            System.out.println("输入的参数应该为[文件夹名]");
            System.out.println("filepath: " + file.getAbsolutePath());
            return;
        }
        File[] children = file.listFiles();
        if (children == null) {
            return;
        }
        for (File child : children) {
            if (child.isDirectory()) {
                collectFiles(child);
            } else {
                fileList.add(child.getAbsolutePath());
            }
        }
    }

    /**
     * Reads a whole file as UTF-8 text, normalizing line ends to \r\n.
     *
     * @param file file path
     * @return file contents as a single string
     */
    private static String readFile(String file) throws FileNotFoundException, IOException {
        StringBuffer sb = new StringBuffer();
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
        } finally {
            br.close(); // original leaked the stream when readLine() threw
        }
        return sb.toString();
    }

    /**
     * Tokenizes {@code content} with IK (smart mode on) and counts every
     * lexeme into {@code words}.  Shared by segString/segStr, which differ
     * only in map implementation (unordered vs. insertion-ordered).
     */
    private static Map<String, Integer> countLexemes(String content, Map<String, Integer> words) {
        Reader input = new StringReader(content);
        IKSegmenter iks = new IKSegmenter(input, true);
        try {
            Lexeme lexeme;
            while ((lexeme = iks.next()) != null) {
                String token = lexeme.getLexemeText();
                Integer count = words.get(token);
                words.put(token, count == null ? 1 : count + 1);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return words;
    }

    /** Word counts in a plain HashMap (unordered). */
    private static Map<String, Integer> segString(String content) {
        return countLexemes(content, new HashMap<String, Integer>());
    }

    /** Word counts in a LinkedHashMap (keeps first-seen order). */
    public static Map<String, Integer> segStr(String content) {
        return countLexemes(content, new LinkedHashMap<String, Integer>());
    }

    /**
     * Returns at most {@code num} of the highest-frequency words, ignoring
     * single-character tokens.
     *
     * @param num   maximum number of keywords to return
     * @param words word -> count map
     * @return top words in descending count order
     */
    public static Map<String, Integer> getMostFrequentWords(int num, Map<String, Integer> words) {
        Map<String, Integer> keywords = new LinkedHashMap<String, Integer>();
        List<Map.Entry<String, Integer>> info = new ArrayList<Map.Entry<String, Integer>>(words.entrySet());
        // Sort by count, descending.
        Collections.sort(info, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) {
                return obj2.getValue() - obj1.getValue();
            }
        });
        int count = 0;
        for (Map.Entry<String, Integer> entry : info) {
            if (entry.getKey().length() > 1) {
                if (count >= num) {
                    break;
                }
                keywords.put(entry.getKey(), entry.getValue());
                count++;
            }
        }
        return keywords;
    }

    /**
     * tf(w, d) = count(w, d) / size(d).  NOTE: size(d) here is the number of
     * DISTINCT words of d, matching the original implementation.
     */
    private static HashMap<String, Double> tf(Map<String, Integer> segWordsResult) {
        HashMap<String, Double> tf = new HashMap<String, Double>();
        if (segWordsResult == null || segWordsResult.size() == 0) {
            return tf;
        }
        Double size = Double.valueOf(segWordsResult.size());
        for (Map.Entry<String, Integer> entry : segWordsResult.entrySet()) {
            tf.put(entry.getKey(), entry.getValue().doubleValue() / size);
        }
        return tf;
    }

    /**
     * Computes tf for every file under {@code dir}; also fills allSegsMap.
     *
     * @param dir corpus directory
     * @return key: file path, value: word -> tf
     */
    public static Map<String, Map<String, Double>> allTf(String dir) {
        try {
            for (String filePath : readDirs(dir)) {
                Map<String, Integer> segs = segString(readFile(filePath));
                allSegsMap.put(filePath, segs);
                allTfMap.put(filePath, tf(segs));
            }
        } catch (FileNotFoundException ffe) {
            ffe.printStackTrace();
        } catch (IOException io) {
            io.printStackTrace();
        }
        return allTfMap;
    }

    /**
     * Insertion-ordered word counts for every file under {@code dir}.
     *
     * @param dir corpus directory
     * @return key: file path, value: word -> count
     */
    public static Map<String, Map<String, Integer>> wordSegCount(String dir) {
        try {
            for (String filePath : readDirs(dir)) {
                allSegsMap.put(filePath, segStr(readFile(filePath)));
            }
        } catch (FileNotFoundException ffe) {
            ffe.printStackTrace();
        } catch (IOException io) {
            io.printStackTrace();
        }
        return allSegsMap;
    }

    /**
     * Counts, for each word, how many documents contain it.
     *
     * @param allSegsMap key: file path, value: word -> count
     * @return key: word, value: number of documents containing the word
     */
    private static Map<String, Integer> containWordOfAllDocNumber(Map<String, Map<String, Integer>> allSegsMap) {
        if (allSegsMap == null || allSegsMap.size() == 0) {
            return containWordOfAllDocNumberMap;
        }
        for (Map<String, Integer> fileSegs : allSegsMap.values()) {
            // Skip files whose segmentation is empty.
            if (fileSegs == null || fileSegs.size() == 0) {
                continue;
            }
            for (String seg : fileSegs.keySet()) {
                Integer docs = containWordOfAllDocNumberMap.get(seg);
                containWordOfAllDocNumberMap.put(seg, docs == null ? 1 : docs + 1);
            }
        }
        return containWordOfAllDocNumberMap;
    }

    /**
     * idf(w) = log(n / (docs(w, D) + 1)), with n the number of documents.
     *
     * Fix: the original used the vocabulary size in place of n, which does
     * not match its own documented formula.
     *
     * @param allSegsMap key: file path, value: word -> count
     * @return key: word, value: idf
     */
    public static Map<String, Double> idf(Map<String, Map<String, Integer>> allSegsMap) {
        if (allSegsMap == null || allSegsMap.size() == 0) {
            return idfMap;
        }
        containWordOfAllDocNumberMap = containWordOfAllDocNumber(allSegsMap);
        double docCount = allSegsMap.size(); // n = total number of documents
        for (Map.Entry<String, Integer> entry : containWordOfAllDocNumberMap.entrySet()) {
            idfMap.put(entry.getKey(), Math.log(docCount / (entry.getValue().doubleValue() + 1.0d)));
        }
        return idfMap;
    }

    /**
     * tf-idf(w, d) = tf(w, d) * idf(w).
     *
     * @param allTfMap key: file path, value: word -> tf
     * @param idf      key: word, value: idf
     * @return key: file path, value: word -> tf-idf
     */
    public static Map<String, Map<String, Double>> tfIdf(Map<String, Map<String, Double>> allTfMap,
                                                         Map<String, Double> idf) {
        for (Map.Entry<String, Map<String, Double>> fileEntry : allTfMap.entrySet()) {
            Map<String, Double> docTfIdf = new HashMap<String, Double>();
            for (Map.Entry<String, Double> wordEntry : fileEntry.getValue().entrySet()) {
                Double idfValue = idf.get(wordEntry.getKey());
                // Guard: the original threw NPE when a word had no idf entry.
                docTfIdf.put(wordEntry.getKey(), wordEntry.getValue() * (idfValue == null ? 0.0d : idfValue));
            }
            tfIdfMap.put(fileEntry.getKey(), docTfIdf);
        }
        return tfIdfMap;
    }

    public static void main(String[] args) {
        System.out.println("tf--------------------------------------");
        Map<String, Map<String, Double>> allTfMap = TfIdfAlgorithm.allTf("d://dir");
        for (String filePath : allTfMap.keySet()) {
            Map<String, Double> tfMap = allTfMap.get(filePath);
            for (String word : tfMap.keySet()) {
                System.out.println("fileName:" + filePath + "     word:" + word + "      tf:" + tfMap.get(word));
            }
        }
        System.out.println("idf--------------------------------------");
        Map<String, Double> idfMap = TfIdfAlgorithm.idf(allSegsMap);
        for (String word : idfMap.keySet()) {
            // label fixed: this column is idf, not tf
            System.out.println("word:" + word + "     idf:" + idfMap.get(word));
        }
        System.out.println("tf-idf--------------------------------------");
        Map<String, Map<String, Double>> tfIdfMap = TfIdfAlgorithm.tfIdf(allTfMap, idfMap);
        for (String filePath : tfIdfMap.keySet()) {
            Map<String, Double> tfIdf = tfIdfMap.get(filePath);
            for (String word : tfIdf.keySet()) {
                System.out.println("fileName:" + filePath + "     word:" + word + "        tf-idf:" + tfIdf.get(word));
            }
        }
    }
}

余弦定理对比文本相似度实现查重相关推荐

  1. Python3实现计算文本相似度(查重机制)

    使用Python3中jieba包进行分词,整理为指定格式,gensim库将要对比的文档通过doc2bow转化为稀疏向量,再通过models中的tf-idf将语料库进行处理,特征值和稀疏矩阵相似度建立索 ...

  2. python实现文本查重系统_NLP之gensim库python实现文本相似度/匹配/查重

    目的 给定一个或多个搜索词,如"高血压 患者",从已有的若干篇文本中找出最相关的(n篇)文本. 理论知识 文本检索(text retrieve)的常用策略是:用一个ranking ...

  3. 文本改写和论文查重工具:探狐文案AICopy for Mac中文版

    探狐文案AICopy for Mac是一款专业的文本改写和论文查重工具,探狐文案mac版能够快速创建高转化率的内容文案.大学论文.产品说明等,还能轻松进行写作笔记改写和论文翻译查重,非常实用,喜欢这款 ...

  4. Java实现余弦定理计算文本相似度

    相似度度量(Similarity),即计算个体间的相似程度,相似度度量的值越小,说明个体间相似度越小,相似度的值越大说明个体差异越大. 对于多个不同的文本或者短文本对话消息要来计算他们之间的相似度如何 ...

  5. python余弦定理_余弦定理与文本相似度

    什么是余弦定理 学过向量代数的人都知道,向量实际上是多维空间中有方向的线段.如果两个向量的方向一致,即夹角接近零,那么这两个向量就相近.而要确定两个向量方向是否一致,这就要用到余弦定理计算向量的夹角了 ...

  6. java 余弦定理_文本相似度计算之余弦定理

    前言 余弦相似度,又称为余弦相似性,是通过计算两个向量的夹角余弦值来评估他们的相似度.余弦相似度将向量根据坐标值,绘制到向量空间中.用向量空间中两个向量夹角的余弦值作为衡量两个个体间差异的大小.余弦值 ...

  7. 【NLP】余弦定理计算文本相似度

    一. 余弦相似概述 余弦相似性通过测量两个向量的夹角的余弦值来度量它们之间的相似性.0度角的余弦值是1,而其他任何角度的余弦值都不大于1:并且其最小值是-1. 从而两个向量之间的角度的余弦值确定两个向 ...

  8. Sklearn 对比文本相似度算法

    首先,使用的是 scikit-learn,关于用到的两个关键算法的详情,下面的链接会给出 TF-IDF: https://scikit-learn.org/stable/modules/generat ...

  9. python对比文本相似度

    方法:使用difflib中的SequenceMatcher s=difflib.SequenceMatcher(isjunk=None,a,b, autojunk=True) :构造函数,主要创建任何 ...

最新文章

  1. 边结点结构体的定义及使用
  2. the cdb terminated,Qt调试器问题
  3. OA项目14:权限管理功能分析
  4. 有状态服务和无状态服务的区别与联系
  5. ntopng mysql_网络流量监控工具与分析Ntop和Ntopng
  6. 【踩坑记录】Tensorflow在Windows下使用
  7. 明晚直播丨MySQL的查询与优化
  8. 独立物理机和虚拟机比较有什么优势?
  9. parceljs 中文文档24小时诞生记
  10. vivado仿真脚本生成
  11. 仿站工具箱http://fztool.ptcms.com/
  12. 产品经理面试指南,常见面试题及回答思路
  13. 银行软件测试测试用例,银行测试用例设计经验总结,应该怎样去设计测试用例?...
  14. Hello CTP(一)——期货业务
  15. antd的日期组件,月和周、年突然变成英文的修复
  16. 2020PAKDD 阿里巴巴智能运维算法大赛TOP20 ——磁盘故障预测问题比赛思路、难点与问题总结
  17. Google Hacking基本用法
  18. zblog asp php,ZBlog你选择PHP还是ASP?
  19. 人工智能作业之神经网络股票(预测)系统
  20. email邮箱登陆验证实例

热门文章

  1. J2EE项目开发中常用到的公共方法
  2. 无线系列 - MIMO波束赋形技术研究
  3. 宽带常见的有20 30 50 100兆指的是什么意思???
  4. C++:编程题:魔兽世界之一:备战
  5. 联烯基甲醇氧化合成联烯基羧酸化合物-齐岳研究
  6. 模拟电子技术基础-二极管1
  7. Android 反编绎工具JEB简介及下载
  8. SQL如何判断某一天是当月第几周的问题
  9. java身份证工具类(校验身份证是否合法、通过身份证获取年龄、性别、生日,将15位身份证转为18位等)
  10. 0.96寸OLED液晶显示器