1. Using Spark and Hadoop


package com.citydo.sentinel.spark;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import scala.Tuple2;
import java.io.Serializable;

public class SparkWordCount implements Serializable {

    private static final long serialVersionUID = -6629178988243085024L;

    private String doc = "唐诗三百首";
    private boolean isSelectFile = false;
    private int wordLength = 0; // 0 means keep words of any length

    // conf and sc must be static (or created inside a static method such as main)
    private static SparkConf conf = null;
    private static JavaSparkContext sc = null;

    private void initSpark() {
        /*
         * 1. Create the SparkConf object and configure the Spark application.
         */
        conf = new SparkConf()
                // set the application name
                .setAppName(SparkWordCount.class.getSimpleName());
        conf.setMaster("local");

        /*
         * 2. Create the SparkContext: Java uses JavaSparkContext, Scala uses SparkContext.
         * In Spark the SparkContext connects to the cluster and creates RDDs, accumulators and broadcast variables.
         * The master setting decides which low-level TaskScheduler backend is created (the high-level scheduler is the DAGScheduler):
         *     setMaster("local")        -> a local scheduler;
         *     setMaster("spark://...")  -> SparkDeploySchedulerBackend, whose start() launches a client that connects to the cluster.
         */
        sc = new JavaSparkContext(conf);
    }

    SparkWordCount(String doc, boolean isSelectFile, int wordLength) {
        this.doc = doc;
        this.isSelectFile = isSelectFile;
        this.wordLength = wordLength;
        initSpark();
    }

    SparkWordCount() {
        initSpark();
    }

    private List<String> getSplitWords(String line) {
        List<String> words = new ArrayList<String>();
        if (line == null || line.trim().length() == 0) {
            return words;
        }
        try {
            InputStream is = new ByteArrayInputStream(line.getBytes("UTF-8"));
            IKSegmenter seg = new IKSegmenter(new InputStreamReader(is), false);
            Lexeme lex = seg.next();
            while (lex != null) {
                String word = lex.getLexemeText();
                if (wordLength == 0 || word.length() == wordLength) {
                    words.add(word);
                }
                lex = seg.next();
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return words;
    }

    public JavaPairRDD<String, Integer> wordCount() {
        /*
         * 3. textFile is defined on SparkContext:
         *        def textFile(path: String): JavaRDD[String] = sc.textFile(path)
         * It reads a text file from HDFS, from a local path on the cluster nodes, or from any
         * Hadoop-supported file system, and returns a JavaRDD[String] with one element per line.
         */
        // JavaRDD<String> lines = sc.textFile("hdfs://soy1:9000/mapreduces/word.txt");
        JavaRDD<String> lines = null;
        if (isSelectFile) {
            lines = sc.textFile(doc);
        } else {
            lines = sc.textFile("src/com/magicstudio/spark/text/" + doc + ".txt");
        }

        /*
         * 4. Split each line into words.
         * lines calls the flatMap transformation (its argument implements FlatMapFunction)
         * and returns every word of every line. Chinese segmentation is plugged in here:
         * the iterator over the segmented word list is returned.
         */
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = -3243665984299496473L;

            @Override
            public Iterator<String> call(String line) throws Exception {
                // return Arrays.asList(line.split("\t")).iterator();
                return getSplitWords(line).iterator();
            }
        });

        /*
         * 5. Mark every word with an initial count of 1.
         * words calls the mapToPair transformation (its argument implements PairFunction;
         * the three type parameters of PairFunction<String, String, Integer> are
         * <input word, Tuple2 key, Tuple2 value>) and returns a new RDD, a JavaPairRDD.
         */
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = -7879847028195817507L;

            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        /*
         * 6. Count how many times each word occurs.
         * pairs calls the reduceByKey transformation (its argument implements Function2) to reduce
         * the values of each key; the resulting JavaPairRDD holds tuples of (word, total count).
         */
        JavaPairRDD<String, Integer> wordCount = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = -4171349401750495688L;

            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        return wordCount;
    }

    public JavaPairRDD<String, Integer> sortByValue(JavaPairRDD<String, Integer> wordCount, boolean isAsc) {
        // Sort by frequency: swap key and value, sortByKey, then swap back.
        JavaPairRDD<Integer, String> pairs2 = wordCount.mapToPair(
                new PairFunction<Tuple2<String, Integer>, Integer, String>() {
                    private static final long serialVersionUID = -7879847028195817508L;

                    @Override
                    public Tuple2<Integer, String> call(Tuple2<String, Integer> word) throws Exception {
                        return new Tuple2<Integer, String>(word._2, word._1);
                    }
                });

        // descending order when isAsc is false
        pairs2 = pairs2.sortByKey(isAsc);

        // swap key and value back
        wordCount = pairs2.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            private static final long serialVersionUID = -7879847028195817509L;

            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> word) throws Exception {
                return new Tuple2<String, Integer>(word._2, word._1);
            }
        });

        return wordCount;
    }

    public void closeSpark(JavaPairRDD<String, Integer> wordCount) {
        /*
         * 7. Use the foreach action to submit the Spark job.
         * In Spark, transformations only describe the computation; an action triggers job submission.
         */
        wordCount.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            private static final long serialVersionUID = -5926812153234798612L;

            @Override
            public void call(Tuple2<String, Integer> wordCount) throws Exception {
                System.out.println(wordCount._1 + ":" + wordCount._2);
            }
        });

        /*
         * 8. Write the result back to a file system, e.g. HDFS:
         *     new API (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat):
         *         wordCount.saveAsNewAPIHadoopFile("hdfs://ns1/spark/wordcount", Text.class, IntWritable.class, TextOutputFormat.class, new Configuration());
         *     old API (org.apache.hadoop.mapred.JobConf / OutputFormat):
         *         wordCount.saveAsHadoopFile("hdfs://ns1/spark/wordcount", Text.class, IntWritable.class, OutputFormat.class, new JobConf(new Configuration()));
         *     default TextOutputFormat (mind HDFS permissions; if needed run: hdfs dfs -chmod -R 777 /spark):
         *         wordCount.saveAsTextFile("hdfs://soy1:9000/spark/wordCount");
         */
        // wordCount.saveAsTextFile("hdfs://soy1:9000/spark/wordCount");

        /*
         * 9. Close the SparkContext and finish the job.
         */
        sc.close();
    }

    public static void main(String[] args) {
        SparkWordCount app = new SparkWordCount();
        JavaPairRDD<String, Integer> wordCount = app.wordCount();
        wordCount = app.sortByValue(wordCount, false);
        app.closeSpark(wordCount);
    }
}
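
The anonymous-class style above matches older Spark tutorials. On Java 8+ the same pipeline reads more compactly with lambdas; the sketch below is my own minimal variant, not part of the original post. The class name and input path are placeholders, and it splits on whitespace so it stays self-contained; swap in getSplitWords(line).iterator() from the class above for IK-based Chinese segmentation. The Hadoop MapReduce version of the same word count follows it.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

// Hypothetical lambda-style variant of the word count above; "input.txt" is a placeholder path.
public class SparkWordCountLambda {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("wordcount-lambda").setMaster("local"));
        sc.textFile("input.txt")
          .flatMap(line -> Arrays.asList(line.split("\\s+")).iterator()) // or getSplitWords(line).iterator() for IK segmentation
          .mapToPair(word -> new Tuple2<>(word, 1))                      // (word, 1)
          .reduceByKey(Integer::sum)                                     // sum counts per word
          .mapToPair(t -> new Tuple2<>(t._2, t._1))                      // swap so sortByKey sorts by frequency
          .sortByKey(false)
          .take(20)
          .forEach(t -> System.out.println(t._2 + ":" + t._1));
        sc.close();
    }
}
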
package com.citydo.sentinel.spark;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class HadoopWordCount {

    // SortableMap and FileUtil are helper classes from the original project (not shown here);
    // a minimal sketch of both follows after this class.
    private static SortableMap<Integer> totalWords = new SortableMap<Integer>();

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        // regex matching every character that is NOT 0-9, a-z, A-Z or underscore
        private String pattern = "[^\\w]";

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString().toLowerCase(); // lowercase everything
            /*
             * line = line.replaceAll(pattern, " "); // replace non-word characters with spaces
             * StringTokenizer itr = new StringTokenizer(line);
             * while (itr.hasMoreTokens()) { word.set(itr.nextToken()); context.write(word, one); }
             */
            try {
                InputStream is = new ByteArrayInputStream(line.getBytes("UTF-8"));
                IKSegmenter seg = new IKSegmenter(new InputStreamReader(is), false);
                Lexeme lex = seg.next();
                while (lex != null) {
                    String text = lex.getLexemeText();
                    word.set(text);
                    context.write(word, one);
                    lex = seg.next();
                }
            } catch (UnsupportedEncodingException e) {
                System.out.println(e);
            } catch (IOException e) {
                System.out.println(e);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
            // keep the totals in memory as well
            totalWords.put(key.toString(), Integer.valueOf(sum));
        }
    }

    private static class IntWritableDecreasingComparator extends IntWritable.Comparator {
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }

    public static Map<String, Integer> wordCount(String doc, boolean isSelectFile, int wordLength)
            throws IOException {
        Configuration conf = new Configuration();
        Path inputFile; // word count job input
        if (isSelectFile) {
            inputFile = new Path(doc);
        } else {
            inputFile = new Path("/spark/text/" + doc + ".txt");
        }

        // The Hadoop job creates its output folder itself, so make sure it does not exist yet.
        FileUtil.deleteFolder("C:\\hadoop");
        FileUtil.deleteFolder("C:\\hadoopsort");
        Path outputFolder = new Path("C:\\hadoop");   // word count job output, sort job input
        Path sortOutput = new Path("C:\\hadoopsort"); // sort job output

        try {
            Job job = new Job(conf, "word count");
            job.setJarByClass(HadoopWordCount.class);
            job.setMapperClass(TokenizerMapper.class);
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, inputFile);
            FileOutputFormat.setOutputPath(job, outputFolder);
            // Write the word-count output to a temporary directory first;
            // the sort job then uses that directory as its input.
            job.setOutputFormatClass(SequenceFileOutputFormat.class);

            if (job.waitForCompletion(true)) {
                Job sortJob = new Job(conf, "sort");
                sortJob.setJarByClass(HadoopWordCount.class);
                FileInputFormat.addInputPath(sortJob, outputFolder);
                sortJob.setInputFormatClass(SequenceFileInputFormat.class);
                /* InverseMapper ships with Hadoop; it swaps the key and value of each map() output pair. */
                sortJob.setMapperClass(InverseMapper.class);
                /* Limit the number of reducers to 1 so that there is a single output file. */
                sortJob.setNumReduceTasks(1);
                FileOutputFormat.setOutputPath(sortJob, sortOutput);
                sortJob.setOutputKeyClass(IntWritable.class);
                sortJob.setOutputValueClass(Text.class);
                /*
                 * Hadoop sorts IntWritable keys in ascending order, but we need descending order,
                 * so we implement IntWritableDecreasingComparator and register it as the
                 * comparator used to sort the output keys (the word frequencies).
                 */
                sortJob.setSortComparatorClass(IntWritableDecreasingComparator.class);

                if (sortJob.waitForCompletion(true)) {
                    if (wordLength == 0) {
                        return totalWords.sortMapByValue(false);
                    } else {
                        SortableMap<Integer> words = new SortableMap<Integer>();
                        for (String key : totalWords.keySet()) {
                            if (key.length() == wordLength) {
                                words.put(key, totalWords.get(key));
                            }
                        }
                        return words.sortMapByValue(false);
                    }
                }
            }
        } catch (ClassNotFoundException e) {
            System.out.println(e);
        } catch (InterruptedException e) {
            System.out.println(e);
        } finally {
            FileSystem.get(conf).deleteOnExit(outputFolder);
        }
        return null;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Path inputFile = new Path("E:\\text\\唐诗三百首.txt"); // word count job input
        Path inputFile = new Path("唐诗三百首.txt");
        Path outputFolder = new Path("E:\\text\\hadoop");   // word count job output, sort job input
        Path sortOutput = new Path("E:\\text\\hadoopsort"); // sort job output

        Job job = new Job(conf, "word count");
        job.setJarByClass(HadoopWordCount.class);
        try {
            job.setMapperClass(TokenizerMapper.class);
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, inputFile);
            FileOutputFormat.setOutputPath(job, outputFolder);
            // Write the word-count output to a temporary directory first;
            // the sort job then uses that directory as its input.
            job.setOutputFormatClass(SequenceFileOutputFormat.class);

            if (job.waitForCompletion(true)) {
                Job sortJob = new Job(conf, "sort");
                sortJob.setJarByClass(HadoopWordCount.class);
                FileInputFormat.addInputPath(sortJob, outputFolder);
                sortJob.setInputFormatClass(SequenceFileInputFormat.class);
                /* InverseMapper ships with Hadoop; it swaps the key and value of each map() output pair. */
                sortJob.setMapperClass(InverseMapper.class);
                /* Limit the number of reducers to 1 so that there is a single output file. */
                sortJob.setNumReduceTasks(1);
                FileOutputFormat.setOutputPath(sortJob, sortOutput);
                sortJob.setOutputKeyClass(IntWritable.class);
                sortJob.setOutputValueClass(Text.class);
                /*
                 * Hadoop sorts IntWritable keys in ascending order, but we need descending order,
                 * so we implement IntWritableDecreasingComparator and register it as the
                 * comparator used to sort the output keys (the word frequencies).
                 */
                sortJob.setSortComparatorClass(IntWritableDecreasingComparator.class);
                System.exit(sortJob.waitForCompletion(true) ? 0 : 1);
            }
        } finally {
            FileSystem.get(conf).deleteOnExit(outputFolder);
        }
    }
}
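
HadoopWordCount references two helper classes, SortableMap and FileUtil, that the post does not include. The following is only a guess at minimal stand-ins, reconstructed from how they are used above (put/get/keySet, sortMapByValue(boolean), deleteFolder); the original project's versions may well differ.

import java.io.File;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Hypothetical stand-ins for the helpers referenced by HadoopWordCount.
class SortableMap<V extends Comparable<V>> extends HashMap<String, V> {
    /** Returns a copy of this map ordered by value (ascending when asc is true). */
    public Map<String, V> sortMapByValue(boolean asc) {
        List<Map.Entry<String, V>> entries = new ArrayList<>(entrySet());
        Comparator<Map.Entry<String, V>> byValue = Map.Entry.comparingByValue();
        entries.sort(asc ? byValue : byValue.reversed());
        Map<String, V> sorted = new LinkedHashMap<>();
        for (Map.Entry<String, V> e : entries) {
            sorted.put(e.getKey(), e.getValue());
        }
        return sorted;
    }
}

class FileUtil {
    /** Recursively deletes a folder so the Hadoop job can recreate its output directory. */
    static void deleteFolder(String path) {
        File dir = new File(path);
        File[] children = dir.listFiles();
        if (children != null) {
            for (File child : children) {
                deleteFolder(child.getAbsolutePath());
            }
        }
        dir.delete();
    }
}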

2. Using IKAnalyzer

package com.citydo.sentinel.textrank;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKAnalyzerTest {

    public static void main(String[] args) throws IOException {
        String filePath = "唐诗三百首.txt";
        String news = new String();
        BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), "UTF8"));
        String str;
        while ((str = in.readLine()) != null) {
            news += str;
        }
        in.close();
        System.out.println(news);

        // Tokenize through the Lucene Analyzer interface
        IKAnalyzer analyzer = new IKAnalyzer(true);
        StringReader reader = new StringReader(news);
        TokenStream ts = analyzer.tokenStream("", reader);
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        ts.reset(); // required before incrementToken() on Lucene 4+
        while (ts.incrementToken()) {
            System.out.print(term.toString() + "|");
        }
        analyzer.close();
        reader.close();
        System.out.println();

        // Tokenize directly with IKSegmenter
        StringReader re = new StringReader(news);
        IKSegmenter ik = new IKSegmenter(re, true);
        Lexeme lex = null;
        while ((lex = ik.next()) != null) {
            System.out.print(lex.getLexemeText() + "|");
        }
    }
}
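
Since the theme of this post is word frequency, it is worth showing how the IKSegmenter loop above turns into a frequency table. This small helper is my addition, not part of the original code, and only uses the IK calls already demonstrated (IKSegmenter, next(), getLexemeText()); the sample sentence is arbitrary.

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

// Hypothetical helper: counts how often each IK token occurs in a piece of text.
public class IkWordFrequency {
    public static Map<String, Integer> count(String text) throws IOException {
        Map<String, Integer> freq = new HashMap<>();
        IKSegmenter ik = new IKSegmenter(new StringReader(text), true); // true = smart mode
        Lexeme lex;
        while ((lex = ik.next()) != null) {
            freq.merge(lex.getLexemeText(), 1, Integer::sum);
        }
        return freq;
    }

    public static void main(String[] args) throws IOException {
        count("商品和服务,商品和价格").forEach((w, n) -> System.out.println(w + ":" + n));
    }
}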

3. Using HanLP

package com.citydo.sentinel.textrank;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.py.Pinyin;
import com.hankcs.hanlp.seg.CRF.CRFSegment;
import com.hankcs.hanlp.seg.NShort.NShortSegment;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.Viterbi.ViterbiSegment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.suggest.Suggester;
import com.hankcs.hanlp.tokenizer.IndexTokenizer;
import com.hankcs.hanlp.tokenizer.NLPTokenizer;
import com.hankcs.hanlp.tokenizer.SpeedTokenizer;
import com.hankcs.hanlp.tokenizer.TraditionalChineseTokenizer;

import java.util.ArrayList;
import java.util.List;

public class hanlp {

    public static void main(String[] args) {
        // Participle();
        // extractKeyword();
        // extractSummary();
        // extractPhrase();
        // suggest_test();
        // converto();
        // nameRecognize();
        analysebysyntax();
    }

    /**
     * @Author: sks
     * @Description: demo of the different tokenizers
     * @Date:
     */
    private static void Participle() {
        // Standard tokenizer
        // HanLP.segment is a wrapper around StandardTokenizer.segment.
        // HanLP ships a series of ready-to-use static tokenizers whose names end in Tokenizer.
        List<Term> stermList = HanLP.segment("商品和服务");
        System.out.println(stermList);
        // [商品/n, 和/c, 服务/vn]

        // NLP tokenizer
        // NLPTokenizer runs full named-entity recognition and POS tagging,
        // so it is slower than the standard tokenizer and may occasionally mis-recognize words.
        List<Term> nlptermList = NLPTokenizer.segment("中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程");
        System.out.println(nlptermList);
        // Output: [中国科学院/n, 计算/v, 技术/n, 研究所/n, 的/uj, 宗成庆/nr, 教授/n, 正在/d, 教授/n, 自然/d, 语言/n, 处理/v, 课程/n]

        // Index tokenizer
        // IndexTokenizer targets search engines: it fully splits long words, and term.offset
        // gives the offset of each token inside the text.
        List<Term> termList = IndexTokenizer.segment("主副食品");
        for (Term term : termList) {
            System.out.println(term + " [" + term.offset + ":" + (term.offset + term.word.length()) + "]");
        }
        // 主副食品/n [0:4]
        // 主副食/j [0:3]
        // 副食品/n [1:4]
        // 副食/n [1:3]
        // 食品/n [2:4]

        // Traditional Chinese tokenizer
        List<Term> fttermList = TraditionalChineseTokenizer.segment("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前辣妹合唱團成員維多利亞·碧咸," +
                "亦由於他擁有突出外表、百變髮型及正面的形象,以至自己品牌的男士香水等商品,及長期擔任運動品牌Adidas的代言人," +
                "因此對大眾傳播媒介和時尚界等方面都具很大的影響力,在足球圈外所獲得的認受程度可謂前所未見。");
        System.out.println(fttermList);
        // [大衛貝克漢/nrf, 不僅僅/d, 是/v, 名著/n, 名/q, 球員/n, ,/w, 球場/n, 以外/f, ,/w, 其/r, 妻/ng, 為/p, 前/f, 辣妹/nz, 合唱團/n, 成員/n, 維多利亞/ns,
        //  ·/w, 碧/ag, 咸/ng, ,/w, 亦/d, 由於/c, 他/r, 擁有/v, 突出/a, 外表/n, 、/w, 百變/nz, 髮型/n, 及/c, 正面/d, 的/uj, 形象/n, ,/w,
        //  以至/c, 自己/r, 品牌/n, 的/uj, 男士/n, 香水/n, 等/u, 商品/n, ,/w, 及/c, 長期/d, 擔任/v, 運動/n, 品牌/n, Adidas/nx, 的/uj, 代言人/n,
        //  ,/w, 因此/c, 對/p, 大眾/n, 傳播/vn, 媒介/n, 和/c, 時尚界/nz, 等/u, 方面/n, 都/d, 具/vg, 很大/d, 的/uj, 影響力/n, ,/w,
        //  在/p, 足球/n, 圈外/nz, 所/u, 獲得/v, 的/uj, 認/v, 受/v, 程度/n, 可/v, 謂/vg, 前所未見/l, 。/w]

        // Speed tokenizer
        // Longest dictionary matching: extremely fast, moderate accuracy.
        String text = "江西鄱阳湖干枯,中国最大淡水湖变成大草原";
        System.out.println(SpeedTokenizer.segment(text));
        long start = System.currentTimeMillis();
        int pressure = 1000000;
        for (int i = 0; i < pressure; ++i) {
            SpeedTokenizer.segment(text);
        }
        double costTime = (System.currentTimeMillis() - start) / (double) 1000;
        System.out.printf("分词速度:%.2f字每秒", text.length() * pressure / costTime);

        // N-shortest-path tokenizer
        // NShortSegment is slower than the shortest-path segmenter (DijkstraSegment) but slightly
        // more accurate and stronger on named entities. For most scenarios the shortest-path
        // segmenter is accurate enough and several times faster.
        Segment nShortSegment = new NShortSegment().enableCustomDictionary(false)
                .enablePlaceRecognize(true).enableOrganizationRecognize(true);
        Segment shortestSegment = new ViterbiSegment().enableCustomDictionary(false)
                .enablePlaceRecognize(true).enableOrganizationRecognize(true);
        String[] testCase = new String[]{
                "刘喜杰石国祥会见吴亚琴先进事迹报告团成员",
        };
        for (String sentence : testCase) {
            System.out.println("N-最短分词:" + nShortSegment.seg(sentence) + "\n最短路分词:" + shortestSegment.seg(sentence));
        }

        // CRF tokenizer
        // A segmenter trained from a CRF model with BEMS tagging.
        // CRF recognizes new words well, but cannot use a custom dictionary and does not do
        // named-entity recognition; its use case is essentially new-word discovery.
        Segment segment = new CRFSegment();
        segment.enablePartOfSpeechTagging(true);
        List<Term> crftermList = segment.seg("你看过穆赫兰道吗");
        System.out.println(crftermList);
        for (Term term : crftermList) {
            if (term.nature == null) {
                System.out.println("识别到新词:" + term.word);
            }
        }
    }

    /**
     * Segments text with HanLP and keeps only nouns and verbal nouns.
     * @param text
     * @return
     */
    private static List<String> hanLPSegment(String text) {
        List<String> wordList = new ArrayList<String>();
        List<Term> words = HanLP.segment(text);
        for (Term tm : words) {
            if (tm.nature == Nature.n || tm.nature == Nature.vn) {
                wordList.add(tm.word);
            }
        }
        return wordList;
    }

    /**
     * @Author: sks
     * @Description: keyword extraction
     * @Date:
     */
    private static void extractKeyword() {
        String content = "程序员(英文Programmer)是从事程序开发、维护的专业人员。一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。";
        // return the 5 top-ranked keywords
        List<String> keywordList = HanLP.extractKeyword(content, 5);
        System.out.println(keywordList);
    }

    /**
     * @Author: sks
     * @Description: summary extraction
     * @Date:
     */
    private static void extractSummary() {
        // Again a one-line call: the first argument is the text, the second is how many sentences to extract.
        String document = "算法可大致分为基本算法、数据结构的算法、数论算法、计算几何的算法、图的算法、动态规划以及数值分析、加密算法、排序算法、检索算法、随机化算法、并行算法、厄米变形模型、随机森林算法。\n" +
                "算法可以宽泛的分为三类,\n" +
                "一,有限的确定性算法,这类算法在有限的一段时间内终止。他们可能要花很长时间来执行指定的任务,但仍将在一定的时间内终止。这类算法得出的结果常取决于输入值。\n" +
                "二,有限的非确定算法,这类算法在有限的时间内终止。然而,对于一个(或一些)给定的数值,算法的结果并不是唯一的或确定的。\n" +
                "三,无限的算法,是那些由于没有定义终止定义条件,或定义的条件无法由输入的数据满足而不终止运行的算法。通常,无限算法的产生是由于未能确定的定义终止条件。";
        List<String> sentenceList = HanLP.extractSummary(document, 3); // 3: number of sentences returned
        System.out.println(sentenceList);
        String summary = HanLP.getSummary(document, 50); // maximum length of the returned summary
        System.out.println(summary);
    }

    private static void extractPhrase() {
        String text = "算法工程师\n" +
                "算法(Algorithm)是一系列解决问题的清晰指令,也就是说,能够对一定规范的输入,在有限时间内获得所要求的输出。如果一个算法有缺陷,或不适合于某个问题,执行这个算法将不会解决这个问题。不同的算法可能用不同的时间、空间或效率来完成同样的任务。一个算法的优劣可以用空间复杂度与时间复杂度来衡量。算法工程师就是利用算法处理事物的人。\n" +
                "\n" +
                "1职位简介\n" +
                "算法工程师是一个非常高端的职位;\n" +
                "专业要求:计算机、电子、通信、数学等相关专业;\n" +
                "学历要求:本科及其以上的学历,大多数是硕士学历及其以上;\n" +
                "语言要求:英语要求是熟练,基本上能阅读国外专业书刊;\n" +
                "必须掌握计算机相关知识,熟练使用仿真工具MATLAB等,必须会一门编程语言。\n" +
                "\n" +
                "2研究方向\n" +
                "视频算法工程师、图像处理算法工程师、音频算法工程师 通信基带算法工程师\n" +
                "\n" +
                "3目前国内外状况\n" +
                "目前国内从事算法研究的工程师不少,但是高级算法工程师却很少,是一个非常紧缺的专业工程师。算法工程师根据研究领域来分主要有音频/视频算法处理、图像技术方面的二维信息算法处理和通信物理层、雷达信号处理、生物医学信号处理等领域的一维信息算法处理。\n" +
                "在计算机音视频和图形图像技术等二维信息算法处理方面目前比较先进的视频处理算法:机器视觉成为此类算法研究的核心;另外还有2D转3D算法(2D-to-3D conversion),去隔行算法(de-interlacing),运动估计运动补偿算法(Motion estimation/Motion Compensation),去噪算法(Noise Reduction),缩放算法(scaling),锐化处理算法(Sharpness),超分辨率算法(Super Resolution),手势识别(gesture recognition),人脸识别(face recognition)。\n" +
                "在通信物理层等一维信息领域目前常用的算法:无线领域的RRM、RTT,传送领域的调制解调、信道均衡、信号检测、网络优化、信号分解等。\n" +
                "另外数据挖掘、互联网搜索算法也成为当今的热门方向。\n" +
                "算法工程师逐渐往人工智能方向发展。";
        List<String> phraseList = HanLP.extractPhrase(text, 10);
        System.out.println(phraseList);
    }

    /**
     * @Author: sks
     * @Description: search suggestions
     * @Date:
     */
    private static void suggest_test() {
        Suggester suggester = new Suggester();
        String[] titleArray = ("威廉王子发表演说 呼吁保护野生动物\n" +
                "《时代》年度人物最终入围名单出炉 普京马云入选\n" +
                "“黑格比”横扫菲:菲吸取“海燕”经验及早疏散\n" +
                "日本保密法将正式生效 日媒指其损害国民知情权\n" +
                "英报告说空气污染带来“公共健康危机”").split("\\n");
        for (String title : titleArray) {
            suggester.addSentence(title);
        }
        System.out.println(suggester.suggest("发言", 1));     // semantic match
        System.out.println(suggester.suggest("危机公共", 1)); // character match
        System.out.println(suggester.suggest("mayun", 1));    // pinyin match
    }

    /**
     * @Author: sks
     * @Description: simplified/traditional and pinyin conversion
     * @Date:
     */
    private static void converto() {
        // simplified <-> traditional conversion
        System.out.println(HanLP.convertToTraditionalChinese("“以后等你当上皇后,就能买草莓庆祝了”"));
        System.out.println(HanLP.convertToSimplifiedChinese("用筆記簿型電腦寫程式HelloWorld"));

        // pinyin conversion
        String text = "重载不是重任";
        List<Pinyin> pinyinList = HanLP.convertToPinyinList(text);
        System.out.print("原文,");
        for (char c : text.toCharArray()) {
            System.out.printf("%c,", c);
        }
        System.out.println();

        System.out.print("拼音(数字音调),");
        for (Pinyin pinyin : pinyinList) {
            System.out.printf("%s,", pinyin);
        }
        System.out.println();

        System.out.print("拼音(符号音调),");
        for (Pinyin pinyin : pinyinList) {
            System.out.printf("%s,", pinyin.getPinyinWithToneMark());
        }
        System.out.println();

        System.out.print("拼音(无音调),");
        for (Pinyin pinyin : pinyinList) {
            System.out.printf("%s,", pinyin.getPinyinWithoutTone());
        }
        System.out.println();

        System.out.print("声调,");
        for (Pinyin pinyin : pinyinList) {
            System.out.printf("%s,", pinyin.getTone());
        }
        System.out.println();

        System.out.print("声母,");
        for (Pinyin pinyin : pinyinList) {
            System.out.printf("%s,", pinyin.getShengmu());
        }
        System.out.println();

        System.out.print("韵母,");
        for (Pinyin pinyin : pinyinList) {
            System.out.printf("%s,", pinyin.getYunmu());
        }
        System.out.println();

        System.out.print("输入法头,");
        for (Pinyin pinyin : pinyinList) {
            System.out.printf("%s,", pinyin.getHead());
        }
        System.out.println();
    }

    /**
     * @Author: sks
     * @Description: Chinese person-name recognition
     * @Date:
     */
    private static void nameRecognize() {
        // Most tokenizers (e.g. the one behind HanLP.segment()) already enable Chinese person-name
        // recognition by default; the code below only makes it explicit.
        String[] testCase = new String[]{
                "签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。",
                "王国强、高峰、汪洋、张朝阳光着头、韩寒、小四",
                "张浩和胡健康复员回家了",
                "王总和小丽结婚了",
                "编剧邵钧林和稽道青说",
                "这里有关天培的有关事迹",
                "龚学平等领导,邓颖超生前",
        };
        Segment segment = HanLP.newSegment().enableNameRecognize(true);
        for (String sentence : testCase) {
            List<Term> termList = segment.seg(sentence);
            System.out.println(termList);
        }

        // Transliterated (foreign) name recognition
        // Also enabled by default in most tokenizers; shown here only for emphasis.
        String[] testCase1 = new String[]{
                "一桶冰水当头倒下,微软的比尔盖茨、Facebook的扎克伯格跟桑德博格、亚马逊的贝索斯、苹果的库克全都不惜湿身入镜,这些硅谷的科技人,飞蛾扑火似地牺牲演出,其实全为了慈善。",
                "世界上最长的姓名是简森·乔伊·亚历山大·比基·卡利斯勒·达夫·埃利奥特·福克斯·伊维鲁莫·马尔尼·梅尔斯·帕特森·汤普森·华莱士·普雷斯顿。",
        };
        Segment segment1 = HanLP.newSegment().enableTranslatedNameRecognize(true);
        for (String sentence : testCase1) {
            List<Term> termList = segment1.seg(sentence);
            System.out.println(termList);
        }

        // Place-name recognition: recognized places are tagged ns.
        // The standard tokenizer keeps it off by default because it costs performance, and most
        // place names are already in the core and user dictionaries.
        // In production, solve with dictionaries whenever possible - it is the most efficient and stable approach.
        String[] areaCase = new String[]{
                "武胜县新学乡政府大楼门前锣鼓喧天",
                "蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机",
        };
        Segment areasegment = HanLP.newSegment().enablePlaceRecognize(true);
        for (String sentence : areaCase) {
            List<Term> termList = areasegment.seg(sentence);
            System.out.println(termList);
        }

        // Organization-name recognition
        // Off by default for the same performance reason; common organization names are already in
        // the core and user dictionaries. HanLP is not meant to show off dynamic recognition:
        // in production, prefer dictionaries.
        String[] jgCase = new String[]{
                "我在上海林原科技有限公司兼职工作,",
                "同时在上海外国语大学日本文化经济学院学习经济与外语。",
                "我经常在台川喜宴餐厅吃饭,",
                "偶尔去地中海影城看电影。",
        };
        Segment jgsegment = HanLP.newSegment().enableOrganizationRecognize(true);
        for (String sentence : jgCase) {
            List<Term> termList = jgsegment.seg(sentence);
            System.out.println(termList);
        }
    }

    private static void analysebysyntax() {
        // System.out.println(HanLP.parseDependency("把市场经济奉行的等价交换原则引入党的生活和国家机关政务活动中"));
        // System.out.println(CRFDependencyParser.compute("把市场经济奉行的等价交换原则引入党的生活和国家机关政务活动中"));
    }
}
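
The HanLP demos cover segmentation, keyword extraction, summarization and NER, but not the frequency counting that the Spark and Hadoop sections implement. Below is a minimal sketch of that step on top of HanLP.segment, reusing the noun/verbal-noun filter from hanLPSegment above; this class is my addition and the sample sentence is arbitrary.

import java.util.HashMap;
import java.util.Map;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;

// Hypothetical helper: noun/verbal-noun frequencies, sorted by count, on top of HanLP.segment.
public class HanlpNounFrequency {
    public static Map<String, Integer> count(String text) {
        Map<String, Integer> freq = new HashMap<>();
        for (Term term : HanLP.segment(text)) {
            if (term.nature == Nature.n || term.nature == Nature.vn) { // same filter as hanLPSegment above
                freq.merge(term.word, 1, Integer::sum);
            }
        }
        return freq;
    }

    public static void main(String[] args) {
        count("程序员是从事程序开发、程序维护的专业人员。")
                .entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .forEach(e -> System.out.println(e.getKey() + ":" + e.getValue()));
    }
}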

4. Using Elasticsearch + word segmentation + API

// Excerpt from a Spring Boot test class: elasticsearchTemplate is an injected ElasticsearchTemplate,
// and XXX stands for the entity class mapped to the index.

/**
 * Drop and recreate the index:
 * DELETE message_index
 * PUT message_index
 * {
 *   "mappings": {
 *     "_doc": {
 *       "properties": {
 *         "message": {
 *           "analyzer": "ik_smart",
 *           "term_vector": "with_positions_offsets",
 *           "boost": 8,
 *           "type": "text",
 *           "fielddata": "true"
 *         }
 *       }
 *     }
 *   }
 * }
 */
@Test
public void testCreateIndexMessage() {
    elasticsearchTemplate.createIndex(XXX.class);
}

/**
 * Insert documents:
 * POST message_index/_doc/1
 * {
 *   "message":"沉溺于「轻易获得高成就感」的事情:有意无意地寻求用很小付出获得很大「回报」的偏方,哪怕回报是虚拟的"
 * }
 *
 * POST message_index/_doc/2
 * {
 *   "message":"过度追求“短期回报”可以先思考这样一个问题:为什么玩王者荣耀沉溺我们总是停不下来回报"
 * }
 *
 * POST message_index/_doc/3
 * {
 *   "message":"过度追求的努力无法带来超额的回报,就因此放弃了努力。这点在聪明人身上尤其明显。以前念本科的时候身在沉溺"
 * }
 */
@Test
public void insertListsMessage() {
    // takes a collection of entities for a bulk insert
    elasticsearchTemplate.index(XXX);
}

/**
 * Terms aggregation on the analyzed field (this is where the word frequencies come from):
 * POST message_index/_search
 * {
 *   "size": 0,
 *   "aggs": {
 *     "messages": {
 *       "terms": {
 *         "size": 10,
 *         "field": "message"
 *       }
 *     }
 *   }
 * }
 */
public void queryByPriceBetweenMessage() {
    elasticsearchTemplate.queryForAlias(XXX);
}
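
The test methods above pass a placeholder XXX entity to ElasticsearchTemplate. For orientation, here is a hypothetical document class that would line up with the message_index mapping in the comments. The class name, field names and mapping file path are my own inventions, and in spring-data-elasticsearch 3.1 the term_vector / fielddata parts of that mapping are easiest to supply through an @Mapping JSON file rather than @Field attributes.

import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;
import org.springframework.data.elasticsearch.annotations.Mapping;

// Hypothetical entity matching the mapping DSL above; adjust names to the real project.
@Document(indexName = "message_index", type = "_doc")
@Mapping(mappingPath = "mappings/message_index.json") // JSON file containing the "properties" block above
public class Message {

    @Id
    private String id;

    // Alternatively, without @Mapping, a simpler per-field setup would be:
    @Field(type = FieldType.Text, analyzer = "ik_smart")
    private String message;

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public String getMessage() { return message; }
    public void setMessage(String message) { this.message = message; }
}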

Full pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.1.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.citydo</groupId>
    <artifactId>sentinel</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>sentinel</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.junit.vintage</groupId>
                    <artifactId>junit-vintage-engine</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>com.alibaba.csp</groupId>
            <artifactId>sentinel-annotation-aspectj</artifactId>
            <version>1.7.0</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba.csp</groupId>
            <artifactId>sentinel-core</artifactId>
            <version>1.7.0</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba.csp</groupId>
            <artifactId>sentinel-transport-simple-http</artifactId>
            <version>1.7.0</version>
        </dependency>
        <dependency>
            <groupId>io.atomix.copycat</groupId>
            <artifactId>copycat-server</artifactId>
            <version>1.1.4</version>
        </dependency>
        <dependency>
            <groupId>io.atomix.copycat</groupId>
            <artifactId>copycat-client</artifactId>
            <version>1.1.4</version>
        </dependency>
        <dependency>
            <groupId>io.atomix.catalyst</groupId>
            <artifactId>catalyst-netty</artifactId>
            <version>1.1.1</version>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
            <version>1.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.7</version>
        </dependency>
        <dependency>
            <groupId>io.atomix</groupId>
            <artifactId>atomix</artifactId>
            <version>3.0.6</version>
        </dependency>
        <dependency>
            <groupId>io.atomix</groupId>
            <artifactId>atomix-raft</artifactId>
            <version>3.0.6</version>
        </dependency>
        <dependency>
            <groupId>io.atomix</groupId>
            <artifactId>atomix-primary-backup</artifactId>
            <version>3.0.6</version>
        </dependency>
        <dependency>
            <groupId>io.atomix</groupId>
            <artifactId>atomix-gossip</artifactId>
            <version>3.0.6</version>
        </dependency>
        <!-- jstorm runtime dependency -->
        <dependency>
            <groupId>com.alibaba.jstorm</groupId>
            <artifactId>jstorm-core</artifactId>
            <version>2.2.1</version>
            <scope>provided</scope>
        </dependency>
        <!-- storm runtime dependency (disabled) -->
        <!--
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-core</artifactId>
            <version>1.0.1</version>
            <scope>provided</scope>
        </dependency>
        -->
        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
            <version>2012_u6</version>
        </dependency>
        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>portable-1.5.3</version>
        </dependency>
        <dependency>
            <groupId>com.hankcs.nlp</groupId>
            <artifactId>hanlp-lucene-plugin</artifactId>
            <version>1.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apdplat</groupId>
            <artifactId>word</artifactId>
            <version>1.3</version>
        </dependency>
        <!--
        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>transport</artifactId>
            <version>6.2.2</version>
        </dependency>
        -->
        <dependency>
            <groupId>org.springframework.data</groupId>
            <artifactId>spring-data-elasticsearch</artifactId>
            <version>3.1.10.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-test</artifactId>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>6.2.2</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>elasticsearch-rest-high-level-client</artifactId>
            <version>6.3.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>

