First, the word count job.

package org.lukey.hadoop.classifyBayes;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/**
 * Gathers every statistic the later jobs need in a single pass, e.g.
 * "AFRICA 484017newsML.txt afford 1", emitted in these formats:
 *
 * 1. "AFRICA 484017newsML.txt", "AFRICA 487141newsML.txt": number of documents
 *    per class ---> prior probability (handled separately); the total document
 *    count across all classes follows from these and also feeds the prior.
 * 2. "AFRICA afford 1", "AFRICA boy 3": each word's count within each class
 *    ---> per-class word probabilities.
 * 3. "AFRICA 768": total words per class ---> sum the counts from 2 that share
 *    the same class key.
 * 4. "ALLWORDS 12345": number of distinct words across all classes ---> merge
 *    the bare word keys from the map output and count them.
 */
public class MyWordCount {

    private static MultipleOutputs<Text, IntWritable> mos;
    static String baseOutputPath = "/user/hadoop/test_out";

    // Two maps: the documents per class, and the document counts per class
    static Map<String, List<String>> fileCountMap = new HashMap<String, List<String>>();
    static Map<String, Integer> fileCount = new HashMap<String, Integer>();
    // static Map<String, List<String>> wordsCountInClassMap = new HashMap<String, List<String>>();

    static enum WordsNature {
        CLASS_NUMBER, CLASS_WORDS, TOTALWORDS
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = { "/user/hadoop/test", "/user/hadoop/mid/wordsFrequence" };
        /*
         * String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
         *
         * if (otherArgs.length != 2) { System.out.println("Usage <in> <out>"); System.exit(-1); }
         */
        Job job = new Job(conf, "file count");
        job.setJarByClass(MyWordCount.class);
        // job.setInputFormatClass(CustomInputFormat.class);
        job.setMapperClass(First_Mapper.class);
        job.setReducerClass(First_Reducer.class);

        Path inputpath = new Path(otherArgs[0]);
        // Use the helper written below to add every file under the input tree
        MyUtils.addInputPath(job, inputpath, conf);
        // CustomInputFormat.setInputPaths(job, inputpath);
        // FileInputFormat.addInputPath(job, inputpath);
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        int exitCode = job.waitForCompletion(true) ? 0 : 1;

        // Read the counter back
        Counters counters = job.getCounters();
        Counter c1 = counters.findCounter(WordsNature.TOTALWORDS);
        System.out.println("-------------->>>>: " + c1.getDisplayName() + ":" + c1.getName() + ": " + c1.getValue());

        // Write the number of distinct words to a file
        Path totalWordsPath = new Path("/user/hadoop/output/totalwords.txt");
        FileSystem fs = FileSystem.get(conf);
        FSDataOutputStream outputStream = fs.create(totalWordsPath);
        outputStream.writeBytes(c1.getDisplayName() + ":" + c1.getValue());

        // Write the number of documents per class to a file.
        // NOTE: fileCountMap is filled inside the mapper, so reading it here only
        // works when the job runs in local mode; on a cluster the mapper runs in
        // a different JVM and this map would be empty.
        Path priorPath = new Path("/user/hadoop/output/priorPro.txt"); // prior probabilities
        for (Map.Entry<String, List<String>> entry : fileCountMap.entrySet()) {
            fileCount.put(entry.getKey(), entry.getValue().size());
        }
        // Total number of documents
        int fileSum = 0;
        for (Integer num : fileCount.values()) {
            fileSum += num;
        }
        System.out.println("fileSum = " + fileSum);
        FSDataOutputStream priorStream = fs.create(priorPath);
        // Compute each class's prior probability and write it out, one entry per
        // line (the original omitted the newline, running all entries together)
        for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
            double p = (double) entry.getValue() / fileSum;
            priorStream.writeBytes(entry.getKey() + ":" + p + "\n");
        }
        IOUtils.closeStream(priorStream);
        IOUtils.closeStream(outputStream);
        // Next time, try passing the vocabulary size through the Configuration:
        // conf.set("TOTALWORDS", totalWords.toString());
        System.exit(exitCode);
    }

    // Mapper
    static class First_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private final static IntWritable zero = new IntWritable(0);
        private Text className = new Text();
        private Text countryName = new Text(); // set below but never emitted

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            // File name
            String fileName = fileSplit.getPath().getName();
            // Parent directory name, i.e. the class label
            String dirName = fileSplit.getPath().getParent().getName();
            className.set(dirName + "\t" + value.toString());
            countryName.set(dirName + "\t" + fileName + "\t" + value.toString());
            // Record the file name so the document count per class can be derived
            if (fileCountMap.containsKey(dirName)) {
                fileCountMap.get(dirName).add(fileName);
            } else {
                List<String> oneList = new ArrayList<String>();
                oneList.add(fileName);
                fileCountMap.put(dirName, oneList);
            }
            context.write(className, one);         // each word of each class, e.g. "ABDBI hello 1"
            context.write(new Text(dirName), one); // word total per class, e.g. "ABDBI 1"
            context.write(value, zero);            // used to count distinct words across all classes
        }
    }

    // Reducer
    static class First_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // result holds each word's count within a class
        IntWritable result = new IntWritable();
        Map<String, List<String>> classMap = new HashMap<String, List<String>>();
        Map<String, List<String>> fileMap = new HashMap<String, List<String>>();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            // sum == 0 means the key is a bare word (only zeros were emitted for
            // it), so bump the distinct-word counter
            if (sum == 0) {
                context.getCounter(WordsNature.TOTALWORDS).increment(1);
            } else {
                // otherwise tell the key shapes apart by the number of tab-separated fields
                String[] temp = key.toString().split("\t");
                if (temp.length == 2) { // class and word separated by a tab
                    result.set(sum);
                    context.write(key, result);
                    // mos.write(new Text(temp[1]), result, temp[0]);
                } else { // word total within a class
                    result.set(sum);
                    mos.write(key, result, "wordsInClass");
                }
                /*
                 * // Earlier attempt, kept for reference:
                 * // first handle the word counts within a class
                 * String[] temp = key.toString().split("\t");
                 * if (temp.length == 2) { // class and word separated by a tab
                 *     if (classMap.containsKey(temp[0])) {
                 *         classMap.get(temp[0]).add(temp[1]);
                 *     } else {
                 *         List<String> oneList = new ArrayList<String>();
                 *         oneList.add(temp[1]);
                 *         classMap.put(temp[0], oneList);
                 *     }
                 *     // mos.write(temp[0], temp[1], result);
                 *     result.set(sum);
                 *     context.write(key, result); // class name, word and count
                 *     // mos.write(temp[0], temp[1], result);
                 * } else if (temp.length == 1) {
                 *     // count files: each entry maps a class to its file-name list,
                 *     // whose length is the document count
                 *     if (fileMap.containsKey(temp[0])) {
                 *         fileMap.get(temp[0]).add(temp[1]);
                 *     } else {
                 *         List<String> oneList = new ArrayList<String>();
                 *         oneList.add(temp[1]);
                 *         fileMap.put(temp[0], oneList);
                 *     }
                 * }
                 * // compute the prior probability
                 * int fileNumberSum = 0;
                 * for (List<String> list : classMap.values()) {
                 *     fileNumberSum += list.size();
                 *     System.out.println(fileNumberSum); // test
                 * }
                 * // save the prior probability
                 * Map<String, Double> priorMap = new HashMap<>();
                 * Iterator<Map.Entry<String, List<String>>> iterators = classMap.entrySet().iterator();
                 * while (iterators.hasNext()) {
                 *     Map.Entry<String, List<String>> iterator = iterators.next();
                 *     double prior = (double) iterator.getValue().size() / fileNumberSum;
                 *     priorMap.put(iterator.getKey(), prior);
                 * }
                 */
                // result.set(sum);
                // context.write(key, result);
            }
        }

        @Override
        protected void cleanup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos.close();
        }

        @Override
        protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, IntWritable>(context);
        }
    }
}
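The subtlest trick above is how the vocabulary size is obtained: the mapper emits (word, 0) in addition to the two counted key shapes, so a key that only ever received zeros in the reducer must be a bare word, and the TOTALWORDS counter fires exactly once per distinct word. A minimal standalone sketch of that idea, with made-up records and a plain HashMap standing in for the shuffle:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class VocabCounterSketch {
    public static void main(String[] args) {
        // key -> values, as they would arrive at the reducer after the shuffle
        Map<String, List<Integer>> grouped = new HashMap<String, List<Integer>>();
        grouped.put("AFRICA\tafford", Arrays.asList(1)); // class + word, counted with ones
        grouped.put("AFRICA", Arrays.asList(1, 1, 1));   // class word total, counted with ones
        grouped.put("afford", Arrays.asList(0, 0));      // bare word, zeros only
        int totalWords = 0;
        for (Map.Entry<String, List<Integer>> e : grouped.entrySet()) {
            int sum = 0;
            for (int v : e.getValue()) sum += v;
            if (sum == 0) totalWords++;                  // distinct-word counter
            else System.out.println(e.getKey() + "\t" + sum);
        }
        System.out.println("TOTALWORDS = " + totalWords); // 1 in this toy run
    }
}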


Recursively adding input paths.

package org.lukey.hadoop.classifyBayes;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class MyUtils {

    // Recursively add input paths; use this when the input directory contains
    // subdirectories, since FileInputFormat.addInputPath does not recurse itself
    static void addInputPath(Job job, Path inputpath, Configuration conf) throws IOException {
        FileSystem fs = FileSystem.get(inputpath.toUri(), conf);
        FileStatus[] fileStatus = fs.listStatus(inputpath);
        for (FileStatus status : fileStatus) {
            if (status.isDir())
                addInputPath(job, status.getPath(), conf);
            else
                FileInputFormat.addInputPath(job, status.getPath());
        }
    }
}
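For reference: on Hadoop 2.x-era releases the same traversal is built into FileInputFormat, so this helper becomes unnecessary there. A hedged sketch assuming a 2.x client (the job name is illustrative; the 1.x API this post targets does not have this method):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class RecursiveInputSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "recursive input demo");
        // Hadoop 2.x+: let FileInputFormat recurse into subdirectories itself
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.addInputPath(job, new Path("/user/hadoop/test")); // input root from this post
    }
}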


Computing each word's probability within each class.

package org.lukey.hadoop.classifyBayes;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class Probability {

    private static final Log LOG = LogFactory.getLog(FileInputFormat.class);
    public static int total = 0;
    private static MultipleOutputs<Text, DoubleWritable> mos;

    // Client
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("mapred.job.tracker", "192.168.190.128:9001");
        conf.set("mapred.jar", "probability.jar");

        // Read the vocabulary size and put it into the Configuration
        String totalWordsPath = "hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt";
        String wordsInClassPath = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsInClass-r-00000";
        conf.set("wordsInClassPath", wordsInClassPath);
        // Map<String, Integer> wordsInClassMap = new HashMap<String, Integer>(); // word totals per class

        // First read the number of distinct words
        FileSystem fs = FileSystem.get(URI.create(totalWordsPath), conf);
        FSDataInputStream inputStream = fs.open(new Path(totalWordsPath));
        BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream));
        String strLine = buffer.readLine();
        String[] temp = strLine.split(":");
        if (temp.length == 2) {
            // temp[0] = TOTALWORDS
            conf.set(temp[0], temp[1]); // both key and value are Strings
        }
        buffer.close();
        total = Integer.parseInt(conf.get("TOTALWORDS"));
        LOG.info("------>total = " + total);
        System.out.println("total ==== " + total);

        /*
         * String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
         *
         * if (otherArgs.length != 2) { System.out.println("Usage <in> <out>"); System.exit(-1); }
         */
        Job job = new Job(conf, "file count");
        job.setJarByClass(Probability.class);
        job.setMapperClass(WordsOfClassCountMapper.class);
        job.setReducerClass(WordsOfClassCountReducer.class);
        String input = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence";
        String output = "hdfs://192.168.190.128:9000/user/hadoop/output/probability/";
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Mapper
    static class WordsOfClassCountMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

        private static DoubleWritable number = new DoubleWritable();
        private static Text className = new Text();

        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            int tot = Integer.parseInt(conf.get("TOTALWORDS"));
            System.out.println("total = " + total);
            System.out.println("tot = " + tot);

            // The input looks like:
            // ALB weekend 1
            // ALB weeks 3
            Map<String, Map<String, Integer>> baseMap = new HashMap<String, Map<String, Integer>>(); // raw counts
            // Map<String, Map<String, Double>> priorMap = new HashMap<String, Map<String, Double>>(); // word probabilities

            String[] temp = value.toString().split("\t");
            // Store the record in baseMap first
            if (temp.length == 3) { // directory name = class name
                if (baseMap.containsKey(temp[0])) {
                    baseMap.get(temp[0]).put(temp[1], Integer.parseInt(temp[2]));
                } else {
                    Map<String, Integer> oneMap = new HashMap<String, Integer>();
                    oneMap.put(temp[1], Integer.parseInt(temp[2]));
                    baseMap.put(temp[0], oneMap);
                }
            } // done reading; everything is in baseMap

            // NOTE: map() is called once per input line, so baseMap only ever holds
            // the current record and allWordsInClass is just that word's own count;
            // the real per-class totals sit in the wordsInClassPath file, which is
            // set in the Configuration above but never read here.
            int allWordsInClass = 0;
            for (Map.Entry<String, Map<String, Integer>> entries : baseMap.entrySet()) { // per class
                for (Map.Entry<String, Integer> entry : entries.getValue().entrySet()) { // sum the word counts
                    allWordsInClass += entry.getValue();
                }
            }
            for (Map.Entry<String, Map<String, Integer>> entries : baseMap.entrySet()) { // per class
                for (Map.Entry<String, Integer> entry : entries.getValue().entrySet()) { // word probability
                    double p = (entry.getValue() + 1.0) / (allWordsInClass + tot); // add-one smoothing
                    className.set(entries.getKey() + "\t" + entry.getKey());
                    number.set(p);
                    LOG.info("------>p = " + p);
                    context.write(className, number);
                }
            }
            /*
             * // Earlier attempt, kept for reference: two nested loops computing
             * // each word's probability within each class
             * Iterator<Map.Entry<String, Map<String, Integer>>> iterators = baseMap.entrySet().iterator();
             * while (iterators.hasNext()) { // iterate over the classes
             *     Map.Entry<String, Map<String, Integer>> iterator = iterators.next();
             *     int allWordsInClass = 0;
             *
             *     for (Integer num : iterator.getValue().values()) {
             *         allWordsInClass += num;
             *     }
             *
             *     for (Map.Entry<String, Integer> entry : iterator.getValue().entrySet()) {
             *         // walk the words to get the class's word total first
             *         allWordsInClass += entry.getValue();
             *     }
             *
             *     System.out.println(allWordsInClass); // this value was never computed correctly
             *     // Map<String, Double> pMap = new HashMap<String, Double>();
             *     for (Map.Entry<String, Integer> entry : iterator.getValue().entrySet()) {
             *         // then walk each word's count and compute its probability
             *         double p = (entry.getValue() + 1.0) / (allWordsInClass + tot);
             *         // pMap.put(entry.getKey(), p);
             *         priorMap.put(iterator.getKey(), pMap);
             *         className.set(iterator.getKey() + "\t" + entry.getKey());
             *         number.set(p);
             *         LOG.info("------>p = " + p);
             *
             *         context.write(className, number);
             *         // mos.write(iterator.getKey(), entry.getKey(), p);
             *     }
             * }
             */
            /*
             * value.set(temp[1]); number.set(Integer.parseInt(temp[2]));
             * mos.write(value, number, dirName);
             */
        }

        protected void cleanup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            mos.close();
        }

        protected void setup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, DoubleWritable>(context);
        }
    }

    // Reducer
    static class WordsOfClassCountReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

        // result holds each key's summed value
        DoubleWritable result = new DoubleWritable();
        // Configuration conf = new Configuration();
        // int total = conf.getInt("TOTALWORDS", 1);

        protected void reduce(Text key, Iterable<DoubleWritable> values,
                Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            for (DoubleWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}
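The value the mapper writes is add-one (Laplace) smoothing: p(word | class) = (count(word, class) + 1) / (wordsInClass + vocabularySize). Note the caveat flagged in the code: map() sees one line at a time, so the allWordsInClass it computes is only that line's own count, not the per-class total the first job wrote to wordsInClass-r-00000. A toy check of the formula itself, using the example figures from the first job's comments:

public class SmoothingSketch {
    public static void main(String[] args) {
        int count = 3;          // e.g. the "AFRICA boy 3" record
        int wordsInClass = 768; // e.g. the "AFRICA 768" words-in-class record
        int vocabulary = 12345; // e.g. the TOTALWORDS counter value
        double p = (count + 1.0) / (wordsInClass + vocabulary);
        System.out.println(p);  // 4.0 / 13113 ≈ 3.05e-4
    }
}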


This basically runs end to end, though plenty still needs adjusting and revising; consider this post a marker of progress.

Still to come: using each class's word probabilities to predict the class of test documents.

Finally, the classification accuracy has to be computed to evaluate how good the classifier is; a sketch of both follow-up steps is below.
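As a rough picture of those follow-up steps, here is a hedged sketch; the names (priors, wordProb, unseenProb) are illustrative, not from the original code. It scores a test document against each class with log probabilities (log prior plus the sum of log word probabilities, taking the argmax) and computes accuracy as the fraction of correctly classified test documents:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ClassifySketch {

    // Pick the class maximizing log P(class) + sum over words of log P(word | class)
    static String classify(List<String> docWords, Map<String, Double> priors,
            Map<String, Map<String, Double>> wordProb, double unseenProb) {
        String best = null;
        double bestScore = Double.NEGATIVE_INFINITY;
        for (Map.Entry<String, Double> prior : priors.entrySet()) {
            double score = Math.log(prior.getValue()); // prior from priorPro.txt
            Map<String, Double> probs = wordProb.get(prior.getKey());
            for (String w : docWords) {
                // fall back to a small smoothed value for words unseen in this class
                Double p = (probs == null) ? null : probs.get(w);
                score += Math.log(p != null ? p : unseenProb);
            }
            if (score > bestScore) {
                bestScore = score;
                best = prior.getKey();
            }
        }
        return best;
    }

    // accuracy = correctly classified test documents / all test documents
    static double accuracy(int correct, int total) {
        return (double) correct / total;
    }

    public static void main(String[] args) {
        Map<String, Double> priors = new HashMap<String, Double>();
        priors.put("AFRICA", 0.4);
        priors.put("ALB", 0.6);
        Map<String, Map<String, Double>> wordProb = new HashMap<String, Map<String, Double>>();
        wordProb.put("AFRICA", new HashMap<String, Double>());
        wordProb.put("ALB", new HashMap<String, Double>());
        wordProb.get("AFRICA").put("afford", 3.0e-4);
        wordProb.get("ALB").put("afford", 1.0e-5);
        String predicted = classify(Arrays.asList("afford"), priors, wordProb, 1.0e-6);
        System.out.println(predicted);       // AFRICA: the larger word likelihood beats the smaller prior
        System.out.println(accuracy(9, 10)); // 0.9
    }
}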

Reposted from: https://www.cnblogs.com/luolizhi/p/4943456.html
