First, the word count job.

package org.lukey.hadoop.classifyBayes;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/**
 * Gathers every statistic the later jobs need in a single pass, e.g.
 * "AFRICA 484017newsML.txt afford 1", emitted in these formats:
 *
 * 1. "AFRICA 484017newsML.txt", "AFRICA 487141newsML.txt": number of documents
 *    per class ---> prior probability (handled separately); the total document
 *    count across all classes follows from these and also feeds the prior.
 * 2. "AFRICA afford 1", "AFRICA boy 3": each word's count within each class
 *    ---> per-class word probabilities.
 * 3. "AFRICA 768": total words per class ---> sum the counts from 2 that share
 *    the same class key.
 * 4. "ALLWORDS 12345": number of distinct words across all classes ---> merge
 *    the bare word keys from the map output and count them.
 */
public class MyWordCount {

    private static MultipleOutputs<Text, IntWritable> mos;
    static String baseOutputPath = "/user/hadoop/test_out";

    // Two maps: the documents per class, and the document counts per class
    static Map<String, List<String>> fileCountMap = new HashMap<String, List<String>>();
    static Map<String, Integer> fileCount = new HashMap<String, Integer>();
    // static Map<String, List<String>> wordsCountInClassMap = new HashMap<String, List<String>>();

    static enum WordsNature {
        CLASS_NUMBER, CLASS_WORDS, TOTALWORDS
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = { "/user/hadoop/test", "/user/hadoop/mid/wordsFrequence" };
        /*
         * String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
         *
         * if (otherArgs.length != 2) { System.out.println("Usage <in> <out>"); System.exit(-1); }
         */
        Job job = new Job(conf, "file count");
        job.setJarByClass(MyWordCount.class);
        // job.setInputFormatClass(CustomInputFormat.class);
        job.setMapperClass(First_Mapper.class);
        job.setReducerClass(First_Reducer.class);

        Path inputpath = new Path(otherArgs[0]);
        // Use the helper written below to add every file under the input tree
        MyUtils.addInputPath(job, inputpath, conf);
        // CustomInputFormat.setInputPaths(job, inputpath);
        // FileInputFormat.addInputPath(job, inputpath);
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        int exitCode = job.waitForCompletion(true) ? 0 : 1;

        // Read the counter back
        Counters counters = job.getCounters();
        Counter c1 = counters.findCounter(WordsNature.TOTALWORDS);
        System.out.println("-------------->>>>: " + c1.getDisplayName() + ":" + c1.getName() + ": " + c1.getValue());

        // Write the number of distinct words to a file
        Path totalWordsPath = new Path("/user/hadoop/output/totalwords.txt");
        FileSystem fs = FileSystem.get(conf);
        FSDataOutputStream outputStream = fs.create(totalWordsPath);
        outputStream.writeBytes(c1.getDisplayName() + ":" + c1.getValue());

        // Write the number of documents per class to a file.
        // NOTE: fileCountMap is filled inside the mapper, so reading it here only
        // works when the job runs in local mode; on a cluster the mapper runs in
        // a different JVM and this map would be empty.
        Path priorPath = new Path("/user/hadoop/output/priorPro.txt"); // prior probabilities
        for (Map.Entry<String, List<String>> entry : fileCountMap.entrySet()) {
            fileCount.put(entry.getKey(), entry.getValue().size());
        }
        // Total number of documents
        int fileSum = 0;
        for (Integer num : fileCount.values()) {
            fileSum += num;
        }
        System.out.println("fileSum = " + fileSum);
        FSDataOutputStream priorStream = fs.create(priorPath);
        // Compute each class's prior probability and write it out, one entry per
        // line (the original omitted the newline, running all entries together)
        for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
            double p = (double) entry.getValue() / fileSum;
            priorStream.writeBytes(entry.getKey() + ":" + p + "\n");
        }
        IOUtils.closeStream(priorStream);
        IOUtils.closeStream(outputStream);
        // Next time, try passing the vocabulary size through the Configuration:
        // conf.set("TOTALWORDS", totalWords.toString());
        System.exit(exitCode);
    }

    // Mapper
    static class First_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private final static IntWritable zero = new IntWritable(0);
        private Text className = new Text();
        private Text countryName = new Text(); // set below but never emitted

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            // File name
            String fileName = fileSplit.getPath().getName();
            // Parent directory name, i.e. the class label
            String dirName = fileSplit.getPath().getParent().getName();
            className.set(dirName + "\t" + value.toString());
            countryName.set(dirName + "\t" + fileName + "\t" + value.toString());
            // Record the file name so the document count per class can be derived
            if (fileCountMap.containsKey(dirName)) {
                fileCountMap.get(dirName).add(fileName);
            } else {
                List<String> oneList = new ArrayList<String>();
                oneList.add(fileName);
                fileCountMap.put(dirName, oneList);
            }
            context.write(className, one);         // each word of each class, e.g. "ABDBI hello 1"
            context.write(new Text(dirName), one); // word total per class, e.g. "ABDBI 1"
            context.write(value, zero);            // used to count distinct words across all classes
        }
    }

    // Reducer
    static class First_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // result holds each word's count within a class
        IntWritable result = new IntWritable();
        Map<String, List<String>> classMap = new HashMap<String, List<String>>();
        Map<String, List<String>> fileMap = new HashMap<String, List<String>>();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            // sum == 0 means the key is a bare word (only zeros were emitted for
            // it), so bump the distinct-word counter
            if (sum == 0) {
                context.getCounter(WordsNature.TOTALWORDS).increment(1);
            } else {
                // otherwise tell the key shapes apart by the number of tab-separated fields
                String[] temp = key.toString().split("\t");
                if (temp.length == 2) { // class and word separated by a tab
                    result.set(sum);
                    context.write(key, result);
                    // mos.write(new Text(temp[1]), result, temp[0]);
                } else { // word total within a class
                    result.set(sum);
                    mos.write(key, result, "wordsInClass");
                }
                /*
                 * // Earlier attempt, kept for reference:
                 * // first handle the word counts within a class
                 * String[] temp = key.toString().split("\t");
                 * if (temp.length == 2) { // class and word separated by a tab
                 *     if (classMap.containsKey(temp[0])) {
                 *         classMap.get(temp[0]).add(temp[1]);
                 *     } else {
                 *         List<String> oneList = new ArrayList<String>();
                 *         oneList.add(temp[1]);
                 *         classMap.put(temp[0], oneList);
                 *     }
                 *     // mos.write(temp[0], temp[1], result);
                 *     result.set(sum);
                 *     context.write(key, result); // class name, word and count
                 *     // mos.write(temp[0], temp[1], result);
                 * } else if (temp.length == 1) {
                 *     // count files: each entry maps a class to its file-name list,
                 *     // whose length is the document count
                 *     if (fileMap.containsKey(temp[0])) {
                 *         fileMap.get(temp[0]).add(temp[1]);
                 *     } else {
                 *         List<String> oneList = new ArrayList<String>();
                 *         oneList.add(temp[1]);
                 *         fileMap.put(temp[0], oneList);
                 *     }
                 * }
                 * // compute the prior probability
                 * int fileNumberSum = 0;
                 * for (List<String> list : classMap.values()) {
                 *     fileNumberSum += list.size();
                 *     System.out.println(fileNumberSum); // test
                 * }
                 * // save the prior probability
                 * Map<String, Double> priorMap = new HashMap<>();
                 * Iterator<Map.Entry<String, List<String>>> iterators = classMap.entrySet().iterator();
                 * while (iterators.hasNext()) {
                 *     Map.Entry<String, List<String>> iterator = iterators.next();
                 *     double prior = (double) iterator.getValue().size() / fileNumberSum;
                 *     priorMap.put(iterator.getKey(), prior);
                 * }
                 */
                // result.set(sum);
                // context.write(key, result);
            }
        }

        @Override
        protected void cleanup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos.close();
        }

        @Override
        protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, IntWritable>(context);
        }
    }
}
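The subtlest trick above is how the vocabulary size is obtained: the mapper emits (word, 0) in addition to the two counted key shapes, so a key that only ever received zeros in the reducer must be a bare word, and the TOTALWORDS counter fires exactly once per distinct word. A minimal standalone sketch of that idea, with made-up records and a plain HashMap standing in for the shuffle:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class VocabCounterSketch {
    public static void main(String[] args) {
        // key -> values, as they would arrive at the reducer after the shuffle
        Map<String, List<Integer>> grouped = new HashMap<String, List<Integer>>();
        grouped.put("AFRICA\tafford", Arrays.asList(1)); // class + word, counted with ones
        grouped.put("AFRICA", Arrays.asList(1, 1, 1));   // class word total, counted with ones
        grouped.put("afford", Arrays.asList(0, 0));      // bare word, zeros only
        int totalWords = 0;
        for (Map.Entry<String, List<Integer>> e : grouped.entrySet()) {
            int sum = 0;
            for (int v : e.getValue()) sum += v;
            if (sum == 0) totalWords++;                  // distinct-word counter
            else System.out.println(e.getKey() + "\t" + sum);
        }
        System.out.println("TOTALWORDS = " + totalWords); // 1 in this toy run
    }
}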


Recursively adding input paths.

package org.lukey.hadoop.classifyBayes;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class MyUtils {

    // Recursively add input paths; use this when the input directory contains
    // subdirectories, since FileInputFormat.addInputPath does not recurse itself
    static void addInputPath(Job job, Path inputpath, Configuration conf) throws IOException {
        FileSystem fs = FileSystem.get(inputpath.toUri(), conf);
        FileStatus[] fileStatus = fs.listStatus(inputpath);
        for (FileStatus status : fileStatus) {
            if (status.isDir())
                addInputPath(job, status.getPath(), conf);
            else
                FileInputFormat.addInputPath(job, status.getPath());
        }
    }
}
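For reference: on Hadoop 2.x-era releases the same traversal is built into FileInputFormat, so this helper becomes unnecessary there. A hedged sketch assuming a 2.x client (the job name is illustrative; the 1.x API this post targets does not have this method):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class RecursiveInputSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "recursive input demo");
        // Hadoop 2.x+: let FileInputFormat recurse into subdirectories itself
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.addInputPath(job, new Path("/user/hadoop/test")); // input root from this post
    }
}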


Computing each word's probability within each class.

package org.lukey.hadoop.classifyBayes;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class Probability {

    private static final Log LOG = LogFactory.getLog(FileInputFormat.class);
    public static int total = 0;
    private static MultipleOutputs<Text, DoubleWritable> mos;

    // Client
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("mapred.job.tracker", "192.168.190.128:9001");
        conf.set("mapred.jar", "probability.jar");

        // Read the vocabulary size and put it into the Configuration
        String totalWordsPath = "hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt";
        String wordsInClassPath = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsInClass-r-00000";
        conf.set("wordsInClassPath", wordsInClassPath);
        // Map<String, Integer> wordsInClassMap = new HashMap<String, Integer>(); // word totals per class

        // First read the number of distinct words
        FileSystem fs = FileSystem.get(URI.create(totalWordsPath), conf);
        FSDataInputStream inputStream = fs.open(new Path(totalWordsPath));
        BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream));
        String strLine = buffer.readLine();
        String[] temp = strLine.split(":");
        if (temp.length == 2) {
            // temp[0] = TOTALWORDS
            conf.set(temp[0], temp[1]); // both key and value are Strings
        }
        buffer.close();
        total = Integer.parseInt(conf.get("TOTALWORDS"));
        LOG.info("------>total = " + total);
        System.out.println("total ==== " + total);

        /*
         * String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
         *
         * if (otherArgs.length != 2) { System.out.println("Usage <in> <out>"); System.exit(-1); }
         */
        Job job = new Job(conf, "file count");
        job.setJarByClass(Probability.class);
        job.setMapperClass(WordsOfClassCountMapper.class);
        job.setReducerClass(WordsOfClassCountReducer.class);
        String input = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence";
        String output = "hdfs://192.168.190.128:9000/user/hadoop/output/probability/";
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Mapper
    static class WordsOfClassCountMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

        private static DoubleWritable number = new DoubleWritable();
        private static Text className = new Text();

        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            int tot = Integer.parseInt(conf.get("TOTALWORDS"));
            System.out.println("total = " + total);
            System.out.println("tot = " + tot);

            // The input looks like:
            // ALB weekend 1
            // ALB weeks 3
            Map<String, Map<String, Integer>> baseMap = new HashMap<String, Map<String, Integer>>(); // raw counts
            // Map<String, Map<String, Double>> priorMap = new HashMap<String, Map<String, Double>>(); // word probabilities

            String[] temp = value.toString().split("\t");
            // Store the record in baseMap first
            if (temp.length == 3) { // directory name = class name
                if (baseMap.containsKey(temp[0])) {
                    baseMap.get(temp[0]).put(temp[1], Integer.parseInt(temp[2]));
                } else {
                    Map<String, Integer> oneMap = new HashMap<String, Integer>();
                    oneMap.put(temp[1], Integer.parseInt(temp[2]));
                    baseMap.put(temp[0], oneMap);
                }
            } // done reading; everything is in baseMap

            // NOTE: map() is called once per input line, so baseMap only ever holds
            // the current record and allWordsInClass is just that word's own count;
            // the real per-class totals sit in the wordsInClassPath file, which is
            // set in the Configuration above but never read here.
            int allWordsInClass = 0;
            for (Map.Entry<String, Map<String, Integer>> entries : baseMap.entrySet()) { // per class
                for (Map.Entry<String, Integer> entry : entries.getValue().entrySet()) { // sum the word counts
                    allWordsInClass += entry.getValue();
                }
            }
            for (Map.Entry<String, Map<String, Integer>> entries : baseMap.entrySet()) { // per class
                for (Map.Entry<String, Integer> entry : entries.getValue().entrySet()) { // word probability
                    double p = (entry.getValue() + 1.0) / (allWordsInClass + tot); // add-one smoothing
                    className.set(entries.getKey() + "\t" + entry.getKey());
                    number.set(p);
                    LOG.info("------>p = " + p);
                    context.write(className, number);
                }
            }
            /*
             * // Earlier attempt, kept for reference: two nested loops computing
             * // each word's probability within each class
             * Iterator<Map.Entry<String, Map<String, Integer>>> iterators = baseMap.entrySet().iterator();
             * while (iterators.hasNext()) { // iterate over the classes
             *     Map.Entry<String, Map<String, Integer>> iterator = iterators.next();
             *     int allWordsInClass = 0;
             *
             *     for (Integer num : iterator.getValue().values()) {
             *         allWordsInClass += num;
             *     }
             *
             *     for (Map.Entry<String, Integer> entry : iterator.getValue().entrySet()) {
             *         // walk the words to get the class's word total first
             *         allWordsInClass += entry.getValue();
             *     }
             *
             *     System.out.println(allWordsInClass); // this value was never computed correctly
             *     // Map<String, Double> pMap = new HashMap<String, Double>();
             *     for (Map.Entry<String, Integer> entry : iterator.getValue().entrySet()) {
             *         // then walk each word's count and compute its probability
             *         double p = (entry.getValue() + 1.0) / (allWordsInClass + tot);
             *         // pMap.put(entry.getKey(), p);
             *         priorMap.put(iterator.getKey(), pMap);
             *         className.set(iterator.getKey() + "\t" + entry.getKey());
             *         number.set(p);
             *         LOG.info("------>p = " + p);
             *
             *         context.write(className, number);
             *         // mos.write(iterator.getKey(), entry.getKey(), p);
             *     }
             * }
             */
            /*
             * value.set(temp[1]); number.set(Integer.parseInt(temp[2]));
             * mos.write(value, number, dirName);
             */
        }

        protected void cleanup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            mos.close();
        }

        protected void setup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, DoubleWritable>(context);
        }
    }

    // Reducer
    static class WordsOfClassCountReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

        // result holds each key's summed value
        DoubleWritable result = new DoubleWritable();
        // Configuration conf = new Configuration();
        // int total = conf.getInt("TOTALWORDS", 1);

        protected void reduce(Text key, Iterable<DoubleWritable> values,
                Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            for (DoubleWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}
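The value the mapper writes is add-one (Laplace) smoothing: p(word | class) = (count(word, class) + 1) / (wordsInClass + vocabularySize). Note the caveat flagged in the code: map() sees one line at a time, so the allWordsInClass it computes is only that line's own count, not the per-class total the first job wrote to wordsInClass-r-00000. A toy check of the formula itself, using the example figures from the first job's comments:

public class SmoothingSketch {
    public static void main(String[] args) {
        int count = 3;          // e.g. the "AFRICA boy 3" record
        int wordsInClass = 768; // e.g. the "AFRICA 768" words-in-class record
        int vocabulary = 12345; // e.g. the TOTALWORDS counter value
        double p = (count + 1.0) / (wordsInClass + vocabulary);
        System.out.println(p);  // 4.0 / 13113 ≈ 3.05e-4
    }
}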


This basically runs end to end, though plenty still needs adjusting and revising; consider this post a marker of progress.

Still to come: using each class's word probabilities to predict the class of test documents.

Finally, the classification accuracy has to be computed to evaluate how good the classifier is; a sketch of both follow-up steps is below.
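As a rough picture of those follow-up steps, here is a hedged sketch; the names (priors, wordProb, unseenProb) are illustrative, not from the original code. It scores a test document against each class with log probabilities (log prior plus the sum of log word probabilities, taking the argmax) and computes accuracy as the fraction of correctly classified test documents:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ClassifySketch {

    // Pick the class maximizing log P(class) + sum over words of log P(word | class)
    static String classify(List<String> docWords, Map<String, Double> priors,
            Map<String, Map<String, Double>> wordProb, double unseenProb) {
        String best = null;
        double bestScore = Double.NEGATIVE_INFINITY;
        for (Map.Entry<String, Double> prior : priors.entrySet()) {
            double score = Math.log(prior.getValue()); // prior from priorPro.txt
            Map<String, Double> probs = wordProb.get(prior.getKey());
            for (String w : docWords) {
                // fall back to a small smoothed value for words unseen in this class
                Double p = (probs == null) ? null : probs.get(w);
                score += Math.log(p != null ? p : unseenProb);
            }
            if (score > bestScore) {
                bestScore = score;
                best = prior.getKey();
            }
        }
        return best;
    }

    // accuracy = correctly classified test documents / all test documents
    static double accuracy(int correct, int total) {
        return (double) correct / total;
    }

    public static void main(String[] args) {
        Map<String, Double> priors = new HashMap<String, Double>();
        priors.put("AFRICA", 0.4);
        priors.put("ALB", 0.6);
        Map<String, Map<String, Double>> wordProb = new HashMap<String, Map<String, Double>>();
        wordProb.put("AFRICA", new HashMap<String, Double>());
        wordProb.put("ALB", new HashMap<String, Double>());
        wordProb.get("AFRICA").put("afford", 3.0e-4);
        wordProb.get("ALB").put("afford", 1.0e-5);
        String predicted = classify(Arrays.asList("afford"), priors, wordProb, 1.0e-6);
        System.out.println(predicted);       // AFRICA: the larger word likelihood beats the smaller prior
        System.out.println(accuracy(9, 10)); // 0.9
    }
}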

Reposted from: https://www.cnblogs.com/luolizhi/p/4943456.html
