Hadoop1.2.0开发笔记（九）

人类学习的方式在很大程度上始于模仿，“古者包牺氏之王天下也……作结绳而为网罟，以佃以渔，盖取诸离”，古人从自然法则中求生存，逐步走出蒙昧，人法地，地法天，天法道，道法自然。（历代对本句训诂汗牛充栋，还不如本人的解释来得直接，顺便鄙视一下那些训诂专家，小题大做，愚不可及）

本文要做的，就是先模仿几个Hadoop的example，以增强对Hadoop编程的感悟能力。

通过下面几个example，可以加深对MapReduce具体处理过程的理解，包括输入输出的类型以及shuffle的功能。

1 数据去重

public class Dedup {//map将输入中的value复制到输出数据的key上，并直接输出public static class Map extends Mapper<Object,Text,Text,Text>{private static Text line=new Text();//每行数据       //实现map函数public void map(Object key,Text value,Context context)throws IOException,InterruptedException{line=value;context.write(line, new Text(""));}}   //reduce将输入中的key复制到输出数据的key上，并直接输出public static class Reduce extends Reducer<Text,Text,Text,Text>{//实现reduce函数public void reduce(Text key,Iterable<Text> values,Context context)throws IOException,InterruptedException{context.write(key, new Text(""));}}/*** @param args*/public static void main(String[] args) throws Exception {        // TODO Auto-generated method stubConfiguration conf = new Configuration();        String[] ioArgs=new String[]{"dedup_in","dedup_out"};String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();if (otherArgs.length != 2) {System.err.println("Usage: Data Deduplication <in> <out>");System.exit(2);}Job job = new Job(conf, "Data Deduplication");job.setJarByClass(Dedup.class);         //设置Map、Combine和Reduce处理类job.setMapperClass(Map.class);job.setCombinerClass(Reduce.class);job.setReducerClass(Reduce.class);         //设置输出类型job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);         //设置输入和输出目录FileInputFormat.addInputPath(job, new Path(otherArgs[0]));FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}

2 数据排序

public class Sort {// map将输入中的value化成IntWritable类型，作为输出的keypublic static class Map extendsMapper<Object, Text, IntWritable, IntWritable> {private static IntWritable data = new IntWritable();// 实现map函数public void map(Object key, Text value, Context context)throws IOException, InterruptedException {String line = value.toString();data.set(Integer.parseInt(line));context.write(data, new IntWritable(1));}}// reduce将输入中的key复制到输出数据的key上，// 然后根据输入的value-list中元素的个数决定key的输出次数// 用全局linenum来代表key的位次public static class Reduce extendsReducer<IntWritable, IntWritable, IntWritable, IntWritable> {private static IntWritable linenum = new IntWritable(1);// 实现reduce函数public void reduce(IntWritable key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {for (IntWritable val : values) {context.write(linenum, key);linenum = new IntWritable(linenum.get() + 1);}}}/*** @param args* @throws Exception*/public static void main(String[] args) throws Exception {// TODO Auto-generated method stubConfiguration conf = new Configuration();String[] ioArgs = new String[] { "sort_in", "sort_out" };String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();if (otherArgs.length != 2) {System.err.println("Usage: Data Sort <in> <out>");System.exit(2);}Job job = new Job(conf, "Data Sort");job.setJarByClass(Sort.class);// 设置Map和Reduce处理类job.setMapperClass(Map.class);job.setReducerClass(Reduce.class);// 设置输出类型job.setOutputKeyClass(IntWritable.class);job.setOutputValueClass(IntWritable.class);// 设置输入和输出目录FileInputFormat.addInputPath(job, new Path(otherArgs[0]));FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}

3 平均成绩

public class Score {public static class Map extends    Mapper<LongWritable, Text, Text, IntWritable> {// 实现map函数public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {// 将输入的纯文本文件的数据转化成StringString line = value.toString();// 将输入的数据首先按行进行分割StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");// 分别对每一行进行处理while (tokenizerArticle.hasMoreElements()) {// 每行按空格划分StringTokenizer tokenizerLine = new StringTokenizer(tokenizerArticle.nextToken());String strName = tokenizerLine.nextToken();// 学生姓名部分String strScore = tokenizerLine.nextToken();// 成绩部分
Text name = new Text(strName);int scoreInt = Integer.parseInt(strScore);// 输出姓名和成绩context.write(name, new IntWritable(scoreInt));}}}public static class Reduce extendsReducer<Text, IntWritable, Text, IntWritable> {// 实现reduce函数public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {int sum = 0;int count = 0;Iterator<IntWritable> iterator = values.iterator();while (iterator.hasNext()) {sum += iterator.next().get();// 计算总分count++;// 统计总的科目数
            }int average = (int) sum / count;// 计算平均成绩context.write(key, new IntWritable(average));}}/*** @param args* @throws Exception */public static void main(String[] args) throws Exception {// TODO Auto-generated method stubConfiguration conf = new Configuration();String[] ioArgs = new String[] { "score_in", "score_out" };String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();if (otherArgs.length != 2) {System.err.println("Usage: Score Average <in> <out>");System.exit(2);}Job job = new Job(conf, "Score Average");job.setJarByClass(Score.class);// 设置Map、Combine和Reduce处理类job.setMapperClass(Map.class);job.setCombinerClass(Reduce.class);job.setReducerClass(Reduce.class);// 设置输出类型job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);// 将输入的数据集分割成小数据块splites，提供一个RecordReader的实现job.setInputFormatClass(TextInputFormat.class);// 提供一个RecordWriter的实现，负责数据输出job.setOutputFormatClass(TextOutputFormat.class);// 设置输入和输出目录FileInputFormat.addInputPath(job, new Path(otherArgs[0]));FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}

4 倒排索引

public class InvertedIndex {public static class Map extends Mapper<Object, Text, Text, Text> {private Text keyInfo = new Text(); // 存储单词和URL组合private Text valueInfo = new Text(); // 存储词频private FileSplit split; // 存储Split对象// 实现map函数public void map(Object key, Text value, Context context)throws IOException, InterruptedException {// 获得<key,value>对所属的FileSplit对象split = (FileSplit) context.getInputSplit();StringTokenizer itr = new StringTokenizer(value.toString());while (itr.hasMoreTokens()) {// key值由单词和URL组成，如"MapReduce：file1.txt"// 获取文件的完整路径// keyInfo.set(itr.nextToken()+":"+split.getPath().toString());// 这里为了好看，只获取文件的名称。int splitIndex = split.getPath().toString().indexOf("file");keyInfo.set(itr.nextToken() + ":"+ split.getPath().toString().substring(splitIndex));// 词频初始化为1valueInfo.set("1");context.write(keyInfo, valueInfo);}}}public static class Combine extends Reducer<Text, Text, Text, Text> {private Text info = new Text();// 实现reduce函数public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {// 统计词频int sum = 0;for (Text value : values) {sum += Integer.parseInt(value.toString());}int splitIndex = key.toString().indexOf(":");// 重新设置value值由URL和词频组成info.set(key.toString().substring(splitIndex + 1) + ":" + sum);// 重新设置key值为单词key.set(key.toString().substring(0, splitIndex));context.write(key, info);}}public static class Reduce extends Reducer<Text, Text, Text, Text> {private Text result = new Text();// 实现reduce函数public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {// 生成文档列表String fileList = new String();for (Text value : values) {fileList += value.toString() + ";";}result.set(fileList);context.write(key, result);}}/*** @param args* @throws Exception*/public static void main(String[] args) throws Exception {// TODO Auto-generated method stubConfiguration conf = new Configuration();String[] ioArgs = new String[] { "index_in", "index_out" };String[] otherArgs = 
new GenericOptionsParser(conf, ioArgs).getRemainingArgs();if (otherArgs.length != 2) {System.err.println("Usage: Inverted Index <in> <out>");System.exit(2);}Job job = new Job(conf, "Inverted Index");job.setJarByClass(InvertedIndex.class);// 设置Map、Combine和Reduce处理类job.setMapperClass(Map.class);job.setCombinerClass(Combine.class);job.setReducerClass(Reduce.class);// 设置Map输出类型job.setMapOutputKeyClass(Text.class);job.setMapOutputValueClass(Text.class);// 设置Reduce输出类型job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);// 设置输入和输出目录FileInputFormat.addInputPath(job, new Path(otherArgs[0]));FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}

---------------------------------------------------------------------------

本系列Hadoop1.2.0开发笔记系本人原创

转载请注明出处：博客园 刺猬的温驯

本文链接 http://www.cnblogs.com/chenying99/archive/2013/06/03/3114564.html

转载于:https://www.cnblogs.com/chenying99/archive/2013/06/03/3114564.html

Hadoop1.2.0开发笔记（九）相关推荐

Hadoop1.2.0开发笔记（八）
本人一贯的风格是先了解系统的基础部分,然后在深入到高级部分:如果违背这种循序渐进的次序,也超出了本人的接受能力.古人说,学有本末,事有终始,知所先后,则尽道矣.我们还是从基础开始吧(本人上文提到的开发 ...
Kinect for Windows SDK v2.0 开发笔记 (十) 高清面部帧(1) FACS 介绍
转载于:https://blog.csdn.net/dustpg/article/details/38892783 使用SDK: Kinect for Windows SDK v2.0 public ...
树莓派开发笔记(九)：基于CSI口的摄像头拍照程序(同样适用USB摄像头)
若该文为原创文章,未经允许不得转载原博主博客地址:https://blog.csdn.net/qq21497936 原博主博客导航:https://blog.csdn.net/qq21497936/ ...
树莓派android摄像头驱动开发,树莓派开发笔记(九)：基于CSI口的摄像头拍照程序(同样适用USB摄像头)...
前话前面开发完GPIO口的功能,还剩下2个接口,一个是摄像头,一个是显示屏.本篇我们开发一个基于CSI接口的摄像头拍照程序. Demo:摄像头CSI的拍照程序树莓派摄像头笔者买的不是USB网络摄 ...
Extjs4.0 开发笔记-desktop开始菜单动态生成方法
desktop开始菜单动态生成方法: Desktop.html中,在<scripts>中的Ext.onReady之前添加如下: var mArr = [];//这里是保存显示模块的数组va ...
Kinect for Windows SDK v2.0 开发笔记 (十三) 高清面部帧(4) 面部模型构建器
(转载请注明出处) 使用SDK: Kinect for Windows SDK v2.0 public preview1409 同前面,因为SDK未完成,不附上函数/方法/接口的超链接. 这次让 ...
Kinect for Windows SDK v2.0 开发笔记 (十二) 高清面部帧(3) 面部模型(2D)
(转载请注明出处) 使用SDK: Kinect for Windows SDK v2.0 public preview1409 同前面,因为SDK未完成,不附上函数/方法/接口的超链接. 是的, ...
Cocos2d-x 3.0 开发（九）使用Physicals代替Box2D和chipmunk
1. 概述游戏中模拟真实的世界是个比较麻烦的事情,通常这种事情都是交给物理引擎来做.首屈一指的是Box2D了,它几乎能模拟所有的物理效果.而chipmunk则是个更轻量的引擎,能够满足简单的物理 ...
Kinect for Windows SDK v2.0 开发笔记 (五)骨骼帧与笑面男
(转载请注明出处) 使用SDK: Kinect for Windows SDK v2.0 public preview 这次说说这骨骼帧的获取.嗯,Kinect买来就为这个啊.不然其他数据,买其他产品 ...
eclipse hadoop1.2.0配置及wordcount运行
"error: failure to login"问题 http://www.cnblogs.com/xia520pi/archive/2012/05/20/2510723.htm ...

Hadoop1.2.0开发笔记（九）

Hadoop1.2.0开发笔记（九）相关推荐

最新文章

热门文章