一、关于此次实践

1、实战简介

MapReduce是Hadoop的核心功能之一，掌握它对学习Hadoop至关重要。

Hadoop Map/Reduce是一个使用简易的软件框架，基于它写出来的应用程序能够运行在由上千台商用机器组成的大型集群上，并以一种可靠容错的方式并行处理上TB级别的数据集。

本章我们通过几个示例来学习MapReduce的用法。

2、全部任务

二、实践详解

1、第 1 关：成绩统计

import java.io.IOException;
import java.util.StringTokenizer;import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;public class WordCount {/********** Begin **********///Mapper函数public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {private final static IntWritable one = new IntWritable(1);private Text word = new Text();private int maxValue = 0;public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {StringTokenizer itr = new StringTokenizer(value.toString(),"\n");while (itr.hasMoreTokens()) {String[] str = itr.nextToken().split(" ");String name = str[0];one.set(Integer.parseInt(str[1]));word.set(name);context.write(word,one);}//context.write(word,one);}}public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {private IntWritable result = new IntWritable();public void reduce(Text key, Iterable<IntWritable> values, Context context)throws IOException, InterruptedException {int maxAge = 0;int age = 0;for (IntWritable intWritable : values) {maxAge = Math.max(maxAge, intWritable.get());}result.set(maxAge);context.write(key, result);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();Job job = new Job(conf, "word count");job.setJarByClass(WordCount.class);job.setMapperClass(TokenizerMapper.class);job.setCombinerClass(IntSumReducer.class);job.setReducerClass(IntSumReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);String inputfile = "/user/test/input";String outputFile = "/user/test/output/";FileInputFormat.addInputPath(job, new Path(inputfile));FileOutputFormat.setOutputPath(job, new Path(outputFile));job.waitForCompletion(true);/********** End **********/}
}

命令行
# Prepare two small local sample files and upload them to HDFS.

# Create file01. The first echo has no redirection, so it only prints to the terminal.
touch file01
echo Hello World Bye World
cat file01
# Now write the sample text into file01 and show its contents.
echo Hello World Bye World >file01
cat file01
# Same for the second sample file.
touch file02
echo Hello Hadoop Goodbye Hadoop >file02
cat file02
# Start HDFS before issuing any `hadoop fs` command.
start-dfs.sh
# Create the HDFS input directory, parent first.
hadoop fs -mkdir /usr
hadoop fs -mkdir /usr/input
# NOTE(review): no command shown here creates /usr/output, so this listing presumably fails — confirm.
hadoop fs -ls /usr/output
hadoop fs -ls /
hadoop fs -ls /usr
# Upload both sample files and list the directory to verify.
# NOTE(review): the Java jobs in this article use hard-coded /user/... paths, not /usr/input — confirm which path is intended.
hadoop fs -put file01 /usr/input
hadoop fs -put file02 /usr/input
hadoop fs -ls /usr/input

测评

2、第 2 关：文件内容合并去重

import java.io.IOException;import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;public class Merge {/*** @param args* 对A,B两个文件进行合并，并剔除其中重复的内容，得到一个新的输出文件C*///在这重载map函数，直接将输入中的value复制到输出数据的key上 注意在map方法中要抛出异常：throws IOException,InterruptedExceptionpublic static class Map  extends Mapper<Object, Text, Text, Text>{/********** Begin **********/public void map(Object key, Text value, Context content) throws IOException, InterruptedException {  Text text1 = new Text();Text text2 = new Text();StringTokenizer itr = new StringTokenizer(value.toString());while (itr.hasMoreTokens()) {text1.set(itr.nextToken());text2.set(itr.nextToken());content.write(text1, text2);}}  /********** End **********/} //在这重载reduce函数，直接将输入中的key复制到输出数据的key上  注意在reduce方法上要抛出异常：throws IOException,InterruptedExceptionpublic static class  Reduce extends Reducer<Text, Text, Text, Text> {/********** Begin **********/public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {Set<String> set = new TreeSet<String>();for(Text tex : values){set.add(tex.toString());}for(String tex : set){context.write(key, new Text(tex));}}  /********** End **********/}public static void main(String[] args) throws Exception{// TODO Auto-generated method stubConfiguration conf = new Configuration();conf.set("fs.default.name","hdfs://localhost:9000");Job job = Job.getInstance(conf,"Merge and duplicate removal");job.setJarByClass(Merge.class);job.setMapperClass(Map.class);job.setCombinerClass(Reduce.class);job.setReducerClass(Reduce.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);String inputPath = "/user/tmp/input/";  //在这里设置输入路径String outputPath = "/user/tmp/output/";  //在这里设置输出路径FileInputFormat.addInputPath(job, new Path(inputPath));FileOutputFormat.setOutputPath(job, new Path(outputPath));System.exit(job.waitForCompletion(true) ? 0 : 1);}}

测评

3、第 3 关：信息挖掘 - 挖掘父子关系

import java.io.IOException;
import java.util.*;import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;public class simple_data_mining {public static int time = 0;/*** @param args* 输入一个child-parent的表格* 输出一个体现grandchild-grandparent关系的表格*///Map将输入文件按照空格分割成child和parent，然后正序输出一次作为右表，反序输出一次作为左表，需要注意的是在输出的value中必须加上左右表区别标志public static class Map extends Mapper<Object, Text, Text, Text>{public void map(Object key, Text value, Context context) throws IOException,InterruptedException{/********** Begin **********/String line = value.toString();String[] childAndParent = line.split(" ");List<String> list = new ArrayList<>(2);for (String childOrParent : childAndParent) {if (!"".equals(childOrParent)) {list.add(childOrParent);} } if (!"child".equals(list.get(0))) {String childName = list.get(0);String parentName = list.get(1);String relationType = "1";context.write(new Text(parentName), new Text(relationType + "+"+ childName + "+" + parentName));relationType = "2";context.write(new Text(childName), new Text(relationType + "+"+ childName + "+" + parentName));}/********** End **********/}}public static class Reduce extends Reducer<Text, Text, Text, Text>{public void reduce(Text key, Iterable<Text> values,Context context) throws IOException,InterruptedException{/********** Begin **********///输出表头if (time == 0) {context.write(new Text("grand_child"), new Text("grand_parent"));time++;}//获取value-list中value的child
List<String> grandChild = new ArrayList<>();//获取value-list中value的parentList<String> grandParent = new ArrayList<>();//左表，取出child放入grand_childfor (Text text : values) {String s = text.toString();String[] relation = s.split("\\+");String relationType = relation[0];String childName = relation[1];String parentName = relation[2];if ("1".equals(relationType)) {grandChild.add(childName);} else {grandParent.add(parentName);}}//右表，取出parent放入grand_parentint grandParentNum = grandParent.size();int grandChildNum = grandChild.size();if (grandParentNum != 0 && grandChildNum != 0) {for (int m = 0; m < grandChildNum; m++) {for (int n = 0; n < grandParentNum; n++) {//输出结果context.write(new Text(grandChild.get(m)), new Text(grandParent.get(n)));}}}/********** End **********/}}public static void main(String[] args) throws Exception{// TODO Auto-generated method stubConfiguration conf = new Configuration();Job job = Job.getInstance(conf,"Single table join");job.setJarByClass(simple_data_mining.class);job.setMapperClass(Map.class);job.setReducerClass(Reduce.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);String inputPath = "/user/reduce/input";   //设置输入路径String outputPath = "/user/reduce/output";   //设置输出路径FileInputFormat.addInputPath(job, new Path(inputPath));FileOutputFormat.setOutputPath(job, new Path(outputPath));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}

测评

Ending！
更多课程知识学习记录随后再来吧！

就酱，嘎啦！

注：
人生在勤，不索何获。

大数据从入门到实战 - 第3章 MapReduce基础实战相关推荐

视频教程-赵强老师：大数据从入门到精通（1）Linux基础-Linux
赵强老师:大数据从入门到精通(1)Linux基础毕业于清华大学,拥有超过13年的工作经验. Oracle认证讲师,拥有6年以上授课经验.精通Oracle数据库.中间(Weblogic)和大数据Had ...
jdbc代码_凯哥带你从零学大数据系列之数据库篇---第三章:JDBC基础
温馨提示:如果想学扎实,一定要从头开始看凯哥的一系列文章(凯哥带你从零学大数据系列),千万不要从中间的某个部分开始看,知识前后是有很大关联,否则学习效果会打折扣. 系列文章第一篇是拥抱大数据:凯哥带你 ...
视频教程-赵强老师：大数据从入门到精通（6）MapReduce-Hadoop
赵强老师:大数据从入门到精通(6)MapReduce 毕业于清华大学,拥有超过13年的工作经验. Oracle认证讲师,拥有6年以上授课经验.精通Oracle数据库.中间(Weblogic)和大数据H ...
大数据从入门到实战 - 第2章分布式文件系统HDFS
大数据从入门到实战 - 第2章分布式文件系统HDFS 一.关于此次实践 1.实战简介 2.全部任务二.实践详解 1.第1关: HDFS 的基本操作 2.第2关:HDFS-JAVA接口之读取文件 3 ...
大数据与Hadoop有什么关系？大数据Hadoop入门简介
学习着数据科学与大数据技术专业(简称大数据)的我们,对于"大数据"这个词是再熟悉不过了,而每当我们越去了解大数据就越发现有个词也会一直被提及那就是--Hadoop 那Hadoop与 ...
大数据必看经典书籍：大数据从入门到深入书籍推荐
大数据领域,尤其是涉及到技术开发方向,是有着很庞杂的技术知识体系的,通过看书来打好理论基础是很多同学初学大数据的想法.下面加米谷大数据老师给大家推荐几本大数据从入门到深入必看的经典书籍,希望能对大家有 ...
视频教程-赵强老师：大数据从入门到精通（7）HBase-Hbase
赵强老师:大数据从入门到精通(7)HBase 毕业于清华大学,拥有超过13年的工作经验. Oracle认证讲师,拥有6年以上授课经验.精通Oracle数据库.中间(Weblogic)和大数据Hadoo ...
视频教程-赵强老师：大数据从入门到精通（15）Storm-大数据
赵强老师:大数据从入门到精通(15)Storm 毕业于清华大学,拥有超过13年的工作经验. Oracle认证讲师,拥有6年以上授课经验.精通Oracle数据库.中间(Weblogic)和大数据Hado ...
视频教程-赵强老师：大数据从入门到精通（23）配置Hive On Spark-Spark
赵强老师:大数据从入门到精通(23)配置Hive On Spark 毕业于清华大学,拥有超过13年的工作经验. Oracle认证讲师,拥有6年以上授课经验.精通Oracle数据库.中间(Weblogi ...

大数据从入门到实战 - 第3章 MapReduce基础实战