hadoop MapReduce 输出结果中文乱码解决

Hadoop涉及文本输出时，默认统一采用不带BOM的UTF-8编码；而中文Windows系统的默认编码是GBK。因此，某些格式的文件（例如CSV）若以不带BOM的UTF-8编码输出，用Excel打开时会显示为乱码，只能用UltraEdit（UE）或记事本打开才能正常显示。所以，将Hadoop的默认输出编码改为GBK是一个非常常见的需求。
自定义 TextOutputFormat.class

package com.ljt.hdfs;import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
/*** * <p>Title:hadoop MapReduce 输出结果中文乱码解决</p>* <p> 功能描述:: </p>* <p>Company: adteach </p> * @author  刘建涛 * * @date    2017年7月19日下午4:37:41* @version 1.0*/@InterfaceAudience.Public
@InterfaceStability.Stable
public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";protected static class LineRecordWriter<K, V>extends RecordWriter<K, V> {private static final String utf8 = "UTF-8";  // 将UTF-8转换成GBK private static final byte[] newline;static {try {newline = "\n".getBytes(utf8);} catch (UnsupportedEncodingException uee) {throw new IllegalArgumentException("can't find " + utf8 + " encoding");}}protected DataOutputStream out;private final byte[] keyValueSeparator;public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {this.out = out;try {this.keyValueSeparator = keyValueSeparator.getBytes(utf8);} catch (UnsupportedEncodingException uee) {throw new IllegalArgumentException("can't find " + utf8 + " encoding");}}public LineRecordWriter(DataOutputStream out) {this(out, "\t");}/*** Write the object to the byte stream, handling Text as a special* case.* @param o the object to print* @throws IOException if the write throws, we pass it on*/private void writeObject(Object o) throws IOException {if (o instanceof Text) {Text to = (Text) o;   // 将此行代码注释掉out.write(to.getBytes(), 0, to.getLength());  // 将此行代码注释掉} else { // 将此行代码注释掉      out.write(o.toString().getBytes(utf8));}}public synchronized void write(K key, V value)throws IOException {boolean nullKey = key == null || key instanceof NullWritable;boolean nullValue = value == null || value instanceof NullWritable;if (nullKey && nullValue) {return;}if (!nullKey) {writeObject(key);}if (!(nullKey || nullValue)) {out.write(keyValueSeparator);}if (!nullValue) {writeObject(value);}out.write(newline);}public synchronized void close(TaskAttemptContext context) throws IOException {out.close();}}public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {Configuration conf = job.getConfiguration();boolean isCompressed = getCompressOutput(job);String keyValueSeparator= conf.get(SEPERATOR, 
"\t");CompressionCodec codec = null;String extension = "";if (isCompressed) {Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);extension = codec.getDefaultExtension();}Path file = getDefaultWorkFile(job, extension);FileSystem fs = file.getFileSystem(conf);if (!isCompressed) {FSDataOutputStream fileOut = fs.create(file, false);return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);} else {FSDataOutputStream fileOut = fs.create(file, false);return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),keyValueSeparator);}}
}

  默认情况下，MR主程序中设置输出格式（输出编码也随之确定）的语句为：

// Select the text output format for the job; its LineRecordWriter encodes records as UTF-8.
// NOTE(review): `job` is presumably the org.apache.hadoop.mapreduce.Job being configured — confirm.
job.setOutputFormatClass(TextOutputFormat.class);

从上述代码（原文第48行，即LineRecordWriter中被写死为"UTF-8"的编码常量）可以看出，Hadoop已经将此输出格式的编码统一限定为UTF-8。因此，要改变Hadoop输出文本的编码，只需定义一个与TextOutputFormat内容相同的类GbkOutputFormat，同样继承FileOutputFormat（注意是 org.apache.hadoop.mapreduce.lib.output.FileOutputFormat），并把其中的编码改为GBK即可，代码如下：

package com.ljt.hdfs;import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
/*** * <p>* Title: GbkOutputFormat* </p>* <p>* 功能描述::* hadoop已经限定此输出格式统一为UTF-8，因此为了改变hadoop的输出代码的文本编码只需定义一个和TextOutputFormat相同的类GbkOutputFormat同样继承FileOutputFormat* （注意是 org.apache.hadoop.mapreduce.lib.output.FileOutputFormat）* </p>* <p>* Company: adteach* </p>* * @author 刘建涛 ** @date 2017年7月19日下午4:42:05* @version 1.0*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public class GbkOutputFormat<K, V> extends FileOutputFormat<K, V> {public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";protected static class LineRecordWriter<K, V>extends RecordWriter<K, V> {private static final String utf8 = "GBK";private static final byte[] newline;static {try {newline = "\n".getBytes(utf8);} catch (UnsupportedEncodingException uee) {throw new IllegalArgumentException("can't find " + utf8 + " encoding");}}protected DataOutputStream out;private final byte[] keyValueSeparator;public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {this.out = out;try {this.keyValueSeparator = keyValueSeparator.getBytes(utf8);} catch (UnsupportedEncodingException uee) {throw new IllegalArgumentException("can't find " + utf8 + " encoding");}}public LineRecordWriter(DataOutputStream out) {this(out, "\t");}/*** Write the object to the byte stream, handling Text as a special* case.* @param o the object to print* @throws IOException if the write throws, we pass it on*/private void writeObject(Object o) throws IOException {if (o instanceof Text) {
//        Text to = (Text) o;
//        out.write(to.getBytes(), 0, to.getLength());
//      } else {out.write(o.toString().getBytes(utf8));}}public synchronized void write(K key, V value)throws IOException {boolean nullKey = key == null || key instanceof NullWritable;boolean nullValue = value == null || value instanceof NullWritable;if (nullKey && nullValue) {return;}if (!nullKey) {writeObject(key);}if (!(nullKey || nullValue)) {out.write(keyValueSeparator);}if (!nullValue) {writeObject(value);}out.write(newline);}public synchronized void close(TaskAttemptContext context) throws IOException {out.close();}}public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {Configuration conf = job.getConfiguration();boolean isCompressed = getCompressOutput(job);String keyValueSeparator= conf.get(SEPERATOR, "\t");CompressionCodec codec = null;String extension = "";if (isCompressed) {Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);extension = codec.getDefaultExtension();}Path file = getDefaultWorkFile(job, extension);FileSystem fs = file.getFileSystem(conf);if (!isCompressed) {FSDataOutputStream fileOut = fs.create(file, false);return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);} else {FSDataOutputStream fileOut = fs.create(file, false);return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),keyValueSeparator);}}
}

最后将输出编码类型设置成GbkOutputFormat.class，如：
// Switch the job to the GBK output format so its text output is encoded as GBK.
// NOTE(review): `job` is presumably the org.apache.hadoop.mapreduce.Job being configured — confirm.
job.setOutputFormatClass(GbkOutputFormat.class);

hadoop MapReduce 输出结果中文乱码解决相关推荐

成功解决连接SQL输出出现中文乱码问题(10001, 'oracle¿ìËÙÈëÃÅ', 'Íõº£ÁÁ', 'Ë®Àû³ö°æÉç',
成功解决连接SQL输出出现中文乱码问题(10001, 'oracle¿ìËÙÈëÃÅ', 'Íõº£ÁÁ', 'Ë®Àû³ö°æÉç', 目录解决问题解决思路解决方法解决问题解决连接SQL出 ...
python2中文输出代码_解决vscode python print 输出窗口中文乱码的问题
解决vscode python print 输出窗口中文乱码的问题发布时间:2020-09-17 23:53:25 来源:脚本之家阅读:119 一.搭建 python 环境在 VSC 中点击 F ...
QT5 界面截图保存到本地+输出PDF/WORD格式文档+QT界面中文乱码及输出PDF中文乱码的解决（亲身实践并且成功）
最近做了一个和QT5有关的项目,遇到很多问题也学习到不少,特意写下来希望帮到更多的人.(我的版本VS2017+QT5.12.0) 一.QT5截图并保存到本地在头文件添加必须项 #include &l ...
Spark读取日志文件集中文乱码解决方法
Spark读取日志中文乱码解决方法问题展示 �� 一般来说,这个问题多出现于GBK与UTF-8编码互相转换时.众所周知,GBK编码是windows系统的编码格式,而UTF-8是linux系统的编 ...
vs code中文乱码解决方法
修改 1.(安装方法) 2.显示终端输入数据输出结果(完美解决) 3.修改部分:中文乱码解决方法第一步: 第二步: 1.(安装方法) 转载于: https:blog.csdn.net/qq_4304 ...
在一个JS文件中包含中文字符串，通过innerHTML输出后中文乱码？
在一个JS文件中包含中文字符串,通过innerHTML输出后中文乱码? Posted on 2008-07-13 12:00 尹合磊阅读(1902) 评论(0) 编辑收藏所属分类: ASP.N ...
Java中文jsp页面_java中文乱码解决之道（七）—–JSP页面编码过程
我们知道JSP页面是需要转换为servlet的,在转换过程中肯定是要进行编码的.在JSP转换为servlet过程中下面一段代码起到至关重要的作用. 在上面代码中有两个地方存在编码:pageEncodi ...
java 页面编码_java中文乱码解决之道（七）-----JSP页面编码过程
我们知道JSP页面是需要转换为servlet的,在转换过程中肯定是要进行编码的.在JSP转换为servlet过程中下面一段代码起到至关重要的作用. 在上面代码中有两个地方存在编码:pageEncodi ...
eclipse java web乱码,eclipse中文乱码解决
本文收集整理关于eclipse中文乱码解决的相关议题,使用内容导航快速到达. 内容导航: Q1:eclipse中java中文控制台输出的这种乱码怎么解决 eclipse中java中文控制台输出的这种乱 ...
X64dbg 2021最新版中文乱码解决
X64dbg中文乱码解决 X64dbg可以对64位的软件进行反编译,是针对Olldbg只能调试32位软件的改进,使用也比较方便.但由于该软件前端使用QT开发,对中文的解析经常会出现乱码,不能很好解析出 ...

hadoop MapReduce 输出结果中文乱码解决

hadoop MapReduce 输出结果中文乱码解决相关推荐

最新文章

热门文章