哈夫曼编码解压缩文件

文章目录

前言
一、文件压缩
二、文件解压
结语

前言

不了解哈夫曼树的可以移步查看我的另一篇博客：哈夫曼树(最优二叉树)

使用哈夫曼编码压缩文件，其实就是先得到每个字符的哈夫曼编码，再将编码每8位转为一个字节存到文件中；解压缩的时候，再将字节转回二进制，重新找到哈夫曼编码对应的字符，这样即可完成文件的解压缩。

文件解压缩的方法：
①将每个字符对应的权值存入压缩文件，在解压时重新构建哈夫曼树，遍历哈夫曼树来获得对应的字符
②将每个字符对应的哈夫曼编码以及长度存入压缩文件，在解压时根据每个字符对应哈夫曼编码的长度，来截取每个字符对应的哈夫曼编码

本博客使用：方法②。
不使用方法①的原因：使用字节流传输时，如果某个字符对应的权值大于255，就会出现权值错误。这是由于Java的字节流在写入时会将int转为byte，只取int的低8位，而int为32位，超出低8位的数值就会丢失。
具体参考该博客
当然可以使用字符流来传输就可以解决这个问题。

一、文件压缩

大体步骤：

读取文件，统计每个字符出现的次数(权值)
根据权值，创建哈夫曼树
遍历哈夫曼树，得到每个字符的哈夫曼编码
再次读取文件，将每个字符对应的哈夫曼编码拼接，每8位编码转为一个字节写入压缩文件

注意：

文件中可能出现特殊字符，其byte值小于0，需要特殊处理（代码中已处理）
需要将码表（每个字符对应的长度、字符对应的哈夫曼编码）写入压缩文件，用于文件解压
每8位转一个字节，如果不够那么就需要补0，所以需要将最后8位补0的个数写入文件
使用缓存机制，减少io次数，提高效率

Compress类
Compress.java

package com.kiger.fileDecompression;import java.io.*;
import java.util.Arrays;
import java.util.Comparator;
import java.util.PriorityQueue;/*** @ClassName Compress* @Description 压缩文件类* @Author zk_kiger* @Date 2019/11/7 18:55* @Version 1.0*/public class Compress {static final int CHAR_INDEX = 256;static final int BUFFER_SIZE = 128;// 用来记录文件中字符出现的次数,下标对应字符的ASCII码private int[] times = new int[CHAR_INDEX];// 用来记录每个字符对应的huffman编码private String[] huffmanCodes = new String[CHAR_INDEX];// 优先队列用于创建huffman树,自动从小到大排序结点private PriorityQueue<Node> queue = new PriorityQueue<>(new Comparator<Node>() {@Overridepublic int compare(Node o1, Node o2) {return o1.getWeight() - o2.getWeight();}});public Compress() {for (int i = 0; i < huffmanCodes.length; i++) {huffmanCodes[i] = "";}}/*** 压缩文件* @param fromPath 被压缩文件路径* @param toPath   已压缩文件路径*/public void compress(String fromPath, String toPath) {compress_(fromPath, toPath);}private void compress_(String fromPath, String toPath) {// 1.读取文件并统计字符权值statCharWeight(fromPath);// 2.根据权值创建Huffman树Node root = createHuffman();// 3.根据前序遍历获得编码表getHuffmanCode(root, "");System.out.println("正在压缩文件...");// 4.根据编码表压缩文件compressFile(fromPath, toPath);System.out.println("文件压缩完成...");}// 根据编码表压缩文件byte value = 0;int index = 0;int writeBufferSize = 0;byte[] writeBuffer = new byte[BUFFER_SIZE];int lastIndex = 0;   // 最后一个字节补0的个数private void compressFile(String fromPath, String toPath) {File toFile = new File(toPath);try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fromPath));BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(toFile))) {// 将每个编码的长度写入文件StringBuilder code = new StringBuilder();for (int i = 0; i < CHAR_INDEX; i++) {bos.write(huffmanCodes[i].length());
//                if (huffmanCodes[i].length() != 0)
//                    System.out.println(i + " : " + huffmanCodes[i]);code.append(huffmanCodes[i]);}// 再将哈夫曼编码写入文件char[] charArray = code.toString().toCharArray();for (int i = 0; i < charArray.length; i++) {if (charArray[i] == '0')value = CLR_BYTE(value, index);if (charArray[i] == '1')value = SET_BYTE(value, index);index++;if (index >= 8) {index = 0;writeInBuffer(bos, value);}}if (index != 0) {writeInBuffer(bos, value);}// 写文件内容index = 0;value = 0;byte[] bytes = new byte[BUFFER_SIZE];int len;double length = 0;double fileTotalSize = (double)bis.available();while ((len = bis.read(bytes)) != -1) {length += len;double jd = (length/fileTotalSize)*100;System.out.printf("压缩进度：%.2f%%\n",jd);// 用于拼接字符编码StringBuilder sb = new StringBuilder();for (int i = 0; i < len; i++) {int temp = bytes[i];if (temp < 0) {sb.append(huffmanCodes[CHAR_INDEX + temp]);
//                        System.out.print((CHAR_INDEX + temp) + " ");} else {sb.append(huffmanCodes[temp]);
//                        System.out.print(temp + " ");}}
//                System.out.print(sb.toString());// 将拼接好的01字符,每8位转为一个字节存到缓存区char[] chars = sb.toString().toCharArray();for (int i = 0; i < chars.length; i++) {if (chars[i] == '0')value = CLR_BYTE(value, index);if (chars[i] == '1')value = SET_BYTE(value, index);index++;if (index >= 8) {writeInBuffer(bos, value);index = 0;}}}if (index != 0) {lastIndex = 8 - index;writeInBuffer(bos, value);writeInBuffer(bos, (byte) lastIndex);
//                System.out.println(lastIndex);} else {writeInBuffer(bos, (byte) lastIndex);}// 将缓存中的字节写入到文件中byte[] data = Arrays.copyOfRange(writeBuffer, 0, writeBufferSize);bos.write(data);} catch (IOException e) {e.printStackTrace();}}// 前序遍历获得哈夫曼编码表private void getHuffmanCode(Node root, String code) {if (root.getLeftChild() != null)getHuffmanCode(root.getLeftChild(), code + "0");if (root.getRightChild() != null)getHuffmanCode(root.getRightChild(), code + "1");if (root.getLeftChild() == null && root.getRightChild() == null) {//            System.out.println(root.getIndex() + " 的编码为：" + code);huffmanCodes[root.getIndex()] = code;}}// 创建Huffman树private Node createHuffman() {// 将字符结点存入到优先队列中for (int i = 0; i < times.length; i++) {if (times[i] != 0){//                System.out.println("i = " + i + " : " + "value = " + times[i]);queue.add(new Node(i, times[i]));}}// 根据优先队列构建哈夫曼树while (queue.size() > 1) {// 权值最小Node rightChild = queue.remove();// 权值仅次于rightChildNode leftChild = queue.remove();Node newNode = new Node(-1, rightChild.getWeight() + leftChild.getWeight());newNode.setLeftChild(leftChild);newNode.setRightChild(rightChild);queue.add(newNode);}// 返回根结点return queue.peek();}// 计算字符权值private void statCharWeight(String fromPath) {try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fromPath))) {byte[] bytes = new byte[BUFFER_SIZE];int len;while ((len = bis.read(bytes)) != -1) {// 用缓存中的字节统计出现权值for (int i = 0; i < len; i++) {int temp = bytes[i];if (temp < 0)times[CHAR_INDEX + temp]++;elsetimes[temp]++;}}} catch (IOException e) {e.printStackTrace();}}//指定位，置1private byte SET_BYTE(byte value, int index){return (value) |= (1 << ((index) ^ 7));}//指定位，置0private byte CLR_BYTE(byte value, int index){return (value) &= (~(1 << ((index) ^ 7)));}// 写入缓存,达到要求再写入文件private void writeInBuffer(BufferedOutputStream bos, byte value) throws IOException {if (writeBufferSize < BUFFER_SIZE) {//            System.out.print(value + " ");
//            System.out.println(Integer.toBinaryString((byte)value) + " ");writeBuffer[writeBufferSize] = value;if (++writeBufferSize >= BUFFER_SIZE) {bos.write(writeBuffer);writeBufferSize = 0;}}}}

二、文件解压

大体步骤：

读取文件记录每个字符对应的哈夫曼编码长度
根据每个字符哈夫曼编码长度截取每个字符的哈夫曼编码
读取文件内容哈夫曼编码找到对应的字符，并写入解压文件中

注意：

在读取字节转为二进制时，需要将字节先强转为整型（因为在字节中可能出现大于127的值，转为byte时，就会存入该值的补码，那么就需要先转为整型再转二进制，否则会出现错误的二进制转换）

int num = value&0xff;

压缩文件的最后一个字节存储的是最后一个数据字节末尾补0的个数，因此需要特殊处理文件的最后2个字节

Decompress类
Decompress.java

package com.kiger.fileDecompression;import java.io.*;
import java.util.*;

/**
 * Restores a file that was compressed with Huffman coding.
 *
 * <p>Expected layout of the compressed file, in order:
 * <ol>
 *   <li>256 bytes: the code length (in bits) of every byte value 0..255, 0 meaning
 *       "value does not occur";</li>
 *   <li>the codes of all occurring byte values, concatenated in ascending byte-value
 *       order, packed most-significant-bit first and padded to a whole byte;</li>
 *   <li>the encoded content, packed the same way and padded to a whole byte;</li>
 *   <li>one byte holding the number of padding bits (0..7) in the last content byte.</li>
 * </ol>
 *
 * @author zk_kiger
 * @version 1.0
 */
public class Decompress {

    /** Number of distinct byte values, i.e. the size of the code table. */
    static final int CHAR_INDEX = 256;

    /** Size in bytes of the chunks read from the compressed file. */
    static final int BUFFER_SIZE = 128;

    public Decompress() {
    }

    /**
     * Decompresses {@code fromPath} into {@code toPath}.
     *
     * @param fromPath path of the compressed file
     * @param toPath   path of the restored file to create (overwritten if present)
     * @throws UncheckedIOException if a file cannot be read or written, or if the compressed
     *                              file is truncated or otherwise malformed
     */
    public void deCompress(String fromPath, String toPath) {
        System.out.println("开始解压缩文件...");
        try (InputStream in = new BufferedInputStream(new FileInputStream(fromPath));
             OutputStream out = new BufferedOutputStream(new FileOutputStream(toPath))) {
            // 1. Rebuild the code table stored at the start of the file.
            CodeTrie trie = readCodeTable(in);
            // 2. Decode the remaining bytes back into the original content.
            decodePayload(in, out, trie);
        } catch (IOException e) {
            // Do not report success after a failed read, write or decode.
            throw new UncheckedIOException(
                    "Failed to decompress " + fromPath + " to " + toPath, e);
        }
        System.out.println("解压缩文件完成...");
    }

    /** Reads the 256 code lengths and the packed codes, and builds the decoding trie. */
    private static CodeTrie readCodeTable(InputStream in) throws IOException {
        int[] lengths = new int[CHAR_INDEX];
        int totalBits = 0;
        for (int value = 0; value < CHAR_INDEX; value++) {
            int length = in.read();
            if (length == -1) {
                throw new EOFException("Truncated code-length table at entry " + value);
            }
            lengths[value] = length;
            totalBits += length;
        }

        // The codes occupy totalBits bits, rounded up to whole bytes. A single read() may
        // return fewer bytes than requested, so keep reading until the table is complete.
        byte[] packed = new byte[(totalBits + 7) / 8];
        int filled = 0;
        while (filled < packed.length) {
            int read = in.read(packed, filled, packed.length - filled);
            if (read == -1) {
                throw new EOFException("Truncated Huffman code table");
            }
            filled += read;
        }

        // Every code bit creates at most one trie node, plus one node for the root.
        CodeTrie trie = new CodeTrie(totalBits + 1);
        int bitOffset = 0;
        for (int value = 0; value < CHAR_INDEX; value++) {
            if (lengths[value] != 0) {
                trie.add(packed, bitOffset, lengths[value], value);
                bitOffset += lengths[value];
            }
        }
        return trie;
    }

    /**
     * Decodes everything after the code table. The last byte of the stream is the padding
     * count and the byte before it is only partially used, so the two most recent bytes are
     * always held back until it is known whether more input follows. This works no matter
     * how the input happens to be split into chunks.
     */
    private static void decodePayload(InputStream in, OutputStream out, CodeTrie trie)
            throws IOException {
        // available() is only an estimate of the remaining size; it is used for progress output.
        double totalSize = in.available();
        double processed = 0;

        int older = -1;   // second most recent byte, -1 while not yet seen
        int newer = -1;   // most recent byte, -1 while not yet seen
        byte[] chunk = new byte[BUFFER_SIZE];
        int read;
        while ((read = in.read(chunk)) != -1) {
            processed += read;
            double percent = totalSize > 0 ? Math.min(100.0, (processed / totalSize) * 100) : 100.0;
            System.out.printf("解压进度：%.2f%%\n", percent);
            for (int i = 0; i < read; i++) {
                if (older != -1) {
                    // Two newer bytes exist, so this one is a complete content byte.
                    trie.feed(older, 8, out);
                }
                older = newer;
                newer = chunk[i] & 0xff;
            }
        }

        if (newer == -1) {
            throw new EOFException("Missing padding-count byte");
        }
        int padding = newer;
        if (padding > 7 || (older == -1 && padding != 0)) {
            throw new IOException("Corrupt padding count: " + padding);
        }
        if (older != -1) {
            // Last content byte: ignore the padding bits at its low end.
            trie.feed(older, 8 - padding, out);
        }
        if (!trie.atCodeBoundary()) {
            throw new IOException("Corrupt payload: data ends in the middle of a code");
        }
    }

    /**
     * Binary trie over the Huffman codes, stored in flat arrays. Node 0 is the root; it also
     * keeps the current decoding position so bits can be fed byte by byte.
     */
    private static final class CodeTrie {
        private static final int ABSENT = -1;

        private final int[] zeroChild;   // child reached by a 0 bit, or ABSENT
        private final int[] oneChild;    // child reached by a 1 bit, or ABSENT
        private final int[] symbol;      // decoded byte value for leaves, or ABSENT
        private int nodeCount = 1;       // the root always exists
        private int cursor = 0;          // node reached by the bits of the code in progress

        CodeTrie(int capacity) {
            zeroChild = new int[capacity];
            oneChild = new int[capacity];
            symbol = new int[capacity];
            Arrays.fill(zeroChild, ABSENT);
            Arrays.fill(oneChild, ABSENT);
            Arrays.fill(symbol, ABSENT);
        }

        /**
         * Registers the code made of {@code length} bits starting at bit {@code firstBit} of
         * {@code packed} (most significant bit first) as the code of {@code value}.
         */
        void add(byte[] packed, int firstBit, int length, int value) throws IOException {
            int node = 0;
            for (int pos = firstBit; pos < firstBit + length; pos++) {
                if (symbol[node] != ABSENT) {
                    throw new IOException("Corrupt code table: codes are not prefix-free");
                }
                int bit = (packed[pos >> 3] >> (7 - (pos & 7))) & 1;
                int[] branch = (bit == 0) ? zeroChild : oneChild;
                if (branch[node] == ABSENT) {
                    branch[node] = nodeCount++;
                }
                node = branch[node];
            }
            if (symbol[node] != ABSENT || zeroChild[node] != ABSENT || oneChild[node] != ABSENT) {
                throw new IOException("Corrupt code table: codes are not prefix-free");
            }
            symbol[node] = value;
        }

        /**
         * Consumes the {@code bitCount} most significant bits of {@code value} (0..255) and
         * writes every byte value whose code is completed to {@code out}.
         */
        void feed(int value, int bitCount, OutputStream out) throws IOException {
            for (int shift = 7; shift > 7 - bitCount; shift--) {
                int bit = (value >> shift) & 1;
                int next = (bit == 0 ? zeroChild : oneChild)[cursor];
                if (next == ABSENT) {
                    throw new IOException("Corrupt payload: bit sequence matches no Huffman code");
                }
                if (symbol[next] != ABSENT) {
                    out.write(symbol[next]);
                    cursor = 0;
                } else {
                    cursor = next;
                }
            }
        }

        /** Returns whether no code is currently half-read. */
        boolean atCodeBoundary() {
            return cursor == 0;
        }
    }
}

测试类
RunTest.java

package com.kiger.fileDecompression;

import java.io.IOException;

/**
 * Demo entry point: compresses a file and then decompresses the result.
 *
 * <p>Usage: {@code RunTest [sourcePath [compressedPath [decompressedPath]]]}. Any path that
 * is not given on the command line falls back to the author's original default location.
 *
 * @author zk_kiger
 * @version 1.0
 */
public class RunTest {

    private static final String DEFAULT_SOURCE_PATH =
            "D:\\javaProject\\DataStructureAndAlgorithms\\Tree\\src\\com\\kiger\\fileDecompression\\test.txt";
    private static final String DEFAULT_COMPRESS_PATH =
            "D:\\javaProject\\DataStructureAndAlgorithms\\Tree\\src\\com\\kiger\\fileDecompression\\test2.huffmanZip";
    private static final String DEFAULT_DECOMPRESS_PATH =
            "D:\\javaProject\\DataStructureAndAlgorithms\\Tree\\src\\com\\kiger\\fileDecompression\\test3.txt";

    /**
     * Runs one compress/decompress round trip.
     *
     * @param args optional paths: source file, compressed file, decompressed file
     * @throws IOException kept in the signature for compatibility with the original
     */
    public static void main(String[] args) throws IOException {
        String sourcePath = args.length > 0 ? args[0] : DEFAULT_SOURCE_PATH;
        String compressPath = args.length > 1 ? args[1] : DEFAULT_COMPRESS_PATH;
        String decompressPath = args.length > 2 ? args[2] : DEFAULT_DECOMPRESS_PATH;

        Compress compress = new Compress();
        compress.compress(sourcePath, compressPath);

        Decompress decompress = new Decompress();
        decompress.deCompress(compressPath, decompressPath);
    }
}

结语

由于哈夫曼编码压缩文件效率较低只能达到80%~90%之间，而且还要存入码表，所以效率不高。