test3_Huffman codes

Huffman编码的方法

（1）统计符号发生的概率。

（2）按照出现概率从小到大排序。

（3）每一次选出概率最小的两个符号作为二叉树的叶节点，并新建一个节点作为它们的父节点，其概率为两个子节点概率之和；这两个节点不再参与比较，改由新的父节点参与比较。
（4）重复（3）步骤，直到得到概率为1的根节点。

（5）二叉树的左节点为0，右节点为1，从上到下由根节点到叶节点得到每个叶节点的编码。

Huffman节点及Huffman码字节点的数据结构

[cpp] view plaincopy

// Node of the Huffman tree. A node is either a leaf that carries one source
// symbol, or an internal node that carries pointers to its two children; the
// two cases share storage through the anonymous union below, and isLeaf tells
// which member is valid.
typedef struct huffman_node_tag
{
unsigned char isLeaf; // 1 if this node is a leaf, 0 otherwise
unsigned long count; // number of occurrences in the source
struct huffman_node_tag *parent; // pointer to the parent node
union{
struct{ // internal node: the child reached by bit 0 and the child reached by bit 1
struct huffman_node_tag *zero, *one;
};
unsigned char symbol; // leaf node: the source symbol it represents
};
} huffman_node;
/* Code word assigned to a single symbol. */
typedef struct huffman_code_tag
{
    /* Length of the code word, in bits. */
    unsigned long numbits;
    /*
     * The code bits, packed eight per byte: bits 1..8 of the code word are
     * stored from the low-order to the high-order bit of bits[0], bits 9..16
     * in bits[1], and so on.
     */
    unsigned char *bits;
} huffman_code;

静态链接库

该程序文件包含两个工程（project）：其中“Huff_run”为主工程（Win32 Console Application），包含程序的主函数；“Huff_code”为库工程（Win32 Static Library）。

Huffman编码的流程

1.读入文件。

2.进行第一次扫描，统计文件中各个字符出现的频率。

3.建立huffman树。

4.将码表及其他必要信息写入输出文件。

5.第二次扫描，对源文件进行编码并输出。

Huff_code

Huffman.h

[csharp] view plaincopy

/*
* huffman_coder - Encode/Decode files using Huffman encoding.
* http://huffman.sourceforge.net
* Copyright (C) 2003 Douglas Ryan Richardson; Gauss Interprise, Inc
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef HUFFMAN_HUFFMAN_H
#define HUFFMAN_HUFFMAN_H
#include <stdio.h>
/*
 * Stream-based entry points: read from `in`, write the result to `out`.
 * huffman_encode_file additionally takes `out_Table`, a stream for the
 * statistics table.
 * NOTE(review): the definitions are not part of this header; the return-value
 * convention (presumably 0 on success, non-zero on failure) should be
 * confirmed against Huffman.c.
 */
int huffman_encode_file(FILE *in, FILE *out,FILE *out_Table );//step1:changed by yzhang for huffman statistics
int huffman_decode_file(FILE *in, FILE *out);
/*
 * Memory-based entry points: `bufin` / `bufinlen` describe the input buffer;
 * the output buffer and its length are handed back through the pointer
 * parameters.
 * NOTE(review): presumably the output buffer is allocated by the callee and
 * must be released by the caller -- confirm against the implementation.
 */
int huffman_encode_memory(const unsigned char *bufin,
unsigned int bufinlen,
unsigned char **pbufout,
unsigned int *pbufoutlen);
int huffman_decode_memory(const unsigned char *bufin,
unsigned int bufinlen,
unsigned char **bufout,
unsigned int *pbufoutlen);
#endif

Huffman.c

1.从源文件中读取数据（本实验以ASCII字符流为例），统计每个符号发生的概率，并建立相应的树叶节点。

[csharp] view plaincopy

/* Number of slots in the symbol tables; symbols are unsigned char values used as indices. */
#define MAX_SYMBOLS 256
/*
 * Table of tree-node pointers. get_symbol_frequencies indexes it by symbol
 * value and leaves the slot of an unseen symbol NULL; calculate_huffman_codes
 * later sorts the table in place, after which it is no longer indexed by symbol.
 */
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];

[csharp] view plaincopy

/*
 * First pass over the input: count how often each byte value occurs.
 *
 * pSF - frequency table to fill; it is reset through init_frequencies()
 *       first (expected to clear every slot to NULL), then a leaf node is
 *       created the first time a symbol is met
 * in  - input stream, read until EOF
 *
 * Returns the total number of symbols read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int symbols_read = 0;
    int ch;

    init_frequencies(pSF);

    for (ch = fgetc(in); ch != EOF; ch = fgetc(in)) {
        unsigned char sym = ch;
        huffman_node **slot = &(*pSF)[sym];

        /* First occurrence of this symbol: give it a leaf node. */
        if (*slot == NULL)
            *slot = new_leaf_node(sym);

        (*slot)->count += 1;
        symbols_read += 1;
    }

    return symbols_read;
}

new_leaf_node()

[csharp] view plaincopy

/*
 * Allocate a leaf node for `symbol`.
 *
 * The node starts with a count of 0 and no parent.
 *
 * Returns the new node, or NULL when the allocation fails (the original
 * dereferenced the unchecked malloc result).
 */
static huffman_node*
new_leaf_node(unsigned char symbol)
{
    huffman_node *p = malloc(sizeof *p);

    if (p == NULL)
        return NULL;

    p->isLeaf = 1;      /* 1 marks a leaf */
    p->symbol = symbol; /* the symbol this leaf stands for */
    p->count = 0;       /* no occurrences counted yet */
    p->parent = NULL;   /* not linked into a tree yet */
    return p;
}

2. 构建霍夫曼树及生成霍夫曼码。

[csharp] view plaincopy

/*
 * Build the Huffman tree from the frequency table and derive the code word
 * of every symbol that occurs.
 *
 * pSF - frequency table; it is sorted and rewritten in place, and on return
 *       slot 0 holds the root of the tree (or NULL when no symbol occurred)
 *
 * Returns a newly allocated SymbolEncoder, or NULL when it cannot be
 * allocated. For an empty table the encoder is returned with every slot NULL.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

#if 0
    printf("BEFORE SORT\n");
    print_freqs(pSF);
#endif

    /* Sort ascending by count; SFComp moves the NULL (unseen) slots to the end. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 0
    printf("AFTER SORT\n");
    print_freqs(pSF);
#endif

    /* Count the distinct symbols: the non-NULL slots are now at the front. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Construct a Huffman tree. This code is based
     * on the algorithm given in Managing Gigabytes
     * by Ian Witten et al, 2nd edition, page 34.
     * Note that this implementation uses a simple
     * count instead of probability.
     *
     * n symbols need n - 1 merges. The bound is written as i + 1 < n because
     * n is unsigned: with n == 0 the expression n - 1 would wrap around and
     * the loop would dereference the NULL slots.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        /* m1 and m2 are the two subtrees with the smallest counts. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /*
         * Replace m1 and m2 by one internal node whose count is the sum of
         * theirs, make it the parent of both, and store it in slot 0.
         */
        (*pSF)[0] = m1->parent = m2->parent =
            new_nonleaf_node(m1->count + m2->count, m1, m2);
        (*pSF)[1] = NULL;

        /* The merge changed the counts, so restore the ascending order. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    if(pSE == NULL)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}

其中qsort函数使用到的比较函数SFComp代码如下：

[csharp] view plaincopy

/*
 * qsort comparison function for an array of huffman_node pointers.
 *
 * Orders the nodes by ascending count and places NULL entries after every
 * non-NULL entry.
 *
 * p1, p2 - pointers to the array elements (each element is a huffman_node*)
 *
 * Returns a negative value, zero or a positive value when *p1 sorts before,
 * equal to, or after *p2.
 */
static int
SFComp(const void *p1, const void *p2)
{
    const huffman_node *hn1 = *(const huffman_node *const *)p1;
    const huffman_node *hn2 = *(const huffman_node *const *)p2;

    /* Sort all NULLs to the end. */
    if(hn1 == NULL && hn2 == NULL)
        return 0;   /* two empty slots compare equal */
    if(hn1 == NULL)
        return 1;   /* positive: hn1 goes after hn2 */
    if(hn2 == NULL)
        return -1;  /* negative: hn2 goes after hn1 */

    /* Ascending order of count. */
    if(hn1->count > hn2->count)
        return 1;
    else if(hn1->count < hn2->count)
        return -1;

    return 0;
}

递归遍历Huffman树，对存在的每个字符计算码字

[csharp] view plaincopy

/*
 * Walk the tree rooted at `subtree` and store a code word for every leaf.
 *
 * subtree - root of the (sub)tree to visit; NULL is accepted and ignored
 * pSF     - encoder table; the slot indexed by a leaf's symbol receives the
 *           result of new_code() for that leaf
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if (!subtree)
        return;

    if (!subtree->isLeaf) {
        /* Internal node: visit the zero branch first, then the one branch. */
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
        return;
    }

    /* Leaf: its path to the root determines the code word. */
    (*pSF)[subtree->symbol] = new_code(subtree);
}

对每个树叶节点进行编码：

[csharp] view plaincopy

/*
 * Build the code word of a leaf.
 *
 * The tree is walked upwards from the leaf to the root, which yields the
 * code bits in reverse order; they are reversed at the end so that the
 * stored code reads from the root down to the leaf.
 *
 * leaf - the leaf to encode; a NULL leaf or a leaf without a parent yields
 *        a code word of length 0 with bits == NULL
 *
 * Returns a newly allocated huffman_code owning its bits buffer, or NULL
 * when memory cannot be allocated.
 */
static huffman_code*
new_code(const huffman_node* leaf)
{
    unsigned long numbits = 0;   /* code length so far, in bits */
    unsigned char* bits = NULL;  /* packed code bits */
    huffman_code *p;

    /* Stop at the root: it has no parent and contributes no bit. */
    while(leaf && leaf->parent)
    {
        const huffman_node *parent = leaf->parent;
        unsigned char cur_bit = (unsigned char)(numbits % 8); /* bit position inside the byte */
        unsigned long cur_byte = numbits / 8;                 /* index of the byte being filled */

        /* A new byte starts here: grow the buffer by one byte. */
        if(cur_bit == 0)
        {
            size_t newSize = cur_byte + 1;
            /*
             * realloc leaves the old block untouched when it fails, so the
             * result goes into a temporary to avoid losing (leaking) it.
             */
            unsigned char *grown = (unsigned char*)realloc(bits, newSize);
            if(grown == NULL)
            {
                free(bits);
                return NULL;
            }
            bits = grown;
            bits[newSize - 1] = 0; /* Initialize the new byte. */
        }

        /* A zero child needs no action (the byte starts at 0); a one child sets the bit. */
        if(leaf == parent->one)
            bits[cur_byte] |= (unsigned char)(1u << cur_bit);

        ++numbits;
        leaf = parent; /* the next bit comes from one level up */
    }

    /* The bits were collected leaf-to-root; store them root-to-leaf. */
    if(bits)
        reverse_bits(bits, numbits);

    p = (huffman_code*)malloc(sizeof(huffman_code));
    if(p == NULL)
    {
        free(bits);
        return NULL;
    }
    p->numbits = numbits;
    p->bits = bits;
    return p;
}

码字逆序：

[csharp] view plaincopy

/*
 * Reverse the order of the first `numbits` bits of a packed bit string.
 *
 * Bit k lives in bits[k / 8] at bit position k % 8 (low-order bit first).
 * After the call, bit k holds what bit numbits - 1 - k held before, and the
 * unused high-order bits of the last byte are zero.
 *
 * The reversal is done in place by swapping bits from both ends, so no
 * temporary buffer (previously obtained with the non-standard alloca) is
 * needed.
 *
 * bits    - the packed bits; at least (numbits + 7) / 8 bytes
 * numbits - number of valid bits; 0 leaves the buffer untouched
 */
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{
    unsigned long lo, hi;

    if(bits == NULL || numbits == 0)
        return;

    for(lo = 0, hi = numbits - 1; lo < hi; ++lo, --hi)
    {
        unsigned char lo_mask = (unsigned char)(1u << (lo % 8));
        unsigned char hi_mask = (unsigned char)(1u << (hi % 8));
        int lo_set = (bits[lo / 8] & lo_mask) != 0;
        int hi_set = (bits[hi / 8] & hi_mask) != 0;

        /* Swapping two single bits only matters when they differ: flip both. */
        if(lo_set != hi_set)
        {
            bits[lo / 8] ^= lo_mask;
            bits[hi / 8] ^= hi_mask;
        }
    }

    /* Clear the padding bits above the last valid bit of the final byte. */
    if(numbits % 8 != 0)
        bits[(numbits - 1) / 8] &= (unsigned char)((1u << (numbits % 8)) - 1u);
}

将码表写入文件

[csharp] view plaincopy

/*
 * Write the header of an encoded file: the code table plus the number of
 * encoded symbols.
 *
 * Layout written to `out`:
 *   - number of table entries, in network byte order
 *   - number of symbols that will be encoded, in network byte order
 *   - for every symbol that has a code: 1 byte symbol, 1 byte code length
 *     in bits, then the packed code bits
 *
 * out          - output stream
 * se           - code table indexed by symbol value; NULL slots are skipped
 * symbol_count - number of symbols in the source
 *
 * Returns 0 on success, 1 when a write fails.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int net_count;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /*
     * Write the number of entries in network byte order.
     * The value is written through an unsigned int, like symbol_count below,
     * rather than an unsigned long: where unsigned long is 8 bytes, writing
     * sizeof(unsigned long) bytes would emit 4 extra bytes into the header.
     */
    net_count = htonl(count);
    if(fwrite(&net_count, sizeof(net_count), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            /* One byte: the symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;
            /* One byte: the code length in bits. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;
            /* numbytes bytes: the packed code bits. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }

    return 0;
}

第二次扫描对文件进行Huffman编码

[csharp] view plaincopy

/*
 * Second pass over the input: replace every byte by its code word and pack
 * the code bits into output bytes, low-order bit first.
 *
 * in  - input stream, read until EOF
 * out - output stream receiving the packed code bits
 * se  - code table indexed by symbol value
 *
 * Returns 0 on success and 1 on failure (a symbol without a code word, or a
 * failed write), following the convention of write_code_table.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0; /* output byte being assembled */
    unsigned char curbit = 0;  /* number of bits already placed in curbyte */
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc]; /* table lookup */
        unsigned long i;

        /* The table has no entry for this symbol: it cannot be encoded. */
        if(code == NULL)
            return 1;

        /* Append the code word bit by bit. */
        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= (unsigned char)(get_bit(code->bits, i) << curbit);

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * The last code word may not end on a byte boundary; write out the
     * partially filled byte so its bits are not lost.
     */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;

    return 0;
}

输出统计结果

[csharp] view plaincopy

/*
 * Convert the per-symbol counts into relative frequencies.
 *
 * SF          - frequency table indexed by symbol value; NULL slots mean the
 *               symbol did not occur
 * st          - statistics record; st->freq[i] receives count / total_count
 *               for symbol i, or 0 for a symbol that did not occur
 * total_count - total number of symbols in the source
 *
 * Returns 1 when the individual counts add up to total_count, 0 otherwise.
 */
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)
{
    int sym;
    int seen = 0;

    for (sym = 0; sym < MAX_SYMBOLS; sym++) {
        huffman_node *node = (*SF)[sym];

        if (node == NULL) {
            st->freq[sym] = 0;
            continue;
        }

        st->freq[sym] = (float)node->count / total_count;
        seen += node->count;
    }

    /* Consistency check: every counted symbol must be accounted for. */
    return seen == total_count;
}
/*
 * Copy every code word from the encoder table into the statistics record.
 *
 * se - code table indexed by symbol value; NULL slots mean "no code"
 * st - statistics record; st->numbits[i] receives the code length of symbol
 *      i (0 when it has no code) and st->bits[i] the packed code bits
 *
 * Always returns 0.
 */
int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)
{
    unsigned long sym;

    for (sym = 0; sym < MAX_SYMBOLS; ++sym) {
        huffman_code *code = (*se)[sym];
        unsigned long byte_idx;
        unsigned int nbytes;

        if (!code) {
            st->numbits[sym] = 0;
            continue;
        }

        st->numbits[sym] = code->numbits;

        /* Copy the bytes that hold the code bits. */
        nbytes = numbytes_from_numbits(code->numbits);
        for (byte_idx = 0; byte_idx < nbytes; byte_idx++)
            st->bits[sym][byte_idx] = code->bits[byte_idx];
    }

    return 0;
}
/*
 * Write the statistics table as text: one header line, then one line per
 * symbol value with its frequency, its code length and its code bits.
 *
 * st        - statistics record to print
 * out_Table - output stream for the table
 *
 * The code length is converted to unsigned long explicitly and printed with
 * %lu, so the format matches the argument whatever integer type
 * st->numbits[i] has (it is filled from an unsigned long code length in
 * huffST_getcodeword). The printed text is unchanged.
 */
void output_huffman_statistics(huffman_stat *st,FILE *out_Table)
{
    int i;
    unsigned long j;
    unsigned char c;

    fprintf(out_Table,"symbol\t freq\t codelength\t code\n");
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        unsigned long numbits = (unsigned long)st->numbits[i];

        fprintf(out_Table,"%d\t ",i);
        fprintf(out_Table,"%f\t ",st->freq[i]);
        fprintf(out_Table,"%lu\t ",numbits);

        /* Print the code bit by bit; nothing is printed for a length of 0. */
        for(j = 0; j < numbits; ++j)
        {
            c = get_bit(st->bits[i], j);
            fprintf(out_Table,"%d",c);
        }
        fprintf(out_Table,"\n");
    }
}

各样本文件的概率分布图

实验结果

根据香农第一定理（无失真信源编码定理），对于二进制编码的信源符号，平均码长的下界为信源熵。当信源符号接近等概分布时，信源熵最大，平均码长也就没有可降低的空间了。因此，文件的概率分布越不均匀，通过霍夫曼编码得到的编码效率越高。

test3_Huffman codes相关推荐

RGB Color Codes Chart
RGB Color Codes Chart RGB颜色空间 RGB颜色空间或RGB颜色系统,从红色.绿色和蓝色的组合中构造所有颜色. 红色.绿色和蓝色各使用8位,它们的整数值从0到255.这使得256 ...
Brute Force STL --- UVA 146 ID Codes
ID Codes Problem's Link:http://uva.onlinejudge.org/index.php?option=com_onlinejudge&Itemid=8&a ...
WiFi Deauthenticated Reason Codes
Code Reason Explanation 0 Reserved Normal working operation 1 Unspecific Reason We don't know what's ...
基于deep learning的快速图像检索（Deep Learning of Binary Hash Codes for Fast Image Retrieval）
基于deep learning的快速图像检索(Deep Learning of Binary Hash Codes for Fast Image Retrieval) 2016-07-25 14 ...
cvpr2019/cvpr2018/cvpr2017（Papers/Codes/Project/Paper reading）
cvpr2019/cvpr2018/cvpr2017(Papers/Codes/Project/Paper reading) Source:https://mp.weixin.qq.com/s/SmS ...
optee的error codes
文章目录 1.TEE internal 2.TEE client 1.TEE internal (lib/libutee/include/tee_api_defines.h)* API Error C ...
F110报错：Company codes **/** do not appear in proposal ***; correct
F110自动支付的时候报错: Company codes CS19/CS19 do not appear in proposal 29.09.2015 0929A; correct Message n ...
Complete Guide to Parameter Tuning in XGBoost (with codes in Python)
Introduction If things don't go your way in predictive modeling, use XGboost. XGBoost algorithm has ...
2. Get the codes from GIT
Clone the code from git. Click the "GitEx Clone". Paste the url into the "Repository ...

test3_Huffman codes

Huffman编码的方法

test3_Huffman codes相关推荐

最新文章

热门文章