Python实现英文文本的霍夫曼编码压缩

霍夫曼编码是一种变长码：在已知各字符出现频率的前提下，将频率高的字符用短码表示，频率低的字符用长码表示，从而用尽可能短的码符号序列完整地表示出一段文本的信息。
例如
对于
这样一文本，进行字符频率统计，部分结果为

这意味着将用最短的码符号表示“ ”(空格)这一符号，用较长的码符号表示"b"这一符号。

具体步骤如下

一、频数统计
对于一段英文文本，首先要统计各符号出现的频数

from collections import Counter

# Path of the text file whose symbols are counted. A raw string is required:
# in an ordinary literal the "\U" of "C:\Users" starts a unicode escape and
# the file does not even compile.
Path = r'C:\Users\88466\Desktop\Gone with the wind.txt'
with open(Path, 'r') as f:
    data = f.read()

# Count every symbol in a single pass over the text.
frequency = Counter(data)
a = list(frequency)           # distinct symbols, in order of first appearance
n = list(frequency.values())  # n[i] is the number of occurrences of a[i]

# Rank the symbols from most to least frequent; symbols with equal counts keep
# their first-appearance order. This replaces a hand-written selection sort
# (described as a bubble sort in the old comment) that popped every element
# out of a and n, leaving both lists empty for the code that runs later.
ranked = frequency.most_common()
a_s = [symbol for symbol, _ in ranked]  # symbols, descending frequency
n_s = [count for _, count in ranked]    # n_s[i] is the count of a_s[i]

二、霍夫曼编码
统计完字符频数后，我们获得了存储符号列表和对应频数的列表，利用这两部分进行霍夫曼编码。

1. 霍夫曼编码中，首先对频数最低和次低的符号进行编码，其码符号先分别记为0、1。例如有符号a=['A','B','C']，对应频数n=[1,2,4]，则'A'的码符号保存为0，'B'的码符号保存为1，即码符号h=['0','1','']。
2. 将刚完成编码的两个符号看做一个整体，其频数为整体内各部分频数之和，并据此更新符号与频数。接上例，a=[['A','B'],'C']，n=[3,4]。
3. 再次进行编码，仍选取频数最低和次低的符号（或整体）：在频数最低者所含各符号的码符号前添加0，在频数次低者所含各符号的码符号前添加1。接上例，频数最低的是['A','B']，次低的是'C'，原码符号h=['0','1','']，更新后码符号h=['00','01','1']。
4. 重复第2、3步，直到所有符号都合并为一个整体，霍夫曼编码即告完成。

代码实现为

# Called after every encoding step to put the groups back in frequency order.
# b_zu is the list of symbol groups; b_sum[i] is the total frequency of b_zu[i].
def sort(b_zu, b_sum):
    """Move the first group to its ordered position, in place.

    Only element 0 is repositioned: it is slid to the right past every
    directly following entry whose sum is strictly smaller than its own, and
    it stops at the first entry that is not smaller. b_zu and b_sum are
    rearranged identically. Returns None.
    """
    head_group, head_sum = b_zu[0], b_sum[0]

    # Count the consecutive entries after the head that it must move past.
    target = 0
    for position in range(1, len(b_zu)):
        if not head_sum > b_sum[position]:
            break
        target = position

    if target == 0:
        return  # the head is already where it belongs

    # Take the head out and re-insert it at its new index in both lists.
    del b_zu[0], b_sum[0]
    b_zu.insert(target, head_group)
    b_sum.insert(target, head_sum)


# The function that follows performs the actual encoding.
def coding(a, a_n):
    """Return the Huffman code word of every symbol.

    a   -- list of symbols.
    a_n -- a_n[i] is the frequency of a[i].

    Returns a list h of '0'/'1' strings in which h[i] is the code word of
    a[i]. An empty symbol list yields an empty list.

    The symbols are ordered by frequency internally, so the input no longer
    has to arrive sorted in ascending order; input that is already ascending
    produces exactly the same code words as before.
    """
    count = len(a)
    h = [''] * count
    if count == 1:
        # A lone symbol would otherwise keep the empty code word, and text
        # made of that single symbol would encode to nothing at all.
        h[0] = '0'
        return h

    # Each group holds the indices of the symbols merged so far. The sort is
    # stable, so equal frequencies keep their input order.
    order = sorted(range(count), key=lambda idx: a_n[idx])
    b_zu = [[idx] for idx in order]
    b_sum = [a_n[idx] for idx in order]

    for _ in range(count - 1):
        # Slide the group merged in the previous round to its ordered place;
        # sort() rearranges b_zu and b_sum together.
        sort(b_zu, b_sum)

        # The two lightest groups are now at the front: prefix their symbols.
        for idx in b_zu[0]:
            h[idx] = '0' + h[idx]
        for idx in b_zu[1]:
            h[idx] = '1' + h[idx]

        # Merge them into one group and keep its running total, instead of
        # re-adding every group's frequencies on each round.
        b_zu[0] = b_zu[0] + b_zu[1]
        b_sum[0] = b_sum[0] + b_sum[1]
        b_zu.pop(1)
        b_sum.pop(1)
    return h


# h[i] is the Huffman code word of a[i].
# NOTE(review): the counting snippet earlier in this article assigns a/n and
# a_s/n_s but never a_n; presumably a_n is the frequency list parallel to a -
# confirm before running.
h = coding(a, a_n)

部分码表为

三、压缩文本
文本中各符号都在码表中有对应的码符号，获得码表后即可对文本进行压缩

将文本字符流转化为码符号流，即将字符串序列转化为01字符串序列。

# Path of the text to compress. A raw string is required: in an ordinary
# literal the "\U" of "C:\Users" is parsed as a unicode escape and fails.
Path = r'C:\Users\88466\Desktop\Gone with the wind.txt'
with open(Path, 'r') as f:
    data = f.read()

# Build the symbol -> code word table once (a is the symbol list, h the code
# word list) so each character costs a dictionary lookup rather than a scan
# of the whole symbol list.
code_of = dict(zip(a, h))

# Compress is the text rewritten as one long string of '0'/'1' characters.
# Characters absent from the table are skipped, as before. join() avoids the
# quadratic cost of growing the string one code word at a time.
Compress = ''.join(code_of[d] for d in data if d in code_of)

将码符号流转化为字节流，即将01字符串流转为二进制字节流

def bitstring2bytes(bs):
    """Pack a string of '0'/'1' characters into bytes, eight bits per byte.

    bs -- the bit string; its length need not be a multiple of 8.

    Returns a bytes object. When the length is not a multiple of 8 the last
    byte is filled on the right with zero bits, so every input bit keeps its
    position. The chunk count used to come from round(len(bs) / 8), which
    dropped the trailing bits when fewer than half a byte remained and
    otherwise packed a short, misaligned final chunk.

    The number of padding bits is not stored, so a decoder may read the
    padding as extra symbols at the very end of the data.
    """
    remainder = len(bs) % 8
    if remainder:
        bs = bs + '0' * (8 - remainder)
    # Cut the string into 8-character slices and read each as a base-2 number.
    return bytes(int(bs[start:start + 8], 2) for start in range(0, len(bs), 8))


# CompressData (assigned below) is the resulting byte stream.
# Turn the '0'/'1' string into the byte stream that is stored on disk.
CompressData = bitstring2bytes(Compress)

# Save the byte stream as the compressed file.
with open('Compress_Gone with the wind', mode='wb') as f2:
    f2.write(CompressData)

四、解压缩
将文本文件压缩为二进制文件后，还需要能够完整地还原回原文本。
步骤与压缩相反，分为两步：先将二进制字节流转为01字符串流，再将01字符串流转为字符串流。

def unCompress():
    """Decode 'Compress_Gone with the wind' back into a text file.

    Reads the compressed bytes, expands them into a '0'/'1' string, then
    walks that string bit by bit and emits a symbol each time the bits
    collected so far equal a code word. The decoded text is written to
    'unCompress_Gone with the wind.txt'.

    Uses the module-level symbol list a and code word list h. The earlier
    version looked code words up in a list named code that this script never
    defines; h is the code table built by coding().
    """
    with open('Compress_Gone with the wind', 'rb') as f1:
        Compress = f1.read()

    # Convert the byte stream to a bit string, eight zero-padded bits per byte.
    def bytes2bitstring(byte):
        return ''.join(format(value, '08b') for value in byte)

    Compress_data = bytes2bitstring(Compress)

    # code word -> symbol; on duplicate code words the first entry wins, as
    # list.index() did.
    symbol_of = {}
    for symbol, word in zip(a, h):
        symbol_of.setdefault(word, symbol)

    decoded = []
    temp_c = ''  # bits read since the last complete code word
    for c in Compress_data:
        temp_c = temp_c + c
        if temp_c in symbol_of:
            decoded.append(symbol_of[temp_c])
            temp_c = ''
    unCompress_data = ''.join(decoded)

    # The file must be opened for writing; the default mode is read-only.
    with open('unCompress_Gone with the wind.txt', 'w') as f2:
        f2.write(unCompress_data)

在此基础上用tkinter设计界面

import xlwt
import xlrd
import tkinter
import tkinter.filedialog
import tkinter.messagebox
import os
import os.path##编码代码
# Step 1: count symbol frequencies
def Statistics():
    """Count the symbols of the chosen text and save them to a workbook.

    Reads the file named by ToBeCompressPath, counts every character and
    writes 'Statistic_<name>.xls' into the same directory, where <name> is
    the file name up to its first dot. The sheet has a header row followed by
    one row per symbol (symbol, count, share of all characters), ordered from
    most to least frequent; equal counts keep first-appearance order. The
    workbook path is then stored in FrequencyPath.
    """
    # Local import: the file's import block does not provide collections.
    from collections import Counter

    source = ToBeCompressPath.get()
    Directory, ToBeCompressName = os.path.split(source)
    name = ToBeCompressName.split('.')[0]
    # normpath yields backslashes on Windows, like the hand-written
    # replace('/', '\\') did, without corrupting paths on other systems.
    Path = os.path.normpath(source)
    with open(Path, 'r') as f:
        data = f.read()

    # One counting pass plus the library ranking replace the assert/except
    # membership test and the hand-written selection sort.
    ranked = Counter(data).most_common()
    w = sum(count for _, count in ranked)  # total number of characters

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('1')
    sheet.write(0, 0, '符号')
    sheet.write(0, 1, '次数')
    sheet.write(0, 2, '占比')
    for row, (symbol, count) in enumerate(ranked, start=1):
        sheet.write(row, 0, symbol)
        sheet.write(row, 1, count)
        sheet.write(row, 2, count / w)

    target = Directory + '/Statistic_' + name + '.xls'
    workbook.save(target)
    FrequencyPath.set(target)


# Step 2: Huffman coding
def Huffman():
    """Build the Huffman code table from the frequency workbook.

    Reads the workbook named by FrequencyPath (first sheet: row 0 is a
    header, column 0 holds the symbol, column 1 its count), computes one code
    word per symbol, writes 'Huffman_code_<name>.xls' into the same directory
    and stores that path in HuffmanCodePath.
    """
    frequency_file = FrequencyPath.get()
    Directory, FrequencyName = os.path.split(frequency_file)
    # 'Statistic_<name>.xls' -> '<name>'. Splitting on the first underscore
    # only keeps base names that contain underscores themselves intact.
    stem = os.path.splitext(FrequencyName)[0]
    name = stem.split('_', 1)[1] if '_' in stem else stem
    Path = os.path.normpath(frequency_file)

    workbook1 = xlrd.open_workbook(Path)
    sheet1 = workbook1.sheets()[0]
    nrows1 = sheet1.nrows
    # Read the data rows bottom-up, skipping the header in row 0.
    a = []
    a_n = []
    for row in range(nrows1 - 1, 0, -1):
        a.append(sheet1.cell_value(row, 0))
        a_n.append(sheet1.cell_value(row, 1))

    def sort(b_zu, b_sum):
        """Slide the first group right, past all strictly smaller sums."""
        head_group, head_sum = b_zu[0], b_sum[0]
        index = 0
        for other in b_sum[1:]:
            if head_sum > other:
                index += 1
            else:
                break
        if index:
            del b_zu[0], b_sum[0]
            b_zu.insert(index, head_group)
            b_sum.insert(index, head_sum)

    def coding(a, a_n):
        """Return h, where h[i] is the '0'/'1' code word of a[i]."""
        count = len(a)
        h = [''] * count
        if count == 1:
            # A lone symbol must not keep the empty code word.
            h[0] = '0'
            return h
        # Groups of symbol indices in ascending frequency order. The sort is
        # stable, so rows that are already ascending keep their order.
        order = sorted(range(count), key=lambda idx: a_n[idx])
        b_zu = [[idx] for idx in order]
        b_sum = [a_n[idx] for idx in order]
        for _ in range(count - 1):
            sort(b_zu, b_sum)
            # The two lightest groups are at the front: prefix their symbols.
            for idx in b_zu[0]:
                h[idx] = '0' + h[idx]
            for idx in b_zu[1]:
                h[idx] = '1' + h[idx]
            # Merge them and keep the running total of the merged group.
            b_zu[0] = b_zu[0] + b_zu[1]
            b_sum[0] = b_sum[0] + b_sum[1]
            b_zu.pop(1)
            b_sum.pop(1)
        return h

    h = coding(a, a_n)

    workbook2 = xlwt.Workbook()
    sheet2 = workbook2.add_sheet('1')
    sheet2.write(0, 0, '符号')
    sheet2.write(0, 1, '编码')
    for row, (symbol, word) in enumerate(zip(a, h), start=1):
        sheet2.write(row, 0, symbol)
        sheet2.write(row, 1, word)

    target = Directory + '/Huffman_code_' + name + '.xls'
    workbook2.save(target)
    HuffmanCodePath.set(target)


# Step 3: compression
def Compress():
    """Compress the chosen text with the chosen Huffman code table.

    Reads the text named by ToBeCompressPath and the code workbook named by
    HuffmanCodePath (first sheet: row 0 is a header, column 0 the symbol,
    column 1 its code word), writes the packed bits to 'Compress_<name>' in
    the workbook's directory, stores that path in CompressPath and shows the
    percentage of space saved in the window.
    """
    code_file = HuffmanCodePath.get()
    Directory, CompressName = os.path.split(code_file)
    # 'Huffman_code_<name>.xls' -> '<name>'. At most two splits, so that
    # underscores inside <name> are kept.
    stem = os.path.splitext(CompressName)[0]
    parts = stem.split('_', 2)
    name = parts[2] if len(parts) == 3 else stem

    source = ToBeCompressPath.get()
    with open(source, 'r') as f1:
        data = f1.read()

    workbook1 = xlrd.open_workbook(code_file)
    sheet1 = workbook1.sheets()[0]
    nrows1 = sheet1.nrows
    # symbol -> code word, read bottom-up and skipping the header row.
    code_of = {}
    for row in range(nrows1 - 1, 0, -1):
        code_of[sheet1.cell_value(row, 0)] = sheet1.cell_value(row, 1)

    # Characters without a code word are skipped. join() avoids rebuilding
    # the whole bit string for every character.
    bits = ''.join(code_of[d] for d in data if d in code_of)

    # Pack the bit string into bytes.
    def bitstring2bytes(bs):
        # Pad on the right to a whole number of bytes. Deriving the chunk
        # count from round(len(bs) / 8) lost or misaligned the final bits.
        remainder = len(bs) % 8
        if remainder:
            bs = bs + '0' * (8 - remainder)
        return bytes(int(bs[start:start + 8], 2)
                     for start in range(0, len(bs), 8))

    CompressData = bitstring2bytes(bits)
    target = Directory + '/Compress_' + name
    with open(target, 'wb') as f2:
        f2.write(CompressData)
    CompressPath.set(target)

    # Show the share of space saved, with two decimals so that float noise
    # such as 43.620000000000005 cannot reach the label.
    size_before = os.path.getsize(source)
    size_after = os.path.getsize(target)
    saved = 100 * (1 - size_after / size_before) if size_before else 0.0
    theta = '{:.2f}'.format(saved)
    labelName = tkinter.Label(root, text=theta + '%', justify=tkinter.RIGHT,
                              anchor='e', width=80)
    labelName.place(x=160, y=270, width=50, height=20)


# Decompression
def unCompress():
    """Decode the compressed file back into text.

    Reads the bytes of the file named by CompressPath and the code workbook
    named by HuffmanCodePath (first sheet: row 0 is a header, column 0 the
    symbol, column 1 its code word), decodes the bits and writes the text to
    'unCompress_<name>.txt' in the compressed file's directory, then stores
    that path in unCompressPath.
    """
    compressed_file = CompressPath.get()
    code_file = HuffmanCodePath.get()
    Directory = os.path.split(compressed_file)[0]
    unCompressName = os.path.split(code_file)[1]
    # 'Huffman_code_<name>.xls' -> '<name>'. At most two splits, so that
    # underscores inside <name> are kept.
    stem = os.path.splitext(unCompressName)[0]
    parts = stem.split('_', 2)
    name = parts[2] if len(parts) == 3 else stem

    with open(compressed_file, 'rb') as f1:
        Compress = f1.read()

    # Convert the byte stream to a bit string, eight zero-padded bits per byte.
    def bytes2bitstring(byte):
        return ''.join(format(value, '08b') for value in byte)

    Compress_data = bytes2bitstring(Compress)

    workbook1 = xlrd.open_workbook(code_file)
    sheet1 = workbook1.sheets()[0]
    nrows1 = sheet1.nrows
    # code word -> symbol, read bottom-up and skipping the header row. On a
    # duplicate code word the first one read wins, as list.index() did.
    symbol_of = {}
    for row in range(nrows1 - 1, 0, -1):
        symbol_of.setdefault(sheet1.cell_value(row, 1),
                             sheet1.cell_value(row, 0))

    decoded = []
    temp = ''  # bits read since the last complete code word
    for c in Compress_data:
        temp = temp + c
        if temp in symbol_of:
            decoded.append(symbol_of[temp])
            temp = ''
    unCompress_data = ''.join(decoded)

    target = Directory + '/unCompress_' + name + '.txt'
    # Write with the default text encoding, the one Compress() reads the
    # source with. The 'ANSI' codec name exists on Windows only and raises
    # LookupError elsewhere.
    with open(target, 'w') as f2:
        f2.write(unCompress_data)
    unCompressPath.set(target)


# ---- Window layout ----
root = tkinter.Tk()
root.title('英文文本编码器')
# Window size
root['height'] = 320
root['width'] = 600

# --- Row 1: file to compress (caption, path entry, browse button) ---
labelName = tkinter.Label(root, text='待压缩文件:', justify=tkinter.RIGHT,
                          anchor='e', width=80)
labelName.place(x=30, y=30, width=80, height=20)
ToBeCompressPath = tkinter.StringVar(root, value='')
entry_ToBeCompressPath = tkinter.Entry(root, width=80,
                                       textvariable=ToBeCompressPath)
entry_ToBeCompressPath.place(x=120, y=30, width=350, height=20)


def open_file1():
    """Ask for a file and put its path into the 'file to compress' entry."""
    ToBeCompressPath.set(tkinter.filedialog.askopenfilename())


button_op1 = tkinter.Button(root, text='...', command=open_file1)
button_op1.place(x=470, y=30, width=20, height=20)

# --- Row 2: frequency workbook (caption, entry, browse, 'count' button) ---
labelName = tkinter.Label(root, text='字符频率文件:', justify=tkinter.RIGHT,
                          anchor='e', width=80)
labelName.place(x=30, y=80, width=80, height=20)
FrequencyPath = tkinter.StringVar(root, value='')
entry_FrequencyPath = tkinter.Entry(root, width=80,
                                    textvariable=FrequencyPath)
entry_FrequencyPath.place(x=120, y=80, width=350, height=20)


def open_file2():
    """Ask for a file and put its path into the 'frequency file' entry."""
    FrequencyPath.set(tkinter.filedialog.askopenfilename())


button_op2 = tkinter.Button(root, text='...', command=open_file2)
button_op2.place(x=470, y=80, width=20, height=20)
button_Statistic = tkinter.Button(root, text='统计', command=Statistics)
button_Statistic.place(x=520, y=80, width=50, height=20)

# --- Row 3: code table workbook (caption, entry, browse, 'encode' button) ---
labelName = tkinter.Label(root, text='编码文件:', justify=tkinter.RIGHT,
                          anchor='e', width=80)
labelName.place(x=30, y=130, width=80, height=20)
HuffmanCodePath = tkinter.StringVar(root, value='')
entry_HuffmanCodePath = tkinter.Entry(root, width=80,
                                      textvariable=HuffmanCodePath)
entry_HuffmanCodePath.place(x=120, y=130, width=350, height=20)


def open_file3():
    """Ask for a file and put its path into the 'code file' entry."""
    HuffmanCodePath.set(tkinter.filedialog.askopenfilename())


button_op3 = tkinter.Button(root, text='...', command=open_file3)
button_op3.place(x=470, y=130, width=20, height=20)
button_HuffmanCode = tkinter.Button(root, text='编码', command=Huffman)
button_HuffmanCode.place(x=520, y=130, width=50, height=20)

# --- Row 4: compressed file (caption, entry, browse, 'compress' button) ---
labelName = tkinter.Label(root, text='压缩文件:', justify=tkinter.RIGHT,
                          anchor='e', width=80)
labelName.place(x=30, y=180, width=80, height=20)
CompressPath = tkinter.StringVar(root, value='')
entry_CompressPath = tkinter.Entry(root, width=80, textvariable=CompressPath)
entry_CompressPath.place(x=120, y=180, width=350, height=20)


def open_file4():
    """Ask for a file and put its path into the 'compressed file' entry."""
    CompressPath.set(tkinter.filedialog.askopenfilename())


button_op4 = tkinter.Button(root, text='...', command=open_file4)
button_op4.place(x=470, y=180, width=20, height=20)
button_Compress = tkinter.Button(root, text='压缩', command=Compress)
button_Compress.place(x=520, y=180, width=50, height=20)

# --- Row 5: decompressed file (caption, entry, browse, 'decompress') ---
labelName = tkinter.Label(root, text='解压缩文件:', justify=tkinter.RIGHT,
                          anchor='e', width=80)
labelName.place(x=30, y=230, width=80, height=20)
unCompressPath = tkinter.StringVar(root, value='')
entry_unCompressPath = tkinter.Entry(root, width=80,
                                     textvariable=unCompressPath)
entry_unCompressPath.place(x=120, y=230, width=350, height=20)


def open_file5():
    """Ask for a file and put its path into the 'decompressed file' entry."""
    unCompressPath.set(tkinter.filedialog.askopenfilename())


button_op5 = tkinter.Button(root, text='...', command=open_file5)
button_op5.place(x=470, y=230, width=20, height=20)
button_unCompress = tkinter.Button(root, text='解压缩', command=unCompress)
button_unCompress.place(x=520, y=230, width=50, height=20)

# --- Bottom row: caption of the compression ratio; Compress() places the
# value label to its right ---
labelName = tkinter.Label(root, text='压缩效率为:', justify=tkinter.RIGHT,
                          anchor='e', width=80)
labelName.place(x=80, y=270, width=80, height=20)

# Quit button. Its creation had been absorbed into the preceding comment,
# which left button_drop_out undefined when place() was called on it.
button_drop_out = tkinter.Button(root, text='退出', command=root.destroy)
button_drop_out.place(x=490, y=270, width=60, height=30)

# Start the event loop
root.mainloop()

Python实现英文文本的霍夫曼编码压缩相关推荐

C语言霍夫曼编码压缩,数据结构大作业——哈夫曼编码压缩BMP格式文件
数据结构大作业--哈夫曼编码压缩BMP格式文件首先需要了解BMP图像格式 BMP图像格式详解其次需要了解哈夫曼编码如何对BMP文件进行压缩哈夫曼压缩与解压缩编程部分使用的头文件虽然这里用了 ...
创建霍夫曼树，霍夫曼编码以及使用霍夫曼编码压缩文件
那么,什么是霍夫曼树(赫夫曼树)呢? 给定n个权值(权值就是每个节点里面存放的数据,但是根据业务需求不同,存放的数据类型有些差别)作为n个叶子结点,构造一棵二叉树,若该树的带权路径长度达到最小,称这样 ...
Huffman霍夫曼树，霍夫曼编码
霍夫曼树基本概念: 路径:从一个结点往下到孩子或孙子结点之间的同理路径长度:如结点1到结点7的路径长度=2 结点的权:将结点的某一属性值作为结点的权带权路径长度:从根节点到该结点*该结点的权:如结 ...
霍夫曼树、霍夫曼编码
霍夫曼树一.基本介绍二.霍夫曼树几个重要概念和举例说明构成霍夫曼树的步骤: 举例:以arr = {1 3 6 7 8 13 29} public class HuffmanTr ...
labview霍夫曼编码_为什么霍夫曼编码好？
7 个答案: 答案 0 :(得分:3) 如果为最常用使用的符号指定较少的数字或位或较短的代码字词,则可以节省大量存储空间. 假设您要为英文字母分配26个唯一代码,并希望根据这些代码存储英文小说(仅限字 ...
【数据结构】图解霍夫曼编码，看了就能懂
今天来给大家普及一下霍夫曼编码(Huffman Coding),一种用于无损数据压缩的熵编码算法,由美国计算机科学家大卫·霍夫曼在 1952 年提出--这么专业的解释,不用问,来自维基百科了. 说实话 ...
哈夫曼编码压缩率计算_程序员的算法课（8）-贪心算法：理解霍夫曼编码
版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明. 本文链接:https://blog.csdn.net/m0_37609579/article/ ...
程序员的算法课（8）-贪心算法：理解霍夫曼编码
一.一种很贪婪的算法定义贪心是人类自带的能力,贪心算法是在贪心决策上进行统筹规划的统称. [百度百科]贪心算法(又称贪婪算法)是指,在对问题求解时,总是做出在当前看来是最好的选择.也就是说,不从整体 ...
Zlib压缩算法：LZ77、LZ78、霍夫曼编码、滑动窗口、Rabin-Karp算法、哈希链、I/O缓冲区
Table of Contents 1.简介 1.1 什么是zlib 2.压缩算法 2.1 放气 2.2 LZ77 2.2.1 滑动窗口 2.2.2 长距离对 2.3 霍夫曼编码 3. zlib的实现 ...

Python实现英文文本的霍夫曼编码压缩

Python实现英文文本的霍夫曼编码压缩相关推荐

最新文章

热门文章