用 Python 为词语生成新世纪五笔编码

输入法 RIME

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# d.py
import time
from itertools import islice

# wubixinshiji.dict.yaml: https://github.com/GuoBinyong/wubixinshiji
# Path of the New Century Wubi dictionary file that the table-reading
# functions open.
NCTPATH = './wubixinshiji.dict.yaml'
# Read the New Century Wubi code table
def read_code_table(path=None):
    """Load the New Century Wubi code table as a ``{text: code}`` dict.

    The first 81 lines of the file are skipped (they are treated as the
    header).  Every remaining line is split on tabs; the first field is
    the text and the second field is its code.  When the same text occurs
    more than once, the code from the last occurrence wins.

    Args:
        path: Dictionary file to read.  Defaults to ``NCTPATH``.

    Returns:
        dict mapping each text (single character or word) to its code.
    """
    if path is None:
        path = NCTPATH
    nc_ct = {}
    with open(path, 'r', encoding='utf-8') as f:
        # Stream the file instead of materialising it with readlines().
        for line in islice(f, 81, None):
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or malformed line: nothing to record.
                continue
            nc_ct[fields[0]] = fields[1]
    print("已读取新世纪码表, 共有 %d 项 " % len(nc_ct))
    return nc_ct
# Read the New Century Wubi code table as a list
def list_code_table(path=None):
    """Load the New Century Wubi code table as a list of ``(text, code)``.

    Unlike a dict, the list keeps every entry of the file, including
    texts that appear several times with different codes, in file order.
    The first 81 lines of the file are skipped (treated as the header).

    Args:
        path: Dictionary file to read.  Defaults to ``NCTPATH``.

    Returns:
        list of ``(text, code)`` tuples.
    """
    if path is None:
        path = NCTPATH
    list_nc_ct = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in islice(f, 81, None):
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or malformed line: nothing to record.
                continue
            list_nc_ct.append((fields[0], fields[1]))
    print("已读取新世纪码表, 共有 %d 项 " % len(list_nc_ct))
    return list_nc_ct
# Build the New Century Wubi single-character code table
def single_table(path=None):
    """Build the single-character code table ``{character: code}``.

    The first 81 lines of the file are skipped.  For every remaining line
    (tab separated, e.g. ``['工', 'a', '99454797', 'aa']``):

    * entries whose text is not exactly one character are ignored;
    * single characters whose code is only one letter long are ignored;
    * the first remaining code seen for a character is kept, later ones
      are counted as duplicates.

    Args:
        path: Dictionary file to read.  Defaults to ``NCTPATH``.

    Returns:
        dict mapping each single character to the first code longer than
        one letter found for it.
    """
    if path is None:
        path = NCTPATH
    single_ct = {}
    duplicates = 0       # characters already present in single_ct
    one_letter = 0       # single characters with a one-letter code
    not_single = 0       # entries whose text is not a single character
    with open(path, 'r', encoding='utf-8') as f:
        for line in islice(f, 81, None):
            fields = line.strip().split('\t')
            text = fields[0]
            if len(text) != 1:
                not_single += 1
                continue
            code = fields[1]
            if len(code) <= 1:
                one_letter += 1
                continue
            if text in single_ct:
                duplicates += 1
                continue
            single_ct[text] = code
    # NOTE(review): the message reports 82 ignored lines while islice()
    # skips 81; the text is kept unchanged -- confirm which is intended.
    print("忽略 82 行，删掉词语 %d 条，独码 %d 条，重复 %d 条，已生成单字码表共有 %d 项" %(not_single,one_letter,duplicates,len(single_ct)))
    return single_ct
# Read everything once at module level to reduce overhead
# Full code table as a dict; filter_code_table tests membership against it.
NCT = read_code_table()
# The same table as a list of (text, code) tuples; the main script merges
# the newly encoded words into it.
listNCT=list_code_table()
# Single-character code table; query_code looks characters up in it.
SCT = single_table()
# Compare a Wubi 98 word list with the New Century Wubi code table and save
# the entries missing from the New Century table to a file
def filter_code_table(table_path, out_path='./out.txt', known=None):
    """Append the words of a Wubi 98 dictionary that are not yet known.

    The first 24 lines of ``table_path`` are skipped, so reading starts at
    line 25.  Each remaining line is split on tabs; the first field is the
    text and the second its Wubi 98 code.  Texts already present in
    ``known`` are counted and dropped, unknown single characters are
    counted and dropped, and unknown words (more than one character) are
    collected and appended to ``out_path`` as ``text<TAB>code`` lines.

    NOTE(review): the output file is opened in append mode, so lines from
    earlier calls (and earlier runs of the program) stay in it.

    Args:
        table_path: Wubi 98 dictionary file to scan.
        out_path: File the unknown words are appended to.
        known: Container of texts that already have a code.  Defaults to
            the module-level ``NCT`` table.

    Returns:
        None.  A summary line is printed.
    """
    if known is None:
        known = NCT
    missing = {}         # unknown words -> their Wubi 98 code
    total = 0            # lines processed
    present = 0          # texts already in the known table
    lone_chars = 0       # unknown texts that are not words
    with open(table_path, 'r', encoding='utf-8') as f:
        for line in islice(f, 24, None):
            fields = line.strip().split('\t')
            text = fields[0]
            if text in known:
                present += 1
            elif len(text) > 1:
                missing[text] = fields[1]
            else:
                lone_chars += 1
            total += 1
    with open(out_path, 'a+', encoding='utf-8') as o:
        o.writelines('%s\t%s\n' % item for item in missing.items())
    print("处理词库文件 %s 完毕, 共处理 %d 行, %d 个项已经存在于新世纪五笔码表中, 保存了 %d 行，舍弃了 %d 个单独字符。" % (table_path, total, present, len(missing), lone_chars))
# Look up the code of a single character
def query_code(s, table=None):
    """Return the code stored for the single character ``s``.

    Args:
        s: Character to look up.
        table: Mapping of character to code.  Defaults to the module-level
            ``SCT`` table.

    Returns:
        The code string, or None when ``s`` is not in the table.
    """
    if table is None:
        table = SCT
    return table.get(s)
# Find the New Century Wubi code for Chinese text
def get_code(str, table=None):
    """Build the code of a character or word from single-character codes.

    The code is assembled as follows:

    * 1 character: the character's own code;
    * 2 characters: first two letters of each character's code;
    * 3 characters: first letter of the first two characters' codes plus
      the first two letters of the third character's code;
    * 4 or more: first letter of the codes of the first, second, third
      and last characters.

    Args:
        str: Text to encode.  (The name shadows the builtin; it is kept
            for compatibility with existing callers.)
        table: Optional mapping of character to code.  When omitted,
            characters are looked up with ``query_code``.

    Returns:
        The code string, or None when the text is empty or any character
        that is needed has no code.
    """
    word = str
    lookup = query_code if table is None else table.get
    size = len(word)
    if size == 0:
        return None
    if size == 1:
        return lookup(word)
    # (character, number of leading code letters to take) for each part.
    if size == 2:
        picks = ((word[0], 2), (word[1], 2))
    elif size == 3:
        picks = ((word[0], 1), (word[1], 1), (word[2], 2))
    else:
        picks = ((word[0], 1), (word[1], 1), (word[2], 1), (word[-1], 1))
    parts = []
    for char, take in picks:
        code = lookup(char)
        if not code:
            # No code is known for this character, so the word cannot be
            # encoded; report that instead of failing on a None slice.
            return None
        parts.append(code[:take])
    return ''.join(parts)


def take_second(e):
    """Return the second item of ``e``; used as the sort key (the code)."""
    return e[1]


def main():
    """Merge unknown Wubi 98 words into the New Century Wubi table.

    Collects the unknown words of the three Wubi 98 dictionaries into
    ``./out.txt``, gives each of them a code with ``get_code``, appends
    them to the existing table, sorts everything by code and writes the
    result, preceded by the first 81 lines of the original dictionary, to
    ``./sorted.txt``.
    """
    start = time.time()
    for table_path in ('./wubi98_ci.dict.yaml',
                       './wubi98_S.dict.yaml',
                       './wubi98_U.dict.yaml'):
        filter_code_table(table_path)

    new_code_table = {}
    lc = 0               # lines read from out.txt
    skipped = 0          # lines whose word could not be encoded
    with open(u'./out.txt', 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip().split('\t')[0]
            lc += 1
            code = get_code(word)
            if code is None:
                skipped += 1
                continue
            # Using a dict drops repeated words.
            new_code_table[word] = code

    # Existing entries first, then the new words; the sort is stable, so
    # existing entries stay ahead of new ones that share a code.
    final_code_table = list(listNCT)
    final_code_table.extend(new_code_table.items())
    final_code_table.sort(key=take_second)

    cc = 0               # entries written
    with open(NCTPATH, 'r', encoding='utf-8') as f, \
            open(u'./sorted.txt', 'w', encoding='utf-8') as o:
        # Copy the file header unchanged.
        o.writelines(islice(f, 81))
        for text, code in final_code_table:
            o.write('%s\t%s\n' % (text, code))
            cc += 1

    print("新世纪五笔原有 %d 条，新添加了 %d 条，整合后为 %d 条，从98五笔词库中提取了 %d 条词语（未查重），现写入 %d 条词语。" % (len(listNCT),len(new_code_table),len(final_code_table),lc, cc))
    if skipped:
        print("有 %d 条词语因含有单字码表中不存在的字而无法编码，已跳过。" % skipped)
    end = time.time()
    print("程序用时：%fs" % (end - start))


if __name__ == '__main__':
    main()

程序输出

已读取新世纪码表, 共有 107396 项
已读取新世纪码表, 共有 112061 项
忽略 82 行，删掉词语 79459 条，独码 25 条，重复 4491 条，已生成单字码表共有 28086 项
处理词库文件 ./wubi98_ci.dict.yaml 完毕, 共处理 108547 行, 92410 个项已经存在于新世纪五笔码表中, 保存了 16047 行，舍弃了 90 个单独字符。
处理词库文件 ./wubi98_S.dict.yaml 完毕, 共处理 73011 行, 64295 个项已经存在于新世纪五笔码表中, 保存了 8715 行，舍弃了 1 个单独字符。
处理词库文件 ./wubi98_U.dict.yaml 完毕, 共处理 100477 行, 32351 个项已经存在于新世纪五笔码表中, 保存了 0 行，舍弃了 68126 个单独字符。
新世纪五笔原有 112061 条，新添加了 16072 条，整合后为 128133 条，从98五笔词库中提取了 24762 条词语（未查重），现写入 128133 条词语。
程序用时：0.481324s

squirrel.custom.yaml

patch:
  style/color_scheme: das2m
  style/horizontal: true
  style/text_orientation: horizontal  # horizontal | vertical
  style/inline_preedit: true
  style/font_face: '思源黑体 CN ExtraLight'
  style/font_point: 16
  style/label_font_point: 16
  style/comment_font_point: 12
  style/corner_radius: 5
  style/border_height: 4
  style/dborder_width: 4
  style/candidate_format: "%c %@ "
  preset_color_schemes/das2m:
    name: 少司命 / das2m
    author: Das2m <zhangyingda@gmail.com>
    back_color: '0x4C4957'
    text_color: '0xCAFDDB'
    hilited_text_color: '0xCAFDDB'
    hilited_back_color: '0x4C4957'
    hilited_candidate_text_color: '0xA28AFD'
    hilited_candidate_back_color: '0x4C4957'
    hilited_candidate_label_color: '0xA28AFD'
    hilited_comment_text_color: '0xA28AFD'
    candidate_text_color: '0xFDFCFC'
    label_color: '0xFDFCFC'
    comment_text_color: '0xFDFCFC'

Python 给词语编码新世纪五笔相关推荐

Linux下安装新世纪五笔输入法（附表）（for 小白）
2018.1.31 本人刚入Linux不久,奈何自己习惯用的输入法是新世纪五笔,Linux下提供可安装的五笔并不是新世纪版本的,于是开始了漫长的百度之旅... 网上各种教程.经验,基本上关于86或98 ...
Ubuntu 下安装新世纪五笔输入法
http://pinyin.sogou.com/linux/help.php 1.在此页面下添加软件源:ppa:fcitx-team/nightly 2.之后在软件中心安装 fctix 这样就可以安 ...
为ibus输入法框架制作新世纪五笔码表
本文记录了笔者在ubuntu 16.04系统上为ibus输入法框架制作新世纪五笔码表的过程,有需要的可以参考一下. 首先,安装ibus输入法框架: sudo apt install ibus- ...
Arch Linux fcitx 新世纪五笔配置
前提:安装 fcitx RIME安装安裝: $ sudo pacman -S fcitx-rime 注销生效 Ctrl + Shift 切换到 RIME Ctrl +' 可以选择输入方案下面是新世 ...
linux 中的rime 输入法自定义新世纪五笔输入法
linux 中使用新世纪五笔输入法正如新世纪五笔输入法在windows 下是收费的情况是一样的, 我们大linux 下没有一个可以让我们轻松使用的新世纪五笔输入法, 可以方便使用的只有我们的86 ...
新世纪五笔形码之耻_世纪之剑
新世纪五笔形码之耻翻译自: https://www.pybloggers.com/2016/11/the-hack-of-the-century/ 新世纪五笔形码之耻
Linux新世纪五笔
下载新世纪版五笔码表链接: https://pan.baidu.com/s/1p_B9ijDzgoB2V_tTY3NqcA 提取码: zsra 首先在终端输入sudo apt-get install ...
提供linux下的新世纪五笔的码表和字根口诀，用于ibus。
注:怎么配置输入法,请阅读:http://blog.csdn.net/sabalol/article/details/8512436 # create new database from wubi-x ...
基于Linux系统部署新世纪版五笔输入法
基于Linux系统部署新世纪版五笔输入法出于工作需要,突然生出学习五笔的想法.五笔有三个版本:86/98/新世纪. 按照新手上路,有新学新的想法,选择了新世纪版. 然后发现了个坑:公司电脑是Linu ...
输入法全屏_五笔输入法那么方便，为什么败给了拼音？如今，我可算是明白了...
我们在日常生活中使用的输入法有很多种.输入法有四种:拼音输入法.五笔输入法.手写输入法和笔画输入法.其中,拼音输入法分为九笔输入法和二十六笔输入法.手写输入分为全屏手写和半屏手写.在这些输入法中,我们 ...

Python 给词语编码新世纪五笔

Python 给词语编码新世纪五笔相关推荐

最新文章

热门文章