【札记】Python处理TSV文件以及144790个英语单词的注音、释义、例句的.sql和.tsv文件下载

详情参见GitHub仓库：

链接

两年前的老东西了，这个周末闲得无聊又拿出来随便玩儿下。

之前的博文：

博文1

博文2

博文3

"""Split, merge and sanity-check a tab-separated English word list.

The input rows have the columns WORD_ID, SINGLE_WORD, WORD_MEANINGS and
EXAMPLE_SENTENCES.  The numbered ``funN`` helpers are the individual pipeline
steps; the ``__main__`` section at the bottom selects which step to run.
"""
import csv
import operator
import re
import time

# U+FEFF as it appears in decoded text.  write_to_tsv() emits a BOM
# ('utf-8-sig'); files produced by earlier runs may also carry it in the
# middle of the file, in front of an id value.
_BOM = '\ufeff'

# Column layout of the processed (output) files.
_OUTPUT_FIELDS = ('id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES')

# Tried in order, first hit wins: US pronunciation followed by a UK one,
# UK only, US only.  Each requires a closing ']' followed by a full-width
# comma.
_PRONUNCIATION_PATTERNS = (
    re.compile(r'美\[.*英.*\][，]'),
    re.compile(r'英\[.*\][，]'),
    re.compile(r'美\[.*\][，]'),
)


def _parse_id(raw_id) -> int:
    """Convert an id field to int, ignoring BOM and double-quote characters."""
    return int(str(raw_id).replace(_BOM, '').replace('"', ''))


def _find_id_gaps(id_list: list) -> list:
    """Return ``cur - 1`` for every adjacent pair where ``cur != prev + 1``."""
    return [cur - 1 for prev, cur in zip(id_list, id_list[1:]) if prev != cur - 1]


def _make_item(word_id, word, pronunciation, meaning, example_sentences) -> dict:
    """Build one output row keyed by the processed-file column names."""
    return dict(zip(_OUTPUT_FIELDS,
                    (word_id, word, pronunciation, meaning, example_sentences)))


def write_to_tsv(output_path: str, file_columns: list, data: list):
    """Write row dicts to a tab-separated file; no header row is written.

    Args:
        output_path: Destination path; written as UTF-8 with a BOM.
        file_columns: Field names giving the column order.
        data: Row dicts.  A key that is not in ``file_columns`` makes
            ``csv.DictWriter`` raise ``ValueError``.
    """
    # The format parameters are passed directly rather than through a globally
    # registered dialect, so nothing is left registered if writing fails, and
    # the ``with`` block closes the file on every path.
    with open(output_path, 'w', newline='', encoding='utf-8-sig') as wf:
        writer = csv.DictWriter(wf, fieldnames=file_columns, delimiter='\t',
                                quoting=csv.QUOTE_NONE, quotechar=None,
                                escapechar='|')
        writer.writerows(data)


def read_from_tsv(file_path: str, column_names: list) -> list:
    """Read a headerless tab-separated file into a list of row dicts.

    Args:
        file_path: Source path.
        column_names: Field names assigned to the columns by position.

    Returns:
        One dict per row, keyed by ``column_names``.
    """
    # 'utf-8-sig' drops a leading BOM when there is one and otherwise decodes
    # exactly like 'utf8', so files written by write_to_tsv() read back clean.
    # NOTE(review): the reader defines no escapechar, so the '|' escapes that
    # write_to_tsv() may emit are not undone here - confirm this is intended.
    with open(file_path, 'r', encoding='utf-8-sig') as rf:
        reader = csv.DictReader(rf, fieldnames=column_names, delimiter='\t',
                                quoting=csv.QUOTE_ALL)
        return [dict(row) for row in reader]


def extract_line_with_pronunciation(file_path: str, file_path_new_1: str, file_path_new_2: str):
    """Split a file line by line on whether it contains a pronunciation marker.

    Lines containing a tab followed by ``美[`` or ``英[`` are copied to
    ``file_path_new_1``; all other lines are copied to ``file_path_new_2``.
    """
    marker = re.compile(r'\t[美英]\[')
    with open(file_path, 'r', encoding='utf8') as source, \
            open(file_path_new_1, 'w', encoding='utf8') as with_marker, \
            open(file_path_new_2, 'w', encoding='utf8') as without_marker:
        for line in source:
            target = with_marker if marker.search(line) else without_marker
            target.write(line)


def split_pronunciation_line(file_path: str, column_names: list) -> list:
    """Separate the pronunciation prefix from the meaning of every row.

    Args:
        file_path: Headerless TSV to read.
        column_names: Names for its columns; the rows are accessed through the
            keys 'WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS' and
            'EXAMPLE_SENTENCES'.

    Returns:
        One dict per input row with the keys 'id' (int), 'word',
        'pronunciation', 'meaning' and 'EXAMPLE_SENTENCES'.  The matched
        pronunciation text is removed from the meaning.  A row in which no
        pattern matches keeps its meaning unchanged and gets an empty
        pronunciation, so no row is dropped or duplicated.
    """
    complete_format = []
    for column in read_from_tsv(file_path, column_names):
        raw_meaning = column['WORD_MEANINGS'] or ''
        pronunciation = ''
        meaning = raw_meaning
        for pattern in _PRONUNCIATION_PATTERNS:
            match = pattern.search(raw_meaning)
            if match:
                pronunciation = match.group()
                meaning = raw_meaning.replace(pronunciation, '')
                break
        # Every row uses the same 'EXAMPLE_SENTENCES' key: csv.DictWriter
        # rejects rows whose keys are not among its field names, and the
        # lookup is case-sensitive.
        complete_format.append(_make_item(_parse_id(column['WORD_ID']),
                                          column['SINGLE_WORD'],
                                          pronunciation, meaning,
                                          column['EXAMPLE_SENTENCES']))
    return complete_format


def split_pronunciation_line_no_pronunciation(file_path: str, column_names: list) -> list:
    """Convert rows without a pronunciation to the processed column layout.

    Returns:
        One dict per input row with an empty 'pronunciation'; 'id' stays a
        string, with any BOM character removed.
    """
    complete_format = []
    for column in read_from_tsv(file_path, column_names):
        word_id = column['WORD_ID']
        if word_id is not None:
            # Generalizes the former special case for the single id '\ufeff9'.
            word_id = word_id.replace(_BOM, '')
        # The key must be the upper-case 'EXAMPLE_SENTENCES'; a lower-case
        # spelling does not match the output column names.
        complete_format.append(_make_item(word_id, column['SINGLE_WORD'], '',
                                          column['WORD_MEANINGS'],
                                          column['EXAMPLE_SENTENCES']))
    return complete_format


def combine_tsv_with_same_headers(file_path_1: str, file_path_2: str, column_names: list) -> list:
    """Concatenate the rows of two processed files (first file's rows first).

    Each returned row holds exactly the keys 'id', 'word', 'pronunciation',
    'meaning' and 'EXAMPLE_SENTENCES'.
    """
    complete_format = []
    for path in (file_path_1, file_path_2):
        for column in read_from_tsv(path, column_names):
            complete_format.append({name: column[name] for name in _OUTPUT_FIELDS})
    return complete_format


def fun1(file_path_: str):
    """Print the first 100 rows of the original file."""
    data_list = read_from_tsv(
        file_path_, ['WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS', 'EXAMPLE_SENTENCES'])
    for row in data_list[:100]:
        print(row)


def fun2(file_path_origin: str, file_path_new_1: str, file_path_new_2: str):
    """Run extract_line_with_pronunciation() and print the elapsed time.

    Example arguments:
        file_path_origin = './TSV_data/origin/unduplicated_word_pronounciations_meanings_example_sentences.tsv'
        file_path_new_1 = './TSV_data/1_yes.tsv'
        file_path_new_2 = './TSV_data/2_no.tsv'
    """
    time_start = time.time()
    extract_line_with_pronunciation(file_path_origin, file_path_new_1, file_path_new_2)
    time_sum = time.time() - time_start
    print('Time used:\n' + str(time_sum) + ' seconds.')


def fun3(file_path: str, output_path: str, column_names_1: list, column_names_2: list):
    """Split pronunciations out of ``file_path`` and write the result.

    Example arguments:
        file_path = './TSV_data/1_yes.tsv'
        output_path = './TSV_data/partly_completed_yes.tsv'
        column_names_1 = ['WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS', 'EXAMPLE_SENTENCES']
        column_names_2 = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    """
    complete_format = split_pronunciation_line(file_path, column_names_1)
    write_to_tsv(output_path, column_names_2, complete_format)


def fun4(file_path: str, output_path: str, column_names_1: list, column_names_2: list):
    """Convert the rows without pronunciation and write the result.

    Example arguments:
        file_path = './TSV_data/2_no.tsv'
        output_path = './TSV_data/partly_completed_no.tsv'
        column_names_1 = ['WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS', 'EXAMPLE_SENTENCES']
        column_names_2 = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    """
    complete_format = split_pronunciation_line_no_pronunciation(file_path, column_names_1)
    write_to_tsv(output_path, column_names_2, complete_format)


def fun5(output_path_combine: str, file_path_1: str, file_path_2: str, column_names: list):
    """Merge two processed files, sort the rows by numeric id and write them.

    Example arguments:
        output_path_combine = './TSV_data/combined_version.tsv'
        file_path_1 = './TSV_data/partly_completed_yes.tsv'
        file_path_2 = './TSV_data/partly_completed_no.tsv'
        column_names = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    """
    complete_format = combine_tsv_with_same_headers(file_path_1, file_path_2, column_names)
    # Sorting on the integer itself instead of a zero-padded "%06d" string
    # keeps the order correct for ids with more than six digits.
    # Alternatives: key=lambda elem: (_parse_id(elem['id']), elem['word'])
    # sorts by id and then word; key=lambda elem: elem['word'] sorts by word.
    complete_format = sorted(complete_format, key=lambda elem: _parse_id(elem['id']))
    # Uses the column_names parameter; the previous code referred to a global
    # that only exists when this file is run as a script.
    write_to_tsv(output_path_combine, column_names, complete_format)


def fun6(file_path_len_field: str, column_names: list):
    """Print and return the length of the longest field value in a file.

    Prints the overall maximum length, the name of the column holding it, a
    separator line, and the per-column maximum lengths in ``column_names``
    order separated by spaces.  For a file without rows the maximum is -1 and
    the column name is empty.

    Example arguments:
        file_path_len_field = './TSV_data/3_combined_version_sorted_by_consecutive_IDs.tsv'
        column_names = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']

    Returns:
        The overall maximum field length.
    """
    complete_format = read_from_tsv(file_path_len_field, column_names)
    maxLen = -1
    filed_name = ''
    per_field = {name: -1 for name in column_names}
    for column in complete_format:
        for filed in column_names:
            # A row with too few fields yields None; count it as empty.
            length = len(column[filed] or '')
            if length > per_field[filed]:
                per_field[filed] = length
            if length > maxLen:
                # Update the name only together with the maximum, so that it
                # names the column that actually holds the longest value.
                maxLen = length
                filed_name = filed
    print(maxLen)
    print(filed_name)
    print('**********')
    print(' '.join(str(per_field[name]) for name in column_names))
    return maxLen


def fun7(file_path_1: str, file_path_2: str, column_names: list):
    """Compare the words of a processed file with a word list and report id gaps.

    Prints the first 90 sorted words of each source, whether the two sorted
    lists are equal, both lengths, the first word of the processed file that is
    missing from the word list, and finally the id statistics.  An earlier
    version returned ``(word_list_1, word_list_2)``; this one returns nothing.

    Example arguments:
        file_path_1 = './TSV_data/3_combined_version_sorted_by_consecutive_IDs.tsv'
        file_path_2 = './TSV_data/word.csv'
        column_names = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    """
    complete_format = read_from_tsv(file_path_1, column_names)
    # Strip BOM/quote characters from every id instead of overwriting two
    # fixed list positions with assumed values.
    id_list = [_parse_id(column['id']) for column in complete_format]
    word_list_1 = sorted(column['word'] for column in complete_format)

    with open(file_path_2, 'r', encoding='utf-8-sig') as file_word_data:
        word_list_2 = sorted(line.strip('\n') for line in file_word_data)

    print(word_list_1[0:90])
    print(word_list_2[0:90])
    print(operator.eq(word_list_1, word_list_2))
    print(str(len(word_list_1)) + ' - ' + str(len(word_list_2)))
    known_words = set(word_list_2)  # built once; set membership is O(1)
    for word in word_list_1:
        if word not in known_words:
            print('Found. -- ' + word)
            break
    print('*************************************')

    print(id_list[0:20])
    print(len(id_list))
    discrete_id_list = _find_id_gaps(id_list)
    print(len(discrete_id_list))
    print(discrete_id_list)
    print('*************************************')


def fun8(file_path: str):
    """Print how many ids a one-id-per-line file holds and where they jump.

    Prints the number of ids, the number of non-consecutive positions, and the
    list of ``id - 1`` values for every id that does not directly follow its
    predecessor.  Blank lines are skipped.

    Example arguments:
        file_path = './TSV_data/id.csv'
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file_id_data:
        id_list = [_parse_id(line) for line in file_id_data if line.strip()]
    print(len(id_list))
    discrete_id_list = _find_id_gaps(id_list)
    print(len(discrete_id_list))
    print(discrete_id_list)


def fun9(file_path_1: str, file_path_2: str, column_names: list):
    """Renumber the rows 1..n in file order, then write them sorted by word.

    The first ten renumbered rows (fewer if the file is shorter) are printed
    before sorting.

    Example arguments:
        file_path_1 = './TSV_data/3_combined_version_sorted_by_consecutive_IDs.tsv'
        file_path_2 = './TSV_data/combined_version_resorted_by_id.tsv'
        column_names = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    """
    complete_format = read_from_tsv(file_path_1, column_names)
    resort_id_list = [
        _make_item(new_id, row['word'], row['pronunciation'],
                   row['meaning'], row['EXAMPLE_SENTENCES'])
        for new_id, row in enumerate(complete_format, start=1)
    ]
    for item in resort_id_list[:10]:
        print(item)
    resort_id_list = sorted(resort_id_list, key=lambda elem: "%s" % (elem['word']))
    write_to_tsv(file_path_2, column_names, resort_id_list)


if __name__ == '__main__':
    # fun1, fun2
    file_path_origin = './TSV_data/origin/unduplicated_word_pronounciations_meanings_example_sentences.tsv'
    file_path_new_1 = './TSV_data/1_yes.tsv'
    file_path_new_2 = './TSV_data/2_no.tsv'
    # fun3
    file_path_yes = './TSV_data/1_yes.tsv'  # yes
    output_path_yes = './TSV_data/partly_completed_yes.tsv'
    column_names_1 = ['WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS', 'EXAMPLE_SENTENCES']
    column_names_2 = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    # fun4
    file_path_no = './TSV_data/2_no.tsv'  # no
    output_path_no = './TSV_data/partly_completed_no.tsv'
    # fun5
    output_path_combine = './TSV_data/combined_version.tsv'
    # fun6
    file_path_len_field = './TSV_data/3_combined_version_sorted_by_consecutive_IDs.tsv'
    # fun7
    file_path_word = './TSV_data/word_nonconsecutive_IDs.csv'
    # fun8
    file_path_id = './TSV_data/id_nonconsecutive.csv'
    # fun9
    file_path_to_resort_id = file_path_len_field
    file_path_resorted_id = './TSV_data/5_combined_version_sorted_by_alphabet.tsv'

    # Pipeline steps; uncomment the one to run.
    # fun1(file_path_origin)
    # fun2(file_path_origin, file_path_new_1, file_path_new_2)
    # fun3(file_path_yes, output_path_yes, column_names_1, column_names_2)
    # fun4(file_path_no, output_path_no, column_names_1, column_names_2)
    # fun5(output_path_combine, output_path_yes, output_path_no, column_names_2)
    # fun6(file_path_len_field, column_names_2) # max length of field is: 2115, so 2500 may be enough.
    # The max length of each field is: 6, 31, 74, 375, 2115.
    # ('id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES')
    # fun7(file_path_len_field, file_path_word, column_names_2)
    fun8(file_path_id)
    # fun9(file_path_to_resort_id, file_path_resorted_id, column_names_2)
    print('Done.')

【札记】Python处理TSV文件以及144790个英语单词的注音、释义、例句的.sql和.tsv文件下载相关推荐

tsv文件导oracle窜列,TSV(tsv文档怎么转换excel)
汽车空调的循环内是指关闭空调系统的流量和空气中的车外来回仅车室内空气流通,用于冷却和加热能发挥最佳的效果,而室内空气污染增加了.外回路指的是汽车空调系. 一种格式,可以用excel打开,不过好象要改个 ...
python编程技术解决英语单词测试（包括添加英文单词、查询英文单词和查询英文单词）以及绘制雷达图功能
用如Tkinter或wxPython等GUI开发库开发程序用户界面,通过同用户界面交互,程序至少完成以下基本功能: 1.添加英文单词:能对单词添加多重中文释义,不同释义用逗号分开, 单词保存在txt文 ...
python读取.tsv文件例子（大文件处理）
本文介绍如何用python读取.tsv文件. 目录 1..tsv 文件介绍 2..tsv 文件读取 1..tsv 文件介绍 TSV(Tab-separated values)文件是一种文本文件格式,它 ...
【python】将多个tsv文件合并到excel表中
需求将某个目录下多个特定文本文件,合并到excel表中.并要求将文件名作为sheet名,每个sheet中第一行第一个单元格填写对应文件的所在路径,单元格格式使用Times New Roman. 使用 ...
python读取tsv文件_Python读取tsv文件和evalu
我有tsv文件,准备如下:*Settings* Force, Tags FakeTag Resource ../../robot_resources/global.tsv *Test, Cases* ...
【Python】Pandas读取tsv文件
TSV文件和CSV的文件的区别是:前者使用\t作为分隔符,后者使用,作为分隔符. 使用pandas读取tsv文件的代码如下: train=pd.read_csv('test.tsv', sep='\t ...
Python逐行读取tsv文件
tsv文件中的分隔符一般是'\t',比如下边这个文件: 该rating.tsv文件没有表头,逐行读取方法如下: with open('rating.tsv', 'r', encoding='utf-8 ...
Python 写入tsv文件
with open(r'file.tsv', 'w', newline='') as f:tsv_w = csv.writer(f, delimiter='\t')tsv_w.writerow(['A ...
python学习——tsv文件批量转为csv文件、csv文件列合并
写在前面--近日在处理数据的时候发现有的文件为csv文件,有的为tsv文件,大概搜了一下了解到:TSV是用制表符('\t')作为字段值的分隔符:CSV是用半角逗号(',')作为字段值的分隔符.http ...

【札记】Python处理TSV文件以及144790个英语单词的注音、释义、例句的.sql和.tsv文件下载

【札记】Python处理TSV文件以及144790个英语单词的注音、释义、例句的.sql和.tsv文件下载相关推荐

最新文章

热门文章