常用Python文件

# -*- coding: utf-8 -*-
"""
Created on Wed Jun 28 18:42:33 2017

"""
import re
import numpy as np

'''
该程序实现对giza++后的对齐双语平行语料抽取对齐词汇关系
建立目标语言到源语言的对齐矩阵matrixTS（行对应目标词，列对应源词），下标从0开始，对齐文件中的NULL不计入矩阵
如果词语之间存在对齐关系，则将对齐矩阵matrixTS[t][s]位置值设置为1，其它为0

'''
def alig_pairs(filepath):
    """Build one 0/1 alignment matrix per record of an alignment file.

    The file is read in groups of three lines: a header line (ignored), a
    line of whitespace-separated target words, and a line of source tokens
    where each token is followed by ``({ i j ... })`` listing the 1-based
    positions of the target words aligned to it. The first source token
    (the NULL token of the alignment output) is not represented in the
    matrix.

    Args:
        filepath: Path of the alignment file; it is opened in text mode
            with the platform default encoding.

    Returns:
        A list holding one ``numpy`` array per record, of shape
        ``(number of target words, number of source tokens minus one)``.
        Entry ``[t][s]`` is 1 when target word ``t`` is aligned to source
        token ``s`` and 0 otherwise.
    """
    pattern1 = re.compile(r' \(\{([0-9 ]*)\}\) ?')
    matrixZeroOne = []
    # ``with`` closes the handle even when a malformed record raises while
    # it is being parsed.
    with open(filepath, 'r') as f:
        while True:
            header = f.readline()
            if not header:
                break
            target_line = f.readline()
            source_line = f.readline()
            if not source_line:
                # The file ended inside a record (for example a trailing
                # blank line); there is nothing left to align.
                break
            target = target_line.strip().split()
            # Splitting on a pattern with one capture group yields
            # [token0, indices0, token1, indices1, ..., trailing piece].
            source_word = pattern1.split(source_line.strip())
            s_l = len(source_word) // 2 - 1  # the first token is excluded
            t_l = len(target)
            matrixTS = np.zeros((t_l, s_l))
            # Index groups sit at odd positions; position 1 belongs to the
            # first token, so start at 3 and stop before the trailing piece.
            for column, index in enumerate(source_word[3:-1:2]):
                for s_ind in index.split():
                    matrixTS[int(s_ind) - 1][column] = 1
            # Matrices differ in size per record, so they are collected in
            # a list rather than stacked.
            matrixZeroOne.append(matrixTS)
    return matrixZeroOne
#alig_pairs('test.txt')
#alig_pairs('117-06-28.183340.lmt.A3.final')
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# -*- coding: utf-8 -*-
import codecs
def _read_sentences(path):
    """Parse a UTF-8 token file into a list of sentences.

    Every line with exactly four whitespace-separated fields becomes one
    entry of the current sentence, with its second and third fields swapped
    and all fields joined by single spaces. A blank line closes the current
    sentence. Lines with any other field count are ignored.

    NOTE: entries after the last blank line are not returned, so the file
    is expected to end each sentence (including the last) with a blank line.
    """
    sentences = []
    words = []
    # ``with`` makes sure the handle is released once parsing is done.
    with codecs.open(path, 'r', encoding='utf-8') as handle:
        for line in handle.readlines():
            pair = line.strip().split()
            if len(pair) == 4:
                pair[1], pair[2] = pair[2], pair[1]
                words.append(" ".join(pair))
            elif not pair:
                sentences.append(words)
                words = []
    return sentences


def get_matrix(chinese_path="result/result_cn", english_path="result/result_en"):
    """Build one labelled, "0"-filled grid per Chinese/English sentence pair.

    Args:
        chinese_path: UTF-8 file with the Chinese entries, one per line and
            sentences separated by blank lines.
        english_path: UTF-8 file with the English entries in the same
            layout.

    Returns:
        A tuple ``(matrix, chinese_sentence)``. ``chinese_sentence`` is the
        list of parsed Chinese sentences. ``matrix`` holds one grid per
        sentence pair with ``len(chinese) + 1`` rows and
        ``len(english) + 1`` columns: row 0 carries the English entries
        from column 1 on, column 0 carries the Chinese entries from row 1
        on, and every other cell is the string ``"0"``. When the two files
        contain a different number of sentences, ``error`` is printed and
        ``matrix`` is empty.
    """
    chinese_sentence = _read_sentences(chinese_path)
    english_sentence = _read_sentences(english_path)

    matrix = []
    if len(english_sentence) != len(chinese_sentence):
        print('error')
        return matrix, chinese_sentence

    for chinese_words, english_words in zip(chinese_sentence, english_sentence):
        # Header row: a "0" corner cell followed by the English entries.
        english_chinese = [["0"] + list(english_words)]
        # One row per Chinese entry: the entry itself, then "0" cells.
        english_chinese.extend(
            [word] + ["0"] * len(english_words) for word in chinese_words
        )
        # Grids differ in size from one sentence pair to the next.
        matrix.append(english_chinese)
    return matrix, chinese_sentence
#matrix,_ = get_matrix()
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

#-*-coding:utf-8-*-
import os
import string

def count(filepath):
    """Print line statistics for a text file.

    Prints the file path, then the number of blank (whitespace-only) lines,
    the number of lines whose first non-blank character is ``#``, and the
    total number of lines.

    Args:
        filepath: Path of the file to inspect; it is opened in text mode
            with the platform default encoding.
    """
    total = 0  # all lines
    countPound = 0  # lines starting with '#'
    countBlank = 0  # whitespace-only lines
    with open(filepath, 'r') as handle:
        for li in handle:
            total += 1
            # Keep the stripped text: str.strip() returns a new string, so
            # calling it without using the result has no effect.
            stripped = li.strip()
            if not stripped:
                countBlank += 1
            elif stripped.startswith('#'):
                countPound += 1
    print(filepath)
    print("countBlank:%d" % countBlank)
    print("countPound:%d" % countPound)
    print("total:%d" % total)

# Script entry: print the line statistics of 'result_cn', resolved against
# the current working directory.
count('result_cn')

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#-*-coding:utf-8-*-
def bijiao(path1='lmt.txt', path2='lh.txt'):
    """Compare two text files line by line.

    Args:
        path1: Path of the first file.
        path2: Path of the second file.

    Returns:
        The 1-based numbers of the lines that differ, in ascending order.
        When one file has more lines than the other, every surplus line is
        reported as a difference. An empty list means the files are equal.
    """
    from itertools import zip_longest

    dif = []  # numbers of the lines that differ
    with open(path1, 'r') as f1, open(path2, 'r') as f2:
        # zip_longest pads the shorter file with None, which never equals a
        # line, so surplus lines in either file are counted as differences.
        for count, (a, b) in enumerate(zip_longest(f1, f2), start=1):
            if a != b:
                dif.append(count)
    return dif
# Run the comparison and report the result.
c = bijiao()
if not c:
    # bijiao() returns a list; an empty list means no line differed.
    print('两个文件一样！')
else:
    print('有%d处不同' % len(c))
    for each in c:
        print('%d行不一样' % each)

转载于:https://www.cnblogs.com/maowuyu-xb/p/7236769.html

常用Python文件相关推荐

python对文件的读操作方法有哪些-Python文件常用操作方法
Python文件常用操作方法一.对File对象常用操作方法: file= open(file, mode='r', buffering=-1, encoding=None, errors=None, ...
python文件操作的方法_Python文件常用操作方法
Python文件常用操作方法一.对File对象常用操作方法: file= open(file, mode='r', buffering=-1, encoding=None, errors=None, ...
cfile清空文件内容_体育老师学编程（第11天）python常用的文件读写操作
学习内容:python文件处理一.什么是文件: 前边学习了计算机的存储设备分为内存和硬盘两种,内存容量小,断电就丢失,我们若想长期存储一段内容,就需要存到硬盘中,那么存入的方式就是以文件形式存入的. ...
python复制文件shutil_Python常用模块——文件复制模块shutil
Python常用模块--文件复制模块shutil shutil模块高级的文件.文件夹.压缩包处理模块 shutil.copyfileobj(fsrc, fdst) 将文件内容拷贝到另一个文件中 im ...
Python文件操作-文本文件、二进制文件、csv文件的读取写入、OS、shutil、CSV模块、常用字符编码
Python文件操作文本文件和二进制文件文件操作相关模块 open()创建文件对象文件对象的常用属性和方法 pickle 序列化文本文件读取和写入文本文件写入步骤 write()/write ...
Python包下载常用whl文件汇总：最全的Python whl集合
Python包下载常用whl文件汇总:最全的Python whl集合对于Python开发者来说,我们经常需要下载各种第三方库或包,而这些包往往需要我们手动下载并安装.在Python中,我们通常使用p ...
Python文件的读写及常用文件的打开方式
编码格式常见的编码格式 Python的解释器使用的是Unicode(内存) .py文件在磁盘上使用UTF-8(外存) 更改编码格式一般形式为在程序开头写 # coding:编码格式.# codin ...
Python基础文本控制文件编码格式使用python读写文件常用的文件打开方式
编码格式: 常见的字符编码格式 Python的解释器使用的是Unicode(内存) .py文件在磁盘上使用UTF-8存储(外存) 文件的读写原理: 1.文件的读写俗称"IO操作" ...
linux 怎么用命令行运行python文件_Linux命令行常用命令及python应用
通常用户和电脑交互是通过图形用户界面(GUI), 更快捷的方式是通过命令行界面(Command line interface), 通过在终端输入命令来实现文件夹和程序间的切换.Linux 和OS X都 ...

常用Python文件

常用Python文件相关推荐

最新文章

热门文章