python怎么读文件内容-Python 文件内容读取

背景

计划实现一个文件中心，支持检索常见类型文件的内容，因此需要先把各类文件的内容读取为文本。

依赖包

#pip install baidu-aip

from aip import AipOcr

#pip install xlrd

import xlrd

import os

#pip install csv23

import csv23

#pip install docx2txt

import docx2txt

#pip install pypiwin32

from win32com import client as wc

#pip install python-pptx

from pptx import Presentation

#pip install wand

from wand.image import Image

# If wand raises an error because ImageMagick support is missing, install ImageMagick:

# http://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-windows

# https://imagemagick.org/script/download.php#windows

# If wand fails with FailedToExecuteCommand "gswin32c.exe", install Ghostscript:

# http://ghostscript.com/download/gsdnld.html

基础文件读取

.txt

def readTxt(fileUrl):
    """Read a text file and return its content with all whitespace removed.

    The file is decoded as UTF-8 first; if that fails it is decoded as GBK,
    mirroring the fallback used by ``readCsv``.

    :param fileUrl: path of the text file.
    :return: the whole file content as one string without whitespace, or an
        empty string when the file does not exist.
    """
    if not os.path.exists(fileUrl):
        return ""
    try:
        with open(fileUrl, 'r', encoding='utf-8') as f:
            text = f.read()
    except UnicodeDecodeError:
        # Decoding happens before any text is kept, so a failed UTF-8 attempt
        # cannot leave partial content behind.
        with open(fileUrl, 'r', encoding='gbk') as f:
            text = f.read()
    # Keep every token of every line (the previous code kept only the first
    # space-delimited token per line) and drop whitespace, like readDocx does.
    return "".join(text.split())

.xls .xlsx

def readExcel(fileUrl):
    """Read an Excel workbook and return the text of all its cells.

    Every sheet is visited in the order given by ``sheet_names()``; each row's
    cell values are converted with ``str`` and concatenated, and space
    characters are removed.

    :param fileUrl: path of the workbook, opened with ``xlrd.open_workbook``.
    :return: concatenated cell text, or an empty string when the file does
        not exist.
    """
    if not os.path.exists(fileUrl):
        return ""
    workbook = xlrd.open_workbook(fileUrl)
    parts = []
    for name in workbook.sheet_names():
        sheet = workbook.sheet_by_name(name)
        for row_index in range(sheet.nrows):
            row_text = ''.join(map(str, sheet.row_values(row_index)))
            parts.append(row_text.replace(' ', ''))
    return ''.join(parts)

.docx

def readDocx(fileUrl):
    """Read a .docx file and return its text with all whitespace removed.

    :param fileUrl: path of the .docx file, passed to ``docx2txt.process``.
    :return: the extracted text without whitespace, or an empty string when
        the file does not exist.
    """
    if not os.path.exists(fileUrl):
        return ""
    text = docx2txt.process(fileUrl)
    # split() with no argument splits on any whitespace run, so joining the
    # pieces removes spaces, tabs and line breaks in one pass.
    return "".join(text.split())

.doc

def readDoc(fileUrl):
    """Read a legacy .doc file by converting it to .docx with local Word.

    Requires pypiwin32 and an installed Word application: the document is
    opened through COM, saved next to the original as ``<path>.docx``, read
    with ``readDocx`` and the temporary file is removed again.

    :param fileUrl: path of the .doc file.
    :return: the text returned by ``readDocx`` for the converted file, or an
        empty string when the file does not exist.
    """
    if not os.path.exists(fileUrl):
        return ""
    absolute_path = os.path.abspath(fileUrl)
    # Use the absolute path for saving, reading and deleting so that all three
    # steps are guaranteed to address the same temporary file.
    temp_docx = absolute_path + ".docx"
    word = wc.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(absolute_path)
        try:
            # 12 is the numeric save format handed to Word -- presumably
            # wdFormatXMLDocument (.docx); confirm against WdSaveFormat.
            doc.SaveAs(temp_docx, 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        # Always shut Word down, even when opening or saving failed.
        word.Quit()
    try:
        return readDocx(temp_docx)
    finally:
        # Remove the temporary file even if reading it raised.
        if os.path.exists(temp_docx):
            os.remove(temp_docx)

其他文件读取

.csv

def _collectCsvText(fileUrl, **open_kwargs):
    """Return all fields of a CSV file concatenated, with spaces removed."""
    with csv23.open_csv(fileUrl, **open_kwargs) as reader:
        return ''.join(''.join(row).replace(' ', '') for row in reader)


def readCsv(fileUrl):
    """Read a CSV file and return the text of all its fields.

    The file is first opened with csv23's default encoding; if decoding
    fails, it is read again from the start as GBK.

    :param fileUrl: path of the CSV file.
    :return: concatenated field text without spaces, or an empty string when
        the file does not exist.
    """
    if not os.path.exists(fileUrl):
        return ""
    try:
        return _collectCsvText(fileUrl)
    except UnicodeDecodeError:
        # Each attempt builds its own result, so rows decoded before the
        # failure are discarded instead of being duplicated by the GBK pass.
        return _collectCsvText(fileUrl, encoding='gbk')

图片

def readImage(fileUrl):
    """Run Baidu OCR on an image file and return the recognised text.

    :param fileUrl: path of the image file; its bytes are sent to
        ``AipOcr.basicGeneral``.
    :return: the ``words`` of every entry in the response's ``words_result``
        concatenated with spaces removed, or an empty string when the file
        does not exist or the response carries no ``words_result``.
    """
    if not os.path.exists(fileUrl):
        return ""
    # Credentials for the Baidu AIP client (placeholder values).
    APP_ID = 'xxxxx'
    API_KEY = 'xxxxxxxxxxxxxxxx'
    SECRET_KEY = 'xxxxxxxxxxxxxxxxxxxxxxx'
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    with open(fileUrl, 'rb') as f:
        img = f.read()
    msg = client.basicGeneral(img)
    # A response without 'words_result' (presumably an API error payload --
    # confirm against the Baidu OCR docs) yields no text instead of raising
    # TypeError while iterating None.
    results = msg.get('words_result') or []
    return ''.join((item.get('words') or '').replace(' ', '') for item in results)

.pptx

def _ocrPptxImage(image):
    """Write an embedded picture to a temporary file and OCR it with readImage."""
    import tempfile

    # A private temp file avoids overwriting (and then deleting) an unrelated
    # file in the working directory that shares the picture's filename.
    suffix = os.path.splitext(image.filename or "")[1]
    fd, temp_path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(image.blob)
        return readImage(temp_path)
    finally:
        os.remove(temp_path)


def readPptx(fileUrl, extend_table=False, extend_image=False):
    """Read a .pptx file and return its text with all whitespace removed.

    Shapes with a text frame always contribute their text. Shapes without
    one are only inspected when the matching option is enabled.

    :param fileUrl: path of the .pptx file.
    :param extend_table: when true, also include the text of table cells.
    :param extend_image: when true, also OCR embedded pictures via
        ``readImage``.
    :return: the collected text without whitespace, or an empty string when
        the file does not exist.
    """
    if not os.path.exists(fileUrl):
        return ""
    parts = []
    for slide in Presentation(fileUrl).slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                parts.append(shape.text)
                continue
            # Picture text (OCR).
            if extend_image and hasattr(shape, 'image'):
                parts.append(_ocrPptxImage(shape.image))
            # Table cell text.
            if extend_table and shape.has_table:
                parts.extend(
                    cell.text for row in shape.table.rows for cell in row.cells
                )
    return "".join("".join(parts).split())

.ppt

def readPpt(fileUrl, extend_table=False, extend_image=False):
    """Read a legacy .ppt file by converting it to .pptx with local PowerPoint.

    Requires pypiwin32 and an installed PowerPoint application: the
    presentation is opened through COM, saved next to the original as
    ``<path>.pptx``, read with ``readPptx`` and the temporary file is removed
    again.

    :param fileUrl: path of the .ppt file.
    :param extend_table: forwarded to ``readPptx``.
    :param extend_image: forwarded to ``readPptx``.
    :return: the text returned by ``readPptx`` for the converted file, or an
        empty string when the file does not exist.
    """
    if not os.path.exists(fileUrl):
        return ""
    absolute_path = os.path.abspath(fileUrl)
    # Use the absolute path for saving, reading and deleting so that all three
    # steps are guaranteed to address the same temporary file.
    temp_pptx = absolute_path + ".pptx"
    powerpoint = wc.Dispatch('PowerPoint.Application')
    try:
        ppt = powerpoint.Presentations.Open(absolute_path)
        ppt.SaveAs(temp_pptx)
    finally:
        # Always shut PowerPoint down, even when opening or saving failed.
        powerpoint.Quit()
    try:
        return readPptx(temp_pptx, extend_table, extend_image)
    finally:
        # Remove the temporary file even if reading it raised.
        if os.path.exists(temp_pptx):
            os.remove(temp_pptx)

.pdf

def readPdf(fileUrl):
    """Read a PDF by rendering each page to JPEG and running OCR on it.

    The PDF is loaded with wand at resolution 300 and converted to JPEG; each
    entry of the resulting image sequence is written to ``<fileUrl>.jpg``,
    passed to ``readImage`` and the temporary image is removed again.

    :param fileUrl: path of the PDF file.
    :return: the OCR text of all pages concatenated in order, or an empty
        string when the file does not exist.
    """
    if not os.path.exists(fileUrl):
        return ""
    # Render every page to a JPEG blob first; the context managers release
    # the wand image resources as soon as the blobs have been produced.
    page_blobs = []
    with Image(filename=fileUrl, resolution=300) as image_pdf:
        with image_pdf.convert('jpg') as image_jpeg:
            for page in image_jpeg.sequence:
                with Image(image=page) as img_page:
                    page_blobs.append(img_page.make_blob('jpg'))
    temp_jpg = fileUrl + '.jpg'
    parts = []
    for blob in page_blobs:
        with open(temp_jpg, 'wb') as f:
            f.write(blob)
        try:
            parts.append(readImage(temp_jpg))
        finally:
            # Remove the temporary image even if OCR raised.
            os.remove(temp_jpg)
    return ''.join(parts)

python怎么读文件内容-Python 文件内容读取相关推荐

[Python] 先读后覆盖写文件
Python先读后覆盖写文件只打开一次文件,先把文件内容读取出来,处理之后形成新的数据,重新写入新的数据,完成先读取后覆盖写文件. 比如有一个文件内容如下,需要将内容全转为大写然后覆盖: Now Y ...
python怎么读xlsx_用python读取xlsx文件
一准备工作: 二 xlrd库读取首先安装xlrd库,安装方法: pip install xlrd import xlrd #打开excel wb = xlrd.open_workbook('tes ...
python怎么读write_Python如何读写文件？python写入文件读写操作详解
Python文件的打开或创建可以使用函数open().该函数可以指定处理模式,设置打开的文件为只读.只写或可读写状态.open()的声明如下所示. open(file, mode='r', buffe ...
python怎么读excelsheet_python3 excle(python怎么读写excel文件)
python读取已经打开的3个word和excle文件的路径用 win32com 操控 word和Excel就可以实现咋样把python写入excle中 # 需安装 xlrd-0.9.2 和 xl ...
python下读sougou中文语料文件
下载的sougou中文语料文件是xml格式的,有1.5G,需要在python下进行正文的提取及中文分词工作. 1. 首先,进行正文的提取,有几个需要注意的细节. a. 检测文件的中文编码在pytho ...
python 每次读一行-转载 python每次读入文件一行的问题(血的教训啊)
注意到Python每次读入一个文件的一行时,可以有两种写法: f = open("bigFile.txt","r") while True: line = f. ...
python未读邮件_urllib2模块学习--爬虫读取163邮箱未读邮件
在编写爬虫之前,先介绍个工具httpfox,是firefox的一个插件扩展,它是编写爬虫的神器啊. HttpFox 监控和分析浏览器和网络服务器之间收发的所有HTTP流量. 每个请求可得到的信息包括: ...
python怎么读文件里的某一行-Python如何获取文件指定行的内容
linecache, 可以用它方便地获取某一文件某一行的内容.而且它也被 traceback 模块用来获取相关源码信息来展示. 用法很简单: >>> import linecache ...
python调用什么函数实现对文件内容的读取_如何使用python语言中的方法对文件进行读写操作...
在我们使用python语言中的文件时,可以使用open()方法打开文件,close()方法关闭文件,read()方法读取文件内容,write()方法写入内容到文件中.下面利用几个实例说明文件读写方法, ...

python怎么读文件内容-Python 文件内容读取

python怎么读文件内容-Python 文件内容读取相关推荐

最新文章

热门文章