python识别文字颜色_python读取word文档识别字段颜色，解析字段

Python 版本为 3.7.3，读取的文档格式为 .docx。

文中带有简单注释

如果看不懂，可以从百度网盘下载源码直接查看，按需修改后运行其中的 py 文件。

提取码：nngw

import os

import sys

import xlrd

import codecs

import collections

import json

import io

import docx

import string

from docx import Document

from docx.shared import RGBColor #这个是docx的颜色类

# Largest question-paragraph length seen so far; updated by readDocx.
maxLength = 0

# Id given to the next completed question; writeDocx resets it to 1 and
# readDocx increments it after every stored record.
id = 1

# Question records collected by readDocx and serialised by writeDocx.
convert_list = []

# Reset by writeDocx; in the visible code it is otherwise only referenced
# from commented-out lines.
type_list = []

# Directory that contains this script; input and output paths are built from it.
curPath = os.path.dirname(os.path.abspath(__file__))

# NOTE: Python only honours an encoding declaration on the first or second
# line of a file, so at this position the line below is an ordinary comment.
# coding=utf-8

#获取文档对象

# Markers that the source documents append to the text of the correct option.
_ANSWER_MARKERS = ("(Direito)", "(Correcta)", "(Right)", "(Correct)")

# str() of a run's font colour that flags the correct option (pure green).
_CORRECT_RGB = "00FF00"

# One record = 1 question paragraph followed by 4 option paragraphs.
_PARAGRAPHS_PER_QUESTION = 5


def _strip_numero_prefix(text):
    """Drop a leading "Número" label and the single character after it."""
    pos = text.find("Número")
    # Only treat it as a label when it sits at the very start (offset 0 or 1).
    if pos != -1 and pos < 2:
        return text[pos + len("Número") + 1:]
    return text


def _question_body(text):
    """Return the text after the first separator and before any "|"."""
    # Separators are tried in priority order ("-" wins over "." wins over " "),
    # not by position in the string.
    # NOTE(review): a "-" later in the wording therefore beats an earlier ".";
    # confirm this matches the numbering style of the input documents.
    for separator in ("-", ".", " "):
        pos = text.find(separator)
        if pos != -1:
            text = text[pos + len(separator):]
            break
    # With no separator at all the whole text is kept (previously a ValueError).
    return text.split("|")[0]


def _strip_answer_markers(text):
    """Cut the text at each known "correct answer" marker, in turn."""
    for marker in _ANSWER_MARKERS:
        pos = text.find(marker)
        if pos != -1:
            text = text[:pos]
    return text


def readDocx(fileName,type):
    """Parse one .docx question file and append its records to ``convert_list``.

    The first paragraph and all blank paragraphs are skipped. The remaining
    paragraphs are consumed in groups of five: a question line followed by
    four option lines. An option that contains a run coloured ``00FF00`` is
    recorded as the right answer. Each completed group is stored as a dict
    with the keys ``question``, ``subType``, ``option1``..``option4``,
    ``_id`` and, when a green run was found, ``rightIndex``. A trailing
    group with fewer than five paragraphs is not stored.

    Args:
        fileName: Path of the document relative to ``curPath``, without the
            ``.docx`` extension. Backslashes are accepted as separators on
            every platform.
        type: Category label stored under ``subType`` for every record.

    Side effects:
        Appends to the module-level ``convert_list``, advances the
        module-level ``id`` and raises ``maxLength`` to the longest question
        line seen. Prints the resolved document path.
    """
    global id
    global maxLength

    # Callers pass Windows-style relative paths; normalise the separator so
    # the same configuration also resolves on other platforms.
    xlsFile = os.path.join(curPath, fileName.replace('\\', os.sep) + '.docx')
    print("xlsFile: "+xlsFile)
    file = docx.Document(xlsFile)

    index = 0  # position inside the current group: 0 = question, 1-4 = options
    data = {}
    for lineNo, p in enumerate(file.paragraphs, start=1):
        if lineNo <= 1:  # skip the first paragraph of the document
            continue

        # Work on a local copy: assigning to p.text would rewrite the
        # paragraph's runs in the document, which is not needed for parsing.
        text = p.text
        if not text.strip():
            continue

        if index == 0:  # question line
            text = _strip_numero_prefix(text)
            maxLength = max(maxLength, len(text))
            data["question"] = _question_body(text)
            data["subType"] = type
        else:  # option line; a green run marks the correct answer
            if any(str(run.font.color.rgb) == _CORRECT_RGB for run in p.runs):
                data["rightIndex"] = index
            data["option"+str(index)] = _strip_answer_markers(text)

        index += 1
        if index >= _PARAGRAPHS_PER_QUESTION:
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id = id + 1
            data = {}

def writeDocx(fileList,name):
    """Parse every document in ``fileList`` and dump the records as JSON.

    The module-level ``id``, ``convert_list`` and ``type_list`` are reset
    first, so each call produces an independent output file whose ids start
    at 1.

    Args:
        fileList: Iterable of dicts with the keys ``"path"`` and ``"type"``,
            passed to ``readDocx`` in order.
        name: Base name of the output file; the result is written to
            ``<curPath>/topic/<name>.txt`` as UTF-8 JSON.
    """
    global id
    global convert_list
    global type_list

    id = 1
    convert_list = []
    type_list = []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    # Question bank output path.
    jsonPath = os.path.join(curPath, "topic", name + ".txt")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(jsonPath), exist_ok=True)

    with io.open(jsonPath, 'w', encoding='utf-8') as f:
        f.write(json.dumps(convert_list, ensure_ascii=False, indent=4, sort_keys=True))

def main():
    """Convert the Portuguese, Spanish and English question banks, in that order."""
    # Each job is (output name, ((document path without extension, category), ...)).
    jobs = (
        ("pt_br_topic", (
            ("pt_br_topic\\地理(葡)Respueda G .es.pt", "World"),
            ("pt_br_topic\\科学与技术(葡)", "Technology"),
            ("pt_br_topic\\历史(葡)Resupeda H.es.pt", "History"),
            ("pt_br_topic\\艺术和文学(葡)Respueda A&L.es.pt", "ArtAndLiterature"),
            ("pt_br_topic\\娱乐(葡)Respueda E.es.pt", "Fashion"),
            ("pt_br_topic\\运动(葡)Respueda D.es.pt", "Sports"),
        )),
        ("es_es_topic", (
            ("es_es_topic\\地理(西)Respueda G ", "World"),
            ("es_es_topic\\科学与技术(西)Respueda C&T", "Technology"),
            ("es_es_topic\\历史(西)Resupeda H", "History"),
            ("es_es_topic\\艺术和文学(西)Respueda A&L", "ArtAndLiterature"),
            ("es_es_topic\\娱乐(西)Respueda E", "Fashion"),
            ("es_es_topic\\运动(西)Respueda D", "Sports"),
        )),
        ("en_us_topic", (
            ("en_us_topic\\地理(英)Respueda G .es.en", "World"),
            ("en_us_topic\\科学与技术(英)", "Technology"),
            ("en_us_topic\\历史(英)Resupeda H.es.en", "History"),
            ("en_us_topic\\艺术和文学(英)Respueda A&L.es.en", "ArtAndLiterature"),
            ("en_us_topic\\娱乐(英)Respueda E.es.en", "Fashion"),
            ("en_us_topic\\运动(英)Respueda D.es.en", "Sports"),
        )),
    )

    for outputName, documents in jobs:
        sources = []
        for docPath, category in documents:
            sources.append({"path": docPath, "type": category})
        writeDocx(sources, outputName)

# Run the conversion. There is no __main__ guard, so this also executes
# whenever the file is imported.
main()

python识别文字颜色_python读取word文档识别字段颜色，解析字段相关推荐

python识别颜色1007python识别颜色_python读取word文档识别字段颜色，解析字段
python版本3.7.3,读取的文档格式为.docx 文中带有简单注释看不懂的百度网盘下载直接查看,更改运行里面的py文件提取码:nngw import os import sys import ...
用python将word文档导入数据库_python读取word文档,插入mysql数据库的示例代码
表格内容如下: 1.实现批量导入word文档,取文档标题中的数字作为编号 2.除取上面打钩的内容需要匹配出来入库入库,其他内容全部直接入库mysql # wuyanfeng # -*- coding: ...
python怎么获取word文档的章节_python读取word文档的方法
本文实例讲述了python读取word文档的方法.分享给大家供大家参考.具体如下: 首先下载安装win32com from win32com import client as wc word = wc ...
java获取word书签表格数据_Python读取word文档里面的表格数据
更多精彩,请点击上方蓝字关注我们! 我们常见的办公数据通常可以分为结构化数据与非结构化数据,比如常见的word, ppt, excel.前两者存储的是非结构化数据,excel存储的是结构化数据.从事数 ...
java 加背景颜色_Java 给Word文档添加背景颜色
前言当我们制作好Word文档后,想要让枯燥乏味的文本显得有活力,或是想高亮显示文档中指定的段落或文字,此时我们可以通过为整个文档或某特定文字/段落添加背景色的形式来实现.本文将使用Free Spir ...
Python读取word文档识别字段颜色，解析字段！
python版本3.7.3,读取的文档格式为.docx 文中带有简单注释看不懂的百度网盘下载直接查看,更改运行里面的py文件网盘下载 Python学习交流群:1004391443 提取码:nngw ...
python读word文档计算字数_python读取word文档
周末需要做一个统计word文档字数的问题,刚开始以为很简单,因为之前做过excel表格相关的任务,所以认为利用扩展模块应该比较简单. 通过搜索,确实搜到了一个python操作word的模块,pytho ...
用python将word文档导入数据库_python读取word文档，插入mysql数据库实例
表格内容如下: 1.实现批量导入word文档,取文档标题中的数字作为编号 2.除取上面打钩的内容需要匹配出来入库入库,其他内容全部直接入库mysql # wuyanfeng # -*- coding: ...
python word导入数据库_python读取word文档，插入mysql数据库实例
表格内容如下: 1.实现批量导入word文档,取文档标题中的数字作为编号 2.除取上面打钩的内容需要匹配出来入库入库,其他内容全部直接入库mysql # wuyanfeng # -*- coding: ...

python识别文字颜色_python读取word文档识别字段颜色，解析字段

python识别文字颜色_python读取word文档识别字段颜色，解析字段相关推荐

最新文章

热门文章