使用Python实现XML文件转为Excel文件
由于项目组中原来的文件使用的XML格式作为配置,扩展性很好,但是阅读起来不是很直观,编辑起来也是不很方便,特别一些规则的二维表,个人觉得使用EXCEL文件会方便很多。所以为了方便阅读与编辑这些规则的二维表,花了一些时间写了一个Python脚本,以实现将XML文件转为Excel文件。
这里支持XML文件转为一个Sheet或者多个Sheet:
- 如果第二层所有标签都相同则会转为一个Sheet,所有第二层的标签都会作为行数据
- 如果第二层的标签有多种,则会把第二层的不同标签作为不同的Sheet,第三层的数据作为行数据
其它情况未作测试。
注意:脚本会自动根据所有同层标签计算所有字段以及所有子标签及其重复数,并作为列的扩展,如果XML的嵌套过深或者太复杂,可能会导致扩展后的列数超过EXCEL所支持的最大列数(16384列),导致失败。
附上源码以共享,当然,如果有更好的Idea进行改进欢迎留言。
#
# XML文件转换成Excel文件
#
# 版本:1.0
#
# 作者:Witton Bell
# E_Mail:witton@163.com
#
#
# 功能描述:
#
# 自动检测源文件编码是否为UTF8,如果不是,则使用GB18030进行解码,并转换成UTF8编码的文件,再进行XML源文件的读取
#
# 自动提取XML中的标签作为Excel的标题列,支持嵌套与重复,但是所有列不能超过EXCEL支持的最大列(16384列)
#
# 仅支持XML的第一层只有一个标签,
# 第二层标签可以只有一个标签(允许重复),此时第一层标签作为Excel的WorkSheet,第二层标签及其嵌套作为一行数据)
# 第二层标签也可以有多个标签(允许重复),此时第二层标签作为Excel的WorkSheet,第三层标签及其嵌套作为一行数据)
#
# 由于openpyxl默认为Excel的xlsx格式,所以本脚本的所有Excel文件格式都为xlsx
#
#
import openpyxl
from openpyxl.styles import Alignment
import xml.dom.minidom
from xml.dom.minidom import Document
from openpyxl.styles import Border, Side
import chardet
import osclass XMLNode:def __init__(self):self.name = ""self.properties = []self.child = []self.layer = 0self.index = 0self.parent = Noneself.node_info = Noneclass XMLNodeInfo:def __init__(self, node):self.map_properties = {}self.map_child = {}self.max_index = 0self.layer = node.layerself.name = node.nameself.register(node)def register(self, node):for k in node.properties:if self.map_properties.get(k[0]) is None:self.map_properties[k[0]] = self.map_properties.__len__()for ch in node.child:v = self.map_child.get(ch.name)if v is None:self.map_child[ch.name] = ch.node_infoif node.index > self.max_index:self.max_index = node.indexclass XMLReader:# is_auto_convert2utf8:是否自动转换为UTF8编码# is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件def __init__(self, file_path, is_auto_convert2utf8, is_convert_to_original_file):self.__root = []self.__map_node_info = {}if is_auto_convert2utf8:is_tmp_file, tmp_file_path = self.__convert_to_utf8(file_path)fd = xml.dom.minidom.parse(tmp_file_path)if is_tmp_file:if is_convert_to_original_file:os.remove(file_path)os.rename(tmp_file_path, file_path)else:os.remove(tmp_file_path)else:fd = xml.dom.minidom.parse(file_path)index = 0for child in fd.childNodes:if child.nodeType != xml.dom.Node.ELEMENT_NODE:continueself.__read_node(child, self.__root, index, None)index += 1def get_root_node(self):return self.__root@staticmethoddef __convert_to_utf8(file_path):fd = open(file_path, "rb")fd.seek(0, 2)size = fd.tell()if size > 1024 * 1024:size = 1024 * 1024fd.seek(0, 0)text = fd.read(size)ret = chardet.detect(text)if ret['encoding'].lower().find("utf-8") != -1:return False, file_pathtmp_file = file_path + ".tmp"file = open(tmp_file, "w", encoding="utf-8", newline="\n")fd.seek(0, 0)line = fd.readline()while line.__len__() > 0:file.write(line.decode("gb18030"))line = fd.readline()file.close()fd.close()return True, tmp_file@staticmethoddef __get_attrib(node, rc):if node._attrs is None:returnfor key in node._attrs:v = node._attrs[key]._valuerc.append([key, v])def __read_node(self, node, root, index, parent, layer=1):xml_node = XMLNode()xml_node.name = node.nodeNamexml_node.layer = layerxml_node.index = indexxml_node.parent = parentself.__get_attrib(node, xml_node.properties)i = 0for child in node.childNodes:if child.nodeType != xml.dom.Node.ELEMENT_NODE:continueself.__read_node(child, xml_node.child, i, xml_node, layer + 1)i += 1root.append(xml_node)self.__register(xml_node)def __register(self, node):key = node.name + str(node.layer)nd = self.__map_node_info.get(key)if nd is None:nd = XMLNodeInfo(node)node.node_info = ndself.__map_node_info[key] = ndelse:nd.register(node)node.node_info = ndclass XMLWriter:def __init__(self, xml_file_path, xml_node_list):doc = Document()for node in xml_node_list:ele = self.__write_node(node, doc)doc.appendChild(ele)f = open(xml_file_path, 'w', encoding='utf-8')doc.writexml(f, indent='\t', addindent='\t', encoding='utf-8', newl='\n')def __write_node(self, node, doc):ele = doc.createElement(node.name)for prop in node.properties:ele.setAttribute(prop[0], prop[1])for child in node.childs:ret = self.__write_node(child, doc)ele.appendChild(ret)return eleclass XmlToXls:# read_from_xml_file_path:XML源文件完整路径# save_to_xls_file_path:保存转换后的Excel文件完整路径# is_auto_convert2utf8:是否自动转换为UTF8编码# is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件# is_merge_head:如果表头有多行,是否合并层属关系的表头# is_border_cell:是否添加单元格的边框# is_alignment_center:单元格是否居中对齐def __init__(self, read_from_xml_file_path, save_to_xls_file_path,is_auto_convert2utf8=True, is_convert_to_original_file=False,is_merge_head=False, is_border_cell=False, is_alignment_center=True):try:self.__is_merge_head = is_merge_headif is_alignment_center:self.__alignment = Alignment(horizontal='center', vertical='center')else:self.__alignment = Noneif is_border_cell:side = Side(border_style='thin', color='000000')self.__border = Border(side, side, side, side, side)else:self.__border = Nonewb = openpyxl.Workbook()wb.encoding = 'utf-8'for sh in wb.worksheets:wb.remove_sheet(sh)reader = XMLReader(read_from_xml_file_path,is_auto_convert2utf8,is_convert_to_original_file)self.__write(reader.get_root_node(), wb)wb.save(save_to_xls_file_path)wb.close()except Exception as e:print(e)def __write(self, xml_node, wb):self.__map_field = {}for node in xml_node:if node.node_info.map_child.__len__() == 1:self.__start_layer = 1self.__write_sheet(node, wb)else:self.__start_layer = 2for child in node.child:self.__write_sheet(child, wb)def __write_sheet(self, node, wb):sh = wb.create_sheet(node.name)self.__write_head(node.node_info, sh)self.__write_data(node, sh)def __write_head(self, node, sh):for key in node.map_child:col = 1child = node.map_child[key]for k in child.map_properties:c = child.map_properties[k]self.__write_cell(sh, child.layer - self.__start_layer, c + col, k)col += child.map_properties.__len__()self.__write_head_ext(child, sh, col)breakdef __write_head_ext(self, node, sh, col):for key in node.map_child:child = node.map_child[key]num = child.map_properties.__len__()for i in range(0, child.max_index + 1):if col > 16384:raise Exception("超过EXCEL最大列数(16384列)限制,转换失败")old_col = colself.__write_cell(sh, child.layer - self.__start_layer - 1, col, child.name)for k in child.map_properties:c = child.map_properties[k]self.__write_cell(sh, child.layer - self.__start_layer, c + col, k)col += numcol = self.__write_head_ext(child, sh, col)if self.__is_merge_head:merge_row = child.layer - self.__start_layer - 1sh.merge_cells(start_row=merge_row, end_row=merge_row,start_column=old_col, end_column=col - 1)return coldef __write_data(self, node, sh):row = sh.max_row + 1sh.freeze_panes = sh.cell(row, 1)for child in node.child:col = 1self.__write_data_ext(child, sh, row, col)row += 1def __write_data_ext(self, node, sh, row, col):m = node.node_info.map_propertiesmax_prop_num = m.__len__()for prop in node.properties:c = m[prop[0]]self.__write_cell(sh, row, c + col, prop[1])col += max_prop_nummap_child = {}for child in node.child:if map_child.get(child.name) is None:map_child[child.name] = 1else:map_child[child.name] += 1col = self.__write_data_ext(child, sh, row, col)for key in node.node_info.map_child:child = node.node_info.map_child[key]all_count = child.max_index + 1count = map_child.get(key)if count is not None:all_count -= countfor i in range(0, all_count):col += child.map_properties.__len__()return coldef __write_cell(self, sh, row, col, value):if value.isdigit():cell = sh.cell(row, col, int(value))else:cell = sh.cell(row, col, value)cell.alignment = self.__alignmentcell.border = self.__border
由于使用的XML解析器不支持值中有大于(>),小于(<)这些特殊符号,所以如果值中有这些符号的XML文件会解析失败,报错:
not well-formed (invalid token)
比如下面的XML文件就会报上面的错:
<?xml version='1.0' encoding='UTF-8'?>
<test>
<testData value="<测试数据>"/>
</test>
也不支持没有根节点的XML文件:
比如:
<?xml version='1.0' encoding='UTF-8'?>
<A Value="A1"><AA value="a"/>
</A>
<B Value="B1"><BB value="b"/>
</B>
<C Value="C1"><CC value="c"/>
</C>
会报错:
junk after document element
C++使用的tinyxml是可以正常解析大于小于等特殊符号的,网上有一个pytinyxml2开源项目,让python可以使用tinyxml进行解析。
安装pytinyxml2之前需要先安装swig,Windows下可以下载:swigwin-4.0.1.zip,这是已经编译好的版本,解压出来后添加路径到PATH环境变量即可。
非Windows可以下载swig-4.0.1.tar.gz进行编译安装,也可以直接:
yum install swig
安装好swig后,在pytinyxml2源码目录中执行:
python setup.py install
不能使用
pip install pytinyxml2
进行安装,我遇到有报错:
running bdist_wheelrunning buildrunning build_pycreating buildcreating build/lib.linux-x86_64-3.5copying pytinyxml2.py -> build/lib.linux-x86_64-3.5running build_extbuilding '_pytinyxml2' extensionswigging pytinyxml2.i to pytinyxml2_wrap.cppswig -python -c++ -o pytinyxml2_wrap.cpp pytinyxml2.ipytinyxml2.i:5: Error: Unable to find 'tinyxml2.h'error: command 'swig' failed with exit status 1
查看pytinyxml2的包可以看到,里面缺少tinyxml2.h
项目中原来的程序只能读取XML格式的文档,我们为了方便编辑与查阅,所以把XML转为Excel,编辑完成后,还需要把Excel转为原来的XML,所以实现了XLS转XML,并对之前的代码作了部分修改,附上源码:
# -*- coding: UTF-8 -*-
#
# XML文件与Excel文件互转
#
# 版本:1.1
#
# 作者:Witton Bell
# E_Mail:witton@163.com
#
#
# 功能描述:
#
# XmlToXls:
#
# 自动检测源文件编码是否为UTF8,如果不是,则使用GB18030进行解码,并转换成UTF8编码的文件,再进行XML源文件的读取
#
# 自动提取XML中的标签作为Excel的标题列,支持嵌套与重复,但是所有列不能超过EXCEL支持的最大列(16384列)
#
# 使用XMLReader仅支持XML的只有一个根标签(标准XML格式),使用TinyXMLReader支持有多个根标签
# 第二层标签可以只有一个标签(允许重复),此时第一层标签作为Excel的WorkSheet,第二层标签及其嵌套作为一行数据)
# 第二层标签也可以有多个标签(允许重复),此时第二层标签作为Excel的WorkSheet,第三层标签及其嵌套作为一行数据)
#
# 由于openpyxl默认为Excel的xlsx格式,所以本脚本的所有Excel文件格式都为xlsx
#
# XlsToXml:
# 同样XMLWriter仅支持XML只有一个根标签(标准XML格式),使用TinyXMLWriter支持有多个根标签
# Excel文件需要有层次分明的表头,并且需要冻结表头
#
import openpyxl
from openpyxl.styles import Alignment
from openpyxl.styles import Border, Side
from openpyxl.comments import Comment
import openpyxl.utils as xls_util
import chardet
import os
import xml.dom.minidom
import copyclass XMLNode:def __init__(self):self.name = ""self.properties = []self.child = []self.layer = 0self.index = 0self.parent = Noneself.node_info = Noneclass XMLNodeInfo:def __init__(self, node):self.map_properties = {}self.map_child = {}self.max_index = 0self.layer = node.layerself.name = node.nameself.parent = node.parentself.register(node)def register(self, node):for k in node.properties:if self.map_properties.get(k[0]) is None:self.map_properties[k[0]] = self.map_properties.__len__()for ch in node.child:v = self.map_child.get(ch.name)if v is None:self.map_child[ch.name] = ch.node_infoif node.index > self.max_index:self.max_index = node.indexclass _XMLReaderBase:def __init__(self):self._root = []self._map_node_info = {}def __calc_node_key(self, node):key = ""if node.parent is not None:key = self.__calc_node_key(node.parent)return "%s_%s" % (key, node.name)def _register(self, node):key = self.__calc_node_key(node)nd = self._map_node_info.get(key)if nd is None:nd = XMLNodeInfo(node)node.node_info = ndself._map_node_info[key] = ndelse:nd.register(node)node.node_info = nd@staticmethoddef _convert_to_utf8(file_path):fd = open(file_path, "rb")fd.seek(0, 2)size = fd.tell()if size > 1024 * 1024:size = 1024 * 1024fd.seek(0, 0)text = fd.read(size)ret = chardet.detect(text)if ret['encoding'].lower().find("utf-8") != -1:return False, file_pathtmp_file = file_path + ".tmp"file = open(tmp_file, "w", encoding="utf-8", newline="\n")fd.seek(0, 0)line = fd.readline()while line.__len__() > 0:file.write(line.decode("gb18030"))line = fd.readline()file.close()fd.close()return True, tmp_file# 该类使用xml不能解析值中带特殊符号的文件,也不支持没有统一根节点的XML文件,建议使用TinyXMLReader
class XMLReader(_XMLReaderBase):# is_auto_convert2utf8:是否自动转换为UTF8编码# is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False):_XMLReaderBase.__init__(self)if is_auto_convert2utf8:is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path)fd = xml.dom.minidom.parse(tmp_file_path)if is_tmp_file:if is_convert_to_original_file:os.remove(file_path)os.rename(tmp_file_path, file_path)else:os.remove(tmp_file_path)else:fd = xml.dom.minidom.parse(file_path)index = 0for child in fd.childNodes:if child.nodeType != xml.dom.Node.ELEMENT_NODE:continueself.__read_node(child, self._root, index, None)index += 1return self._root@staticmethoddef __get_attrib(node, rc):if node._attrs is None:returnfor key in node._attrs:v = node._attrs[key]._valuerc.append([key, v])def __read_node(self, node, root, index, parent, layer=1):xml_node = XMLNode()xml_node.name = node.nodeNamexml_node.layer = layerxml_node.index = indexxml_node.parent = parentself.__get_attrib(node, xml_node.properties)i = 0for child in node.childNodes:if child.nodeType != xml.dom.Node.ELEMENT_NODE:continueself.__read_node(child, xml_node.child, i, xml_node, layer + 1)i += 1root.append(xml_node)self._register(xml_node)# 该类需要安装pytinyxml2,参见:https://blog.csdn.net/witton/article/details/100302498
class TinyXMLReader(_XMLReaderBase):# is_auto_convert2utf8:是否自动转换为UTF8编码# is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False):_XMLReaderBase.__init__(self)import pytinyxml2doc = pytinyxml2.XMLDocument()if is_auto_convert2utf8:is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path)doc.LoadFile(tmp_file_path)if is_tmp_file:if is_convert_to_original_file:os.remove(file_path)os.rename(tmp_file_path, file_path)else:os.remove(tmp_file_path)else:doc.LoadFile(file_path)node = doc.RootElement()index = 0while node is not None:self.__read_node(node, self._root, index, None)node = node.NextSiblingElement()index += 1return self._root@staticmethoddef __get_attrib(node, rc):attrib = node.FirstAttribute()while attrib is not None:key = attrib.Name()v = attrib.Value()rc.append([key, v])attrib = attrib.Next()def __read_node(self, node, root, index, parent, layer=1):xml_node = XMLNode()xml_node.name = node.Value()xml_node.layer = layerxml_node.index = indexxml_node.parent = parentself.__get_attrib(node, xml_node.properties)i = 0child = node.FirstChildElement()while child is not None:self.__read_node(child, xml_node.child, i, xml_node, layer + 1)child = child.NextSiblingElement()i += 1root.append(xml_node)self._register(xml_node)class XMLWriter:def __init__(self, xml_file_path, xml_node_list):doc = xml.dom.minidom.Document()for node in xml_node_list:ele = self.__write_node(node, doc)doc.appendChild(ele)f = open(xml_file_path, 'w', encoding='utf-8')doc.writexml(f, indent='\t', addindent='\t', encoding='utf-8', newl='\n')f.close()def __write_node(self, node, doc):ele = doc.createElement(node.name)for prop in node.properties:ele.setAttribute(prop[0], prop[1])for child in node.child:ret = self.__write_node(child, doc)ele.appendChild(ret)return eleclass TinyXMLWriter:# is_convert_spec_character: 是否转换特殊字符def __init__(self, xml_file_path, xml_node_list, is_convert_spec_character=True):import pytinyxml2doc = pytinyxml2.XMLDocument(is_convert_spec_character)decl = doc.NewDeclaration()doc.LinkEndChild(decl)for node in xml_node_list:ele = self.__write_node(node, doc)doc.LinkEndChild(ele)doc.SaveFile(xml_file_path)def __write_node(self, node, doc):ele = doc.NewElement(node.name)for prop in node.properties:ele.SetAttribute(prop[0], prop[1])for child in node.child:ret = self.__write_node(child, doc)ele.LinkEndChild(ret)return eleclass XmlToXls:def __init__(self):self.__is_merge_head = Falseself.__alignment_center = Noneself.__border = None# read_from_xml_file_path:XML源文件完整路径# save_to_xls_file_path:保存转换后的Excel文件完整路径# is_auto_convert2utf8:是否自动转换为UTF8编码# is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件# is_merge_head:如果表头有多行,是否合并层属关系的表头# is_border_cell:是否添加单元格的边框# is_use_tiny_xml:是否使用tinyXMLdef convert(self, read_from_xml_file_path, save_to_xls_file_path,is_auto_convert2utf8=True, is_convert_to_original_file=False,is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True):try:self.__is_merge_head = is_merge_headself.__alignment_center = Alignment(horizontal='center', vertical='center')if is_border_cell:side = Side(border_style='thin', color='000000')self.__border = Border(side, side, side, side, side)else:self.__border = Nonewb = openpyxl.Workbook()wb.encoding = 'utf-8'for sh in wb.worksheets:wb.remove_sheet(sh)if is_use_tiny_xml:reader = TinyXMLReader()else:reader = XMLReader()root = reader.read(read_from_xml_file_path,is_auto_convert2utf8,is_convert_to_original_file)self.__write(root, wb)wb.save(save_to_xls_file_path)wb.close()except Exception as e:print(e)# src_path_dir:XML源目录完整路径# dst_path_dir:保存转换后的Excel文件完整目录路径,如果为None或者为空,则直接转换在源文件应对目录下# is_auto_convert2utf8:是否自动转换为UTF8编码# is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件# is_merge_head:如果表头有多行,是否合并层属关系的表头# is_border_cell:是否添加单元格的边框# is_use_tiny_xml:是否使用tinyXMLdef convert_dirs(self, src_path_dir, dst_path_dir=None,is_auto_convert2utf8=True, is_convert_to_original_file=False,is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True):if dst_path_dir is not None and dst_path_dir != "":if not os.path.exists(dst_path_dir):os.mkdir(dst_path_dir)for root, dirs, files in os.walk(src_path_dir):for name in files:basename, ext = os.path.splitext(name)if ext != ".xml":continuesrc = os.path.join(root, name)target = basename + ".xlsx"print("处理%s" % src)if dst_path_dir is None or dst_path_dir == "":dst = os.path.join(root, target)else:dst = os.path.join(dst_path_dir, target)self.convert(src, dst, is_auto_convert2utf8, is_convert_to_original_file,is_merge_head, is_border_cell, is_use_tiny_xml)def __write(self, xml_node, wb):self.__map_field = {}for node in xml_node:if node.node_info.map_child.__len__() == 1:self.__is_multi_sheet = Falseself.__write_sheet(node, wb)else:self.__is_multi_sheet = Truefor child in node.child:self.__write_sheet(child, wb)def __write_sheet(self, node, wb):sh = wb.create_sheet(node.name)self.__write_head(node, sh)self.__write_data(node, sh)def __write_head(self, node, sh):node_info = node.node_infoif self.__is_multi_sheet:self.__write_head_cell(sh, node_info.parent.layer, 1, node_info.parent.name)self.__write_head_cell(sh, node_info.layer, 1, node_info.name)comment_str = ""for prop in node.properties:comment_str += '%s="%s"\n' % (prop[0], prop[1])if comment_str != "":sh.cell(node_info.layer, 1).comment = Comment(comment_str, "", width=300)elif not self.__is_multi_sheet:self.__write_head_cell(sh, node_info.layer, 1, node_info.name)child_name = Nonefor key in node_info.map_child:col = 1child = node_info.map_child[key]child_name = child.namefor k in child.map_properties:c = child.map_properties[k]self.__write_head_cell(sh, child.layer, c + col, k)col += child.map_properties.__len__()self.__write_head_ext(child, sh, col)breakif self.__is_multi_sheet:row = 3else:row = 2if child_name is not None:sh.insert_rows(row)self.__write_head_cell(sh, row, 1, child_name)def __write_head_ext(self, node, sh, col):for key in node.map_child:child = node.map_child[key]num = child.map_properties.__len__()for i in range(0, child.max_index + 1):if col > 16384:raise Exception("超过EXCEL最大列数(16384列)限制,转换失败")old_col = colself.__write_head_cell(sh, child.layer - 1, col, child.name)for k in child.map_properties:c = child.map_properties[k]self.__write_head_cell(sh, child.layer, c + col, k)col += numcol = self.__write_head_ext(child, sh, col)if self.__is_merge_head:merge_row = child.layer - 1sh.merge_cells(start_row=merge_row, end_row=merge_row,start_column=old_col, end_column=col - 1)return coldef __write_data(self, node, sh):row = sh.max_row + 1sh.freeze_panes = sh.cell(row, 1)for child in node.child:col = 1self.__write_data_ext(child, sh, row, col)row += 1def __write_data_ext(self, node, sh, row, col):m = node.node_info.map_propertiesmax_prop_num = m.__len__()for prop in node.properties:c = m[prop[0]]self.__write_cell(sh, row, c + col, prop[1])col += max_prop_nummap_child = {}for child in node.child:if map_child.get(child.name) is None:map_child[child.name] = 1else:map_child[child.name] += 1col = self.__write_data_ext(child, sh, row, col)for key in node.node_info.map_child:child = node.node_info.map_child[key]all_count = child.max_index + 1count = map_child.get(key)if count is not None:all_count -= countfor i in range(0, all_count):col += child.map_properties.__len__()return coldef __write_head_cell(self, sh, row, col, value):cell = sh.cell(row, col, value)cell.border = self.__bordercell.alignment = self.__alignment_centerdef __write_cell(self, sh, row, col, value):if value.isdigit():cell = sh.cell(row, col, int(value))else:cell = sh.cell(row, col, value)cell.border = self.__borderclass XlsToXml:def __init__(self):pass@staticmethoddef __add_prop(map_field, col, value):if map_field.__len__() == 0:returnif map_field.get(col) is None:# 找本节点c = col - 1while c >= 1:if map_field.get(c) is not None:node = map_field[c][0]breakc -= 1else:node = map_field[col][0]node.properties.append([value, col])@staticmethoddef __add_node(map_field, row, col, value):node = XMLNode()node.name = valueif map_field.get(col) is not None:node.parent = map_field[col][0]else:# 找父节点c = col - 1while c >= 1:if map_field.get(c) is not None:if row > map_field[c][1]:node.parent = map_field[c][0]breakc -= 1if node.parent is not None:node.parent.child.append(node)node.layer = node.parent.layer + 1else:node.layer = 1map_field[col] = [node, row]return nodedef __read_xls(self, file_path):wb = openpyxl.load_workbook(file_path)root_data = XMLNode()is_multi_sheet = wb.worksheets.__len__() > 1for sh in wb.worksheets:max_row = sh.max_row + 1max_col = sh.max_column + 1if sh.freeze_panes is None:raise Exception("文件[%s]表单[%s]的无冻结窗口,无法确定表头,转为XML失败" %(os.path.basename(file_path), sh.title))head_row, head_col = xls_util.coordinate_to_tuple(sh.freeze_panes)if head_col != 1:raise Exception("文件[%s]表单[%s]的冻结窗口列不为1,无法转为XML" % (os.path.basename(file_path), sh.title))root = Nonemap_field = {}for col in range(1, max_col):for row in range(1, head_row):cell = sh.cell(row, col)value = cell.valuecomment = cell.commentif value is None:continuenext_row = row + 1if next_row >= head_row:self.__add_prop(map_field, col, value)continuenext_row_value = sh.cell(next_row, col).valueif next_row_value is None:self.__add_prop(map_field, col, value)continuenode = self.__add_node(map_field, row, col, value)if root is None:root = nodeif comment is None:continuecomment = comment.textlines = comment.splitlines()for line in lines:props = line.split('=')kv = []for prop in props:prop = prop.replace('"', '')kv.append(prop)if kv.__len__() == 2:node.properties.append(kv)kv = []root_data.name = root.nameroot_data.layer = root.layerif is_multi_sheet and root.child.__len__() > 0:child_list = copy.deepcopy(root.child[0].child)root.child[0].child = []root_data.child.append(root.child[0])root_data_child = root.child[0].childelse:child_list = copy.deepcopy(root.child)root_data_child = root_data.childfor row in range(head_row, max_row):clone = copy.deepcopy(child_list)for child in clone:self.__read_node_data(child, sh, row)root_data_child.append(child)return root_datadef __read_node_data(self, node, sh, row):prop_list = []for prop in node.properties:col = prop[1]value = sh.cell(row, col).valueif value is not None:prop_list.append([prop[0], value])child_list = []for child in node.child:self.__read_node_data(child, sh, row)if child.properties.__len__() > 0 or child.child.__len__() > 0:child_list.append(child)copy.copy(child)node.properties = prop_listnode.child = child_listdef convert(self, src_file_path, dst_file_path, is_use_tiny_xml=True, is_convert_spec_character=False):root = self.__read_xls(src_file_path)if is_use_tiny_xml:TinyXMLWriter(dst_file_path, [root], is_convert_spec_character)else:XMLWriter(dst_file_path, [root])def convert_dirs(self, src_path_dir, dst_path_dir=None, is_use_tiny_xml=True, is_convert_spec_character=False):for root, dirs, files in os.walk(src_path_dir):for name in files:basename, ext = os.path.splitext(name)if ext != ".xls" and ext != ".xlsx":continuesrc = os.path.join(root, name)target = basename + "1" + ".xml"print("处理%s" % src)if dst_path_dir is None or dst_path_dir == "":dst = os.path.join(root, target)else:dst = os.path.join(dst_path_dir, target)try:self.convert(src, dst, is_use_tiny_xml, is_convert_spec_character)except Exception as e:print(e)
有人在问如何使用,给一个XML转Excel的示例,假如有一个test.xml文件:
<?xml version="1.0" encoding="UTF-8"?>
<Test><Data Type="1001" Desc = "这是描述"><Condition Value="10" Name="名字1"><AwardItem ItemId = "5" SubId = "10" Count = "10" /></Condition><Condition Value="20" Name="名字2"><AwardItem ItemId = "5" SubId = "100" Count = "50" /></Condition><Condition Value="30" Name="名字3"><AwardItem ItemId = "5" SubId = "1000" Count = "100" /></Condition></Data><Data Type="1002" Desc = "这是描述"><Condition Value="100" Name="名字10"><AwardItem ItemId = "5" SubId = "10" Count = "10" /></Condition><Condition Value="200" Name="名字20"><AwardItem ItemId = "5" SubId = "100" Count = "50" /></Condition><Condition Value="300" Name="名字30"><AwardItem ItemId = "5" SubId = "1000" Count = "100" /></Condition><Condition Value="400" Name="名字40"><AwardItem ItemId = "5" SubId = "5000" Count = "200" /></Condition></Data>
</Test>
我们写一个XML转XLSX的调用程序test.py
import sys
import xml2xls
import tracebackdef main():if sys.argv.__len__() < 3:print("格式错误,格式:<命令> <XML源文件> <XLSX目标文件>")returnc = xml2xls.XmlToXls()try:c.convert(sys.argv[1], sys.argv[2])print("处理文件%s成功" % sys.argv[1])except Exception as e:print("处理文件%s失败, 异常:%s" % (sys.argv[1], e))print(traceback.format_exc())if __name__ == '__main__':main()
在命令行执行:
python test.py test.xml test.xlsx
就会生成一个test.xlsx的Excel文件,使用Excel打开,结果如图:
从前面的XML可以看到,第一行只有3组Condition,第二行有4组Condition,所以最后按最大值4组来展示,第一行没第4组Condition,则为空。
同时,转换后的Excel表头是锁定状态,并且表头是有层次结构的,对应XML的层次。同样,如果要将一个Excel表转换成XML,也需要有锁定状态的且有层次结构的表头。
祝好!
使用Python实现XML文件转为Excel文件相关推荐
- 批量将txt文件转为excel文件
python萌新,记录一下,自己做项目时遇到需批量将txt文件转为excel文件. 代码如下: `# coding:utf-8 import pandas as pd import os def ge ...
- tsv文件转为Excel文件
目录 需求 思路 实现 需求 现在有多个tsv文件,需要将其都转换为excel文件 思路 实现将一个文件转换成功.将tsv文件按行读取,然后通过xlsxwriter写入新创建的xlsx文件中 实现批量 ...
- vue下载二进制流文件转为Excel文件
let blob = new Blob([data], { type: 'application/octet-stream' });// 转化为blob对象 let filename = file ...
- 使用FastReport-v5.3.17把fp3文件转为excel文件
软件下载地址: http://pan.baidu.com/s/1bpJVl1t 1. 打开FastReport-v5.3.17.zip压缩文件,双击里面的fr5_demo.exe文件进行安装.一直一下 ...
- json文件转为Excel文件
json编码为ANSI,BOM字符报错 import json import pandas as pd data = [] with open ('json.json','r',encoding = ...
- python转csv_python脚本如何将Excel文件转为csv文件(代码)
本篇文章给大家带来的内容是关于python脚本如何将Excel文件转为csv文件(代码),有一定的参考价值,有需要的朋友可以参考一下,希望对你有所帮助.#!/usr/bin/env python __ ...
- python csv转excel_将Excel文件转为csv文件的python脚本
将Excel文件转为csv文件的python脚本 #!/usr/bin/env python __author__ = "lrtao2010" ''' Excel文件转csv文件脚 ...
- python将txt文件转为excel格式以及写入excel超过65536行报错问题解决方法
参考链接: https://blog.csdn.net/levy_cui/article/details/82252183 https://blog.csdn.net/levy_cui/article ...
- python怎么将txt转为excel_使用matlab或python将txt文件转为excel表格
假设txt文件为: 一.matlab代码 data=importdata('data.txt'); xlswrite('data.xls',data); 二.python代码 利用pandas的Dat ...
- python将CSV文件转为excel
前言 有时候为了省去编代码的麻烦,可以将csv文件转化为excel表格,以实现用excel处理文件的目的. 一.将file.csv 转为file.xlsx 示例:pandas 是基于NumPy 的一种 ...
最新文章
- 【learning】矩阵树定理
- Cocos2d之Texture2D类详解之将文件加载成Texture2D对象
- 【LINUX】——linux如何使用Python创建一个web服务
- MySQL系列:innodb源代码分析之线程并发同步机制
- Laravel 校验规则之字段值唯一性校验
- 计算机一级应用于段落还是文字,计算机一级复习资料
- 别了 Java EE!开源组织将其更名为Jakarta
- webGIS的粗浅认识(一)
- C盘空间不足,清理AppData
- 天牛须算法(BAS)python实现
- 投影仪的对比度多少合适,当贝X3高亮对比度和电视基本一样
- 日期转换 NSDate 格式 格林尼治时间 时差
- 【测试开发】Python—logging日志封装
- 一点资讯app signature
- 安卓服务器维护,安卓服务器6月14日2时维护公告
- 阿里云域名备案问题 注销主体步骤
- linux 打开大文件命令,linux下大文件的读取
- libcurl Multi异步接口使用
- 程序人生:初学者中最最最常问的问题都有哪些呢???
- 网络地址转换--静态NAT
热门文章
- 蒲公英联机平台的服务器虚拟IP,蒲公英客户端如何使用固定虚拟IP管理虚拟局域网的步骤是什么?...
- Python实现CarMaker TestRun的泛化
- 网络测试工具iperf使用教程
- Navicat for mysql破解版安装步骤
- CCS安装多版本编译器 Compiler version__更新手动下载、安装方法
- 1月10日云栖精选夜读:专访金榕:四年蜕变,阿里iDST是如何登上浪潮之巅的?
- 中国游戏发展史V-02
- 抖音做综艺,差点意思
- 小天使幼儿园管理系统 免费
- C++——WebServer服务器项目