Python3.5 ‘wb’与‘w’区别以及写入excel的常见错误

Python3.5 ‘wb’与‘w’区别以及写入excel的常见错误

望共同进步

转载请注明地址：http://blog.csdn.net/weixin_39701039/article/details/79576549

"r" 以读方式打开，只能读文件，如果文件不存在，会发生异常

"w" 以写方式打开，只能写文件，如果文件不存在，创建该文件；如果文件已存在，先清空，再打开文件
"rb" 以二进制读方式打开，只能读文件，如果文件不存在，会发生异常

"wb" 以二进制写方式打开，只能写文件，如果文件不存在，创建该文件；如果文件已存在，先清空，再打开文件

这里结合前面写的 Python3.5 爬虫之由浅入深（三、html转excel）来看看'w'和'wb'的区别，以及延伸地说说爬取文件成伪excel时遇到的问题；

一：UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position 13785: illegal multibyte sequence

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }


def get_Soup(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request sends the module-level ``headers`` with a 120-second
    timeout, and the response text is decoded as UTF-8 before parsing
    with the ``html.parser`` backend.
    """
    response = requests.get(url,headers = headers,timeout = 120)
    response.encoding = 'utf-8'
    res = response.text
    soup = BeautifulSoup(res,'html.parser')
    return soup


soup = get_Soup(url)
#print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table',style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)     # convert the Tag to a string
#print(result1)

tbody = soup('tbody')[0]   # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)     # convert the Tag to a string

# Deliberately opened in text mode WITHOUT encoding=: this is the case that
# produces the UnicodeEncodeError ('gbk' codec can't encode character '\xa0')
# shown in this section when the platform default encoding is GBK.
with open(r'G:\任务20180312\test\html_excel/test1.xls','w') as f1:
    f1.write(result1)

#结果：

因为如果在Windows下运行，以'w'（文本模式）打开文件且不指定encoding时，Python会使用系统默认编码来写入；中文Windows的默认codepage是CP936，即GBK的编码（在cmd中print Unicode字符时也是同样的道理），所以python解释器需要先将上述的Unicode字符编码为GBK，然后再写入文件（或在cmd中显示出来）。但是由于该Unicode字符串中包含一些GBK中无法表示的字符（例如'\xa0'），导致此时提示"'gbk' codec can't encode"的错误。

这个时候我们可以在with open(..,)括号里加入编码方式，'utf-8',如下代码：

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }


def get_Soup(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request sends the module-level ``headers`` with a 120-second
    timeout, and the response text is decoded as UTF-8 before parsing
    with the ``html.parser`` backend.
    """
    response = requests.get(url,headers = headers,timeout = 120)
    response.encoding = 'utf-8'
    res = response.text
    soup = BeautifulSoup(res,'html.parser')
    return soup


soup = get_Soup(url)
#print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table',style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)     # convert the Tag to a string
#print(result1)

tbody = soup('tbody')[0]   # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)     # convert the Tag to a string


# Text mode with an explicit encoding, so the write does not depend on the
# platform default encoding (GBK in the failing case shown earlier).
with open(r'G:\任务20180312\test\html_excel/test1.xls','w',encoding='utf-8') as f1:
    f1.write(result1)

#结果：

二：ValueError: binary mode doesn't take an encoding argument

如下代码：

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }


def get_Soup(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request sends the module-level ``headers`` with a 120-second
    timeout, and the response text is decoded as UTF-8 before parsing
    with the ``html.parser`` backend.
    """
    response = requests.get(url,headers = headers,timeout = 120)
    response.encoding = 'utf-8'
    res = response.text
    soup = BeautifulSoup(res,'html.parser')
    return soup


soup = get_Soup(url)
#print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table',style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)     # convert the Tag to a string
#print(result1)

tbody = soup('tbody')[0]   # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)     # convert the Tag to a string


# Deliberately wrong: passing encoding= together with binary mode 'wb' is the
# call that raises "ValueError: binary mode doesn't take an encoding argument",
# which is the error this section demonstrates.
with open(r'G:\任务20180312\test\html_excel/test1.xls','wb',encoding='utf-8') as f1:
    f1.write(result1)

#结果：

这里的ValueError是因为'wb'（二进制模式）不接受encoding参数；另外，即使去掉encoding参数，因为'wb'是以二进制写入文件，而result1是字符串（str），f1.write(result1)也会报错（TypeError），写入文件为0kb，即没有结果

这里可以把result1转变为字节串 bytes(result1, encoding='utf-8')（在Python3中把str转为bytes必须指定编码方式）

如下：

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }


def get_Soup(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request sends the module-level ``headers`` with a 120-second
    timeout, and the response text is decoded as UTF-8 before parsing
    with the ``html.parser`` backend.
    """
    response = requests.get(url,headers = headers,timeout = 120)
    response.encoding = 'utf-8'
    res = response.text
    soup = BeautifulSoup(res,'html.parser')
    return soup


soup = get_Soup(url)
#print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table',style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)     # convert the Tag to a string
#print(result1)

tbody = soup('tbody')[0]   # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)     # convert the Tag to a string



# Binary mode takes no encoding= argument, so the str is encoded to UTF-8
# bytes explicitly before being written.
with open(r'G:\任务20180312\test\html_excel/test1.xls','wb') as f1:
    f1.write(bytes(result1,encoding='utf-8'))

#结果：

注意，这里with open(..)括号里没有encoding=部分了，因为二进制模式不能再指定编码方式了，不然会报错ValueError: binary mode doesn't take an encoding argument

bytes(result1,encoding='utf-8')这里是因为转字符串为二进制需要编码方式

三：得到的文件不是我们想要的表格形式，而是一堆字符串

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }


def get_Soup(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request sends the module-level ``headers`` with a 120-second
    timeout, and the response text is decoded as UTF-8 before parsing
    with the ``html.parser`` backend.
    """
    response = requests.get(url,headers = headers,timeout = 120)
    response.encoding = 'utf-8'
    res = response.text
    soup = BeautifulSoup(res,'html.parser')
    return soup


soup = get_Soup(url)
#print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table',style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)     # convert the Tag to a string
#print(result1)

tbody = soup('tbody')[0]   # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)     # convert the Tag to a string

# Write both fragments so the two output files can be compared:
# test1.xls gets the matched <table> markup, test2.xls the first <tbody>.
with open(r'G:\任务20180312\test\html_excel/test1.xls','w',encoding='utf-8') as f1:
    f1.write(result1)
with open(r'G:\任务20180312\test\html_excel/test2.xls', 'w',encoding='utf-8') as f2:
    f2.write(result2)

#结果：

那现在我们发现区别在于result1和result2

右键网页打开源代码，来查看区别：

区别在于result1比result2少了些代码（因为我们存入的文件形式为伪excel，所以这个是有关系的），现在用html工具（这里我用的editplus）分别将这两部分代码以浏览器形式打开：

PS:所以我们要把带样式的代码也抓取下来，建议可以看看html5和css，了解一下

望有所帮助，望采纳！！