2. str 与 bytes 之间的类型转换

str 与 bytes 之间的类型转换如下:

str ⇒ bytes:bytes(s, encoding='utf8')

bytes ⇒ str:str(b, encoding='utf-8')


str 编码成 bytes 格式:str.encode(s)

bytes 格式编码成 str 类型:bytes.decode(b)

3. strings 分别在 Python2、Python 3下

What is tensorflow.compat.as_str()?

Python 2 将 strings 处理为原生的 bytes 类型,而不是 unicode,

Python 3 所有的 strings 均是 unicode 类型。

1, BefaultSoup 转码逻辑

代码位置 python2.7/site-packages/bs4/dammit.py


def encodings(self):

"""Yield a number of encodings that might work for this markup."""

tried = set()

for e in self.override_encodings:

if self._usable(e, tried):

yield e

# Did the document originally start with a byte-order mark

# that indicated its encoding?

if self._usable(self.sniffed_encoding, tried):

yield self.sniffed_encoding

# Look within the document for an XML or HTML encoding

# declaration.

if self.declared_encoding is None:

self.declared_encoding = self.find_declared_encoding(

self.markup, self.is_html)

if self._usable(self.declared_encoding, tried):

yield self.declared_encoding

# Use third-party character set detection to guess at the

# encoding.

if self.chardet_encoding is None:

self.chardet_encoding = chardet_dammit(self.markup)

if self._usable(self.chardet_encoding, tried):

yield self.chardet_encoding

# As a last-ditch effort, try utf-8 and windows-1252.

for e in ('utf-8', 'windows-1252'):

if self._usable(e, tried):

yield e

解释: 这段代码包含了几个编码测试函数流程, 优先级如下:

1, self.override_encodings 用户定义的编码

2, self.sniffed_encoding

self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)



def strip_byte_order_mark(cls, data):

"""If a byte-order mark is present, strip it and return the encoding it implies."""

encoding = None

if isinstance(data, unicode):

# Unicode data cannot have a byte-order mark.

return data, encoding

if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \

and (data[2:4] != '\x00\x00'):

encoding = 'utf-16be'

data = data[2:]

elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \

and (data[2:4] != '\x00\x00'):

encoding = 'utf-16le'

data = data[2:]

elif data[:3] == b'\xef\xbb\xbf':

encoding = 'utf-8'

data = data[3:]

elif data[:4] == b'\x00\x00\xfe\xff':

encoding = 'utf-32be'

data = data[4:]

elif data[:4] == b'\xff\xfe\x00\x00':

encoding = 'utf-32le'

data = data[4:]

return data, encoding

3, self.declared_encoding

self.declared_encoding = self.find_declared_encoding(

self.markup, self.is_html)



xml_encoding_re = re.compile(

'^'.encode(), re.I)

html_meta_re = re.compile(

']+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)


def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):

"""Given a document, tries to find its declared encoding.

An XML encoding is declared at the beginning of the document.

An HTML encoding is declared in a tag, hopefully near the

beginning of the document.


if search_entire_document:

xml_endpos = html_endpos = len(markup)


xml_endpos = 1024

html_endpos = max(2048, int(len(markup) * 0.05))

declared_encoding = None

declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)

if not declared_encoding_match and is_html:

declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)

if declared_encoding_match is not None:

declared_encoding = declared_encoding_match.groups()[0].decode(

'ascii', 'replace')

if declared_encoding:

return declared_encoding.lower()

return None

self.chardet_encoding = chardet_dammit(self.markup)

很明显, 这个是根据chardet包来判断, chardet根据正文的编码匹配来统计, 会有个confidence的辅助判断

import chardet

def chardet_dammit(s):

return chardet.detect(s)['encoding']

2,Request 转码逻辑

response = requests.get(url, verify=False, headers=configSpider.get_head())

requests 提供了两个编码识别结果


位置: python2.7/site-packages/requests/adapters.py


response.encoding = get_encoding_from_headers(response.headers)




def get_encoding_from_headers(headers):

"""Returns encodings from given HTTP Header Dict.

:param headers: dictionary to extract encoding from.

:rtype: str


content_type = headers.get('content-type')

if not content_type:

return None

content_type, params = cgi.parse_header(content_type)

if 'charset' in params:

return params['charset'].strip("'\"")

if 'text' in content_type:

return 'ISO-8859-1'




def parse_header(line):

"""Parse a Content-type like header.

Return the main content-type and a dictionary of options.


parts = _parseparam(';' + line)

key = parts.next()

pdict = {}

for p in parts:

i = p.find('=')

if i >= 0:

name = p[:i].strip().lower()

value = p[i+1:].strip()

if len(value) >= 2 and value[0] == value[-1] == '"':

value = value[1:-1]

value = value.replace('\\\\', '\\').replace('\\"', '"')

pdict[name] = value

return key, pdict


这个就是取的响应头 header的声明编码,如果有charset具体的编码 则给出, 如果是text/html 则返回 'ISO-8859-1'

很多网页Response-Headers都是直接给一个content-type: text/html, 用 'ISO-8859-1'明显是乱码了


Request还有一个apparent_encoding的编码, 这个很简单也是来自于正文的chardet, 也并不能保证完全准确的

3, Request的content和text



def content(self):

"""Content of the response, in bytes."""

if self._content is False:

# Read the contents.

if self._content_consumed:

raise RuntimeError(

'The content for this response was already consumed')

if self.status_code == 0 or self.raw is None:

self._content = None


self._content = bytes().join(self.iter_content(CONTENT_CHUNK_SIZE)) or bytes()

self._content_consumed = True

# don't need to release the connection; that's been handled by urllib3

# since we exhausted the data.

return self._content


def text(self):

"""Content of the response, in unicode.

If Response.encoding is None, encoding will be guessed using


The encoding of the response content is determined based solely on HTTP

headers, following RFC 2616 to the letter. If you can take advantage of

non-HTTP knowledge to make a better guess at the encoding, you should

set ``r.encoding`` appropriately before accessing this property.


# Try charset from content-type

content = None

encoding = self.encoding

if not self.content:

return str('')

# Fallback to auto-detected encoding.

if self.encoding is None:

encoding = self.apparent_encoding

# Decode unicode from given encoding.


content = str(self.content, encoding, errors='replace')

except (LookupError, TypeError):

# A LookupError is raised if the encoding was not found which could

# indicate a misspelling or similar mistake.


# A TypeError can be raised if encoding is None


# So we try blindly encoding.

content = str(self.content, errors='replace')

return content


content是bytes 字节流格式的, 而text是将其转为str

content = str(self.content, encoding, errors='replace')

如果网页正好是utf-8格式的, 因为编码环境# -*- coding: utf-8 -*-, 所以content直接可用; 否则依然会有乱码问题

综上, 最好的解决方案是 结合源码的实现以及自身的需求来实现一套方案:

Headers 声明编码



chardet 模块检测编码

对于 调用Request包, 简单处理:

if response.encoding == 'ISO-8859-1':

response.encoding = response.apparent_encoding



from bs4.dammit import EncodingDetector

self.detector = EncodingDetector(

markup, override_encodings, is_html, exclude_encodings)

print self.detector.encoding

