Chapter 3  Processing Raw Text

1 Accessing Text from the Web and from Disk

# The English translation of "Crime and Punishment" (not tested)

from urllib.request import urlopen

url = 'http://www.gutenberg.org/files/2554/2554.txt'

raw = urlopen(url).read().decode('utf-8')  # decode the bytes to a string (Python 3)

type(raw)

len(raw)

raw[:75]

# Tokenization (not tested)

import nltk

tokens = nltk.word_tokenize(raw)

type(tokens)

len(tokens)

tokens[:10]

# Slicing

text = nltk.Text(tokens)

type(text)

text[1020:1060]

text.collocations()

# Manually picking out the header and footer metadata in the text

raw.find('PART I')  # get the index of the string

raw.rfind("End of Project Gutenberg's Crime")

raw = raw[5303:1157681]

raw.find('PART I')

Dealing with HTML (not tested)

url = 'http://news.bbc.co.uk/2/hi/health/2284783.stm'

html = urlopen(url).read().decode('utf-8')

html[:60]

# Tokenizing the HTML

raw = nltk.clean_html(html)

tokens = nltk.word_tokenize(raw)

tokens

# Keeping only the tokens of interest

tokens = tokens[96:399]

text = nltk.Text(tokens)

text.concordance('gene')
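Note: nltk.clean_html() was removed in NLTK 3 and now just raises an error directing you to BeautifulSoup. A minimal sketch of the same pipeline using BeautifulSoup instead (assumes the beautifulsoup4 package is installed):

from urllib.request import urlopen
from bs4 import BeautifulSoup
import nltk

html = urlopen('http://news.bbc.co.uk/2/hi/health/2284783.stm').read().decode('utf-8')
raw = BeautifulSoup(html, 'html.parser').get_text()   # strip the markup, keep the text
tokens = nltk.word_tokenize(raw)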

Processing search engine results

Processing RSS feeds (not tested)

import feedparser

llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")

llog['feed']['title']

len(llog.entries)

post = llog.entries[2]

post.title

content = post.content[0].value

content[0:70]

nltk.word_tokenize(nltk.clean_html(content))

nltk.word_tokenize(nltk.clean_html(llog.entries[2].content[0].value))
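A sketch that tokenizes every post in the feed, under the same assumptions (feedparser installed; BeautifulSoup used in place of the removed clean_html):

import feedparser
import nltk
from bs4 import BeautifulSoup

llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
for post in llog.entries:
    content = post.content[0].value
    text = BeautifulSoup(content, 'html.parser').get_text()   # strip HTML markup
    print(post.title, len(nltk.word_tokenize(text)))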

Reading local files

# An error: the file is not found

>>> f=open('document.txt')

Traceback (most recent call last):

File "<input>", line 1, in <module>

FileNotFoundError: [Errno 2] No such file or directory: 'document.txt'

# Check the current directory, then add a document.txt file there

>>> import os

>>> os.listdir('.')

['.idea', 'One', 'Two']

# Reopen the file and read its contents

>>> f=open('document.txt')

>>> f.read()

'this is my time\nTime flies like an arrow.\nFruit flies like a banana.\n'

# Reading one line of the file at a time

>>> f=open('document.txt','rU')

>>> for line in f:

...    print(line.strip())  # strip the trailing newline

...

this is my time

Time flies like an arrow.

Fruit flies like a banana.

# Opening a file from an NLTK corpus by its path

>>> path=nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')

>>> raw=open(path,'rU').read()

Extracting text from PDF, MS Word, and other binary formats

# Capturing user input in an interactive program

>>> s=input('Enter some text')

Enter some text>? On an exceptionally hot evening early in july

>>> print('You typed',len(nltk.word_tokenize(s)),'words')

You typed 8 words

The NLP pipeline

>>> raw=open('document.txt').read()

>>> type(raw)

<class 'str'>

# Tokenization

>>> tokens=nltk.word_tokenize(raw)

>>> type(tokens)

<class 'list'>

>>> words=[w.lower() for w in tokens]

>>> type(words)

<class 'list'>

>>> vocab=sorted(set(words))

>>> type(vocab)

<class 'list'>

# You can append an element to a list, but not to a string

>>> vocab.append('blog')

>>> raw.append('blog')

Traceback (most recent call last):

File "<input>", line 1, in <module>

AttributeError: 'str' object has no attribute 'append'

# string + string and list + list both work, but a list cannot be added to a string

>>> query='Who knows?'

>>> beatles=['john','paul','george','ringo']

>>> query+beatles

Traceback (most recent call last):

File "<input>", line 1, in <module>

TypeError: Can't convert 'list' object to str implicitly
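The usual fixes are to give both operands the same type first, for example (a minimal sketch):

>>> query + ' ' + ' '.join(beatles)   # join the list into a string first
'Who knows? john paul george ringo'
>>> [query] + beatles                 # or wrap the string in a list
['Who knows?', 'john', 'paul', 'george', 'ringo']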

2 Strings: Text Processing at the Lowest Level

# Basic string operations

monty = 'Monty python'
print(monty)
circus = "Monty python's Flying Circus"
print(circus)
circus = 'Monty python\'s Flying Circus'
print(circus)

# Long strings can be continued with a backslash or wrapped in parentheses

>>> couplet="Shall I comparethee to a Summer's day?"\

... "Thou are more lovely and moretemperate:"

>>> print(couplet)

Shall I compare thee to a Summer's day?Thouare more lovely and more temperate:

>>> couplet=("Shall I comparethee to a Summer's day?"

... "Thou are more lovely and moretemperate:")

>>> print(couplet)

Shall I compare thee to a Summer's day?Thouare more lovely and more temperate:

# The strings above print without a line break; use triple quotes to preserve it

>>> couplet='''Shall I compare thee to a Summer's day?

... Thou are more lovely and more temperate:'''

>>> print(couplet)

Shall I compare thee to a Summer's day?

Thou are more lovely and more temperate:

>>>couplet="""Shall I compare thee to a Summer's day?

... Thou are more lovely and moretemperate:"""

>>> print(couplet)

Shall I compare thee to a Summer's day?

Thou are more lovely and more temperate:

# String concatenation

>>> 'very'+'very'+'very'

'veryveryvery'

>>> 'very'*3

'veryveryvery'

# Use print() to display a string

# Accessing individual characters

>>> monty='Monty python'

>>> monty[0]

'M'

# Non-negative numbers index from the start; negative numbers index from the end

>>> monty[-1]

'n'

# Avoiding a newline at the end of printed output (see print's end parameter)

# Lowercasing characters and filtering out non-alphabetic ones

>>> import nltk

>>> from nltk.corpus import gutenberg

>>> raw=gutenberg.raw('melville-moby_dick.txt')

>>> fdist=nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())

>>> fdist.keys()

dict_keys(['s', 'z', 'r', 'h', 'a', 'i', 'n', 'b', 't', 'j', 'o', 'e', 'c', 'm', 'x', 'y', 'g', 'd', 'q', 'v', 'w', 'f', 'k', 'p', 'u', 'l'])
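Since FreqDist also stores the counts, the letters can be ranked by frequency; a minimal sketch (outputs omitted here rather than re-run):

>>> fdist.most_common(5)    # the five most frequent letters, with their counts
>>> [letter for letter, count in fdist.most_common()]    # all letters, most frequent first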

# Accessing substrings, analogous to list slicing

>>> monty='Monty python'

>>> monty[6:10]

'pyth'

# Using negative indexes

>>> monty[-12:-7]

'Monty'

# Slicing from the start of the string, and to the end

>>> monty[:5]

'Monty'

>>> monty[6:]

'python'

# Testing whether a string contains a substring

>>> phrase='And now for something completely different'

>>> if 'thing' in phrase:

...    print("found 'thing'")

...

found 'thing'

# Using find() to locate a substring's position

>>> monty.find('python')

6

# Looking up more string operations

>>> help(str)

Help on class str in module builtins:

class str(object)

| str(object='') -> str

|  str(bytes_or_buffer[, encoding[, errors]]) -> str

|

| Create a new string object from the given object. If encoding or

| errors is specified, then the object must expose a data buffer

|  that will be decoded using the given encoding and error handler.

| Otherwise, returns the result of object.__str__() (if defined)

|  orrepr(object).

| encoding defaults to sys.getdefaultencoding().

| errors defaults to 'strict'.

|

| Methods defined here:

|

| __add__(self, value, /)  # special (dunder) method

|     Return self+value.

|

| __contains__(self, key, /)

|     Return key in self.

|

| __eq__(self, value, /)

|     Return self==value.

|

| __format__(...)

|     S.__format__(format_spec) -> str

|

|     Return a formatted version of S as described by format_spec.

|

| __ge__(self, value, /)

|     Return self>=value.

|

| __getattribute__(self, name, /)

|     Return getattr(self, name).

|

| __getitem__(self, key, /)

|     Return self[key].

|

| __getnewargs__(...)

|

| __gt__(self, value, /)

|     Return self>value.

|

| __hash__(self, /)

|     Return hash(self).

|

| __iter__(self, /)

|     Implement iter(self).

|

| __le__(self, value, /)

|     Return self<=value.

|

| __len__(self, /)

|     Return len(self).

|

| __lt__(self, value, /)

|     Return self<value.

|

| __mod__(self, value, /)

|     Return self%value.

|

| __mul__(self, value, /)

|     Return self*value.

|

| __ne__(self, value, /)

|     Return self!=value.

|

| __new__(*args, **kwargs) from builtins.type

|     Create and return a new object. See help(type) for accurate signature.

|

| __repr__(self, /)

|     Return repr(self).

|

| __rmod__(self, value, /)

|     Return value%self.

|

| __rmul__(self, value, /)

|     Return self*value.

|

| __sizeof__(...)

|     S.__sizeof__() -> size of S in memory, in bytes

|

| __str__(self, /)

|     Return str(self).

|

| capitalize(...)

|     S.capitalize() -> str

|

|     Return a capitalized version of S, i.e. make the first character

|     have upper case and the rest lower case.

|

| casefold(...)

|     S.casefold() -> str

|

|     Return a version of S suitable for caseless comparisons.

|

| center(...)

|     S.center(width[, fillchar]) -> str

|

|     Return S centered in a string of length width. Padding is

|     done using the specified fill character (default is a space)

|

| count(...)  # number of occurrences of a substring

|     S.count(sub[, start[, end]]) -> int

|

|     Return the number of non-overlapping occurrences of substring sub in

|     string S[start:end].  Optional arguments start and end are

|     interpreted as in slice notation.

|

| encode(...)

|     S.encode(encoding='utf-8', errors='strict') -> bytes

|

|     Encode S using the codec registered for encoding. Default encoding

|     is 'utf-8'. errors may be given to set a different error

|     handling scheme. Default is 'strict' meaning that encoding errors raise

|     a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and

|     'xmlcharrefreplace' as well as any other name registered with

|     codecs.register_error that can handle UnicodeEncodeErrors.

|

| endswith(...)  # does the string end with the given suffix?

|     S.endswith(suffix[, start[, end]]) -> bool

|

|     Return True if S ends with the specified suffix, False otherwise.

|     With optional start, test S beginning at that position.

|     With optional end, stop comparing S at that position.

|     suffix can also be a tuple of strings to try.

|

| expandtabs(...)

|     S.expandtabs(tabsize=8) -> str

|

|     Return a copy of S where all tab characters are expanded using spaces.

|     If tabsize is not given, a tab size of 8 characters is assumed.

|

| find(...)  # lowest index of a substring

|     S.find(sub[, start[, end]]) -> int

|

|     Return the lowest index in S where substring sub is found,

|     such that sub is contained within S[start:end].  Optional

|     arguments start and end are interpreted as in slice notation.

|

|     Return -1 on failure.

|

| format(...)  # string formatting

|     S.format(*args, **kwargs) -> str

|

|     Return a formatted version of S, using substitutions from args and kwargs.

|     The substitutions are identified by braces ('{' and '}').

|

| format_map(...)

|     S.format_map(mapping) -> str

|

|     Return a formatted version of S, using substitutions from mapping.

|     The substitutions are identified by braces ('{' and '}').

|

| index(...)

|     S.index(sub[, start[, end]]) -> int

|

|     Like S.find() but raise ValueError when the substring is not found.

|

| isalnum(...)  # is the string alphanumeric?

|     S.isalnum() -> bool

|

|     Return True if all characters in S are alphanumeric

|     and there is at least one character in S, False otherwise.

|

| isalpha(...)  # is the string alphabetic?

|     S.isalpha() -> bool

|

|     Return True if all characters in S are alphabetic

|     and there is at least one character in S, False otherwise.

|

| isdecimal(...)

|     S.isdecimal() -> bool

|

|     Return True if there are only decimal characters in S,

|     False otherwise.

|

| isdigit(...)

|     S.isdigit() -> bool

|

|     Return True if all characters in S are digits

|     and there is at least one character in S, False otherwise.

|

| isidentifier(...)

|     S.isidentifier() -> bool

|

|     Return True if S is a valid identifier according

|     to the language definition.

|

|     Use keyword.iskeyword() to test for reserved identifiers

|     such as "def" and "class".

|

| islower(...)  # is the string lowercase?

|     S.islower() -> bool

|

|     Return True if all cased characters in S are lowercase and there is

|     at least one cased character in S, False otherwise.

|

| isnumeric(...)

|     S.isnumeric() -> bool

|

|     Return True if there are only numeric characters in S,

|     False otherwise.

|

| isprintable(...)

|     S.isprintable() -> bool

|

|     Return True if all characters in S are considered

|     printable in repr() or S is empty, False otherwise.

|

| isspace(...)

|     S.isspace() -> bool

|

|     Return True if all characters in S are whitespace

|     and there is at least one character in S, False otherwise.

|

| istitle(...)

|     S.istitle() -> bool

|

|     Return True if S is a titlecased string and there is at least one

|     character in S, i.e. upper- and titlecase characters may only

|     follow uncased characters and lowercase characters only cased ones.

|     Return False otherwise.

|

| isupper(...)  # is the string uppercase?

|     S.isupper() -> bool

|

|     Return True if all cased characters in S are uppercase and there is

|     at least one cased character in S, False otherwise.

|

| join(...)  # join strings, with S as the separator

|     S.join(iterable) -> str

|

|     Return a string which is the concatenation of the strings in the

|     iterable.  The separator between elements is S.

|

| ljust(...)

|     S.ljust(width[, fillchar]) -> str

|

|     Return S left-justified in a Unicode string of length width. Padding is

|     done using the specified fill character (default is a space).

|

| lower(...)

|     S.lower() -> str

|

|     Return a copy of the string S converted to lowercase.

|

| lstrip(...)

|     S.lstrip([chars]) -> str

|

|     Return a copy of the string S with leading whitespace removed.

|     If chars is given and not None, remove characters in chars instead.

|

| partition(...)

|     S.partition(sep) -> (head, sep, tail)

|

|     Search for the separator sep in S, and return the part before it,

|     the separator itself, and the part after it.  If the separator is not

|     found, return S and two empty strings.

|

| replace(...)  # substring replacement

|     S.replace(old, new[, count]) -> str

|

|     Return a copy of S with all occurrences of substring

|     old replaced by new.  If the optional argument count is

|     given, only the first count occurrences are replaced.

|

| rfind(...)  # find, searching from the end of the string

|     S.rfind(sub[, start[, end]]) -> int

|

|     Return the highest index in S where substring sub is found,

|     such that sub is contained within S[start:end].  Optional

|     arguments start and end are interpreted as in slice notation.

|

|     Return -1 on failure.

|

| rindex(...)

|     S.rindex(sub[, start[, end]]) -> int

|

|     Like S.rfind() but raise ValueError when the substring is not found.

|

| rjust(...)

|     S.rjust(width[, fillchar]) -> str

|

|     Return S right-justified in a string of length width. Padding is

|     done using the specified fill character (default is a space).

|

| rpartition(...)

|     S.rpartition(sep) -> (head, sep, tail)

|

|     Search for the separator sep in S, starting at the end of S, and return

|     the part before it, the separator itself, and the part after it.  If the

|     separator is not found, return two empty strings and S.

|

| rsplit(...)

|     S.rsplit(sep=None, maxsplit=-1) -> list of strings

|

|     Return a list of the words in S, using sep as the

|     delimiter string, starting at the end of the string and

|     working to the front.  If maxsplitis given, at most maxsplit

|     splits are done. If sep is not specified, any whitespace string

|     is a separator.

|

| rstrip(...)

|     S.rstrip([chars]) -> str

|

|     Return a copy of the string S with trailing whitespace removed.

|     If chars is given and not None, remove characters in chars instead.

|

|  split(...)

|     S.split(sep=None, maxsplit=-1) -> list of strings

|

|     Return a list of the words in S, using sep as the

|     delimiter string.  If maxsplit isgiven, at most maxsplit

|     splits are done. If sep is not specified or is None, any

|     whitespace string is a separator and empty strings are

|     removed from the result.

|

| splitlines(...)  # split into a list of lines

|     S.splitlines([keepends]) -> list of strings

|

|     Return a list of the lines in S, breaking at line boundaries.

|     Line breaks are not included in the resulting list unless keepends

|     is given and true.

|

| startswith(...)

|     S.startswith(prefix[, start[, end]]) -> bool

|

|     Return True if S starts with the specified prefix, False otherwise.

|     With optional start, test S beginning at that position.

|     With optional end, stop comparing S at that position.

|     prefix can also be a tuple of strings to try.

|

| strip(...)  # return a copy with leading and trailing whitespace removed

|     S.strip([chars]) -> str

|

|     Return a copy of the string S with leading and trailing

|     whitespace removed.

|     If chars is given and not None, remove characters in chars instead.

|

| swapcase(...)

|     S.swapcase() -> str

|

|     Return a copy of S with uppercase characters converted to lowercase

|     and vice versa.

|

| title(...)

|     S.title() -> str

|

|     Return a titlecased version of S, i.e. words start with title case

|     characters, all remaining cased characters have lower case.

|

| translate(...)

|     S.translate(table) -> str

|

|     Return a copy of the string S in which each character has been mapped

|     through the given translation table. The table must implement

|     lookup/indexing via __getitem__, for instance a dictionary or list,

|     mapping Unicode ordinals to Unicode ordinals, strings, or None. If

|     this operation raises LookupError, the character is left untouched.

|     Characters mapped to None are deleted.

|

| upper(...)

|     S.upper() -> str

|

|     Return a copy of S converted to uppercase.

|

| zfill(...)

|     S.zfill(width) -> str

|

|     Pad a numeric string S with zeros on the left, to fill a field

|     of the specified width. The string S is never truncated.

|

| ----------------------------------------------------------------------

| Static methods defined here:

|

|  maketrans(x, y=None, z=None, /)

|     Return a translation table usable for str.translate().

|

|     If there is only one argument, it must be a dictionary mapping Unicode

|     ordinals (integers) or characters to Unicode ordinals, strings or None.

|     Character keys will be then converted to ordinals.

|     If there are two arguments, they must be strings of equal length, and

|     in the resulting dictionary, each character in x will be mapped to the

|     character at the same position in y. If there is a third argument, it

|     must be a string, whose characters will be mapped to None in the result.

Differences between lists and strings

# Strings and lists cannot be concatenated with each other

# Strings are immutable; the elements of a list can be modified

# Lists are flexible: they can hold paragraphs, sentences, phrases, words, or characters
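A quick demonstration of the mutability difference (a minimal sketch):

>>> beatles = ['john', 'paul', 'george', 'ringo']
>>> beatles[0] = 'John'        # lists support item assignment
>>> name = 'john'
>>> name[0] = 'J'              # strings do not
Traceback (most recent call last):
  File "<input>", line 1, in <module>
TypeError: 'str' object does not support item assignment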

3 Text Processing with Unicode

Extracting encoded text from files

>>> path=nltk.data.find('corpora/unicode_samples/polish-lat2.txt')

>>> import codecs

# codecs reads encoded data into Unicode strings, and writes Unicode strings back out in a specified encoding

>>> f=codecs.open(path,encoding='latin2')  # Latin-2, also known as ISO-8859-2

>>> for line in f:

...    line=line.strip()

# unicode_escape is a dummy encoding that converts all non-ASCII characters into \uXXXX escapes

# code points below 256 are shown as two-digit \xXX escapes

...    print(line.encode('unicode_escape'))

...

b'"Berlinka" to skarb kultury isztuki niemieckiej. Przewiezione przez'

b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowejna Dolny \\u015al\\u0105sk, zosta\\u0142y'

b'odnalezione po 1945 r. na terytoriumPolski. Trafi\\u0142y do Biblioteki'

b'Jagiello\\u0144skiej w Krakowie,obejmuj\\u0105 ponad 500 tys. zabytkowych'

b'archiwali\\xf3w, m.in. manuskrypty Goethego,Mozarta, Beethovena, Bacha.'
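In Python 3 the built-in open() takes an encoding argument directly, so codecs.open() is no longer necessary; a minimal sketch:

>>> f = open(path, encoding='latin2')   # equivalent to codecs.open(path, encoding='latin2')
>>> for line in f:
...     print(line.strip().encode('unicode_escape'))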

# Looking up the integer ordinal of a character

>>> ord('a')

97

>>> a=u'\u0061'

>>> a

'a'

>>> print(a)

a

>>> nacute=u'\u0144'

>>> nacute

'ń'

>>> nacute_utf=nacute.encode('utf-8')

>>> print(repr(nacute_utf))

b'\xc5\x84'

# For the characters in the third line outside the ASCII range, print their UTF-8 escape values

>>> import unicodedata

>>> lines=codecs.open(path,encoding='latin2').readlines()

>>> line=lines[2]

>>> print(line.encode('unicode_escape'))

b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y\\n'

>>> for c in line:

...     if ord(c)>127:

...        print('%r U+%04x %s' % (c.encode('utf-8'), ord(c), unicodedata.name(c)))

...

b'\xc3\xb3' U+00f3 LATIN SMALL LETTER O WITH ACUTE

b'\xc5\x9b' U+015b LATIN SMALL LETTER S WITH ACUTE

b'\xc5\x9a' U+015a LATIN CAPITAL LETTER S WITH ACUTE

b'\xc4\x85' U+0105 LATIN SMALL LETTER A WITH OGONEK

b'\xc5\x82' U+0142 LATIN SMALL LETTER L WITH STROKE

Python string functions and the re module accept Unicode strings

>>> line.find(u'zosta\u0142y')

54

>>> line=line.lower()

>>> print(line.encode('unicode_escape'))

b'niemc\\xf3w pod koniec ii wojny \\u015bwiatowej na dolny \\u015bl\\u0105sk, zosta\\u0142y\\n'

>>> import re

>>> m=re.search(u'\u015b\w*',line)

>>> m.group()

'światowej'

NLTK tokenizers accept Unicode input

>>> nltk.word_tokenize(line)

['niemców', 'pod', 'koniec', 'ii', 'wojny', 'światowej', 'na', 'dolny', 'śląsk', ',', 'zostały']

Using your local encoding in Python

# Add this on the first or second line of the source file

# -*- coding: <coding> -*-

The coding can be, for example, latin-1, big5, or utf-8.

4 Regular Expressions for Detecting Word Patterns

Using basic metacharacters

>>> import nltk

>>> import re

>>> wordlist=[w for w in nltk.corpus.words.words('en') if w.islower()]

# Find words ending in "ed"; re.search(p, s) tests whether pattern p occurs anywhere in string s

>>> [w for w in wordlist if re.search('ed$',w)]

['abaissed', 'abandoned', 'abased', 'abashed', 'abatised', 'abed', 'aborted', 'abridged', 'abscessed',

...

'younghearted', 'zagged', 'zed', 'zeed', 'zigzagged', 'zonated', 'zoned']

# "." matches any single character, "^" anchors the start, "$" anchors the end, "?" makes the preceding item optional

# third character j, sixth character t

>>> [w for w in wordlist if re.search('^..j..t..$',w)]

['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter', 'rejector', 'unjilted', 'unjolted', 'unjustly']

Ranges and closures

# Words that a phone's T9 input system could suggest for the key sequence 4653

>>> [w for w in wordlist if re.search('^[ghi][mno][jlk][def]$',w)]

['gold', 'golf', 'hold', 'hole']

# "+" means one or more of the preceding item (compare "*", zero or more)

>>> chat_words=sorted(set(w for w in nltk.corpus.nps_chat.words()))

>>> [w for w in chat_words if re.search('^m+i+n+e+$',w)]

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee', 'miiiiiinnnnnnnnnneeeeeeee', 'mine', 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

>>> [w for w in chat_words if re.search('^[ha]+$',w)]

['a', 'aaaaaaaaaaaaaaaaa', 'aaahhhh', 'ah', 'ahah', 'ahahah', 'ahh', 'ahhahahaha', 'ahhh', 'ahhhh', 'ahhhhhh', 'ahhhhhhhhhhhhhh', 'h', 'ha', 'haaa', 'hah', 'haha', 'hahaaa', 'hahah', 'hahaha', 'hahahaa', 'hahahah', 'hahahaha', 'hahahahaaa', 'hahahahahaha', 'hahahahahahaha', 'hahahahahahahahahahahahahahahaha', 'hahahhahah', 'hahhahahaha']

# Match any letter other than a vowel

[^aeiouAEIOU]

# Words consisting entirely of non-vowels

^[^aeiouAEIOU]+$

# "\." matches a literal period only

>>> wsj=sorted(set(nltk.corpus.treebank.words()))  # wsj was not defined above; build it from the Treebank sample

>>> [w for w in wsj if re.search('^[0-9]+\.[0-9]+$',w)]

['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5', '0.50', '0.54', '0.56', '0.60', '0.7',

...

'9.8', '9.82', '9.9', '92.9', '93.3', '93.9', '94.2', '94.8', '95.09', '96.4', '98.3', '99.1', '99.3']

>>> [w for w in wsj if re.search('^[A-Z]+\$$',w)]

['C$', 'US$']

# {3,5} means the preceding item occurs at least 3 and at most 5 times

>>> [w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$',w)]

['10-day', '10-lap', '10-year', '100-share', '12-point', '12-year', '14-hour', '15-day', '150-point', '190-point', '20-point', '20-stock', '21-month', '237-seat', '240-page', '27-year', '30-day', '30-point', '30-share', '30-year', '300-day', '36-day', '36-store', '42-year', '50-state', '500-stock', '52-week', '69-point', '84-month', '87-store', '90-day']

>>> [w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$',w)]

['black-and-white', 'bread-and-butter', 'father-in-law', 'machine-gun-toting', 'savings-and-loan']

# "|" matches one of the specified strings

>>> [w for w in wsj if re.search('(ed|ing)$',w)]

['62%-owned', 'Absorbed', 'According', 'Adopting', 'Advanced', 'Advancing', 'Alfred', 'Allied',

...

'yielded', 'yielding', 'yttrium-containing', 'zoomed']

Common regular expression operators

Operator    Function
.           Matches any single character (wildcard)
^abc        Matches abc at the start of a string
abc$        Matches abc at the end of a string
[abc]       Matches one of the characters in the set
[A-Z0-9]    Matches one of the characters in the range
ed|ing|s    Matches one of the specified strings
*           Zero or more of the previous item
+           One or more of the previous item
?           Zero or one of the previous item (i.e. optional)
{n}         Exactly n repeats (n non-negative)
{n,}        At least n repeats
{,n}        No more than n repeats
{m,n}       At least m and no more than n repeats
a(b|c)+     Parentheses indicate the scope of an operator
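A few quick self-checks of these operators (a minimal sketch; each expression is deterministic):

>>> import re
>>> re.search(r'^abc', 'abcdef') is not None     # starts with abc
True
>>> re.search(r'abc$', 'xyzabc') is not None     # ends with abc
True
>>> re.findall(r'a(?:b|c)+', 'abcbc accc')       # scope of | inside parentheses
['abcbc', 'accc']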

5 Useful Applications of Regular Expressions

Extracting word pieces

# Find all the vowels in a word and count them

>>> word='supercalifragilisticexpialidocious'

>>> re.findall(r'[aeiou]',word)

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']

>>> len(re.findall(r'[aeiou]',word))

16

# Find sequences of two or more vowels and compute their relative frequency

>>> wsj=sorted(set(nltk.corpus.treebank.words()))

>>> fd=nltk.FreqDist(vs for word in wsj

...    for vs in re.findall(r'[aeiou]{2,}',word))

>>> fd.items()

dict_items([('eei', 2), ('aia', 1), ('aiia', 1), ('au', 106), ('ao', 6), ('eo', 39), ('ioa', 1), ('ia', 253), ('uu', 1), ('ui', 95), ('oa', 59), ('iai', 1), ('ueui', 1), ('ae', 11), ('ei', 86), ('ai', 261), ('eou', 5), ('ou', 329), ('ee', 217), ('uo', 8), ('iou', 27), ('ie', 331), ('uie', 3), ('iu', 14), ('aii', 1), ('iao', 1), ('eu', 18), ('ooi', 1), ('ue', 105), ('oui', 6), ('oei', 1), ('ieu', 3), ('oi', 65), ('io', 549), ('uou', 5), ('ea', 476), ('oo', 174), ('ua', 109), ('eau', 10), ('oe', 15), ('eea', 1), ('aa', 3), ('uee', 4)])

Doing more with word pieces

# Keep the initial vowel sequence, the final vowel sequence, and all consonants

>>> regexp=r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

>>> def compress(word):

...    pieces=re.findall(regexp,word)

...    return ''.join(pieces)

>>> english_udhr=nltk.corpus.udhr.words('English-Latin1')

>>> print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

nvrsl Dclrtn f Hmn Rghts Prmbl Whrs rcgntn f th nhrnt dgnty nd f th ql

nd nlnbl rghts f ll mmbrs f th hmn fmly s th fndtn f frdm , jstc nd pc

n th wrld , Whrs dsrgrd nd cntmpt fr hmn rghts hv rsltd n brbrs cts

whch hv trgd th cnscnc f mnknd , nd th dvnt f  wrld n whch hmn bngs

shll njy frdm f spch nd

>>> rotokas_words=nltk.corpus.toolbox.words('rotokas.dic')

>>> cvs=[cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]',w)]

>>> cfd=nltk.ConditionalFreqDist(cvs)

>>> cfd.tabulate()

     a   e   i   o   u
k  418 148  94 420 173
p   83  31 105  34  51
r  187  63  84  89  79
s    0   0 100   2   1
t   47   8   0 148  37
v   93  27 105  48  49

Finding word stems

# Simply strip anything that looks like a suffix

>>> def stem(word):

...    for suffix in ['ing','ly','ed','ious','ies','ive','es','s','ment']:

...        if word.endswith(suffix):

...             return word[:-len(suffix)]

...    return word

# Extracting word stems with regular expressions

>>> re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$','processing')

['ing']

>>> re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$','processing')

['processing']

>>> re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processing')

[('process', 'ing')]

# Non-greedy

>>> re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processes')

[('process', 'es')]

# Greedy

>>> re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processes')

[('processe', 's')]

>>> def stem(word):

...    regexp=r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'

...    stem,suffix=re.findall(regexp,word)[0]

...    return stem

...

>>> raw="""DENNIS:Listen,strange women lying in ponds distributing swords

... is no bassis for a system ofgovernment.Supreme executive power derives from

... a mandate form masses,not from somefarcical aquatic ceremony."""

>>> tokens=nltk.word_tokenize(raw)

>>> [stem(t) for t in tokens]

['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'ly', 'in', 'pond', 'distribut', 'sword', 'i', 'no', 'bassi', 'for', 'a', 'system', 'of', 'government.Supreme', 'execut', 'power', 'deriv', 'from', 'a', 'mandate', 'form', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']

Searching tokenized text

# Find all phrases of the form "a <word> man"

>>> from nltk.corpus import gutenberg,nps_chat

>>> moby=nltk.Text(gutenberg.words('melville-moby_dick.txt'))

>>> moby.findall(r'<a>(<.*>)<man>')

monied; nervous; dangerous; white; white; white; pious; queer; good;

mature; white; Cape; great; wise; wise; butterless; white; fiendish;

pale; furious; better; certain; complete; dismasted; younger; brave;

brave; brave; brave

# Find three-word phrases ending in "bro"

>>> chat=nltk.Text(nps_chat.words())

>>> chat.findall(r'<.*><.*><bro>')

you rule bro; telling you bro; u twizted bro

# Find sequences of three or more words starting with the letter "l"

>>> chat.findall(r'<l.*>{3,}')

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la

la la; lovely lol lol love; lol lol lol.; la la la; la la la

# Search the corpus for "x and other ys"

>>> from nltk.corpus import brown

>>> hobbies_learned=nltk.Text(brown.words(categories=['hobbies','learned']))

>>> hobbies_learned.findall(r'<\w*> <and> <other> <\w*s>')

speed and other activities; water and other liquids; tomb and other

landmarks; Statues and other monuments; pearls and other jewels;

charts and other items; roads and other features; figures and other

objects; military and other areas; demands and other factors;

abstracts and other compilations; iron and other metals

6 Normalizing Text

Stemmers (Porter and Lancaster)

>>> import nltk

>>> raw="""DENNIS:Listen,strange women lying in ponds distributing swords

... is no bassis for a system ofgovernment.Supreme executive power derives from

... a mandate form masses,not from somefarcical aquatic ceremony."""

>>>tokens=nltk.word_tokenize(raw)

# Create the stemmers

>>> porter=nltk.PorterStemmer()

>>> lancaster=nltk.LancasterStemmer()

# Stem each token

>>> [porter.stem(t) for t in tokens]

['DENNI', ':', 'Listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bassi', 'for', 'a', 'system', 'of', 'government.Suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'form', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']

>>> [lancaster.stem(t) for t in tokens]

['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bass', 'for', 'a', 'system', 'of', 'government.supreme', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'form', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']

# Indexing a text with a stemmer

import nltk

class IndexedText(object):

    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i) for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        key = self._stem(word)
        wc = int(width / 4)
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i - wc:i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '%*s' % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()

porter = nltk.PorterStemmer()

grail = nltk.corpus.webtext.words('grail.txt')

text = IndexedText(porter, grail)

text.concordance('lie')

# The output:

r king ! DENNIS : Listen , strange women lying in ponds distributing swords is no

 beat a very brave retreat . ROBIN : All lies ! MINSTREL : [ singing ] Bravest of

       Nay . Nay . Come . Come . You may lie here . Oh , but you are wounded !

doctors immediately ! No , no , please ! Lie down . [ clap clap ] PIGLET : Well

ere is much danger , for beyond the cave lies the Gorge of Eternal Peril , which

   you . Oh ... TIM : To the north there lies a cave -- the cave of Caerbannog --

h it and lived ! Bones of full fifty men lie strewn about its lair . So , brave k

not stop our fight ' til each one of you lies dead , and the Holy Grail returns t

Lemmatization (the WordNet lemmatizer)

>>> import nltk

>>> raw="""DENNIS:Listen,strange women lying in ponds distributing swords

... is no bassis for a system ofgovernment.Supreme executive power derives from

... a mandate form masses,not from somefarcical aquatic ceremony."""

>>> tokens=nltk.word_tokenize(raw)

# Useful for compiling the vocabulary of a text, or when you want a list of valid lemmas

>>> wnl=nltk.WordNetLemmatizer()

>>> [wnl.lemmatize(t) for t in tokens]

['DENNIS', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'bassis', 'for', 'a', 'system', 'of', 'government.Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'form', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']
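The lemmatizer treats every word as a noun by default, which is why 'lying' survives unchanged above; passing a part of speech helps (a minimal sketch):

>>> wnl.lemmatize('lying', pos='v')
'lie'
>>> wnl.lemmatize('women')    # noun is the default part of speech
'woman'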

7 Regular Expressions for Tokenizing Text

Simple approaches to tokenization

# Splitting text on whitespace

>>> import nltk

>>> import re

>>> raw="""'WhenI'm a Duchess,' she said herself,(not in a very hopefultone

... though),'I won't have any peper in mykitchen AT ALL.Soup does very

... well tithout--Maybe it's always peperthat makes people hot-tempered,'..."""

# Splitting on a single space leaves \n attached to some words

>>> re.split(r' ',raw)

["'When", "I'm", 'a',"Duchess,'", 'she', 'said', 'herself,(not', 'in', 'a', 'very',"hopefultone\nthough),'I", "won't", 'have', 'any', 'peper','in', 'my', 'kitchen', 'AT', 'ALL.Soup', 'does', 'very\nwell','tithout--Maybe', "it's", 'always', 'peper', 'that', 'makes','people', "hot-tempered,'..."]

# '[ \t\n]+' matches one or more spaces, tabs, or newlines

>>> re.split(r'[ \t\n]+',raw)

["'When", "I'm", 'a',"Duchess,'", 'she', 'said', 'herself,(not', 'in', 'a', 'very','hopefultone', "though),'I", "won't", 'have', 'any','peper', 'in', 'my', 'kitchen', 'AT', 'ALL.Soup', 'does', 'very', 'well','tithout--Maybe', "it's", 'always', 'peper', 'that', 'makes','people', "hot-tempered,'..."]

# \W matches any character other than letters, digits, and underscore

>>> re.split(r'\W+',raw)

['', 'When', 'I', 'm', 'a', 'Duchess', 'she', 'said', 'herself', 'not', 'in', 'a', 'very', 'hopefultone', 'though', 'I', 'won', 't', 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL', 'Soup', 'does', 'very', 'well', 'tithout', 'Maybe', 'it', 's', 'always', 'peper', 'that', 'makes', 'people', 'hot', 'tempered', '']

>>> re.findall(r'\w+|\S\w*',raw)

["'When", 'I', "'m",'a', 'Duchess', ',', "'", 'she', 'said', 'herself', ',', '(not','in', 'a', 'very', 'hopefultone', 'though', ')', ',', "'I", 'won',"'t", 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL','.Soup', 'does', 'very', 'well', 'tithout', '-', '-Maybe', 'it',"'s", 'always', 'peper', 'that', 'makes', 'people', 'hot','-tempered', ',', "'", '.', '.', '.']

>>>print(re.findall(r"\w+(?:[-']\w)*|'|[-.(]+|\S\w*",raw))

["'", 'When', "I'm",'a', 'Duchess', ',', "'", 'she', 'said', 'herself', ',', '(', 'not','in', 'a', 'very', 'hopefultone', 'though', ')', ',', "'", 'I',"won't", 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL','.', 'Soup', 'does', 'very', 'well', 'tithout', '--', 'Maybe',"it's", 'always', 'peper', 'that', 'makes', 'people', 'hot-t','empered', ',', "'", '...']

Regular expression symbols

Symbol    Function
\b        Word boundary (zero width)
\d        Any decimal digit ([0-9])
\D        Any non-digit character ([^0-9])
\s        Any whitespace character ([ \t\n\r\f\v])
\S        Any non-whitespace character ([^ \t\n\r\f\v])
\w        Any alphanumeric character ([a-zA-Z0-9_])
\W        Any non-alphanumeric character ([^a-zA-Z0-9_])
\t        The tab character
\n        The newline character

The NLTK regular expression tokenizer

nltk.regexp_tokenize()

>>> text='That U.S.A. poster-print costs $12.40...'

>>> pattern=r'''(?x)       # set flag to allow verbose regexps

... ([A-Z]\.)+             # abbreviations, e.g. U.S.A.

... | \w+(-\w+)*           # words with optional internal hyphens

... | \$?\d+(\.\d+)?%?     # currency and percentages, e.g. $12.40, 82%

... | \.\.\.               # ellipsis

... | [][.,;"'?():-_`]     # these are separate tokens; includes ], [

... '''
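To run the tokenizer, pass the text and pattern to nltk.regexp_tokenize(). A sketch with non-capturing (?:...) groups, since recent NLTK versions warn about capturing groups in the pattern; the expected tokens follow the NLTK book:

>>> pattern = r'''(?x)          # set flag to allow verbose regexps
...     (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
...   | \w+(?:-\w+)*            # words with optional internal hyphens
...   | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
...   | \.\.\.                  # ellipsis
...   | [][.,;"'?():-_`]        # these are separate tokens; includes ], [
... '''
>>> nltk.regexp_tokenize(text, pattern)
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']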

Further issues with tokenization

8 Segmentation

Sentence segmentation

# Average number of words per sentence in the Brown Corpus

>>> len(nltk.corpus.brown.words())/len(nltk.corpus.brown.sents())

20.250994070456922

# An example of sentence segmentation

>>> import pprint

>>> sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

>>> text=nltk.corpus.gutenberg.raw('chesterton-thursday.txt')

>>> sents=sent_tokenizer.tokenize(text)

>>> pp=pprint.PrettyPrinter(indent=4)

>>> pp.pprint(sents[171:181])

[  'In the wild events which were to follow this girl had no\n'

'part at all; he never saw her again until all his tale was over.',

'And yet, in some indescribable way, she kept recurring like a\n'

'motive in music through all his mad adventures afterwards, and the\n'

'glory of her strange hair ran like a red thread through those dark\n'

'and ill-drawn tapestries of the night.',

'For what followed was so\n'

'improbable, that it might well have been a dream.',

'When Syme went out into the starlit street, he found it for the\n'

'moment empty.',

'Then he realised (in some odd way) that the silence\n'

'was rather a living silence than a dead one.',

'Directly outside the\n'

'door stood a street lamp, whose gleam gilded the leaves of the tree\n'

'that bent out over the fence behind him.',

'About a foot from the\n'

'lamp-post stood a figure almost as rigid and motionless as the\n'

'lamp-post itself.',

'The tall hat and long frock coat were black; the\n'

'face, in an abrupt shadow, was almost as dark.',

'Only a fringe of\n'

'fiery hair against the light, and also something aggressive in the\n'

'attitude, proclaimed that it was the poet Gregory.',

'He had something\n'

'of the look of a masked bravo waiting sword in hand for his foe.']

Word segmentation

>>> text='doyouseethekittyseethedoggydoyoulikethekittylikethedoggy'

>>> seg1='0000000000000001000000000010000000000000000100000000000'

>>> seg2='0100100100100001001001000010100100010010000100010010000'

>>> def segment(text,segs):

...     words=[]

...     last=0

...     for i in range(len(segs)):

...         if segs[i]=='1':

...             words.append(text[last:i+1])

...             last=i+1

...     words.append(text[last:])

...     return words

...

>>> segment(text,seg1)

['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

>>> segment(text,seg2)

['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']

# Scoring: the size of the stored lexicon plus the cost of reconstructing the source text

>>> seg3='0000100100000011001000000110000100010000001100010000001'

>>> def evaluate(text,segs):

...     words=segment(text,segs)

...     text_size=len(words)

...     lexicon_size=len(''.join(list(set(words))))

...     return text_size+lexicon_size

...

>>> segment(text,seg3)

['doyou', 'see', 'thekitt', 'y', 'see', 'thedogg', 'y', 'doyou', 'like', 'thekitt', 'y', 'like', 'thedogg', 'y']

>>> evaluate(text,seg1)

63

>>> evaluate(text,seg2)

47

>>> evaluate(text,seg3)

46

# Non-deterministic search using simulated annealing

from random import randint

def flip(segs, pos):
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]

def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0, len(segs) - 1))
    return segs

def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print()
    return segs

>>> anneal(text, seg1, 5000, 1.2)

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

60 ['doyousee', 'thekitty', 'seethedoggy', 'doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']

60 ['doyousee', 'thekitty', 'seethedoggy', 'doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']

60 ['doyousee', 'thekitty', 'seethedoggy', 'doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']

60 ['doyousee', 'thekitty', 'seethedoggy', 'doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']

58 ['doyo', 'u', 'see', 'thekitty', 'see', 'thedoggy', 'doyo', 'ul', 'ike', 'thekitty', 'l', 'i', 'k', 'e', 'thedoggy']

58 ['doyo', 'u', 'see', 'thekitty', 'see', 'thedoggy', 'doyo', 'ul', 'ike', 'thekitty', 'l', 'i', 'k', 'e', 'thedoggy']

54 ['doyo', 'u', 'see', 'thekitty', 'see', 'thedoggy', 'doyo', 'u', 'l', 'ike', 'thekitty', 'l', 'ik', 'e', 'thedoggy']

51 ['doyo', 'u', 'see', 'thekitty', 'see', 't', 'hedoggy', 'doyo', 'u', 'l', 'ike', 'thekitty', 'l', 'ike', 't', 'hedoggy']

48 ['doyo', 'u', 'see', 'thekitty', 'see', 't', 'hedoggy', 'doyo', 'u', 'like', 'thekitty', 'like', 't', 'hedoggy']

45 ['doyou', 'see', 'thekitty', 'see', 't', 'hedoggy', 'doyou', 'like', 'thekitty', 'like', 't', 'hedoggy']

45 ['doyou', 'see', 'thekitty', 'see', 't', 'hedoggy', 'doyou', 'like', 'thekitty', 'like', 't', 'hedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

'0000100100000001001000000010000100010000000100010000000'

9 Formatting: From Lists to Strings

From lists to strings

>>> silly=['We','called','him','Tortoise','because','he','taught','us','.']

>>> ' '.join(silly)

'We called him Tortoise because he taught us .'

>>> ";".join(silly)

'We;called;him;Tortoise;because;he;taught;us;.'

>>> "".join(silly)

'WecalledhimTortoisebecausehetaughtus.'

Strings and formats

# Two ways to display the contents of an object

>>> word='cat'

>>> sentence="""hello

... world"""

>>> print(word)

cat

>>> print(sentence)

hello

world

>>> word

'cat'

>>> sentence

'hello\nworld'

>>> import nltk

>>> fdist=nltk.FreqDist(['dog','cat','dog','cat','dog','snake','dog','cat'])

>>> for word in fdist:

...    print(word,'->',fdist[word],';',)

...

dog -> 4 ;

snake -> 1 ;

cat -> 3 ;

# Using string formatting expressions

>>> for word in fdist:

...    print('%s->%d' % (word,fdist[word]),)

...

dog->4

snake->1

cat->3
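The same loop written with str.format(), the newer alternative to %-style formatting (a sketch; the order follows the iteration shown above):

>>> for word in fdist:
...     print('{}->{};'.format(word, fdist[word]), end=' ')
...
dog->4; snake->1; cat->3;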

Lining things up

# A frequency table of modals across sections of the Brown Corpus

>>> import nltk

>>> from nltk.corpus import brown

>>> def tabulate(cfdist, words, categories):
...     print('%-16s' % 'Category', end='')
...     for word in words:
...         print('%6s' % word, end='')
...     print()
...     for category in categories:
...         print('%-16s' % category, end='')
...         for word in words:
...             print('%6d' % cfdist[category][word], end='')
...         print()
...

>>> cfd = nltk.ConditionalFreqDist(
...     (genre, word)
...     for genre in brown.categories()
...     for word in brown.words(categories=genre))

>>> genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']

>>> modals = ['can', 'could', 'may', 'might', 'must', 'will']

>>> tabulate(cfd, modals, genres)

Category           can could   may might  must  will
news                93    86    66    38    50   389
religion            82    59    78    12    54    71
hobbies            268    58   131    22    83   264
science_fiction     16    49     4    12     8    16
romance             74   193    11    51    45    43
humor               16    30     8     8     9    13

# Specifying the field width through a variable

>>> print('%*s' % (15,'Monty Python'))

   Monty Python

Writing results to a file

# Writing the file output.txt

>>> output_file=open('output.txt','w')

>>> words=set(nltk.corpus.genesis.words('english-kjv.txt'))

>>> for word in sorted(words):

...    output_file.write(word+'\n')

>>> len(words)

2789

>>> str(len(words))

'2789'

>>> output_file.write(str(len(words))+"\n")

5

>>> output_file.close()
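A with statement closes the file automatically, which is the more idiomatic form (a minimal sketch):

>>> with open('output.txt', 'w') as output_file:
...     for word in sorted(words):
...         output_file.write(word + '\n')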

Text wrapping

>>> saying=['After','all','is','said','and','done',',','more','is','said','than','done']

>>> for word in saying:

...    print(word,"("+str(len(word))+")",)

...

After (5)

all (3)

is (2)

said (4)

and (3)

done (4)

, (1)

more (4)

is (2)

said (4)

than (4)

done (4)

>>> from textwrap import fill

>>> format='%s (%d),'

>>> pieces=[format % (word,len(word)) for word in saying]

>>> output=' '.join(pieces)

>>> wrapped=fill(output)

>>> print(wrapped)

After (5), all (3), is (2), said (4), and (3), done (4), , (1), more

(4), is (2), said (4), than (4), done (4),
