

  • 基于规则的预处理
  • 常规预处理
  • spaCy库的常规使用
  • pointer-generator





import re

# Ordered (pattern, replacement) rules applied one after another by
# clean_text().  Order matters: later rules see the output of earlier ones
# (e.g. "can't" must be expanded before the generic "n't" rule, and the
# punctuation padding runs before the symbol-to-word replacements).
_CLEAN_TEXT_RULES = (
    # --- units and currency ---
    (r"(\d+)kgs ", r"\1 kg "),            # e.g. 4kgs => 4 kg
    (r"(\d+)kg ", r"\1 kg "),             # e.g. 4kg => 4 kg
    (r"(\d+)k ", r"\g<1>000 "),           # e.g. 4k => 4000
    (r"\$(\d+)", r"\1 dollar "),
    (r"(\d+)\$", r"\1 dollar "),
    # --- contractions and acronyms ---
    (r"can't", "can not"),
    (r"cannot", "can not "),
    (r"what's", "what is"),
    (r"What's", "what is"),
    (r"'ve ", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"I'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
    (r"c\+\+", "cplusplus"),
    (r"c \+\+", "cplusplus"),
    (r"c \+ \+", "cplusplus"),
    (r"c#", "csharp"),
    (r"f#", "fsharp"),
    (r"g#", "gsharp"),
    (r" e mail ", " email "),
    (r" e \- mail ", " email "),
    (r" e\-mail ", " email "),
    (r",000", "000"),
    (r"'s", " "),
    # --- spelling correction / normalisation ---
    (r"ph\.d", "phd"),
    (r"PhD", "phd"),
    (r"pokemons", "pokemon"),
    (r"pokémon", "pokemon"),
    (r"pokemon go ", "pokemon-go "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" 9 11 ", " 911 "),
    (r" j k ", " jk "),
    (r" fb ", " facebook "),
    (r"facebooks", " facebook "),
    (r"facebooking", " facebook "),
    (r"insidefacebook", "inside facebook"),
    (r"donald trump", "trump"),
    (r"the big bang", "big-bang"),
    (r"the european union", "eu"),
    (r" usa ", " america "),
    (r" us ", " america "),
    (r" u s ", " america "),
    (r" U\.S\. ", " america "),
    (r" US ", " america "),
    (r" American ", " america "),
    (r" America ", " america "),
    (r" quaro ", " quora "),
    (r" mbp ", " macbook-pro "),
    (r" mac ", " macbook "),
    (r"macbook pro", "macbook-pro"),
    (r"macbook-pros", "macbook-pro"),
    (r" 1 ", " one "),
    (r" 2 ", " two "),
    (r" 3 ", " three "),
    (r" 4 ", " four "),
    (r" 5 ", " five "),
    (r" 6 ", " six "),
    (r" 7 ", " seven "),
    (r" 8 ", " eight "),
    (r" 9 ", " nine "),
    (r"googling", " google "),
    (r"googled", " google "),
    (r"googleable", " google "),
    (r"googles", " google "),
    (r" rs(\d+)", r" rs \1"),
    (r"(\d+)rs", r" rs \1"),
    (r"the european union", " eu "),
    (r"dollars", " dollar "),
    # --- pad punctuation with spaces so it becomes separate tokens ---
    (r"\+", " + "),
    (r"'", " "),
    (r"-", " - "),
    (r"/", " / "),
    # In a replacement template "\\" yields one literal backslash.
    (r"\\", r" \\ "),
    (r"=", " = "),
    (r"\^", " ^ "),
    (r":", " : "),
    (r"\.", " . "),
    (r",", " , "),
    (r"\?", " ? "),
    (r"!", " ! "),
    (r'"', ' " '),
    (r"&", " & "),
    (r"\|", " | "),
    (r";", " ; "),
    (r"\(", " ( "),
    # Fixed: the original replaced a closing parenthesis with " ( ".
    (r"\)", " ) "),
    # --- spell out symbols ---
    (r"&", " and "),
    (r"\|", " or "),
    (r"=", " equal "),
    (r"\+", " plus "),
    (r"₹", " rs "),
    (r"\$", " dollar "),
)


def clean_text(text):
    """Normalise a piece of English text with a fixed list of regex rules.

    The rules rewrite units and currency amounts, expand contractions,
    normalise a handful of spellings and names, pad punctuation with
    spaces, and spell out a few symbols.  Whitespace is collapsed at the
    end.  Matching is case-sensitive, exactly as each pattern is written.

    :param text: the string of text
    :return: text string after cleaning
    """
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)

    # The padded replacements leave runs of spaces behind; collapse them
    # and strip leading/trailing whitespace.
    return ' '.join(text.split())




import spacy

# Load the small English pipeline and run it over one example sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# Print one row per token: text, lemma, simple POS tag, detailed tag,
# dependency label, word shape, and the is_alpha / is_stop flags.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)




Text: The original word text.
Lemma: The base form of the word.
POS: The simple part-of-speech tag.
Tag: The detailed part-of-speech tag.
Dep: Syntactic dependency, i.e. the relation between tokens.
Shape: The word shape – capitalisation, punctuation, digits.
is alpha: Is the token an alpha character?
is stop: Is the token part of a stop list, i.e. the most common words of the language?




import json
import string
import nltk
from my_method import noun_chunk
from nltk.corpus import stopwords
from my_method import get_lemma
from my_method import get_tokens#read data
# Load the consensus texts and the per-entry lists of critic reviews.
# NOTE(review): assumes `consensus` is a list of strings and `cristic` a
# list of lists of strings, as implied by the loops below -- confirm
# against the data files.
with open('data/cristic_consensus.json', 'r') as f:
    consensus = json.load(f)
with open('data/cristic.json', 'r') as f:
    cristic = json.load(f)

# Tokenize: lemmatize each text first, then tokenize the lemmatized text.
# Fixed: the original passed the raw review to get_tokens(), silently
# discarding the get_lemma() result computed on the previous line.
cristic_token = []
for item in cristic:
    temp = []
    for item_01 in item:
        temp_01 = get_lemma(item_01)
        temp_01 = get_tokens(temp_01)
        temp.append(temp_01)
    cristic_token.append(temp)
with open('data/cristic_token.json', 'w') as f:
    json.dump(cristic_token, f)

# Fixed: the original called get_tokens() with no argument, which raises
# TypeError on the first consensus entry.
consensus_token = []
for item in consensus:
    temp = get_lemma(item)
    temp = get_tokens(temp)
    consensus_token.append(temp)
with open('data/consensus_token.json', 'w') as f:
    json.dump(consensus_token, f)

# Remove stopwords but keep each review as a single string: lowercase,
# strip punctuation characters, tokenize, drop stopwords, re-join.
# The stopword list is built once instead of once per token.
english_stopwords = set(stopwords.words('english'))
critics = []
for reviews in cristic:
    cleaned_reviews = []
    for review in reviews:
        review = review.lower()
        review = ''.join(ch for ch in review if ch not in string.punctuation)
        words = nltk.word_tokenize(review)
        cleaned_reviews.append(
            ' '.join(word for word in words if word not in english_stopwords)
        )
    critics.append(cleaned_reviews)
with open('data/cristic_no_stop.json', 'w') as f:
    json.dump(critics, f)

# Extract noun chunks from the raw (uncleaned) texts.
consensus_noun_chunk = [noun_chunk(item) for item in consensus]
with open('data/consensus_noun_chunk.json', 'w') as f:
    json.dump(consensus_noun_chunk, f)

cristic_noun_chunk = [
    [noun_chunk(item) for item in reviews] for reviews in cristic
]
with open('data/cristic_noun_chunk.json', 'w') as f:
    json.dump(cristic_noun_chunk, f)


import nltk
import string
from nltk.corpus import stopwords
import spacy
import re
from nltk.stem.porter import *

# Shared spaCy pipeline used by get_lemma() and noun_chunk() in this module.
nlp = spacy.load('en_core_web_lg')

# Stemming / lemmatization helper
def stem_tokens(tokens, stemmer):
    """Return the stem of every token, preserving order.

    :param tokens: iterable of token strings
    :param stemmer: any object exposing a ``stem(token)`` method
    :return: list with ``stemmer.stem(token)`` for each input token
    """
    return [stemmer.stem(item) for item in tokens]


# NOTE: the function below takes a single string as its input.
def ie_process(document):
    """Split a document into sentences and POS-tag every sentence.

    :param document: the text to process, as a single string
    :return: one list per sentence, each being the ``nltk.pos_tag`` output
        for that sentence's word tokens
    """
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences


def get_tokens(document):
    """Lowercase, strip punctuation, tokenize, and drop English stopwords.

    :param document: the text to process, as a single string
    :return: list of remaining word tokens, in their original order
    """
    document = document.lower()
    document = ''.join(c for c in document if c not in string.punctuation)
    tokens = nltk.word_tokenize(document)
    # Build the stopword set once per call; the original re-read the whole
    # stopword list for every token.
    stop_words = set(stopwords.words('english'))
    # Stemming (stem_tokens with a PorterStemmer) is currently disabled.
    return [c for c in tokens if c not in stop_words]


def get_lemma(document):
    """Return the document with every token replaced by its spaCy lemma.

    :param document: the text to process, as a single string
    :return: the lemmas joined by single spaces
    """
    doc = nlp(document)
    return ' '.join(token.lemma_ for token in doc)


def noun_chunk(document):
    """Return the text of every noun chunk spaCy finds in the document.

    :param document: the text to process, as a single string
    :return: list of noun-chunk strings
    """
    doc = nlp(document)
    return [item.text for item in doc.noun_chunks]


def clean_title(document):
    """Split a title on underscores and hyphens.

    :param document: the title string
    :return: list of the pieces between ``_`` / ``-`` separators (empty
        strings are kept where separators are adjacent or at the ends)
    """
    return re.split(r'[_-]', document)






import json
from textblob import TextBlob
sentiment_test = []
noun_phrases_test = []

# Only the JSON load needs the file handle, so close it before processing.
with open('rottentomatoes.json', 'r') as f:
    data_all = json.load(f)

# Use the first record only; the values of its '_critics' mapping are the
# review texts fed to TextBlob below.
data_test = data_all[0]
data_test = data_test['_critics']
data_test = list(data_test.values())

# Collect the polarity score and the noun phrases of every review.
for review in data_test:
    testimonial = TextBlob(review)
    sentiment_test.append(testimonial.sentiment.polarity)
    noun_phrases_test.append(testimonial.noun_phrases)

print(sentiment_test)
print(noun_phrases_test)


In [2]: from textblob import TextBlob
   ...: testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
   ...: print(testimonial.sentiment)
   ...:
Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)

TextBlob 情感分析的结果以元组的形式返回,形如 (polarity, subjectivity)。其中 polarity 是一个范围为 [-1.0, 1.0] 的浮点数,正数表示积极,负数表示消极;subjectivity 是一个范围为 [0.0, 1.0] 的浮点数,其中 0.0 表示客观,1.0 表示主观。





# Load the tokenized critic reviews and consensus texts.
with open('data/cristic_token.json') as f:
    cristic = json.load(f)
with open('data/consensus_token.json') as f:
    consensus = json.load(f)
print(len(consensus))

# For every entry, compute the tokens each review shares with the consensus.
# The original looped over a hard-coded range(3731); pairing the two lists
# directly covers however many entries the files actually contain and cannot
# index past the end of either list.
overlap_total = []
for n, (consensus_tokens, reviews) in enumerate(zip(consensus, cristic)):
    # Build the consensus set once per entry rather than once per review.
    consensus_set = set(consensus_tokens)
    overlap = [list(consensus_set.intersection(set(item))) for item in reviews]
    overlap_total.append(overlap)
    print(n)  # progress indicator




import spacy

nlp = spacy.load('en_core_web_md')  # make sure to use larger model!
tokens = nlp(u'dog cat banana')

# Print the similarity score for every ordered pair of tokens,
# including each token paired with itself.
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))



dog dog 1.0
dog cat 0.80168545
dog banana 0.24327646
cat dog 0.80168545
cat cat 1.0
cat banana 0.2815437
banana dog 0.24327646
banana cat 0.2815437
banana banana 1.0


# NOTE(review): 'en' is a shortcut name rather than a full model package
# name; it presumably only resolves on spaCy v2 installs where a shortcut
# link was created. Newer spaCy versions expect a full package name such as
# 'en_core_web_sm' -- confirm the target spaCy version before relying on it.
nlp = spacy.load('en')




  1. python网络爬虫系列(八)——常见的反爬手段和解决方法

    常见的反爬手段和解决思路 学习目标 了解 服务器反爬的原因 了解 服务器常反什么样的爬虫 了解 反爬虫领域常见的一些概念 了解 反爬的三个方向 了解 常见基于身份识别进行反爬 了解 常见基于爬虫行为进 ...

  2. Linux下进程隐藏的常见手法及侦测手段

    痕迹清理 1.  退出前 history -c 2.  多使用sftp吧 0.0 3.  web日志删除一些 4.  用户目录下很多 history,一言不合就是删 :) 4.  btmp wtmp ...

  3. 【爬虫进阶】常见的反爬手段和解决方法(建议收藏)

    爬虫进阶:常见的反爬手段和解决思路 1 服务器反爬的原因 2 服务器常反什么样的爬虫 3 反爬虫领域常见的一些概念 4 反爬的三个方向 5 常见基于身份识别进行反爬 5.1 通过headers字段来反 ...

  4. 不可不知!4种常见的黑客攻击手段

    在计算机安全方面,黑客是专注于计算机和网络系统安全机制的人.今天给大家揭秘4种常见的计算机攻击手段,让大家更好了解计算机安全知识. 特洛伊木马 一个特洛伊木马是,这似乎是做一件事情,但实际上做一套程序 ...

  5. 安全漏洞防御(9)常见的网站攻击手段及预防措施

    XSS XSS攻击的全称是跨站脚本攻击(Cross Site Scripting),为了不和层叠样式表 (Cascading Style Sheets,CSS)的缩写混淆,故将跨站脚本攻击缩写为XSS ...

  6. 网络安全之几种常见的黑客攻击手段

    常见的黑客攻击手段 常见的攻击手段有:ARP攻击,DoS攻击,DDoS攻击,SYN攻击,缓冲区溢出攻击,等等.下面我将对这几种攻击做个介绍. 1 ARP攻击 ARP(Address Resolutio ...

  7. 真空泵常见故障处理和预防维护手段

    在制造业工厂生产过程中,真空泵是一个至关重要的设备.它的作用是将气体排出设备,从而创造一个真空环境,以确保半导体生产的质量和效率.然而,由于长期使用和不当维护,真空泵可能会出现故障,影响生产效率. 本 ...

  8. 常见的Web攻击手段,拿捏了!

    大家好,我是小菜. 一个希望能够成为 吹着牛X谈架构 的男人!如果你也想成为我想成为的人,不然点个关注做个伴,让小菜不再孤单! 本文主要介绍 互联网中常见的 Web 攻击手段 如有需要,可以参考 如有 ...

  9. 常见的Web攻击手段-整理

    整理常见的Web攻击手段: XSS攻击 CSRF攻击 SQL注入攻击 文件上传漏洞 DDoS攻击 其他攻击手段 XSS攻击 XSS(Cross Site Scripting)跨站脚本攻击,为了不与层叠 ...


  1. Nginx+Tomcat实现反向代理与动静分离
  2. Nature指数2021亚太区排名:7所中国高校挺进前10!看看有没有你的母校?
  3. boost::log模块使用属性关键字的示例
  4. ArcGIS Android工程迁移到其他电脑不能打开的问题
  5. 逆向工程核心原理学习笔记(十一):栈
  6. 表现SOTA!DetCo算法:目标检测无监督对比学习
  7. ps计算机按键.,计算机一级Photoshop视图操作快捷键
  8. 运维测试工作笔记0003---使用Jmeter测试_http接口_高并发测试
  9. 老年人用什么方式存款最好?
  10. Qt——P12 信号连接信号
  11. PyTorch——深度神经网络的写作笔记
  12. bzoj 3514: Codechef MARCH14 GERALD07加强版
  13. 《WebGL编程指南》学习笔记——1.WebGL概述
  14. python 饼图代码_Python中使用Matplotlib画出饼图的代码实例
  15. 千呼万唤始出来,犹抱琵琶半遮面——python变量和数据类型
  16. 什么是微前端及微前端优缺点
  17. 山西医科大学计算机部,山西医科大学医学计算机教学改革探讨.pdf
  18. Error while extracting response for type [] and content type [],json返回值被解析为xml
  19. 安卓手机APP读写高频RFID标签(校园卡)NDEF格式数据设计
  20. 高级创意,单片机电子DIY制作精华资料汇总


  1. 130道Python练习题,涵盖基础内容的方方面面
  2. DB2插入单引号 双引号的问题
  3. ODM 对象文档映射
  4. 详解视频封装格式之MP4
  5. 福晟集团用创新理念引领发展新趋势
  6. 基金经理研究所 | 从兴全合润看谢治宇的攻守道
  7. 树莓派机器视觉环境搭建
  8. 字节跳动校招面试题演练
  9. 案例复盘:从上海首例遗弃犬只案看 如何精准研判舆情争议点
  10. linux qt 俄罗斯方块,使用Qt开发俄罗斯方块游戏