Python3--爬取海词信息

上代码：

#!/usr/bin/python3
import csv
import queue
import random
import threading
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Set to a truthy value by the main script to tell the worker threads to stop.
exitFlag = 0

# Read the CSV file with pandas
def getNames(csvfile):
    """Return the ``EnName`` column of a pipe-delimited CSV file.

    Args:
        csvfile: Path or file-like object handed to ``pandas.read_csv``.

    Returns:
        The ``EnName`` column of the parsed file (a ``pandas.Series``).

    Raises:
        KeyError: If the file has no ``EnName`` column.
    """
    # NOTE: no explicit encoding is passed; the original author flagged the
    # encoding of the input file as an open question.
    data = pd.read_csv(csvfile, delimiter='|')
    return data['EnName']


# Get the IP list
def get_ip_list():
    """Read the proxy addresses stored one per line in ``ip.txt``.

    The file is looked up relative to the current working directory.

    Returns:
        list[str]: Every non-blank line of ``ip.txt``, unmodified (a line
        keeps its trailing newline if it had one). Blank lines are skipped
        so they cannot be picked later as an empty proxy address.

    Raises:
        OSError: If ``ip.txt`` cannot be opened.
    """
    # ``with`` guarantees the handle is closed even if reading fails.
    with open('ip.txt', 'r') as f:
        return [line for line in f if line.strip()]


# Pick a random IP from the IP list
def get_random_ip(ip_list):
    """Pick a random proxy from ``ip_list`` and build a requests proxy map.

    Args:
        ip_list: Non-empty sequence of proxy address strings; surrounding
            whitespace (including ``\\r\\n`` line endings) is stripped.

    Returns:
        dict: ``{'http': proxy, 'https': proxy}``. Both schemes are mapped
        because requests selects the proxy by the scheme of the requested
        URL; with only an ``'https'`` entry, plain ``http://`` requests
        would bypass the proxy entirely.

    Raises:
        IndexError: If ``ip_list`` is empty.
    """
    proxy_ip = random.choice(ip_list).strip()
    return {'http': proxy_ip, 'https': proxy_ip}


# Purpose: write a row of information to the output file
def write_file(filePath, row):
    """Append one row to a pipe-delimited, UTF-8 encoded CSV file.

    Args:
        filePath: Path of the CSV file; it is created if it does not exist.
        row: Iterable of field values for the new row.
    """
    with open(filePath, 'a+', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(row)


def get_content(url, ip_list):
    """Fetch ``url`` through a random proxy and return its ``div.mbox`` nodes.

    Up to three attempts are made, each with a freshly chosen proxy and
    User-Agent and a longer pause/timeout than the previous one.

    Args:
        url: Address of the page to download.
        ip_list: Proxy addresses handed to ``get_random_ip``.

    Returns:
        tuple: ``(status_code, content)`` where ``content`` is the result of
        ``soup.find_all('div', class_='mbox')`` on the downloaded page.

    Raises:
        Exception: Whatever the third attempt raises; earlier failures are
        retried.
    """
    # (message printed before the attempt, pause in seconds, request timeout)
    # The last attempt originally had no timeout at all and could hang a
    # worker forever; it now gives up after 60 seconds.
    attempts = (
        (None, 1, 20),
        ("重新运行", 10, 40),
        ("第二次重新运行", 15, 60),
    )
    last_index = len(attempts) - 1
    for index, (message, pause, timeout) in enumerate(attempts):
        if message is not None:
            print(message)
        time.sleep(pause)
        try:
            proxies = get_random_ip(ip_list)
            headers = {'User-Agent': str(UserAgent().random)}
            req = requests.get(url=url, proxies=proxies, headers=headers,
                               timeout=timeout)
            break
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not swallowed by the retry logic.
        except Exception:
            if index == last_index:
                raise

    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')
    content = soup.find_all('div', class_='mbox')
    return req.status_code, content


# Extract the exact English name, Chinese name, meaning, origin, gender, etc.
def get_infor_header(content):
    """Extract the name header fields from one parsed page section.

    The ``<span>`` elements inside ``content`` are read positionally:
    0 = English name, 1 = Chinese name, 2 = meaning, 3 = origin, and the
    ``title`` attribute of the ``<em>`` inside span 4 = gender.

    Args:
        content: Parsed node exposing ``find_all`` (e.g. a BeautifulSoup tag).

    Returns:
        list[str]: ``[EnName, CnName, Gender, Source, Meaning]``. Any field
        whose span (or ``<em>``/``title``) is missing is an empty string, so
        a page with an unexpected number of spans no longer raises.
    """
    spans = content.find_all('span')

    def text_at(index):
        # Missing spans yield '' instead of an IndexError.
        return spans[index].get_text() if index < len(spans) else ''

    gender = ''
    if len(spans) > 4:
        em = spans[4].em
        if em is not None:
            gender = em.get('title') or ''

    # Output column order: EnName|CnName|Gender|Source|Meaning
    return [text_at(0), text_at(1), gender, text_at(3), text_at(2)]


# Get the celebrities associated with the English name
def get_infor_celebrity(content):
    """Collect the celebrity names listed in one parsed page section.

    Args:
        content: Parsed node exposing ``find_all`` (e.g. a BeautifulSoup tag).

    Returns:
        list[str]: A one-element list holding the text of every ``<li>``
        joined with ``'@'`` (an empty string when there are no items).
    """
    return ['@'.join(item.get_text() for item in content.find_all('li'))]


class myThread(threading.Thread):
    """Worker thread that consumes names from a queue and crawls each one."""

    def __init__(self, threadID, name, q, ip_list):
        """Store the worker's id, thread name, work queue and proxy list."""
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.q = q
        self.ip_list = ip_list

    def run(self):
        """Process queue items until the module-level ``exitFlag`` is set."""
        print("开启线程：" + self.name)
        # Use this thread's own proxy list rather than a module-level global.
        process_data(self.name, self.q, self.ip_list)
        print("退出线程：" + self.name)


def process_data(threadName, q, ip_list):
    """Crawl names taken from ``q`` until the global ``exitFlag`` is truthy.

    For every name the page ``http://ename.dict.cn/<name>`` is downloaded;
    on HTTP 200 the header and celebrity fields are appended as one row to
    ``haici_infor.csv``. A failure on one name is reported and the worker
    moves on to the next name instead of dying.

    Args:
        threadName: Label used in progress messages.
        q: ``queue.Queue`` of names to process.
        ip_list: Proxy addresses forwarded to ``get_content``.
    """
    while not exitFlag:
        # ``Queue`` is thread-safe, so a non-blocking get replaces the
        # lock / empty() / get() sequence (which also consulted a global
        # queue instead of ``q``).
        try:
            data = q.get_nowait()
        except queue.Empty:
            time.sleep(1)
            continue

        print("%s processing %s" % (threadName, data))
        url = 'http://ename.dict.cn/{}'.format(data)
        try:
            status_code, content = get_content(url, ip_list)
            if status_code != 200:
                continue
            # content[0]: name header section; content[1]: celebrity section
            row = get_infor_header(content[0]) + get_infor_celebrity(content[1])
            # Serialize file appends; ``with`` releases the lock on error too.
            with queueLock:
                write_file('haici_infor.csv', row)
        except Exception as exc:
            print("%s failed on %s: %r" % (threadName, data, exc))


threadList = ["Thread-%d" % number for number in range(1, 11)]
# Main script: crawl every name listed in A-Z.csv using a pool of threads.
nameList = getNames('A-Z.csv')
queueLock = threading.Lock()
workQueue = queue.Queue(100000)
threads = []
threadID = 1

# Create and start the worker threads
ip_list = get_ip_list()
for tName in threadList:
    thread = myThread(threadID, tName, workQueue, ip_list)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue. ``queue.Queue`` is thread-safe, so no lock is held here:
# holding ``queueLock`` while ``put`` blocks on a full queue would keep the
# workers from ever draining it.
for word in nameList:
    workQueue.put(word)

# Wait for the queue to drain, sleeping instead of spinning so the waiting
# main thread does not compete with the workers for CPU time.
while not workQueue.empty():
    time.sleep(0.5)

# Tell the threads it is time to exit
exitFlag = 1

# Wait for all threads to finish
for t in threads:
    t.join()
print("退出主线程")

Python3--爬取海词信息相关推荐

Beautiful爬取海词网词汇意思和短语
Beautiful爬取海词网词汇意思和短语直接上代码 # -*- encoding:utf-8 -*- import urllib.request # 导入urllib库的request模块 fro ...
Python3 爬取豆瓣电影信息
原文链接: Python3 爬取豆瓣电影信息上一篇: python3 爬取电影信息下一篇: neo4j 查询豆瓣api https://developers.douban.com/wiki/?t ...
python3爬取豆瓣电影信息，图片，有源码（使用简单）
首先下载安装python3安装教程在控制台(Windows按 win+R)下载python插件: python -m pip install --upgrade pip # 更新 pip insta ...
python3爬取微信通讯录信息并保存头像
安装 pip3 install itchat 代码 # -*- coding: utf-8 -*- import itchat#用于二维码登录微信, itchat.auto_login() #获取通讯 ...
Python爬取12306车票信息
Python3爬取12306车票信息第一次写爬虫,咱从入门级--12306车票爬取开始我们要爬取的信息是https://www.12306.cn/index/上的车票信息当我们选择出发地和目的 ...
Python3 爬取携程网[2]: 爬取北京五星级酒店详细信息
目录 1. 需求分析 2. 实验环境 3. 具体实现 3.1 分析页面 3.2 请求 3.3 响应 3.3.1 BeautifulSoup提取标签信息 3.3.2 正则表达式提取字符串信息 3.3.3 ...
Python2 Python3 爬取赶集网租房信息,带源码分析
*之前偶然看了某个腾讯公开课的视频,写的爬取赶集网的租房信息,这几天突然想起来,于是自己分析了一下赶集网的信息,然后自己写了一遍,写完又用用Python3重写了一遍.之中也遇见了少许的坑.记一下.算是 ...
Python3爬取网页信息乱码怎么解决？（更新：已解决）
更新:乱码问题已经解决了. 将下面代码中的红色部分改为下面这样就不会出现个别职位信息乱码的情况了. soup2 = BeautifulSoup(wbdata2, 'html.parser',from_ ...
Selenium+Python3爬取微博我发出的评论信息
Selenium+Python3爬取微博我发出的评论信息需求代码注: 需求记录对话信息:对话文本.时间.用户.被回复链接.被回复用户.被回复文本. 将数据信息持久化保存,可选择截图. 代码 # ...

Python3--爬取海词信息

Python3--爬取海词信息相关推荐

最新文章

热门文章