小小Python爬虫（0）

#目前问题：爬一会就报“RuntimeError: can't start new thread”错误

#Python的语法有些不太适应，这两天从网上搬了些代码组了个小爬虫，把糗事百科的段子按用户ID分类写入到文件

import urllib.request
import urllib.parse
import time
import os
import threading
import queue
import bs4
from bs4 import BeautifulSoup
import shutil
import errno
import http.client
import urllib.error

# Browser-like request headers. NOTE(review): this dict is not referenced by
# any code in this script (Fetcher installs its own User-agent); it is kept so
# that anything importing `headers` keeps working.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    # If pages still cannot be fetched, set this to the target site's host.
    'Referer': None,
}


class Fetcher:
    """Small thread pool that downloads URLs in the background.

    Requests are queued with :meth:`push`; results are collected with
    :meth:`pop` as ``(request, body_bytes)`` tuples.  Every pushed request
    yields exactly one result: a request that fails is retried once by a
    dedicated retry thread and, if it fails again, is reported with an empty
    body (``b''``) so that consumers never block forever on a lost request.

    The threads are stopped by :meth:`close` (or by leaving a ``with`` block).
    Always close a fetcher when done with it: its threads otherwise live for
    the rest of the process, and creating many fetchers eventually fails with
    ``RuntimeError: can't start new thread``.

    Args:
        threads_num: number of worker threads (one extra retry thread is
            always started in addition).
        timeout: per-request timeout in seconds, or ``None`` to wait
            indefinitely.
    """

    # Sentinel queued to tell a thread to exit its loop.
    _STOP = object()

    def __init__(self, threads_num, timeout=30):
        self.opener = urllib.request.build_opener(urllib.request.HTTPHandler)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
        self.timeout = timeout
        self.lock = threading.Lock()      # guards running / pending / closed
        self.q_req = queue.Queue()        # requests waiting for a worker
        self.q_ans = queue.Queue()        # finished (request, body) results
        self.__q_retry = queue.Queue()    # requests waiting for their retry
        self.threads_num = threads_num + 1  # +1 accounts for the retry thread
        self.running = 0                  # fetches currently in flight
        self.__pending = 0                # pushed requests not yet popped
        self.__closed = False
        self.__threads = []

        # All state above must exist before any thread starts running.
        self.__retry_thread = threading.Thread(target=self.threadretry, daemon=True)
        self.__threads.append(self.__retry_thread)
        self.__retry_thread.start()
        for _ in range(threads_num):
            worker = threading.Thread(target=self.threadget, daemon=True)
            self.__threads.append(worker)
            worker.start()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Stop all threads after they finish the already queued requests.

        Safe to call more than once.  Results that were not popped yet stay
        available through :meth:`pop`.
        """
        with self.lock:
            if self.__closed:
                return
            self.__closed = True
        workers = [t for t in self.__threads if t is not self.__retry_thread]
        for _ in workers:
            self.q_req.put(self._STOP)
        for worker in workers:
            worker.join()
        # Workers are done, so nothing new can reach the retry queue; the
        # sentinel is therefore processed after every outstanding retry.
        self.__q_retry.put(self._STOP)
        self.__retry_thread.join()

    def taskleft(self):
        """Return the number of pushed requests whose result was not popped."""
        with self.lock:
            return self.__pending

    def push(self, req):
        """Queue *req* (a URL string or ``urllib.request.Request``).

        Raises:
            RuntimeError: if the fetcher has already been closed.
        """
        # Enqueue under the lock so a concurrent close() cannot slip its stop
        # sentinels in front of this request.
        with self.lock:
            if self.__closed:
                raise RuntimeError('Fetcher is closed')
            self.__pending += 1
            self.q_req.put(req)

    def pop(self):
        """Block until a result is available and return ``(req, body)``.

        ``body`` is ``b''`` when the request failed even after the retry.
        """
        result = self.q_ans.get()
        with self.lock:
            self.__pending -= 1
        return result

    def _fetch(self, req):
        """Download *req* and return its body, or ``b''`` on any failure."""
        with self.lock:
            self.running += 1
        try:
            with self.opener.open(req, timeout=self.timeout) as response:
                return response.read()
        except urllib.error.URLError as e:
            if hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            elif hasattr(e, 'code'):
                print('The server cannot fulfill the request.')
                print('Reason: ', e.code)
        except (OSError, ValueError, http.client.HTTPException) as e:
            # Timeouts, connection resets, malformed URLs, truncated bodies:
            # none of these may kill the thread.
            print('Request failed: ', e)
        finally:
            with self.lock:
                self.running -= 1
        return b''

    def threadretry(self):
        """Retry-thread loop: give each failed request one more attempt."""
        while True:
            req = self.__q_retry.get()
            try:
                if req is self._STOP:
                    return
                # Always publish a result, even an empty one, so the consumer
                # waiting in pop() is released.
                self.q_ans.put((req, self._fetch(req)))
            finally:
                self.__q_retry.task_done()

    def threadget(self):
        """Worker-thread loop: fetch queued requests, hand failures to retry."""
        while True:
            req = self.q_req.get()
            try:
                if req is self._STOP:
                    return
                ans = self._fetch(req)
                if ans:
                    self.q_ans.put((req, ans))
                else:
                    self.__q_retry.put(req)
            finally:
                self.q_req.task_done()


def create_dir(userid, domain='qiushibaike'):
    """Create the directory ``<domain>/<userid>``, including missing parents.

    An already existing directory is not an error; any other failure is
    printed rather than raised.
    """
    dir_name = domain + '/' + userid
    try:
        os.makedirs(dir_name, exist_ok=True)
    except OSError as e:
        print(str(e))


def userid_exist(userid):
    """Return True if the output directory for *userid* already exists."""
    return os.path.isdir('qiushibaike' + '/' + userid)


def get_file_name(userid):
    """Return today's output file path for *userid* (``.../YYYY-MM-DD.txt``)."""
    current_time = time.strftime("%Y-%m-%d", time.localtime())
    return 'qiushibaike' + '/' + userid + '/' + current_time + '.txt'


def write_file(file, soup):
    """Append every post found in *soup* to the binary file object *file*.

    Each post is written as its link, a CRLF, its text and a blank line,
    encoded as UTF-8.

    Returns:
        The number of posts written.
    """
    count = 0
    for post in soup.find_all("div", class_="content clearfix"):
        link = post.a
        # Skip blocks without a link or without any text.
        if link is None or not link.text:
            continue
        count += 1
        entry = link.get("href", "") + '\r\n' + link.text + "\r\n\r\n"
        file.write(entry.encode("utf-8"))
    return count


def get_max_page(soup):
    """Return the page number shown by the last numeric ``rel="next"`` link.

    Returns 0 when the page has no such link.
    """
    num = 0
    for anchor in soup.find_all('a', rel="next", class_=None):
        try:
            num = int(anchor.text)
        except ValueError:
            # Non-numeric link text (e.g. an arrow label) carries no page number.
            continue
    return num


def _parse_html(body):
    """Decode a downloaded body as UTF-8 (lossily) and parse it."""
    return BeautifulSoup(body.decode('utf-8', errors='replace'), "html.parser")


def store_this_user(userid):
    """Download all posts of *userid* into today's file for that user.

    Users whose directory already exists are skipped.  The directory is only
    created once the profile page was fetched and contains a user name, so a
    failed download does not mark the user as done.
    """
    if userid_exist(userid):
        print("该用户貌似已经检索")
        return

    base_url = 'http://www.qiushibaike.com/users/' + userid
    # The context manager stops the fetcher's threads on every exit path.
    with Fetcher(3) as ff:
        ff.push(base_url)
        req, ans = ff.pop()
        soup = _parse_html(ans)

        user_name = ""
        for span in soup.find_all('span', class_="user_center"):
            user_name = span.text
        if not user_name:
            return

        create_dir(userid)
        with open(get_file_name(userid), 'wb') as file:
            # The profile page doubles as page 1 of the user's posts.
            count = write_file(file, soup)
            print(user_name+" "+str(count)+"条糗事 [http://www.qiushibaike.com/users/"+userid+"/articles/page/1]")

            # Queue the remaining pages, then write them as they arrive.
            for page in range(2, get_max_page(soup) + 1):
                ff.push(base_url + "/articles/page/" + str(page))
            while ff.taskleft():
                req, ans = ff.pop()
                count = write_file(file, _parse_html(ans))
                print(user_name+" "+str(count)+"条糗事 ["+req+"]")


def main():
    """Walk the "new text posts" listing and store every author found."""
    with Fetcher(3) as ff:
        ff.push('http://www.qiushibaike.com/textnew')
        # Ends once a page has no "next page" link (or could not be fetched).
        while ff.taskleft():
            req, ans = ff.pop()
            soup = _parse_html(ans)

            next_link = ""
            for anchor in soup.find_all('a', class_="next", text="下一页"):
                next_link = anchor["href"]
            if next_link:
                # Prefetch the next listing page while this one is processed.
                ff.push("http://www.qiushibaike.com" + next_link)

            for author in soup.find_all('div', class_="author"):
                if author.a is None:
                    continue
                # Author links look like "/users/<id>"; the id is segment 2.
                parts = author.a.get("href", "").split('/')
                if len(parts) > 2 and parts[2]:
                    store_this_user(parts[2])


if __name__ == "__main__":
    main()

小小Python爬虫（0）相关推荐

小小Python爬虫一
今天星期六OJ的蛋疼,于是就找点其他的事情干,忽然在知乎上看到一个帖子说自己找工作各种受挫的,找工作?于是就看到回答的人的签名是招聘员工的,觉得这个是不错的主意.是的,有点想写那么个能爬知乎签名的爬虫 ...
qq纵横四海源码_【0基础】纵横中文网python爬虫实战
原文在此~ [0基础]纵横中文网python爬虫实战mp.weixin.qq.com 大家好,我是你们的机房老哥! 在粉丝群的日常交流中,爬虫是比较常见的话题.python最强大的功能之一也是爬虫. ...
python免费领取视频-最经典Python爬虫全套视频免费领，带你从0开始学爬虫
爬虫,是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本.如果把整个互联网的数据比喻为一座宝藏,那爬虫就是来教大家如何来高效地挖掘这些宝藏.可以说,掌握了爬虫技能,你就成了所有互联网信息公司幕后 ...
快看这里，豆瓣9.0的Python爬虫宝藏书籍，自学爬虫必备~
哈喽~大家好!我是恰恰.今天我们来学些什么呢,那就是爬虫啦!说到学习爬虫,相信很多人都是听过一句话,"爬虫爬的好,监狱进的早"!虽然有点夸张的感觉,但是这也侧面说明,如果学会了爬虫 ...
python爬虫selenium爬不到frame 的tag标记下#document==0的内容解决
python爬虫selenium爬不到frame 的tag标记下#document==0的内容解决前言按理来说,selenium可以获取当前页面的所有源代码,但却爬不到frame 的tag标记下# ...
python 浏览器下载文件_同样一个下载地址，用python爬虫爬取的种子文件大小为0，而用浏览器是可以正常下载下来的？...
1.访问某个网页,用浏览器可以下载其中嵌入的种子文件,种子文件大小是正常的,用迅雷工具也可以正常下载,但是用python爬虫爬取,并且下载下来的数据大小为0? 2.这是我自己写的代码. url = ' ...
python爬虫小白轻松从0到1_如何从0到1，学习Python的流程【小白入门】
Hello World! 写下Hello World是你在学任何编程语言时做的第一件事,除了让你对第一次代码运行感到兴奋,它也会帮你检测目前运行环境是否正常. 我们第一个运行的程序! 重点可读性-- ...
python爬虫小白轻松从0到1_小白学 Python 爬虫（1）：开篇
人生苦短,我用 Python 引言各位同学大家好,好久不见(可能只有一两天没见:囧)~~~ 先讲一件事情,昨天为啥没更新. emmmmmmmmm,当然是因为加班啦,快到年底了,公司项目比较忙,最近的 ...
手把手从0开始学会Python爬虫，从大一初学者视角，带你实现爬虫攥写
提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档文章目录前言一.Python爬虫是什么? 二.使用步骤 1装入所需要的库 2.这里直接给出爬虫通用框架 3.requests库的7 ...

小小Python爬虫（0）

小小Python爬虫（0）相关推荐

最新文章

热门文章