Python 微信爬虫完整实例【单线程与多线程】

本文实例讲述了Python 实现的微信爬虫。分享给大家供大家参考，具体如下：

单线程版：

import urllib.request
import urllib.parse
import urllib.error
import re,time
# Collected article links; filled in by get_url() and consumed by the main script.
list_url = []

# Browser-like User-Agent header. It is attached to an opener that is installed
# process-wide, so plain urllib.request.urlopen() calls send it as well.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/60.0.3107.4 Safari/537.36",
)
operner = urllib.request.build_opener()
operner.addheaders = [headers]
urllib.request.install_opener(operner)
### Fetch the content of a URL (the proxy variant is not enabled)
def use_proxy(url, timeout=30):
    """Download *url* and return its body decoded as UTF-8.

    Despite the name, no proxy is used: the request goes through whatever
    opener is currently installed. To route through a proxy, install an
    opener built with ``urllib.request.ProxyHandler`` before calling this.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The decoded page text, or ``None`` when the request fails. Failures
        are printed rather than raised, so callers must check for ``None``.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36"
    )
    try:
        # Per-request header instead of re-installing a global opener on
        # every call (which mutates process-wide state).
        request = urllib.request.Request(url, headers={"User-Agent": user_agent})
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception" + str(e))
        # Brief pause before the caller moves on to the next request.
        time.sleep(1)
    return None
## Collect the article URLs to crawl
def get_url(key, pagestart, pageend):
    """Gather article links from the search-result pages for *key*.

    Each result page from ``pagestart`` to ``pageend`` (inclusive) is fetched
    with ``use_proxy`` and every link found is appended to the module-level
    ``list_url``.

    Args:
        key: Search keyword; it is URL-quoted before use.
        pagestart: First result page to fetch.
        pageend: Last result page to fetch (inclusive).

    Returns:
        The module-level ``list_url`` list, including any links appended by
        this call.
    """
    listurl_pattern = re.compile('<h3>.*?("http://.*?)</h3>', re.S)
    try:
        keycode = urllib.parse.quote(key)
        for page in range(pagestart, pageend + 1):
            # The page number belongs in `page=`; the original template put
            # it into `type=` and always requested page 1.
            url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (keycode, page)
            data1 = use_proxy(url)
            if not data1:
                # The fetch failed (already reported); skip this page.
                continue
            for match in listurl_pattern.findall(data1):
                # Undo the HTML-escaped ampersands, keep only the href value
                # (text before the first space) and drop the quotes.
                res = match.replace("amp;", "").split(" ")[0].replace("\"", "")
                list_url.append(res)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception:", e)
    return list_url
## Download every collected article and write them all into one HTML file
def get_url_content(list_url, output_path="D:\\python-script\\1.html"):
    """Fetch each article in *list_url* and save title + body to one HTML file.

    The file is (re)created, an HTML header is written, then the title and
    content of every article that could be fetched, then the closing tags.

    Args:
        list_url: Iterable of article URLs.
        output_path: Destination file; defaults to the original fixed path.
    """
    # Non-greedy, attribute-tolerant title match. The previous greedy
    # pattern ran from the first <h2 to the last </h2> on the page.
    title_re = re.compile(r'<h2[^>]*>(.*?)</h2>', re.S)
    content_re = re.compile('id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">', re.S)
    # Namespace typo "xhmtl" corrected to "xhtml".
    html1 = (
        '<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n'
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
        '<title>微信文章</title></head>\n<body>'
    )
    # A single handle, closed by the context manager even if a fetch raises.
    with open(output_path, 'wb') as fh:
        fh.write(html1.encode("utf-8"))
        for url in list_url:
            data_content = use_proxy(url)
            if not data_content:
                # Download failed; nothing to extract for this URL.
                continue
            title_match = title_re.search(data_content)
            res_title = title_match.group(1).strip() if title_match else ""
            try:
                # Encode everything first so a failure cannot leave a
                # half-written article in the file.
                chunks = [res_title.encode("utf-8")]
                chunks.extend(
                    part.strip().encode("utf-8")
                    for part in content_re.findall(data_content)
                )
            except UnicodeEncodeError:
                continue
            fh.write(b"".join(chunks))
        fh.write("</body></html>".encode("utf-8"))
if __name__ == '__main__':
    # Collect links from result pages 1-2 for the keyword, then download
    # the articles they point to.
    key, pagestart, pageend = "人工智能", 1, 2
    get_url(key, pagestart, pageend)
    get_url_content(list_url)

多线程版：

import urllib.request
import urllib.parse
import urllib.error
import re,time
import queue
import threading
# Shared state: the queue carries article links between threads; list_url is
# kept for the list-based variant.
urlque = queue.Queue()
list_url = []

# Browser-like User-Agent header. It is attached to an opener that is installed
# process-wide, so plain urllib.request.urlopen() calls send it as well.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/60.0.3107.4 Safari/537.36",
)
operner = urllib.request.build_opener()
operner.addheaders = [headers]
urllib.request.install_opener(operner)
### Fetch the content of a URL (the proxy variant is not enabled)
def use_proxy(url, timeout=30):
    """Download *url* and return its body decoded as UTF-8.

    Despite the name, no proxy is used: the request goes through whatever
    opener is currently installed. To route through a proxy, install an
    opener built with ``urllib.request.ProxyHandler`` before calling this.

    This function is called from several threads, so it does not touch the
    process-wide opener; the User-Agent is set on the request itself.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The decoded page text, or ``None`` when the request fails. Failures
        are printed rather than raised, so callers must check for ``None``.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36"
    )
    try:
        request = urllib.request.Request(url, headers={"User-Agent": user_agent})
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception" + str(e))
        # Brief pause before the caller moves on to the next request.
        time.sleep(1)
    return None
### Collect article links and put them on the queue
class get_url(threading.Thread):
    """Producer thread: finds article links and puts them on a queue.

    Attributes are set from the constructor arguments:
        key: search keyword (URL-quoted when the thread runs).
        pagestart / pageend: inclusive range of result pages to fetch.
        urlque: queue that receives one entry per article link.
    """

    def __init__(self, key, pagestart, pageend, urlque):
        threading.Thread.__init__(self)
        self.pagestart = pagestart
        self.pageend = pageend
        self.key = key
        self.urlque = urlque

    def run(self):
        """Fetch each result page and enqueue the links found on it.

        Stops early when a page yields no links. Errors are printed, not
        raised, so the thread always ends quietly.
        """
        listurl_pattern = re.compile('<h3>.*?("http://.*?)</h3>', re.S)
        try:
            keycode = urllib.parse.quote(self.key)
            for page in range(self.pagestart, self.pageend + 1):
                # The page number belongs in `page=`; the original template
                # put it into `type=` and always requested page 1.
                url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (keycode, page)
                data = use_proxy(url)
                # A failed fetch yields None; treat it as "no links".
                result = listurl_pattern.findall(data) if data else []
                print(result)
                if not result:
                    print("没有可用的url")
                    # Ends this thread only. The former sys.exit() relied on
                    # a module that was never imported.
                    return
                for link in result:
                    # Undo the HTML-escaped ampersands, keep only the href
                    # value (text before the first space), drop the quotes.
                    res = link.replace("amp;", "").split(" ")[0].replace("\"", "")
                    # task_done() is deliberately not called here: it is for
                    # the consumer to call after it has processed an item.
                    self.urlque.put(res)
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            elif hasattr(e, "reason"):
                print(e.reason)
        except Exception as e:
            print("exception:", e)
## Fetch the article content for each queued URL
class get_url_content(threading.Thread):
    """Consumer thread: downloads queued articles into one HTML file.

    Args:
        urlque: queue of article URLs to download.
        output_path: destination file; defaults to the original fixed path.
        idle_timeout: seconds to wait for a new URL before treating the
            crawl as finished and closing the file.
    """

    def __init__(self, urlque, output_path="D:\\python-script\\1.html", idle_timeout=30):
        threading.Thread.__init__(self)
        self.urlque = urlque
        self.output_path = output_path
        self.idle_timeout = idle_timeout

    def run(self):
        """Write the HTML header, every fetched article, then the footer."""
        # Non-greedy, attribute-tolerant title match. The previous greedy
        # pattern ran from the first <h2 to the last </h2> on the page.
        title_re = re.compile(r'<h2[^>]*>(.*?)</h2>', re.S)
        content_re = re.compile('id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">', re.S)
        # Namespace typo "xhmtl" corrected to "xhtml".
        html1 = (
            '<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n'
            '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
            '<title>微信文章</title></head>\n<body>'
        )
        with open(self.output_path, 'wb') as fh:
            fh.write(html1.encode("utf-8"))
            while True:
                try:
                    # A bounded wait lets the thread finish. An unbounded
                    # get() blocked forever, so the file was never closed.
                    url = self.urlque.get(timeout=self.idle_timeout)
                except queue.Empty:
                    break
                data_content = use_proxy(url)
                if not data_content:
                    # Download failed; nothing to extract for this URL.
                    continue
                title_match = title_re.search(data_content)
                res_title = title_match.group(1).strip() if title_match else ""
                try:
                    # Encode everything first so a failure cannot leave a
                    # half-written article in the file.
                    chunks = [res_title.encode("utf-8")]
                    chunks.extend(
                        part.strip().encode("utf-8")
                        for part in content_re.findall(data_content)
                    )
                except UnicodeEncodeError:
                    continue
                fh.write(b"".join(chunks))
            fh.write("</body></html>".encode("utf-8"))
class contrl(threading.Thread):
    """Monitor thread: reports progress until the URL queue has drained.

    Args:
        urlqueue: the queue being watched.
        poll_interval: seconds to sleep between checks.
    """

    def __init__(self, urlqueue, poll_interval=3):
        # The constructor only stores state. The polling loop lives in
        # run(), so creating the object returns at once and start() runs
        # the loop in its own thread.
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue
        self.poll_interval = poll_interval

    def run(self):
        """Print a status line per poll; finish when the queue stays empty.

        The queue must be empty both before and after one poll interval,
        which gives a producer that has not queued anything yet a chance to
        do so. Returning ends only this thread, not the whole program.
        """
        while True:
            print("程序正在执行")
            was_empty = self.urlqueue.empty()
            # Sleep on every pass so a non-empty queue does not make the
            # loop spin at full speed.
            time.sleep(self.poll_interval)
            if was_empty and self.urlqueue.empty():
                print("程序执行完毕")
                return
if __name__ == '__main__':
    # Search keyword and the inclusive range of result pages to crawl.
    pagestart = 1
    pageend = 2
    key = "人工智能"
    # The instance gets its own name: rebinding `get_url` to an instance
    # would shadow the class and prevent creating another producer.
    url_producer = get_url(key, pagestart, pageend, urlque)
    url_producer.start()
    # Consumer: downloads the queued articles.
    get_content = get_url_content(urlque)
    get_content.start()
    # Monitor: reports progress on the queue.
    cntrol = contrl(urlque)
    cntrol.start()

内容就是以上这么多。最后给大家推荐一个口碑不错的公众号【程序员学府】，这里有很多老前辈分享的学习技巧、学习心得、面试技巧、职场经历等，更为大家精心准备了零基础入门资料和实战项目资料，每天都有程序员定时讲解Python技术，分享一些学习的方法和需要留意的小细节。

Python 微信爬虫完整实例【单线程与多线程】相关推荐

Python开发爬虫完整代码解析
Python开发爬虫完整代码解析移除python 三天时间,总算开发完了.说道爬虫,我觉得有几个东西需要特别注意,一个是队列,告诉程序,有哪些url要爬,第二个就是爬页面,肯定有元素缺失的,这个究 ...
python爬虫实例教程-python动态爬虫的实例分享
本文主要和大家分享python动态爬虫的实例分享,用Python实现常规的静态网页抓取时,往往是用urllib2来获取整个HTML页面,然后从HTML文件中逐字查找对应的关键字.如下所示:#encod ...
python爬虫完整实例-python爬虫实战之爬取京东商城实例教程
前言本文主要介绍的是利用python爬取京东商城的方法,文中介绍的非常详细,下面话不多说了,来看看详细的介绍吧. 主要工具 scrapy BeautifulSoup requests 分析步骤 1. ...
python爬虫完整实例-Python爬虫实例
基本GET请求1. 最基本的GET请求可以直接用get方法 response = requests.get("http://www.baidu.com/") 2. 添加 heade ...
python爬虫完整实例-python爬虫实例项目大全
WechatSogou [1]- 微信公众号爬虫.基于搜狗微信搜索的微信公众号爬虫接口,可以扩展成基于搜狗搜索的爬虫,返回结果是列表,每一项均是公众号具体信息字典. DouBanSpider [2]- ...
python 微信爬虫_python3简单实现微信爬虫
使用ghost.py 通过搜搜的微信搜索来爬取微信公共账号的信息 # -*- coding: utf-8 -*- import sys reload(sys) import datetime imp ...
python中国大学排名爬虫写明详细步骤-python网络爬虫入门实例：中国大学排名定向爬虫...
中国大学排名定向爬虫的设计和实现一.环境安装: 1.选择一个适合自己的IDE(以下代码用Jupyter Notebook编写) 2.打开cmd,安装requests库和beautifulsoup4 ...
python微信小程序实例制作入门_python flask零基础打造微信小程序实战教程
资源目录: ├─python3+flask │ ├─第1章介绍 │ └─1-1 导学--Python Flask 构建微信小程序.mp4 │ ├─第2章微信小程序介绍 │ ├─2-1 小程序是什么 ...
python微信小程序实例_python+Mysql写微信小程序后台
python比较简单,学了用处比较多,所以推荐写微信小程序的后台. (php.java等做后台太复杂了,学起来费劲) [0--假设] 1.Python开发环境已经搭好了,我这边喜欢用VScode. 2 ...

Python 微信爬虫完整实例【单线程与多线程】

Python 微信爬虫完整实例【单线程与多线程】相关推荐

最新文章

热门文章