python wechatsougou_使用Python的requests库模拟登陆微信搜狗,爬取100X10篇微信文章并保存到MySQL...
自学的python和爬虫技术。使用到Redis,MySQL数据库;request请求模块;re,Xpath解析模块;dumps, loads序列化和反序列化。还可以配合代理池使用。
爬取的是 https://weixin.sogou.com/。该网站只能通过微信扫码登录,不登录时只能访问前十页;
这里使用 Session 保持会话、配合手动添加的 Cookie 维持登录状态,从而爬取 100 页。
第一次发文,试一试页面效果hhhh,有机会再具体解析思路。
算是我自学的一个阶段性成果,因为我是外行,代码中肯定有些愚蠢的地方,贴出源代码来供大家批判指正。
Python 3.6 环境;运行前请确保已安装相关 Python 库,并配置好 MySQL 与 Redis 数据库。
from requests import Request, ConnectionError, ReadTimeout
from requests import Session
from pickle import dumps, loads
from urllib.parse import urlencode
from lxml import etree
import pymysql
#控制代理池的开启
from run import main
import time
import re
from redis import StrictRedis
# --- Scheduler / retry settings ---
TIMEOUT = 10  # per-request timeout in seconds, stored on each WeixinRequest
MAX_FAILED_TIME = 3  # a request is dropped after failing this many times
# --- Redis connection (backing store for the request queue) ---
REDIS_KEY = 'weixinrequests_liu'  # Redis list key holding serialized WeixinRequests
REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_PASSWORD = None
# --- MySQL connection (article storage) ---
MYSQL_HOST = "localhost"
MYSQL_NAME = 'root'
MYSQL_PASSWORD = '*****'
MYSQL_DATABASE = 'samp_db'
MYSQL_PORT = 3306
# Proxy pool endpoint: first a commercial API; the commented alternative is a
# locally-run proxy pool service.
PROXY_POOL_URL = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=2f80314147e34f04bf5705006d837a86&orderno=YZ2018112477906eltR9&returnType=1&count=10'
# PROXY_POOL_URL = 'http://localhost:5555/random'
KEY_WORDS = "杨超越"  # search keyword submitted to weixin.sogou.com
# Session cookie copied from a logged-in browser; the site only shows pages
# beyond the tenth to logged-in users.
COOKIE = 'SUV=0081272D655E8569592959DA71D95636; SUID=8AB1E7652F20910A00000000595B0BDA; ld=4Zllllllll2bLZA5lllllVsKxm1lllllBqf3vZllll9lllllxllll5@@@@@@@@@@; ABTEST=3|1543134403|v1; IPLOC=CN3100; weixinIndexVisited=1; sct=1; ppinf=5|1543150392|1544359992|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTYlOUQlOEUlRTUlOEQlOUF8Y3J0OjEwOjE1NDMxNTAzOTJ8cmVmbmljazoxODolRTYlOUQlOEUlRTUlOEQlOUF8dXNlcmlkOjQ0Om85dDJsdUgyb3R2dks1bFoydFpTU1Q3MDJWY0VAd2VpeGluLnNvaHUuY29tfA; pprdig=QPtri4HHDEm4Gz9hxTvUj8MO9ymKgOe2EgkwA3uuYG0JIjd1IM8NnWkE6f1vrIt4mlMoC1Nmomb6ntUbGAjANhGeEkJOid0_Yk4g0yBHTA0FQ3_WMsbYhS0SQN_Sbmvj66AVXN93ZhbqAzmflqUoyNdw7YGswhD3tZ7J0xu0i0U; sgid=12-38075013-AVv6mzhP6ltMX5qfqx8JPJc; SUIR=AC6DB982E7ED939ECB44C2A1E85C066B; SNUID=AA6ABE84E0E49B9DDE6C0D11E196941C; JSESSIONID=aaaZoBR-hzstpadzRe7Cw; ppmdig=1543197423000000309d9113ade27bf0e5a2598fd5f1caba'
START_ID = 10  # first search-results page to request
DIC_TIME = 0  # initial value of the discovered-article counter
# Browser-like headers (including the login cookie) applied to the shared Session.
HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': COOKIE,
# 'Host':'weixin.sogou.com',
# NOTE(review): header name looks misspelled — the real header is
# 'Upgrade-Insecure-Requests'. Harmless as sent, but worth confirming/fixing.
'Upgrade-Insure-Request': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
TABLE_NAME = 'seshi3'  # MySQL table that parsed articles are inserted into
MAX_PAGE = 100  # stop paginating once the page counter exceeds this
PROXY_ON = False  # when True, get_proxy() queries PROXY_POOL_URL for a proxy
class MySQL():
    """Thin wrapper over a pymysql connection used to insert article rows."""

    def __init__(self, host=MYSQL_HOST, username=MYSQL_NAME, password=MYSQL_PASSWORD, port=MYSQL_PORT, database=MYSQL_DATABASE):
        """Open the connection.

        On failure the error is printed and the instance is left in a
        disabled state (insert() becomes a no-op) instead of raising, so
        module import still succeeds — same best-effort behaviour as before,
        but without leaving `self.cursor`/`self.db` undefined, which made a
        later insert() crash with an uncaught AttributeError in rollback().
        """
        self.db = None
        self.cursor = None
        try:
            self.db = pymysql.connect(host=host, user=username, password=password, db=database, port=port)
            self.cursor = self.db.cursor()
        except Exception as e:
            print(e)

    def insert(self, table, data):
        """Insert one row into `table`; `data` maps column name -> value.

        Values go through %s placeholders (parameterized, safe for article
        text); the table and column names are interpolated directly and must
        come from trusted configuration.
        """
        if self.cursor is None:
            # Connection never came up; skip quietly rather than raising.
            print('插入失败')
            return
        keys = ','.join(data.keys())
        values = ','.join(len(data) * ['%s'])
        sql_query = 'INSERT INTO {}({}) values ({})'.format(table, keys, values)
        try:
            self.cursor.execute(sql_query, tuple(data.values()))
            self.db.commit()
            print('真的插入成功了')
        except Exception as e:
            print(e, '插入失败')
            self.db.rollback()
class WeixinRequest(Request):
    """A requests.Request enriched with scheduler bookkeeping.

    Besides the usual method/url/headers, each instance carries the parse
    callback to run on its response, whether it should go through a proxy,
    how many times it has already failed, and its timeout.
    """

    def __init__(self, url, callback, method='GET', headers=None, need_proxy=False, fail_time=0, timeout=TIMEOUT):
        super().__init__(method, url, headers)
        self.callback = callback        # parser invoked by Spider.schedule()
        self.need_proxy = need_proxy    # route through the proxy pool if True
        self.fail_time = fail_time      # failure count, compared to MAX_FAILED_TIME
        self.timeout = timeout          # seconds before the request is considered failed
class RedisQueue():
    """FIFO queue of WeixinRequest objects persisted as a Redis list.

    Requests are pickled on the way in and unpickled on the way out, so the
    backlog survives process restarts.
    """

    def __init__(self):
        # Single connection to the Redis server holding the backlog.
        self.db = StrictRedis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)

    def add(self, request):
        """Serialize and enqueue one WeixinRequest; reject anything else."""
        if not isinstance(request, WeixinRequest):
            print("meiyoujiaaa")
            return False
        return self.db.rpush(REDIS_KEY, dumps(request))

    def pop(self):
        """Dequeue and deserialize the oldest request, or False when empty."""
        return loads(self.db.lpop(REDIS_KEY)) if self.db.llen(REDIS_KEY) else False

    def empty(self):
        """True when no requests are waiting."""
        return self.db.llen(REDIS_KEY) == 0
def get_proxy():
    """Fetch one proxy address from the proxy pool API.

    Returns the proxy string (host:port) on success, or None when the pool
    is disabled (PROXY_ON is False), the API returns a non-200 status, or
    the request raises.

    Fixes vs. original: the module only imported names *from* requests, so
    the bare `requests.get` call raised NameError whenever PROXY_ON was
    True; a dangling no-op `False` expression at the end is removed; the
    implicit None returns are made explicit.
    """
    if not PROXY_ON:
        return None
    import requests  # local import: the top of the file never binds the module name
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            print('获得代理', response.text)
            return response.text
        return None
    except Exception as e:
        print(e)
        return None
class Spider():
    """Crawls sogou weixin search results and stores articles in MySQL.

    Work items (WeixinRequest) flow through a Redis-backed queue: start()
    seeds the first search-results page, schedule() pops requests, fetches
    them through the shared logged-in Session, and hands the response to the
    request's callback. parse_index yields article-detail requests plus one
    request for the next results page; parse_detail yields a dict that
    schedule() inserts into MySQL.
    """

    base_url = r'https://weixin.sogou.com/weixin'
    # Search-results URL; format with (keyword, page number). Hoisted here so
    # start() and parse_index() share one copy instead of two duplicates.
    search_url_template = 'https://weixin.sogou.com/weixin?query={}&_sug_type_=&sut=4722&lkt=1%2C1542979381375%2C1542979381375&s_from=input&_sug_=y&type=2&sst0=1542979381478&page={}&ie=utf8&w=01019900&dr=1'
    # Shared across instances: one logged-in HTTP session, one request queue,
    # one MySQL connection — all created at class-definition time.
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def __init__(self, dictime, start_id, table_name, headers, key_words, max_page):
        self.rtime = start_id        # number of the search page currently reached
        self.dictime = dictime       # running count of article links discovered
        self.start_id = start_id     # first search page to request
        self.table_name = table_name # MySQL table articles are written to
        self.headers = headers       # browser-like headers incl. login cookie
        self.key_words = key_words   # search keyword
        self.max_page = max_page     # stop paginating past this page number

    def start(self):
        """Seed the queue with the first search-results page."""
        self.session.headers.update(self.headers)
        start_url = self.search_url_template.format(self.key_words, self.start_id)
        wexin_request = WeixinRequest(start_url, callback=self.parse_index, need_proxy=False)
        self.queue.add(wexin_request)
        print(wexin_request.url, "是开始获取的url")

    def parse_index(self, r):
        """Parse one search-results page.

        Yields a WeixinRequest per article link, then one WeixinRequest for
        the next results page (taken from the page's "sogou_next" link when
        present, otherwise constructed from the URL template).
        """
        html = etree.HTML(r.text)
        for item in html.xpath("//ul[@class='news-list']/li"):
            url = item.xpath(".//h3/a/@href")[0]
            print('parese_index获取到这个微信url(dictime,rtime)', (self.dictime, self.rtime), url)
            self.dictime += 1
            yield WeixinRequest(url=url, callback=self.parse_detail, need_proxy=True, headers=self.headers, method='GET')
        # "sogou_next" anchors the next-page href fragment in the page source.
        next_pattern = re.compile(r'''sogou_next.*?="(.*?)"''')
        matches = next_pattern.findall(r.text)  # hoisted: original ran findall twice
        next_fragment = matches[0] if matches else 0  # renamed: `next` shadowed the builtin
        if next_fragment:
            next_url = self.base_url + next_fragment
            self.rtime += 1
            print('哈哈哈parese_index自动获取到这个下一页url(dictime,rtime)', (self.dictime, self.rtime), next_url)
            yield WeixinRequest(url=next_url, callback=self.parse_index, need_proxy=True, headers=self.headers)
        else:
            next_url = self.search_url_template.format(self.key_words, self.rtime + 1)
            self.rtime += 1
            if self.rtime <= self.max_page:
                print('啊啊啊自动构造这个下一页url(dictime,rtime)', (self.dictime, self.rtime), next_url)
                yield WeixinRequest(url=next_url, callback=self.parse_index, need_proxy=True, headers=self.headers)

    def parse_detail(self, response):
        """Extract title/author/date/content from an article page.

        Yields a single dict ready for MySQL.insert(); missing fields get a
        Chinese "…出错" placeholder instead of raising.
        """
        r = response.text
        date_matches = re.findall(r'publish_time = "(.*?)"', r)
        date = date_matches[0] if date_matches else "日期出错"
        html = etree.HTML(r)
        title_nodes = html.xpath("//h2[@class='rich_media_title']//text()")
        title = title_nodes[0].strip() if title_nodes else "标题出错"
        meta_spans = html.xpath("//div[contains(@id,'meta_content') and @class='rich_media_meta_list']/span")
        lend = len(meta_spans)  # 3 spans => page carries an "original" marker
        wechat_nodes = html.xpath("//div[contains(@id,'meta_content') and @class='rich_media_meta_list']/span[last()]/a/text()")
        wechat = wechat_nodes[0].strip('\n').strip() if wechat_nodes else "wechat出错"
        # NOTE(review): the original regex literal was truncated in this copy
        # of the source (r'''js_name.*?>(.*?) — unterminated). Reconstructed
        # as "text up to the closing </a> of the js_name element", with re.S
        # because the result is stripped of newlines below — confirm against
        # a live article page.
        pattern3 = re.compile(r'''js_name.*?>(.*?)</a>''', re.S)
        nickname_matches = pattern3.findall(r)
        nickname = nickname_matches[0].strip('\n').strip() if nickname_matches else 'nickname出错'
        if lend == 3:
            original_nodes = html.xpath("//div[contains(@id,'meta_content') and @class='rich_media_meta_list']/span[last()-2]/text()")
            original = original_nodes[0].strip('\n').strip() if original_nodes else "original出错"
        else:
            original = ''
        content_nodes = html.xpath("//div[@id='img-content']/div[2]//p//text()")
        content_list = content_nodes if content_nodes else "内容出错"
        neirong = "".join(piece.strip() for piece in content_list)
        data = {
            'title': title,
            'original': original,
            'nickname': nickname,
            'date': date,
            'wechat': wechat,
            'content': neirong
        }
        yield data

    def schedule(self):
        """Main loop: drain the queue, fetching and dispatching each request.

        Failed fetches (no response, bad status, or empty parse results) are
        handed to error() for re-queueing.
        """
        while not self.queue.empty():
            print("开始调度次", "\n", "="*20)
            weixin_request = self.queue.pop()
            print("调度选出了这个url", weixin_request.url, '选出了这个url')
            # Two kinds of request: index pages (yield more requests) and
            # detail pages (yield article dicts).
            callback = weixin_request.callback
            response = self.request(weixin_request)
            print(response, '获得相应*********')
            if hasattr(response, 'status_code'):
                print(response.status_code, '上面url的状态')
                if response and (response.status_code == 200):
                    results = list(callback(response))
                    if results:
                        for result in results:
                            if isinstance(result, WeixinRequest):
                                self.queue.add(result)
                                print('解析结果,搜索页文章列表', result.url)
                            if isinstance(result, dict):
                                self.mysql.insert(table=self.table_name, data=result)
                                print("插入到MYSQL", result)
                    else:
                        self.error(weixin_request)
                else:
                    self.error(weixin_request)
            else:
                # request() returned False (connection error / timeout).
                self.error(weixin_request)

    def error(self, weixin_request):
        """Record one failure and re-queue unless the request failed too often."""
        print("休息二秒")
        weixin_request.fail_time += 1
        print("请求失败", weixin_request.fail_time, "次", weixin_request.url)
        time.sleep(20)  # back off before the request re-enters the queue
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def request(self, weixin_request):
        """Send one prepared request, via a proxy when one is available.

        Returns the Response, or False on ConnectionError/ReadTimeout.
        Fix vs. original: the per-request timeout was stored on
        WeixinRequest but never passed to session.send(); it is now applied.
        """
        try:
            proxy = get_proxy()
            print("代理状态,", proxy)
            if proxy:
                proxies = {
                    'http': 'http://' + proxy,
                    'https': 'https://' + proxy
                }
                return self.session.send(weixin_request.prepare(), proxies=proxies, timeout=weixin_request.timeout)
            time.sleep(3)  # be polite when hitting the site directly
            return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout)
        except (ConnectionError, ReadTimeout) as e:
            print(e)
            return False

    def run(self):
        """Seed the queue, then drain it."""
        self.start()
        self.schedule()
if __name__ == '__main__':
    # Reminder: the MySQL table/content encoding should be utf8mb4 so emoji
    # in article text can be stored.
    spider = Spider(dictime=DIC_TIME, start_id=START_ID, table_name=TABLE_NAME,
                    key_words=KEY_WORDS, headers=HEADERS, max_page=MAX_PAGE)
    spider.run()
    # Crawl summary: pages requested and article links discovered.
    print(spider.rtime, "请求页数")
    print(spider.dictime, "请求篇数")
# MySQL存储结果
# 运行结果
python wechatsougou_使用Python的requests库模拟登陆微信搜狗,爬取100X10篇微信文章并保存到MySQL...相关推荐
- 使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中
import requests import time, random, csv from fake_useragent import UserAgent from bs4 import Beauti ...
- Python爬取URP教务系统课程表并保存到excel
Python爬取URP教务系统课程表并保存到excel 爬取URP教务系统课程表最终结果如图所示: 接下来开始操作: 首先打开教务系统->按F12->点击Network->刷新一下界 ...
- python3爬虫系列03之requests库:根据关键词自动爬取下载百度图片
python3爬虫系列03之requests库:根据关键词自动爬取下载百度图片 1.前言 在上一篇文章urllib使用:根据关键词自动爬取下载百度图片 当中,我们已经分析过了百度图片的搜索URL的变化 ...
- Python爬取中国大学排名,并且保存到excel中
前言 以下文章来源于数据分析和Python ,作者冈坂日川 今天发的是python爬虫爬取中国大学排名,并且保存到excel中,当然这个代码很简单,我用了半小时就写完了,我的整体框架非常清晰,可以直接 ...
- requests库(正则提取)爬取千图网
requests库(正则提取)爬取千图网 首先分析网页结构 打开千图网的网址搜索春节 打开网页源代码,发现跳转链接存在网页源代码里 接下来我们就利用正则表达式去提取 正则表达式最主要的就是找到你想要信 ...
- Python爬虫,30秒爬取500+篇微信文章!太强啦!
引言 由于工作需要,给公司前端做了一个小工具,使用python语言,爬取搜狗微信的微信文章, 从热门到时尚圈,并且包括每个栏目下面的额加载更多内容选项 一共加起来500+篇文章 需求 爬取这些文章获取 ...
- python爬微信群_利用Python爬虫实现30秒爬取500篇微信文章
引言 由于工作需要,给公司前端做了一个小工具,使用python语言,爬取搜狗微信的微信文章,附搜狗微信官方网址 从热门到时尚圈,并且包括每个栏目下面的额加载更多内容选项 一共加起来500+篇文 ...
- 利用python requests库模拟登陆知乎
当初搜模拟登陆的时候在知乎上也找到一些内容. 以下是代码 import requests import time import json import os import re import sys ...
- 利用Python requests库模拟登陆学校教务系统
在研究了一会requests库的实现之后.发现requests的确非常强大.. 几行代码就登陆上了学校的教务系统,但也许是我们学校的教务系统做的太烂了吧23333.动不动就血崩. 下面是代码. imp ...
最新文章
- 先写ppt,再写报告,在做实验是一个不错的方法!
- spark安装须知:SPARK_DIST_CLASSPATH配置
- 二十四、深入Java抽象类,抽象方法和接口
- 6-7 求链表的倒数第m个元素 (25 分)
- javascript 西瓜一期 01.什么是编程 什么是编程语言
- 何恺明随机连接神经网络复现
- python中int对象不可调用_'int'对象在python中不可调用
- Spark Streaming之容错机制以及事务语义
- Android 打造完美的侧滑菜单/侧滑View控件
- 一维搜索斐波那契C语言,斐波那契数列在一维搜索中的应用
- hive报错(1)MoveTask
- OpenGL笔记5 shader 调试信息获取 Debug
- 使用xftp无法连接阿里云服务器 或者linux
- xml建模包括以下_数据挖掘--建模与挖掘的结合
- Redis开发与运维
- 考研计算机专业课961考什么,北航计算机考研(961)经验谈
- 获取本机号码及sim卡信息
- 经常被问到的有深度有内涵的数据结构面试题
- Docker(二十)--Docker k8s--Kubernetes存储--Volumes配置管理
- http状态码301、302、303、307、308区别