python爬取今日头条_爬取今日头条街拍图片

参考于崔庆才的Python爬虫教程，但是崔的视频时间过长，今日头条网站反爬虫也有了变化，因此写下此文章来记录自己的爬取过程遇到的问题，也给大家一些借鉴。欢迎大家讨论。

一、获取索引页。

我们会发现doc下服务器给出的response里面全是些js代码，没有我们想要的二级页面链接。

然后查看XHR下的请求，在preview中会发现我们要的数据全在这里面，它是以一个JSON对象的形式存放的，由此可知页面是由Ajax动态渲染的（一般下滑才加载出数据的页面，基本都是Ajax动态渲染的）。再看它的请求查询参数和请求头，参数虽然很多，不过没什么问题，直接复制过来即可。

def get_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter (paging).
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200; ``None`` for any other status or when the request fails.
    """
    para = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): timestamp and _signature are fixed values copied from
        # a browser request; presumably they expire -- confirm. The spaces
        # around '-' in the signature look like a formatting artifact; verify.
        'timestamp': '1593665684775',
        '_signature': '7' +
                      'yD7.AAgEBApd0Jxcjcfwe8huuAALHovmaBcff719uWpg6PhnCCgTgbuUckg1kLI3dQFdQZ1b3VwbtvV9P3ZaGHpjzTDdgJm.hxt6TELcPiJAsAOBYizC - 15.PpPHKolFtN'
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        # NOTE(review): hard-coded session cookie copied from a browser;
        # presumably it has to be refreshed when it expires -- confirm.
        'cookie': 'tt_webid=6844895200352765453; ttcid=147baf1d0de44bcfaa3aaccb43de319231; csrftoken=e72569f7c754e03b930bbb54f34b7759; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6845096485504583176; s_v_web_id=verify_kc5ojvq4_iZKfsEzB_AubR_44PZ_AgkE_hvFA5OJnJ4nc; SLARDAR_WEB_ID=ba4439ab-f467-42a9-9dcd-8be836757cc3; __tasessionId=mbg1eigaz1593748245904; tt_webid=6845096485504583176; __ac_nonce=05efeab2d00f5555e228f; __ac_signature=_02B4Z6wo00f01JctsQwAAIBDjnNXOcxFq8yXKbWAAHs9kTU3N2TXnkkV75OXvSXoT5pfhWopyS8N2zbrE1VOQlMWB8pFBr0ZvUmL9qtI0GvgW6d9cv9TeAEXgf1HZ3bDCooMMUBc9Rq-r53241; tt_scid=8TNWiPBmsnVRkGGZJ5frIVGKsH4gMngyY0s4KQWdyckfgNuI45yMdiqvIa1sudDF15e8'
    }
    # urlencode turns the dict into the query string of the URL.
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(para)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('索引页请求失败')
        return None
    if response.status_code != 200:
        return None
    print('索引页请求成功')
    response.encoding = 'UTF-8'
    return response.text

二、解析索引页数据。

由于索引页接口返回的是JSON数据，一般的HTML解析库并不适用，所以我们用json.loads把它解析出来，然后取出各条结果的Url与Title。这里要注意，提取出来的url虽然可以供selenium使用，但并不是规范的文章url，我们需要把其中的group/换成a，拼接成一个新的url，这个url才是我们要的。（不做替换的话，带group的url虽然可以在浏览器和selenium中访问，但是用requests是请求不到的。）

def parse_index_json(html):
    """Yield ``(article_url, title)`` pairs from the index-page JSON text.

    Args:
        html: JSON text of the search response, or ``None``/empty when the
            index request failed.

    Yields:
        One tuple per result item that has an ``abstract`` key: the item's
        ``article_url`` with ``group/`` rewritten to ``a``, and the text
        found under ``display -> title -> text``. Either element is ``None``
        when the corresponding field is missing.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # The server answered with something that is not JSON.
        print('索引页解析失败')
        return
    if not isinstance(data, dict):
        return
    # 'data' may be absent or null; treat both as "no results".
    for item in data.get('data') or []:
        if not item or 'abstract' not in item:
            continue
        article_url = item.get('article_url')
        if article_url:
            # .../group/<id>/ -> .../a<id>/
            article_url = article_url.replace('group/', 'a')
        display = item.get('display') or {}
        title = (display.get('title') or {}).get('text')
        yield article_url, title

三、获取详情页。

这个页面我花了整整一天时间才获取到。在网上看别人爬取时也都是使用requests，但是我用requests请求它时，不管如何配置headers，就是请求不到数据（我这里的情况是，程序执行到requests.get()会卡一会然后报错，总之请求不到html），请求几十次可能才有一个url能请求到。我感觉可能是被它的反爬机制检测到了，于是改用selenium去请求数据（不能使用headless Chrome模式，不然返回的还是None），果然成功了。（不过这种方法比较慢，不知道有什么更好的方法，大家如果有请分享一下哈）。

补充：评论里指出我的url没有把group替换掉。我修改之后，用requests还是请求不到数据，不知为何。

def get_detail(url):
    """Load *url* in a Chrome WebDriver session and return the page source.

    Args:
        url: Address of the article detail page.

    Returns:
        The HTML of the loaded page, or ``None`` when the page could not be
        loaded.
    """
    # Imported here so this block does not rely on a file-level import.
    # Selenium reports failures through WebDriverException.
    from selenium.common.exceptions import WebDriverException

    driver = webdriver.Chrome()  # start the browser driver
    try:
        driver.get(url)
        return driver.page_source
    except WebDriverException:
        print('详情页请求失败')
        return None
    finally:
        # Always shut the browser down, including on failure; quit() ends
        # the whole session rather than only closing the current window.
        driver.quit()

四、解析详情页数据。

还是打开doc下的response查看网页代码，我们发现要的数据就在其中的js代码里。虽然经过了转义，但是它的后缀跟图片url的后缀是一样的，于是对它使用正则表达式提取出来再解码，即可得到图片的url。

def parse_detail(html):
    """Extract the image URLs embedded in a detail page.

    The function looks for the first ``JSON.parse("...")`` call in the page,
    undoes the string escaping and reads the ``sub_images`` list from the
    decoded JSON object.

    Args:
        html: Page source of the detail page, or ``None``/empty when the
            page could not be fetched.

    Returns:
        A list with the ``url`` value of every ``sub_images`` entry, or
        ``None`` when the page has no usable ``JSON.parse`` payload or the
        payload has no ``sub_images`` key.
    """
    if not html:
        return None
    match = re.search(r'JSON\.parse\("(.*?)"\),', html, re.S)
    if match is None:
        return None
    try:
        # Undo the backslash escapes (e.g. \\u002F). Encoding as latin-1
        # with backslashreplace keeps non-ASCII text intact, whereas
        # utf-8 + unicode-escape would turn it into mojibake.
        text = match.group(1).encode('latin-1', 'backslashreplace').decode('unicode-escape')
        data = json.loads(text)
    except ValueError:
        # Covers both a malformed escape sequence and invalid JSON.
        return None
    # Skip pages whose payload carries no image list.
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    return [item.get('url') for item in data.get('sub_images') or []]

五、获取图片页。

直接使用requests请求即可。

def download_image(img_url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        img_url: Address of the image.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        Always ``None``; the image is stored as a side effect when the
        server answers with HTTP 200.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        # The original value 'keep - alive' (with spaces) is not a valid
        # Connection token.
        'Connection': 'keep-alive',
    }
    try:
        response = requests.get(img_url, headers=headers, timeout=timeout)
    except RequestException:
        print('图片请求失败')
        return None
    if response.status_code == 200:
        # response.content is the raw binary body, which is what an image is.
        save_image(response.content)
    return None

六、下载图片，将图片信息保存至MongoDB，不再叙述。

以下是完整代码：

import os

from hashlib import md5

import requests

from requests.exceptions import RequestException

from urllib.parse import urlencode

import json

import re

from selenium import webdriver

import pymongo

from config import *

# Module-level MongoDB handles created at import time. MONGO_URL and MONGO_DB
# are expected to come from the star import of config.
client = pymongo.MongoClient(MONGO_URL)

# Database object that save_to_mongo indexes by collection name.
db = client[MONGO_DB]

def get_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter (paging).
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200; ``None`` for any other status or when the request fails.
    """
    para = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): timestamp and _signature are fixed values copied from
        # a browser request; presumably they expire -- confirm. The spaces
        # around '-' in the signature look like a formatting artifact; verify.
        'timestamp': '1593665684775',
        '_signature': '7' +
                      'yD7.AAgEBApd0Jxcjcfwe8huuAALHovmaBcff719uWpg6PhnCCgTgbuUckg1kLI3dQFdQZ1b3VwbtvV9P3ZaGHpjzTDdgJm.hxt6TELcPiJAsAOBYizC - 15.PpPHKolFtN'
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    }
    # urlencode turns the dict into the query string of the URL.
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(para)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('索引页请求失败')
        return None
    if response.status_code != 200:
        return None
    print('索引页请求成功')
    response.encoding = 'UTF-8'
    return response.text

def parse_index_json(html):
    """Yield ``(article_url, title)`` pairs from the index-page JSON text.

    Args:
        html: JSON text of the search response, or ``None``/empty when the
            index request failed.

    Yields:
        One tuple per result item that has an ``abstract`` key: the item's
        ``article_url`` with ``group/`` rewritten to ``a`` (the URL form the
        accompanying article asks for), and the text found under
        ``display -> title -> text``. Either element is ``None`` when the
        corresponding field is missing.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # The server answered with something that is not JSON.
        print('索引页解析失败')
        return
    if not isinstance(data, dict):
        return
    # 'data' may be absent or null; treat both as "no results".
    for item in data.get('data') or []:
        if not item or 'abstract' not in item:
            continue
        article_url = item.get('article_url')
        if article_url:
            # .../group/<id>/ -> .../a<id>/
            article_url = article_url.replace('group/', 'a')
        display = item.get('display') or {}
        title = (display.get('title') or {}).get('text')
        yield article_url, title

def get_detail(url):
    """Load *url* in a Chrome WebDriver session and return the page source.

    Args:
        url: Address of the article detail page.

    Returns:
        The HTML of the loaded page, or ``None`` when the page could not be
        loaded.
    """
    # Imported here so this block does not rely on a file-level import.
    # Selenium reports failures through WebDriverException.
    from selenium.common.exceptions import WebDriverException

    driver = webdriver.Chrome()  # start the browser driver
    try:
        driver.get(url)
        return driver.page_source
    except WebDriverException:
        print('详情页请求失败')
        return None
    finally:
        # Always shut the browser down, including on failure; quit() ends
        # the whole session rather than only closing the current window.
        driver.quit()

def parse_detail(html):
    """Extract the image URLs embedded in a detail page.

    The function looks for the first ``JSON.parse("...")`` call in the page,
    undoes the string escaping and reads the ``sub_images`` list from the
    decoded JSON object.

    Args:
        html: Page source of the detail page, or ``None``/empty when the
            page could not be fetched.

    Returns:
        A list with the ``url`` value of every ``sub_images`` entry, or
        ``None`` when the page has no usable ``JSON.parse`` payload or the
        payload has no ``sub_images`` key.
    """
    if not html:
        return None
    match = re.search(r'JSON\.parse\("(.*?)"\),', html, re.S)
    if match is None:
        return None
    try:
        # Undo the backslash escapes (e.g. \\u002F). Encoding as latin-1
        # with backslashreplace keeps non-ASCII text intact, whereas
        # utf-8 + unicode-escape would turn it into mojibake.
        text = match.group(1).encode('latin-1', 'backslashreplace').decode('unicode-escape')
        data = json.loads(text)
    except ValueError:
        # Covers both a malformed escape sequence and invalid JSON.
        return None
    # Skip pages whose payload carries no image list.
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    return [item.get('url') for item in data.get('sub_images') or []]

def save_to_mongo(result):
    """Insert *result* into the ``MONGO_TABLE`` collection of ``db``.

    Args:
        result: Document (dict) to store.

    Returns:
        ``True`` when the insert succeeded, ``False`` when pymongo raised
        an error.
    """
    # insert_one signals failure by raising, not through its return value,
    # so the outcome has to be decided by exception handling.
    try:
        db[MONGO_TABLE].insert_one(result)
    except pymongo.errors.PyMongoError:
        print('存储到MongoDB失败', result)
        return False
    print('存储到MongoDB成功', result)
    return True

def download_image(img_url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        img_url: Address of the image.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        Always ``None``; the image is stored as a side effect when the
        server answers with HTTP 200.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        # The original value 'keep - alive' (with spaces) is not a valid
        # Connection token.
        'Connection': 'keep-alive',
    }
    try:
        response = requests.get(img_url, headers=headers, timeout=timeout)
    except RequestException:
        print('图片请求失败')
        return None
    if response.status_code == 200:
        # response.content is the raw binary body, which is what an image is.
        save_image(response.content)
    return None

def save_image(content):
    """Write image bytes to ``<cwd>/toutiaojiepai/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    map to the same file and are written only once.

    Args:
        content: Raw image bytes.
    """
    directory = os.path.join(os.getcwd(), 'toutiaojiepai')
    # Create the target folder on first use; open() would fail without it.
    os.makedirs(directory, exist_ok=True)
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    # os.path.join picks the right separator instead of a hard-coded '\\'.
    file_path = os.path.join(directory, file_name)
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)

def main(offset=0, keyword='街拍'):
    """Crawl one index page and store every article's images.

    For each ``(url, title)`` pair of the index page the detail page is
    loaded, its image URLs are extracted, a record is stored through
    ``save_to_mongo`` and every image is downloaded.

    Args:
        offset: Paging offset passed to ``get_index``.
        keyword: Search keyword passed to ``get_index``.
    """
    # index_html is a string holding JSON text, or None when the request
    # failed.
    index_html = get_index(offset, keyword)
    if not index_html:
        return
    for url, title in parse_index_json(index_html):
        if not (url and title):
            continue
        detail_html = get_detail(url)
        if not detail_html:
            # The detail page could not be loaded; there is nothing to parse.
            continue
        img_url = parse_detail(detail_html)
        result = {
            'url': url,
            'title': title,
            'image_url': img_url
        }
        save_to_mongo(result)
        # parse_detail returns None when the page has no image list.
        for j in img_url or []:
            download_image(j)


if __name__ == '__main__':
    main()

config.py为MongoDB配置文件：

# Address passed to pymongo.MongoClient by the crawler script.
MONGO_URL='localhost'

# Name of the database the crawler selects from the client.
MONGO_DB='toutiao'

# Name of the collection that save_to_mongo inserts into.
MONGO_TABLE='jiepai'

python爬取今日头条_爬取今日头条街拍图片相关推荐

Python爬虫：爬取今日头条“街拍”图片（修改版）
前言在参考<Python3网络爬虫开发实战>学习爬虫时,练习项目中使用 requests ajax 爬取今日头条的"街拍"图片,发现书上的源代码有些已经不适合现在了, ...
爬取今日头条街拍图片
** *爬取今日头条街拍图片 * ** # coding=utf-8 import os import re import time from multiprocessing.pool import ...
爬取街拍图片_如何拍摄好街拍照片
爬取街拍图片 Street photography is about documenting the day to day life of a city. It's about capturing t ...
我的爬虫之爬今日头条街拍图片
近日学习了python 爬虫方面的内容 ,决定实战--爬今日头条的街拍图片首先先分析今日头条的请求方式,进入https://www.toutiao.com F12 搜索街拍查看当前请求 http ...
利用Ajax爬取今日头条头像，街拍图片。关于崔庆才python爬虫爬取今日头条街拍内容遇到的问题的解决办法。
我也是初学爬虫,在看到崔庆才大佬的爬虫实战:爬取今日头条街拍美图时,发现有些内容过于陈旧运行程序时已经报错,网页的源代码早已不一样了.以下是我遇到的一些问题. 1.用开发者选项筛选Ajax文件时预览看 ...
java爬取今日头条_今日头条抓取街拍图片数据集
spider1: 抓取街拍页面的所有入口链接: 1.数据查看到,街拍页面需要的数据集都在data这个集合中,而data是整个数据集字典的一个键,data这个键又包括了一个list,list中是一个个字 ...
python爬今日头条组图_python 爬虫抓取今日头条街拍图片
1. 打开google浏览器,输入www.toutiao.com, 搜索街拍.html 2.打开开发者选项,network监看加载的xhr, 数据是ajax异步加载的,能够看到preview里面的da ...
今日头条街拍图片爬取
其中遇到的问题和一些新知识: 1. 注意页面请求参数:(会改变) 即Query String Parameters 例: 今日头条里街拍综合的数据为 'offset': 0, 'format': 'j ...
Scrapy 爬取今日头条街拍图片
scrapy 爬取今日头条图片保存至本地之前用 requests 爬取过今日头条街拍的图片,当时只是爬取每篇文章的缩略图,今天尝试用 scrapy 来大规模爬取街拍详细图片. 分析页面今日头条的内 ...

python爬取今日头条_爬取今日头条街拍图片

python爬取今日头条_爬取今日头条街拍图片相关推荐

最新文章

热门文章