python 抓取头条街拍图片

#抓取头条图片，存入文本文件

#根据崔大庆视频整理

import requests
import re
import json
import os
from requests.exceptions import RequestException
from multiprocessing import Pool
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from json.decoder import JSONDecodeError
# Default HTTP headers sent with every request made by this script.
headers = {
    # The value must be the bare UA string; the original accidentally embedded
    # the header name itself ("User-Agent  Mozilla/...") inside the value.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'application/json, text/javascript',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN',
}
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status or when the request raises ``RequestException``.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求索引页面错误')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_index(html):
    """Yield the article URLs contained in a search-index JSON response.

    Args:
        html: JSON text of the index response. ``None``, an empty string or
            malformed JSON are tolerated and simply yield nothing.

    Yields:
        The ``article_url`` value of each entry in the top-level ``data``
        list; entries without one are skipped.
    """
    # The index fetch returns None on failure; json.loads(None) would raise
    # TypeError, which the JSONDecodeError handler below does not cover.
    if not html:
        return
    try:
        payload = json.loads(html)
    except JSONDecodeError:
        return
    if not isinstance(payload, dict):
        return
    # 'data' may be present but null, hence the ``or []`` fallback.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

def get_page_detail(url, timeout=10):
    """Fetch the HTML of a single article (detail) page.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status or when the request raises ``RequestException``.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # The original message was copied from the index fetcher and wrongly
        # reported an index-page failure; report the detail page and its URL.
        print('请求详情页面错误', url)
        return None
    if response.status_code == 200:
        return response.text
    return None

# Parse a detail page
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from an article page.

    Every image URL found is also passed to ``down_image``.

    Args:
        html: HTML text of the article page.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        A dict with the keys ``title``, ``images`` (list of image URLs) and
        ``url``. ``None`` when the page has no ``var gallery = ...;``
        assignment, when its value is not valid JSON, or when the decoded
        object has no ``sub_images`` key.
    """
    # The 'lxml' parser needs the lxml package installed (the original note
    # pointed at the 32-bit wheel lxml-4.0.0-cp36-cp36m-win32.whl; see
    # https://www.zhihu.com/question/49221958/answer/114914375).
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Pages without a <title> previously raised IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    # Pull the gallery JSON out of the page's inline JavaScript.
    img_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(img_pattern, html)
    if not result:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    # 'sub_images' may be null and single entries may lack 'url'.
    sub_images = data.get('sub_images') or []
    images = [
        item.get('url')
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        down_image(image)
    return {
        'title': title,
        'images': images,
        'url': url,
    }
def down_image(url, timeout=10):
    """Download one image and pass its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; the image is saved as a side effect when the
        response status is 200.
    """
    print('正在下载图片', url)
    # Use the last path segment as the file name. The original took the fixed
    # index 4 of the '/'-split URL, which raises IndexError on shorter URLs
    # and maps every image under a deeper path to the same name.
    name = url.rstrip('/').rsplit('/', 1)[-1]
    print('图片名称：', name)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求图片错误', url)
        return None
    if response.status_code == 200:
        # response.content is the raw image bytes (response.text would be
        # decoded text).
        save_image(response.content, name=name)
    return None

# Save a downloaded image under ./img
def save_image(content, name):
    """Write image bytes to ``<cwd>/img/<name>.jpg`` unless that file exists.

    Args:
        content: Raw bytes of the image.
        name: File name without extension.
    """
    file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(), 'img', name, 'jpg')
    print('图片路径', file_path)
    # Create the target folder on demand; previously it had to be created by
    # hand or open() failed with FileNotFoundError.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
# Append a record to the text file
def writeToFile(content):
    """Append ``content`` as one JSON line to ``toutiaojiepai.txt``.

    Args:
        content: Any JSON-serialisable value. Non-ASCII characters are
            written as-is (``ensure_ascii=False``) in UTF-8.
    """
    # The with-block closes the file; no explicit close() is needed.
    with open("toutiaojiepai.txt", 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")
def main(offset):
    """Scrape one search-result page and record every gallery found on it.

    Fetches the index page for the keyword '街拍' at ``offset``, visits each
    article URL it lists, and appends each parsed gallery to the output file.

    Args:
        offset: Value passed to ``get_page_index`` as the result offset.
    """
    index_html = get_page_index(offset, '街拍')
    # A failed index request returns None; there is nothing to parse then.
    if not index_html:
        return
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        # Pages without a gallery give None; skip them instead of appending
        # "null" lines to the output file.
        if result:
            writeToFile(result)
if __name__ == '__main__':
    # Offsets 20, 40, ..., 400, one task per search-result page.
    # NOTE(review): offset 0 (the first page) is never requested; confirm
    # whether that is intentional.
    groups = [x * 20 for x in range(1, 21)]
    # map() blocks until every task has finished, so leaving the with-block
    # only releases the worker processes; the original never closed the pool.
    with Pool() as pool:
        pool.map(main, groups)

python 抓取头条街拍图片相关推荐

python爬虫抓取头条街拍美女图片
开发环境:windows 7 开发工具:pycharm python版本:python 3.7 用到的库:os,urllib,requests,hashlib 关键步骤: 通过浏览器分析找到请求接口 ...
python爬虫实战，requests模块，Python实现抓取头条街拍美图
前言利用Python爬取的是今日头条中的街拍美图.废话不多说. 让我们愉快地开始吧~ 开发工具 Python版本: 3.6.4 相关模块: re: requests模块: 以及一些Python自带的 ...
爬取今日头条街拍图片
**爬取今日头条街拍图片** # coding=utf-8 import os import re import time from multiprocessing.pool import ...
python抓取头条文章
python抓取头条美文并存储到mongodb # Author:song from multiprocessing import Pool from urllib.parse import urle ...
python美女源代码_单身程序员，每晚用python抓取百万张美女图片，连女友都不想找了...
每当夜深人静时,这位长期单身的程序员就会起床开电脑,然后用python抓取百万张美女图片,存进U盘,目的目前还不知道,但技术是万能的,这样一来,可能连找女朋友的钱都省了. 其实,还有更好看的! 而且还 ...
Python爬虫：爬取今日头条“街拍”图片（修改版）
前言在参考<Python3网络爬虫开发实战>学习爬虫时,练习项目中使用 requests ajax 爬取今日头条的"街拍"图片,发现书上的源代码有些已经不适合现在了, ...
Scrapy 爬取今日头条街拍图片
scrapy 爬取今日头条图片保存至本地之前用 requests 爬取过今日头条街拍的图片,当时只是爬取每篇文章的缩略图,今天尝试用 scrapy 来大规模爬取街拍详细图片. 分析页面今日头条的内 ...
我的爬虫之爬今日头条街拍图片
近日学习了python 爬虫方面的内容 ,决定实战--爬今日头条的街拍图片首先先分析今日头条的请求方式,进入https://www.toutiao.com F12 搜索街拍查看当前请求 http ...
python爬今日头条组图_python 爬虫抓取今日头条街拍图片
1. 打开google浏览器,输入www.toutiao.com, 搜索街拍.html 2.打开开发者选项,network监看加载的xhr, 数据是ajax异步加载的,能够看到preview里面的da ...

python 抓取头条街拍图片

python 抓取头条街拍图片相关推荐

最新文章

热门文章