查看了一些别人写的代码,照着大体的模板,写了一个自己的版本,亲测可用。

输入:一个文本文件,关键词按行分隔(每行一个关键词)。

特点:一类别一文件夹,可使用自定义多线程下载,可自定义下载图片数目上限。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Google Images crawler (v1).

Reads search keywords (one per line) from ``search_imgs.txt``, opens Google
image search in a Selenium-driven Chrome, scrolls the page to load result
thumbnails, extracts the original-image URLs embedded in the page JSON, and
downloads them with a configurable number of worker threads. One
sub-directory is created per keyword.
"""
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests
import os
from lxml import etree
import json
import threading
import ctypes
import inspect  # the original paste garbled this as "import inspectimport io"
import io
import sys

# Re-wrap stdout so the Chinese log messages survive on a GBK Windows console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Browser-like headers so Google / the image hosts do not reject the requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    'Connection': 'keep - alive',
    'content-type': 'application/json'
}


class myThread(threading.Thread):
    """Worker thread that downloads its assigned slice of image URLs.

    Each worker receives parallel slices ``urls_list``/``names_list``/``id_list``
    and writes every image to ``<path>/<id>.jpg``.
    """

    def __init__(self, threadID, urls_list, names_list, id_list, path):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.urls_list = urls_list
        self.names_list = names_list
        self.id_list = id_list
        self.path = path

    def _async_raise(self, tid, exctype):
        """Raise ``exctype`` asynchronously in thread ``tid`` via the C API.

        Raises ValueError for an unknown thread id, SystemError if the
        interpreter reports that more than one thread was affected (in which
        case the call is reverted with exc=NULL as the C API docs require).
        """
        tid = ctypes.c_long(tid)
        if not inspect.isclass(exctype):
            exctype = type(exctype)
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            tid, ctypes.py_object(exctype))
        if res == 0:
            raise ValueError("invalid thread id")
        elif res != 1:
            # More than one thread state was modified: revert and bail out.
            ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
            raise SystemError("PyThreadState_SetAsyncExc failed")

    def stop_thread(self):
        """Force this thread to terminate by injecting SystemExit."""
        self._async_raise(self.ident, SystemExit)

    def run(self):
        print("线程%s开始" % self.threadID)
        for i in range(len(self.urls_list)):
            # Download one image; failures are logged and skipped so a single
            # broken URL cannot kill the whole worker.
            try:
                ir = requests.get(self.urls_list[i], headers=headers, timeout=10)
                # ``with`` closes the file even if write() raises (the
                # original leaked the file handle).
                with open(self.path + '/%d.jpg' % (self.id_list[i]), 'wb') as fh:
                    fh.write(ir.content)
                print("download picture id: %d success" % self.id_list[i])
            except Exception as ex2:
                print('download error!!' + str(ex2))
                continue
        print("线程%s退出" % self.threadID)


class Crawler():
    """Drives one keyword: open the search page, harvest URLs, download.

    Parameters
    ----------
    query : str
        The search keyword.
    path : str
        Directory where images and the id->name index file are written.
    thread_count : int
        Number of download worker threads.
    """

    def __init__(self, query, path, thread_count):
        # BUG FIX: the original built the URL from the *global* ``search_query``
        # instead of the ``query`` parameter — it only worked by accident.
        self.url = base_url_part1 + query + base_url_part2
        self.search_query = query
        self.path = path
        self.thread_count = thread_count

    # Start the Chrome WebDriver (window maximized so more thumbnails are
    # inside the viewport per scroll).
    def start_brower(self):
        chrome_options = Options()
        chrome_options.add_argument("--disable-infobars")
        # chrome_options.add_argument('--headless')
        # Path to the ChromeDriver executable.
        executable_path = "C:/Anaconda3/chromedriver.exe"
        driver = webdriver.Chrome(
            chrome_options=chrome_options, executable_path=executable_path)
        driver.maximize_window()
        driver.get(self.url)
        return driver

    def downloadImg(self, driver):
        """Scroll until enough results are loaded, then download in parallel."""
        end = False
        while True:
            # Re-parse the whole page each pass; the result containers carry a
            # JSON blob whose "ou" key is the original image URL.
            html = etree.HTML(driver.page_source)
            pictures = html.xpath('//*[@id="rg_s"]/div')
            urls_list = []
            names_list = []
            for picture in pictures:
                url = picture.xpath('./div/text()')
                if url != []:
                    raw_data_dict = json.loads(str(url[0]))
                    urls_list.append(raw_data_dict["ou"])
                    name = picture.xpath(
                        './a[2]/div[@class="mVDMnf nJGrxf"]/text()')
                    names_list.append(str(name[0]))
            # Stop once we have collected the requested number of images.
            if len(names_list) >= download_count:
                urls_list = urls_list[:download_count]
                names_list = names_list[:download_count]
                break
            if end is True:
                break
            # Scroll down in big jumps to trigger lazy loading.
            for i in range(5):
                js = "document.documentElement.scrollTop=%d" % (i * 50000)
                driver.execute_script(js)
                time.sleep(1)
            # Click "show more results"; when the button is gone we have
            # reached the end of the result set.
            try:
                driver.find_element_by_xpath(
                    "./*//input[@value='显示更多结果']").click()
            except Exception:
                end = True
                continue
            time.sleep(1)
        # Write an "id name" index file so downloaded ids can be mapped back.
        length = len(names_list)
        id_list = [i for i in range(length)]
        with open(self.path + '/' + self.search_query + '.txt',
                  'w+', encoding='utf-8') as file_write:
            for i in id_list:
                file_write.write(str(i) + ' ' + names_list[i] + '\n')
        time.sleep(10)
        # Split the work into contiguous slices, one per worker thread.
        # (Uses self.thread_count; the original read the global by accident.)
        thread_list = []
        next_start = 0
        for i in range(self.thread_count):
            start_id = next_start
            end_id = int(float(length) / self.thread_count * (i + 1)) + 1
            next_start = end_id
            thread_list.append(
                myThread(i, urls_list[start_id:end_id],
                         names_list[start_id:end_id],
                         id_list[start_id:end_id], self.path))
            thread_list[i].start()
        for i in range(self.thread_count):
            thread_list[i].join()

    def run(self):
        driver = self.start_brower()
        self.downloadImg(driver)
        driver.close()
        print("{} download has finished.".format(self.search_query))


if __name__ == '__main__':
    start = time.time()
    # These two URL parts are fixed for Google image search.
    base_url_part1 = 'https://www.google.com/search?q='
    base_url_part2 = '&source=lnms&tbm=isch'
    # Maximum number of images to download per keyword.
    download_count = 2000
    # One keyword per line.
    with open('search_imgs.txt', 'r+') as file_read:
        search_list = file_read.readlines()
    totalPath = 'F:/张晋豪资料包/人工智能/视频分析资料/正式工作/爬虫/google_picture/picture/downloads2/'
    # Crawl each keyword into its own sub-directory.
    for search_query in search_list:
        search_query = search_query.strip()
        thread_count = 200  # download threads per keyword
        path = os.path.join(totalPath, search_query)
        try:
            if not os.path.exists(path):
                os.mkdir(path)
                time.sleep(1)
        except Exception as e:
            print(e)
        craw = Crawler(query=search_query, path=path,
                       thread_count=thread_count)
        craw.run()
    end = time.time()
    print('all have been downloaded.')
    print('total cost time %d' % (end - start))

新版代码(一个类别只下得了十几张……现在网页没给 jpg 的绝对地址,base64 编码的那十几张缩略图破解出来了,没有地址后缀的那种我暂时搞不定,以后再说吧)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Google Images crawler (v2).

Updated for the newer Google results page layout (``#islrg`` grid). Thumbnail
``src`` attributes are either ``data:image/jpeg;base64,...`` payloads (decoded
and saved directly) or plain URLs (fetched over HTTPS). Keywords come one per
line from ``search_imgs.txt``; each keyword gets its own sub-directory.
"""
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests
import os
from lxml import etree
import json
import threading
import ctypes
import inspect
import base64
import io
import sys
import urllib
import urllib.request
from io import StringIO
import urllib3.contrib.pyopenssl
# Route urllib3 TLS through pyOpenSSL (helps with SNI on older Pythons).
urllib3.contrib.pyopenssl.inject_into_urllib3()

# Browser-like headers so Google / the image hosts do not reject the requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    'Connection': 'keep - alive',
    'content-type': 'application/json'
}


class myThread(threading.Thread):
    """Worker thread that saves its assigned slice of thumbnails.

    Each entry of ``urls_list`` is either an inline base64 data URI or an
    ordinary image URL; both are written to ``<path>/<id>.jpeg``.
    """

    def __init__(self, threadID, urls_list, names_list, id_list, path):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.urls_list = urls_list
        self.names_list = names_list
        self.id_list = id_list
        self.path = path

    def _async_raise(self, tid, exctype):
        """Raise ``exctype`` asynchronously in thread ``tid`` via the C API.

        Raises ValueError for an unknown thread id, SystemError if the
        interpreter reports that more than one thread was affected (in which
        case the call is reverted with exc=NULL as the C API docs require).
        """
        tid = ctypes.c_long(tid)
        if not inspect.isclass(exctype):
            exctype = type(exctype)
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            tid, ctypes.py_object(exctype))
        if res == 0:
            raise ValueError("invalid thread id")
        elif res != 1:
            # More than one thread state was modified: revert and bail out.
            ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
            raise SystemError("PyThreadState_SetAsyncExc failed")

    def stop_thread(self):
        """Force this thread to terminate by injecting SystemExit."""
        self._async_raise(self.ident, SystemExit)

    def run(self):
        print("线程%s开始" % self.threadID)
        for i in range(len(self.urls_list)):
            if self.urls_list[i].startswith('data:image/jpeg;base64,'):
                # Inline thumbnail: strip the data-URI prefix and decode.
                # altchars '-_' also accepts URL-safe base64 variants.
                data = self.urls_list[i].replace('data:image/jpeg;base64,', '')
                image_data = base64.b64decode(data, '-_')
                with open(self.path + '/%d.jpeg' % (self.id_list[i]), 'wb') as fh:
                    fh.write(image_data)
            else:
                # Plain URL: fetch over HTTPS. NOTE(review): verify=False
                # disables certificate checking — kept from the original, but
                # consider removing it once the TLS setup is fixed.
                try:
                    ir = requests.get(self.urls_list[i], verify=False)
                    with open(self.path + '/%d.jpeg' % (self.id_list[i]),
                              'wb') as f:
                        f.write(ir.content)
                    print("download picture id: %d success" % self.id_list[i])
                except Exception as ex2:
                    print('download error!!' + str(ex2))
                    continue
        # BUG FIX: the original called self.stop_thread() here, injecting
        # SystemExit into this very thread — which killed it before the exit
        # message below could print. The thread ends naturally when run()
        # returns, so the call is simply removed.
        print("线程%s退出" % self.threadID)


class Crawler():
    """Drives one keyword: open the search page, harvest thumbnails, download.

    Parameters
    ----------
    query : str
        The search keyword.
    path : str
        Directory where images and the id->url index file are written.
    thread_count : int
        Number of download worker threads.
    """

    def __init__(self, query, path, thread_count):
        # BUG FIX: the original built the URL from the *global* ``search_query``
        # instead of the ``query`` parameter — it only worked by accident.
        self.url = base_url_part1 + query + base_url_part2
        self.search_query = query
        self.path = path
        self.thread_count = thread_count

    # Start the Chrome WebDriver (window maximized so more thumbnails are
    # inside the viewport per scroll).
    def start_brower(self):
        chrome_options = Options()
        chrome_options.add_argument("--disable-infobars")
        chrome_options.add_argument("--no-sandbox")
        # chrome_options.add_argument('--headless')
        # Path to the ChromeDriver executable.
        executable_path = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe"
        driver = webdriver.Chrome(
            chrome_options=chrome_options, executable_path=executable_path)
        driver.maximize_window()
        driver.get(self.url)
        return driver

    def downloadImg(self, driver):
        """Scroll until enough results are loaded, then download in parallel."""
        end = False
        while True:
            # BUG FIX: reset the running counter each pass — the page is
            # re-scanned from the top every iteration, so a counter carried
            # across passes produced ever-growing thumbnail names.
            count = 0
            html = etree.HTML(driver.page_source)
            pictures = html.xpath('//*[@id="islrg"]/div[1]/div')
            urls_list = []
            names_list = []
            for picture in pictures:
                # The thumbnail src is either a data URI or a plain URL.
                url = picture.xpath('./a[1]/div[1]/img/@src')
                if url != []:
                    urls_list.append(url[0])
                    count += 1
                    names_list.append('%d.jpg' % (count))
            # Stop once we have collected the requested number of images.
            if len(names_list) >= download_count:
                urls_list = urls_list[:download_count]
                names_list = names_list[:download_count]
                break
            if end is True:
                break
            # Scroll down in big jumps to trigger lazy loading.
            for i in range(5):
                js = "document.documentElement.scrollTop=%d" % (i * 50000)
                driver.execute_script(js)
                time.sleep(1)
            # Click "show more results"; when the button is gone we have
            # reached the end of the result set.
            try:
                driver.find_element_by_xpath(
                    "./*//input[@value='显示更多结果']").click()
            except Exception:
                end = True
                continue
            time.sleep(1)
        # Write an "id url" index file so downloaded ids can be mapped back.
        length = len(names_list)
        id_list = [i for i in range(length)]
        with open(self.path + '/' + self.search_query + '.txt',
                  'w+', encoding='utf-8') as file_write:
            for i in id_list:
                file_write.write(str(i) + ' ' + urls_list[i] + '\n')
        time.sleep(10)
        # Split the work into contiguous slices, one per worker thread.
        # (Uses self.thread_count; the original read the global by accident.)
        thread_list = []
        next_start = 0
        for i in range(self.thread_count):
            start_id = next_start
            end_id = int(float(length) / self.thread_count * (i + 1)) + 1
            next_start = end_id
            thread_list.append(
                myThread(i, urls_list[start_id:end_id],
                         names_list[start_id:end_id],
                         id_list[start_id:end_id], self.path))
            thread_list[i].start()
        for i in range(self.thread_count):
            thread_list[i].join()

    def run(self):
        driver = self.start_brower()
        self.downloadImg(driver)
        driver.close()
        print("{} download has finished.".format(self.search_query))


if __name__ == '__main__':
    start = time.time()
    # These two URL parts are fixed for Google image search.
    base_url_part1 = 'https://www.google.com/search?q='
    base_url_part2 = '&source=lnms&tbm=isch'
    # Maximum number of images to download per keyword.
    download_count = 50
    # One keyword per line.
    with open('search_imgs.txt', 'r+') as file_read:
        search_list = file_read.readlines()
    totalPath = './picture/'
    # Crawl each keyword into its own sub-directory.
    for search_query in search_list:
        search_query = search_query.strip()
        thread_count = 10  # download threads per keyword
        path = os.path.join(totalPath, search_query)
        try:
            if not os.path.exists(path):
                os.mkdir(path)
                time.sleep(1)
        except Exception as e:
            print(e)
        craw = Crawler(query=search_query, path=path,
                       thread_count=thread_count)
        craw.run()
    end = time.time()
    print('all have been downloaded.')
    print('total cost time %d' % (end - start))

一个爬取谷歌图片的python程序相关推荐

  1. 【网络爬虫】运行该程序获取距离2022年高考仅剩多少天,一个利用网络爬虫爬取高考倒计时的python程序

    程序解决问题描述如下: 利用网络爬虫在一个2022年高考倒计时网站上爬取距离2022年高考的天数,并将爬取到的高考倒计时天数以文本文件保存到电脑磁盘中. 打开网址按F12可以看到我们利用网络爬虫从网址 ...

  2. python爬虫爬取京东图片(python小白笔记七)

    有时候需要统计,图片参考等,用python爬虫.爬下来的图片再存储到本地,同时把文件的名称取出一下.同时,python真是个有趣的东西,欢迎一起交流学习. 代码如下: 我的只是提取第一页,同时把图片保 ...

  3. python爬取论坛图片_[python爬虫] Selenium定向爬取虎扑篮球海量精美图片

    前言: 作为一名从小就看篮球的球迷,会经常逛虎扑篮球及湿乎乎等论坛,在论坛里面会存在很多精美图片,包括NBA球队.CBA明星.花边新闻.球鞋美女等等,如果一张张右键另存为的话真是手都点疼了.作为程序员 ...

  4. python爬取谷歌翻译

    由于谷歌翻译的api官方接口是有次数限制和收费的,于是乎,自己动手丰衣足食,通过抓包,js加密拼接,自己用python封装了一个爬取谷歌中英文翻译的接口,目前比较稳定无次数限制,地址:http://g ...

  5. Python网络爬虫——爬取网站图片小工具

    最近初学python爬虫,就写了一个爬取网站图片的小工具,界面如下: 用到的包主要是爬虫常用的urllib,urllib2和图形界面用的Tkinter,完整代码如下: # -*- coding:utf ...

  6. Python 3.5_简单上手、爬取百度图片的高清原图

    利用工作之余的时间,学习Python差不多也有小一个月的时间了,路漫漫其修远兮,我依然是只菜鸟. 感觉学习新技术确实是一个痛并快乐着的过程,在此分享些心得和收获,并贴一个爬取百度图片原图的代码. 代码 ...

  7. Python 爬虫实例(1)—— 爬取百度图片

    爬取百度图片  在Python 2.7上运行 #!/usr/bin/env python # -*- coding: utf-8 -*- # @Author: loveNightimport json ...

  8. 百度小程序html解析图片过大_如何快速高效爬取谷歌百度必应的图片

    导读 有些时候我们需要构建一个自己的数据集来训练模型.但是,却苦于没有大量的数据,此时就需要去谷歌.百度.必应搜索引擎上去爬取一些图片作为自己的数据集. 很自然的,我们就会想到写一个爬虫的程序去爬取图 ...

  9. 一个咸鱼的python_一个咸鱼的Python爬虫之路(三):爬取网页图片

    学完Requests库与Beautifulsoup库我们今天来实战一波,爬取网页图片.依照现在所学只能爬取图片在html页面的而不能爬取由JavaScript生成的图. 所以我找了这个网站 http: ...

最新文章

  1. C#_细说Cookie_Json Helper_Cookies封装
  2. python3多进程 pool manager_Python多进程multiprocessing.Pool
  3. OpenCV自定义CN跟踪器
  4. cuda gpu相关汇总
  5. linux系统下如何使用U盘、光盘、软盘?如何挂载U盘,光盘镜像?
  6. C语言课后习题(47)
  7. 设计师必备各类型3D字体图层样式PSD素材
  8. 安装好vs2008sp1后,.net cf与sql ce 3.5安装包位置
  9. Day27:threading模块
  10. Activity的几种启动模式介绍
  11. 两个网口芯片接一个变压器_电路中网络变压器件选型和PCB
  12. printf 输出格式控制
  13. windows 7 专业版 64位 无法安装.Net 4.7版本解决方案
  14. GitHub搜索技巧整理
  15. scrapy 简单教程
  16. 【SpringBoot】11、SpringBoot中使用Lombok
  17. Linux下使用ftp上传压缩文件,windows下载打开损坏问题
  18. error LNK2019: 无法解析的外部符号 “public: __cdecl ...,函数 ...中引用了该符号解决办法
  19. 一起来聊聊MySQL 8.0 的特性及排名可好?
  20. UDP攻击是什么意思?UDP攻击防范措施

热门文章

  1. 三星I9100/I9100G/I9250
  2. testdisk使用教程linux,TestDisk怎么使用,TestDisk下载使用教程
  3. 为什么有很多中国人跑去日本当程序员?
  4. 打电话蓝牙耳机什么牌子好?打电话降噪蓝牙耳机推荐
  5. python何时用try_什么时候使用Try Catch(转)
  6. Django开发个人博客网站——8、博客首页的开发
  7. Gungnir系列-jenkins01-编译部署maven项目(新版本)
  8. (程序)MALTAB求解含未知数的矩阵逆
  9. linux创建隐藏用户,linux 用户及权限补充
  10. XGBoost学习(五):参数调优