
  1. 功能描述:
  • 多线程网络爬虫,爬取网页图片地址(也可提取其他特征的URL)
  • 使用python开发一个迷你定向抓取器,实现对种子链接的广度优先抓取,并把URL长相符合特定pattern的网页保存到磁盘上。
  1. 程序运行:
  • python mini_spider.py -c spider.conf
  1. 配置文件
  • spider.conf:


  • feedfile: ./urls                                                                     # 种子文件路径
  • result: ./                                                           # 抓取结果存储文件, 一行一个
  • max_depth: 6                                                                    # 最大抓取深度(种子为0级)
  • crawl_interval: 1                                                             # 抓取间隔. 单位: 秒
  • crawl_timeout: 2                                                             # 抓取超时. 单位: 秒
  • thread_count: 8                                                             # 抓取线程数
  • filter_url: .*\.(gif|png|jpg|bmp)$                                      # URL特征
  1. 种子文件urls:
  1. 抓取策略
  • 广度优先的网页抓取策略
  • 多线程抓取
  • 获取符合特征的链接地址并存储到文件(例如gif|png|jpg|bmp为扩展格式的 url)
  • 链接的绝对路径存储到result.data文件中, 一行一个 (图片也可直接保存至本地)
  • 从HTML提取链接时支持处理相对路径及绝对路径

#!/usr/bin/env python
# Copyright (c) 2020, Inc. All Rights Reserved
"""
This module is the main module (entry point of the mini spider).
@Time    : 2020/11/09
@File    :
@Author  :
"""
import log
from worker.SpiderWorker import SpiderWorker
from worker.param_parser import parm_parser


def main():
    """Main method to run mini spider.

    Parses the command line, initialises logging and, when command-line
    arguments were returned, builds a SpiderWorker from the config file and
    runs the crawl.

    :return: None
    """
    # get input params
    args = parm_parser.get_args()
    # init log config
    log.init_log('./log/mini_spider')
    if not args:
        # nothing to crawl without parsed arguments
        return

    # read config file spider.conf
    conf_params = parm_parser.set_config_by_file(args.conf)
    # use config set up spider initial params
    spider = SpiderWorker(conf_params)
    # init result_path, make it complete
    spider.set_path()
    # init url queue
    spider.set_url_queue()
    # start to crawl url
    spider.start_crawl_work()
    return


if __name__ == '__main__':
    main()


[spider]
feedfile: ./urls
result: ./
max_depth: 6
crawl_interval: 1
crawl_timeout: 2
thread_count: 8
filter_url: .*\.(gif|png|jpg|bmp)$

多线程模块

#!/usr/bin/env python
# Copyright (c) 2020, Inc. All Rights Reserved
"""
This module is the threading module; it enables multithreaded processing of crawl requests.
@Time    : 2020/11/09
@File    :
@Author  :
"""
import logging
import re
import time
import threading
from worker.UrlHandler import UrlHandler


class SpiderThread(threading.Thread):
    """Provide multi thread for mini spider.

    Each worker pulls ``(url, level)`` tuples from a shared queue, records the
    URL in the shared URL set, stores it through ``UrlHandler.download_url``
    when it matches the filter, and enqueues the links found on that page
    with ``level + 1``.
    """

    # One lock shared by every worker: a per-instance lock would not protect
    # the URL set, which is shared between all threads.
    _shared_lock = threading.Lock()

    def __init__(self, urlqueue, result_path, max_depth, interval, timeout, filter_url, total_urlset):
        """
        :param urlqueue: shared queue of (url, level) tuples
        :param result_path: path handed to UrlHandler.download_url
        :param max_depth: deepest level that is still put into the queue
        :param interval: seconds to sleep before handling a url
        :param timeout: seconds to wait for a new task from the queue
        :param filter_url: regular expression a url must match to be stored
        :param total_urlset: shared set of urls that were already handled
        """
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue
        self.result_path = result_path
        self.max_depth = max_depth
        self.interval = interval
        self.timeout = timeout
        self.filter_url = filter_url
        self.total_urlset = total_urlset
        self.lock = SpiderThread._shared_lock

    def _matches_filter(self, url):
        """Tell whether url is fetchable and matches the configured pattern."""
        if not UrlHandler.is_url(url):
            return False
        try:
            # Regular expression matching image URL
            pattern = re.compile(self.filter_url)
        except Exception as e:
            logging.error("the filter url %s is not re..compile fail: %s" % (self.filter_url, e))
            return False
        candidate = url.strip(' ')
        return len(candidate) >= 1 and pattern.match(candidate) is not None

    def _claim(self, url):
        """Atomically mark url as handled; False when it was handled before."""
        with self.lock:
            if url in self.total_urlset:
                return False
            self.total_urlset.add(url)
            return True

    def can_download(self, url):
        """Judge whether the url can be download. write your download rules here.

        :param url: target url
        :return: True, False
        """
        if not self._matches_filter(url):
            return False
        # if url has been in total url set (avoid repeat downloads)
        return url not in self.total_urlset

    def _process(self, url, level):
        """Handle one task: store the url if it matches, then enqueue its links."""
        # Claim first so that duplicates are dropped without sleeping,
        # downloading or re-fetching the same page again.
        if not self._claim(url):
            return
        # sleep interval
        time.sleep(self.interval)
        if self._matches_filter(url):
            UrlHandler.download_url(self.result_path, url)

        suburl_level = level + 1
        # if sub url level larger than max_depth, stop crawling page deeper
        # (checked before fetching, so no request is wasted on the last level)
        if suburl_level > self.max_depth:
            return
        # get the sub urls from url
        for suburl in UrlHandler.get_urls(url):
            self.urlqueue.put((suburl, suburl_level))

    def run(self):
        """Run crawling thread.

        Get task from queue and add sub url into queue, crawling page
        strategy -- BFS. The thread stops once the queue stays empty for
        ``timeout`` seconds and no other worker is still processing a task.

        :return: no return
        """
        from queue import Empty

        while True:
            try:
                # get url and the page level
                url, level = self.urlqueue.get(block=True, timeout=self.timeout)
            except Empty as e:
                # Another worker may be in the middle of a fetch and about to
                # enqueue more urls; keep waiting while tasks are unfinished.
                if getattr(self.urlqueue, 'unfinished_tasks', 0) > 0:
                    continue
                logging.error('Can not finish the task. job done. %s' % e)
                break
            except Exception as e:
                logging.error('Can not finish the task. job done. %s' % e)
                break

            try:
                self._process(url, level)
            except Exception as e:
                # one broken page must not kill the whole worker
                logging.error("process url %s fail: %s" % (url, e))
            finally:
                # mark the task done only after it was really processed
                self.urlqueue.task_done()


# Next section: main worker module

#!/usr/bin/env python
# Copyright (c) 2020, Inc. All Rights Reserved
"""
This module is the main worker, the central module for crawling tasks.
@Time    : 2020/11/09
@File    :
@Author  :
"""
import os
from queue import Queue
import logging
from worker.SpiderThread import SpiderThread


class SpiderWorker(object):
    """Central worker: holds the crawl settings and drives the crawl threads."""

    def __init__(self, *args, **kwargs):
        """
        :param args: args[0] is the parameter sequence
            (urls, result_path, max_depth, interval, timeout, thread_count, filter_url)
        """
        params = args[0]
        self.urls = params[0]
        self.result_path = params[1]
        self.maxdepth = params[2]
        self.interval = params[3]
        self.timeout = params[4]
        self.thread_count = params[5]
        self.filter_url = params[6]
        self.total_urlset = set()
        self.urlqueue = Queue()

    def set_abs_dir(self, path):
        """Complete url path, and mkdir if it not exits.

        :param path: url path
        :return: result output path
        """
        file_dir = os.path.join(os.getcwd(), path)
        if not os.path.exists(file_dir):
            try:
                # makedirs also creates missing parent directories
                os.makedirs(file_dir)
            except os.error as err:
                logging.error("mkdir result-saved dir error: %s. " % err)
        return str(file_dir)

    def set_path(self):
        """Complete the path.

        :return:
        """
        self.result_path = self.set_abs_dir(self.result_path)

    def _load_seeds(self):
        """Return the seed urls: the non-blank lines of the feed file.

        When ``self.urls`` is not an existing file it is used as a single
        seed url, which is what the queue received before.
        """
        if os.path.isfile(self.urls):
            with open(self.urls, encoding='utf-8') as feed:
                return [line.strip() for line in feed if line.strip()]
        return [self.urls]

    def set_url_queue(self):
        """Set url queue: put every seed url into the queue at level 0.

        :return: True or False
        """
        try:
            seeds = self._load_seeds()
            for seed in seeds:
                self.urlqueue.put((seed, 0))
        except Exception as e:
            logging.error(e)
            return False
        return True

    def start_crawl_work(self):
        """Start to work: launch the crawl threads and wait for all of them.

        :return: nothing
        """
        thread_list = []
        for _ in range(self.thread_count):
            thread = SpiderThread(self.urlqueue, self.result_path, self.maxdepth, self.interval,
                                  self.timeout, self.filter_url, self.total_urlset)
            thread_list.append(thread)
            logging.info("%s start..." % thread.name)
            thread.start()

        for thread in thread_list:
            thread.join()
            logging.info("thread %s work is done " % thread.name)

        # No blocking wait on the queue here: every consumer has exited.
        logging.info("queue is all done")
        return


# Next section: URL handling and HTTP request module

#!/usr/bin/env python
# Copyright (c) 2020, Inc. All Rights Reserved
"""
This module is used to handle URLs and HTTP-related requests.
@Time    : 2020/11/09
@File    :
@Author  :
"""
import os
from urllib import parse, request
import logging
import chardet
from bs4 import BeautifulSoup
import requests


class UrlHandler(object):
    """Public url tools for handle url"""

    # Schemes that can never be fetched over HTTP.
    _SKIPPED_PREFIXES = ('javascript', 'mailto:', 'tel:', 'data:')

    # File used when the configured result path is a directory.
    _RESULT_FILE_NAME = 'result.data'

    @staticmethod
    def is_url(url):
        """Ignore empty urls and urls that start with a non-fetchable scheme.

        :param url: url to check
        :return: True or False
        """
        if not url:
            return False
        # case-insensitive: "JavaScript:void(0)" is skipped as well
        return not url.strip().lower().startswith(UrlHandler._SKIPPED_PREFIXES)

    @staticmethod
    def get_content(url, timeout=10):
        """Get html contents.

        :param url: the target url
        :param timeout: request timeout, default 10
        :return: content of html page, return None when error happens
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.HTTPError as e:
            logging.error("url %s request error : %s" % (url, e))
            return None
        except Exception as e:
            logging.error(e)
            return None
        return UrlHandler.decode_html(response.content)

    @staticmethod
    def decode_html(content):
        """Decode html content.

        Content detected as GB2312 is decoded as GBK; everything else is
        decoded as utf-8. Undecodable bytes are ignored.

        :param content: origin html content
        :return: decoded html content. Error return None
        """
        detected = chardet.detect(content)['encoding']
        encoding = 'GBK' if detected == 'GB2312' else 'utf-8'
        try:
            content = content.decode(encoding, 'ignore')
        except Exception as err:
            logging.error("Decode error: %s.", err)
            return None
        return content

    @staticmethod
    def get_urls(url):
        """Get all suburls of this url.

        :param url: origin url
        :return: the set of sub_urls
        """
        urlset = set()
        if not UrlHandler.is_url(url):
            return urlset
        content = UrlHandler.get_content(url)
        if content is None:
            return urlset

        tag_list = ['img', 'a', 'style', 'script']
        # Parse the document once (not once per tag) with an explicit parser.
        soup = BeautifulSoup(content, 'html.parser')
        # get url has attr 'src' and 'href'
        for link in soup.find_all(tag_list):
            if link.has_attr('src'):
                urlset.add(UrlHandler.parse_url(link['src'], url))
            if link.has_attr('href'):
                urlset.add(UrlHandler.parse_url(link['href'], url))
        return urlset

    @staticmethod
    def parse_url(url, base_url):
        """Parse url to make it complete and standard.

        :param url: the current url
        :param base_url: the base url
        :return: completed url
        """
        if url.startswith(('http', '//')):
            # absolute or scheme-relative: default the scheme to http
            return parse.urlparse(url, scheme='http').geturl()
        return parse.urljoin(base_url, url)

    @staticmethod
    def download_image_file(result_dir, url):
        """Download image as file, save in result dir.

        :param result_dir: base_path
        :param url: download url
        :return: succeed True, fail False
        """
        if not os.path.exists(result_dir):
            try:
                os.makedirs(result_dir)
            except os.error as err:
                logging.error("download to path, mkdir errror: %s" % err)
        try:
            # build a flat file name from the url
            file_name = url.translate(str.maketrans('/:?\\', '____'))
            path = os.path.join(result_dir, file_name)
            logging.info("download url..: %s" % url)
            request.urlretrieve(url, path, None)
        except Exception as e:
            logging.error("download url %s fail: %s " % (url, e))
            return False
        return True

    @staticmethod
    def download_url(result_file, url):
        """Download the URL that matches the characteristics, and save in a file.

        The url is appended as one line. When ``result_file`` is a directory
        the line goes to ``result.data`` inside it, because a directory
        cannot be opened for appending.

        :param result_file: base_path
        :param url: download url
        :return: succeed True, fail False
        """
        try:
            path = os.path.join(os.getcwd(), result_file)
            if os.path.isdir(path):
                path = os.path.join(path, UrlHandler._RESULT_FILE_NAME)
            logging.info("download url..: %s" % url)
            with open(path, 'a', encoding='utf-8') as f:
                f.write(url + '\n')
        except Exception as e:
            logging.error("download url %s fail: %s " % (url, e))
            return False
        return True


# Next section: parameter parsing module

#!/usr/bin/env python
# Copyright (c) 2020, Inc. All Rights Reserved
"""
This module is used to parse parameters.
@Time    : 2020/11/09
@File    :
@Author  :
"""
import argparse
import logging
import configparser


class parm_parser(object):
    """Command-line and config-file parsing for the mini spider."""

    @staticmethod
    def set_config_by_file(config_file):
        """Set spiderworker params by config file.

        Reads the ``[spider]`` section. Trailing ``# ...`` comments after a
        value are ignored, and ``%`` is taken literally so that regular
        expressions and urls need no escaping.

        :param config_file: path of the config file
        :return: tuple (feedfile, result, max_depth, crawl_interval,
            crawl_timeout, thread_count, filter_url); the four numeric
            entries are converted to int
        """
        config = configparser.ConfigParser(interpolation=None,
                                           inline_comment_prefixes=('#',))
        if not config.read(config_file, encoding='utf-8'):
            # read() skips unreadable files silently; say why the lookup
            # below is about to fail
            logging.error("config file %s can not be read." % config_file)

        section = config['spider']
        urls = section['feedfile']  # feedfile path
        result_path = section['result']  # result storage file
        max_depth = section['max_depth']  # max scratch depth
        crawl_interval = section['crawl_interval']  # scratch interval
        crawl_timeout = section['crawl_timeout']  # scratch timeout
        thread_count = section['thread_count']  # scratch thread
        filter_url = section['filter_url']  # URL characteristics
        return (urls, result_path, int(max_depth), int(crawl_interval),
                int(crawl_timeout), int(thread_count), filter_url)

    @staticmethod
    def get_args():
        """Get console args and parse.

        :return: the parsed arguments when a config file was given with
            -c/--conf, otherwise None
        """
        try:
            parser = argparse.ArgumentParser(prog='other_mini_spider',
                                             usage='minispider using method',
                                             description='other_mini_spider is a Multithreaded crawler')
            parser.add_argument('-c', '--conf', help='config_file')
            parser.add_argument('-v', '--version', help='version', action="store_true")
        except argparse.ArgumentError as e:
            logging.error("get option error : %s." % e)
            return None

        args = parser.parse_args()
        if args.version:
            parm_parser.version()
        if args.conf:
            return args
        if not args.version:
            # neither option was given: show how to call the program
            parser.print_help()
        return None

    @staticmethod
    def version():
        """Print mini spider version"""
        print("other_mini_spider version 1.0.0")


    Python 多线程抓取网页 - 糖拌咸鱼 - 博客园 Python 多线程抓取网页 最近,一直在做网络爬虫相关的东西. 看了一下开源C++写的larbin爬虫,仔细阅读了里面的设计思想和一些关键技术 ...

