Scrapy爬取IT桔子死亡公司库及资本机构数据

此数据爬取仅作学习研究用，严禁用作商业用途

目标设定

爬取IT桔子死亡公司、投资机构、LP、GP、基金机构数据，并录入MongoDB，全量爬取并判重。

前期准备

分析请求路径

通过Chrome进行抓包分析和测试，获得以下数据请求路径：

死亡公司

请求路径：https://www.itjuzi.com/api/closure
请求方法：GET
请求参数：page 页码

投资机构

请求路径：https://www.itjuzi.com/api/investments
请求方法：POST
请求参数：page 页码

LP

请求路径：https://www.itjuzi.com/api/lp
请求方法：POST
请求参数：page 页码

GP

请求路径：https://www.itjuzi.com/api/gp
请求方法：POST
请求参数：page 页码

基金机构

请求路径：https://www.itjuzi.com/api/fund
请求方法：POST
请求参数：page 页码

分析响应报文

此处使用EOLINKER进行API测试，以死亡公司为例：

info内的数据都是我们想要的数据

数据存储规划

爬取的数据都存入MongoDB的itorange数据库内，数据集名分别为death_company、gp、lp、fund、investments，即爬虫任务的名称，数据记录为各公司机构详细信息。

代码实现

死亡公司、投资机构、LP、GP、基金机构五类数据，分别配置一个独立的爬虫任务，并传入独立的处理管道。

由于爬虫任务中请求方法和参数处理不一样，故一个爬虫任务一个文件；处理管道除具体处理不一样，都需要连接数据库和打印始末日志，故可创建基类，各处理管道重写基类方法即可。

此外，为避免频繁访问被封，每次请求前暂停随机秒数。

主要代码文件实现如下：

settings.py 配置文件

import os

# MongoDB connection settings.
# Every value can be overridden with an ITORANGE_MONGO_* environment variable
# so that real credentials do not have to live in source control; the
# defaults are the values this file previously hard-coded.
MONGO_HOST = os.environ.get('ITORANGE_MONGO_HOST', '106.13.73.198')
MONGO_PORT = int(os.environ.get('ITORANGE_MONGO_PORT', 31000))
MONGO_USER = os.environ.get('ITORANGE_MONGO_USER', 'root')
MONGO_PASSWORD = os.environ.get('ITORANGE_MONGO_PASSWORD', '@wjbd')
MONGO_DB = os.environ.get('ITORANGE_MONGO_DB', 'itorange')
# Bypass crawl restrictions: do not obey robots.txt
ROBOTSTXT_OBEY = False

items.py 数据模型文件

# -*- coding: utf-8 -*-
import scrapy# 死亡公司字段模型
class DeathCompanyItem(scrapy.Item):
    """Scrapy item for the dead-company crawl; has one field, ``company_info``."""

    company_info = scrapy.Field()


# Investment institution field model
class InvestmentItem(scrapy.Item):
    """Scrapy item for the investment-institution crawl; has one field, ``investment_info``."""

    investment_info = scrapy.Field()


# LP field model
class LpItem(scrapy.Item):
    """Scrapy item for the LP crawl; has one field, ``lp_info``."""

    lp_info = scrapy.Field()


# GP field model
class GpItem(scrapy.Item):
    """Scrapy item for the GP crawl; has one field, ``gp_info``."""

    gp_info = scrapy.Field()


# Fund institution field model
class FundItem(scrapy.Item):
    """Scrapy item for the fund-institution crawl; has one field, ``fund_info``."""

    fund_info = scrapy.Field()

spider.py 爬虫任务文件

# -*- coding: utf-8 -*-
"""
死亡公司库爬取
"""
from ..items import DeathCompanyItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)


# Dead-company database spider
class DeathCompanySpider(scrapy.Spider):
    """Crawl the dead-company API page by page.

    One GET request is issued per page number from 1 to ``MAX_PAGE``. Each
    successful response yields one ``DeathCompanyItem`` whose ``company_info``
    field holds ``result['data']['info']`` from the decoded JSON body.
    """

    name = 'death_company'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint
    start_url = 'https://www.itjuzi.com/api/closure'
    # Number of pages to request
    MAX_PAGE = 630
    # One-off start-up pause in seconds; drawn once, when the class is defined
    idle_time = random.randint(0, 5)
    custom_settings = {
        # Route items to the matching pipeline
        'ITEM_PIPELINES': {'itorange.pipelines.DeathCompanyPipeline': 300},
        # Throttle every request (not just the first one) so the crawl does
        # not hammer the site; Scrapy randomizes the actual wait around
        # DOWNLOAD_DELAY when RANDOMIZE_DOWNLOAD_DELAY is on.
        'DOWNLOAD_DELAY': 2.5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Yield one GET request per page, with the page in the query string."""
        # Short pause before the first request
        time.sleep(self.idle_time)
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                # ``meta`` is never transmitted to the server, so the page
                # number has to be part of the URL for a GET request.
                url='%s?page=%d' % (self.start_url, page),
                # Kept so callbacks can still read the page from response.meta
                meta={'page': page},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON response and yield a ``DeathCompanyItem``.

        Nothing is yielded when the body is not valid JSON, when the ``code``
        field is not 200, or when ``data.info`` is missing.
        """
        # Decode the whole body
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误！')
            return

        # Check the application-level status code
        result_code = result.get('code') if isinstance(result, dict) else None
        if result_code != 200:
            logging.warning('接口返回异常状态码：%s', result_code)
            return

        # Extract the dead-company records
        try:
            company_info = result['data']['info']
        except (KeyError, TypeError):
            logging.error('未发现死亡公司信息！')
            return

        death_company_item = DeathCompanyItem()
        death_company_item['company_info'] = company_info
        yield death_company_item

# -*- coding: utf-8 -*-
"""
投资机构数据爬取
"""
from ..items import InvestmentItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)


# Investment institution spider
class InvestmentsSpider(scrapy.Spider):
    """Crawl the investment-institution API page by page.

    One POST request is issued per page number from 1 to ``MAX_PAGE``. Each
    successful response yields one ``InvestmentItem`` whose
    ``investment_info`` field holds ``result['data']['data']`` from the
    decoded JSON body.
    """

    name = 'investments'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint
    start_url = 'https://www.itjuzi.com/api/investments'
    # Number of pages to request
    MAX_PAGE = 1
    # One-off start-up pause in seconds; drawn once, when the class is defined
    idle_time = random.randint(0, 5)
    custom_settings = {
        # Route items to the matching pipeline
        'ITEM_PIPELINES': {'itorange.pipelines.InvestmentPipeline': 300},
        # Throttle every request (not just the first one); Scrapy randomizes
        # the actual wait around DOWNLOAD_DELAY when
        # RANDOMIZE_DOWNLOAD_DELAY is on.
        'DOWNLOAD_DELAY': 2.5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Yield one POST request per page, with the page in the request body."""
        # Short pause before the first request
        time.sleep(self.idle_time)
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                url=self.start_url,
                method='POST',
                # ``meta`` is never transmitted to the server, so the page
                # number must travel in the body.
                # NOTE(review): assumes the API accepts a JSON payload; use
                # scrapy.FormRequest instead if it expects form encoding.
                body=json.dumps({'page': page}),
                headers={'Content-Type': 'application/json'},
                # Kept so callbacks can still read the page from response.meta
                meta={'page': page},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON response and yield an ``InvestmentItem``.

        Nothing is yielded when the body is not valid JSON, when the ``code``
        field is not 200, or when ``data.data`` is missing.
        """
        # Headers are useful when debugging blocks; keep them out of stdout
        logging.debug('%s', response.headers)

        # Decode the whole body
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误！')
            return

        # Check the application-level status code
        result_code = result.get('code') if isinstance(result, dict) else None
        if result_code != 200:
            logging.warning('接口返回异常状态码：%s', result_code)
            return

        # Extract the investment-institution records
        try:
            investment_info = result['data']['data']
        except (KeyError, TypeError):
            logging.error('未发现投资机构信息！')
            return

        investment_item = InvestmentItem()
        investment_item['investment_info'] = investment_info
        yield investment_item

# -*- coding: utf-8 -*-
"""
GP数据爬取
"""
from ..items import GpItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)


# GP spider
class GpSpider(scrapy.Spider):
    """Crawl the GP API page by page.

    One POST request is issued per page number from 1 to ``MAX_PAGE``. Each
    successful response yields one ``GpItem`` whose ``gp_info`` field holds
    ``result['data']['list']`` from the decoded JSON body.
    """

    name = 'gp'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint
    start_url = 'https://www.itjuzi.com/api/gp'
    # Number of pages to request
    MAX_PAGE = 805
    # One-off start-up pause in seconds; drawn once, when the class is defined
    idle_time = random.randint(0, 5)
    custom_settings = {
        # Route items to the matching pipeline
        'ITEM_PIPELINES': {'itorange.pipelines.GpPipeline': 300},
        # Throttle every request (not just the first one); Scrapy randomizes
        # the actual wait around DOWNLOAD_DELAY when
        # RANDOMIZE_DOWNLOAD_DELAY is on.
        'DOWNLOAD_DELAY': 2.5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Yield one POST request per page, with the page in the request body."""
        # Short pause before the first request
        time.sleep(self.idle_time)
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                url=self.start_url,
                method='POST',
                # ``meta`` is never transmitted to the server, so the page
                # number must travel in the body.
                # NOTE(review): assumes the API accepts a JSON payload; use
                # scrapy.FormRequest instead if it expects form encoding.
                body=json.dumps({'page': page}),
                headers={'Content-Type': 'application/json'},
                # Kept so callbacks can still read the page from response.meta
                meta={'page': page},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON response and yield a ``GpItem``.

        Nothing is yielded when the body is not valid JSON, when the ``code``
        field is not 200, or when ``data.list`` is missing.
        """
        # Headers are useful when debugging blocks; keep them out of stdout
        logging.debug('%s', response.headers)

        # Decode the whole body
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误！')
            return

        # Check the application-level status code
        result_code = result.get('code') if isinstance(result, dict) else None
        if result_code != 200:
            logging.warning('接口返回异常状态码：%s', result_code)
            return

        # Extract the GP records
        try:
            gp_info = result['data']['list']
        except (KeyError, TypeError):
            logging.error('未发现GP信息！')
            return

        gp_item = GpItem()
        gp_item['gp_info'] = gp_info
        yield gp_item

# -*- coding: utf-8 -*-
"""
LP数据爬取
"""
from ..items import LpItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)


# LP spider
class LpSpider(scrapy.Spider):
    """Crawl the LP API page by page.

    One POST request is issued per page number from 1 to ``MAX_PAGE``. Each
    successful response yields one ``LpItem`` whose ``lp_info`` field holds
    ``result['data']['data']`` from the decoded JSON body.
    """

    name = 'lp'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint
    start_url = 'https://www.itjuzi.com/api/lp'
    # Number of pages to request
    MAX_PAGE = 1
    # One-off start-up pause in seconds; drawn once, when the class is defined
    idle_time = random.randint(0, 5)
    custom_settings = {
        # Route items to the matching pipeline
        'ITEM_PIPELINES': {'itorange.pipelines.LpPipeline': 300},
        # Throttle every request (not just the first one); Scrapy randomizes
        # the actual wait around DOWNLOAD_DELAY when
        # RANDOMIZE_DOWNLOAD_DELAY is on.
        'DOWNLOAD_DELAY': 2.5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Yield one POST request per page, with the page in the request body."""
        # Short pause before the first request
        time.sleep(self.idle_time)
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                url=self.start_url,
                method='POST',
                # ``meta`` is never transmitted to the server, so the page
                # number must travel in the body.
                # NOTE(review): assumes the API accepts a JSON payload; use
                # scrapy.FormRequest instead if it expects form encoding.
                body=json.dumps({'page': page}),
                headers={'Content-Type': 'application/json'},
                # Kept so callbacks can still read the page from response.meta
                meta={'page': page},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON response and yield an ``LpItem``.

        Nothing is yielded when the body is not valid JSON, when the ``code``
        field is not 200, or when ``data.data`` is missing.
        """
        # Headers are useful when debugging blocks; keep them out of stdout
        logging.debug('%s', response.headers)

        # Decode the whole body
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误！')
            return

        # Check the application-level status code
        result_code = result.get('code') if isinstance(result, dict) else None
        if result_code != 200:
            logging.warning('接口返回异常状态码：%s', result_code)
            return

        # Extract the LP records
        try:
            lp_info = result['data']['data']
        except (KeyError, TypeError):
            logging.error('未发现LP信息！')
            return

        lp_item = LpItem()
        lp_item['lp_info'] = lp_info
        yield lp_item

# -*- coding: utf-8 -*-
"""
基金机构数据爬取
"""
from ..items import FundItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)


# Fund institution spider
class FundSpider(scrapy.Spider):
    """Crawl the fund-institution API page by page.

    One POST request is issued per page number from 1 to ``MAX_PAGE``. Each
    successful response yields one ``FundItem`` whose ``fund_info`` field
    holds ``result['data']['list']`` from the decoded JSON body.
    """

    name = 'fund'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint
    start_url = 'https://www.itjuzi.com/api/fund'
    # Number of pages to request
    MAX_PAGE = 2500
    # One-off start-up pause in seconds; drawn once, when the class is defined
    idle_time = random.randint(0, 5)
    custom_settings = {
        # Route items to the matching pipeline
        'ITEM_PIPELINES': {'itorange.pipelines.FundPipeline': 300},
        # Throttle every request (not just the first one); Scrapy randomizes
        # the actual wait around DOWNLOAD_DELAY when
        # RANDOMIZE_DOWNLOAD_DELAY is on.
        'DOWNLOAD_DELAY': 2.5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Yield one POST request per page, with the page in the request body."""
        # Short pause before the first request
        time.sleep(self.idle_time)
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                url=self.start_url,
                method='POST',
                # ``meta`` is never transmitted to the server, so the page
                # number must travel in the body.
                # NOTE(review): assumes the API accepts a JSON payload; use
                # scrapy.FormRequest instead if it expects form encoding.
                body=json.dumps({'page': page}),
                headers={'Content-Type': 'application/json'},
                # Kept so callbacks can still read the page from response.meta
                meta={'page': page},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON response and yield a ``FundItem``.

        Nothing is yielded when the body is not valid JSON, when the ``code``
        field is not 200, or when ``data.list`` is missing.
        """
        # Headers are useful when debugging blocks; keep them out of stdout
        logging.debug('%s', response.headers)

        # Decode the whole body
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误！')
            return

        # Check the application-level status code
        result_code = result.get('code') if isinstance(result, dict) else None
        if result_code != 200:
            logging.warning('接口返回异常状态码：%s', result_code)
            return

        # Extract the fund-institution records
        try:
            fund_info = result['data']['list']
        except (KeyError, TypeError):
            logging.error('未发现基金机构信息！')
            return

        fund_item = FundItem()
        fund_item['fund_info'] = fund_info
        yield fund_item

pipelines.py 处理管道文件

# -*- coding: utf-8 -*-
"""
分管道处理不同的爬虫任务
"""
import logging
import pymongo
from scrapy.utils.project import get_project_settings
# 读取默认配置文件
settings = get_project_settings()
# 设置日志输出级别
logging.getLogger().setLevel(logging.INFO)# 封装MongoDB文档操作
class Mongo:
    """Open an authenticated MongoDB connection from the project settings."""

    @classmethod
    def getDoc(cls):
        """Connect to MongoDB and return the configured database.

        Reads MONGO_HOST, MONGO_PORT, MONGO_USER, MONGO_PASSWORD and MONGO_DB
        from the module-level ``settings``. The client is stored on
        ``cls.client``.

        Returns:
            The ``pymongo`` database named by ``MONGO_DB``, or ``None`` when
            authentication or the connection fails.
        """
        db_name = settings['MONGO_DB']
        try:
            # Credentials go to the client itself: Database.authenticate()
            # was removed in PyMongo 4. authSource='admin' keeps the original
            # behaviour of authenticating against the admin database.
            cls.client = pymongo.MongoClient(
                settings['MONGO_HOST'],
                settings['MONGO_PORT'],
                username=settings['MONGO_USER'],
                password=settings['MONGO_PASSWORD'],
                authSource='admin',
            )
            # MongoClient connects lazily; a ping forces the handshake so
            # bad credentials or an unreachable host surface here.
            cls.client.admin.command('ping')
        except pymongo.errors.OperationFailure as e:
            logging.error('MongoDB密码验证失败:%s' % (e))
            return None
        except pymongo.errors.PyMongoError:
            logging.error('MongoDB连接失败！')
            return None

        # Kept for callers that inspect the old authentication flag
        cls.db = True
        logging.info('MongoDB密码验证成功！')
        logging.info('MongoDB连接成功！')
        return cls.client[db_name]


# Base class for the task pipelines
class BasePipeline(object):
    """Shared behaviour for the per-spider pipelines.

    Opens the MongoDB collection named after the spider, logs the start and
    end of the crawl, and provides the insert-if-absent logic that every
    concrete pipeline uses.
    """

    def open_spider(self, spider):
        """Connect to the database and select the collection for ``spider``.

        Raises:
            RuntimeError: if no database handle could be obtained.
        """
        logging.info('==================当前爬虫任务:%s' % spider.name)
        self.doc = Mongo.getDoc()
        if self.doc is None:
            # Fail with a clear message instead of a TypeError on None[...]
            raise RuntimeError('MongoDB连接失败！')
        self.collection = self.doc[spider.name]
        logging.info('%s文档已创建，准备写入！' % spider.name)

    def close_spider(self, spider):
        """Release the MongoDB client and log the end of the crawl."""
        doc = getattr(self, 'doc', None)
        if doc is not None:
            doc.client.close()
        logging.info('=======爬虫任务:%s结束！' % spider.name)

    def _save_unique(self, records, id_field, spider):
        """Insert each record whose ``id_field`` value is not stored yet.

        Args:
            records: iterable of dict records; ``None`` is treated as empty.
            id_field: key used to detect duplicates.
            spider: the running spider (its name is used in log messages).
        """
        for record in records or ():
            try:
                record_id = record[id_field]
            except (KeyError, TypeError):
                # One malformed record must not abort the rest of the page
                logging.error('记录缺少判重字段%s，已跳过！' % id_field)
                continue

            # Duplicate check on the identifying field
            if self.collection.find_one({id_field: record_id}) is not None:
                logging.info('数据已存在，无需插入！')
                continue

            # Not stored yet: insert it
            try:
                self.collection.insert_one(record)
                logging.info('已写入%s！' % spider.name)
            except Exception as e:
                logging.error('写入出错：%s' % (e))


class DeathCompanyPipeline(BasePipeline):
    """Store dead-company records, de-duplicated by ``com_id``."""

    def process_item(self, item, spider):
        """Persist ``item['company_info']`` and return the item unchanged."""
        self._save_unique(item['company_info'], 'com_id', spider)
        return item


class InvestmentPipeline(BasePipeline):
    """Store investment-institution records, de-duplicated by ``id``."""

    def process_item(self, item, spider):
        """Persist ``item['investment_info']`` and return the item unchanged."""
        self._save_unique(item['investment_info'], 'id', spider)
        return item


class LpPipeline(BasePipeline):
    """Store LP records, de-duplicated by ``id``."""

    def process_item(self, item, spider):
        """Persist ``item['lp_info']`` and return the item unchanged."""
        self._save_unique(item['lp_info'], 'id', spider)
        return item


class GpPipeline(BasePipeline):
    """Store GP records, de-duplicated by ``id``."""

    def process_item(self, item, spider):
        """Persist ``item['gp_info']`` and return the item unchanged."""
        self._save_unique(item['gp_info'], 'id', spider)
        return item


class FundPipeline(BasePipeline):
    """Store fund-institution records, de-duplicated by ``fund_id``."""

    def process_item(self, item, spider):
        """Persist ``item['fund_info']`` and return the item unchanged."""
        self._save_unique(item['fund_info'], 'fund_id', spider)
        return item

任务执行

# 死亡公司库爬取
> scrapy crawl death_company
# 投资机构爬取
> scrapy crawl investments
# GP爬取
> scrapy crawl gp
# LP爬取
> scrapy crawl lp
# 基金机构爬取
> scrapy crawl fund

执行结果

完整代码查看

https://gitee.com/angryshar128/Spider.git