python爬取玉米、小麦、水稻信息数据到本地为网页形式和mysql数据库中

1、创建Scrapy项目

scrapy startproject ExGrain

2.进入项目目录，使用命令genspider创建Spider

scrapy genspider exgrain ex-grain.cn

3、定义要抓取的数据（处理items.py文件）

# -*- coding: utf-8 -*-
import scrapy


class ExgrainItem(scrapy.Item):
    """Fields collected for one news article."""

    # Directory of the article
    news_path = scrapy.Field()
    # Category of the article
    news_cate = scrapy.Field()
    # Article title
    news_title = scrapy.Field()
    # Publication date of the article
    news_date = scrapy.Field()
    # Source of the article
    news_source = scrapy.Field()
    # Lead-in / summary of the article
    news_guide = scrapy.Field()
    # Article content
    news_content = scrapy.Field()
    # Link to the article
    news_url = scrapy.Field()

4、编写提取item数据的Spider（在spiders文件夹下：exgrain.py）

# -*- coding: utf-8 -*-
# 爬取中国谷物网玉米、小麦、水稻信息数据到本地为网页形式和mysql数据库中，偶尔出现抓取数据不准确的情况
import scrapy
from ExGrain.items import ExgrainItem
import re
import os
import requests
from bs4 import BeautifulSoup
import time


class ExgrainSpider(scrapy.Spider):
    """Crawl corn, wheat and rice news articles from ex-grain.cn.

    ``parse`` handles the list pages: it schedules one request per article
    link and follows the pagination links. ``parse_news`` turns a single
    article page into an ``ExgrainItem`` and creates the local directory the
    article will be saved in.
    """

    name = 'exgrain'
    allowed_domains = ['ex-grain.cn']
    # List pages for corn, wheat and rice news
    start_urls = [
        'http://www.ex-grain.cn/xxfb/list.htm?type=010301',
        'http://www.ex-grain.cn/xxfb/list.htm?type=010302',
        'http://www.ex-grain.cn/xxfb/list.htm?type=010201',
    ]
    url = "http://www.ex-grain.cn"

    # (article URL prefix, category label) pairs used to classify an article
    _CATEGORY_BY_PREFIX = (
        ("http://www.ex-grain.cn/island/FX_010302", '小麦'),
        ("http://www.ex-grain.cn/island/FX_010301", '玉米'),
        ("http://www.ex-grain.cn/island/FX_010201", '水稻'),
    )
    # Captures the article body (group 2) from the raw page HTML
    _CONTENT_RE = re.compile(
        r'<td style="width:890px;display:block;word-break:(.*) align="left">(.*)'
    )
    # Font names that are replaced by 微软雅黑 in the stored article body
    _FONTS_TO_REPLACE = ('宋体', '仿宋', 'Courier New')

    def parse(self, response):
        """Yield one request per article link, then one per pagination link."""
        next_url = response.xpath('//tr/td[@class="grayr"]/a/@href').extract()
        news_url = response.xpath('//tr/td/a[@class="new List"]/@href').extract()

        # No time.sleep() here: sleeping inside a callback stalls every
        # in-flight request. Pacing is left to the DOWNLOAD_DELAY setting.
        for href in news_url:
            item = ExgrainItem()
            item['news_url'] = self.url + href
            yield scrapy.Request(
                url=item['news_url'],
                meta={'meta_1': item},
                callback=self.parse_news,
            )

        # Follow the pagination links
        for href in next_url:
            yield scrapy.Request(url=self.url + href, callback=self.parse)

    def parse_news(self, response):
        """Build an ``ExgrainItem`` from one article page.

        Every field is read from ``response`` itself. The page is not
        downloaded a second time, so the title and the body always come from
        the same rendering of the page. Pages whose markup does not match the
        expected layout, or whose URL belongs to no known category, are
        logged and skipped instead of raising.
        """
        # The partially filled item passed along by ``parse``
        meta_1 = response.meta['meta_1']
        news_url = meta_1['news_url']

        titles = response.xpath('//tr/td[@class="h13"]/span/text()').extract()
        bylines = response.xpath('//tr[2]/td[@class="h3"]/text()').extract()
        if not titles or not bylines:
            self.logger.warning("Skipping %s: title or byline not found", news_url)
            return

        # The title may contain spaces
        news_title = titles[0].replace(" ", "")

        # Byline looks like: 发布时间：2018-07-18 10:54:46  |来源：  |作者：
        source_list = bylines[0]
        try:
            # Text after the 3-character "来源：" label
            source = source_list.split("|")[1][3:].strip()
            # Date part only, e.g. 2018-07-18
            news_date = source_list.split("：")[1].split(" ")[0]
        except IndexError:
            self.logger.warning(
                "Skipping %s: unexpected byline %r", news_url, source_list
            )
            return

        html_text = response.text
        result = self._CONTENT_RE.search(html_text)
        if result is None:
            self.logger.warning("Skipping %s: article body not found", news_url)
            return
        news_content = result.group(2)

        # Classify the article by its URL prefix
        news_cate = next(
            (
                cate
                for prefix, cate in self._CATEGORY_BY_PREFIX
                if news_url.startswith(prefix)
            ),
            None,
        )
        if news_cate is None:
            self.logger.warning("Skipping %s: unknown category", news_url)
            return

        # Lead-in: the first 70 characters of the concatenated <p> texts
        soup = BeautifulSoup(html_text, "lxml")
        news_guide_list = "".join(
            p.get_text() for p in soup.select("p")
        ).replace(" ", "")
        if news_content.startswith("<p>&nbsp;</p><table"):
            # The article is probably a table, so use the title as the lead-in
            news_guide = news_title
        elif news_guide_list:
            news_guide = news_guide_list[:70].replace("\xa0", "") + "......"
        else:
            news_guide = ""

        # Create the target directory (no error if it already exists)
        news_path = "./Data/" + news_cate + "/" + news_date + "/" + news_title
        os.makedirs(news_path, exist_ok=True)
        print("处理数据:%s" % (news_path[7:]))

        item = ExgrainItem()
        item['news_title'] = news_title
        item['news_source'] = source if source != "" else "中国谷物网"
        item['news_date'] = news_date
        # Switch the fonts used in the article body to 微软雅黑
        for font in self._FONTS_TO_REPLACE:
            news_content = news_content.replace(font, '微软雅黑')
        item['news_content'] = news_content
        item['news_guide'] = news_guide
        item['news_cate'] = news_cate
        item['news_path'] = news_path
        item['news_url'] = news_url
        yield item

5.处理pipelines管道文件保存数据，可将结果保存到文件中（pipelines.py）

# -*- coding: utf-8 -*-
import json
import os


# Transcoding helper: a subclass of json.JSONEncoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``bytes`` values as UTF-8 text."""

    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)


class ExgrainPipeline(object):
    """Save each article's HTML content to ``<news_path>/<news_title>.html``.

    Attributes:
        fail_count: number of items whose file could not be written.
        file_name_fail: title of the most recent failed item, or ``None``.
    """

    def __init__(self):
        # Initialised once per pipeline so the counters survive across items
        # and close_spider() works even when no item was processed.
        self.fail_count = 0
        self.file_name_fail = None

    def process_item(self, item, spider):
        """Write the article to disk and return ``item`` unchanged.

        A write failure is recorded and reported but does not drop the item,
        so later pipelines still receive it.
        """
        title = item['news_title']
        try:
            target = item['news_path'] + "/" + title + ".html"
            # Explicit UTF-8 so Chinese text is written on any platform
            with open(target, "w", encoding="utf-8") as f:
                f.write(item['news_content'])
        except (OSError, UnicodeError, KeyError):
            self._record_failure(item, title)
        return item

    def _record_failure(self, item, title):
        """Count a failed save and try to leave a marker file for it."""
        self.fail_count += 1
        self.file_name_fail = title
        print("%s文件保存失败，请注意！" % title)

        news_path = item.get('news_path')
        if not news_path:
            return
        fail_dir = news_path + "/" + "[失败！]"
        try:
            # The marker directory has to exist before the file can be opened
            os.makedirs(fail_dir, exist_ok=True)
            with open(fail_dir + "/" + title + ".html", "w", encoding="utf-8") as f:
                f.write("<p>写入失败！</p>")
        except OSError:
            # Best effort only: the failure was already counted and printed
            pass

    def close_spider(self, spider):
        """Print a summary when the spider finishes."""
        if self.fail_count != 0:
            print("%s文件保存失败了..." % self.file_name_fail)
        print("数据保存本地处理完毕，谢谢使用！")

6.增加ExGrainpipelines.py文件，同时将数据保存到mysql数据库中

# -*- coding: utf-8 -*-
import json
import pymysql
import logging

logger = logging.getLogger(__name__)


# Transcoding helper: a subclass of json.JSONEncoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``bytes`` values as UTF-8 text."""

    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)


class DBPipeline(object):
    """Insert each scraped article into the MySQL table ``exgrain``.

    An article whose ``news_url`` is already stored is not inserted again.
    """

    _SELECT_SQL = """SELECT news_url FROM exgrain WHERE news_url = %s"""
    _INSERT_SQL = (
        """INSERT INTO exgrain(news_cate,news_title, news_date, news_source, """
        """news_guide ,news_content, news_url)"""
        """VALUES(%s,%s, %s, %s, %s, %s, %s)"""
    )
    # Item fields in the column order used by _INSERT_SQL
    _INSERT_FIELDS = (
        'news_cate', 'news_title', 'news_date', 'news_source',
        'news_guide', 'news_content', 'news_url',
    )

    def __init__(self, host='localhost', port=3306, db='python3',
                 user='root', passwd='123456'):
        """Open the database connection.

        The defaults reproduce the values that used to be hard-coded here.
        """
        # Connect to the database
        self.connect = pymysql.connect(
            host=host,
            port=port,
            db=db,
            user=user,
            passwd=passwd,
            charset='utf8',
            use_unicode=True,
        )
        # All statements are executed through this cursor
        self.cursor = self.connect.cursor()
        # Number of rows inserted during this run
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the optional ``MYSQL_*`` project settings.

        Any setting that is not defined falls back to the same default as
        ``__init__``.
        """
        settings = crawler.settings
        return cls(
            host=settings.get('MYSQL_HOST', 'localhost'),
            port=settings.getint('MYSQL_PORT', 3306),
            db=settings.get('MYSQL_DBNAME', 'python3'),
            user=settings.get('MYSQL_USER', 'root'),
            passwd=settings.get('MYSQL_PASSWD', '123456'),
        )

    def process_item(self, item, spider):
        """Store ``item`` unless its URL is already in the table.

        Database errors are logged with a traceback and the item is still
        returned, so one bad row does not stop the crawl.
        """
        try:
            # Duplicate check; the argument is passed as a one-element tuple
            self.cursor.execute(self._SELECT_SQL, (item['news_url'],))
            repetition = self.cursor.fetchone()
            if repetition:
                print("数据库已有此条数据，不再添加", repetition[0])
            else:
                print("写入数据库中...")
                self.cursor.execute(
                    self._INSERT_SQL,
                    tuple(item[field] for field in self._INSERT_FIELDS),
                )
                self.count += 1
            # Commit the statement(s)
            self.connect.commit()
        except Exception:
            # Record the error instead of calling an undefined log() helper
            logger.exception("Failed to store %s in MySQL", item.get('news_url'))
        return item

    def close_spider(self, spider):
        """Release the cursor and connection and print how many rows were added."""
        self.cursor.close()
        self.connect.close()
        print("数据库处理完毕，本次共计增加%d条数据，谢谢使用！" % self.count)

7.配置settings文件（settings.py，调用数据库成功例子：https://blog.csdn.net/z564359805/article/details/81561912）

# Obey robots.txt rules; for what this means see: https://blog.csdn.net/z564359805/article/details/80691677
ROBOTSTXT_OBEY = False

# Store the data in MySQL
# MYSQL_HOST = 'localhost'
# MYSQL_DBNAME = 'python3'
# MYSQL_USER = 'root'
# MYSQL_PASSWD = '123456'
# MYSQL_PORT = 3306

# Download delay
DOWNLOAD_DELAY = 4

# Override the default request headers: add the User-Agent information
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

# Configure item pipelines: uncomment the block below to enable the pipeline files
ITEM_PIPELINES = {
    'ExGrain.pipelines.ExgrainPipeline': 100,
    'ExGrain.ExGrainpipelines.DBPipeline': 300,
}

# The log can also be stored in a local file (optional settings)
LOG_FILE = "exgrain.log"
LOG_LEVEL = "DEBUG"
# Write printed output to the log as well
LOG_STDOUT = True

8.记得提前打开mysql数据库，并且建立好相应的表

-- Create the table that stores the ex-grain.cn articles.
-- The table charset is declared explicitly so the Chinese text sent over the
-- utf8 connection can be stored, and news_url is indexed because the
-- duplicate check filters on it.
CREATE TABLE exgrain(
    id INT PRIMARY KEY AUTO_INCREMENT NOT NULL,
    news_cate VARCHAR(2),
    news_title VARCHAR(100),
    news_date DATE,
    news_source VARCHAR(30),
    news_guide VARCHAR(150),
    news_content MEDIUMTEXT,
    news_url VARCHAR(90),
    KEY idx_news_url (news_url)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

9.以上设置完毕，进行爬取：执行项目命令crawl，启动Spider：

scrapy crawl exgrain

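PS：偶尔会出现抓取到的文章标题或文章内容不准确的情况，一直未解决。网站本身在刷新时返回的数据会改变；而上面的 parse_news 中，标题取自 Scrapy 的 response，正文却是用 requests.get 重新请求同一页面得到的，两次请求的结果可能不一致，这很可能就是数据对不上的原因。让标题和正文都从同一个 response 中解析，应当可以避免这个问题。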

python爬取玉米、小麦、水稻信息数据到本地为网页形式和mysql数据库中相关推荐

练习：使用Python爬取COVID-19疫情国内当日数据
练习:使用Python爬取COVID-19疫情国内当日数据推荐公众号:数据酷客 (里面有超详细的教程) 代码来源数据酷客公众号教程 URL它是Uniform Resource Locator的缩写, ...
如何使用python编程抢京东优惠券知乎_学好Python爬取京东知乎价值数据
原标题:学好Python爬取京东知乎价值数据 Python爬虫为什么受欢迎如果你仔细观察,就不难发现,懂爬虫.学习爬虫的人越来越多,一方面,互联网可以获取的数据越来越多,另一方面,像 Python这 ...
python关于二手房的课程论文_基于python爬取链家二手房信息代码示例
基本环境配置 python 3.6 pycharm requests parsel time 相关模块pip安装即可确定目标网页数据哦豁,这个价格..................看到都觉得脑阔 ...
使用python爬取BOSS直聘岗位数据并做可视化（Boss直聘对网页做了一些修改，现在的代码已经不能用了）
使用python爬取BOSS直聘岗位数据并做可视化结果展示首页岗位信息岗位详情薪资表学历需求公司排名岗位关键词福利关键词代码展示爬虫代码一.导入库二.爬取数据 1.爬取数据代 ...
python爬取火车票网的时刻表数据
python爬取火车票网的时刻表数据导包 import re,requests,datetime,time,json from prettytable import PrettyTable from ...
通过爬取天猫商品评论实例分析Python爬取ajax动态生成的数据
本文主要通过爬取天猫商品kindle的评论为例来说明利用python爬取ajax动态生成的数据的方式,本文使用的工具如下: 工具 chrome浏览器[寻找评论的动态链接] python3.5[执行代码 ...
Python爬取药监局化妆品管理信息发现的问题
Python爬取药监局化妆品管理信息 **1.json格式本质上是字符串!!! 今天在爬取国家药监局化妆品管理信息的时候,发现"json数据本质上是字符串",以前我还以为json本 ...
使用Python爬取51job招聘网的数据
使用Python爬取51job招聘网的数据进行网站分析获取职位信息存储信息最终代码进行网站分析进入https://www.51job.com/这个网站我在这就以python为例搜索职位跳 ...
Python 爬取拉勾招聘信息
Python 爬取拉勾招聘信息故事背景最近有个好哥们啊浪迫于家里工资太低,准备从北方老家那边来深圳这边找工作,啊浪是学平面设计的知道我在深圳这边于是向我打听深圳这边平面设计薪资水平,当时我有点懵逼 ...

python爬取玉米、小麦、水稻信息数据到本地为网页形式和mysql数据库中

python爬取玉米、小麦、水稻信息数据到本地为网页形式和mysql数据库中相关推荐

最新文章

热门文章