scrapy爬取《坏蛋是怎样炼成的4》
scrapy具体介绍就不用说了,自己百度一下。或者参考以下文档
https://blog.csdn.net/u011054333/article/details/70165401
直接在cmd里运行
scrapy startproject huaidan
scrapy genspider huaidan huaidan4.com
然后贴代码放到spiders文件夹里
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from urllib import parse
import re


class huaidan(scrapy.Spider):
    """Crawl chapter pages of www.huaidan4.com and save each as a text file.

    ``parse`` collects chapter links from the listing pages in
    ``start_urls``, ``parse_detail`` extracts the title, creation time and
    body of one chapter, and ``save_article`` writes them to
    ``<sanitised title>.txt`` in the current working directory.
    """

    name = "huaidan"
    allowed_domains = ["www.huaidan4.com"]
    start_urls = ["http://www.huaidan4.com/di-yi-fen-lei.html",
                  "http://www.huaidan4.com/di-er-juan.html",
                  "http://www.huaidan4.com"]

    def parse(self, response):
        """Yield one request per distinct chapter link on a listing page.

        Args:
            response: the downloaded listing page.

        Yields:
            Request objects whose callback is ``parse_detail``.
        """
        seen = set()
        for href in response.css('.container ul li a::attr(href)').extract():
            if href in seen:
                continue
            seen.add(href)
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones, which Request would otherwise reject as "missing scheme".
            yield Request(url=response.urljoin(href), encoding='utf-8',
                          callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract title, creation time and body of a chapter and save them.

        Pages on which the title or the body cannot be found are logged and
        skipped instead of raising.

        Args:
            response: the downloaded chapter page.
        """
        article_title = response.xpath(
            '//*[@id="content"]/div[1]/div[1]/h2/text()').extract_first()

        # default='' keeps .strip() from failing when the node is missing.
        create_time = response.xpath(
            '//*[@id="content"]/div[1]/div[1]/span/text()[2]'
        ).extract_first(default='').strip()

        article_text = response.css('.post_entry,p::text').extract_first()

        if article_title is None or article_text is None:
            self.logger.warning(
                "Skipping %s: title or body not found", response.url)
            return

        self.save_article(article_title, create_time,
                          self._clean_text(article_text))

    @staticmethod
    def _clean_text(raw):
        """Strip markup, site boilerplate and all whitespace from *raw*.

        The steps run in a fixed order; the final substitution removes every
        whitespace run (and a '/' directly in front of it).
        """
        text = re.sub(r'</?\w+[^>]*>', '', raw)
        # NOTE(review): the next three targets are literal backslash/quote
        # sequences, presumably left over from cleaning a stringified list;
        # kept unchanged.
        text = text.replace("\', \'", "")
        text = text.replace("\\u3000", "").strip()
        text = text.replace("\\xa0\\xa0\\xa0\\xa0", "")
        text = text.replace("(新书上传,求收藏,推荐!!!!!!!!!!!!!!!!!!!!)", "")
        text = text.replace("\\r\\n", "\n")
        text = text.replace("免费小说", "")
        text = text.replace("www.huaidan4.com", "")
        text = text.replace("neirong_2();", "")
        text = text.replace("dibutuijian();", "")
        text = text.replace("◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。", "")
        text = text.replace("《坏蛋是怎样炼成的4》是继曹三少坏蛋是怎样炼成的3的又一作品,作者是曹三少,如果你喜欢坏蛋是怎样炼成的4,请收藏本站以便下次阅读。", "")
        return re.sub(r'/?\s+', '', text)

    def save_article(self, article_title, create_time, article_text):
        """Write one chapter to ``<title>.txt`` (UTF-8) in the working dir.

        Args:
            article_title: chapter title; runs of non-word characters are
                replaced by '-' to form the file name.
            create_time: creation time string, written on the second line.
            article_text: cleaned body text, written from the third line on.
        """
        biaoti = re.sub(r'\W+', '-', article_title)
        with open(biaoti + '.txt', 'w', encoding='utf-8') as file:
            file.write(article_title + '\n' + create_time + '\n' + article_text)
以上内容初步完成了把文章保存在本地
---------------------------------------------------------------------------------------------------------------------------------------------------------------
下面内容完成把文章保存到mysql数据库
items.py负责存放爬取节点数据
import scrapy


class HuaidanItem(scrapy.Item):
    """Container for the fields scraped from one article."""

    # NOTE(review): presumably the name of the table the article belongs
    # to -- confirm against the pipelines that consume this item.
    catalogues = scrapy.Field()
    # The name shadows the builtin inside the class body only; it is kept
    # because it is the item key.
    id = scrapy.Field()
    article_title = scrapy.Field()
    article_text = scrapy.Field()
    create_time = scrapy.Field()
pipelines.py负责处理items里的内容
# -*- coding: utf-8 -*-
import logging

import pymysql
from twisted.enterprise import adbapi

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#from scrapy.pipelines.images import ImagesPipeline
#from scrapy.pipelines.files import FilesPipeline


class HuaidanPipeline(object):
    """Pass-through pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return *item* unmodified; *spider* is not used."""
        return item


# Insert directly (synchronously) into the MySQL database.
class MysqlPiplines(object):
    """Item pipeline that stores each article in MySQL synchronously.

    One connection and one cursor are opened in ``__init__`` and reused for
    every item. An article is inserted only when its id is not yet present
    in any of the known tables.
    """

    # Tables an article may be stored in -- the same tables the duplicate
    # check queries. A table name cannot be passed as a query parameter, so
    # it is validated against this list before being put into the SQL text.
    _TABLES = ("diyijuan", "dierjuan", "disanjuan", "other")
    _log = logging.getLogger(__name__)

    def __init__(self, host="192.168.7.5", user="huaidan",
                 password="huaidan123", database="huaidan"):
        """Open the database connection.

        The defaults reproduce the previously hard-coded connection values.
        """
        self.db = pymysql.connect(host=host, user=user, password=password,
                                  database=database, charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        """Store *item* (unless already present) and pass it on.

        Raises:
            KeyError: if a required item field is missing.
            ValueError: if ``item["id"]`` is not convertible to ``int``.
        """
        self.insert(item["catalogues"], int(item["id"]), item["article_title"],
                    item["create_time"], item["article_text"])
        return item

    def insert(self, catalogues, id, article_title, create_time, article_text):
        """Insert one article into table *catalogues* if its id is new.

        All values are sent as query parameters so quotes in scraped text
        cannot break or alter the statement. An unknown table name is logged
        and the row is skipped. A database error during the insert is
        logged and the transaction rolled back.
        """
        if catalogues not in self._TABLES:
            self._log.error("Unknown table %r; article %s not stored",
                            catalogues, id)
            return

        selectsql = ("select id from diyijuan where id = %s"
                     " union select id from dierjuan where id = %s"
                     " union select id from disanjuan where id = %s"
                     " union select id from other where id = %s")
        self.cursor.execute(selectsql, (id, id, id, id))
        if self.cursor.fetchone() is not None:
            return  # already stored

        insertsql = "insert into `{}` values (%s, %s, %s, %s)".format(catalogues)
        try:
            self.cursor.execute(
                insertsql, (id, article_title, create_time, article_text))
            self.db.commit()
        except pymysql.MySQLError:
            self.db.rollback()
            self._log.exception("Insert of article %s into %s failed",
                                id, catalogues)

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        self.db.close()

    # Kept for callers that used the original method name.
    spider_closed = close_spider


# Insert into the MySQL database asynchronously.
class MysqlTwisted(object):
    """Item pipeline that stores articles in MySQL through a Twisted pool.

    Each insert is handed to ``dbpool.runInteraction`` so that it does not
    run inline in ``process_item``.
    """

    # Tables an article may be stored in -- the same tables the duplicate
    # check queries. A table name cannot be passed as a query parameter, so
    # it is validated against this list before being put into the SQL text.
    _TABLES = ("diyijuan", "dierjuan", "disanjuan", "other")
    _log = logging.getLogger(__name__)

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_*`` values in *settings*."""
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            db=settings["MYSQL_DBNAME"],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert of *item* on the pool and pass the item on.

        Returns:
            The same *item*, so that later pipelines keep receiving it.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        return item

    def handle_error(self, faileure):
        """Log a failure reported by the asynchronous insert."""
        self._log.error("MySQL insert failed: %s", faileure)

    def do_insert(self, cursor, item):
        """Insert *item* through *cursor* unless its id is already stored.

        All values are sent as query parameters so quotes in scraped text
        cannot break or alter the statement.

        Raises:
            ValueError: if ``item["catalogues"]`` is not a known table or
                ``item["id"]`` is not convertible to ``int``.
        """
        table = item["catalogues"]
        if table not in self._TABLES:
            raise ValueError("unknown table: %r" % (table,))
        article_id = int(item["id"])

        # Check whether the id already exists in any table.
        selectsql = ("select id from diyijuan where id = %s"
                     " union select id from dierjuan where id = %s"
                     " union select id from disanjuan where id = %s"
                     " union select id from other where id = %s")
        cursor.execute(
            selectsql, (article_id, article_id, article_id, article_id))

        # No row means the article is not stored yet, so insert it.
        if cursor.fetchone() is None:
            insertsql = "insert into `{}` values (%s, %s, %s, %s)".format(table)
            cursor.execute(insertsql, (article_id, item["article_title"],
                                       item["create_time"],
                                       item["article_text"]))


class myarticlepipline(object):
    """Pass-through pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return *item* unmodified; *spider* is not used."""
        return item
settings.py负责存放整体设置
# -*- coding: utf-8 -*-
import os
# Scrapy settings for huaidan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'huaidan'

SPIDER_MODULES = ['huaidan.spiders']
NEWSPIDER_MODULE = 'huaidan.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'huaidan (+http://www.yourdomain.com)'

# Obey robots.txt rules
# NOTE: set to False in this project, i.e. robots.txt is not obeyed.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16# Disable cookies (enabled by default)
#COOKIES_ENABLED = False# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'huaidan.middlewares.HuaidanSpiderMiddleware': 543,
#}# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'huaidan.middlewares.HuaidanDownloaderMiddleware': 543,
#}# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# The entries below set the order in which the pipelines run: the lower the
# number, the earlier the pipeline runs.
ITEM_PIPELINES = {
    #'huaidan.pipelines.HuaidanPipeline': 300,
    #'scrapy.pipelines.files.FilesPipeline':2,
    #'huaidan.pipelines.myarticlepipline':1,
    #'huaidan.pipelines.MysqlPiplines':2,  # insert into MySQL directly
    'huaidan.pipelines.MysqlTwisted': 1,  # insert into MySQL asynchronously
}
project_dir = os.path.abspath(os.path.dirname(__file__))
# NOTE(review): this assigns "" to two names, FILES_URLS and FIELD. It was
# probably meant to be the single setting FILES_URLS_FIELD -- confirm before
# enabling FilesPipeline. Left as written.
FILES_URLS = FIELD = ""
FILES_STORE = os.path.join(project_dir, 'files')

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Database connection details are kept in the settings so that they can be
# read from there directly. Each value can be overridden through an
# environment variable of the same name, which keeps real credentials out of
# the source; the defaults are the original values.
MYSQL_HOST = os.environ.get("MYSQL_HOST", "192.168.7.5")
MYSQL_DBNAME = os.environ.get("MYSQL_DBNAME", "huaidan")
MYSQL_USER = os.environ.get("MYSQL_USER", "huaidan")
MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "huaidan123")
转载于:https://www.cnblogs.com/guoyabin/p/9109933.html
scrapy爬取《坏蛋是怎样练成的4》相关推荐
- Python3.x使用Scrapy将爬取数据存储成Json
Python3.x使用Scrapy将爬取数据存储成Json 豆瓣电影排名前250链接 https://movie.douban.com/top250 注:前提安装好python及所需的环境 1.scr ...
- 练手实例:Scrapy爬取一本完整小说(章节乱序问题解决)
戳这里查看此小说 整体都很简单,没啥多说的,10分钟搞定 外循环在主页面找url进行拼接,小循环解析详细页内容提取小说文本. biquge.py import scrapy from scrapy.s ...
- 【爬虫】Scrapy爬取腾讯社招信息
目标任务:爬取腾讯社招信息,需要爬取的内容为:职位名称,职位的详情链接,职位类别,招聘人数,工作地点,发布时间. 一.预备基础 1.Scrapy简介 Scrapy是用纯Python实现一个为了爬取网站 ...
- 如何用 Python + Scrapy 爬取视频?
今天将带大家简单了解Scrapy爬虫框架,并用一个真实案例来演示代码的编写和爬取过程. 一.scrapy简介 1. 什么是Scrapy Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框 ...
- Scrapy爬取美女图片续集 (原创)
上一篇咱们讲解了Scrapy的工作机制和如何使用Scrapy爬取美女图片,而今天接着讲解Scrapy爬取美女图片,不过采取了不同的方式和代码实现,对Scrapy的功能进行更深入的运用. 在学习Scra ...
- 基于Python、scrapy爬取软考在线题库
前言 前段时间,报名个软件设计师考试,自然需要复习嘛,看到软考在线这个平台有历年来的题目以及答案,想法就是做一个题库小程序咯,随时随地可以打开复习.很多人问,这不出现很多类似的小程序了?是的,但是他们 ...
- python爬虫scrapy爬取新闻标题及链接_python爬虫框架scrapy爬取梅花网资讯信息
原标题:python爬虫框架scrapy爬取梅花网资讯信息 一.介绍 本例子用scrapy-splash爬取梅花网(http://www.meihua.info/a/list/today)的资讯信息, ...
- Scrapy 爬取今日头条街拍图片
scrapy 爬取今日头条图片保存至本地 之前用 requests 爬取过今日头条街拍的图片,当时只是爬取每篇文章的缩略图,今天尝试用 scrapy 来大规模爬取街拍详细图片. 分析页面 今日头条的内 ...
- scrapy爬取斗图表情
用scrapy爬取斗图表情,其实呀,我是运用别人的博客写的,里面的东西改了改就好了,推存链接" http://www.cnblogs.com/jiaoyu121/p/6992587.html ...
- [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(四) —— 应对反爬技术(选取 User-Agent、添加 IP代理池以及Cookies池 )
上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(三) -- 数据的持久化--使用MongoDB存储爬取的数据 最近项目有些忙,很多需求紧急上线,所以一直没能完善< 使用 ...
最新文章
- 2022-2028年中国橡胶带行业市场运营格局及未来前景分析报告
- Monitor CodeForces - 846D ——二维前缀和
- C++ 第五课:C/C++ 数据类型
- anguarjs 上传图片预览_MIUI12 20.10.29更新,新版「模糊预览图」
- 一行命令同时修改maven项目中多个mudule的版本号
- go mysql 查询数据_MySQL常用语句之查询数据-Go语言中文社区
- python property函数_Python内置函数property()如何使用
- word是不是计算机硬件,word及excel模拟试题1_计算机硬件及应用_IT/计算机_资料
- ubuntu环境下redis的安装配置
- 控制台应用程序转成MFC程序错误—OcrRec.exe触发一个触点,原因可能是堆被破坏
- java numberformat_java 之 格式化输出 NumberFormat
- 罗永浩承认鸟巢发布会不成功,还说苹果把大家都带歪了,你怎么看?
- IBM InfoSphere Optim数据增长解决方案:在Optim归档文件上启用安全性
- linux网卡驱动内核文件目录,LINUX内核升级更新网卡驱动
- 泛函分析——内积空间定义的概念
- 一个可以为你的任何创作加速的神软件
- dedecms 栏目绑定二级域名
- CCPC-Wannafly Comet OJ 夏季欢乐赛(2019)部分题解
- 一分钟学会看k线图_教你一分钟怎样学会看k线图(纯干货)
- 线框图和原型图的区别_有用的线框图和原型制作工具–综述