利用Scrapy框架爬取segmentfault上的博客文章，保存到数据库，然后发送到自己的邮箱

先显示部分源码：

#coding:utf-8

#! /usr/bin/python

'''
Author fiz
Date:2016-03-30
Segement Blog 内容爬去
'''

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import FormRequest, Request
from scrapy.selector import Selector
from Scrapy_demo.items import *
from scrapy import log
from scrapy.utils.response import get_base_url
import urlparse

class SegeblogSpider(CrawlSpider):
    '''
    Crawl the paginated list of Python blog posts on segmentfault.com.

    Starts from page 1 of the Python tag's blog list, follows every
    pagination link matched by ``rules`` and turns each listed post into a
    ``CnblogsItem`` via ``parse_item1``.
    '''

    name = 'blog'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/t/python/blogs?page=1']
    # Raw string so the escaped "?" reaches the regex engine untouched. The
    # quantifier applies to digits: the previous "={1,}" repeated the "="
    # sign instead of matching the page number.
    rules = [
        Rule(sle(allow=(r't/python/blogs\?page=\d+',)),
             follow=True, callback='parse_item1'),
    ]

    def parse_item1(self, response):
        '''
        Build one item per post listed on a blog-list page.

        :param response: the downloaded list page.
        :return: list of ``CnblogsItem`` objects with ``title``, ``link``,
            ``listUrl`` and ``desc`` filled in.
        '''
        sel = Selector(response)
        base_url = get_base_url(response)
        items = []
        # Each <section> inside the tab content is handled as one post entry.
        for section in sel.css('div.tab-content').css('section'):
            title = section.css('a').xpath('text()').extract()
            href = section.css('h2.title').css('a').xpath('@href').extract()
            desc = section.css('div.views ').xpath('text()').extract()
            if not (title and href and desc):
                # Skip entries without the expected markup; indexing [0] on
                # an empty result would raise IndexError and lose the page.
                continue
            item = CnblogsItem()
            item['title'] = title[0]
            # hrefs are concatenated onto the site origin to form the link.
            item['link'] = 'https://segmentfault.com' + href[0]
            # Remember which list page this entry was found on.
            item['listUrl'] = base_url
            item['desc'] = desc[0]
            items.append(item)
        return items

添加Pipeline保存到数据库

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
#
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors

'''
保存Json格式
'''

class JsonWithEncodingCnblogsPipeline(object):
    '''
    Item pipeline that writes every scraped item to ``Segement.json``.

    The file is opened for writing as UTF-8 when the pipeline is created
    and receives one JSON object per line, with non-ASCII text left
    unescaped.
    '''

    def __init__(self):
        self.file = codecs.open('Segement.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        '''
        Serialise ``item`` as one JSON line and pass it on unchanged.

        :param item: the scraped item (anything ``dict()`` accepts).
        :param spider: the spider that produced the item (unused).
        :return: the same ``item``.
        '''
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        '''
        Close the output file.

        Scrapy calls ``close_spider`` on item pipelines when the spider
        finishes; the former ``spider_closed`` name was never connected to
        a signal, so the file was never closed explicitly.

        :param spider: the spider being closed (unused).
        '''
        self.file.close()

    # Backward-compatible alias for code that still calls the old name.
    spider_closed = close_spider

class MySQLStoreCnblogsPipeline(object):
    '''
    Item pipeline that inserts or updates scraped items in ``cnblogsinfo``.

    Rows are keyed by the MD5 of the item's link, so scraping the same
    link again updates the existing row instead of adding a duplicate.
    Database work runs through a Twisted ``adbapi`` connection pool.
    '''

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        '''
        Build the pipeline from the project settings.

        :param settings: mapping providing ``MYSQL_HOST``,
            ``MYSQL_DBNAME``, ``MYSQL_USER`` and ``MYSQL_PASSWD``.
        :return: a pipeline instance bound to a new connection pool.
        '''
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        '''
        Schedule the upsert for ``item`` on the connection pool.

        :param item: the scraped item.
        :param spider: the spider that produced the item.
        :return: a Deferred that fires with ``item`` whether or not the
            database write succeeded.
        '''
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        # Always hand the item to the next pipeline stage.
        d.addBoth(lambda _: item)
        return d

    def _do_upinsert(self, conn, item, spider):
        '''
        Update the row for ``item`` if its link is known, else insert it.

        :param conn: cursor-like object supplied by ``runInteraction``.
        :param item: item with ``title``, ``desc``, ``link``, ``listUrl``.
        :param spider: the spider that produced the item (unused).
        '''
        linkmd5id = self._get_linkmd5id(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
        conn.execute(
            """select 1 from cnblogsinfo where linkmd5id = %s""",
            (linkmd5id, ))
        ret = conn.fetchone()
        if ret:
            conn.execute(
                """update cnblogsinfo set title = %s, description = %s, link = %s, listUrl = %s, updated = %s where linkmd5id = %s""",
                (item['title'], item['desc'], item['link'],
                 item['listUrl'], now, linkmd5id))
        else:
            conn.execute(
                """insert into cnblogsinfo(linkmd5id, title, description, link, listUrl, updated)
                values(%s, %s, %s, %s, %s, %s)""",
                (linkmd5id, item['title'], item['desc'], item['link'],
                 item['listUrl'], now))

    def _get_linkmd5id(self, item):
        '''
        Return the hex MD5 of the item's link, used as the row key.

        :param item: mapping with a ``link`` entry (text or bytes).
        :return: 32-character hexadecimal digest.
        '''
        link = item['link']
        # md5() needs bytes; encode text links explicitly so non-ASCII
        # URLs hash instead of raising.
        if not isinstance(link, bytes):
            link = link.encode('utf-8')
        return md5(link).hexdigest()

    def _handle_error(self, failure, item, spider):
        '''
        Log a failed database interaction instead of silently dropping it.

        :param failure: the error passed in by the Deferred's errback.
        :param item: the item whose write failed.
        :param spider: the spider that produced the item (unused).
        '''
        import logging  # local import: the module header does not import it

        logging.getLogger(__name__).error(
            "Failed to store item %r: %s", item, failure)

邮件发送代码：

#coding:utf-8

'''读取mysql中的数据然后发送到邮箱实现浏览的'''

import MySQLdb
import MySQLdb.cursors
import smtplib
from email.header import Header
from email.mime.text import MIMEText
import sys
reload(sys)

# Seed for the e-mail body; db_operate() appends one HTML fragment per row.
# (A preceding assignment of a descriptive title was dropped: it was
# overwritten by this line before anything could read it.)
msg_content = 'ok'

def test(content):
    '''
    Send ``content`` as an HTML e-mail through the 163.com SMTP server.

    :param content: HTML body of the message.
    :return: None
    '''
    from email.utils import formataddr  # local import: only used here

    # NOTE(review): the account and password are hard-coded below; they
    # should be moved to configuration or environment variables.
    sender = '18818261892@163.com'
    recipient = '1848406889@qq.com'

    msg = MIMEText(content, 'html', 'utf-8')
    msg['From'] = '18818261892@163.com <18818261892@163.com>'
    msg['Subject'] = Header(u'text', 'utf8').encode()
    # Encode only the non-ASCII display name so the address itself stays
    # readable in the header.
    msg['To'] = formataddr((Header(u'飞轮海', 'utf-8').encode(), recipient))

    server = smtplib.SMTP('smtp.163.com', 25)
    try:
        server.login(sender, 'LBQ139196')
        server.sendmail(sender, [recipient], msg.as_string())
    finally:
        # Release the SMTP connection even when login or sending fails.
        server.quit()
    print('finished is ok!ooooo')

def db_operate():
    '''
    Read every row of ``cnblogsinfo`` and mail the rows as one HTML message.

    Extends the module-level ``msg_content`` with one HTML fragment per
    row, prints the result and passes it to ``test()`` for sending. MySQL
    errors are printed rather than raised.

    :return: None
    '''
    global msg_content
    conn = None
    cur = None
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='1234',
                               db='cnblogsdb', port=3306, charset="utf8")
        cur = conn.cursor()
        cur.execute('select * from cnblogsinfo')
        rows = cur.fetchall()

        # Collect fragments and join once instead of growing the string
        # inside the loop.
        parts = [msg_content]
        for row in rows:
            # NOTE(review): row[1] / row[4] depend on the table's column
            # order, which is not visible here -- presumably title and a
            # URL; confirm against the schema.
            parts.append(
                ('<html><body><h1> %s </h1><p>send by <a href= %s>Python</a>...</p></body></html>')
                % (row[1], row[4],))
            parts.append(str(row[4]) + 'ok')
        msg_content = ''.join(parts)

        # NOTE(review): the flattened original does not show where the loop
        # ended; sending a single mail after the loop is assumed.
        print(msg_content)
        test(msg_content)
        print('finished is ok!ooooo')
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Close cursor and connection on every path, including failures.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

# Run the mail job only when executed as a script. The guard must test
# ``__name__``; a bare ``name`` is undefined and raises NameError.
if __name__ == '__main__':
    db_operate()

# msg_content ='ok'
# try:
# conn=MySQLdb.connect(host='localhost',user='root',passwd='1234',db='cnblogsdb',port=3306,charset = "utf8")
# cur=conn.cursor()
# i = cur.execute('select * from cnblogsinfo')#945条数据mscontet 需要使用945
# rows = cur.fetchall()
# for row in rows:
# print "%s, %s, %s, %s" % (row[0], row[1], row[2], row[3])
# print row[1]
# msg_content +=('<html><body><h1> %s </h1> send by Python... </body></html>') %( row[1],row[4],)
# msg_content +=('<html><body><h1> segmentfault</h1> send by Python... </body></html>') %( row[1],row[4],)
# msg_content = msg_content+str(row[4])+'ok'
# print msg_content
# msg = MIMEText(msg_content, 'html', 'utf-8')
# server = smtplib.SMTP('smtp.163.com', 25)
# server.login('18818261892@163.com', 'LBQ139196')
# msg['From'] = '18818261892@163.com <18818261892@163.com>'
# msg['Subject'] = Header(u'text', 'utf8').encode()
# msg['To'] = u'飞轮海 <1848406889@qq.com>'
# server.sendmail('18818261892@163.com', ['1848406889@qq.com'], msg.as_string())
# print 'finished is ok!ooooo'
# cur.close()
# conn.close()
# except MySQLdb.Error,e:
# print "Mysql Error %d: %s" % (e.args[0], e.args[1])

结果显示：

源码分享github

python爬去segementfault上的博客文章相关推荐

python小爬虫(爬取职位信息和博客文章信息)
1.python爬取招聘信息简单爬取智联招聘职位信息(仅供学习) # !/usr/bin/env python # -*-coding:utf-8-*- """ @Au ...
一文搞定scrapy爬取众多知名技术博客文章保存到本地数据库，包含：cnblog、csdn、51cto、itpub、jobbole、oschina等
本文旨在通过爬取一系列博客网站技术文章的实践,介绍一下scrapy这个python语言中强大的整站爬虫框架的使用.各位童鞋可不要用来干坏事哦,这些技术博客平台也是为了让我们大家更方便的交流.学习.提高 ...
python使用BeautifulSoup获取csdn单个博客文章字数
之前一直想统计每年博客大致写了多少字数,但是csdn中好像只有文章数统计,没有字数统计(或者是一直没有发现相关的功能).最近学习python的网络相关模块时,python关于网页读取及抓取网页内容 ...
python 论坛爬虫代码_python博客文章爬虫实现代码
例子,python网页爬虫实例,实现博客文章抓取的python爬虫. 代码示例: #!/usr/bin/python #-*-coding:utf-8-*- # JCrawler # Author: ...
2018.7.10 个人博客文章=利用ORM创建分类和ORM的内置函数
昨天的注册收尾工作其实就差了和MySql联系起来的部分,这部分很简单,首先要做的就是保存用户通过from传送过来的头像文件: """ 保存头像文件 "&quo ...
做百度快照入门_如何为您的博客文章拍摄屏幕快照（入门指南）
做百度快照入门 Do you want to add a screenshot to your blog post, but not sure of the best way to do it? Sc ...
Python爬虫自学与实战，爬一下自己的博客文章
文章目录 1. 什么是爬虫 2. 学习爬虫的必备知识 3. 环境准备 4. 爬虫的第一步,获取网页的HTML内容 4.1 GET 4.2 POST 5. 使用BeautifulSoup模块来从HTML ...
【爬虫】利用Python爬虫爬取小麦苗itpub博客的所有文章的连接地址并写入Excel中（2）...
[爬虫]利用Python爬虫爬取小麦苗itpub博客的所有文章的连接地址并写入Excel中(2) 第一篇( http://blog.itpub.net/26736162/viewspace-22865 ...
使用Python爬取CSDN历史博客文章列表，并生成目录
使用Python爬取CSDN历史博客文章列表,并生成目录这篇博客将介绍如何使用Python爬取CSDN历史博客文章列表,并生成目录. 2020年 2020年04月 cv2.threshold() 阈 ...

python爬去segementfault上的博客文章

coding:utf-8

! /usr/bin/python

-- coding: utf-8 --

Define your item pipelines here

Don't forget to add your pipeline to the ITEM_PIPELINES setting

See: http://doc.scrapy.org/en/latest/topics/i...

coding:utf-8

msg_content = u'segmentfault有关Python文章爬去'

msg_content ='ok'

try:

conn=MySQLdb.connect(host='localhost',user='root',passwd='1234',db='cnblogsdb',port=3306,charset = "utf8")

cur=conn.cursor()

i = cur.execute('select * from cnblogsinfo')#945条数据mscontet 需要使用945

rows = cur.fetchall()

for row in rows:

print "%s, %s, %s, %s" % (row[0], row[1], row[2], row[3])

print row[1]

msg_content +=('<html><body><h1> %s </h1>

msg_content +=('<html><body><h1> segmentfault</h1>

msg_content = msg_content+str(row[4])+'ok'

print msg_content

msg = MIMEText(msg_content, 'html', 'utf-8')

server = smtplib.SMTP('smtp.163.com', 25)

server.login('18818261892@163.com', 'LBQ139196')

msg['From'] = '18818261892@163.com <18818261892@163.com>'

msg['Subject'] = Header(u'text', 'utf8').encode()

msg['To'] = u'飞轮海 <1848406889@qq.com>'

server.sendmail('18818261892@163.com', ['1848406889@qq.com'], msg.as_string())

print 'finished is ok!ooooo'

cur.close()

conn.close()

except MySQLdb.Error,e:

print "Mysql Error %d: %s" % (e.args[0], e.args[1])

python爬去segementfault上的博客文章相关推荐

最新文章

热门文章