Python爬虫-爬取新闻网站,将数据对比去重插入数据库
一、近期想实现一个推荐系统的API,目前正在筹备中,实现了一个新闻网站,用来做测试(大家可以看我以前的文章)今天分享的就是为我的新闻网站提供数据的爬虫代码
- 先看效果
检测到重复数据时程序是不会插入到数据库中的
二、实现思路
- 获取数据库中已经存在的数据,提取末尾的30条
- 每次爬取数据只爬页面前30条数据
- 在爬取过程中做一个简单的对比(title)
三、源码
import datetime
import json
import time
import requests
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
import pymysql


class Spider36Kr(object):
    """Crawler for 36kr.com news channels.

    Fetches the latest articles from five category listing pages,
    deduplicates them against titles already stored in MySQL, and
    inserts the new rows into the `new` table of the `news_data`
    database.
    """

    def __init__(self):
        # Single MySQL connection shared by all reads and inserts.
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            passwd='963369',
            db='news_data',
            port=3306,
            charset='utf8')
        self.index = self.get_index()  # last id currently in the table
        self.add_index = list()        # ids added during this run
        # One listing URL per category; cate_id = position + 1.
        self.url_list = [
            "https://36kr.com/information/technology",
            "https://36kr.com/information/travel",
            "https://36kr.com/information/happy_life",
            "https://36kr.com/information/real_estate",
            "https://36kr.com/information/web_zhichang"]
        # NOTE(review): hard-coded session cookies from a past browser
        # session — presumably expired by now; refresh before reuse.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": "acw_tc=2760823515711283155342601ebecc825e95a1d72e78ea3a1a457be0ef9d9d; kr_stat_uuid=kw8ZH26185472; krnewsfrontss=32b5a2ca9ace80d37d4885b144118ef8; M-XSRF-TOKEN=f204eeea5347017f38009858d2ee0eafb2894283c8ba69c228e3837114675d0d; M-XSRF-TOKEN.sig=GQU3yBNWi1oqskE4i2J0jyRpH8BpH13GLSsJ0sqFrDI; Hm_lvt_713123c60a0e86982326bae1a51083e1=1572744686,1572749871,1572825196,1572829918; Hm_lvt_1684191ccae0314c6254306a8333d090=1572744686,1572749871,1572825196,1572829918; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22kw8ZH26185472%22%2C%22%24device_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22https%3A%2F%2F36kr.com%2Finformation%2Ftechnology%22%2C%22%24latest_referrer_host%22%3A%2236kr.com%22%2C%22%24latest_traffic_source_type%22%3A%22%E5%BC%95%E8%8D%90%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%2C%22first_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%7D; Hm_lpvt_1684191ccae0314c6254306a8333d090=1572830180; Hm_lpvt_713123c60a0e86982326bae1a51083e1=1572830180; SERVERID=6754aaff36cb16c614a357bbc08228ea|1572830181|1572829919",
        }
        self.deep_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": "acw_tc=2760823515711283155342601ebecc825e95a1d72e78ea3a1a457be0ef9d9d; kr_stat_uuid=kw8ZH26185472; krnewsfrontss=32b5a2ca9ace80d37d4885b144118ef8; M-XSRF-TOKEN=f204eeea5347017f38009858d2ee0eafb2894283c8ba69c228e3837114675d0d; M-XSRF-TOKEN.sig=GQU3yBNWi1oqskE4i2J0jyRpH8BpH13GLSsJ0sqFrDI; Hm_lvt_713123c60a0e86982326bae1a51083e1=1572744686,1572749871,1572825196,1572829918; Hm_lvt_1684191ccae0314c6254306a8333d090=1572744686,1572749871,1572825196,1572829918; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22kw8ZH26185472%22%2C%22%24device_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%2C%22first_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%7D; Hm_lpvt_713123c60a0e86982326bae1a51083e1=1572830477; Hm_lpvt_1684191ccae0314c6254306a8333d090=1572830477; SERVERID=6754aaff36cb16c614a357bbc08228ea|1572830478|1572829919",
            "authority": "36kr.com"}

    def __del__(self):
        # Best-effort cleanup; `conn` may be missing if __init__ failed
        # before the connection was established.
        print("关闭数据库链接...")
        if hasattr(self, "conn"):
            self.conn.close()

    def run(self):
        """Crawl every category page once, inserting only new articles."""
        for x, url in enumerate(self.url_list):
            print("正在爬取:", url, x + 1)
            data = self.request_page(url, x + 1)  # cate_id is 1-based
            try:
                self.insert_data(data)
            except pymysql.err.InternalError:
                print("插入数据中出现异常")
            print("进入20秒睡眠...")
            time.sleep(20)  # throttle between categories

    def get_index(self):
        """Return the id of the last row of the `new` table.

        NOTE(review): relies on `select id from new` returning rows in
        insertion order — `select max(id)` would be cheaper and safer.
        """
        sql = 'select id from new'
        return pd.read_sql(sql, self.conn).tail(1)["id"].tolist()[0]

    def spider_one(self, num):
        """Crawl a single category page.

        :param num: 1-based category number (index into self.url_list).
        """
        # Reuses self.url_list instead of duplicating the URL literals.
        data = self.request_page(self.url_list[num - 1], num)
        self.insert_data(data)

    def request_page(self, temp_url, cate_id):
        """Fetch one category listing and return its new articles.

        :param temp_url: category listing URL.
        :param cate_id: 1-based category id stored with each row.
        :return: DataFrame of deduplicated rows ready for insert_data().
        """
        # BUG FIX: the original passed the header dict positionally,
        # which requests interprets as query params — the headers and
        # cookies were never actually sent.
        response = requests.get(temp_url, headers=self.headers)
        content = response.content.decode()
        html = etree.HTML(content)
        # The article list is embedded as JSON in an inline <script>.
        temp_data = None
        for data in html.xpath("//script"):
            try:
                data = str(data.text).split("window.initialState=")[1]
                temp_data = json.loads(data)
            except IndexError:
                pass
        columns = ["new_title", "new_source", "new_time", "digest",
                   "index_image_url", "new_content", "new_seenum",
                   "new_disnum", "new_cate_id"]
        if temp_data is None:
            # Page layout changed or the request failed: nothing to do
            # (the original would crash with TypeError here).
            return pd.DataFrame([], columns=columns)
        info_list = temp_data["information"]["informationList"]
        # Hoisted out of the loop: one DB query per page instead of one
        # per article (no inserts happen during this loop, so the known
        # title set cannot change mid-page).
        known_titles = self.sql_title_list(cate_id)
        data_all = list()
        for x in range(len(info_list)):
            try:
                item = info_list[x]
                new_title = item["title"]
                index_image_url = item["images"][0]
                new_time = datetime.datetime.now().strftime('%Y-%m-%d')
                digest = item["summary"]
                url = "https://36kr.com/p/" + str(item["entity_id"])
                if str(new_title) not in known_titles:
                    # Fetch the full article body only for new titles;
                    # the original downloaded it for duplicates too.
                    new_content = self.deep_spider(url)
                    data_all.append([new_title, "36kr", new_time, digest,
                                     index_image_url, new_content, 0, 0,
                                     cate_id])
                    print(x + 1, new_title, "提取完成...")
                else:
                    print(x + 1, new_title, "检测到重复数据...")
            except IndexError:
                # An article without images aborts the page, matching
                # the original control flow.
                print("数据提取完成...")
                break
        else:
            print("数据提取完成...")
        data_all.sort()  # original behavior: rows ordered by title
        return pd.DataFrame(data_all, columns=columns)

    def deep_spider(self, url):
        """Download an article page and return its paragraph markup.

        :param url: full article URL.
        :return: concatenation of all <p> tags except the last 11
                 (footer/boilerplate), as raw HTML strings.
        """
        # BUG FIX: headers passed by keyword (see request_page).
        response = requests.get(url, headers=self.deep_headers)
        content = response.content.decode()
        soup = BeautifulSoup(content, "lxml")
        paragraphs = soup.find_all('p')[0:-11]
        # str.join is linear; the original `+=` loop was quadratic.
        return "".join(str(p) for p in paragraphs)

    def sql_title_list(self, num):
        """Return the last 40 stored titles for one category.

        :param num: category id 1-5; any other value falls back to 3,
                    matching the original if/elif chain's else branch.
        :return: list of title strings used for duplicate detection.
        """
        cate_id = num if num in (1, 2, 3, 4, 5) else 3
        # One parameterized query replaces five copy-pasted SQL strings.
        df = pd.read_sql(
            "select * from new where new_cate_id = %s and new_source = '36kr';",
            self.conn, params=[cate_id])
        return df.tail(40)["new_title"].to_list()

    def insert_data(self, data):
        """Insert every row of `data` into the `new` table.

        :param data: DataFrame shaped by request_page().
        """
        cursor = self.conn.cursor()
        sql = "insert into new(new_time, index_image_url, new_title, new_source," \
              " new_seenum, new_disnum, digest, new_content, new_cate_id) values(%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        print(data.shape, "正在将数据插入数据库...")
        try:
            for row in data.itertuples(index=False):
                cursor.execute(sql, (row.new_time, row.index_image_url,
                                     row.new_title, row.new_source,
                                     row.new_seenum, row.new_disnum,
                                     row.digest, row.new_content,
                                     row.new_cate_id))
                # Commit per row so a late failure keeps earlier inserts,
                # as in the original.
                self.conn.commit()
                print(row.new_title, "插入成功...")
                self.index += 1
                self.add_index.append(self.index)
        finally:
            cursor.close()  # the original leaked the cursor on error


if __name__ == '__main__':
    spider = Spider36Kr()
    spider.run()
    # spider.spider_one(5)
注: 大家可以按照自身需求,自行修改(数据库等),仅用于学习交流
注: 新闻入库,在页面即可看到数据更新,项目做完后我会提供新闻网站和推荐系统的GitHub
Python爬虫-爬取新闻网站,将数据对比去重插入数据库相关推荐
- python爬虫爬取58网站数据_Python爬虫,爬取58租房数据 字体反爬
Python爬虫,爬取58租房数据 这俩天项目主管给了个爬虫任务,要爬取58同城上福州区域的租房房源信息.因为58的前端页面做了base64字体加密所以爬取比较费力,前前后后花了俩天才搞完. 项目演示 ...
- python爬取网页代码-python爬虫爬取网页所有数据详细教程
Python爬虫可通过查找一个或多个域的所有 URL 从 Web 收集数据.Python 有几个流行的网络爬虫库和框架.大家熟知的就是python爬取网页数据,对于没有编程技术的普通人来说,怎么才能快 ...
- python 爬虫 表格,python爬虫爬取网页表格数据
用python爬取网页表格数据,供大家参考,具体内容如下 from bs4 import BeautifulSoup import requests import csv import bs4 #检查 ...
- python爬虫 爬取斗鱼直播数据
from time import sleepimport requests from bs4 import BeautifulSoup# 爬取分类页面数据 #获取斗鱼分类页面数据 def get_di ...
- python爬虫爬取58网站数据_python实战学习笔记:爬取58同城平板电脑数据
学习爬虫一周后独立完成的第一个作业项目:爬取58同城平板电脑数据. 1.首先确定URL,并抓取详情页中需要的信息 首先我们确定好需要爬取的网页URL是:http://zhuanzhuan.58.com ...
- Python爬虫--爬取历史天气数据
写在前面:爬虫是老鼠屎在进入实验室后接触的第一个任务,当时刚刚接触代码的老鼠屎一下子迎来了地狱难度的爬微博签到数据.爬了一个多月毫无成果,所幸带我的师兄从未给我疾言厉色,他给与了我最大的包容与 ...
- Python爬虫爬取豆瓣书籍数据
" 阅读文本大概需要 5 分钟 此文首发于「brucepk」公众号,欢迎大家去关注. 炎热的夏天,酷暑难挡,难免会心烦意燥,睡前随手拿起枕边看过很多遍的「平凡的世界」.看书,会让躁动的心 ...
- Python 爬虫 | 爬取股票概念数据
这段时间写了行业板块.涨跌停板数据,获取这些数据的目的就是想通过处理.分析这些数据把整个大盘的情况反馈给我,让我可以用最少的时间进行复盘(说白了就是懒得看,果然懒才是程序员的第一生产力).这几天把这些 ...
- python爬虫——爬取图书馆借阅数据
环境 python3.6 BeautifulSoup4 -- v4.6 分析 由于图书管理系统很多人密码都未改,为默认密码,刚好最近在学爬虫,想爬出来试试手,并没有任何恶意,侵删. 本次主要包含以下内 ...
最新文章
- 敏捷团队中的QA由来
- 提莫隐身+机器人能钩_航空工业官宣全新歼20正式亮相,可以隐身的变形金刚
- Java机器学习库ML之九交叉验证法(Cross Validation)
- Linux 如何取进程运行时间,linux -- 获取进程执行时间
- poj 3275 Ranking the Cows 搜索
- Tomcat应用中post方式传参数长度限制
- PyTorch框架学习十一——网络层权值初始化
- pandas python groupby_python – 如何使用pandas groupby汇总多个列?
- 信号与系统 matlab实验报告,信号与系统 MATLAB实验报告
- iPhone 6S三大性能实测
- 【渝粤教育】国家开放大学2018年春季 0692-21T化工设备机械基础 参考试题
- Activiti工作流Day18-Crystalball流程仿真
- 毕业设计之校园一卡通管理系统的设计与实现
- 住在我隔壁储藏室的大学刚毕业的小夫妻
- 《活着》的优秀读后感范文3000字
- sklearn实现葡萄酒分类数据集训练朴素贝叶斯算法
- python需要多久才能学会_大家觉得自学python多久能学会?
- 钣金展开更自由了? SOLIDWORKS 2022 新功能
- vue按钮移上去显示提示_vue 鼠标移入移出事件(移入出现按钮),element-ui表格移入移出...
- 移动硬盘无法读取怎么修复?