python爬取北京政务公开惠民地图信息
爬取过程分析:
1、分析网页可知上面的数据为json格式
2、找到json数据的请求地址https://map.beijing.gov.cn/api/place_list_for_category.json?categoryId=
3、设计数据库的表
4、将爬取的数据存到mysql数据库中
具体代码如下:
# ---- v1 scraper: module-level setup ----------------------------------------
import pprint  # kept for ad-hoc debugging of the JSON responses
import requests
from DBcm import UseDatabase
from bs4 import BeautifulSoup  # unused by v1 itself; the v2 detail scraper needs it

# Every category listing comes from this one JSON endpoint; a categoryId
# string is appended per request.
base_url = 'https://map.beijing.gov.cn/api/place_list_for_category.json?categoryId='

# Desktop-browser User-Agent sent with every request — presumably so the API
# does not reject the default `requests` UA (NOTE(review): confirm it is needed).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# MySQL connection settings consumed by DBcm.UseDatabase.
dbconfig = {'host': '127.0.0.1',
            'user': 'vsearch',
            'password': 'vsearchpasswd',
            'database': 'vsearchlogdb', }

# BUG FIX: the original executed `resp = requests.get(url_bbyey, headers=headers).json()`
# here, but `url_bbyey` is never defined anywhere, so the module crashed with
# NameError before any scraping ran (the v2 script below has the same line
# commented out, confirming it was leftover debugging). The stray call is removed.
# pprint.pprint(resp)def get_jxzy():# idList = ['3','2','15','9','12','8','28','16','30','31','29','32','33','34','17','35']idList = ['gbyey','zyjyxx','tsjyxx','gdyx','xx','zx']for id in idList:print(id)url = base_url+idprint(url)resp = requests.get(url,headers=headers).json()for i in resp:print(i['categoryId'])with UseDatabase(dbconfig) as cursor:_SQL = """insert into JXZX(placename, addr, postcode, tel,categoryid,reginid)values(%s, %s, %s, %s,%s,%s)"""cursor.execute(_SQL, (i['placeName'],i['addr'],i['postcode'],i['tel'],i['categoryId'],i['regionId']))def get_ylws():idList = ['ekzlfwjg','zybzdjg','myghyfjzmz','kqymjzmz','cxd','ejyljg','sjyljg','zcjg']for id in idList:print(id)url = base_url+idresp = requests.get(url,headers=headers).json()# pprint.pprint(resp)for i in resp:print(i['categoryId'])with UseDatabase(dbconfig) as cursor:_SQL = """insert into ylwsjg(placename, addr,officehours, postcode, tel,categoryid,regionid)values(%s, %s, %s, %s,%s,%s,%s)"""cursor.execute(_SQL, (i['placeName'],i['addr'],i['officeHours'],i['postcode'],i['tel'],i['categoryId'],i['regionId']))get_jxzy()get_ylws()
数据库代码如下:
使用 import mysql.connector 前需要先安装 MySQL 数据库驱动引擎（mysql-connector-python）
import mysql.connectorclass UseDatabase:def __init__(self, config: dict):self.configuration = configdef __enter__(self) -> 'cursor':self.conn = mysql.connector.connect(**self.configuration)self.cursor = self.conn.cursor()return self.cursordef __exit__(self, exc_type, exc_value, exc_traceback):self.conn.commit()self.cursor.close()self.conn.close()
8.7 更新：爬取的信息更为详细
# ---- v2 scraper: module-level setup (same values as v1) --------------------
import pprint

import requests
from bs4 import BeautifulSoup
from DBcm import UseDatabase

# JSON listing endpoint; append a categoryId to fetch that category's places.
base_url = 'https://map.beijing.gov.cn/api/place_list_for_category.json?categoryId='

# Browser-style User-Agent attached to every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

# Credentials handed to DBcm.UseDatabase.
dbconfig = {
    'host': '127.0.0.1',
    'user': 'vsearch',
    'password': 'vsearchpasswd',
    'database': 'vsearchlogdb',
}

# resp = requests.get(url_bbyey,headers=headers).json()
# pprint.pprint(resp)


def switch_case(value):
    """Map a numeric regionId to its Beijing district name ('null' if unknown)."""
    switcher = {
        3: "东城区", 2: "西城区", 15: "朝阳区", 9: '海淀区', 12: '丰台区',
        8: '石景山区', 28: '门头沟区', 16: '房山区', 30: '通州区',
        31: '顺义区', 29: '大兴区', 32: '昌平区', 33: '平谷区',
        34: '怀柔区', 17: '密云区', 35: '延庆区',
    }
    return switcher.get(value, 'null')
# print(switch_case(3))


def get_jxzy():
    """Scrape kindergartens / vocational / special-ed schools / universities.

    For each place the detail page is fetched and the introduction text (简介,
    4th <td> of table.nxq_ctab) is extracted; places whose page lacks that cell
    are stored with introduce='null'.

    BUG FIX: the original's except-branch inserted into `JXZX` while the
    try-branch used `jxzx` — on a case-sensitive MySQL server those are two
    different tables. The broad try also wrapped the DB insert, so a database
    error was misread as "no introduction" and triggered a second insert.
    Parsing and inserting are now separated; both paths insert into `jxzx`.
    """
    # idList = ['3','2','15','9','12','8','28','16','30','31','29','32','33','34','17','35']
    idList = ['gbyey', 'zyjyxx', 'tsjyxx', 'gdyx']
    for category_id in idList:  # renamed from `id` (shadowed the builtin)
        print(category_id)
        url = base_url + category_id
        print(url)
        resp = requests.get(url, headers=headers).json()
        # print(resp)
        for i in resp:
            print(i['categoryId'])
            placeid = i['placeId']
            url_info = 'https://map.beijing.gov.cn/place?placeId=' + placeid + '&categoryId=' + category_id
            # print(url_info)
            cont = requests.get(url_info, headers=headers).text
            # print(cont)
            soup = BeautifulSoup(cont, 'lxml')
            try:
                # 获取简介 — introduction text, 4th cell of the detail table.
                infos = soup.find('table', {'class': 'nxq_ctab'}).find_all('td')[3].get_text()
            except Exception as e:
                print(e)
                infos = 'null'  # 有的简介为空 — some places have no introduction
            with UseDatabase(dbconfig) as cursor:
                _SQL = """insert into jxzx(placename, addr, postcode, tel,introduce,categoryid,reginid)values(%s, %s, %s, %s,%s,%s,%s)"""
                cursor.execute(_SQL, (i['placeName'],
                                      i['addr'],
                                      i['postcode'],
                                      i['tel'],
                                      infos,
                                      i['categoryId'],
                                      switch_case(int(i['regionId']))))


def get_jxzyxxzx():
    """Scrape primary (xx) and middle (zx) schools with extended detail fields.

    The detail table's normal layout is td[1]=曾用名称 (former name),
    td[2]=创办日期 (founding date), td[5]=是否有寄宿 (boarding),
    td[6]=学校种类 (school type), td[7]=办学规模和主要特色 (description).
    Pages without the former-name cell are parsed from the table's end with
    negative indices, storing former name as 'null'.

    FIX: `find_all('td')` is now run once per page instead of once per field,
    and the DB insert sits outside the try so a database error is no longer
    misdiagnosed as the short-table page variant.
    """
    idList = ['xx', 'zx']
    for category_id in idList:
        print(category_id)
        url = base_url + category_id
        print(url)
        resp = requests.get(url, headers=headers).json()
        # print(resp)
        for i in resp:
            print(i['categoryId'])
            placeid = i['placeId']
            url_info = 'https://map.beijing.gov.cn/place?placeId=' + placeid + '&categoryId=' + category_id
            print(url_info)
            cont = requests.get(url_info, headers=headers).text
            # print(cont)
            soup = BeautifulSoup(cont, 'lxml')
            infos = soup.find('table', {'class': 'nxq_ctab'})  # 获取简介
            cells = infos.find_all('td')
            try:
                ever_name = cells[1].get_text()  # 曾用名称
                cbrq = cells[2].get_text()       # 本校址创办日期
                sfjs = cells[5].get_text()       # 是否有寄宿
                xxlb = cells[6].get_text()       # 学校种类
                desc = cells[7].get_text()       # 办学规模和主要特色
            except Exception as e:
                print(e)
                # Shorter table variant (no former-name cell): index from the end.
                ever_name = 'null'
                cbrq = cells[-6].get_text()      # 本校址创办日期
                sfjs = cells[-3].get_text()      # 是否有寄宿
                xxlb = cells[-2].get_text()      # 学校种类
                desc = cells[-1].get_text()      # 办学规模和主要特色
            with UseDatabase(dbconfig) as cursor:
                _SQL = """insert into jxzy2(学校名称,办公地址,曾用名称,校址创办日期,邮编,电话,类型,区域,是否有寄宿,学校类别,办学规模主要特色)values(%s, %s, %s, %s,%s,%s,%s,%s,%s,%s,%s)"""
                cursor.execute(_SQL, (i['placeName'],
                                      i['addr'],
                                      ever_name,
                                      cbrq,
                                      i['postcode'],
                                      i['tel'],
                                      i['categoryId'],
                                      switch_case(int(i['regionId'])),
                                      sfjs,
                                      xxlb,
                                      desc))


def get_ylws():
    """Scrape medical/health facilities into ylwsjg (listing data only)."""
    idList = ['ekzlfwjg', 'zybzdjg', 'myghyfjzmz', 'kqymjzmz', 'cxd',
              'ejyljg', 'sjyljg', 'zcjg']
    for category_id in idList:
        print(category_id)
        url = base_url + category_id
        resp = requests.get(url, headers=headers).json()
        # pprint.pprint(resp)
        with UseDatabase(dbconfig) as cursor:  # hoisted: one connection per category
            _SQL = """insert into ylwsjg(placename, addr,officehours, postcode, tel,categoryid,regionid)values(%s, %s, %s, %s,%s,%s,%s)"""
            for i in resp:
                print(i['categoryId'])
                cursor.execute(_SQL, (i['placeName'],
                                      i['addr'],
                                      i['officeHours'],
                                      i['postcode'],
                                      i['tel'],
                                      i['categoryId'],
                                      i['regionId']))


# get_jxzy()
#
# get_ylws()
get_jxzyxxzx()
python爬取北京政务公开惠民地图信息相关推荐
- Python 爬取北京二手房数据,分析北漂族买得起房吗?(附完整源码)
来源:CSDN 本文约3500字,建议阅读9分钟. 本文根据Python爬取了赶集网北京二手房数据,R对爬取的二手房房价做线性回归分析,适合刚刚接触Python&R的同学们学习参考. 房价高是 ...
- Python爬取北京2.3万条租房信息,发现快租不起房子了!
1.概述 北上广深作为打工人最多的超一线城市,大部分都是租房生活着.自如作为目前第三方租房平台,应该算是该行业的龙头.但是最近蛋壳的暴雷,我们不得不更加警觉.那么自如都有多少open状态的房源呢,这些 ...
- Python爬取北京地区短租房信息
本文利用Requests和BeautifulSoup第三方库,爬取小猪短租网北京地区短租房的信息.代码参考<从零开始学Python网络爬虫>. 完整代码如下: from bs4 impor ...
- 爬取深圳市政府政务公开所有文件
文章目录 一.前言 二.获取文件URL列表 1.获取各类文件的URL 2.获取每类文件的总页数 3.获取每个网页上的文件URL 三.爬取文件内容 1.爬取文件的基本信息和内容 2.下载相应的附件 四. ...
- python爬取北京租房信息
租房助手 发现官网的筛选方式不能满足自己的需求,所以爬取相关网站制作出现在的东西来 效果预览-> 在线预览 下面进行详细分析 一.首先爬取起始地和终点地的路线及沿途地铁站名称 1.爬取8684. ...
- python爬取电子书_python爬取计算机电子书(源码移步github)
摘要:今年第一个项目,python爬取网络上公开的计算机电子书近8000本,在此基础上简要分析计算机专业的发展变迁.部分整理好的书籍下载链接见文末.代码链接见文末. 计算机诞生以来不到100年,学术的 ...
- python xpath循环_Python爬虫 爬取北京二手房数据
点击蓝字"python教程"关注我们哟! 前言 Python现在非常火,语法简单而且功能强大,很多同学都想学Python!所以小的给各位看官们准备了高价值Python学习视频教程及 ...
- 爬取北京二手房数据信息(python)
数据爬取 爬取北京二手房数据信息python代码: # coding : utf-8from requests import get from bs4 import BeautifulSoup as ...
- python爬取贝壳找房之北京二手房源信息
所用库 requests xpath解析库 multiprocessing多进程 pandas库用于保存csv文件 实战背景 本文首发于:python爬取贝壳找房之北京二手房源信息 主要是为了做北京二 ...
- 疫情过去女朋友想去重庆玩,python批量爬取小猪短租重庆民宿信息
疫情过去女朋友想去重庆玩,python批量爬取小猪短租重庆民宿信息 随着时间的流逝,在中国共产党的领导,全国人民的共同努力下,疫情逐渐受到了控制,逐渐好转,复工,开学有望.最近在和女朋友的闲聊当中得知 ...
最新文章
- 连接时会提示oracle initialization or shutdown in progress
- html给文字加动态效果,20种配合场景的CSS3鼠标滑过文字动画特效
- Scala集合:ListBuffer可变集合的head/tail/last/init方法
- 每日一linux命令
- Repeater控件绑定数据、分页、数据操作,最佳代码
- python之gevent模块实现协程
- Jedis连接Redis读写基本操作
- tomcat、netty以及nodejs的helloworld性能对比
- 天翼云搭建socks5和搭建http
- matlab非线性规划
- 车子刹车油管ABS油管被剪了好几刀,我还有救吗?
- Hbase、elasticsearch整合中jar包冲突
- 爱尔兰房产泡沫破灭带给我们…
- BZOJ 4316: 小C的独立集 仙人掌 + 树形DP
- 云监控介绍 - Amazon CloudWatch
- 哪里有kitti数据集的百度云资源
- flappy+bird+android源代码,Flappy Bird(安卓版)逆向分析(一)
- 如何快速更改电脑ip地址【图文教程】?
- opencv实战——图像矫正算法深入探讨
- 医学图像分割 (MICCAI 2019)
热门文章
- 云计算与大数据” 研讨会:迎来新的科学价值
- 通用稳定DNS,国际DNS,国内DNS,公共DNS
- 微信开发------微信公众号新老账户粉丝迁移问题
- Android 项目上线流程总结
- win10怎么设置无线网连接到服务器,win10wifi自动连接在哪里设置_win10设置自动连接wifi的方法...
- Netd 服务的 netd 套接字创建
- mis是商科还是计算机专业,MIS是什么?管理信息系统MIS和计算机科学CS有什么区别?...
- MySQL和Navicat怎么连接
- iOS用代码判断设备是否越狱
- 前路钉板系统在重建胸腰段稳定性中应用 [已发表]