python爬取国家统计局2019年行政区划分数据

建立数据表：
创建tab_citys mysql 数据表


-- Table that receives one row per administrative division found by the crawler.
DROP TABLE IF EXISTS `tab_citys`;
CREATE TABLE `tab_citys` (
  `id` int(11) NOT NULL AUTO_INCREMENT,       -- surrogate key
  `parent_id` int(11) DEFAULT NULL,           -- id of the enclosing division's row
  `city_name_zh` varchar(20) NOT NULL,        -- Chinese name
  `city_name_en` varchar(20) DEFAULT NULL,    -- English name
  `city_level` int(11) NOT NULL,              -- depth in the division hierarchy
  `city_code` char(12) NOT NULL,              -- division code
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=742037 DEFAULT CHARSET=utf8;

Python 脚本

#!/usr/bin/python
# -*- coding: UTF-8 -*-
#   功能：  获取省市县数据
#   版本：v1.1
import importlib
import sys
import pymysql
importlib.reload(sys)
import requests
import lxml.etree as etree
import os


class chinese_city():
    """Crawl the 2019 administrative-division code pages and store them in MySQL.

    The crawl walks five levels (province, city, county, town, village).
    Every row found is inserted into the ``tab_citys`` table and linked to
    its parent through the auto-increment id returned by the insert.
    """

    def __init__(self):
        """Set the entry URLs, open the MySQL connection and build the XPath table."""
        self.baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
        self.conn = pymysql.connect(host="localhost", port=8889, user="root",
                                    passwd="root", db="test", charset='utf8')
        self.cur = self.conn.cursor()
        # Level number -> XPath selecting the table rows of that level.
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]',
        }

    def __del__(self):
        """Close the cursor and the connection if they were created."""
        # getattr: if connecting failed in __init__, the attributes do not exist.
        cur = getattr(self, 'cur', None)
        if cur:
            cur.close()
        conn = getattr(self, 'conn', None)
        if conn:
            conn.close()

    def crawl_page(self, url):
        """Download ``url`` and return its text decoded as GBK.

        The request is attempted up to three times. Returns ``None`` when
        every attempt raises a ``requests`` exception.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        for _ in range(3):
            try:
                response = requests.get(url, headers=headers, timeout=20)
                response.encoding = 'gbk'  # decode the body as GBK
                return response.text
            except requests.exceptions.RequestException:
                print('超时' + url)
        return None

    def _load_tree(self, url):
        """Fetch ``url`` and return the parsed HTML tree, or ``None`` on failure."""
        html = self.crawl_page(url)
        if not html:
            # All retries failed (or the body was empty): nothing to parse.
            return None
        return etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))

    def parseProvince(self):
        """Parse the index page, insert every province and return them as dicts.

        Each dict has the keys ``url``, ``name``, ``code``, ``pid``, ``level``
        and ``id`` (the database id). Returns an empty list when the index
        page cannot be fetched.
        """
        tree = self._load_tree(self.baseUrl)
        if tree is None:
            return []
        values = []
        for node in tree.xpath(self.trdic[1]):
            for item in node.xpath('./td'):
                nexturl = item.xpath('./a/@href')
                if not nexturl:
                    # A cell without a link carries no province; skip it
                    # instead of inserting an empty-name row.
                    continue
                province = item.xpath('./a/text()')
                print(province)
                value = {
                    'url': self.base + "".join(nexturl),
                    'name': "".join(province),
                    'code': 0,
                    'pid': 0,
                    'level': 1,
                }
                print(repr(value['name']))
                value['id'] = self.insert_to_db(value)
                values.append(value)
                print(value)
        return values

    def parse(self, trid, pid, url):
        """Parse the child page at ``url`` and insert its rows.

        Args:
            trid: level whose rows are expected on the page (key of ``trdic``).
            pid: database id of the parent row.
            url: absolute URL of the page.

        Returns:
            ``None`` when ``url`` is blank, otherwise a list of dicts with the
            keys ``code``, ``url``, ``name``, ``pid``, ``level`` and ``id``.
            The list is empty when the page cannot be fetched. When a level-3
            page has no county rows and town rows are used instead, the
            returned rows carry ``level`` 4 so callers can tell the difference.
        """
        if url.strip() == '':
            return None
        tree = self._load_tree(url)
        if tree is None:
            return []
        level = trid
        nodes = tree.xpath(self.trdic.get(trid))
        if trid == 3 and len(nodes) == 0:
            # No county rows on this page: fall back to the town rows.
            nodes = tree.xpath(self.trdic.get(4))
            level = 4
            print('有镇的市：' + url)
        # Child links are relative to the directory of the current page.
        path = os.path.basename(url)
        base_url = url[:len(url) - len(path)]
        values = []
        for node in nodes:
            nexturl = "".join(node.xpath('./td[1]/a/@href'))
            # Cells are linked when a child page exists, plain text otherwise.
            code = node.xpath('./td[1]/a/text()') or node.xpath('./td[1]/text()')
            name = node.xpath('./td[2]/a/text()') or node.xpath('./td[2]/text()')
            value = {
                'code': "".join(code),
                'url': base_url + nexturl if nexturl else '',
                'name': "".join(name),
                'pid': pid,
                'level': level,
            }
            print(repr(value['name']))
            print(value['url'])
            value['id'] = self.insert_to_db(value)
            values.append(value)
            print(value)
        return values

    def parseVillager(self, trid, pid, url):
        """Parse a village page and insert its rows.

        Args:
            trid: level of the rows on the page (key of ``trdic``).
            pid: database id of the parent row.
            url: absolute URL of the page.

        Returns:
            ``None`` when ``url`` is blank, otherwise a list with one dict per
            row (empty when the page cannot be fetched).
        """
        if not url or url.strip() == '':
            return None
        tree = self._load_tree(url)
        if tree is None:
            return []
        values = []
        for node in tree.xpath(self.trdic.get(trid)):
            value = {
                'code': "".join(node.xpath('./td[1]/text()')),
                'url': "".join(node.xpath('./td[1]/a/@href')),
                'name': "".join(node.xpath('./td[3]/text()')),
                'pid': pid,
                'level': trid,
            }
            print(repr(value['name']))
            value['id'] = self.insert_to_db(value)
            # Append once only; the original appended every row twice.
            values.append(value)
            print(value)
        return values

    def insert_to_db(self, taobao):
        """Insert one division row and return its auto-increment id.

        ``taobao`` is a dict providing ``pid``, ``name``, ``level`` and
        ``code``. On any error the exception is printed, the transaction is
        rolled back and 0 is returned.
        """
        # Explicit column list: no dependence on column order, and the id is
        # left to AUTO_INCREMENT rather than relying on inserting 0.
        sql = ('INSERT INTO tab_citys '
               '(parent_id, city_name_zh, city_name_en, city_level, city_code) '
               'VALUES (%s, %s, %s, %s, %s)')
        param = (taobao.get("pid"), taobao.get("name"), '',
                 taobao.get("level"), taobao.get("code"))
        lastid = 0
        try:
            self.cur.execute(sql, param)
            lastid = self.cur.lastrowid
            self.conn.commit()
        except Exception as e:
            # Best effort: report the failure and carry on with the crawl.
            print(e)
            self.conn.rollback()
        return lastid

    def parseChineseCity(self):
        """Run the full crawl: provinces, cities, counties, towns, villages."""
        for province in self.parseProvince():
            for city in self.parse(2, province['id'], province['url']) or []:
                for county in self.parse(3, city['id'], city['url']) or []:
                    # Drop everything below this line if town and village
                    # data are not needed.
                    if county['level'] == 4:
                        # The city had no counties, so this row is already a
                        # town and its page lists villages.
                        self.parseVillager(5, county['id'], county['url'])
                        continue
                    for town in self.parse(4, county['id'], county['url']) or []:
                        self.parseVillager(5, town['id'], town['url'])


if __name__ == '__main__':
    # Use a separate name so the class is not shadowed by its instance.
    crawler = chinese_city()
    crawler.parseChineseCity()

python爬取国家统计局2019年行政区划分数据相关推荐

利用Python爬取全国250m精度的人口数据
此次以GeoQ(智图)为基础,利用Python爬取全国250m精度的人口数据(GeoQ)这个网站开放过250m精度的人口分布数据,而且人口分布有年龄分段等属性.先得注册登录到达创建地图的界面. 看人口 ...
利用Python爬取全国250m精度的人口数据（GeoQ）、房价数据和公交站（线路）等数据
前言文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: 中原百科 GIS大师兄 PS:如有需要Python学习资料的小伙伴可 ...
Python爬取王者荣耀英雄的皮肤数据并下载皮肤图片项目
Python爬取王者荣耀英雄的皮肤数据,并下载皮肤图片!高清的图片用来做桌面也不错哟~ 网址:https://pvp.qq.com/web201605/herolist.shtml 1.获得英雄信息, ...
利用python爬取丁香医生上新型肺炎数据，并下载到本地，附带经纬度信息
新增:国外疫情网站介绍已更新:爬取国外疫情数据已更新:新型肺炎历史数据下载 2020年3月27日补充: 制作了一个全球肺炎数据查询下载网站,效果如下: 访问地址:http://119.3.227. ...
python爬取电脑本地数据_利用python爬取丁香医生上新型肺炎数据，并下载到本地，附带经纬度信息...
原标题:利用python爬取丁香医生上新型肺炎数据,并下载到本地,附带经纬度信息新型肺炎肆虐全国,可以预知,最近一两年地理学中会有一部分论文研究新型肺炎的空间分布及与其他指标的关联分析.获取其患病人 ...
python爬取分析超级大乐透历史开奖数据
python爬取分析超级大乐透历史开奖数据博主作为爬虫初学者,本次使用了requests和beautifulsoup库进行数据的爬取爬取网站:http://datachart.500.com/dl ...
python爬取链家网的房屋数据
python爬取链家网的房屋数据爬取内容爬取源网站爬取内容爬取思路爬取的数据代码获取房屋url 获取房屋具体信息爬取内容爬取源网站北京二手房 https://bj.lianjia. ...
Python爬取南京地铁微博发布客流数据并进行分析
Python爬取南京地铁微博发布客流数据并进行分析之前在网上看到了分析北京地铁客流数据的开源项目,就想试着分析一下南京地铁的客流数据,可是找了很久没有找到可以获得南京地铁客流数据的接口,就去南京地铁 ...
python爬取微博评论（无重复数据）
python爬取微博评论(无重复数据) 前言一.整体思路二.获取微博地址 1.获取ajax地址2.解析页面中的微博地址3.获取指定用户微博地址三.获取主评论四.获取子评论 1.解析子评论2.获 ...

python爬取国家统计局2019年行政区划分数据

建立数据表：
创建tab_citys mysql 数据表

Python 脚本

python爬取国家统计局2019年行政区划分数据相关推荐

最新文章

热门文章

python爬取国家统计局2019年行政区划分数据

建立数据表： 创建tab_citys mysql 数据表

Python 脚本

python爬取国家统计局2019年行政区划分数据相关推荐

最新文章

热门文章

建立数据表：
创建tab_citys mysql 数据表