

  • Step 1: 数据获取
  • Step 2: 地址解析
  • Step 3: 地图可视化


Step 1: 数据爬取


from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time# define a function of crawing html
def geturl(url, timeout=30):
    """Download *url* with an HTTP GET and return the body as text.

    The response encoding is replaced by the one requests detects from the
    body itself (``apparent_encoding``) before the text is decoded.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    r = requests.get(url, timeout=timeout)
    r.encoding = r.apparent_encoding
    return r.text


# data crawling from the public notice
url = 'https://mp.weixin.qq.com/s/MkKsQkgvUWbwj8z9jG_Zng'
text = geturl(url)  # get info from original link

# Save the raw page as a new file. This copy is not read again by the
# script, so it is written as UTF-8 regardless of the platform default.
with open("shanghai0404.txt", 'w', encoding='utf-8') as raw_file:
    raw_file.write(text)

# parsing
soup = BeautifulSoup(text, "html.parser")
info = soup.find('div', 'rich_media_content')
x = str(info)

# get the infection data as a list
# NOTE(review): both patterns below contain ASCII commas; presumably the
# notice itself uses full-width punctuation - confirm against the page.
addresses = re.findall(
    r'</span></p><p>.*?<span style="font-size: 16px;">([^已<2022].+?)?[,。、<]+?',
    x,
)
# The capture group is optional, so findall() yields '' wherever it did not
# take part in the match; those entries are not addresses and are dropped.
addresses = [addr for addr in addresses if addr]

# Save the addresses one per line. The platform default encoding is kept
# on purpose: the geocoding step reads this file back with the default too.
with open("shanghai0404_addresses.txt", 'w') as address_file:
    # '\n' instead of a bare '\r' so that ordinary tools see one address per
    # line; readlines() treats both as line breaks.
    address_file.writelines(addr + '\n' for addr in addresses)

# get the summary data of all districts
districts = re.findall(
    r'2022年4月4日,(.+?区)[无]?新增([\d]+?)?.*?确诊病例,新增([\d]+?)例.*?无症状感染者',
    x,
)
# One row per district: (district name, confirmed count, asymptomatic count),
# all kept as the strings captured by the pattern.
li = [list(item) for item in districts]
df = pd.DataFrame(li, columns=['districts', 'positive', 'asymptomatic'])



'八灶村郭家宅', '八灶村周家宅', '白杨路1065弄', '板泉路1201弄', '板泉路2000弄', '板泉路2101弄', '板泉路25号', '北艾路1077弄', '北艾路1200弄', '北艾路1500弄', '北艾路155弄', '北艾路1660弄', '北艾路1765弄', '北蔡大街31弄', '北新园路460弄', '北洋泾路601号', '北园路258弄', '北张家浜路68号', '北中路175弄', '北中路280弄', '北中路480弄', '北庄村', '碧云路1198号', '碧云路86弄', '博成路周家渡路交界口工地生活区', ......



有了上面的地址信息,要想在地图中展现出来,还需要有经纬度信息才行,因此需要进行地址解析(geocoding)。这里用到的是 Google 官方的 Geocoding API,免费,但是有一定限制;由于我们要发送 3000+ 条请求,需要设置 time.sleep(),避免请求过于频繁导致失败。


参考文章:《地址解析:使用Google API将地址文本转换为经纬度》(Liagogo的博客,CSDN博客)。该文提供 Geocoding API 的基础使用指南,包括:设置谷歌云平台账号及项目、基础地址请求代码、返回值解析及信息提取。链接:https://blog.csdn.net/liatan/article/details/123973890?spm=1001.2014.3001.5502


也可以用其它API或者直接用geocoder 等现成的包,下面只是就Google API提供一个参考。


from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import urllib
import json
import time# define a function of crawing json from google api
def geturl(url, timeout=30):
    """Download *url* with an HTTP GET and return the body as text.

    Used here to fetch the JSON answer of the Google Geocoding API. The
    response encoding is replaced by the one requests detects from the body
    itself (``apparent_encoding``) before the text is decoded.

    Args:
        url: Full request URL, including the query string.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    r = requests.get(url, timeout=timeout)
    r.encoding = r.apparent_encoding
    return r.text


# define a function of parsing json from google api
def getinfo(js):
    """Extract the address and coordinates of the first geocoding result.

    Args:
        js: The geocoding response, either already parsed into a dict or as
            the raw JSON text. (The argument itself is used; the function no
            longer re-parses a global variable.)

    Returns:
        A new list ``[formatted_address, lat, lng]`` taken from
        ``js['results'][0]``.

    Raises:
        KeyError, IndexError: If the response has no results or lacks the
            expected fields.
    """
    if isinstance(js, (str, bytes, bytearray)):
        js = json.loads(js)
    first = js['results'][0]
    location = first['geometry']['location']
    return [first['formatted_address'], location['lat'], location['lng']]


key = 'ENTER YOUR KEY HERE'  # replace with your own key copied from google cloud platform

# Load the addresses written by the crawling step, one per line. The default
# encoding is kept so it matches the one the file was written with.
with open("shanghai0404_addresses.txt", 'r') as address_file:
    addresses = address_file.readlines()
# `import urllib` alone does not guarantee that the `parse` submodule is
# loaded, so it is imported explicitly before urlencode() is used below.
import urllib.parse

# Quick sanity check of the input; skipped when the address file is empty.
if addresses:
    print(addresses[0].strip())


def _fetch_with_retry(url, wait=7):
    """Return the body of *url*; retry once after *wait* seconds, '' on failure."""
    try:
        return geturl(url)
    except requests.RequestException:
        time.sleep(wait)
    try:
        return geturl(url)
    except requests.RequestException:
        return ''


data = []
for location in addresses:
    addr = '上海市' + location.strip()
    url = 'https://maps.googleapis.com/maps/api/geocode/json?{}&key={}'.format(
        urllib.parse.urlencode({'address': addr}), key)
    text = _fetch_with_retry(url)

    try:
        js = json.loads(text)
    except ValueError:  # empty or non-JSON body
        js = None

    # Every address yields exactly one row; failed lookups get empty fields.
    z = ['', '', '']
    if isinstance(js, dict) and js.get('status') == 'OK':
        try:
            z = getinfo(js)
        except (KeyError, IndexError):
            # "OK" status but an unexpected payload: record it as a failed
            # lookup instead of aborting the whole run.
            z = ['', '', '']
    z.append(addr)
    data.append(z)

df = pd.DataFrame(data, columns=['f_location', 'latitude', 'longitude', 'location'])
# NOTE(review): the visualisation step loads 'shanghai2022_04_04.csv', but df
# is not written to disk here - presumably it is exported separately; confirm.


Step 3: 地图可视化


from pyecharts.charts import Bar
from pyecharts import options as opts
from pyecharts.globals import ThemeType
import pandas as pd
import matplotlib.pyplot as plt
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

# In a Jupyter notebook, run the magic `%matplotlib inline` before this cell.
# It is IPython syntax, not valid Python, so it is kept here only as a comment.

# -------------------------------------
# visualize the infectors locations data

# first, import the .shp of Shanghai
# import the map
shanghai_map = gpd.read_file('./stanford-dv960kb3448-shapefile/shanghai1.shp')
# plot the empty map
fig, ax = plt.subplots(figsize=(15, 15))
shanghai_map.plot(ax=ax, color='grey')

# read the data
df = pd.read_csv('shanghai2022_04_04.csv')
# Addresses that could not be geocoded have no coordinates; they cannot be
# placed on the map, so they are left out before the points are built.
df = df.dropna(subset=['longitude', 'latitude'])
# WGS84 longitude/latitude. The authority string replaces the deprecated
# {'init': 'epsg:4326'} dictionary form.
crs = 'EPSG:4326'
# create points to be shown (x = longitude, y = latitude)
geo = [Point(xy) for xy in zip(df['longitude'], df['latitude'])]
# gen geodataframe
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geo)

# mark on the map: a second figure with the cases drawn over the base map
fig, ax = plt.subplots(figsize=(15, 15))
shanghai_map.plot(ax=ax, color='lightgray')
geo_df.plot(ax=ax, markersize=60, color='darkred', alpha=0.2, marker='o')

# Needed to display the figures when this is run as a plain script.
plt.show()


完整代码见 GitHub 仓库 Liagogo/Geocoding-and-Visualizing-covid-cases-Shanghai-2022spring: Data crawling, geocoding and visualization of the covid cases in the new infecting wave in Shanghai on April 2022 with the official public data. 链接:https://github.com/Liagogo/Geocoding-and-Visualizing-covid-cases-Shanghai-2022spring





