写在前面

使用python对【软科-高等教育评价领先品牌 https://www.shanghairanking.cn/rankings/bcur/2020】进行爬取，然后存到sql数据库中（也有excel的代码），并且进行分析，获得每个省份的大学的种类的分布图。
这个项目特别有纪念意义：它让我深刻认识到 SQL 语句的魅力、matplotlib 的重要性和 Echarts 的强大，也让我学会了如何通过修改官方示例得到自己想要的结果，并练习了 urllib 爬虫与 BeautifulSoup 网页解析，是一次很全面的锻炼。很多注释没有删掉，是自己边敲边试留下的结果。

爬虫与数据存储（excel&sql）

import bs4
from bs4 import BeautifulSoup  #网页解析，获取数据
import urllib #订制url，获取网页数据
import urllib.error #请求网页如果出错的时候的分析
import urllib.request #请求网页获取
import re
import os
import requests
import time
import numpy as np
import xlwt
import sqlite3


def main():
    """Entry point of the scraper: print the ranking URL and load the table.

    The URL is passed on to ``get_data``, which parses the ranking table and
    stores it in the SQLite database.
    """
    # Step 1: the page to scrape.
    baseurl = "https://www.shanghairanking.cn/rankings/bcur/2020"
    print(baseurl)
    # ask_URL(baseurl)  # custom helper that downloads the page (disabled)

    # Step 2: parse the table cells and persist them.
    get_data(baseurl)


# Fetch the content of a given URL; this is the most basic helper.
def ask_URL(url):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        with ``urllib.error.URLError`` (the error code/reason is printed).
    """
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    else:
        # Report success only when the download actually succeeded.
        print(html)
        print("访问成功")
    return html


def get_data(baseurl):
    """Parse the saved ranking page and load its rows into ``uranking.db``.

    Every ``tr > td`` cell of ``./大学.html`` is read, the cells are grouped
    six at a time (ranking, name, province, category, total score, part
    score) and the resulting rows are inserted into the ``uranking`` table.

    Args:
        baseurl: URL of the ranking page. Not used at the moment because the
            cells come from the local snapshot; kept so callers keep working.
    """
    # html = ask_URL(baseurl)  # live download (disabled)
    with open(r'./大学.html', "rb") as html:
        soup = BeautifulSoup(html, "html.parser")
        data_list = [
            item.get_text().replace("\r\n", "").strip()
            for item in soup.select('tr > td')
        ]
    print(len(data_list))
    print(data_list[0:6])

    # Group the flat cell list into complete six-cell rows; a trailing
    # incomplete group (if any) is ignored.
    usable = len(data_list) - len(data_list) % 6
    rows = [tuple(data_list[i:i + 6]) for i in range(0, usable, 6)]
    print("save...")

    # Optional Excel export (disabled). Uncomment to also write the workbook.
    # book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    # sheet = book.add_sheet("中国大学排名", cell_overwrite_ok=True)
    # col = ("排名", "学校名称", "省市", "类型", "总分", "办学得分")
    # for i, title in enumerate(col):
    #     sheet.write(0, i, title)
    # for r, row in enumerate(rows, start=1):
    #     print("正在打印第%d行" % r)
    #     for c, value in enumerate(row):
    #         sheet.write(r, c, value)
    # book.save("2020年中国大学排名.xls")  # the argument must be a file name or path string
    # print("save successfully by excel")

    # Create the database table.
    datapath = "uranking.db"
    intSQL(datapath)

    # Insert the rows. Placeholders let sqlite3 do the quoting, so text that
    # contains a double quote no longer breaks the statement, and the number
    # of rows is taken from the data instead of being hard-coded.
    sql = '''insert into uranking(ranking,uname,city,info,totalscore,partscore)
             values(?,?,?,?,?,?);'''
    conn = sqlite3.connect(datapath)
    try:
        conn.executemany(sql, rows)
        conn.commit()  # one commit for the whole batch
    finally:
        conn.close()
    print("save successfully by sql")


# Initialise the database.
def intSQL(datapath):
    """Create a fresh, empty ``uranking`` table in the database at *datapath*.

    An existing ``uranking`` table is dropped first, so running the script
    again reloads the data instead of failing because the table already
    exists.

    Args:
        datapath: Path of the SQLite database file.
    """
    sql = '''create table uranking (id integer primary key autoincrement,ranking numeric,uname varchar ,city varchar,info varchar,totalscore numeric,partscore numeric);'''
    conn = sqlite3.connect(datapath)
    try:
        cursor = conn.cursor()
        cursor.execute("drop table if exists uranking;")
        cursor.execute(sql)
        conn.commit()
    finally:
        # Close the connection even when one of the statements fails.
        conn.close()


if __name__ == "__main__":
    main()

SQL数据库的数据筛选与matplotlib作图

import sqlite3
import pyecharts
from pyecharts import Map
from pyecharts import option as opts
import pandas as pd
import numpy as np
from matplotlib.ticker import MultipleLocator
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
# Optional global font tweaks (disabled):
# mpl.rcParams["font.family"] = 'Arial'  # default font family
# mpl.rcParams["mathtext.fontset"] = 'cm'  # font set for math text
# mpl.rcParams["contour.negative_linestyle"] = 'dashed'  # line style of negative contours
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # used so that Chinese labels display properly
    'axes.unicode_minus': False,    # used so that the minus sign displays properly
})

# Relationship between university category and city count
# Data extraction: one (category, count) pair per university category,
# largest count first.
# NOTE: count(city) counts the rows of each category, so the value aliased
# "城市数量" is the number of ranked universities in that category.
con = sqlite3.connect("uranking.db")
try:
    cursor = con.cursor()
    sql = '''select info as "学校类别",count(city) as "城市数量" FROM uranking
group by info order by "城市数量" DESC;'''
    # Materialise the rows before the connection is closed.
    output = cursor.execute(sql).fetchall()
    cursor.close()
finally:
    # Close the connection even when the query fails.
    con.close()
info_major = [item[0] for item in output]
number_info = [item[1] for item in output]
print(info_major)
print(number_info)
print(len(info_major))
# Plotting: one bar per university category.
width = 0.55  # the width of the bars
x = np.arange(len(info_major))  # the label locations
fig, ax = plt.subplots()
ax.grid(alpha=0.5, linewidth=0.5)
bar_style = dict(
    label='城市数量',
    fc="green",
    zorder=5,
    hatch="//",
    edgecolor="y",
    linewidth=0.5,
    align="center",
)
rects1 = ax.bar(x, number_info, width, **bar_style)
# Axis label, title and the category names used as x tick labels.
ax.set(ylabel='城市数量', title='大学种类与城市数量的关系')
ax.set_xticks(x)
ax.set_xticklabels(info_major)
ax.legend()
# Write the value on top of every bar.
def autolabel(rects, axes=None):
    """Attach a text label above each bar in *rects*, displaying its height.

    Args:
        rects: Iterable of bar objects providing ``get_height()``,
            ``get_x()`` and ``get_width()``.
        axes: Object whose ``text()`` method draws the labels. Defaults to
            the module-level ``ax``, which is what the original relied on.
    """
    target = ax if axes is None else axes
    for rect in rects:
        height = rect.get_height()
        # Horizontal centre of the bar: left edge plus half of the width.
        centre = rect.get_x() + rect.get_width() / 2
        target.text(centre, height, str(height), ha='center', va='bottom', color="red", fontsize=15)
        # Alternative using annotate with a small vertical offset (disabled):
        # ax.annotate('{}'.format(height),
        #             xy=(rect.get_x() + rect.get_width() / 2, height),
        #             xytext=(0, 3),  # 3 points vertical offset
        #             textcoords="offset points",
        #             ha='center', va='center',)
# Label the bars, then fix the y range and add a minor tick every 10 units.
autolabel(rects1)
ax.set(ylim=(0, 250))
minor_ticks = MultipleLocator(10)
ax.yaxis.set_minor_locator(minor_ticks)
# ax.set_xticklabels(ax.get_xticklabels(), rotation=45)  # optional: slanted tick labels
fig.tight_layout()
# Save the figure to disk, then display it.
output_png = "大学种类与城市数量的关系.png"
plt.savefig(output_png, dpi=300)
plt.show()

Excel数据库的数据筛选与matplotlib作图

虽然很low，但是也锻炼了我的for循环的思维

import xlwt  #进行excel写入
import xlrd #进行excel读取
import numpy as np
from collections import Counter

# The five university categories that are counted for every province.
majortype = ['理工', '综合', '师范', '农业', '林业']

data = xlrd.open_workbook(r'../../2020年中国大学排名.xls')
# print(data.sheet_names())
table = data.sheet_by_name(u'中国大学排名')
# print(table)
cell_A1 = table.row(2)[2].value  # cell at row 2, column 2 (debug probe, not used below)
# print(cell_A1)

# Row 0 is the header and is skipped. The original scan matched a province
# cell and read the category from the cell to its right; with the column
# order of the Excel export (排名, 学校名称, 省市, 类型, 总分, 办学得分) that is
# column 2 for the province and column 3 for the category.
CITY_COL = 2
MAJOR_COL = 3
records = []
for j in range(1, table.nrows):
    row = table.row(j)
    records.append((row[CITY_COL].value, row[MAJOR_COL].value))

# Provinces found in the sheet, the one with the most universities first.
# This replaces `citytype = info_major`: `info_major` is not defined in this
# script, and in the SQL script it holds categories rather than provinces.
citytype = [city for city, _ in Counter(city for city, _ in records).most_common()]
# Earlier hard-coded list, kept for reference:
# citytype = ['江苏', '山东', '河南', '北京', '辽宁', '陕西', '四川', '广东', '河北', '湖北', '湖南', '安徽', '浙江',
#             '江西', '黑龙江', '上海', '吉林', '山西', '福建', '云南', '广西', '贵州',
#             '内蒙古', '甘肃', '天津', '重庆', '新疆', '海南', '宁夏', '西藏', '青海']

# Count every (province, category) pair in a single pass over the rows.
pair_counts = Counter(records)

# Flat, parallel lists: for each province, one entry per category in the
# order of `majortype`.
shengfen = []
zhuanye = []
daxueshuliang = []
for city in citytype:
    for major in majortype:
        # print(city, major)  # e.g. 江苏 理工
        shengfen.append(city)
        zhuanye.append(major)
        daxueshuliang.append(pair_counts[(city, major)])
# # print(list)
# # # print(zhuanye)
# # # print(daxueshuliang)
# Short aliases for the three parallel lists built by the counting step.
aa = shengfen
bb = zhuanye
cc = daxueshuliang
a1 = aa[0:len(aa):5]
a = aa[0:5]
b = bb[0:5]
c = cc[0:5]
print(a, b, c)

import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.sans-serif'] = ['SimHei']  # used so that Chinese labels display properly
plt.rcParams['axes.unicode_minus'] = False  # used so that the minus sign displays properly

# The lists hold five consecutive entries per province (one per category),
# so a stride of 5 picks one category's count for every province.
labels = aa[0:len(aa):5]
ligong = np.array(cc[0:len(cc):5])
zonghe = np.array(cc[1:len(cc):5])
shifan = np.array(cc[2:len(cc):5])
nongye = np.array(cc[3:len(cc):5])
linye = np.array(cc[4:len(cc):5])

fig, ax = plt.subplots()
ax.grid(alpha=0.5, linewidth=0.5, zorder=2)
# Stack the categories: each bar starts where the previous ones end.
ax.bar(labels, ligong, label='理工', zorder=5)
ax.bar(labels, zonghe, label='综合', zorder=5, bottom=ligong)
ax.bar(labels, shifan, label='师范', zorder=5, bottom=(ligong + zonghe))
ax.bar(labels, nongye, label='农业', zorder=5, bottom=(ligong + zonghe + shifan))
ax.bar(labels, linye, label='林业', zorder=5, bottom=(ligong + zonghe + shifan + nongye))

ax.set_ylabel('数量（所）')
# The number in the title is the number of provinces actually plotted
# (previously len(info_major), a name this script never defines).
chart_title = '%s个省市分别拥有的大学的分类' % len(labels)
ax.set_title(chart_title)
ax.legend()
# Province names are drawn vertically under the bars.
for label in ax.get_xticklabels():
    label.set_ha("center")
    label.set_rotation(90)
plt.savefig(chart_title, dpi=300)
plt.show()

Python爬虫，爬取2020年软科中国大学排名并进行数据清洗与可视化输出相关推荐

2020年软科中国大学排名
2020年软科中国大学排名
Python爬虫入门之2022软科中国大学排名爬取保存到csv文件
一.实验方案设计 1.获得"2022软科中国大学排名"数据,从[软科排名]2022年最新软科中国大学排名|中国最好大学排名网页中获得排名数据信息,并将数据保存到csv文件中. 2. ...
2021软科中国大学排名（Python抓取前200名）
2021软科中国大学排名(Python抓取) 由于软科官网使用动态渲染技术,将数据封装在js文件中,导致无法直接抓取所有的排名记录.用普通的请求方式只能抓取到前30条记录.虽然也可以通过自动化测试 ...
Python 抓取软科中国大学排名首页数据
文章目录利用requests.BeautifulSoup.xlwings库抓取软科中国大学排名首页数据 (1)软科中国大学排名 (2)调用requests模块中get方法,get方法包括header ...
2021，软科中国大学排名新鲜出炉！清华、北大、浙大位居前三！
"软科中国大学排名",前身是"软科中国最好大学排名", 自2015年首次发布以来,以专业.客观.透明的优势赢得了高等教育领域内外的广泛关注和高度认可,已经成为具 ...
你的学校排第几？软科中国大学排名发布
转载于软科 (2021年4月26日)高等教育评价专业机构软科今日正式发布"2021软科中国大学排名".清华大学.北京大学.浙江大学占据主榜(即综合性大学排名)前三位. 2021软 ...
2023软科中国大学排名（主榜TOP590）
图源:软科公众号排名指标 2023软科中国大学排名遵循"全面评估.分类评价"的原则,针对不同性质和不同类型大学的特点,采用差异化的指标体系进行排名. "软科中国大学排名 ...
重磅发布：2023软科中国大学排名，清华大学、北京大学、浙江大学位居前三！...
来源:软科 2023年3月30日,软科正式发布"2023软科中国大学排名".清华大学.北京大学.浙江大学蝉联主榜(即综合性大学排名)前三位.北京协和医学院位列医药类大学排名第一,上 ...
中国大学排名python爬虫_Python爬虫入门实例三之爬取软科中国大学排名
写在前面这个例子是笔者今天在中国大学MOOC(嵩天北京理工大学)上学习的时候写下来的.但是很快写完之后我就发现不对劲,首先课程给的例子是中国好大学网站的排名,但是现在这个网站已经重构了,原来的链接 ...
软科中国大学排名——计算机科学与技术专业
转载于软科高等教育专业评价机构软科今日正式发布"2023软科中国大学专业排名",排名包括787个专业,涉及93个专业类.12个专业门类.软科中国大学专业排名是迄今为止覆盖专业数 ...

Python爬虫，爬取2020年软科中国大学排名并进行数据清洗与可视化输出

写在前面

爬虫与数据存储（excel&sql）

SQL数据库的数据筛选与matplotlib作图

Excel数据库的数据筛选与matplotlib作图

Python爬虫，爬取2020年软科中国大学排名并进行数据清洗与可视化输出相关推荐

最新文章

热门文章