优达学城Numpy与Pandas笔记
此篇为优达学城“数据分析入门”课程第二课的学习笔记(原文此处的课程网址链接未保留)。
基本操作
import numpy as np

# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina',
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
])

# Change False to True for each block of code to see what it does

# Accessing elements
if True:
    print(countries[0])
    print(countries[3])

# Slicing
if False:
    print(countries[0:3])
    print(countries[:3])
    print(countries[17:])
    print(countries[:])

# Element types
if False:
    print(countries.dtype)
    print(employment.dtype)
    print(np.array([0, 1, 2, 3]).dtype)
    print(np.array([1.0, 1.5, 2.0, 2.5]).dtype)
    print(np.array([True, False, True]).dtype)
    print(np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype)

# Looping
if False:
    for country in countries:
        print('Examining country {}'.format(country))

    # Walk both arrays in lockstep instead of indexing with range(len(...)).
    for country, country_employment in zip(countries, employment):
        print('Country {} has employment {}'.format(country,
                                                    country_employment))

# Numpy functions
if False:
    print(employment.mean())
    print(employment.std())
    print(employment.max())
    print(employment.sum())


def max_employment(countries, employment):
    """Return the country with the highest employment and that value.

    Args:
        countries: NumPy array of country names.
        employment: NumPy array of employment values, in the same order
            as ``countries``.

    Returns:
        A ``(max_country, max_value)`` tuple. When several countries tie
        for the maximum, the first one is returned (``argmax`` semantics).
    """
    # argmax gives the position of the largest value; the two arrays are
    # parallel, so the same position selects the matching country name.
    i = employment.argmax()
    max_country = countries[i]
    max_value = employment[i]
    return (max_country, max_value)
Afghanistan
Angola
运算
import numpy as np

# Change False to True for each block of code to see what it does

# Arithmetic operations between 2 NumPy arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([1, 2, 1, 2])

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Arithmetic operations between a NumPy array and a single number
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Logical operations with NumPy arrays
if False:
    a = np.array([True, True, False, False])
    b = np.array([True, False, True, False])

    print(a & b)
    print(a | b)
    print(~a)

    print(a & True)
    print(a & False)

    print(a | True)
    print(a | False)

# Comparison operations between 2 NumPy Arrays
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([5, 4, 3, 2, 1])

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# Comparison operations between a NumPy array and a single number
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# First 20 countries with school completion data
countries = np.array([
    'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria', 'Azerbaijan',
    'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia',
    'Botswana', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi',
    'Cambodia', 'Cameroon', 'Cape Verde',
])

# Female school completion rate in 2007 for those 20 countries
female_completion = np.array([
    97.35583, 104.62379, 103.02998, 95.14321, 103.69019,
    98.49185, 100.88828, 95.43974, 92.11484, 91.54804,
    95.98029, 98.22902, 96.12179, 119.28105, 97.84627,
    29.07386, 38.41644, 90.70509, 51.7478, 95.45072,
])

# Male school completion rate in 2007 for those 20 countries
male_completion = np.array([
    95.47622, 100.66476, 99.7926, 91.48936, 103.22096,
    97.80458, 103.81398, 88.11736, 93.55611, 87.76347,
    102.45714, 98.73953, 92.22388, 115.3892, 98.70502,
    37.00692, 45.39401, 91.22084, 62.42028, 90.66958,
])


def overall_completion_rate(female_completion, male_completion):
    """Return the overall school completion rate for each country.

    Args:
        female_completion: NumPy array of female completion rates.
        male_completion: NumPy array of male completion rates, in the same
            country order as ``female_completion``.

    Returns:
        A NumPy array with the element-wise mean of the two rates.
    """
    # A plain sum of two percentages is not a rate (it lands near 200), so
    # average them. NOTE(review): this is an unweighted mean, i.e. it
    # assumes equal female and male populations - confirm if weights exist.
    return (female_completion + male_completion) / 2.0


overall_completion = overall_completion_rate(female_completion,
                                             male_completion)
print(overall_completion)
[ 192.83205 205.28855 202.82258 186.63257 206.91115 196.29643204.70226 183.5571 185.67095 179.31151 198.43743 196.96855188.34567 234.67025 196.55129 66.08078 83.81045 181.92593114.16808 186.1203 ]
归一化
import numpy as np

# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina',
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
])

# Change this country name to change what country will be printed when you
# click "Test Run". Your function will be called to determine the standardized
# score for this country for each of the given 5 Gapminder variables in 2007.
# The possible country names are available in the Downloadables section.
country_name = 'United States'


def standardize_data(values):
    """Return ``values`` expressed as standard deviations from the mean.

    Args:
        values: NumPy array of numbers.

    Returns:
        A NumPy array of the same shape. Positive entries are above the
        mean and negative entries are below it.
    """
    # Vectorized z-score. For a NumPy array, .std() is the population
    # standard deviation (ddof=0); a constant input therefore divides by 0.
    return (values - values.mean()) / values.std()


# In a notebook the value of this last expression is displayed as the
# cell output; as a plain script it is evaluated and discarded.
standardize_data(employment)
array([-0.31965231, -0.780123 , -0.87650077, 1.82207181, -0.03051941,-1.99019768, 0.30144772, -0.16973184, 0.23719615, 0.84758731,0.18365304, 1.00821665, 0.87971351, -0.56595055, -1.07996476,-0.20185762, 1.38301845, -0.03051941, 1.2545153 , -1.87240259])
import numpy as np

# Change False to True for each block of code to see what it does

# Using index arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([True, True, False, False])

    print(a[b])
    print(a[np.array([True, False, True, False])])

# Creating the index array using vectorized operations
if False:
    a = np.array([1, 2, 3, 2, 1])
    b = (a >= 2)

    print(a[b])
    print(a[a >= 2])

# Creating the index array using vectorized operations on another array
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([1, 2, 3, 2, 1])

    print(b == 2)
    print(a[b == 2])


def mean_time_for_paid_students(time_spent, days_to_cancel):
    """Return the mean classroom time of students enrolled for 7+ days.

    Args:
        time_spent: NumPy array with the time each student spent in the
            classroom.
        days_to_cancel: NumPy array with the number of days until each
            student cancelled, in the same student order as ``time_spent``.

    Returns:
        The mean of ``time_spent`` over students whose ``days_to_cancel``
        is greater than or equal to 7.
    """
    # The boolean mask built from days_to_cancel selects the matching
    # positions in time_spent (the arrays are parallel).
    paid_time_spent = time_spent[days_to_cancel >= 7]
    return paid_time_spent.mean()


# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
    12.89697233, 0., 64.55043217, 0.,
    24.2315615, 39.991625, 0., 0.,
    147.20683783, 0., 0., 0.,
    45.18261617, 157.60454283, 133.2434615, 52.85000767,
    0., 54.9204785, 26.78142417, 0.,
])

# Days to cancel for 20 students
days_to_cancel = np.array([
    4, 5, 37, 3, 12, 4, 35, 38, 5, 37, 3, 3, 68,
    38, 98, 2, 249, 2, 127, 35,
])

# In a notebook the value of this last expression is displayed as the
# cell output; as a plain script it is evaluated and discarded.
mean_time_for_paid_students(time_spent, days_to_cancel)
41.054003485454537
注意:NumPy 中的 `+=` 是原地(in-place)运算,会直接修改原数组;切片返回的是原数组的视图(view)而不是副本,修改切片同样会影响原数组。二者都可以类比指针/引用来理解,需要独立副本时请使用 `.copy()`。
Pandas Series
import pandas as pd

countries = [
    'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
    'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
    'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
    'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
]

life_expectancy_values = [
    74.7, 75., 83.4, 57.6, 74.6, 75.4, 72.3, 81.5, 80.2,
    70.3, 72.1, 76.4, 68.1, 75.2, 69.8, 79.4, 70.8, 62.7,
    67.3, 70.6,
]

gdp_values = [
    1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
    13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
    27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
    483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
    3680.91642923, 366.04496652, 1175.92638695, 1132.21387981,
]

# Life expectancy and gdp data in 2007 for 20 countries
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)

# Change False to True for each block of code to see what it does

# Accessing elements and slicing
if False:
    print(life_expectancy[0])
    print(gdp[3:6])

# Looping
if False:
    for country_life_expectancy in life_expectancy:
        print('Examining life expectancy {}'.format(country_life_expectancy))

# Pandas functions
if False:
    print(life_expectancy.mean())
    print(life_expectancy.std())
    print(gdp.max())
    print(gdp.sum())

# Vectorized operations and index arrays
if False:
    a = pd.Series([1, 2, 3, 4])
    b = pd.Series([1, 2, 1, 2])

    print(a + b)
    print(a * 2)
    print(a >= 3)
    print(a[a >= 3])


def variable_correlation(life_expectancy, gdp):
    """Count data points where two variables deviate from their means alike.

    "Direction" is whether a value is above its own series mean. A value
    equal to the mean is classified as "not above".

    For example, ``pd.Series([1, 2, 3, 4])`` and ``pd.Series([4, 5, 6, 7])``
    give ``(4, 0)``: every pair sits on the same side of its mean. With
    ``pd.Series([7, 6, 5, 4])`` as the second input the result is ``(0, 4)``.

    Args:
        life_expectancy: Pandas Series holding the first variable.
        gdp: Pandas Series holding the second variable, with the same index.

    Returns:
        A ``(num_same_direction, num_different_direction)`` tuple.
    """
    first_above_mean = life_expectancy > life_expectancy.mean()
    second_above_mean = gdp > gdp.mean()

    # True where both values lie on the same side of their respective means.
    same_direction = (first_above_mean == second_above_mean)

    # Summing a boolean Series counts its True entries.
    num_same_direction = same_direction.sum()
    num_different_direction = (~same_direction).sum()
    return (num_same_direction, num_different_direction)


# In a notebook the value of this last expression is displayed as the
# cell output; as a plain script it is evaluated and discarded.
variable_correlation(life_expectancy, gdp)
(17, 3)
带索引的pandas
import pandas as pd

countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola',
    'Argentina', 'Armenia', 'Australia', 'Austria',
    'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
    'Barbados', 'Belarus', 'Belgium', 'Belize',
    'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
]

employment_values = [
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
]

# Employment data in 2007 for 20 countries
employment = pd.Series(employment_values, index=countries)


def max_employment(employment):
    """Return the country with the highest employment and that value.

    Args:
        employment: Pandas Series whose values are employment figures and
            whose index holds the country names.

    Returns:
        A ``(max_country, max_value)`` tuple.
    """
    # idxmax() returns the index *label* of the maximum. argmax() returns an
    # integer position in current pandas, which .loc cannot resolve against
    # a string index (KeyError).
    max_country = employment.idxmax()
    max_value = employment.loc[max_country]
    return (max_country, max_value)


print(max_employment(employment))
('Angola', 75.699996949999999)
import pandas as pd

# Change False to True for each block of code to see what it does

# Addition when indexes are the same
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

# Indexes have same elements in a different order
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
    print(s1 + s2)

# Indexes overlap, but do not have exactly the same elements
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
    print(s1 + s2)

    # Named `total` rather than `sum` so the builtin sum() is not shadowed.
    total = s1 + s2
    # Labels present in only one Series yield NaN; dropna() removes them.
    print(total.dropna())

# Indexes do not overlap
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
    print(s1 + s2)

    # fill_value=0 treats a label missing from one Series as 0 instead of NaN.
    print(s1.add(s2, fill_value=0))
a 11
b 22
c 33
d 44
dtype: int64
a 31
b 12
c 43
d 24
dtype: int64
a NaN
b NaN
c 13.0
d 24.0
e NaN
f NaN
dtype: float64
c 13.0
d 24.0
dtype: float64
a NaN
b NaN
c NaN
d NaN
e NaN
f NaN
g NaN
h NaN
dtype: float64
a 1.0
b 2.0
c 3.0
d 4.0
e 10.0
f 20.0
g 30.0
h 40.0
dtype: float64
import pandas as pd

# Change False to True to see what the following block of code does

# Example pandas apply() usage (although this could have been done
# without apply() using vectorized operations)
if False:
    s = pd.Series([1, 2, 3, 4, 5])

    def add_one(x):
        return x + 1

    print(s.apply(add_one))

names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane',
])


def reverse_name(name):
    """Convert one "Firstname Lastname" string to "Lastname, Firstname".

    Args:
        name: A string made of a first and a last name separated by a
            single space.

    Returns:
        The name with the last name first, separated by ", ".

    Raises:
        IndexError: If ``name`` contains no space.
    """
    split_name = name.split(" ")
    first_name = split_name[0]
    last_name = split_name[1]
    # Last name goes first, per the "Lastname, Firstname" target format.
    return "{}, {}".format(last_name, first_name)


def reverse_names(names):
    """Return a new Series with every name as "Lastname, Firstname".

    Args:
        names: Pandas Series of "Firstname Lastname" strings.

    Returns:
        A Pandas Series with the same index and each name reversed.
    """
    # apply() calls reverse_name once per element; no explicit loop needed.
    return names.apply(reverse_name)


print(reverse_names(names))
0 Andre,Agassi
1 Barry,Bonds
2 Christopher,Columbus
3 Daniel,Defoe
4 Emilio,Estevez
5 Fred,Flintstone
6 Greta,Garbo
7 Humbert,Humbert
8 Ivan,Ilych
9 James,Joyce
10 Keira,Knightley
11 Lois,Lane
12 Mike,Myers
13 Nick,Nolte
14 Ozzy,Osbourne
15 Pablo,Picasso
16 Quirinus,Quirrell
17 Rachael,Ray
18 Susan,Sarandon
19 Tina,Turner
20 Ugueth,Urbina
21 Vince,Vaughn
22 Woodrow,Wilson
23 Yoji,Yamada
24 Zinedine,Zidane
dtype: object
import pandas as pd
import matplotlib.pyplot as plt
# Load the employment table from a CSV file in the working directory, using
# its 'Country' column as the row index.
employment = pd.read_csv("employment-above-15.csv", index_col='Country')
# Select the row labelled 'United States'.
employment_us = employment.loc['United States']
# Plotting was left disabled in the original notes:
# %pylab inline  (IPython magic; only valid inside a notebook)
# employment_us.plt()
# NOTE(review): pandas objects expose .plot(), not .plt(); presumably
# employment_us.plot() was intended - confirm before enabling.
优达学城Numpy与Pandas笔记相关推荐
- 【自动驾驶技术】优达学城无人驾驶工程师学习笔记(七)——计算机视觉基础
计算机视觉基础目录 前言 颜色选择(Color Selection) 理论基础 代码实践 区域筛选(Region Masking) 理论基础 代码实践 Canny边缘检测 问题背景 Canny边缘检测 ...
- 【自动驾驶技术】优达学城无人驾驶工程师学习笔记(六)——Github与Markdown相关教程
Github与Markdown相关教程 本博文为笔者关于优达学城无人驾驶工程师课程中计算机视觉基础部分的学习笔记,该部分为实现车道线图像识别功能的基础课程,关于本课程的详细说明请参考优达学城官网. 优 ...
- 优达学城-深度学习笔记(一)
优达学城-深度学习笔记(一) 标签: 机器学习 优达学城-深度学习笔记一 一 神经网络简介 最大似然概率 交叉熵Cross entropy 1交叉熵代码实现 2多类别交叉熵 对数几率回归的误差函数co ...
- 优达学城《无人驾驶入门》学习笔记——卡尔曼滤波器实现详解
优达学城<无人驾驶入门>的第二个项目是实现矩阵类,要求通过python编写一个用来计算矩阵的类Matrix.编写这个类并不难,涉及到的线性代数方面的知识也不多,比如矩阵的加法.减法.乘法, ...
- 【多传感融合】优达学城多传感融合学习笔记(二)——将激光雷达点云俯视图映射到二维图像
将激光雷达点云俯视图映射到二维图像 目录 将激光雷达点云俯视图映射到二维图像 简介 实现方法 参考代码 简介 本节讲解如何将激光雷达点云俯视图(仅考虑水平坐标)映射到二维图像中,其中涉及到激光雷达点云 ...
- 优达twitter 清理_优达学城的学习感想及优惠
2017年12月因一次偶然的机会,看到优达学城的课程.当时通过优惠码:0C637434报名能有优惠(你们也可以使用拿优惠哦).就决定参加了优达学成的数据分析入门课程. 数据分析入门课程学习的内容有四大 ...
- 优达学城_数据清洗_项目三wrangle_act
下面是我优达学城项目三的记录报告 里面的思路和文字说明大多都在代码块里面的注释中,#后面?,可能不太容易看,需要认真看.? #导入可能需要的包 import os import numpy as np ...
- 优达学城《DeepLearning》大纲和学习愿景
目的: 查漏补缺深度学习体系,精益求精的深刻理解体系中各种核心的技术原理,感受技术思想背后的精髓魅力,做到能够脱口而出. 计划: 2021年5月.6月,完成课程中所有核心知识的深刻理解(通过撰写博客, ...
- 优达学城 深度学习 任务1
这几天刚好有环境,打算学习一下深度学习 看了一圈介绍,发现优达学城的深度学习课程作为入门课程还是不错的 今天看了第一章节的视频,顺便做了任务1 任务1难度不大,按照网站上的说明可以完成下载.打包等工作 ...
最新文章
- HPE品牌存储为啥高调? 因为“王炸”多啊!
- Linux下的格式化字符串漏洞利用姿势
- pywinauto 同花顺_东方财富、同花顺、恒生电子,到底谁是互联网金融老大?
- 春节添彩 福州花卉市场现“买花潮”
- GitHub热门教程:100天搞定机器学习(中文版)
- vue写进html,vue中html页面写入$t(‘’)怎么显示
- BZOJ3738 : [Ontak2013]Kapitał
- Angular set函数和Component属性的命名冲突问题
- 【转】VS编译环境命令窗口中的命令
- java restclient 调用_restclient 访问 springmvc java工程接口
- 怎么让jsp中的按钮置灰不能使用_拆解按钮规范
- MTK刷机(ubuntu下)
- jsp余jspx的区别
- 国内热门ERP软件有哪些推荐?
- Rufus轻松创建USB启动盘
- 2011广告联盟排名,最好的广告联盟推荐
- 显示一个立方体的斜二测图(用数组存放正方体的各顶点坐标)
- 【Linux 性能优化】利用perf和CPU使用率定位异常函数
- UPnP与Jini面对面
- Map中的keySet方法