优达学城Numpy与Pandas笔记

此篇为优达学城数据分析入门第二课笔记网址

基本操作

import numpy as np

# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina',
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
])

# Change False to True for each block of code to see what it does

# Accessing elements
if True:
    print(countries[0])
    print(countries[3])

# Slicing
if False:
    print(countries[0:3])
    print(countries[:3])
    print(countries[17:])
    print(countries[:])

# Element types
if False:
    print(countries.dtype)
    print(employment.dtype)
    print(np.array([0, 1, 2, 3]).dtype)
    print(np.array([1.0, 1.5, 2.0, 2.5]).dtype)
    print(np.array([True, False, True]).dtype)
    print(np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype)

# Looping
if False:
    for country in countries:
        print('Examining country {}'.format(country))

    # Index-based loop, kept to show that both arrays share the same order.
    for i in range(len(countries)):
        country = countries[i]
        country_employment = employment[i]
        print('Country {} has employment {}'.format(country,
                                                    country_employment))

# Numpy functions
if False:
    print(employment.mean())
    print(employment.std())
    print(employment.max())
    print(employment.sum())


def max_employment(countries, employment):
    """Return the country with the highest employment and that value.

    Args:
        countries: NumPy array of country names.
        employment: NumPy array of employment figures, given in the same
            order as ``countries``.

    Returns:
        A ``(max_country, max_value)`` tuple. ``argmax`` picks the first
        position holding the maximum, so ties resolve to the earliest entry.
    """
    i = employment.argmax()
    return (countries[i], employment[i])

Afghanistan
Angola

运算

import numpy as np

# Change False to True for each block of code to see what it does

# Arithmetic operations between 2 NumPy arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([1, 2, 1, 2])

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Arithmetic operations between a NumPy array and a single number
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Logical operations with NumPy arrays
if False:
    a = np.array([True, True, False, False])
    b = np.array([True, False, True, False])

    print(a & b)
    print(a | b)
    print(~a)

    print(a & True)
    print(a & False)

    print(a | True)
    print(a | False)

# Comparison operations between 2 NumPy Arrays
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([5, 4, 3, 2, 1])

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# Comparison operations between a NumPy array and a single number
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# First 20 countries with school completion data
countries = np.array([
    'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria', 'Azerbaijan',
    'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia',
    'Botswana', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi',
    'Cambodia', 'Cameroon', 'Cape Verde',
])

# Female school completion rate in 2007 for those 20 countries
female_completion = np.array([
    97.35583, 104.62379, 103.02998, 95.14321, 103.69019,
    98.49185, 100.88828, 95.43974, 92.11484, 91.54804,
    95.98029, 98.22902, 96.12179, 119.28105, 97.84627,
    29.07386, 38.41644, 90.70509, 51.7478, 95.45072,
])

# Male school completion rate in 2007 for those 20 countries
male_completion = np.array([
    95.47622, 100.66476, 99.7926, 91.48936, 103.22096,
    97.80458, 103.81398, 88.11736, 93.55611, 87.76347,
    102.45714, 98.73953, 92.22388, 115.3892, 98.70502,
    37.00692, 45.39401, 91.22084, 62.42028, 90.66958,
])


def overall_completion_rate(female_completion, male_completion):
    """Combine the female and male completion figures for each country.

    Args:
        female_completion: NumPy array of female completion rates.
        male_completion: NumPy array of male completion rates, in the same
            country order as ``female_completion``.

    Returns:
        A NumPy array holding the element-wise sum of the two inputs.
    """
    # NOTE(review): this is the element-wise *sum* of the two rates, so the
    # result is on a 0-200 scale. If an actual rate is wanted, the usual
    # choice would be the mean, (female + male) / 2 -- confirm the intent.
    return female_completion + male_completion


sum_completion = overall_completion_rate(female_completion, male_completion)
print(sum_completion)

[ 192.83205  205.28855  202.82258  186.63257  206.91115  196.29643
  204.70226  183.5571   185.67095  179.31151  198.43743  196.96855
  188.34567  234.67025  196.55129   66.08078   83.81045  181.92593
  114.16808  186.1203 ]

归一化

import numpy as np

# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina',
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
])

# Change this country name to change what country will be printed when you
# click "Test Run". Your function will be called to determine the standardized
# score for this country for each of the given 5 Gapminder variables in 2007.
# The possible country names are available in the Downloadables section.
country_name = 'United States'


def standardize_data(values):
    """Return ``values`` expressed in standard deviations from their mean.

    Args:
        values: NumPy array of numbers.

    Returns:
        A NumPy array of the same shape in which each entry is
        ``(value - mean) / std``. Positive entries lie above the mean and
        negative entries below it. ``std`` is NumPy's default (population)
        standard deviation.
    """
    centered = values - values.mean()
    return centered / values.std()


# Last expression of the cell: a notebook displays its repr; as a plain
# script this line has no visible effect.
standardize_data(employment)

array([-0.31965231, -0.780123  , -0.87650077,  1.82207181, -0.03051941,
       -1.99019768,  0.30144772, -0.16973184,  0.23719615,  0.84758731,
        0.18365304,  1.00821665,  0.87971351, -0.56595055, -1.07996476,
       -0.20185762,  1.38301845, -0.03051941,  1.2545153 , -1.87240259])

import numpy as np

# Change False to True for each block of code to see what it does

# Using index arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([True, True, False, False])

    print(a[b])
    print(a[np.array([True, False, True, False])])

# Creating the index array using vectorized operations
if False:
    a = np.array([1, 2, 3, 2, 1])
    b = (a >= 2)

    print(a[b])
    print(a[a >= 2])

# Creating the index array using vectorized operations on another array
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([1, 2, 3, 2, 1])

    print(b == 2)
    print(a[b == 2])


def mean_time_for_paid_students(time_spent, days_to_cancel):
    """Return the mean classroom time of students enrolled at least 7 days.

    Args:
        time_spent: NumPy array with the time each student spent in the
            classroom.
        days_to_cancel: NumPy array with the number of days until each
            student cancelled, in the same student order as ``time_spent``.

    Returns:
        The mean of the ``time_spent`` entries whose matching
        ``days_to_cancel`` entry is greater than or equal to 7.
    """
    # Boolean mask selects the students who stayed for 7 days or more.
    paid_mask = days_to_cancel >= 7
    return time_spent[paid_mask].mean()


# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
    12.89697233, 0., 64.55043217, 0.,
    24.2315615, 39.991625, 0., 0.,
    147.20683783, 0., 0., 0.,
    45.18261617, 157.60454283, 133.2434615, 52.85000767,
    0., 54.9204785, 26.78142417, 0.,
])

# Days to cancel for 20 students
days_to_cancel = np.array([
    4, 5, 37, 3, 12, 4, 35, 38, 5, 37, 3, 3, 68,
    38, 98, 2, 249, 2, 127, 35,
])

# Last expression of the cell: a notebook displays its value.
mean_time_for_paid_students(time_spent, days_to_cancel)

41.054003485454537

numpy的+=与切片需要注意，类似指针去理解

Pandas Series

import pandas as pd

countries = [
    'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
    'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
    'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
    'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
]

life_expectancy_values = [
    74.7, 75., 83.4, 57.6, 74.6, 75.4, 72.3, 81.5, 80.2,
    70.3, 72.1, 76.4, 68.1, 75.2, 69.8, 79.4, 70.8, 62.7,
    67.3, 70.6,
]

gdp_values = [
    1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
    13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
    27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
    483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
    3680.91642923, 366.04496652, 1175.92638695, 1132.21387981,
]

# Life expectancy and gdp data in 2007 for 20 countries
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)

# Change False to True for each block of code to see what it does

# Accessing elements and slicing
if False:
    print(life_expectancy[0])
    print(gdp[3:6])

# Looping
if False:
    for country_life_expectancy in life_expectancy:
        print('Examining life expectancy {}'.format(country_life_expectancy))

# Pandas functions
if False:
    print(life_expectancy.mean())
    print(life_expectancy.std())
    print(gdp.max())
    print(gdp.sum())

# Vectorized operations and index arrays
if False:
    a = pd.Series([1, 2, 3, 4])
    b = pd.Series([1, 2, 1, 2])

    print(a + b)
    print(a * 2)
    print(a >= 3)
    print(a[a >= 3])


def variable_correlation(life_expectancy, gdp):
    """Count how often two variables fall on the same side of their means.

    A data point counts as "same direction" when both values are strictly
    above their own mean, or both are not. A value equal to its mean is
    therefore grouped with the values below the mean.

    For example, ``pd.Series([1, 2, 3, 4])`` with ``pd.Series([4, 5, 6, 7])``
    gives ``(4, 0)``, while ``pd.Series([1, 2, 3, 4])`` with
    ``pd.Series([7, 6, 5, 4])`` gives ``(0, 4)``.

    Args:
        life_expectancy: Pandas Series holding the first variable.
        gdp: Pandas Series holding the second variable, carrying the same
            index labels as ``life_expectancy``.

    Returns:
        A ``(num_same_direction, num_different_direction)`` tuple.
    """
    life_above_mean = life_expectancy > life_expectancy.mean()
    gdp_above_mean = gdp > gdp.mean()

    # True where both variables sit on the same side of their means.
    same_direction = life_above_mean == gdp_above_mean

    num_same_direction = same_direction.sum()
    num_different_direction = (~same_direction).sum()
    return (num_same_direction, num_different_direction)


# Last expression of the cell: a notebook displays its value.
variable_correlation(life_expectancy, gdp)

(17, 3)

带索引的pandas

import pandas as pd

countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola',
    'Argentina', 'Armenia', 'Australia', 'Austria',
    'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
    'Barbados', 'Belarus', 'Belgium', 'Belize',
    'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
]

employment_values = [
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
]

# Employment data in 2007 for 20 countries
employment = pd.Series(employment_values, index=countries)


def max_employment(employment):
    """Return the country with the highest employment and that value.

    Args:
        employment: Pandas Series whose values are employment figures and
            whose index holds the country names.

    Returns:
        A ``(max_country, max_value)`` tuple, where ``max_country`` is the
        index label of the largest value.
    """
    # idxmax() yields the index *label* of the maximum. argmax() yields the
    # integer position instead, which .loc cannot resolve on a string index.
    max_country = employment.idxmax()
    max_value = employment.loc[max_country]
    return (max_country, max_value)


print(max_employment(employment))

('Angola', 75.699996949999999)

import pandas as pd

# Change False to True for each block of code to see what it does

# Addition when indexes are the same
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

# Indexes have same elements in a different order
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
    print(s1 + s2)

# Indexes overlap, but do not have exactly the same elements
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
    print(s1 + s2)
    # Named `total` rather than `sum` so the builtin sum() stays usable.
    total = s1 + s2
    # dropna() removes the labels that exist in only one of the two Series.
    print(total.dropna())

# Indexes do not overlap
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
    print(s1 + s2)
    # fill_value=0 treats a label missing from one Series as 0, not NaN.
    print(s1.add(s2, fill_value=0))

a    11
b    22
c    33
d    44
dtype: int64
a    31
b    12
c    43
d    24
dtype: int64
a     NaN
b     NaN
c    13.0
d    24.0
e     NaN
f     NaN
dtype: float64
c    13.0
d    24.0
dtype: float64
a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
f   NaN
g   NaN
h   NaN
dtype: float64
a     1.0
b     2.0
c     3.0
d     4.0
e    10.0
f    20.0
g    30.0
h    40.0
dtype: float64

import pandas as pd

# Change False to True to see what the following block of code does

# Example pandas apply() usage (although this could have been done
# without apply() using vectorized operations)
if False:
    s = pd.Series([1, 2, 3, 4, 5])

    def add_one(x):
        return x + 1

    print(s.apply(add_one))

names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane',
])


def reverse_name(name):
    """Convert one "Firstname Lastname" string to "Lastname, Firstname".

    Args:
        name: A string with the first and last name separated by a single
            space.

    Returns:
        The name as "Lastname, Firstname". The previous implementation
        returned "Firstname,Lastname", i.e. it did not swap the parts.
    """
    split_name = name.split(" ")
    first_name = split_name[0]
    last_name = split_name[1]
    return last_name + ", " + first_name


def reverse_names(names):
    """Return a new Series with every name as "Lastname, Firstname".

    Args:
        names: Pandas Series of "Firstname Lastname" strings.

    Returns:
        A Pandas Series with the same index, built by applying
        ``reverse_name`` to each entry.
    """
    return names.apply(reverse_name)


print(reverse_names(names))

0             Andre,Agassi
1              Barry,Bonds
2     Christopher,Columbus
3             Daniel,Defoe
4           Emilio,Estevez
5          Fred,Flintstone
6              Greta,Garbo
7          Humbert,Humbert
8               Ivan,Ilych
9              James,Joyce
10         Keira,Knightley
11               Lois,Lane
12              Mike,Myers
13              Nick,Nolte
14           Ozzy,Osbourne
15           Pablo,Picasso
16       Quirinus,Quirrell
17             Rachael,Ray
18          Susan,Sarandon
19             Tina,Turner
20           Ugueth,Urbina
21            Vince,Vaughn
22          Woodrow,Wilson
23             Yoji,Yamada
24         Zinedine,Zidane
dtype: object

import pandas as pd
import matplotlib.pyplot as plt
# Load the employment table from a local CSV file, using its 'Country'
# column as the row index.
employment = pd.read_csv("employment-above-15.csv", index_col='Country')
# Select the row labelled 'United States'.
employment_us = employment.loc['United States']
# Plotting is left disabled; the two lines below are meant for a notebook.
# NOTE(review): the plot call was written as `.plt()`, presumably `.plot()`
# was intended -- confirm before enabling.
# %pylab inline
# employment_us.plot()

优达学城Numpy与Pandas笔记相关推荐

【自动驾驶技术】优达学城无人驾驶工程师学习笔记（七）——计算机视觉基础
计算机视觉基础目录前言颜色选择(Color Selection) 理论基础代码实践区域筛选(Region Masking) 理论基础代码实践 Canny边缘检测问题背景 Canny边缘检测 ...
【自动驾驶技术】优达学城无人驾驶工程师学习笔记（六）——Github与Markdown相关教程
Github与Markdown相关教程本博文为笔者关于优达学城无人驾驶工程师课程中计算机视觉基础部分的学习笔记,该部分为实现车道线图像识别功能的基础课程,关于本课程的详细说明请参考优达学城官网. 优 ...
优达学城-深度学习笔记（一）
优达学城-深度学习笔记(一) 标签: 机器学习优达学城-深度学习笔记一一神经网络简介最大似然概率交叉熵Cross entropy 1交叉熵代码实现 2多类别交叉熵对数几率回归的误差函数co ...
优达学城《无人驾驶入门》学习笔记——卡尔曼滤波器实现详解
优达学城<无人驾驶入门>的第二个项目是实现矩阵类,要求通过python编写一个用来计算矩阵的类Matrix.编写这个类并不难,涉及到的线性代数方面的知识也不多,比如矩阵的加法.减法.乘法, ...
【多传感融合】优达学城多传感融合学习笔记（二）——将激光雷达点云俯视图映射到二维图像
将激光雷达点云俯视图映射到二维图像目录将激光雷达点云俯视图映射到二维图像简介实现方法参考代码简介本节讲解如何将激光雷达点云俯视图(仅考虑水平坐标)映射到二维图像中,其中涉及到激光雷达点云 ...
优达twitter 清理_优达学城的学习感想及优惠
2017年12月因一次偶然的机会,看到优达学城的课程.当时通过优惠码:0C637434报名能有优惠(你们也可以使用拿优惠哦).就决定参加了优达学成的数据分析入门课程. 数据分析入门课程学习的内容有四大 ...
优达学城_数据清洗_项目三wrangle_act
下面是我优达学城项目三的记录报告里面的思路和文字说明大多都在代码块里面的注释中,#后面?,可能不太容易看,需要认真看.? #导入可能需要的包 import os import numpy as np ...
优达学城《DeepLearning》大纲和学习愿景
目的: 查漏补缺深度学习体系,精益求精的深刻理解体系中各种核心的技术原理,感受技术思想背后的精髓魅力,做到能够脱口而出. 计划: 2021年5月.6月,完成课程中所有核心知识的深刻理解(通过撰写博客, ...
优达学城深度学习任务1
这几天刚好有环境,打算学习一下深度学习看了一圈介绍,发现优达学城的深度学习课程作为入门课程还是不错的今天看了第一章节的视频,顺便做了任务1 任务1难度不大,按照网站上的说明可以完成下载.打包等工作 ...

优达学城Numpy与Pandas笔记

基本操作

运算

归一化

numpy的+=与切片需要注意，类似指针去理解

Pandas Series

带索引的pandas

优达学城Numpy与Pandas笔记相关推荐

最新文章

热门文章