本文是优达学城(Udacity)《数据分析入门》第二课(NumPy 与 Pandas)的学习笔记(原文此处的课程网址链接已缺失)。

基本操作

import numpy as np# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina',
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
])

# Change False to True for each block of code to see what it does

# Accessing elements
if True:
    print(countries[0])
    print(countries[3])

# Slicing
if False:
    print(countries[0:3])
    print(countries[:3])
    print(countries[17:])
    print(countries[:])

# Element types
if False:
    print(countries.dtype)
    print(employment.dtype)
    print(np.array([0, 1, 2, 3]).dtype)
    print(np.array([1.0, 1.5, 2.0, 2.5]).dtype)
    print(np.array([True, False, True]).dtype)
    print(np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype)

# Looping
if False:
    for country in countries:
        print('Examining country {}'.format(country))

    # Walk both arrays in lockstep instead of indexing by position.
    for country, country_employment in zip(countries, employment):
        print('Country {} has employment {}'.format(country,
                                                    country_employment))

# Numpy functions
if False:
    print(employment.mean())
    print(employment.std())
    print(employment.max())
    print(employment.sum())


def max_employment(countries, employment):
    """Return the country with the highest employment and that value.

    Args:
        countries: NumPy array of country names.
        employment: NumPy array of employment values, in the same order
            as ``countries``.

    Returns:
        A ``(country_name, employment_value)`` tuple for the largest entry
        of ``employment``. When several entries tie for the maximum,
        ``argmax`` picks the first one.
    """
    # argmax gives the position of the maximum, which indexes both arrays.
    i = employment.argmax()
    return (countries[i], employment[i])
Afghanistan
Angola

运算

import numpy as np# Change False to True for each block of code to see what it does# Arithmetic operations between 2 NumPy arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([1, 2, 1, 2])

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Arithmetic operations between a NumPy array and a single number
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Logical operations with NumPy arrays
if False:
    a = np.array([True, True, False, False])
    b = np.array([True, False, True, False])

    print(a & b)
    print(a | b)
    print(~a)

    print(a & True)
    print(a & False)

    print(a | True)
    print(a | False)

# Comparison operations between 2 NumPy Arrays
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([5, 4, 3, 2, 1])

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# Comparison operations between a NumPy array and a single number
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# First 20 countries with school completion data
countries = np.array([
    'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria',
    'Azerbaijan', 'Bahamas', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Bolivia', 'Botswana', 'Brunei', 'Bulgaria',
    'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Cape Verde',
])

# Female school completion rate in 2007 for those 20 countries
female_completion = np.array([
    97.35583, 104.62379, 103.02998, 95.14321, 103.69019,
    98.49185, 100.88828, 95.43974, 92.11484, 91.54804,
    95.98029, 98.22902, 96.12179, 119.28105, 97.84627,
    29.07386, 38.41644, 90.70509, 51.7478, 95.45072,
])

# Male school completion rate in 2007 for those 20 countries
male_completion = np.array([
    95.47622, 100.66476, 99.7926, 91.48936, 103.22096,
    97.80458, 103.81398, 88.11736, 93.55611, 87.76347,
    102.45714, 98.73953, 92.22388, 115.3892, 98.70502,
    37.00692, 45.39401, 91.22084, 62.42028, 90.66958,
])


def overall_completion_rate(female_completion, male_completion):
    """Combine female and male school completion figures per country.

    Args:
        female_completion: NumPy array of female completion rates.
        male_completion: NumPy array of male completion rates, in the same
            country order as ``female_completion``.

    Returns:
        A NumPy array holding the element-wise sum of the two inputs.

    NOTE(review): the exercise asks for an "overall completion rate", but
    this returns the sum of the two rates rather than an average of them.
    The sum is kept because the caller stores it as ``sum_completion``;
    confirm whether ``(female + male) / 2`` was intended.
    """
    return female_completion + male_completion


sum_completion = overall_completion_rate(female_completion, male_completion)
print(sum_completion)
[ 192.83205  205.28855  202.82258  186.63257  206.91115  196.29643
  204.70226  183.5571   185.67095  179.31151  198.43743  196.96855
  188.34567  234.67025  196.55129   66.08078   83.81045  181.92593
  114.16808  186.1203 ]

归一化

import numpy as np# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina',
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
])

# Change this country name to change what country will be printed when you
# click "Test Run". Your function will be called to determine the standardized
# score for this country for each of the given 5 Gapminder variables in 2007.
# The possible country names are available in the Downloadables section.
country_name = 'United States'


def standardize_data(values):
    """Return ``values`` expressed as standard deviations from the mean.

    Each element becomes ``(value - mean) / std``: positive for values above
    the mean and negative for values below it.

    Args:
        values: NumPy array of numbers.

    Returns:
        A NumPy array of the same shape holding the standardized scores.
        For a NumPy array ``.std()`` defaults to the population standard
        deviation. If every value is identical the standard deviation is
        zero and the division yields NaN.
    """
    return (values - values.mean()) / values.std()


# Notebook cell: the bare expression displays the standardized employment data.
standardize_data(employment)
array([-0.31965231, -0.780123  , -0.87650077,  1.82207181, -0.03051941,
       -1.99019768,  0.30144772, -0.16973184,  0.23719615,  0.84758731,
        0.18365304,  1.00821665,  0.87971351, -0.56595055, -1.07996476,
       -0.20185762,  1.38301845, -0.03051941,  1.2545153 , -1.87240259])
import numpy as np# Change False to True for each block of code to see what it does# Using index arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([True, True, False, False])

    print(a[b])
    print(a[np.array([True, False, True, False])])

# Creating the index array using vectorized operations
if False:
    a = np.array([1, 2, 3, 2, 1])
    b = (a >= 2)

    print(a[b])
    print(a[a >= 2])

# Creating the index array using vectorized operations on another array
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([1, 2, 3, 2, 1])

    print(b == 2)
    print(a[b == 2])


def mean_time_for_paid_students(time_spent, days_to_cancel):
    """Return the mean classroom time of students enrolled at least 7 days.

    Args:
        time_spent: NumPy array with the time each student spent in the
            classroom.
        days_to_cancel: NumPy array with the number of days until each
            student cancelled, in the same order as ``time_spent``.

    Returns:
        The mean of ``time_spent`` over the students whose
        ``days_to_cancel`` is greater than or equal to 7. If no student
        qualifies, the mean of the empty selection is NaN.
    """
    # The boolean mask keeps only the students who stayed 7 days or more.
    return time_spent[days_to_cancel >= 7].mean()


# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
    12.89697233, 0.0, 64.55043217, 0.0,
    24.2315615, 39.991625, 0.0, 0.0,
    147.20683783, 0.0, 0.0, 0.0,
    45.18261617, 157.60454283, 133.2434615, 52.85000767,
    0.0, 54.9204785, 26.78142417, 0.0,
])

# Days to cancel for 20 students
days_to_cancel = np.array([
    4, 5, 37, 3, 12, 4, 35, 38, 5, 37, 3, 3, 68,
    38, 98, 2, 249, 2, 127, 35,
])

# Notebook cell: the bare expression displays the result.
mean_time_for_paid_students(time_spent, days_to_cancel)
41.054003485454537

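注意 NumPy 的 `+=` 与切片:`+=` 是原地(in-place)运算,切片返回的是原数组的视图(view)而不是副本,两者都会直接作用在原数组的数据上,可以类比指针/引用来理解。例如 `b = a; b += 1` 会同时改变 `a`,而 `b = b + 1` 会新建一个数组,`a` 保持不变;同理,`s = a[:3]; s[0] = 100` 也会修改 `a[0]`。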

Pandas Series

import pandas as pd

countries = [
    'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
    'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
    'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
    'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
]

life_expectancy_values = [
    74.7, 75.0, 83.4, 57.6, 74.6, 75.4, 72.3, 81.5, 80.2,
    70.3, 72.1, 76.4, 68.1, 75.2, 69.8, 79.4, 70.8, 62.7,
    67.3, 70.6,
]

gdp_values = [
    1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
    13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
    27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
    483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
    3680.91642923, 366.04496652, 1175.92638695, 1132.21387981,
]

# Life expectancy and gdp data in 2007 for 20 countries
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)

# Change False to True for each block of code to see what it does

# Accessing elements and slicing
if False:
    print(life_expectancy[0])
    print(gdp[3:6])

# Looping
if False:
    for country_life_expectancy in life_expectancy:
        print('Examining life expectancy {}'.format(country_life_expectancy))

# Pandas functions
if False:
    print(life_expectancy.mean())
    print(life_expectancy.std())
    print(gdp.max())
    print(gdp.sum())

# Vectorized operations and index arrays
if False:
    a = pd.Series([1, 2, 3, 4])
    b = pd.Series([1, 2, 1, 2])

    print(a + b)
    print(a * 2)
    print(a >= 3)
    print(a[a >= 3])


def variable_correlation(life_expectancy, gdp):
    """Count data points where both variables lie on the same side of their mean.

    "Direction" means whether a value is above its own series mean. A value
    equal to the mean is classified together with the values below it.

    For example, ``pd.Series([1, 2, 3, 4])`` and ``pd.Series([4, 5, 6, 7])``
    give ``(4, 0)``: 1 and 4 are both below their means, 2 and 5 are both
    below, 3 and 6 are both above, and 4 and 7 are both above. With
    ``pd.Series([7, 6, 5, 4])`` as the second input the result is ``(0, 4)``.

    Args:
        life_expectancy: pandas Series holding the first variable.
        gdp: pandas Series holding the second variable, with the same index
            as ``life_expectancy``.

    Returns:
        A ``(num_same_direction, num_different_direction)`` tuple.
    """
    life_above_mean = life_expectancy > life_expectancy.mean()
    gdp_above_mean = gdp > gdp.mean()

    # True where both values are above, or both are not above, their means.
    same_direction = life_above_mean == gdp_above_mean

    num_same_direction = same_direction.sum()
    num_different_direction = (~same_direction).sum()
    return (num_same_direction, num_different_direction)


# Notebook cell: the bare expression displays the result.
variable_correlation(life_expectancy, gdp)
(17, 3)

带索引的pandas

import pandas as pd

countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola',
    'Argentina', 'Armenia', 'Australia', 'Austria',
    'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
    'Barbados', 'Belarus', 'Belgium', 'Belize',
    'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
]

employment_values = [
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
]

# Employment data in 2007 for 20 countries
employment = pd.Series(employment_values, index=countries)


def max_employment(employment):
    """Return the country with the highest employment and that value.

    Args:
        employment: pandas Series whose values are employment figures and
            whose index holds the country names.

    Returns:
        A ``(country_name, employment_value)`` tuple for the largest value
        in the series.

    Documentation for ``idxmax`` can be found here:
    http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.idxmax.html
    """
    # idxmax() returns the index *label* of the maximum. Series.argmax()
    # returns an integer position in current pandas, which .loc cannot look
    # up on a string index.
    max_country = employment.idxmax()
    max_value = employment.loc[max_country]
    return (max_country, max_value)


print(max_employment(employment))
('Angola', 75.699996949999999)
import pandas as pd# Change False to True for each block of code to see what it does# Addition when indexes are the same
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])

    print(s1 + s2)

# Indexes have same elements in a different order
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])

    # Values are matched by index label, not by position.
    print(s1 + s2)

# Indexes overlap, but do not have exactly the same elements
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])

    print(s1 + s2)

    # Labels present in only one series produce NaN; dropna() removes them.
    # Named `total` so the builtin sum() is not shadowed.
    total = s1 + s2
    print(total.dropna())

# Indexes do not overlap
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])

    print(s1 + s2)

    # fill_value=0 treats a label missing from one series as 0 instead of NaN.
    print(s1.add(s2, fill_value=0))
a    11
b    22
c    33
d    44
dtype: int64
a    31
b    12
c    43
d    24
dtype: int64
a     NaN
b     NaN
c    13.0
d    24.0
e     NaN
f     NaN
dtype: float64
c    13.0
d    24.0
dtype: float64
a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
f   NaN
g   NaN
h   NaN
dtype: float64
a     1.0
b     2.0
c     3.0
d     4.0
e    10.0
f    20.0
g    30.0
h    40.0
dtype: float64
import pandas as pd# Change False to True to see what the following block of code does# Example pandas apply() usage (although this could have been done
# without apply() using vectorized operations)
if False:
    s = pd.Series([1, 2, 3, 4, 5])

    def add_one(x):
        return x + 1

    print(s.apply(add_one))

names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane',
])


def reverse_name(name):
    """Convert one "Firstname Lastname" string to "Lastname, Firstname".

    The name is split at the first space, so everything after it is treated
    as the last name. A name without any space is returned unchanged.
    """
    first_name, separator, last_name = name.partition(" ")
    if not separator:
        return name
    return "{}, {}".format(last_name, first_name)


def reverse_names(names):
    """Return a new series with every name reversed.

    Args:
        names: pandas Series of strings in "Firstname Lastname" format.

    Returns:
        A pandas Series with the same index where each name is in
        "Lastname, Firstname" format.
    """
    return names.apply(reverse_name)


print(reverse_names(names))
0             Andre,Agassi
1              Barry,Bonds
2     Christopher,Columbus
3             Daniel,Defoe
4           Emilio,Estevez
5          Fred,Flintstone
6              Greta,Garbo
7          Humbert,Humbert
8               Ivan,Ilych
9              James,Joyce
10         Keira,Knightley
11               Lois,Lane
12              Mike,Myers
13              Nick,Nolte
14           Ozzy,Osbourne
15           Pablo,Picasso
16       Quirinus,Quirrell
17             Rachael,Ray
18          Susan,Sarandon
19             Tina,Turner
20           Ugueth,Urbina
21            Vince,Vaughn
22          Woodrow,Wilson
23             Yoji,Yamada
24         Zinedine,Zidane
dtype: object
import pandas as pd
import matplotlib.pyplot as plt
# Load the employment table from CSV, using its 'Country' column as the row index.
employment = pd.read_csv("employment-above-15.csv", index_col='Country')
# Select the row labelled 'United States' from the table.
employment_us = employment.loc['United States']
# Plotting was left disabled in the notebook:
# %pylab inline
# employment_us.plt()
# NOTE(review): pandas objects expose `.plot()`, not `.plt()`; the disabled
# call above presumably needs to be `employment_us.plot()` — confirm before
# re-enabling it.

优达学城Numpy与Pandas笔记相关推荐

  1. 【自动驾驶技术】优达学城无人驾驶工程师学习笔记(七)——计算机视觉基础

    计算机视觉基础目录 前言 颜色选择(Color Selection) 理论基础 代码实践 区域筛选(Region Masking) 理论基础 代码实践 Canny边缘检测 问题背景 Canny边缘检测 ...

  2. 【自动驾驶技术】优达学城无人驾驶工程师学习笔记(六)——Github与Markdown相关教程

    Github与Markdown相关教程 本博文为笔者关于优达学城无人驾驶工程师课程中计算机视觉基础部分的学习笔记,该部分为实现车道线图像识别功能的基础课程,关于本课程的详细说明请参考优达学城官网. 优 ...

  3. 优达学城-深度学习笔记(一)

    优达学城-深度学习笔记(一) 标签: 机器学习 优达学城-深度学习笔记一 一 神经网络简介 最大似然概率 交叉熵Cross entropy 1交叉熵代码实现 2多类别交叉熵 对数几率回归的误差函数co ...

  4. 优达学城《无人驾驶入门》学习笔记——卡尔曼滤波器实现详解

    优达学城<无人驾驶入门>的第二个项目是实现矩阵类,要求通过python编写一个用来计算矩阵的类Matrix.编写这个类并不难,涉及到的线性代数方面的知识也不多,比如矩阵的加法.减法.乘法, ...

  5. 【多传感融合】优达学城多传感融合学习笔记(二)——将激光雷达点云俯视图映射到二维图像

    将激光雷达点云俯视图映射到二维图像 目录 将激光雷达点云俯视图映射到二维图像 简介 实现方法 参考代码 简介 本节讲解如何将激光雷达点云俯视图(仅考虑水平坐标)映射到二维图像中,其中涉及到激光雷达点云 ...

  6. 优达twitter 清理_优达学城的学习感想及优惠

    2017年12月因一次偶然的机会,看到优达学城的课程.当时通过优惠码:0C637434报名能有优惠(你们也可以使用拿优惠哦).就决定参加了优达学成的数据分析入门课程. 数据分析入门课程学习的内容有四大 ...

  7. 优达学城_数据清洗_项目三wrangle_act

    下面是我优达学城项目三的记录报告 里面的思路和文字说明大多都在代码块里面的注释中,#后面?,可能不太容易看,需要认真看.? #导入可能需要的包 import os import numpy as np ...

  8. 优达学城《DeepLearning》大纲和学习愿景

    目的: 查漏补缺深度学习体系,精益求精的深刻理解体系中各种核心的技术原理,感受技术思想背后的精髓魅力,做到能够脱口而出. 计划: 2021年5月.6月,完成课程中所有核心知识的深刻理解(通过撰写博客, ...

  9. 优达学城 深度学习 任务1

    这几天刚好有环境,打算学习一下深度学习 看了一圈介绍,发现优达学城的深度学习课程作为入门课程还是不错的 今天看了第一章节的视频,顺便做了任务1 任务1难度不大,按照网站上的说明可以完成下载.打包等工作 ...

最新文章

  1. HPE品牌存储为啥高调? 因为“王炸”多啊!
  2. Linux下的格式化字符串漏洞利用姿势
  3. pywinauto 同花顺_东方财富、同花顺、恒生电子,到底谁是互联网金融老大?
  4. 春节添彩 福州花卉市场现“买花潮”
  5. GitHub热门教程:100天搞定机器学习(中文版)
  6. vue写进html,vue中html页面写入$t(‘’)怎么显示
  7. BZOJ3738 : [Ontak2013]Kapitał
  8. Angular set函数和Component属性的命名冲突问题
  9. 【转】VS编译环境命令窗口中的命令
  10. java restclient 调用_restclient 访问 springmvc java工程接口
  11. 怎么让jsp中的按钮置灰不能使用_拆解按钮规范
  12. MTK刷机(ubuntu下)
  13. jsp余jspx的区别
  14. 国内热门ERP软件有哪些推荐?
  15. Rufus轻松创建USB启动盘
  16. 2011广告联盟排名,最好的广告联盟推荐
  17. 显示一个立方体的斜二测图(用数组存放正方体的各顶点坐标)
  18. 【Linux 性能优化】利用perf和CPU使用率定位异常函数
  19. UPnP与Jini面对面
  20. Map中的keySet方法

热门文章

  1. 谷歌浏览器调试vue项目
  2. 华为机试真题 Python 实现【观看文艺汇演问题】【计算最多能观看几场演出】
  3. 日常用到的开源软件列表
  4. 什么是新零售 新零售对电商的影响是什么?
  5. Android 方向传感器与磁力计和加速度传感器之间的关系
  6. 国内期货量化之期货量化の自动切换主力合约
  7. vscode的Latex学习,bug别来
  8. python Pyqt5 QLabel控件 自定义添加鼠标单击信号/事件 双击信号/事件
  9. 搭建rsync+inotify实时同步
  10. 常见的“压缩与解压缩”方法...