  • python数据分析处理的一个package
  • 基于numpy(对“矩阵”做科学计算)
  • 有一种用python去操作Excel/SQL的感觉


  • Series
  • DataFrame
  • Index
  • csv文件读写


import numpy as np
import pandas as pd
# json.loads()解码python json格式
import json
jsonStr = '{"name":"aspiring", "age": 17, "hobby": ["money","power", "read"],"parames":{"a":1,"b":2}}'
jsonData = json.loads(jsonStr)
{'name': 'aspiring', 'age': 17, 'hobby': ['money', 'power', 'read'], 'parames': {'a': 1, 'b': 2}}
<class 'dict'>
['money', 'power', 'read']
# 读json文件
# json.load()加载python json格式文件
path1 = 'data/example.json'
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
# python数据分析书籍例子
import json
path2 = 'data/example.json'
records = [json.loads(line) for line in open(path2, 'r', encoding='utf-8')]
records[0]
records[0]['tz']


s = pd.Series([7, 'Beijing', 3.14, -12345, 'HanXiaoyang'])
0              7
1        Beijing
2           3.14
3         -12345
4    HanXiaoyang
dtype: object
array([7, 'Beijing', 3.14, -12345, 'HanXiaoyang'], dtype=object)
RangeIndex(start=0, stop=5, step=1)
0              7
1        Beijing
2           3.14
3         -12345
4    HanXiaoyang
dtype: object


s = pd.Series([7, 'Beijing', 3.14, -12345, 'HanXiaoyang'], index=['A', 'B', 'C', 'D', 'E'])
A              7
B        Beijing
C           3.14
D         -12345
E    HanXiaoyang
dtype: object
s[ ['A','D','B'] ]
A          7
D     -12345
B    Beijing
dtype: object


cities = {'Beijing':55000, 'ShangHai':60000, 'Shenzhen':50000, 'Hangzhou':30000, 'Guangzhou':40000, 'Suzhou':None}
{'Beijing': 55000,'Guangzhou': 40000,'Hangzhou': 30000,'ShangHai': 60000,'Shenzhen': 50000,'Suzhou': None}
apt = pd.Series(cities, name='income')
Beijing      55000.0
Guangzhou    40000.0
Hangzhou     30000.0
ShangHai     60000.0
Shenzhen     50000.0
Suzhou           NaN
Name: income, dtype: float64
# 索引
Guangzhou    40000.0
Hangzhou     30000.0
ShangHai     60000.0
Shenzhen     50000.0
Suzhou           NaN
Name: income, dtype: float64
Beijing      55000.0
Guangzhou    40000.0
Hangzhou     30000.0
ShangHai     60000.0
Shenzhen     50000.0
Name: income, dtype: float64
ShangHai     60000.0
Shenzhen     50000.0
Guangzhou    40000.0
Name: income, dtype: float64
apt[ ['ShangHai', 'Shenzhen', 'Guangzhou'] ]
ShangHai     60000.0
Shenzhen     50000.0
Guangzhou    40000.0
Name: income, dtype: float64
# 简单的计算
# 广播特性
Beijing      165000.0
Guangzhou    120000.0
Hangzhou      90000.0
ShangHai     180000.0
Shenzhen     150000.0
Suzhou            NaN
Name: income, dtype: float64
Beijing      22000.0
Guangzhou    16000.0
Hangzhou     12000.0
ShangHai     24000.0
Shenzhen     20000.0
Suzhou           NaN
Name: income, dtype: float64
# list不可以直接做数学运算
my_list = [2,4,6,8,10]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-29-39aba40a404f> in <module>()
----> 1 my_list/2

TypeError: unsupported operand type(s) for /: 'list' and 'int'
Guangzhou    40000.0
Hangzhou     30000.0
ShangHai     60000.0
Shenzhen     50000.0
Suzhou           NaN
Name: income, dtype: float64
Beijing      55000.0
Guangzhou    40000.0
Hangzhou     30000.0
ShangHai     60000.0
Shenzhen     50000.0
Name: income, dtype: float64
# 基于索引去做计算的
apt[1:] + apt[:-1]
Beijing           NaN
Guangzhou     80000.0
Hangzhou      60000.0
ShangHai     120000.0
Shenzhen     100000.0
Suzhou            NaN
Name: income, dtype: float64
# in判断index是否存在
'Hangzhou' in apt
'Chongqing' in apt
# apt['Chongqing'] 不OK的
boolean indexing/条件判断索引
Beijing       True
Guangzhou     True
Hangzhou     False
ShangHai      True
Shenzhen      True
Suzhou       False
Name: income, dtype: bool
Beijing      55000.0
Guangzhou    40000.0
ShangHai     60000.0
Shenzhen     50000.0
Name: income, dtype: float64
# 统计计算


Beijing      55000.0
Guangzhou    40000.0
Hangzhou     30000.0
ShangHai     60000.0
Shenzhen     50000.0
Suzhou           NaN
Name: income, dtype: float64
apt['Shenzhen'] = 70000
Beijing      55000.0
Guangzhou    40000.0
Hangzhou     30000.0
ShangHai     60000.0
Shenzhen     70000.0
Suzhou           NaN
Name: income, dtype: float64
apt[apt<=40000] = 45000
Beijing      55000.0
Guangzhou    45000.0
Hangzhou     45000.0
ShangHai     60000.0
Shenzhen     70000.0
Suzhou           NaN
Name: income, dtype: float64
Beijing      10.915088
Guangzhou    10.714418
Hangzhou     10.714418
ShangHai     11.002100
Shenzhen     11.156251
Suzhou             NaN
Name: income, dtype: float64
cars = pd.Series({'Beijing':350000, 'ShangHai':400000, 'Shenzhen':300000, \
                  'Tianjin':200000, 'Guangzhou':250000, 'Chongqing':150000})
Beijing      350000
Chongqing    150000
Guangzhou    250000
ShangHai     400000
Shenzhen     300000
Tianjin      200000
dtype: int64
expense = cars + 10*apt
Beijing       900000.0
Chongqing          NaN
Guangzhou     700000.0
Hangzhou           NaN
ShangHai     1000000.0
Shenzhen     1000000.0
Suzhou             NaN
Tianjin            NaN
dtype: float64


'Hangzhou' in apt
'Hangzhou' in cars
Beijing      55000.0
Guangzhou    45000.0
Hangzhou     45000.0
ShangHai     60000.0
Shenzhen     70000.0
Suzhou           NaN
Name: income, dtype: float64
# bool结果返回
Beijing       True
Guangzhou     True
Hangzhou      True
ShangHai      True
Shenzhen      True
Suzhou       False
Name: income, dtype: bool
Beijing      False
Guangzhou    False
Hangzhou     False
ShangHai     False
Shenzhen     False
Suzhou        True
Name: income, dtype: bool
Beijing       900000.0
Chongqing          NaN
Guangzhou     700000.0
Hangzhou           NaN
ShangHai     1000000.0
Shenzhen     1000000.0
Suzhou             NaN
Tianjin            NaN
dtype: float64
expense[expense.isnull()] = expense.mean()
Beijing       900000.0
Chongqing     900000.0
Guangzhou     700000.0
Hangzhou      900000.0
ShangHai     1000000.0
Shenzhen     1000000.0
Suzhou        900000.0
Tianjin       900000.0
dtype: float64



data = {'City':['Beijing','ShangHai','Guangzhou','Shenzhen','Hangzhou','Chongqing'],'year':[2017,2018,2017,2018,2017,2017],'population':[2100,2300,1000,700,500,500]}
City population year
0 Beijing 2100 2017
1 ShangHai 2300 2018
2 Guangzhou 1000 2017
3 Shenzhen 700 2018
4 Hangzhou 500 2017
5 Chongqing 500 2017
pd.DataFrame(data, columns=['year','City','population'])
year City population
0 2017 Beijing 2100
1 2018 ShangHai 2300
2 2017 Guangzhou 1000
3 2018 Shenzhen 700
4 2017 Hangzhou 500
5 2017 Chongqing 500
# index
pd.DataFrame(data, columns=['year','City','population'], index=['one','two','three','four','five','six'])
year City population
one 2017 Beijing 2100
two 2018 ShangHai 2300
three 2017 Guangzhou 1000
four 2018 Shenzhen 700
five 2017 Hangzhou 500
six 2017 Chongqing 500
# DataFrame可以视作Series的集合
Beijing      55000.0
Guangzhou    45000.0
Hangzhou     45000.0
ShangHai     60000.0
Shenzhen     70000.0
Suzhou           NaN
Name: income, dtype: float64
Beijing      350000
Chongqing    150000
Guangzhou    250000
ShangHai     400000
Shenzhen     300000
Tianjin      200000
dtype: int64
df = pd.DataFrame({'apt':apt, 'cars':cars})
apt cars
Beijing 55000.0 350000.0
Chongqing NaN 150000.0
Guangzhou 45000.0 250000.0
Hangzhou 45000.0 NaN
ShangHai 60000.0 400000.0
Shenzhen 70000.0 300000.0
Suzhou NaN NaN
Tianjin NaN 200000.0
# 取出一列(Series)
Beijing      55000.0
Chongqing        NaN
Guangzhou    45000.0
Hangzhou     45000.0
ShangHai     60000.0
Shenzhen     70000.0
Suzhou           NaN
Tianjin          NaN
Name: apt, dtype: float64
Beijing 55000.0
Chongqing NaN
Guangzhou 45000.0
Hangzhou 45000.0
ShangHai 60000.0
Shenzhen 70000.0
Suzhou NaN
Tianjin NaN
# 赋值
apt cars
Beijing 55000.0 350000.0
Chongqing NaN 150000.0
Guangzhou 45000.0 250000.0
Hangzhou 45000.0 NaN
ShangHai 60000.0 400000.0
Shenzhen 70000.0 300000.0
Suzhou NaN NaN
Tianjin NaN 200000.0
df['bonus'] = 40000
apt cars bonus
Beijing 55000.0 350000.0 40000
Chongqing NaN 150000.0 40000
Guangzhou 45000.0 250000.0 40000
Hangzhou 45000.0 NaN 40000
ShangHai 60000.0 400000.0 40000
Shenzhen 70000.0 300000.0 40000
Suzhou NaN NaN 40000
Tianjin NaN 200000.0 40000
# 对两列做计算
df['expense'] = df['apt'] + df['bonus']
apt cars bonus expense
Beijing 55000.0 350000.0 40000 95000.0
Chongqing NaN 150000.0 40000 NaN
Guangzhou 45000.0 250000.0 40000 85000.0
Hangzhou 45000.0 NaN 40000 85000.0
ShangHai 60000.0 400000.0 40000 100000.0
Shenzhen 70000.0 300000.0 40000 110000.0
Suzhou NaN NaN 40000 NaN
Tianjin NaN 200000.0 40000 NaN
Index(['Beijing', 'Chongqing', 'Guangzhou', 'Hangzhou', 'ShangHai', 'Shenzhen','Suzhou', 'Tianjin'],dtype='object')
apt         55000.0
cars       350000.0
bonus       40000.0
expense     95000.0
Name: Beijing, dtype: float64
df.loc[['Beijing', 'ShangHai', 'Guangzhou']]
apt cars bonus expense
Beijing 55000.0 350000.0 40000 95000.0
ShangHai 60000.0 400000.0 40000 100000.0
Guangzhou 45000.0 250000.0 40000 85000.0
apt cars bonus expense
Beijing 55000.0 350000.0 40000 95000.0
Chongqing NaN 150000.0 40000 NaN
Guangzhou 45000.0 250000.0 40000 85000.0
Hangzhou 45000.0 NaN 40000 85000.0
ShangHai 60000.0 400000.0 40000 100000.0
Shenzhen 70000.0 300000.0 40000 110000.0
Suzhou NaN NaN 40000 NaN
Tianjin NaN 200000.0 40000 NaN
# 高级函数loc
# 利用index的名称,来获取想要的行(或列)
df.loc['Beijing':'Suzhou', ['apt','bonus']]
apt bonus
Beijing 55000.0 40000
Chongqing NaN 40000
Guangzhou 45000.0 40000
Hangzhou 45000.0 40000
ShangHai 60000.0 40000
Shenzhen 70000.0 40000
Suzhou NaN 40000
# 类似切片的用法
df.loc['Beijing':'Suzhou', 'apt':'bonus']
apt cars bonus
Beijing 55000.0 350000.0 40000
Chongqing NaN 150000.0 40000
Guangzhou 45000.0 250000.0 40000
Hangzhou 45000.0 NaN 40000
ShangHai 60000.0 400000.0 40000
Shenzhen 70000.0 300000.0 40000
Suzhou NaN NaN 40000
# 传入list的用法
df.loc[['Beijing','Suzhou'], ['apt','bonus']]
apt bonus
Beijing 55000.0 40000
Suzhou NaN 40000
apt cars bonus expense
Beijing 55000.0 350000.0 40000 95000.0
Chongqing NaN 150000.0 40000 NaN
Guangzhou 45000.0 250000.0 40000 85000.0
Hangzhou 45000.0 NaN 40000 85000.0
ShangHai 60000.0 400000.0 40000 100000.0
Shenzhen 70000.0 300000.0 40000 110000.0
Suzhou NaN NaN 40000 NaN
Tianjin NaN 200000.0 40000 NaN
df.loc['Beijing','bonus'] = 50000
apt cars bonus expense
Beijing 55000.0 350000.0 50000 95000.0
Chongqing NaN 150000.0 40000 NaN
Guangzhou 45000.0 250000.0 40000 85000.0
Hangzhou 45000.0 NaN 40000 85000.0
ShangHai 60000.0 400000.0 40000 100000.0
Shenzhen 70000.0 300000.0 40000 110000.0
Suzhou NaN NaN 40000 NaN
Tianjin NaN 200000.0 40000 NaN
df.loc[:,'expense'] = 100000
apt cars bonus expense
Beijing 55000.0 350000.0 50000 100000
Chongqing NaN 150000.0 40000 100000
Guangzhou 45000.0 250000.0 40000 100000
Hangzhou 45000.0 NaN 40000 100000
ShangHai 60000.0 400000.0 40000 100000
Shenzhen 70000.0 300000.0 40000 100000
Suzhou NaN NaN 40000 100000
Tianjin NaN 200000.0 40000 100000
# 返回表示DataFrame维度的元组
(8, 4)
<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, Beijing to Tianjin
Data columns (total 5 columns):
apt        5 non-null float64
cars       6 non-null float64
bonus      8 non-null int64
expense    8 non-null int64
color      8 non-null object
dtypes: float64(2), int64(2), object(1)
memory usage: 704.0+ bytes
Beijing Chongqing Guangzhou Hangzhou ShangHai Shenzhen Suzhou Tianjin
apt 55000.0 NaN 45000.0 45000.0 60000.0 70000.0 NaN NaN
cars 350000.0 150000.0 250000.0 NaN 400000.0 300000.0 NaN 200000.0
bonus 50000.0 40000.0 40000.0 40000.0 40000.0 40000.0 40000.0 40000.0
expense 100000.0 100000.0 100000.0 100000.0 100000.0 100000.0 100000.0 100000.0
apt cars bonus expense
Beijing 55000.0 350000.0 50000 100000
Chongqing NaN 150000.0 40000 100000
Guangzhou 45000.0 250000.0 40000 100000
Hangzhou 45000.0 NaN 40000 100000
ShangHai 60000.0 400000.0 40000 100000
Shenzhen 70000.0 300000.0 40000 100000
Suzhou NaN NaN 40000 100000
Tianjin NaN 200000.0 40000 100000
apt cars bonus expense
count 5.000000 6.000000 8.000000 8.0
mean 55000.000000 275000.000000 41250.000000 100000.0
std 10606.601718 93541.434669 3535.533906 0.0
min 45000.000000 150000.000000 40000.000000 100000.0
25% 45000.000000 212500.000000 40000.000000 100000.0
50% 55000.000000 275000.000000 40000.000000 100000.0
75% 60000.000000 337500.000000 40000.000000 100000.0
max 70000.000000 400000.000000 50000.000000 100000.0
Beijing      350000.0
Chongqing    150000.0
Guangzhou    250000.0
Hangzhou          NaN
ShangHai     400000.0
Shenzhen     300000.0
Suzhou            NaN
Tianjin      200000.0
Name: cars, dtype: float64
df['cars'] < 310000
Beijing      False
Chongqing     True
Guangzhou     True
Hangzhou     False
ShangHai     False
Shenzhen      True
Suzhou       False
Tianjin       True
Name: cars, dtype: bool
df.loc[:,'color'] = ['红','黄','紫','蓝','红','绿','棕','橙']
apt cars bonus expense color
Beijing 55000.0 350000.0 50000 100000 红
Chongqing NaN 150000.0 40000 100000 黄
Guangzhou 45000.0 250000.0 40000 100000 紫
Hangzhou 45000.0 NaN 40000 100000 蓝
ShangHai 60000.0 400000.0 40000 100000 红
Shenzhen 70000.0 300000.0 40000 100000 绿
Suzhou NaN NaN 40000 100000 棕
Tianjin NaN 200000.0 40000 100000 橙
Beijing       True
Chongqing    False
Guangzhou    False
Hangzhou     False
ShangHai      True
Shenzhen      True
Suzhou       False
Tianjin      False
Name: color, dtype: bool
apt cars bonus expense color
Beijing 55000.0 350000.0 50000 100000
Chongqing NaN 150000.0 40000 100000
Guangzhou 45000.0 250000.0 40000 100000
Hangzhou 45000.0 NaN 40000 100000
ShangHai 60000.0 400000.0 40000 100000
Shenzhen 70000.0 300000.0 40000 100000 绿
Suzhou NaN NaN 40000 100000
Tianjin NaN 200000.0 40000 100000
# 填充缺失值
#df.fillna(value=50000, inplace=True)
apt cars bonus expense color
Beijing 55000.0 350000.0 50000 100000
Chongqing 50000.0 150000.0 40000 100000
Guangzhou 45000.0 250000.0 40000 100000
Hangzhou 45000.0 50000.0 40000 100000
ShangHai 60000.0 400000.0 40000 100000
Shenzhen 70000.0 300000.0 40000 100000 绿
Suzhou 50000.0 50000.0 40000 100000
Tianjin 50000.0 200000.0 40000 100000
apt cars bonus expense color
Beijing 55000.0 350000.0 50000 100000
Chongqing NaN 150000.0 40000 100000
Guangzhou 45000.0 250000.0 40000 100000
Hangzhou 45000.0 NaN 40000 100000
ShangHai 60000.0 400000.0 40000 100000
Shenzhen 70000.0 300000.0 40000 100000 绿
Suzhou NaN NaN 40000 100000
Tianjin NaN 200000.0 40000 100000
apt cars bonus expense color
Beijing 55000.0 350000.0 50000 100000
Chongqing 55000.0 150000.0 40000 100000
Guangzhou 45000.0 250000.0 40000 100000
Hangzhou 45000.0 250000.0 40000 100000
ShangHai 60000.0 400000.0 40000 100000
Shenzhen 70000.0 300000.0 40000 100000 绿
Suzhou 70000.0 300000.0 40000 100000
Tianjin 70000.0 200000.0 40000 100000
apt cars bonus expense color
Beijing 55000.0 350000.0 50000 100000
Chongqing 45000.0 150000.0 40000 100000
Guangzhou 45000.0 250000.0 40000 100000
Hangzhou 45000.0 400000.0 40000 100000
ShangHai 60000.0 400000.0 40000 100000
Shenzhen 70000.0 300000.0 40000 100000 绿
Suzhou NaN 200000.0 40000 100000
Tianjin NaN 200000.0 40000 100000
!head -10 data/GOOG.csv
'head' 不是内部或外部命令,也不是可运行的程序
goog = pd.read_csv('data/GOOG.csv', index_col=0, parse_dates=['Date'])
Open High Low Close Adj Close Volume
2004-08-19 49.813286 51.835709 47.800831 49.982655 49.982655 44871300
2004-08-20 50.316402 54.336334 50.062355 53.952770 53.952770 22942800
2004-08-23 55.168217 56.528118 54.321388 54.495735 54.495735 18342800
2004-08-24 55.412300 55.591629 51.591621 52.239193 52.239193 15319700
2004-08-25 52.284027 53.798351 51.746044 52.802086 52.802086 9232100
Open High Low Close Adj Close Volume
2004-08-19 49.813286 51.835709 47.800831 49.982655 49.982655 44871300
2004-08-20 50.316402 54.336334 50.062355 53.952770 53.952770 22942800
2004-08-23 55.168217 56.528118 54.321388 54.495735 54.495735 18342800
2004-08-24 55.412300 55.591629 51.591621 52.239193 52.239193 15319700
2004-08-25 52.284027 53.798351 51.746044 52.802086 52.802086 9232100
2004-08-26 52.279045 53.773445 52.134586 53.753517 53.753517 7128600
2004-08-27 53.848164 54.107193 52.647663 52.876804 52.876804 6241200
2004-08-30 52.443428 52.548038 50.814533 50.814533 50.814533 5221400
2004-08-31 50.958992 51.661362 50.889256 50.993862 50.993862 4941200
2004-09-01 51.158245 51.292744 49.648903 49.937820 49.937820 9181600
2004-09-02 49.409801 50.993862 49.285267 50.565468 50.565468 15190400
2004-09-03 50.286514 50.680038 49.474556 49.818268 49.818268 5176800
2004-09-07 50.316402 50.809555 49.619015 50.600338 50.600338 5875200
2004-09-08 50.181908 51.322632 50.062355 50.958992 50.958992 5009200
2004-09-09 51.073563 51.163227 50.311420 50.963974 50.963974 4080900
2004-09-10 50.610302 53.081039 50.460861 52.468334 52.468334 8740200
2004-09-13 53.115910 54.002586 53.031227 53.549286 53.549286 7881300
2004-09-14 53.524376 55.790882 53.195610 55.536835 55.536835 10880300
2004-09-15 55.073570 56.901718 54.894241 55.790882 55.790882 10763900
2004-09-16 55.960247 57.683788 55.616535 56.772205 56.772205 9310200
2004-09-17 56.996365 58.525631 56.562988 58.525631 58.525631 9517400
2004-09-20 58.256641 60.572956 58.166977 59.457142 59.457142 10679200
2004-09-21 59.681301 59.985161 58.535595 58.699978 58.699978 7263000
2004-09-22 58.480801 59.611561 58.186901 58.968971 58.968971 7617100
2004-09-23 59.198112 61.086033 58.291508 60.184414 60.184414 8576100
2004-09-24 60.244190 61.818291 59.656395 59.691261 59.691261 9166700
2004-09-27 59.556767 60.214302 58.680054 58.909195 58.909195 7099600
2004-09-28 60.423519 63.462128 59.880554 63.193138 63.193138 17009400
2004-09-29 63.113434 67.257904 62.879314 65.295258 65.295258 30661400
2004-09-30 64.707458 65.902977 64.259140 64.558022 64.558022 13823300
... ... ... ... ... ... ...
2017-06-08 982.349976 984.570007 977.200012 983.409973 983.409973 1481900
2017-06-09 984.500000 984.500000 935.630005 949.830017 949.830017 3309400
2017-06-12 939.559998 949.354980 915.232971 942.900024 942.900024 3763500
2017-06-13 951.909973 959.979980 944.090027 953.400024 953.400024 2013300
2017-06-14 959.919983 961.150024 942.250000 950.760010 950.760010 1489700
2017-06-15 933.969971 943.338989 924.440002 942.309998 942.309998 2133100
2017-06-16 940.000000 942.039978 931.594971 939.780029 939.780029 3094700
2017-06-19 949.960022 959.989990 949.049988 957.369995 957.369995 1533300
2017-06-20 957.520020 961.619995 950.010010 950.630005 950.630005 1126000
2017-06-21 953.640015 960.099976 950.760010 959.450012 959.450012 1202200
2017-06-22 958.700012 960.719971 954.549988 957.090027 957.090027 941400
2017-06-23 956.830017 966.000000 954.200012 965.590027 965.590027 1527900
2017-06-26 969.900024 973.309998 950.789978 952.270020 952.270020 1598400
2017-06-27 942.460022 948.289978 926.849976 927.330017 927.330017 2579900
2017-06-28 929.000000 942.750000 916.000000 940.489990 940.489990 2721400
2017-06-29 929.919983 931.260010 910.619995 917.789978 917.789978 3299200
2017-06-30 926.049988 926.049988 908.309998 908.729980 908.729980 2065500
2017-07-03 912.179993 913.940002 894.789978 898.700012 898.700012 1709800
2017-07-05 901.760010 914.510010 898.500000 911.710022 911.710022 1813900
2017-07-06 904.119995 914.943970 899.700012 906.690002 906.690002 1424500
2017-07-07 908.849976 921.539978 908.849976 918.590027 918.590027 1637800
2017-07-10 921.770020 930.380005 919.590027 928.799988 928.799988 1192800
2017-07-11 929.539978 931.429993 922.000000 930.090027 930.090027 1113200
2017-07-12 938.679993 946.299988 934.469971 943.830017 943.830017 1532100
2017-07-13 946.289978 954.450012 943.010010 947.159973 947.159973 1294700
2017-07-14 952.000000 956.909973 948.005005 955.989990 955.989990 1053800
2017-07-17 957.000000 960.739990 949.241028 953.419983 953.419983 1165500
2017-07-18 953.000000 968.039978 950.599976 965.400024 965.400024 1154000
2017-07-19 967.840027 973.039978 964.030029 970.890015 970.890015 1224500
2017-07-20 975.000000 975.900024 961.510010 968.150024 968.150024 1616500

3253 rows × 6 columns

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3253 entries, 2004-08-19 to 2017-07-20
Data columns (total 6 columns):
Open         3253 non-null float64
High         3253 non-null float64
Low          3253 non-null float64
Close        3253 non-null float64
Adj Close    3253 non-null float64
Volume       3253 non-null int64
dtypes: float64(5), int64(1)
memory usage: 177.9 KB
Open High Low Close Adj Close Volume
count 3253.000000 3253.000000 3253.000000 3253.000000 3253.000000 3.253000e+03
mean 370.588678 373.854568 366.959060 370.463274 370.463274 8.139070e+06
std 212.537536 213.645163 211.213609 212.542226 212.542226 8.403870e+06
min 49.409801 50.680038 47.800831 49.818268 49.818268 7.900000e+03
25% 225.928162 228.050217 222.984207 224.986694 224.986694 2.743600e+06
50% 292.030396 293.898407 288.538483 291.318054 291.318054 5.374600e+06
75% 531.599976 535.729126 527.810913 532.299988 532.299988 1.081150e+07
max 984.500000 988.250000 977.200012 983.679993 983.679993 8.254150e+07
Open High Low Close Adj Close Volume
2017-07-14 952.000000 956.909973 948.005005 955.989990 955.989990 1053800
2017-07-17 957.000000 960.739990 949.241028 953.419983 953.419983 1165500
2017-07-18 953.000000 968.039978 950.599976 965.400024 965.400024 1154000
2017-07-19 967.840027 973.039978 964.030029 970.890015 970.890015 1224500
2017-07-20 975.000000 975.900024 961.510010 968.150024 968.150024 1616500
DatetimeIndex(['2004-08-19', '2004-08-20', '2004-08-23', '2004-08-24','2004-08-25', '2004-08-26', '2004-08-27', '2004-08-30','2004-08-31', '2004-09-01',...'2017-07-07', '2017-07-10', '2017-07-11', '2017-07-12','2017-07-13', '2017-07-14', '2017-07-17', '2017-07-18','2017-07-19', '2017-07-20'],dtype='datetime64[ns]', name='Date', length=3253, freq=None)
# 日期对应一周的星期几
goog.loc[:,'dow'] = goog.index.dayofweek
# 日期对应的一年的第几天
# goog.loc[:,'doy'] = goog.index.dayofyear
Open High Low Close Adj Close Volume dow
2004-08-19 49.813286 51.835709 47.800831 49.982655 49.982655 44871300 3
2004-08-20 50.316402 54.336334 50.062355 53.952770 53.952770 22942800 4
2004-08-23 55.168217 56.528118 54.321388 54.495735 54.495735 18342800 0
2004-08-24 55.412300 55.591629 51.591621 52.239193 52.239193 15319700 1
2004-08-25 52.284027 53.798351 51.746044 52.802086 52.802086 9232100 2
2004-08-26 52.279045 53.773445 52.134586 53.753517 53.753517 7128600 3
2004-08-27 53.848164 54.107193 52.647663 52.876804 52.876804 6241200 4
2004-08-30 52.443428 52.548038 50.814533 50.814533 50.814533 5221400 0
2004-08-31 50.958992 51.661362 50.889256 50.993862 50.993862 4941200 1
2004-09-01 51.158245 51.292744 49.648903 49.937820 49.937820 9181600 2
2004-09-02 49.409801 50.993862 49.285267 50.565468 50.565468 15190400 3
2004-09-03 50.286514 50.680038 49.474556 49.818268 49.818268 5176800 4
2004-09-07 50.316402 50.809555 49.619015 50.600338 50.600338 5875200 1
2004-09-08 50.181908 51.322632 50.062355 50.958992 50.958992 5009200 2
2004-09-09 51.073563 51.163227 50.311420 50.963974 50.963974 4080900 3
2004-09-10 50.610302 53.081039 50.460861 52.468334 52.468334 8740200 4
2004-09-13 53.115910 54.002586 53.031227 53.549286 53.549286 7881300 0
2004-09-14 53.524376 55.790882 53.195610 55.536835 55.536835 10880300 1
2004-09-15 55.073570 56.901718 54.894241 55.790882 55.790882 10763900 2
2004-09-16 55.960247 57.683788 55.616535 56.772205 56.772205 9310200 3
2004-09-17 56.996365 58.525631 56.562988 58.525631 58.525631 9517400 4
2004-09-20 58.256641 60.572956 58.166977 59.457142 59.457142 10679200 0
2004-09-21 59.681301 59.985161 58.535595 58.699978 58.699978 7263000 1
2004-09-22 58.480801 59.611561 58.186901 58.968971 58.968971 7617100 2
2004-09-23 59.198112 61.086033 58.291508 60.184414 60.184414 8576100 3
2004-09-24 60.244190 61.818291 59.656395 59.691261 59.691261 9166700 4
2004-09-27 59.556767 60.214302 58.680054 58.909195 58.909195 7099600 0
2004-09-28 60.423519 63.462128 59.880554 63.193138 63.193138 17009400 1
2004-09-29 63.113434 67.257904 62.879314 65.295258 65.295258 30661400 2
2004-09-30 64.707458 65.902977 64.259140 64.558022 64.558022 13823300 3
... ... ... ... ... ... ... ...
2017-06-08 982.349976 984.570007 977.200012 983.409973 983.409973 1481900 3
2017-06-09 984.500000 984.500000 935.630005 949.830017 949.830017 3309400 4
2017-06-12 939.559998 949.354980 915.232971 942.900024 942.900024 3763500 0
2017-06-13 951.909973 959.979980 944.090027 953.400024 953.400024 2013300 1
2017-06-14 959.919983 961.150024 942.250000 950.760010 950.760010 1489700 2
2017-06-15 933.969971 943.338989 924.440002 942.309998 942.309998 2133100 3
2017-06-16 940.000000 942.039978 931.594971 939.780029 939.780029 3094700 4
2017-06-19 949.960022 959.989990 949.049988 957.369995 957.369995 1533300 0
2017-06-20 957.520020 961.619995 950.010010 950.630005 950.630005 1126000 1
2017-06-21 953.640015 960.099976 950.760010 959.450012 959.450012 1202200 2
2017-06-22 958.700012 960.719971 954.549988 957.090027 957.090027 941400 3
2017-06-23 956.830017 966.000000 954.200012 965.590027 965.590027 1527900 4
2017-06-26 969.900024 973.309998 950.789978 952.270020 952.270020 1598400 0
2017-06-27 942.460022 948.289978 926.849976 927.330017 927.330017 2579900 1
2017-06-28 929.000000 942.750000 916.000000 940.489990 940.489990 2721400 2
2017-06-29 929.919983 931.260010 910.619995 917.789978 917.789978 3299200 3
2017-06-30 926.049988 926.049988 908.309998 908.729980 908.729980 2065500 4
2017-07-03 912.179993 913.940002 894.789978 898.700012 898.700012 1709800 0
2017-07-05 901.760010 914.510010 898.500000 911.710022 911.710022 1813900 2
2017-07-06 904.119995 914.943970 899.700012 906.690002 906.690002 1424500 3
2017-07-07 908.849976 921.539978 908.849976 918.590027 918.590027 1637800 4
2017-07-10 921.770020 930.380005 919.590027 928.799988 928.799988 1192800 0
2017-07-11 929.539978 931.429993 922.000000 930.090027 930.090027 1113200 1
2017-07-12 938.679993 946.299988 934.469971 943.830017 943.830017 1532100 2
2017-07-13 946.289978 954.450012 943.010010 947.159973 947.159973 1294700 3
2017-07-14 952.000000 956.909973 948.005005 955.989990 955.989990 1053800 4
2017-07-17 957.000000 960.739990 949.241028 953.419983 953.419983 1165500 0
2017-07-18 953.000000 968.039978 950.599976 965.400024 965.400024 1154000 1
2017-07-19 967.840027 973.039978 964.030029 970.890015 970.890015 1224500 2
2017-07-20 975.000000 975.900024 961.510010 968.150024 968.150024 1616500 3

3253 rows × 7 columns

goog.loc[:,'doy'] = goog.index.dayofyear
Open High Low Close Adj Close Volume dow doy
2004-08-19 49.813286 51.835709 47.800831 49.982655 49.982655 44871300 3 232
2004-08-20 50.316402 54.336334 50.062355 53.952770 53.952770 22942800 4 233
2004-08-23 55.168217 56.528118 54.321388 54.495735 54.495735 18342800 0 236
2004-08-24 55.412300 55.591629 51.591621 52.239193 52.239193 15319700 1 237
2004-08-25 52.284027 53.798351 51.746044 52.802086 52.802086 9232100 2 238
%matplotlib inline
<matplotlib.axes._subplots.AxesSubplot at 0x244022eb908>

nvda = pd.read_csv('data/NVDA.csv', index_col=0, parse_dates=['Date'])
Open High Low Close Adj Close Volume
1999-01-22 1.750000 1.953125 1.552083 1.640625 1.523430 67867200
1999-01-25 1.770833 1.833333 1.640625 1.812500 1.683028 12762000
1999-01-26 1.833333 1.869792 1.645833 1.671875 1.552448 8580000
1999-01-27 1.677083 1.718750 1.583333 1.666667 1.547611 6109200
1999-01-28 1.666667 1.677083 1.651042 1.661458 1.542776 5688000
1999-01-29 1.661458 1.666667 1.583333 1.583333 1.470231 6100800
1999-02-01 1.583333 1.625000 1.583333 1.614583 1.499249 3867600
1999-02-02 1.583333 1.625000 1.442708 1.489583 1.383178 6602400
1999-02-03 1.468750 1.541667 1.458333 1.520833 1.412196 1878000
1999-02-04 1.541667 1.645833 1.520833 1.604167 1.489577 4548000
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4654 entries, 1999-01-22 to 2017-07-20
Data columns (total 6 columns):
Open         4654 non-null float64
High         4654 non-null float64
Low          4654 non-null float64
Close        4654 non-null float64
Adj Close    4654 non-null float64
Volume       4654 non-null int64
dtypes: float64(5), int64(1)
memory usage: 254.5 KB
Open High Low Close Adj Close Volume
count 4654.000000 4654.000000 4654.000000 4654.000000 4654.000000 4.654000e+03
mean 18.872888 19.222090 18.513574 18.879564 18.091126 1.632563e+07
std 22.025278 22.346668 21.662627 22.048935 22.093697 1.204002e+07
min 1.395833 1.421875 1.333333 1.364583 1.267107 4.920000e+05
25% 8.510000 8.755000 8.245261 8.505000 7.897462 8.721475e+06
50% 13.810000 14.090000 13.500000 13.814167 12.832797 1.373830e+07
75% 19.770000 20.129999 19.505000 19.789167 18.774976 2.041408e+07
max 166.330002 168.500000 164.610001 167.500000 167.500000 2.307714e+08
%matplotlib inline
<matplotlib.axes._subplots.AxesSubplot at 0x244025689e8>

nvda['Open'].plot(grid=True)
# 画柱状图
# nvda['Open'].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x24402568908>

nvda.index > '2016-01-01'
array([False, False, False, ...,  True,  True,  True])
nvda.index < '2016-04-01'
array([ True,  True,  True, ..., False, False, False])
# 条件与或非
# | 表示或
# & 表示且
# ~ 表示非
(nvda.index > '2016-01-01') & (nvda.index < '2016-02-01')
array([False, False, False, ..., False, False, False])
nvda[(nvda.index > '2016-01-01') & (nvda.index < '2016-02-01')]['Open'].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x24401f3cd68>

nvda[(nvda.index > '2016-01-01') & (nvda.index < '2016-02-01')]['Open'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x244026c24a8>

Open High Low Close Adj Close Volume
1999-01-22 1.750000 1.953125 1.552083 1.640625 1.523430 67867200
1999-01-25 1.770833 1.833333 1.640625 1.812500 1.683028 12762000
1999-01-26 1.833333 1.869792 1.645833 1.671875 1.552448 8580000
1999-01-27 1.677083 1.718750 1.583333 1.666667 1.547611 6109200
1999-01-28 1.666667 1.677083 1.651042 1.661458 1.542776 5688000
Open    18.872888
High    19.222090
dtype: float64
nvda[(nvda.index >= '2016-01-01') & (nvda.index <= '2016-06-30')].describe()
Open High Low Close Adj Close Volume
count 125.000000 125.000000 125.000000 125.000000 125.000000 1.250000e+02
mean 35.931360 36.416000 35.498880 36.006720 35.705421 9.802855e+06
std 6.722128 6.771410 6.768863 6.798316 6.816078 5.527803e+06
min 24.780001 25.559999 24.750000 25.219999 24.922132 4.382600e+06
25% 31.270000 31.870001 30.820000 31.520000 31.147726 6.919400e+06
50% 35.299999 35.570000 34.840000 35.389999 35.099426 8.707300e+06
75% 42.000000 42.799999 41.459999 42.279999 41.932854 1.122720e+07
max 47.759998 48.540001 47.650002 48.490002 48.216755 5.275640e+07
apt cars bonus expense color
Beijing 55000.0 350000.0 50000 100000
Chongqing NaN 150000.0 40000 100000
Guangzhou 45000.0 250000.0 40000 100000
Hangzhou 45000.0 NaN 40000 100000
ShangHai 60000.0 400000.0 40000 100000
Shenzhen 70000.0 300000.0 40000 100000 绿
Suzhou NaN NaN 40000 100000
Tianjin NaN 200000.0 40000 100000
!head -10 my_df.csv
'head' 不是内部或外部命令,也不是可运行的程序
df.to_csv('data/my_df.csv', index=False)
!head -10 my_df.csv
'head' 不是内部或外部命令,也不是可运行的程序
  1. 史诗级动态规划 教程 by hch
  2. 高中数学知识点归纳总结三角函数与解三角形
  3. 献给面试学生 关键字const是什么意思 ESP(译者:Embedded Systems Programming) --Dan Saks概括了const的所有用法
  4. Mac mini 2018 win10 外接显卡终极教程
  5. 量子精密测量技术大突破,应用正当时,国仪量子成果斐然
  6. java cap 反编译_应用 JD-Eclipse 插件实现 RFT 中 .class 文件的反向编译
  7. JAVA.犹抱琵琶半遮面
  8. 怎么使qq推广效果最大化
  9. Python模拟银行管理系统(面向对象)# 谭子
  10. 2019 ICPC 沈阳站 游记