数据聚合与分组运算
GroupBy技术
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
df = DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
df
|
key1
|
key2
|
data1
|
data2
|
0
|
a
|
one
|
-0.074122
|
-0.571432
|
1
|
a
|
two
|
0.347874
|
-0.794645
|
2
|
b
|
one
|
0.399766
|
-0.596056
|
3
|
b
|
two
|
1.209857
|
-0.266257
|
4
|
a
|
one
|
-0.001175
|
0.180895
|
#根据key1进行分组,并计算data1列的平均值。
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8450910>
grouped.mean()
key1
a 0.090859
b 0.804812
Name: data1, dtype: float64
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means
key1 key2
a     one    -0.037649
      two     0.347874
b     one     0.399766
      two     1.209857
Name: data1, dtype: float64
means.unstack()
key2
|
one
|
two
|
key1
|
|
|
a
|
-0.037649
|
0.347874
|
b
|
0.399766
|
1.209857
|
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
California  2005    0.347874
            2006    0.399766
Ohio        2005    0.567867
            2006   -0.001175
Name: data1, dtype: float64
df.groupby('key1').mean()
|
data1
|
data2
|
key1
|
|
|
a
|
0.090859
|
-0.395061
|
b
|
0.804812
|
-0.431157
|
df.groupby(['key1','key2']).mean()
|
|
data1
|
data2
|
key1
|
key2
|
|
|
a
|
one
|
-0.037649
|
-0.195268
|
two
|
0.347874
|
-0.794645
|
b
|
one
|
0.399766
|
-0.596056
|
two
|
1.209857
|
-0.266257
|
#GroupBy的size方法,可以返回一个含有分组大小的Series。目前,分组键中的任何缺失值都会被排除在结果之外。
df.groupby(['key1','key2']).size()
key1 key2
a one 2two 1
b one 1two 1
dtype: int64
对分组进行迭代
# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs.
# (The original transcript fused the loop body onto one line, which is
# invalid Python syntax — restored here.)
for name, group in df.groupby('key1'):
    print(name)
    print(group)
akey1 key2 data1 data2
0 a one -0.074122 -0.571432
1 a two 0.347874 -0.794645
4 a one -0.001175 0.180895
bkey1 key2 data1 data2
2 b one 0.399766 -0.596056
3 b two 1.209857 -0.266257
# With multiple grouping keys, the group key is a tuple — unpack it.
# (The original transcript fused the loop body onto one line, which is
# invalid Python syntax — restored here.)
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(k1, k2)
    print(group)
a onekey1 key2 data1 data2
0 a one -0.074122 -0.571432
4 a one -0.001175 0.180895
a twokey1 key2 data1 data2
1 a two 0.347874 -0.794645
b onekey1 key2 data1 data2
2 b one 0.399766 -0.596056
b twokey1 key2 data1 data2
3 b two 1.209857 -0.266257
pieces = dict(list(df.groupby('key1')))
pieces['b']
|
key1
|
key2
|
data1
|
data2
|
2
|
b
|
one
|
0.399766
|
-0.596056
|
3
|
b
|
two
|
1.209857
|
-0.266257
|
df.dtypes
key1 object
key2 object
data1 float64
data2 float64
dtype: object
grouped = df.groupby(df.dtypes,axis=1)
dict(list(grouped))
{dtype('float64'): data1 data20 -0.074122 -0.5714321 0.347874 -0.7946452 0.399766 -0.5960563 1.209857 -0.2662574 -0.001175 0.180895,dtype('O'): key1 key20 a one1 a two2 b one3 b two4 a one}
选取一个或一组列
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CCCF3F55A0>
df.groupby(['key1','key2'])[['data2']].mean()
|
|
data2
|
key1
|
key2
|
|
a
|
one
|
-0.195268
|
two
|
-0.794645
|
b
|
one
|
-0.596056
|
two
|
-0.266257
|
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8452DA0>
s_grouped.mean()
key1 key2
a one -0.195268two -0.794645
b one -0.596056two -0.266257
Name: data2, dtype: float64
通过字典或Series进行分组
people = DataFrame(np.random.randn(5,5),columns=['a','b','c','d','e'],index=['Joe','Steve','Wes','Jim','Travis'])
people.loc[2:3,['b','c']] = np.nan#添加几个NA值
people
C:\windows\ FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead.people.loc[2:3,['b','c']] = np.nan#添加几个NA值
|
a
|
b
|
c
|
d
|
e
|
Joe
|
0.309327
|
1.658107
|
1.146959
|
-0.123471
|
0.159285
|
Steve
|
1.380735
|
-0.703245
|
0.158134
|
-1.602958
|
1.455772
|
Wes
|
-0.766580
|
NaN
|
NaN
|
0.074462
|
1.430541
|
Jim
|
-0.615666
|
2.578830
|
-0.002766
|
0.885567
|
-0.375239
|
Travis
|
-0.033534
|
1.158113
|
0.637327
|
1.473547
|
0.373215
|
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
|
blue
|
red
|
Joe
|
1.023488
|
2.126719
|
Steve
|
-1.444824
|
2.133263
|
Wes
|
0.074462
|
0.663960
|
Jim
|
0.882800
|
1.587925
|
Travis
|
2.110874
|
1.497794
|
map_series = Series(mapping)
map_series
a red
b red
c blue
d blue
e red
f orange
dtype: object
people.groupby(map_series,axis=1).count()
|
blue
|
red
|
Joe
|
2
|
3
|
Steve
|
2
|
3
|
Wes
|
1
|
2
|
Jim
|
2
|
3
|
Travis
|
2
|
3
|
通过函数进行分组
people.groupby(len).sum()
|
a
|
b
|
c
|
d
|
e
|
3
|
-1.072920
|
4.236937
|
1.144193
|
0.836558
|
1.214587
|
5
|
1.380735
|
-0.703245
|
0.158134
|
-1.602958
|
1.455772
|
6
|
-0.033534
|
1.158113
|
0.637327
|
1.473547
|
0.373215
|
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).min()
|
|
a
|
b
|
c
|
d
|
e
|
3
|
one
|
-0.766580
|
1.658107
|
1.146959
|
-0.123471
|
0.159285
|
two
|
-0.615666
|
2.578830
|
-0.002766
|
0.885567
|
-0.375239
|
5
|
one
|
1.380735
|
-0.703245
|
0.158134
|
-1.602958
|
1.455772
|
6
|
two
|
-0.033534
|
1.158113
|
0.637327
|
1.473547
|
0.373215
|
根据索引级别分组
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])
hier_df = DataFrame(np.random.randn(4,5),columns=columns)
hier_df
cty
|
US
|
JP
|
tenor
|
1
|
3
|
5
|
1
|
3
|
0
|
0.971689
|
-0.207027
|
0.641528
|
1.197729
|
-0.800907
|
1
|
0.906871
|
-0.087288
|
0.204273
|
-0.009374
|
0.637842
|
2
|
0.649755
|
-0.800055
|
-0.057130
|
-1.087200
|
0.435762
|
3
|
-0.618737
|
0.325816
|
-0.702310
|
-0.519860
|
-0.101653
|
hier_df.groupby(level='cty',axis=1).count()
cty
|
JP
|
US
|
0
|
2
|
3
|
1
|
2
|
3
|
2
|
2
|
3
|
3
|
2
|
3
|
数据聚合
grouped = df.groupby('key1')
#如果传入的百分位上没有值,则quantile会进行线性插值
grouped['data1'].quantile(0.9)
key1
a 0.278064
b 1.128848
Name: data1, dtype: float64
def peak_to_peak(arr):
    """Return the spread of `arr`: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peak_to_peak)
C:\windows\TFutureWarning: ['key2'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.grouped.agg(peak_to_peak)
|
data1
|
data2
|
key1
|
|
|
a
|
0.421996
|
0.975541
|
b
|
0.810090
|
0.329799
|
grouped.describe()
|
data1
|
data2
|
|
count
|
mean
|
std
|
min
|
25%
|
50%
|
75%
|
max
|
count
|
mean
|
std
|
min
|
25%
|
50%
|
75%
|
max
|
key1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
a
|
3.0
|
0.090859
|
0.22555
|
-0.074122
|
-0.037649
|
-0.001175
|
0.173349
|
0.347874
|
3.0
|
-0.395061
|
0.511126
|
-0.794645
|
-0.683039
|
-0.571432
|
-0.195268
|
0.180895
|
b
|
2.0
|
0.804812
|
0.57282
|
0.399766
|
0.602289
|
0.804812
|
1.007334
|
1.209857
|
2.0
|
-0.431157
|
0.233203
|
-0.596056
|
-0.513606
|
-0.431157
|
-0.348707
|
-0.266257
|
#有个知识点
import matplotlib.pyplot as plt
from pylab import *
img = plt.imread('经过优化的GroupBy的方法.png')
imshow(img)
tips = pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\examples\\tips.csv")
#添加“小费占总额百分比”的列
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]
|
total_bill
|
tip
|
smoker
|
day
|
time
|
size
|
tip_pct
|
0
|
16.99
|
1.01
|
No
|
Sun
|
Dinner
|
2
|
0.059447
|
1
|
10.34
|
1.66
|
No
|
Sun
|
Dinner
|
3
|
0.160542
|
2
|
21.01
|
3.50
|
No
|
Sun
|
Dinner
|
3
|
0.166587
|
3
|
23.68
|
3.31
|
No
|
Sun
|
Dinner
|
2
|
0.139780
|
4
|
24.59
|
3.61
|
No
|
Sun
|
Dinner
|
4
|
0.146808
|
5
|
25.29
|
4.71
|
No
|
Sun
|
Dinner
|
4
|
0.186240
|
面向列的多函数应用
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day smoker
Fri No 0.151650Yes 0.174783
Sat No 0.158048Yes 0.147906
Sun No 0.160113Yes 0.187250
Thur No 0.160298Yes 0.163863
Name: tip_pct, dtype: float64
grouped_pct.agg(['mean','std',peak_to_peak])
|
|
mean
|
std
|
peak_to_peak
|
day
|
smoker
|
|
|
|
Fri
|
No
|
0.151650
|
0.028123
|
0.067349
|
Yes
|
0.174783
|
0.051293
|
0.159925
|
Sat
|
No
|
0.158048
|
0.039767
|
0.235193
|
Yes
|
0.147906
|
0.061375
|
0.290095
|
Sun
|
No
|
0.160113
|
0.042347
|
0.193226
|
Yes
|
0.187250
|
0.154134
|
0.644685
|
Thur
|
No
|
0.160298
|
0.038774
|
0.193350
|
Yes
|
0.163863
|
0.039389
|
0.151240
|
#由(name,function)组成的列表,第一个元素会被用作列名
grouped_pct.agg([('foo','mean'),('bar',np.std)])
|
|
foo
|
bar
|
day
|
smoker
|
|
|
Fri
|
No
|
0.151650
|
0.028123
|
Yes
|
0.174783
|
0.051293
|
Sat
|
No
|
0.158048
|
0.039767
|
Yes
|
0.147906
|
0.061375
|
Sun
|
No
|
0.160113
|
0.042347
|
Yes
|
0.187250
|
0.154134
|
Thur
|
No
|
0.160298
|
0.038774
|
Yes
|
0.163863
|
0.039389
|
functions = ['count','mean','max']
result = grouped['tip_pct','total_bill'].agg(functions)
result
C:\windowFutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.result = grouped['tip_pct','total_bill'].agg(functions)
|
|
tip_pct
|
total_bill
|
|
|
count
|
mean
|
max
|
count
|
mean
|
max
|
day
|
smoker
|
|
|
|
|
|
|
Fri
|
No
|
4
|
0.151650
|
0.187735
|
4
|
18.420000
|
22.75
|
Yes
|
15
|
0.174783
|
0.263480
|
15
|
16.813333
|
40.17
|
Sat
|
No
|
45
|
0.158048
|
0.291990
|
45
|
19.661778
|
48.33
|
Yes
|
42
|
0.147906
|
0.325733
|
42
|
21.276667
|
50.81
|
Sun
|
No
|
57
|
0.160113
|
0.252672
|
57
|
20.506667
|
48.17
|
Yes
|
19
|
0.187250
|
0.710345
|
19
|
24.120000
|
45.35
|
Thur
|
No
|
45
|
0.160298
|
0.266312
|
45
|
17.113111
|
41.19
|
Yes
|
17
|
0.163863
|
0.241255
|
17
|
19.190588
|
43.11
|
result['tip_pct']
|
|
count
|
mean
|
max
|
day
|
smoker
|
|
|
|
Fri
|
No
|
4
|
0.151650
|
0.187735
|
Yes
|
15
|
0.174783
|
0.263480
|
Sat
|
No
|
45
|
0.158048
|
0.291990
|
Yes
|
42
|
0.147906
|
0.325733
|
Sun
|
No
|
57
|
0.160113
|
0.252672
|
Yes
|
19
|
0.187250
|
0.710345
|
Thur
|
No
|
45
|
0.160298
|
0.266312
|
Yes
|
17
|
0.163863
|
0.241255
|
ftuples = [('Durchschnitt','mean'),('Abweichung',np.var)]
grouped['tip_pct','total_bill'].agg(ftuples)
C:\windowsFutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.grouped['tip_pct','total_bill'].agg(ftuples)
|
|
tip_pct
|
total_bill
|
|
|
Durchschnitt
|
Abweichung
|
Durchschnitt
|
Abweichung
|
day
|
smoker
|
|
|
|
|
Fri
|
No
|
0.151650
|
0.000791
|
18.420000
|
25.596333
|
Yes
|
0.174783
|
0.002631
|
16.813333
|
82.562438
|
Sat
|
No
|
0.158048
|
0.001581
|
19.661778
|
79.908965
|
Yes
|
0.147906
|
0.003767
|
21.276667
|
101.387535
|
Sun
|
No
|
0.160113
|
0.001793
|
20.506667
|
66.099980
|
Yes
|
0.187250
|
0.023757
|
24.120000
|
109.046044
|
Thur
|
No
|
0.160298
|
0.001503
|
17.113111
|
59.625081
|
Yes
|
0.163863
|
0.001551
|
19.190588
|
69.808518
|
#对不同的列应用不同的函数
grouped.agg({'tip':np.max,'size':'sum'})
|
|
tip
|
size
|
day
|
smoker
|
|
|
Fri
|
No
|
3.50
|
9
|
Yes
|
4.73
|
31
|
Sat
|
No
|
9.00
|
115
|
Yes
|
10.00
|
104
|
Sun
|
No
|
6.00
|
167
|
Yes
|
6.50
|
49
|
Thur
|
No
|
6.70
|
112
|
Yes
|
5.00
|
40
|
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
|
|
tip_pct
|
size
|
|
|
min
|
max
|
mean
|
std
|
sum
|
day
|
smoker
|
|
|
|
|
|
Fri
|
No
|
0.120385
|
0.187735
|
0.151650
|
0.028123
|
9
|
Yes
|
0.103555
|
0.263480
|
0.174783
|
0.051293
|
31
|
Sat
|
No
|
0.056797
|
0.291990
|
0.158048
|
0.039767
|
115
|
Yes
|
0.035638
|
0.325733
|
0.147906
|
0.061375
|
104
|
Sun
|
No
|
0.059447
|
0.252672
|
0.160113
|
0.042347
|
167
|
Yes
|
0.065660
|
0.710345
|
0.187250
|
0.154134
|
49
|
Thur
|
No
|
0.072961
|
0.266312
|
0.160298
|
0.038774
|
112
|
Yes
|
0.090014
|
0.241255
|
0.163863
|
0.039389
|
40
|
以“无索引”的形式返回聚合数据
tips.groupby(['day','smoker'],as_index=False).mean()
|
day
|
smoker
|
total_bill
|
tip
|
size
|
tip_pct
|
0
|
Fri
|
No
|
18.420000
|
2.812500
|
2.250000
|
0.151650
|
1
|
Fri
|
Yes
|
16.813333
|
2.714000
|
2.066667
|
0.174783
|
2
|
Sat
|
No
|
19.661778
|
3.102889
|
2.555556
|
0.158048
|
3
|
Sat
|
Yes
|
21.276667
|
2.875476
|
2.476190
|
0.147906
|
4
|
Sun
|
No
|
20.506667
|
3.167895
|
2.929825
|
0.160113
|
5
|
Sun
|
Yes
|
24.120000
|
3.516842
|
2.578947
|
0.187250
|
6
|
Thur
|
No
|
17.113111
|
2.673778
|
2.488889
|
0.160298
|
7
|
Thur
|
Yes
|
19.190588
|
3.030000
|
2.352941
|
0.163863
|
分组级运算和转换
#添加一个用于存放各索引分组平均值的列
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
|
mean_data1
|
mean_data2
|
key1
|
|
|
a
|
0.090859
|
-0.395061
|
b
|
0.804812
|
-0.431157
|
pd.merge(df,k1_means,left_on='key1',right_index=True)
|
key1
|
key2
|
data1
|
data2
|
mean_data1
|
mean_data2
|
0
|
a
|
one
|
-0.074122
|
-0.571432
|
0.090859
|
-0.395061
|
1
|
a
|
two
|
0.347874
|
-0.794645
|
0.090859
|
-0.395061
|
4
|
a
|
one
|
-0.001175
|
0.180895
|
0.090859
|
-0.395061
|
2
|
b
|
one
|
0.399766
|
-0.596056
|
0.804812
|
-0.431157
|
3
|
b
|
two
|
1.209857
|
-0.266257
|
0.804812
|
-0.431157
|
key = ['one','two','one','two','one']
people.groupby(key).mean()
|
a
|
b
|
c
|
d
|
e
|
one
|
-0.163596
|
1.408110
|
0.892143
|
0.474846
|
0.654347
|
two
|
0.382534
|
0.937792
|
0.077684
|
-0.358695
|
0.540267
|
#transform会将一个函数应用到各个分组
people.groupby(key).transform(np.mean)
|
a
|
b
|
c
|
d
|
e
|
Joe
|
-0.163596
|
1.408110
|
0.892143
|
0.474846
|
0.654347
|
Steve
|
0.382534
|
0.937792
|
0.077684
|
-0.358695
|
0.540267
|
Wes
|
-0.163596
|
1.408110
|
0.892143
|
0.474846
|
0.654347
|
Jim
|
0.382534
|
0.937792
|
0.077684
|
-0.358695
|
0.540267
|
Travis
|
-0.163596
|
1.408110
|
0.892143
|
0.474846
|
0.654347
|
#从各组中减去平均值
def demean(arr):
    """Center `arr` on zero by subtracting its mean from every element."""
    center = arr.mean()
    return arr - center
demeaned = people.groupby(key).transform(demean)
demeaned
|
a
|
b
|
c
|
d
|
e
|
Joe
|
0.472923
|
0.249997
|
0.254816
|
-0.598317
|
-0.495062
|
Steve
|
0.998201
|
-1.641038
|
0.080450
|
-1.244262
|
0.915506
|
Wes
|
-0.602985
|
NaN
|
NaN
|
-0.400384
|
0.776194
|
Jim
|
-0.998201
|
1.641038
|
-0.080450
|
1.244262
|
-0.915506
|
Travis
|
0.130062
|
-0.249997
|
-0.254816
|
0.998701
|
-0.281132
|
demeaned.groupby(key).mean()
|
a
|
b
|
c
|
d
|
e
|
one
|
2.775558e-17
|
0.000000e+00
|
-5.551115e-17
|
7.401487e-17
|
-1.110223e-16
|
two
|
0.000000e+00
|
1.110223e-16
|
-6.938894e-18
|
0.000000e+00
|
-5.551115e-17
|
apply:一般性的“拆分——应用——合并”
#在指定列找到最大值,然后把这个值所在的行选取出来
#将sort_index()改为sort_values()即可
def top(df, n=5, column='tip_pct'):
    """Return the n rows of `df` with the largest values in `column`.

    Rows come back sorted in ascending order of `column`.
    """
    # tail(n) handles n == 0 correctly: the original slice [-n:] with
    # n == 0 evaluates to [-0:] == [0:], i.e. it returned EVERY row.
    return df.sort_values(by=column).tail(n)
top(tips,n=6)
|
total_bill
|
tip
|
smoker
|
day
|
time
|
size
|
tip_pct
|
109
|
14.31
|
4.00
|
Yes
|
Sat
|
Dinner
|
2
|
0.279525
|
183
|
23.17
|
6.50
|
Yes
|
Sun
|
Dinner
|
4
|
0.280535
|
232
|
11.61
|
3.39
|
No
|
Sat
|
Dinner
|
2
|
0.291990
|
67
|
3.07
|
1.00
|
Yes
|
Sat
|
Dinner
|
1
|
0.325733
|
178
|
9.60
|
4.00
|
Yes
|
Sun
|
Dinner
|
2
|
0.416667
|
172
|
7.25
|
5.15
|
Yes
|
Sun
|
Dinner
|
2
|
0.710345
|
tips.groupby('smoker').apply(top)
|
|
total_bill
|
tip
|
smoker
|
day
|
time
|
size
|
tip_pct
|
smoker
|
|
|
|
|
|
|
|
|
No
|
88
|
24.71
|
5.85
|
No
|
Thur
|
Lunch
|
2
|
0.236746
|
185
|
20.69
|
5.00
|
No
|
Sun
|
Dinner
|
5
|
0.241663
|
51
|
10.29
|
2.60
|
No
|
Sun
|
Dinner
|
2
|
0.252672
|
149
|
7.51
|
2.00
|
No
|
Thur
|
Lunch
|
2
|
0.266312
|
232
|
11.61
|
3.39
|
No
|
Sat
|
Dinner
|
2
|
0.291990
|
Yes
|
109
|
14.31
|
4.00
|
Yes
|
Sat
|
Dinner
|
2
|
0.279525
|
183
|
23.17
|
6.50
|
Yes
|
Sun
|
Dinner
|
4
|
0.280535
|
67
|
3.07
|
1.00
|
Yes
|
Sat
|
Dinner
|
1
|
0.325733
|
178
|
9.60
|
4.00
|
Yes
|
Sun
|
Dinner
|
2
|
0.416667
|
172
|
7.25
|
5.15
|
Yes
|
Sun
|
Dinner
|
2
|
0.710345
|
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
|
|
|
total_bill
|
tip
|
smoker
|
day
|
time
|
size
|
tip_pct
|
smoker
|
day
|
|
|
|
|
|
|
|
|
No
|
Fri
|
94
|
22.75
|
3.25
|
No
|
Fri
|
Dinner
|
2
|
0.142857
|
Sat
|
212
|
48.33
|
9.00
|
No
|
Sat
|
Dinner
|
4
|
0.186220
|
Sun
|
156
|
48.17
|
5.00
|
No
|
Sun
|
Dinner
|
6
|
0.103799
|
Thur
|
142
|
41.19
|
5.00
|
No
|
Thur
|
Lunch
|
5
|
0.121389
|
Yes
|
Fri
|
95
|
40.17
|
4.73
|
Yes
|
Fri
|
Dinner
|
4
|
0.117750
|
Sat
|
170
|
50.81
|
10.00
|
Yes
|
Sat
|
Dinner
|
3
|
0.196812
|
Sun
|
182
|
45.35
|
3.50
|
Yes
|
Sun
|
Dinner
|
3
|
0.077178
|
Thur
|
197
|
43.11
|
5.00
|
Yes
|
Thur
|
Lunch
|
4
|
0.115982
|
result = tips.groupby('smoker')['tip_pct'].describe()
result
|
count
|
mean
|
std
|
min
|
25%
|
50%
|
75%
|
max
|
smoker
|
|
|
|
|
|
|
|
|
No
|
151.0
|
0.159328
|
0.039910
|
0.056797
|
0.136906
|
0.155625
|
0.185014
|
0.291990
|
Yes
|
93.0
|
0.163196
|
0.085119
|
0.035638
|
0.106771
|
0.153846
|
0.195059
|
0.710345
|
result.unstack('smoker')
smoker
count No 151.000000Yes 93.000000
mean No 0.159328Yes 0.163196
std No 0.039910Yes 0.085119
min No 0.056797Yes 0.035638
25% No 0.136906Yes 0.106771
50% No 0.155625Yes 0.153846
75% No 0.185014Yes 0.195059
max No 0.291990Yes 0.710345
dtype: float64
当调用describe之类的方法时,实际上只是应用了以下两条代码的快捷方式:
f = lambda x: x.describe()
grouped.apply(f)
禁止分组键
tips.groupby('smoker',group_keys=False).apply(top)
|
total_bill
|
tip
|
smoker
|
day
|
time
|
size
|
tip_pct
|
88
|
24.71
|
5.85
|
No
|
Thur
|
Lunch
|
2
|
0.236746
|
185
|
20.69
|
5.00
|
No
|
Sun
|
Dinner
|
5
|
0.241663
|
51
|
10.29
|
2.60
|
No
|
Sun
|
Dinner
|
2
|
0.252672
|
149
|
7.51
|
2.00
|
No
|
Thur
|
Lunch
|
2
|
0.266312
|
232
|
11.61
|
3.39
|
No
|
Sat
|
Dinner
|
2
|
0.291990
|
109
|
14.31
|
4.00
|
Yes
|
Sat
|
Dinner
|
2
|
0.279525
|
183
|
23.17
|
6.50
|
Yes
|
Sun
|
Dinner
|
4
|
0.280535
|
67
|
3.07
|
1.00
|
Yes
|
Sat
|
Dinner
|
1
|
0.325733
|
178
|
9.60
|
4.00
|
Yes
|
Sun
|
Dinner
|
2
|
0.416667
|
172
|
7.25
|
5.15
|
Yes
|
Sun
|
Dinner
|
2
|
0.710345
|
分位数和桶分析
frame = DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)})
factor = pd.cut(frame.data1,4)
factor[:10]
0 (-1.448, 0.107]
1 (-1.448, 0.107]
2 (-1.448, 0.107]
3 (-1.448, 0.107]
4 (0.107, 1.663]
5 (0.107, 1.663]
6 (0.107, 1.663]
7 (-1.448, 0.107]
8 (-1.448, 0.107]
9 (0.107, 1.663]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.01, -1.448] < (-1.448, 0.107] < (0.107, 1.663] < (1.663, 3.218]]
def get_stats(group):
    """Summarize a group as a dict of its min, max, count and mean."""
    return {
        'min': group.min(),
        'max': group.max(),
        'count': group.count(),
        'mean': group.mean(),
    }
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
#区间大小相等
|
min
|
max
|
count
|
mean
|
data1
|
|
|
|
|
(-3.01, -1.448]
|
-2.614910
|
2.368046
|
70.0
|
-0.092146
|
(-1.448, 0.107]
|
-2.534962
|
2.783160
|
479.0
|
0.009041
|
(0.107, 1.663]
|
-3.073771
|
2.513553
|
398.0
|
-0.091291
|
(1.663, 3.218]
|
-2.699080
|
2.373634
|
53.0
|
-0.099021
|
#数据点数量相等,使用qcut
#返回分位数编号
grouping = pd.qcut(frame.data1,10,labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
|
min
|
max
|
count
|
mean
|
data1
|
|
|
|
|
0
|
-2.614910
|
2.783160
|
100.0
|
0.006906
|
1
|
-2.534962
|
2.490249
|
100.0
|
-0.101695
|
2
|
-2.015862
|
2.261854
|
100.0
|
0.084059
|
3
|
-2.250966
|
2.509572
|
100.0
|
-0.000924
|
4
|
-2.068747
|
2.425219
|
100.0
|
0.119523
|
5
|
-2.913492
|
2.032037
|
100.0
|
-0.233505
|
6
|
-2.432055
|
1.983781
|
100.0
|
-0.038541
|
7
|
-2.339164
|
2.046824
|
100.0
|
-0.096358
|
8
|
-3.073771
|
2.235941
|
100.0
|
-0.091584
|
9
|
-2.699080
|
2.513553
|
100.0
|
-0.084895
|
示例:用特定于分组的值填充缺失值
#用平均值填充NA值
s = Series(np.random.randn(6))
s[::2] = np.nan
s
0 NaN
1 0.209858
2 NaN
3 1.379023
4 NaN
5 -0.743300
dtype: float64
s.fillna(s.mean())
0 0.281860
1 0.209858
2 0.281860
3 1.379023
4 0.281860
5 -0.743300
dtype: float64
states = ['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']
group_key = ['East']*4+['West']*4
data = Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']] = np.nan
data
Ohio 0.155978
New York -0.133767
Vermont NaN
Florida -0.765162
Oregon 0.682524
Nevada NaN
California 0.730390
Idaho NaN
dtype: float64
data.groupby(group_key).mean()
East -0.247650
West 0.706457
dtype: float64
fill_mean = lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Ohio 0.155978
New York -0.133767
Vermont -0.247650
Florida -0.765162
Oregon 0.682524
Nevada 0.706457
California 0.730390
Idaho 0.706457
dtype: float64
fill_values = {'East':0.5,'West':-1}
fill_func = lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
Ohio 0.155978
New York -0.133767
Vermont 0.500000
Florida -0.765162
Oregon 0.682524
Nevada -1.000000
California 0.730390
Idaho -1.000000
dtype: float64
示例:随机采样和排列
抽取的一个办法:选取np.random.permutation(N)的前K个元素,其中N为完整数据的大小,K为期望的样本大小。
#红桃(Hearts)、黑桃(Spades)、梅花(Clubs)、方片(Diamonds)
suits = ['H', 'S', 'C', 'D']  # Hearts, Spades, Clubs, Diamonds
# In Python 2, range() returned a list, so two ranges could be added
# directly (e.g. range(5) + range(10)); in Python 3, range() is a lazy
# class, hence the explicit list() conversions before concatenating.
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in suits:  # reuse `suits` instead of repeating the literal list
    cards.extend(str(num) + suit for num in base_names)
deck = Series(card_val, index=cards)
deck[:13]
AH 1
2H 2
3H 3
4H 4
5H 5
6H 6
7H 7
8H 8
9H 9
10H 10
JH 10
KH 10
QH 10
dtype: int64
def draw(deck, n=5):
    """Pick n entries from `deck` at random, without replacement."""
    shuffled = np.random.permutation(len(deck))
    return deck.take(shuffled[:n])
draw(deck)
7H 7
4D 4
8H 8
QC 10
4S 4
dtype: int64
#从每种花色中随机抽取两张牌
get_suit = lambda card:card[-1]#只要最后一个字母
deck.groupby(get_suit).apply(draw,n=2)
C AC 1JC 10
D 5D 58D 8
H 10H 10JH 10
S 9S 95S 5
dtype: int64
#另一种办法
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)
10C 10
AC 1
KD 10
10D 10
3H 3
9H 9
5S 5
8S 8
dtype: int64
示例:分组加权平均数和相关系数
df = DataFrame({'category':['a','a','a','a','b','b','b','b'],'data':np.random.randn(8),'weights':np.random.randn(8)})
df
|
category
|
data
|
weights
|
0
|
a
|
0.591317
|
-1.032939
|
1
|
a
|
-0.589692
|
0.436704
|
2
|
a
|
-0.128848
|
2.257153
|
3
|
a
|
-0.774626
|
0.811910
|
4
|
b
|
-2.050679
|
1.144802
|
5
|
b
|
1.216111
|
0.736471
|
6
|
b
|
-0.801366
|
0.139008
|
7
|
b
|
-1.577430
|
-0.576198
|
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
category
a -0.723088
b -0.453212
dtype: float64
# Raw string (r"...") keeps "\p" etc. from being treated as escape
# sequences — invalid escapes raise a SyntaxWarning on modern Python.
# The string value is unchanged.
close_px = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\stock_px.csv",
                       parse_dates=True, index_col=0)
close_px
|
AA
|
AAPL
|
GE
|
IBM
|
JNJ
|
MSFT
|
PEP
|
SPX
|
XOM
|
1990-02-01
|
4.98
|
7.86
|
2.87
|
16.79
|
4.27
|
0.51
|
6.04
|
328.79
|
6.12
|
1990-02-02
|
5.04
|
8.00
|
2.87
|
16.89
|
4.37
|
0.51
|
6.09
|
330.92
|
6.24
|
1990-02-05
|
5.07
|
8.18
|
2.87
|
17.32
|
4.34
|
0.51
|
6.05
|
331.85
|
6.25
|
1990-02-06
|
5.01
|
8.12
|
2.88
|
17.56
|
4.32
|
0.51
|
6.15
|
329.66
|
6.23
|
1990-02-07
|
5.04
|
7.77
|
2.91
|
17.93
|
4.38
|
0.51
|
6.17
|
333.75
|
6.33
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
2011-10-10
|
10.09
|
388.81
|
16.14
|
186.62
|
64.43
|
26.94
|
61.87
|
1194.89
|
76.28
|
2011-10-11
|
10.30
|
400.29
|
16.14
|
185.00
|
63.96
|
27.00
|
60.95
|
1195.54
|
76.27
|
2011-10-12
|
10.05
|
402.19
|
16.40
|
186.12
|
64.33
|
26.96
|
62.70
|
1207.25
|
77.16
|
2011-10-13
|
10.10
|
408.43
|
16.22
|
186.82
|
64.23
|
27.18
|
62.36
|
1203.66
|
76.37
|
2011-10-14
|
10.26
|
422.00
|
16.60
|
190.53
|
64.72
|
27.27
|
62.24
|
1224.58
|
78.11
|
5472 rows × 9 columns
close_px[-4:]
|
AA
|
AAPL
|
GE
|
IBM
|
JNJ
|
MSFT
|
PEP
|
SPX
|
XOM
|
2011-10-11
|
10.30
|
400.29
|
16.14
|
185.00
|
63.96
|
27.00
|
60.95
|
1195.54
|
76.27
|
2011-10-12
|
10.05
|
402.19
|
16.40
|
186.12
|
64.33
|
26.96
|
62.70
|
1207.25
|
77.16
|
2011-10-13
|
10.10
|
408.43
|
16.22
|
186.82
|
64.23
|
27.18
|
62.36
|
1203.66
|
76.37
|
2011-10-14
|
10.26
|
422.00
|
16.60
|
190.53
|
64.72
|
27.27
|
62.24
|
1224.58
|
78.11
|
#计算日收益率与SPX之间的年度相关系数
rets = close_px.pct_change().dropna()
spx_corr = lambda x:x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x:x.year)
by_year.apply(spx_corr)
|
AA
|
AAPL
|
GE
|
IBM
|
JNJ
|
MSFT
|
PEP
|
SPX
|
XOM
|
1990
|
0.595024
|
0.545067
|
0.752187
|
0.738361
|
0.801145
|
0.586691
|
0.783168
|
1.0
|
0.517586
|
1991
|
0.453574
|
0.365315
|
0.759607
|
0.557046
|
0.646401
|
0.524225
|
0.641775
|
1.0
|
0.569335
|
1992
|
0.398180
|
0.498732
|
0.632685
|
0.262232
|
0.515740
|
0.492345
|
0.473871
|
1.0
|
0.318408
|
1993
|
0.259069
|
0.238578
|
0.447257
|
0.211269
|
0.451503
|
0.425377
|
0.385089
|
1.0
|
0.318952
|
1994
|
0.428549
|
0.268420
|
0.572996
|
0.385162
|
0.372962
|
0.436585
|
0.450516
|
1.0
|
0.395078
|
1995
|
0.291532
|
0.161829
|
0.519126
|
0.416390
|
0.315733
|
0.453660
|
0.413144
|
1.0
|
0.368752
|
1996
|
0.292344
|
0.191482
|
0.750724
|
0.388497
|
0.569232
|
0.564015
|
0.421477
|
1.0
|
0.538736
|
1997
|
0.564427
|
0.211435
|
0.827512
|
0.646823
|
0.703538
|
0.606171
|
0.509344
|
1.0
|
0.695653
|
1998
|
0.533802
|
0.379883
|
0.815243
|
0.623982
|
0.591988
|
0.698773
|
0.494213
|
1.0
|
0.369264
|
1999
|
0.099033
|
0.425584
|
0.710928
|
0.486167
|
0.517061
|
0.631315
|
0.336593
|
1.0
|
0.315383
|
2000
|
0.265359
|
0.440161
|
0.610362
|
0.445114
|
0.189765
|
0.538005
|
0.077525
|
1.0
|
0.084163
|
2001
|
0.624069
|
0.577152
|
0.794632
|
0.696038
|
0.111493
|
0.696447
|
0.133975
|
1.0
|
0.336869
|
2002
|
0.748021
|
0.580548
|
0.822373
|
0.716490
|
0.584758
|
0.784728
|
0.487211
|
1.0
|
0.759933
|
2003
|
0.690466
|
0.545582
|
0.777643
|
0.741775
|
0.562399
|
0.750534
|
0.541487
|
1.0
|
0.662775
|
2004
|
0.591485
|
0.374283
|
0.728626
|
0.601740
|
0.354690
|
0.588531
|
0.466854
|
1.0
|
0.557742
|
2005
|
0.564267
|
0.467540
|
0.675637
|
0.516846
|
0.444728
|
0.562374
|
0.489559
|
1.0
|
0.631010
|
2006
|
0.487638
|
0.428267
|
0.612388
|
0.598636
|
0.394026
|
0.406126
|
0.335054
|
1.0
|
0.518514
|
2007
|
0.642427
|
0.508118
|
0.796945
|
0.603906
|
0.568423
|
0.658770
|
0.651911
|
1.0
|
0.786264
|
2008
|
0.781057
|
0.681434
|
0.777337
|
0.833074
|
0.801005
|
0.804626
|
0.709264
|
1.0
|
0.828303
|
2009
|
0.735642
|
0.707103
|
0.713086
|
0.684513
|
0.603146
|
0.654902
|
0.541474
|
1.0
|
0.797921
|
2010
|
0.745700
|
0.710105
|
0.822285
|
0.783638
|
0.689896
|
0.730118
|
0.626655
|
1.0
|
0.839057
|
2011
|
0.882045
|
0.691931
|
0.864595
|
0.802730
|
0.752379
|
0.800996
|
0.592029
|
1.0
|
0.859975
|
#苹果和微软的年度相关系数
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))
1990 0.408271
1991 0.266807
1992 0.450592
1993 0.236917
1994 0.361638
1995 0.258642
1996 0.147539
1997 0.196144
1998 0.364106
1999 0.329484
2000 0.275298
2001 0.563156
2002 0.571435
2003 0.486262
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
dtype: float64
示例:面向分组的线性回归
import statsmodels.api as sm
def regress(data, yvar, xvars):
    """OLS-regress data[yvar] on data[xvars] plus an intercept.

    Returns the fitted coefficients as a Series indexed by the
    xvars column names plus 'intercept'.
    """
    # (The original transcript fused this whole body onto one invalid line.)
    Y = data[yvar]
    # copy() so adding the intercept column below cannot mutate a view of
    # `data` or trigger a SettingWithCopyWarning.
    X = data[xvars].copy()
    X['intercept'] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params
by_year.apply(regress,'AAPL',['SPX'])
E:\python_study_files\python_pip\.venvs\lpthw\lib\site-packages\statsmodels\compat\pandas.py:65: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.from pandas import Int64Index as NumericIndex
|
SPX
|
intercept
|
1990
|
1.512772
|
0.001395
|
1991
|
1.187351
|
0.000396
|
1992
|
1.832427
|
0.000164
|
1993
|
1.390470
|
-0.002657
|
1994
|
1.190277
|
0.001617
|
1995
|
0.858818
|
-0.001423
|
1996
|
0.829389
|
-0.001791
|
1997
|
0.749928
|
-0.001901
|
1998
|
1.164582
|
0.004075
|
1999
|
1.384989
|
0.003273
|
2000
|
1.733802
|
-0.002523
|
2001
|
1.676128
|
0.003122
|
2002
|
1.080795
|
-0.000219
|
2003
|
1.187770
|
0.000690
|
2004
|
1.363463
|
0.004201
|
2005
|
1.766415
|
0.003246
|
2006
|
1.645496
|
0.000080
|
2007
|
1.198761
|
0.003438
|
2008
|
0.968016
|
-0.001110
|
2009
|
0.879103
|
0.002954
|
2010
|
1.052608
|
0.001261
|
2011
|
0.806605
|
0.001514
|
透视表和交叉表
tips.pivot_table(index=['day','smoker'])
|
|
size
|
tip
|
tip_pct
|
total_bill
|
day
|
smoker
|
|
|
|
|
Fri
|
No
|
2.250000
|
2.812500
|
0.151650
|
18.420000
|
Yes
|
2.066667
|
2.714000
|
0.174783
|
16.813333
|
Sat
|
No
|
2.555556
|
3.102889
|
0.158048
|
19.661778
|
Yes
|
2.476190
|
2.875476
|
0.147906
|
21.276667
|
Sun
|
No
|
2.929825
|
3.167895
|
0.160113
|
20.506667
|
Yes
|
2.578947
|
3.516842
|
0.187250
|
24.120000
|
Thur
|
No
|
2.488889
|
2.673778
|
0.160298
|
17.113111
|
Yes
|
2.352941
|
3.030000
|
0.163863
|
19.190588
|
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns='smoker')
|
|
size
|
tip_pct
|
|
smoker
|
No
|
Yes
|
No
|
Yes
|
time
|
day
|
|
|
|
|
Dinner
|
Fri
|
2.000000
|
2.222222
|
0.139622
|
0.165347
|
Sat
|
2.555556
|
2.476190
|
0.158048
|
0.147906
|
Sun
|
2.929825
|
2.578947
|
0.160113
|
0.187250
|
Thur
|
2.000000
|
NaN
|
0.159744
|
NaN
|
Lunch
|
Fri
|
3.000000
|
1.833333
|
0.187735
|
0.188937
|
Thur
|
2.500000
|
2.352941
|
0.160311
|
0.163863
|
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns='smoker',margins=True)
|
|
size
|
tip_pct
|
|
smoker
|
No
|
Yes
|
All
|
No
|
Yes
|
All
|
time
|
day
|
|
|
|
|
|
|
Dinner
|
Fri
|
2.000000
|
2.222222
|
2.166667
|
0.139622
|
0.165347
|
0.158916
|
Sat
|
2.555556
|
2.476190
|
2.517241
|
0.158048
|
0.147906
|
0.153152
|
Sun
|
2.929825
|
2.578947
|
2.842105
|
0.160113
|
0.187250
|
0.166897
|
Thur
|
2.000000
|
NaN
|
2.000000
|
0.159744
|
NaN
|
0.159744
|
Lunch
|
Fri
|
3.000000
|
1.833333
|
2.000000
|
0.187735
|
0.188937
|
0.188765
|
Thur
|
2.500000
|
2.352941
|
2.459016
|
0.160311
|
0.163863
|
0.161301
|
All
|
|
2.668874
|
2.408602
|
2.569672
|
0.159328
|
0.163196
|
0.160803
|
tips.pivot_table('tip_pct',index=['time','smoker'],columns='day',aggfunc=len,margins=True)
|
day
|
Fri
|
Sat
|
Sun
|
Thur
|
All
|
time
|
smoker
|
|
|
|
|
|
Dinner
|
No
|
3.0
|
45.0
|
57.0
|
1.0
|
106
|
Yes
|
9.0
|
42.0
|
19.0
|
NaN
|
70
|
Lunch
|
No
|
1.0
|
NaN
|
NaN
|
44.0
|
45
|
Yes
|
6.0
|
NaN
|
NaN
|
17.0
|
23
|
All
|
|
19.0
|
87.0
|
76.0
|
62.0
|
244
|
tips.pivot_table('size',index=['time','smoker'],columns='day',aggfunc='sum',fill_value=0)
|
day
|
Fri
|
Sat
|
Sun
|
Thur
|
time
|
smoker
|
|
|
|
|
Dinner
|
No
|
6
|
115
|
167
|
2
|
Yes
|
20
|
104
|
49
|
0
|
Lunch
|
No
|
3
|
0
|
0
|
110
|
Yes
|
11
|
0
|
0
|
40
|
#有个知识点
import matplotlib.pyplot as plt
from pylab import *
img = plt.imread('pivot_table的参数.png')
imshow(img)
rows改为index,cols改为columns
交叉表:crosstab
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
|
smoker
|
No
|
Yes
|
All
|
time
|
day
|
|
|
|
Dinner
|
Fri
|
3
|
9
|
12
|
Sat
|
45
|
42
|
87
|
Sun
|
57
|
19
|
76
|
Thur
|
1
|
0
|
1
|
Lunch
|
Fri
|
1
|
6
|
7
|
Thur
|
44
|
17
|
61
|
All
|
|
151
|
93
|
244
|
示例:2012联邦选举委员会数据库
fec =pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\fec\\P00000001-ALL.csv")
fec
C:\windowsDtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.fec =pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\fec\\P00000001-ALL.csv")
|
cmte_id
|
cand_id
|
cand_nm
|
contbr_nm
|
contbr_city
|
contbr_st
|
contbr_zip
|
contbr_employer
|
contbr_occupation
|
contb_receipt_amt
|
contb_receipt_dt
|
receipt_desc
|
memo_cd
|
memo_text
|
form_tp
|
file_num
|
0
|
C00410118
|
P20002978
|
Bachmann, Michelle
|
HARVEY, WILLIAM
|
MOBILE
|
AL
|
366010290.0
|
RETIRED
|
RETIRED
|
250.0
|
20-JUN-11
|
NaN
|
NaN
|
NaN
|
SA17A
|
736166
|
1
|
C00410118
|
P20002978
|
Bachmann, Michelle
|
HARVEY, WILLIAM
|
MOBILE
|
AL
|
366010290.0
|
RETIRED
|
RETIRED
|
50.0
|
23-JUN-11
|
NaN
|
NaN
|
NaN
|
SA17A
|
736166
|
2
|
C00410118
|
P20002978
|
Bachmann, Michelle
|
SMITH, LANIER
|
LANETT
|
AL
|
368633403.0
|
INFORMATION REQUESTED
|
INFORMATION REQUESTED
|
250.0
|
05-JUL-11
|
NaN
|
NaN
|
NaN
|
SA17A
|
749073
|
3
|
C00410118
|
P20002978
|
Bachmann, Michelle
|
BLEVINS, DARONDA
|
PIGGOTT
|
AR
|
724548253.0
|
NONE
|
RETIRED
|
250.0
|
01-AUG-11
|
NaN
|
NaN
|
NaN
|
SA17A
|
749073
|
4
|
C00410118
|
P20002978
|
Bachmann, Michelle
|
WARDENBURG, HAROLD
|
HOT SPRINGS NATION
|
AR
|
719016467.0
|
NONE
|
RETIRED
|
300.0
|
20-JUN-11
|
NaN
|
NaN
|
NaN
|
SA17A
|
736166
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
1001726
|
C00500587
|
P20003281
|
Perry, Rick
|
GORMAN, CHRIS D. MR.
|
INFO REQUESTED
|
XX
|
99999
|
INFORMATION REQUESTED PER BEST EFFORTS
|
INFORMATION REQUESTED PER BEST EFFORTS
|
5000.0
|
29-SEP-11
|
REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM...
|
NaN
|
REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM...
|
SA17A
|
751678
|
1001727
|
C00500587
|
P20003281
|
Perry, Rick
|
DUFFY, DAVID A. MR.
|
INFO REQUESTED
|
XX
|
99999
|
DUFFY EQUIPMENT COMPANY INC.
|
BUSINESS OWNER
|
2500.0
|
30-SEP-11
|
NaN
|
NaN
|
NaN
|
SA17A
|
751678
|
1001728
|
C00500587
|
P20003281
|
Perry, Rick
|
GRANE, BRYAN F. MR.
|
INFO REQUESTED
|
XX
|
99999
|
INFORMATION REQUESTED PER BEST EFFORTS
|
INFORMATION REQUESTED PER BEST EFFORTS
|
500.0
|
29-SEP-11
|
NaN
|
NaN
|
NaN
|
SA17A
|
751678
|
1001729
|
C00500587
|
P20003281
|
Perry, Rick
|
TOLBERT, DARYL MR.
|
INFO REQUESTED
|
XX
|
99999
|
T.A.C.C.
|
LONGWALL MAINTENANCE FOREMAN
|
500.0
|
30-SEP-11
|
NaN
|
NaN
|
NaN
|
SA17A
|
751678
|
1001730
|
C00500587
|
P20003281
|
Perry, Rick
|
ANDERSON, MARILEE MRS.
|
INFO REQUESTED
|
XX
|
99999
|
INFORMATION REQUESTED PER BEST EFFORTS
|
INFORMATION REQUESTED PER BEST EFFORTS
|
2500.0
|
31-AUG-11
|
NaN
|
NaN
|
NaN
|
SA17A
|
751678
|
1001731 rows × 16 columns
fec.loc[123456]
cmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50.0
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456, dtype: object
unique_cands = fec.cand_nm.unique()
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy','Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick','Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G','Huntsman, Jon', 'Perry, Rick'], dtype=object)
unique_cands[2]
'Obama, Barack'
parties = {'Bachmann, Michelle':'Republican','Cain, Herman':'Republican','Gingrich, Newt':'Republican','Huntsman, Jon':'Republican','Johnson, Gary Earl':'Republican','McCotter, Thaddeus G':'Republican','Obama, Barack':'Democrat','Paul, Ron':'Republican','Pawlenty, Timothy':'Republican','Perry, Rick':'Republican',"Roemer, Charles E. 'Buddy' Ⅲ":'Republican','Romney, Mitt':'Republican','Santorum, Rick':'Republican'}
fec.cand_nm[123456:123461]
125611 Obama, Barack
125612 Obama, Barack
125613 Obama, Barack
125614 Obama, Barack
125615 Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(parties)
125611 Democrat
125612 Democrat
125613 Democrat
125614 Democrat
125615 Democrat
Name: cand_nm, dtype: object
#添加一个新列
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()
Democrat 589127
Republican 396504
Name: party, dtype: int64
(fec.contb_receipt_amt>0).value_counts()
True 991475
Name: contb_receipt_amt, dtype: int64
fec = fec[fec.contb_receipt_amt>0]
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
雇主职业和雇主统计赞助信息
fec.contbr_occupation.value_counts()[:10]
RETIRED 233990
NOT PROVIDED 56245
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
NOT EMPLOYED 9828
Name: contbr_occupation, dtype: int64
occ_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED','INFORMATION REQUESTED':'NOT PROVIDED','INFORMATION REQUESTED (BEST EFFORTS)':'NOT PROVIDED','C.E.O':'CEO'}
#如果字典中没有相关映射,则原样返回输入值x本身
f = lambda x:occ_mapping.get(x,x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED','INFORMATION REQUESTED':'NOT PROVIDED','SELF':'SELF-EMPLOYED','SELF EMPLOYED':'SELF-EMPLOYED'
}
f = lambda x:emp_mapping.get(x,x)
fec.contbr_employer = fec.contbr_employer.map(f)
by_occupation = fec.pivot_table('contb_receipt_amt',index='contbr_occupation',columns='party',aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(1)>2000000]
over_2mm
party
|
Democrat
|
Republican
|
contbr_occupation
|
|
|
ATTORNEY
|
11141982.97
|
7462058.31
|
C.E.O.
|
1690.00
|
2592983.11
|
CEO
|
2074284.79
|
1638668.41
|
CONSULTANT
|
2459912.71
|
2538990.45
|
ENGINEER
|
951525.55
|
1811937.30
|
EXECUTIVE
|
1355161.05
|
4136400.09
|
HOMEMAKER
|
4248875.80
|
13625600.78
|
INVESTOR
|
884133.00
|
2431258.92
|
LAWYER
|
3160478.87
|
391124.32
|
MANAGER
|
762883.22
|
1441092.37
|
NOT PROVIDED
|
4866973.96
|
20216287.01
|
OWNER
|
1001567.36
|
2406081.92
|
PHYSICIAN
|
3735124.94
|
3587195.24
|
PRESIDENT
|
1878509.95
|
4717413.76
|
PROFESSOR
|
2165071.08
|
294032.73
|
REAL ESTATE
|
528902.09
|
1624507.25
|
RETIRED
|
25305116.38
|
23481023.18
|
SELF-EMPLOYED
|
672393.40
|
1636774.54
|
over_2mm.plot(kind='barh')
def get_top_amounts(group, key, n=5):
    """Return the *n* largest contribution totals within *group*.

    Parameters
    ----------
    group : DataFrame
        One candidate's subset of the FEC data (a groupby group); must
        contain a 'contb_receipt_amt' column and the *key* column.
    key : str
        Column to aggregate 'contb_receipt_amt' by
        (e.g. 'contbr_occupation' or 'contbr_employer').
    n : int, default 5
        Number of top totals to keep.

    Returns
    -------
    Series
        Totals indexed by *key*, sorted in descending order, truncated
        to the top *n*.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    # Sort descending and keep only the n largest totals.
    # NOTE: the original transcript had `[n:]`, which *drops* the top n
    # instead of returning them — contradicting the function's name and
    # its own comment; the book's code uses `[:n]`.
    return totals.sort_values(ascending=False)[:n]
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
cand_nm contbr_occupation
Obama, Barack PROFESSOR 2165071.08CEO 2074284.79PRESIDENT 1878509.95NOT EMPLOYED 1709188.20EXECUTIVE 1355161.05...
Romney, Mitt INDEPENDENT PROFESSIONAL 3.00IFC CONTRACTING SOLUTIONS 3.00REMODELER & SEMI RETIRED 3.00AFFORDABLE REAL ESTATE DEVELOPER 3.003RD GENERATION FAMILY BUSINESS OWNER 3.00
Name: contb_receipt_amt, Length: 35973, dtype: float64
grouped.apply(get_top_amounts,'contbr_employer',n=10)
cand_nm contbr_employer
Obama, Barack REFUSED 149516.07DLA PIPER 148235.00HARVARD UNIVERSITY 131368.94IBM 128490.93GOOGLE 125302.88...
Romney, Mitt UN 3.00UPTOWN CHEAPSKATE 3.00WILL MERRIFIELD 3.00INDEPENDENT PROFESSIONAL 3.00HONOLD COMMUNICTAIONS 3.00
Name: contb_receipt_amt, Length: 95890, dtype: float64
对出资额分组
bins = np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped = fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
cand_nm
|
Obama, Barack
|
Romney, Mitt
|
contb_receipt_amt
|
|
|
(0, 1]
|
493
|
77
|
(1, 10]
|
40070
|
3681
|
(10, 100]
|
372280
|
31853
|
(100, 1000]
|
153991
|
43357
|
(1000, 10000]
|
22284
|
26186
|
(10000, 100000]
|
2
|
1
|
(100000, 1000000]
|
3
|
0
|
(1000000, 10000000]
|
4
|
0
|
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums
cand_nm
|
Obama, Barack
|
Romney, Mitt
|
contb_receipt_amt
|
|
|
(0, 1]
|
318.24
|
77.00
|
(1, 10]
|
337267.62
|
29819.66
|
(10, 100]
|
20288981.41
|
1987783.76
|
(100, 1000]
|
54798531.46
|
22363381.69
|
(1000, 10000]
|
51753705.67
|
63942145.42
|
(10000, 100000]
|
59100.00
|
12700.00
|
(100000, 1000000]
|
1490683.08
|
0.00
|
(1000000, 10000000]
|
7148839.76
|
0.00
|
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums
cand_nm
|
Obama, Barack
|
Romney, Mitt
|
contb_receipt_amt
|
|
|
(0, 1]
|
0.805182
|
0.194818
|
(1, 10]
|
0.918767
|
0.081233
|
(10, 100]
|
0.910769
|
0.089231
|
(100, 1000]
|
0.710176
|
0.289824
|
(1000, 10000]
|
0.447326
|
0.552674
|
(10000, 100000]
|
0.823120
|
0.176880
|
(100000, 1000000]
|
1.000000
|
0.000000
|
(1000000, 10000000]
|
1.000000
|
0.000000
|
normed_sums[:-2].plot(kind='barh',stacked=True)  #排除最大的两个面元(非个人捐赠),书中原代码为[:-2]
根据州统计赞助信息
grouped = fec_mrbo.groupby(['cand_nm','contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals = totals[totals.sum(1)>100000]
totals[:10]
cand_nm
|
Obama, Barack
|
Romney, Mitt
|
contbr_st
|
|
|
AK
|
281840.15
|
86204.24
|
AL
|
543123.48
|
527303.51
|
AR
|
359247.28
|
105556.00
|
AZ
|
1506476.98
|
1888436.23
|
CA
|
23824984.24
|
11237636.60
|
CO
|
2132429.49
|
1506714.12
|
CT
|
2068291.26
|
3499475.45
|
DC
|
4373538.80
|
1025137.50
|
DE
|
336669.14
|
82712.00
|
FL
|
7318178.58
|
8338458.81
|
percent = totals.div(totals.sum(1),axis=0)
percent[:10]
cand_nm
|
Obama, Barack
|
Romney, Mitt
|
contbr_st
|
|
|
AK
|
0.765778
|
0.234222
|
AL
|
0.507390
|
0.492610
|
AR
|
0.772902
|
0.227098
|
AZ
|
0.443745
|
0.556255
|
CA
|
0.679498
|
0.320502
|
CO
|
0.585970
|
0.414030
|
CT
|
0.371476
|
0.628524
|
DC
|
0.810113
|
0.189887
|
DE
|
0.802776
|
0.197224
|
FL
|
0.467417
|
0.532583
|
from mpl_toolkits.basemap import Basemap, cm
import numpy as np
from matplotlib import rcParams
from matplotlib.collections import LineCollection
import matplotlib.pyplot as plt
#from shapelib import ShapeFile
import shapefile  #注意:pyshp包的导入名是shapefile而非pyshp,直接import pyshp会报ModuleNotFoundError(见下方报错)
import dbflib
---------------------------------------------------------------------------ModuleNotFoundError Traceback (most recent call last)
5 import matplotlib.pyplot as plt6 #from shapelib import ShapeFile
----> 7 import pyshp8 import dbflibModuleNotFoundError: No module named 'pyshp'
利用Python进行数据分析的学习笔记——chap9相关推荐
- 《利用Python进行数据分析》学习笔记ch02-1(1)
前言 這是我第一次开通博客,主要目的是想记录下自己学习python的过程,同时也是想作为学习笔记,我会把<利用python进行数据分析>这本树上的每个例子都自己敲一边,很多语句并不知道为什 ...
- 利用Python进行数据分析(学习笔记)
第壹章 准备工作 1.1 本书内容 1.1.1 什么类型的数据 1.2 为何利用Python进行数据分析 1.2.1 Python作为胶水 1.2.2 解决"双语言"难题 1.2. ...
- 利用Python进行数据分析的学习笔记——chap6
读写文本格式的数据 import pandas as pd import numpy as np from pandas import Series,DataFrame !type "E:\ ...
- python输入一组数据、进行简单的统计_《利用Python进行数据分析》学习笔记——第二章(3)...
1880-2010年间全美婴儿姓名 用pandas.read_csv加载.txt文件 图2.1 用read_csv加载.txt文件 DataFrame.names1880中只有births这一列是in ...
- 《利用Python进行数据分析》学习笔记ch02-3(3)
索引: pandas.read_csv pandas.concat ignore_index=True groupby或pivot_table进行聚合 np.allclose 计算prop的累计和cu ...
- python 数据分析学什么-利用Python做数据分析 需要学习哪些知识
根据调查结果,十大最常用的数据工具中有八个来自或利用Python.Python广泛应用于所有数据科学领域,包括数据分析.机器学习.深度学习和数据可视化.不过你知道如何利用Python做数据分析吗?需要 ...
- 《利用python进行数据分析》读书笔记
<利用python进行数据分析>是一本利用python的Numpy.Pandas.Matplotlib库进行数据分析的基础介绍,非常适合初学者. 重要的python库 NumPy http ...
- 用python进行数据分析举例说明_《利用python进行数据分析》读书笔记 --第一、二章 准备与例子...
第一章 准备工作 今天开始码这本书--<利用python进行数据分析>.R和python都得会用才行,这是码这本书的原因.首先按照书上说的进行安装,google下载了epd_free-7. ...
- 《利用python进行数据分析》读书笔记--第八章 绘图和可视化
python有许多可视化工具,本书主要讲解matplotlib.matplotlib是用于创建出版质量图表的桌面绘图包(主要是2D方面).matplotlib的目的是为了构建一个MATLAB式的绘图接 ...
最新文章
- Vue报错:Uncaught RangeError: Maximum call stack size exceeded
- Windows Server下DB2自动备份、复原和前滚的脚本编程
- 【LeetCode从零单排】No70.ClimbingStairs
- nvm-windows的安装配置
- 我的领域驱动设计运用实例 - 领域啊领域
- jsvascript 学习 二 操作符
- 苹果调整App Store政策;国内首个5G+8K超高清国产化白皮书发布;Windows计算器移植到到 Linux|极客头条
- JAVA中console方法怎么用,java的Console类的使用方法及实例
- 解决serv-u中文乱码的问题
- Mysql更新关联子查询报错
- paip.提升效率---模块化设计方法V2012.9.15
- 导出微信聊天记录并生成词云
- 如何用手机在图片上标箭头_如何在手机上快速给图片做标记?
- MOOC-大型开放式网络课程massive open online courses
- postman发送图片
- U8接口开发-五大方式对比与剖析
- 浅谈对IT架构师的理解
- Flink任务链Operator Chains
- 小程序实现文字两端对齐
- 二进制差分码规则_一篇文章弄明白Node.js与二进制数据流
热门文章
- win 10卸载mysql_Windows 10系统下彻底删除卸载MySQL的方法教程
- 计算机主机故障有哪些,电脑主机电源常见的问题与解决方法_电脑故障
- 平衡树(splay)学习笔记(详细,从入门到精(bao)通(ling))(持续更新)
- SQL Server 2008 复制 遇到: 进程无法执行 'sp_replcmds' 命令
- uniapp实现左右滑动
- 反思:安全需要新体系
- python - 03【列表/元组/字符串】
- 计算机毕业设计SSM大学生志愿者管理系统【附源码数据库】
- u盘内容无故消失了怎么恢复 u盘恢复数据怎么操作
- 千牛如何装修店铺 千牛装修店铺的教程