文章目录

缺失值处理——拉格朗日插值法
dataframe合并
索引上的合并
轴向连接
合并重叠数据
重塑层次化索引
长宽格式的转换
移除重复数据
利用函数或映射进行数据转换
数据标准化
- 最小-最大规范化
- 零-均值规范化
- 小数定标规范化
替换值
重命名轴索引
离散化与面元划分
检测和过滤异常值
排列与随机采样
计算指标与哑变量
属性构造
字符串对象方法
正则表达式
pandas中矢量化的字符串函数
示例：USDA食品数据库

GitHub: https://github.com/RealEmperor/Python-for-Data-Analysis

缺失值处理——拉格朗日插值法

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from scipy.interpolate import lagrange  # Lagrange interpolation function

np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))

inputfile = 'data/catering_sale.xls'  # path of the sales data
outputfile = 'data/sales.xls'  # path of the output data

data = pd.read_excel(inputfile)  # load the data

# Filter outliers by turning them into missing values.
#
# Writing this as a chained assignment:
#     data[u'销量'][(data[u'销量'] < 400) | (data[u'销量'] > 5000)] = None
# produces the warning:
#     SettingWithCopyWarning:
#     A value is trying to be set on a copy of a slice from a DataFrame
#
# To change the original data, use a single assignment through .loc:
#     data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None
#
# To work on a copy instead, force pandas to create one explicitly:
#     error_data = data.copy()
#     error_data.loc[(error_data[u'销量'] < 400) | (error_data[u'销量'] > 5000), u'销量'] = None
#
# Reference: https://www.jianshu.com/p/72274ccb647a
data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None

# Custom column-vector interpolation function
# s is the column vector, n is the position to interpolate, k is the number
# of neighbours taken before and after it (default 5).
def ployinterp_column(s, n, k=5):
    """Interpolate the value at label ``n`` of ``s`` with a Lagrange polynomial.

    Args:
        s: pandas Series to read neighbours from; looked up by label.
        n: integer label whose value is to be estimated.
        k: number of neighbouring labels considered on each side of ``n``.

    Returns:
        The Lagrange polynomial fitted through the non-null neighbours,
        evaluated at ``n``.
    """
    # Keep only the neighbour labels that actually exist in the index, so a
    # window that runs past either end of the column does not raise KeyError.
    neighbours = [
        i for i in range(n - k, n + 1 + k)
        if i != n and i in s.index
    ]
    y = s.loc[neighbours]  # take the neighbouring values
    y = y[y.notnull()]  # drop missing values
    return lagrange(y.index, list(y))(n)  # interpolate and return the result


# Check element by element whether interpolation is needed
for i in data.columns:
    # The null mask is computed once per column: each pass only fills the
    # cell it is currently visiting, so later entries of the mask stay valid.
    missing = data[i].isnull()
    for j in range(len(data)):
        if missing[j]:  # interpolate only where the value is missing
            # Writing `data[i][j] = ployinterp_column(data[i], j)` instead
            # triggers: SettingWithCopyWarning: A value is trying to be set
            # on a copy of a slice from a DataFrame
            data.loc[j, i] = ployinterp_column(data[i], j)

data.to_excel(outputfile)  # write the result to file

dataframe合并

#dataframe合并
# 1
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],'data2': range(3)})
print(df1)
print(df2)

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
   data2 key
0      0   a
1      1   b
2      2   d

pd.merge(df1, df2)

	data1	key	data2
0	0	b	1
1	1	b	1
2	6	b	1
3	2	a	0
4	4	a	0
5	5	a	0

pd.merge(df1, df2, on='key')

	data1	key	data2
0	0	b	1
1	1	b	1
2	6	b	1
3	2	a	0
4	4	a	0
5	5	a	0

# 2
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],'data2': range(3)})

pd.merge(df3, df4, left_on='lkey', right_on='rkey')

	data1	lkey	data2	rkey
0	0	b	1	b
1	1	b	1	b
2	6	b	1	b
3	2	a	0	a
4	4	a	0	a
5	5	a	0	a

pd.merge(df1, df2, how='outer')

	data1	key	data2
0	0.0	b	1.0
1	1.0	b	1.0
2	6.0	b	1.0
3	2.0	a	0.0
4	4.0	a	0.0
5	5.0	a	0.0
6	3.0	c	NaN
7	NaN	d	2.0

# 3
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],'data1': range(6)})
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],'data2': range(5)})
print(df1)
print(df2)

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
   data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d

pd.merge(df1, df2, on='key', how='left')

	data1	key	data2
0	0	b	1.0
1	0	b	3.0
2	1	b	1.0
3	1	b	3.0
4	2	a	0.0
5	2	a	2.0
6	3	c	NaN
7	4	a	0.0
8	4	a	2.0
9	5	b	1.0
10	5	b	3.0

pd.merge(df1, df2, how='inner')

	data1	key	data2
0	0	b	1
1	0	b	3
2	1	b	1
3	1	b	3
4	5	b	1
5	5	b	3
6	2	a	0
7	2	a	2
8	4	a	0
9	4	a	2

# 4
left = DataFrame({'key1': ['foo', 'foo', 'bar'],'key2': ['one', 'two', 'one'],'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],'key2': ['one', 'one', 'one', 'two'],'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')

	key1	key2	lval	rval
0	foo	one	1.0	4.0
1	foo	one	1.0	5.0
2	foo	two	2.0	NaN
3	bar	one	3.0	6.0
4	bar	two	NaN	7.0

# 5
pd.merge(left, right, on='key1')

	key1	key2_x	lval	key2_y	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

	key1	key2_left	lval	key2_right	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

索引上的合并

# 1
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
print(left1)
print(right1)

  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0

pd.merge(left1, right1, left_on='key', right_index=True)

	key	value	group_val
0	a	0	3.5
2	a	2	3.5
3	a	3	3.5
1	b	1	7.0
4	b	4	7.0

pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

	key	value	group_val
0	a	0	3.5
2	a	2	3.5
3	a	3	3.5
1	b	1	7.0
4	b	4	7.0
5	c	5	NaN

# 2
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],'key2': [2000, 2001, 2002, 2001, 2002],'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],[2001, 2000, 2000, 2000, 2001, 2002]],columns=['event1', 'event2'])
print(lefth)
print(righth)

   data    key1  key2
0   0.0    Ohio  2000
1   1.0    Ohio  2001
2   2.0    Ohio  2002
3   3.0  Nevada  2001
4   4.0  Nevada  2002
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11

pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

	data	key1	key2	event1	event2
0	0.0	Ohio	2000	4	5
0	0.0	Ohio	2000	6	7
1	1.0	Ohio	2001	8	9
2	2.0	Ohio	2002	10	11
3	3.0	Nevada	2001	0	1

pd.merge(lefth, righth, left_on=['key1', 'key2'],right_index=True, how='outer')

	data	key1	key2	event1	event2
0	0.0	Ohio	2000	4.0	5.0
0	0.0	Ohio	2000	6.0	7.0
1	1.0	Ohio	2001	8.0	9.0
2	2.0	Ohio	2002	10.0	11.0
3	3.0	Nevada	2001	0.0	1.0
4	4.0	Nevada	2002	NaN	NaN
4	NaN	Nevada	2000	2.0	3.0

left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print(left2)
print(right2)

   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0

pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

	Ohio	Nevada	Missouri	Alabama
a	1.0	2.0	NaN	NaN
b	NaN	NaN	7.0	8.0
c	3.0	4.0	9.0	10.0
d	NaN	NaN	11.0	12.0
e	5.0	6.0	13.0	14.0

# 3
left2.join(right2, how='outer')

	Ohio	Nevada	Missouri	Alabama
a	1.0	2.0	NaN	NaN
b	NaN	NaN	7.0	8.0
c	3.0	4.0	9.0	10.0
d	NaN	NaN	11.0	12.0
e	5.0	6.0	13.0	14.0

left1.join(right1, on='key')

	key	value	group_val
0	a	0	3.5
1	b	1	7.0
2	a	2	3.5
3	a	3	3.5
4	b	4	7.0
5	c	5	NaN

# 4
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
left2.join([right2, another])

	Ohio	Nevada	Missouri	Alabama	New York	Oregon
a	1.0	2.0	NaN	NaN	7.0	8.0
c	3.0	4.0	9.0	10.0	9.0	10.0
e	5.0	6.0	13.0	14.0	11.0	12.0

left2.join([right2, another], how='outer')

	Ohio	Nevada	Missouri	Alabama	New York	Oregon
a	1.0	2.0	NaN	NaN	7.0	8.0
b	NaN	NaN	7.0	8.0	NaN	NaN
c	3.0	4.0	9.0	10.0	9.0	10.0
d	NaN	NaN	11.0	12.0	NaN	NaN
e	5.0	6.0	13.0	14.0	11.0	12.0
f	NaN	NaN	NaN	NaN	16.0	17.0

轴向连接

# 1
arr = np.arange(12).reshape((3, 4))
print(arr)
# Join the array with itself along the column axis.
np.concatenate([arr, arr], axis=1)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

# 2
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
# Concatenate the three Series end to end.
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

pd.concat([s1, s2, s3], axis=0)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

pd.concat([s1, s2, s3], axis=1)

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

s4 = pd.concat([s1 * 5, s3])
print(s4)

a    0
b    5
f    5
g    6
dtype: int64

pd.concat([s1, s4], axis=1)

	0	1
a	0.0	0
b	1.0	5
f	NaN	5
g	NaN	6

pd.concat([s1, s4], axis=1, join='inner')

	0	1
a	0	0
b	1	5

# `join_axes` is no longer accepted by pd.concat; reindexing the concatenated
# result selects the same row labels in the same order.
pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])

	0	1
a	0.0	0.0
c	NaN	NaN
b	1.0	5.0
e	NaN	NaN

# 3
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
print(result)

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

result.unstack()

	a	b	f	g
one	0.0	1.0	NaN	NaN
two	0.0	1.0	NaN	NaN
three	NaN	NaN	5.0	6.0

# 4
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

	one	two	three
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],columns=['three', 'four'])

pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])

	level1		level2
	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

pd.concat({'level1': df1, 'level2': df2}, axis=1)

	level1		level2
	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],names=['upper', 'lower'])

upper	level1		level2
lower	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

# 5
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1)
print(df2)

          a         b         c         d
0 -0.204708  0.478943 -0.519439 -0.555730
1  1.965781  1.393406  0.092908  0.281746
2  0.769023  1.246435  1.007189 -1.296221
          b         d         a
0  0.274992  0.228913  1.352917
1  0.886429 -2.001637 -0.371843

pd.concat([df1, df2], ignore_index=True)

	a	b	c	d
0	-0.204708	0.478943	-0.519439	-0.555730
1	1.965781	1.393406	0.092908	0.281746
2	0.769023	1.246435	1.007189	-1.296221
3	1.352917	0.274992	NaN	0.228913
4	-0.371843	0.886429	NaN	-2.001637

合并重叠数据

# 1
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
# Use .iloc for the positional write: the index holds string labels, so a
# bare integer key would depend on pandas' positional fallback.
b.iloc[-1] = np.nan
print(a)
print(b)
# Take the value from b wherever a is null, otherwise keep a.
np.where(pd.isnull(a), b, a)

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64
array([ 0. ,  2.5,  2. ,  3.5,  4.5,  nan])

# 2
b[:-2].combine_first(a[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

# 3
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],'b': [np.nan, 2., np.nan, 6.],'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)

	a	b	c
0	1.0	NaN	2.0
1	4.0	2.0	6.0
2	5.0	4.0	10.0
3	3.0	6.0	14.0
4	7.0	8.0	NaN

重塑层次化索引

# 1
data = DataFrame(np.arange(6).reshape((2, 3)),index=pd.Index(['Ohio', 'Colorado'], name='state'),columns=pd.Index(['one', 'two', 'three'], name='number'))
print(data)

number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5

# stack 把 column 转成 index
result = data.stack()
print(result)

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

# unstack 把 index 转成 column
result.unstack()

number	one	two	three
state
Ohio	0	1	2
Colorado	3	4	5

result.unstack(0)

state	Ohio	Colorado
number
one	0	3
two	1	4
three	2	5

result.unstack('state')

state	Ohio	Colorado
number
one	0	3
two	1	4
three	2	5

# 2
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2)

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

# data2.stack() # 没有column 执行会报错

data2.unstack()

	a	b	c	d	e
one	0.0	1.0	2.0	3.0	NaN
two	NaN	NaN	4.0	5.0	6.0

data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

# 3
df = DataFrame({'left': result, 'right': result + 5},columns=pd.Index(['left', 'right'], name='side'))
print(df)

side             left  right
state    number
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10

df.unstack('state')

side	left		right
state	Ohio	Colorado	Ohio	Colorado
number
one	0	3	5	8
two	1	4	6	9
three	2	5	7	10

df.unstack('state').stack('side')

	state	Colorado	Ohio
number	side
one	left	3	0
one	right	8	5
two	left	4	1
two	right	9	6
three	left	5	2
three	right	10	7

长宽格式的转换

data = pd.read_csv('data/macrodata.csv')
print(data)

       year  quarter    realgdp  realcons   realinv  realgovt  realdpi  \
0    1959.0      1.0   2710.349    1707.4   286.898   470.045   1886.9
1    1959.0      2.0   2778.801    1733.7   310.859   481.301   1919.7
2    1959.0      3.0   2775.488    1751.8   289.226   491.260   1916.4
3    1959.0      4.0   2785.204    1753.7   299.356   484.052   1931.3
4    1960.0      1.0   2847.699    1770.5   331.722   462.199   1955.5
5    1960.0      2.0   2834.390    1792.9   298.152   460.400   1966.1
6    1960.0      3.0   2839.022    1785.8   296.375   474.676   1967.8
7    1960.0      4.0   2802.616    1788.2   259.764   476.434   1966.6
8    1961.0      1.0   2819.264    1787.7   266.405   475.854   1984.5
9    1961.0      2.0   2872.005    1814.3   286.246   480.328   2014.4
10   1961.0      3.0   2918.419    1823.1   310.227   493.828   2041.9
11   1961.0      4.0   2977.830    1859.6   315.463   502.521   2082.0
12   1962.0      1.0   3031.241    1879.4   334.271   520.960   2101.7
13   1962.0      2.0   3064.709    1902.5   331.039   523.066   2125.2
14   1962.0      3.0   3093.047    1917.9   336.962   538.838   2137.0
15   1962.0      4.0   3100.563    1945.1   325.650   535.912   2154.6
16   1963.0      1.0   3141.087    1958.2   343.721   522.917   2172.5
17   1963.0      2.0   3180.447    1976.9   348.730   518.108   2193.1
18   1963.0      3.0   3240.332    2003.8   360.102   546.893   2217.9
19   1963.0      4.0   3264.967    2020.6   364.534   532.383   2254.6
20   1964.0      1.0   3338.246    2060.5   379.523   529.686   2299.6
21   1964.0      2.0   3376.587    2096.7   377.778   526.175   2362.1
22   1964.0      3.0   3422.469    2135.2   386.754   522.008   2392.7
23   1964.0      4.0   3431.957    2141.2   389.910   514.603   2420.4
24   1965.0      1.0   3516.251    2188.8   429.145   508.006   2447.4
25   1965.0      2.0   3563.960    2213.0   429.119   508.931   2474.5
26   1965.0      3.0   3636.285    2251.0   444.444   529.446   2542.6
27   1965.0      4.0   3724.014    2314.3   446.493   544.121   2594.1
28   1966.0      1.0   3815.423    2348.5   484.244   556.593   2618.4
29   1966.0      2.0   3828.124    2354.5   475.408   571.371   2624.7
..      ...      ...        ...       ...       ...       ...      ...
173  2002.0      2.0  11538.770    7997.8  1810.779   774.408   8658.9
174  2002.0      3.0  11596.430    8052.0  1814.531   786.673   8629.2
175  2002.0      4.0  11598.824    8080.6  1813.219   799.967   8649.6
176  2003.0      1.0  11645.819    8122.3  1813.141   800.196   8681.3
177  2003.0      2.0  11738.706    8197.8  1823.698   838.775   8812.5
178  2003.0      3.0  11935.461    8312.1  1889.883   839.598   8935.4
179  2003.0      4.0  12042.817    8358.0  1959.783   845.722   8986.4
180  2004.0      1.0  12127.623    8437.6  1970.015   856.570   9025.9
181  2004.0      2.0  12213.818    8483.2  2055.580   861.440   9115.0
182  2004.0      3.0  12303.533    8555.8  2082.231   876.385   9175.9
183  2004.0      4.0  12410.282    8654.2  2125.152   865.596   9303.4
184  2005.0      1.0  12534.113    8719.0  2170.299   869.204   9189.6
185  2005.0      2.0  12587.535    8802.9  2131.468   870.044   9253.0
186  2005.0      3.0  12683.153    8865.6  2154.949   890.394   9308.0
187  2005.0      4.0  12748.699    8888.5  2232.193   875.557   9358.7
188  2006.0      1.0  12915.938    8986.6  2264.721   900.511   9533.8
189  2006.0      2.0  12962.462    9035.0  2261.247   892.839   9617.3
190  2006.0      3.0  12965.916    9090.7  2229.636   892.002   9662.5
191  2006.0      4.0  13060.679    9181.6  2165.966   894.404   9788.8
192  2007.0      1.0  13099.901    9265.1  2132.609   882.766   9830.2
193  2007.0      2.0  13203.977    9291.5  2162.214   898.713   9842.7
194  2007.0      3.0  13321.109    9335.6  2166.491   918.983   9883.9
195  2007.0      4.0  13391.249    9363.6  2123.426   925.110   9886.2
196  2008.0      1.0  13366.865    9349.6  2082.886   943.372   9826.8
197  2008.0      2.0  13415.266    9351.0  2026.518   961.280  10059.0
198  2008.0      3.0  13324.600    9267.7  1990.693   991.551   9838.3
199  2008.0      4.0  13141.920    9195.3  1857.661  1007.273   9920.4
200  2009.0      1.0  12925.410    9209.2  1558.494   996.287   9926.4
201  2009.0      2.0  12901.504    9189.0  1456.678  1023.528  10077.5
202  2009.0      3.0  12990.341    9256.0  1486.398  1044.088  10040.6

         cpi      m1  tbilrate  unemp      pop  infl  realint
0     28.980   139.7      2.82    5.8  177.146  0.00     0.00
1     29.150   141.7      3.08    5.1  177.830  2.34     0.74
2     29.350   140.5      3.82    5.3  178.657  2.74     1.09
3     29.370   140.0      4.33    5.6  179.386  0.27     4.06
4     29.540   139.6      3.50    5.2  180.007  2.31     1.19
5     29.550   140.2      2.68    5.2  180.671  0.14     2.55
6     29.750   140.9      2.36    5.6  181.528  2.70    -0.34
7     29.840   141.1      2.29    6.3  182.287  1.21     1.08
8     29.810   142.1      2.37    6.8  182.992 -0.40     2.77
9     29.920   142.9      2.29    7.0  183.691  1.47     0.81
10    29.980   144.1      2.32    6.8  184.524  0.80     1.52
11    30.040   145.2      2.60    6.2  185.242  0.80     1.80
12    30.210   146.4      2.73    5.6  185.874  2.26     0.47
13    30.220   146.5      2.78    5.5  186.538  0.13     2.65
14    30.380   146.7      2.78    5.6  187.323  2.11     0.67
15    30.440   148.3      2.87    5.5  188.013  0.79     2.08
16    30.480   149.7      2.90    5.8  188.580  0.53     2.38
17    30.690   151.3      3.03    5.7  189.242  2.75     0.29
18    30.750   152.6      3.38    5.5  190.028  0.78     2.60
19    30.940   153.7      3.52    5.6  190.668  2.46     1.06
20    30.950   154.8      3.51    5.5  191.245  0.13     3.38
21    31.020   156.8      3.47    5.2  191.889  0.90     2.57
22    31.120   159.2      3.53    5.0  192.631  1.29     2.25
23    31.280   160.7      3.76    5.0  193.223  2.05     1.71
24    31.380   162.0      3.93    4.9  193.709  1.28     2.65
25    31.580   163.1      3.84    4.7  194.303  2.54     1.30
26    31.650   166.0      3.93    4.4  194.997  0.89     3.04
27    31.880   169.1      4.35    4.1  195.539  2.90     1.46
28    32.280   171.8      4.62    3.9  195.999  4.99    -0.37
29    32.450   170.3      4.65    3.8  196.560  2.10     2.55
..       ...     ...       ...    ...      ...   ...      ...
173  180.000  1199.5      1.70    5.8  288.028  1.56     0.14
174  181.200  1204.0      1.61    5.7  288.783  2.66    -1.05
175  182.600  1226.8      1.20    5.8  289.421  3.08    -1.88
176  183.200  1248.4      1.14    5.9  290.019  1.31    -0.17
177  183.700  1287.9      0.96    6.2  290.704  1.09    -0.13
178  184.900  1297.3      0.94    6.1  291.449  2.60    -1.67
179  186.300  1306.1      0.90    5.8  292.057  3.02    -2.11
180  187.400  1332.1      0.94    5.7  292.635  2.35    -1.42
181  189.100  1340.5      1.21    5.6  293.310  3.61    -2.41
182  190.800  1361.0      1.63    5.4  294.066  3.58    -1.95
183  191.800  1366.6      2.20    5.4  294.741  2.09     0.11
184  193.800  1357.8      2.69    5.3  295.308  4.15    -1.46
185  194.700  1366.6      3.01    5.1  295.994  1.85     1.16
186  199.200  1375.0      3.52    5.0  296.770  9.14    -5.62
187  199.400  1380.6      4.00    4.9  297.435  0.40     3.60
188  200.700  1380.5      4.51    4.7  298.061  2.60     1.91
189  202.700  1369.2      4.82    4.7  298.766  3.97     0.85
190  201.900  1369.4      4.90    4.7  299.593 -1.58     6.48
191  203.574  1373.6      4.92    4.4  300.320  3.30     1.62
192  205.920  1379.7      4.95    4.5  300.977  4.58     0.36
193  207.338  1370.0      4.72    4.5  301.714  2.75     1.97
194  209.133  1379.2      4.00    4.7  302.509  3.45     0.55
195  212.495  1377.4      3.01    4.8  303.204  6.38    -3.37
196  213.997  1384.0      1.56    4.9  303.803  2.82    -1.26
197  218.610  1409.3      1.74    5.4  304.483  8.53    -6.79
198  216.889  1474.7      1.17    6.0  305.270 -3.16     4.33
199  212.174  1576.5      0.12    6.9  305.952 -8.79     8.91
200  212.671  1592.8      0.22    8.1  306.547  0.94    -0.71
201  214.469  1653.6      0.18    9.2  307.226  3.37    -3.19
202  216.385  1673.9      0.12    9.6  308.013  3.56    -3.44

[203 rows x 14 columns]

periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
print(periods)

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')

data = DataFrame(data.to_records(),columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),index=periods.to_timestamp('D', 'end'))
print(data)

item          realgdp  infl  unemp
date
1959-03-31   2710.349  0.00    5.8
1959-06-30   2778.801  2.34    5.1
1959-09-30   2775.488  2.74    5.3
1959-12-31   2785.204  0.27    5.6
1960-03-31   2847.699  2.31    5.2
1960-06-30   2834.390  0.14    5.2
1960-09-30   2839.022  2.70    5.6
1960-12-31   2802.616  1.21    6.3
1961-03-31   2819.264 -0.40    6.8
1961-06-30   2872.005  1.47    7.0
1961-09-30   2918.419  0.80    6.8
1961-12-31   2977.830  0.80    6.2
1962-03-31   3031.241  2.26    5.6
1962-06-30   3064.709  0.13    5.5
1962-09-30   3093.047  2.11    5.6
1962-12-31   3100.563  0.79    5.5
1963-03-31   3141.087  0.53    5.8
1963-06-30   3180.447  2.75    5.7
1963-09-30   3240.332  0.78    5.5
1963-12-31   3264.967  2.46    5.6
1964-03-31   3338.246  0.13    5.5
1964-06-30   3376.587  0.90    5.2
1964-09-30   3422.469  1.29    5.0
1964-12-31   3431.957  2.05    5.0
1965-03-31   3516.251  1.28    4.9
1965-06-30   3563.960  2.54    4.7
1965-09-30   3636.285  0.89    4.4
1965-12-31   3724.014  2.90    4.1
1966-03-31   3815.423  4.99    3.9
1966-06-30   3828.124  2.10    3.8
...               ...   ...    ...
2002-06-30  11538.770  1.56    5.8
2002-09-30  11596.430  2.66    5.7
2002-12-31  11598.824  3.08    5.8
2003-03-31  11645.819  1.31    5.9
2003-06-30  11738.706  1.09    6.2
2003-09-30  11935.461  2.60    6.1
2003-12-31  12042.817  3.02    5.8
2004-03-31  12127.623  2.35    5.7
2004-06-30  12213.818  3.61    5.6
2004-09-30  12303.533  3.58    5.4
2004-12-31  12410.282  2.09    5.4
2005-03-31  12534.113  4.15    5.3
2005-06-30  12587.535  1.85    5.1
2005-09-30  12683.153  9.14    5.0
2005-12-31  12748.699  0.40    4.9
2006-03-31  12915.938  2.60    4.7
2006-06-30  12962.462  3.97    4.7
2006-09-30  12965.916 -1.58    4.7
2006-12-31  13060.679  3.30    4.4
2007-03-31  13099.901  4.58    4.5
2007-06-30  13203.977  2.75    4.5
2007-09-30  13321.109  3.45    4.7
2007-12-31  13391.249  6.38    4.8
2008-03-31  13366.865  2.82    4.9
2008-06-30  13415.266  8.53    5.4
2008-09-30  13324.600 -3.16    6.0
2008-12-31  13141.920 -8.79    6.9
2009-03-31  12925.410  0.94    8.1
2009-06-30  12901.504  3.37    9.2
2009-09-30  12990.341  3.56    9.6

[203 rows x 3 columns]

ldata = data.stack().reset_index().rename(columns={0: 'value'})
print(ldata)

          date     item      value
0   1959-03-31  realgdp   2710.349
1   1959-03-31     infl      0.000
2   1959-03-31    unemp      5.800
3   1959-06-30  realgdp   2778.801
4   1959-06-30     infl      2.340
5   1959-06-30    unemp      5.100
6   1959-09-30  realgdp   2775.488
7   1959-09-30     infl      2.740
8   1959-09-30    unemp      5.300
9   1959-12-31  realgdp   2785.204
10  1959-12-31     infl      0.270
11  1959-12-31    unemp      5.600
12  1960-03-31  realgdp   2847.699
13  1960-03-31     infl      2.310
14  1960-03-31    unemp      5.200
15  1960-06-30  realgdp   2834.390
16  1960-06-30     infl      0.140
17  1960-06-30    unemp      5.200
18  1960-09-30  realgdp   2839.022
19  1960-09-30     infl      2.700
20  1960-09-30    unemp      5.600
21  1960-12-31  realgdp   2802.616
22  1960-12-31     infl      1.210
23  1960-12-31    unemp      6.300
24  1961-03-31  realgdp   2819.264
25  1961-03-31     infl     -0.400
26  1961-03-31    unemp      6.800
27  1961-06-30  realgdp   2872.005
28  1961-06-30     infl      1.470
29  1961-06-30    unemp      7.000
..         ...      ...        ...
579 2007-06-30  realgdp  13203.977
580 2007-06-30     infl      2.750
581 2007-06-30    unemp      4.500
582 2007-09-30  realgdp  13321.109
583 2007-09-30     infl      3.450
584 2007-09-30    unemp      4.700
585 2007-12-31  realgdp  13391.249
586 2007-12-31     infl      6.380
587 2007-12-31    unemp      4.800
588 2008-03-31  realgdp  13366.865
589 2008-03-31     infl      2.820
590 2008-03-31    unemp      4.900
591 2008-06-30  realgdp  13415.266
592 2008-06-30     infl      8.530
593 2008-06-30    unemp      5.400
594 2008-09-30  realgdp  13324.600
595 2008-09-30     infl     -3.160
596 2008-09-30    unemp      6.000
597 2008-12-31  realgdp  13141.920
598 2008-12-31     infl     -8.790
599 2008-12-31    unemp      6.900
600 2009-03-31  realgdp  12925.410
601 2009-03-31     infl      0.940
602 2009-03-31    unemp      8.100
603 2009-06-30  realgdp  12901.504
604 2009-06-30     infl      3.370
605 2009-06-30    unemp      9.200
606 2009-09-30  realgdp  12990.341
607 2009-09-30     infl      3.560
608 2009-09-30    unemp      9.600

[609 rows x 3 columns]

wdata = ldata.pivot(index='date',columns= 'item',values= 'value')
print(wdata)

item        infl    realgdp  unemp
date
1959-03-31  0.00   2710.349    5.8
1959-06-30  2.34   2778.801    5.1
1959-09-30  2.74   2775.488    5.3
1959-12-31  0.27   2785.204    5.6
1960-03-31  2.31   2847.699    5.2
1960-06-30  0.14   2834.390    5.2
1960-09-30  2.70   2839.022    5.6
1960-12-31  1.21   2802.616    6.3
1961-03-31 -0.40   2819.264    6.8
1961-06-30  1.47   2872.005    7.0
1961-09-30  0.80   2918.419    6.8
1961-12-31  0.80   2977.830    6.2
1962-03-31  2.26   3031.241    5.6
1962-06-30  0.13   3064.709    5.5
1962-09-30  2.11   3093.047    5.6
1962-12-31  0.79   3100.563    5.5
1963-03-31  0.53   3141.087    5.8
1963-06-30  2.75   3180.447    5.7
1963-09-30  0.78   3240.332    5.5
1963-12-31  2.46   3264.967    5.6
1964-03-31  0.13   3338.246    5.5
1964-06-30  0.90   3376.587    5.2
1964-09-30  1.29   3422.469    5.0
1964-12-31  2.05   3431.957    5.0
1965-03-31  1.28   3516.251    4.9
1965-06-30  2.54   3563.960    4.7
1965-09-30  0.89   3636.285    4.4
1965-12-31  2.90   3724.014    4.1
1966-03-31  4.99   3815.423    3.9
1966-06-30  2.10   3828.124    3.8
...          ...        ...    ...
2002-06-30  1.56  11538.770    5.8
2002-09-30  2.66  11596.430    5.7
2002-12-31  3.08  11598.824    5.8
2003-03-31  1.31  11645.819    5.9
2003-06-30  1.09  11738.706    6.2
2003-09-30  2.60  11935.461    6.1
2003-12-31  3.02  12042.817    5.8
2004-03-31  2.35  12127.623    5.7
2004-06-30  3.61  12213.818    5.6
2004-09-30  3.58  12303.533    5.4
2004-12-31  2.09  12410.282    5.4
2005-03-31  4.15  12534.113    5.3
2005-06-30  1.85  12587.535    5.1
2005-09-30  9.14  12683.153    5.0
2005-12-31  0.40  12748.699    4.9
2006-03-31  2.60  12915.938    4.7
2006-06-30  3.97  12962.462    4.7
2006-09-30 -1.58  12965.916    4.7
2006-12-31  3.30  13060.679    4.4
2007-03-31  4.58  13099.901    4.5
2007-06-30  2.75  13203.977    4.5
2007-09-30  3.45  13321.109    4.7
2007-12-31  6.38  13391.249    4.8
2008-03-31  2.82  13366.865    4.9
2008-06-30  8.53  13415.266    5.4
2008-09-30 -3.16  13324.600    6.0
2008-12-31 -8.79  13141.920    6.9
2009-03-31  0.94  12925.410    8.1
2009-06-30  3.37  12901.504    9.2
2009-09-30  3.56  12990.341    9.6

[203 rows x 3 columns]

# 2
ldata[:10]

	date	item	value
0	1959-03-31	realgdp	2710.349
1	1959-03-31	infl	0.000
2	1959-03-31	unemp	5.800
3	1959-06-30	realgdp	2778.801
4	1959-06-30	infl	2.340
5	1959-06-30	unemp	5.100
6	1959-09-30	realgdp	2775.488
7	1959-09-30	infl	2.740
8	1959-09-30	unemp	5.300
9	1959-12-31	realgdp	2785.204

# Pass index/columns/values by keyword; positional arguments are no longer
# accepted by DataFrame.pivot.
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted.head()

item	infl	realgdp	unemp
date
1959-03-31	0.00	2710.349	5.8
1959-06-30	2.34	2778.801	5.1
1959-09-30	2.74	2775.488	5.3
1959-12-31	0.27	2785.204	5.6
1960-03-31	2.31	2847.699	5.2

ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]

	date	item	value	value2
0	1959-03-31	realgdp	2710.349	1.669025
1	1959-03-31	infl	0.000	-0.438570
2	1959-03-31	unemp	5.800	-0.539741
3	1959-06-30	realgdp	2778.801	0.476985
4	1959-06-30	infl	2.340	3.248944
5	1959-06-30	unemp	5.100	-1.021228
6	1959-09-30	realgdp	2775.488	-0.577087
7	1959-09-30	infl	2.740	0.124121
8	1959-09-30	unemp	5.300	0.302614
9	1959-12-31	realgdp	2785.204	0.523772

# With `values` omitted, every remaining column is pivoted, which yields
# hierarchical columns (one top-level entry per value column).
pivoted = ldata.pivot(index='date', columns='item')
pivoted[:5]

	value			value2
item	infl	realgdp	unemp	infl	realgdp	unemp
date
1959-03-31	0.00	2710.349	5.8	-0.438570	1.669025	-0.539741
1959-06-30	2.34	2778.801	5.1	3.248944	0.476985	-1.021228
1959-09-30	2.74	2775.488	5.3	0.124121	-0.577087	0.302614
1959-12-31	0.27	2785.204	5.6	0.000940	0.523772	1.343810
1960-03-31	2.31	2847.699	5.2	-0.831154	-0.713544	-2.370232

pivoted['value'][:5]

item	infl	realgdp	unemp
date
1959-03-31	0.00	2710.349	5.8
1959-06-30	2.34	2778.801	5.1
1959-09-30	2.74	2775.488	5.3
1959-12-31	0.27	2785.204	5.6
1960-03-31	2.31	2847.699	5.2

unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]

	value			value2
item	infl	realgdp	unemp	infl	realgdp	unemp
date
1959-03-31	0.00	2710.349	5.8	-0.438570	1.669025	-0.539741
1959-06-30	2.34	2778.801	5.1	3.248944	0.476985	-1.021228
1959-09-30	2.74	2775.488	5.3	0.124121	-0.577087	0.302614
1959-12-31	0.27	2785.204	5.6	0.000940	0.523772	1.343810
1960-03-31	2.31	2847.699	5.2	-0.831154	-0.713544	-2.370232
1960-06-30	0.14	2834.390	5.2	-0.860757	-1.860761	0.560145
1960-09-30	2.70	2839.022	5.6	0.119827	-1.265934	-1.063512

移除重复数据

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)

    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4

data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

data.drop_duplicates()

	k1	k2
0	one	1
2	one	2
3	two	3
5	two	4

data['v1'] = range(7)
print(data)

    k1  k2  v1
0  one   1   0
1  one   1   1
2  one   2   2
3  two   3   3
4  two   3   4
5  two   4   5
6  two   4   6

data.drop_duplicates(['k1'])

	k1	k2	v1
0	one	1	0
3	two	3	3

data.drop_duplicates(['k1', 'k2'], keep='last')

	k1	k2	v1
1	one	1	1
2	one	2	2
4	two	3	4
6	two	4	6

利用函数或映射进行数据转换

# 1
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
print(data)

          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0

# Mapping from each (lower-case) kind of meat to the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# Lower-case the food names first so that 'Bacon' and 'Pastrami' match the
# dictionary keys, then map each name to its animal.
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)

          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon

data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

数据标准化

datafile = 'data/normalization_data.xls'  # 参数初始化
data = pd.read_excel(datafile, header=None)  # 读取数据
print(data)

     0    1    2     3
0   78  521  602  2863
1  144 -600 -521  2245
2   95 -457  468 -1283
3   69  596  695  1054
4  190  527  691  2051
5  101  403  470  2487
6  146  413  435  2571

最小-最大规范化

(data - data.min()) / (data.max() - data.min())  # 最小-最大规范化

	0	1	2	3
0	0.074380	0.937291	0.923520	1.000000
1	0.619835	0.000000	0.000000	0.850941
2	0.214876	0.119565	0.813322	0.000000
3	0.000000	1.000000	1.000000	0.563676
4	1.000000	0.942308	0.996711	0.804149
5	0.264463	0.838629	0.814967	0.909310
6	0.636364	0.846990	0.786184	0.929571

零-均值规范化

(data - data.mean()) / data.std()  # 零-均值规范化

	0	1	2	3
0	-0.905383	0.635863	0.464531	0.798149
1	0.604678	-1.587675	-2.193167	0.369390
2	-0.516428	-1.304030	0.147406	-2.078279
3	-1.111301	0.784628	0.684625	-0.456906
4	1.657146	0.647765	0.675159	0.234796
5	-0.379150	0.401807	0.152139	0.537286
6	0.650438	0.421642	0.069308	0.595564

小数定标规范化

data / 10 ** np.ceil(np.log10(data.abs().max()))  # 小数定标规范化

	0	1	2	3
0	0.078	0.521	0.602	0.2863
1	0.144	-0.600	-0.521	0.2245
2	0.095	-0.457	0.468	-0.1283
3	0.069	0.596	0.695	0.1054
4	0.190	0.527	0.691	0.2051
5	0.101	0.403	0.470	0.2487
6	0.146	0.413	0.435	0.2571

替换值

data = Series([1., -999., 2., -999., -1000., 3.])
print(data)

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

重命名轴索引

data = DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],columns=['one', 'two', 'three', 'four'])
print(data)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11

data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

data.index = data.index.map(str.upper)
print(data)

          one  two  three  four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11

data.rename(index=str.title, columns=str.upper)

	ONE	TWO	THREE	FOUR
Ohio	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

data.rename(index={'OHIO': 'INDIANA'},columns={'three': 'peekaboo'})

	one	two	peekaboo	four
INDIANA	0	1	2	3
COLORADO	4	5	6	7
NEW YORK	8	9	10	11

# inplace=True 时就地修改 data，此时 rename 返回 None（而不是 DataFrame 的引用）
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
print(data)

          one  two  three  four
INDIANA     0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11

离散化与面元划分

# 1
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(cats)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [MiddleAged < Senior < YoungAdult < Youth]

data = np.random.rand(20)
pd.cut(data, 4, precision=2)

[(0.45, 0.67], (0.23, 0.45], (0.0037, 0.23], (0.45, 0.67], (0.67, 0.9], ..., (0.67, 0.9], (0.0037, 0.23], (0.0037, 0.23], (0.23, 0.45], (0.23, 0.45]]
Length: 20
Categories (4, interval[float64]): [(0.0037, 0.23] < (0.23, 0.45] < (0.45, 0.67] < (0.67, 0.9]]

# 2
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
print(cats)

[(-0.022, 0.641], (-3.746, -0.635], (0.641, 3.26], (-3.746, -0.635], (-0.022, 0.641], ..., (-0.022, 0.641], (0.641, 3.26], (-0.635, -0.022], (0.641, 3.26], (-0.635, -0.022]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -0.635] < (-0.635, -0.022] < (-0.022, 0.641] < (0.641, 3.26]]

pd.value_counts(cats)

(0.641, 3.26]       250
(-0.022, 0.641]     250
(-0.635, -0.022]    250
(-3.746, -0.635]    250
dtype: int64

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-0.022, 1.302], (-1.266, -0.022], (-0.022, 1.302], (-3.746, -1.266], (-0.022, 1.302], ..., (-0.022, 1.302], (-0.022, 1.302], (-1.266, -0.022], (-0.022, 1.302], (-1.266, -0.022]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -1.266] < (-1.266, -0.022] < (-0.022, 1.302] < (1.302, 3.26]]

检测和过滤异常值

# 1
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.067684	0.067924	0.025598	-0.002298
std	0.998035	0.992106	1.006835	0.996794
min	-3.428254	-3.548824	-3.184377	-3.745356
25%	-0.774890	-0.591841	-0.641675	-0.644144
50%	-0.116401	0.101143	0.002073	-0.013611
75%	0.616366	0.780282	0.680391	0.654328
max	3.366626	2.653656	3.260383	3.927528

col = data[3]
col[np.abs(col) > 3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

data[(np.abs(data) > 3).any(1)]

	0	1	2	3
5	-0.539741	0.476985	3.248944	-1.021228
97	-0.774363	0.552936	0.106061	3.927528
102	-0.655054	-0.565230	3.176873	0.959533
305	-2.315555	0.457246	-0.025907	-3.399312
324	0.050188	1.951312	3.260383	0.963301
400	0.146326	0.508391	-0.196713	-3.745356
499	-0.293333	-0.242459	-3.056990	1.918403
523	-3.428254	-0.296336	-0.439938	-0.867165
586	0.275144	1.179227	-3.184377	1.369891
808	-0.362528	-3.548824	1.553205	-2.186301
900	3.366626	-2.372214	0.851010	1.332846

# 2
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.067623	0.068473	0.025153	-0.002081
std	0.995485	0.990253	1.003977	0.989736
min	-3.000000	-3.000000	-3.000000	-3.000000
25%	-0.774890	-0.591841	-0.641675	-0.644144
50%	-0.116401	0.101143	0.002073	-0.013611
75%	0.616366	0.780282	0.680391	0.654328
max	3.000000	2.653656	3.000000	3.000000

排列与随机采样

df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
print(df)

    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19

sampler = np.random.permutation(5)
print(sampler)

[1 0 2 3 4]

df.take(sampler)

	0	1	2	3
1	4	5	6	7
0	0	1	2	3
2	8	9	10	11
3	12	13	14	15
4	16	17	18	19

# 2
df.take(np.random.permutation(len(df))[:3])

	0	1	2	3
1	4	5	6	7
3	12	13	14	15
4	16	17	18	19

# 3
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
print(sampler)

[4 4 2 2 2 0 3 0 4 1]

draws = bag.take(sampler)
print(draws)

[ 4  4 -1 -1 -1  5  6  5  4  7]

计算指标与哑变量

# 1
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],'data1': range(6)})
print(df)

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b

pd.get_dummies(df['key'])

	a	b	c
0	0	1	0
1	0	1	0
2	1	0	0
3	0	0	1
4	1	0	0
5	0	1	0

dummies = pd.get_dummies(df['key'], prefix='key')
print(dummies)

   key_a  key_b  key_c
0      0      1      0
1      0      1      0
2      1      0      0
3      0      0      1
4      1      0      0
5      0      1      0

df_with_dummy = df[['data1']].join(dummies)
print(df_with_dummy)

   data1  key_a  key_b  key_c
0      0      0      1      0
1      1      0      1      0
2      2      1      0      0
3      3      0      0      1
4      4      1      0      0
5      5      0      1      0

# 2
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('data/movies.dat', sep='::', header=None,names=mnames)
movies[:10]

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  after removing the cwd from sys.path.

	movie_id	title	genres
0	1	Toy Story (1995)	Animation|Children's|Comedy
1	2	Jumanji (1995)	Adventure|Children's|Fantasy
2	3	Grumpier Old Men (1995)	Comedy|Romance
3	4	Waiting to Exhale (1995)	Comedy|Drama
4	5	Father of the Bride Part II (1995)	Comedy
5	6	Heat (1995)	Action|Crime|Thriller
6	7	Sabrina (1995)	Comedy|Romance
7	8	Tom and Huck (1995)	Adventure|Children's
8	9	Sudden Death (1995)	Action
9	10	GoldenEye (1995)	Action|Adventure|Thriller

genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
print(genres)

['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
print(dummies)

      Action  Adventure  Animation  Children's  Comedy  Crime  Documentary  \
0        0.0        0.0        0.0         0.0     0.0    0.0          0.0
1        0.0        0.0        0.0         0.0     0.0    0.0          0.0
2        0.0        0.0        0.0         0.0     0.0    0.0          0.0
3        0.0        0.0        0.0         0.0     0.0    0.0          0.0
4        0.0        0.0        0.0         0.0     0.0    0.0          0.0
5        0.0        0.0        0.0         0.0     0.0    0.0          0.0
6        0.0        0.0        0.0         0.0     0.0    0.0          0.0
7        0.0        0.0        0.0         0.0     0.0    0.0          0.0
8        0.0        0.0        0.0         0.0     0.0    0.0          0.0
9        0.0        0.0        0.0         0.0     0.0    0.0          0.0
10       0.0        0.0        0.0         0.0     0.0    0.0          0.0
11       0.0        0.0        0.0         0.0     0.0    0.0          0.0
12       0.0        0.0        0.0         0.0     0.0    0.0          0.0
13       0.0        0.0        0.0         0.0     0.0    0.0          0.0
14       0.0        0.0        0.0         0.0     0.0    0.0          0.0
15       0.0        0.0        0.0         0.0     0.0    0.0          0.0
16       0.0        0.0        0.0         0.0     0.0    0.0          0.0
17       0.0        0.0        0.0         0.0     0.0    0.0          0.0
18       0.0        0.0        0.0         0.0     0.0    0.0          0.0
19       0.0        0.0        0.0         0.0     0.0    0.0          0.0
20       0.0        0.0        0.0         0.0     0.0    0.0          0.0
21       0.0        0.0        0.0         0.0     0.0    0.0          0.0
22       0.0        0.0        0.0         0.0     0.0    0.0          0.0
23       0.0        0.0        0.0         0.0     0.0    0.0          0.0
24       0.0        0.0        0.0         0.0     0.0    0.0          0.0
25       0.0        0.0        0.0         0.0     0.0    0.0          0.0
26       0.0        0.0        0.0         0.0     0.0    0.0          0.0
27       0.0        0.0        0.0         0.0     0.0    0.0          0.0
28       0.0        0.0        0.0         0.0     0.0    0.0          0.0
29       0.0        0.0        0.0         0.0     0.0    0.0          0.0
...      ...        ...        ...         ...     ...    ...          ...
3853     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3854     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3855     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3856     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3857     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3858     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3859     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3860     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3861     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3862     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3863     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3864     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3865     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3866     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3867     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3868     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3869     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3870     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3871     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3872     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3873     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3874     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3875     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3876     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3877     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3878     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3879     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3880     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3881     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3882     0.0        0.0        0.0         0.0     0.0    0.0          0.0

      Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  Sci-Fi  \
0       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
1       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
2       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
4       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
5       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
6       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
7       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
8       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
9       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
10      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
11      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
12      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
13      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
14      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
15      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
16      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
17      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
18      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
19      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
20      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
21      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
22      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
23      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
24      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
25      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
26      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
27      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
28      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
29      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
...     ...      ...        ...     ...      ...      ...      ...     ...
3853    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3854    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3855    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3856    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3857    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3858    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3859    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3860    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3861    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3862    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3863    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3864    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3865    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3866    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3867    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3868    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3869    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3870    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3871    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3872    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3873    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3874    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3875    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3876    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3877    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3878    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3879    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3880    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3881    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3882    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0

      Thriller  War  Western
0          0.0  0.0      0.0
1          0.0  0.0      0.0
2          0.0  0.0      0.0
3          0.0  0.0      0.0
4          0.0  0.0      0.0
5          0.0  0.0      0.0
6          0.0  0.0      0.0
7          0.0  0.0      0.0
8          0.0  0.0      0.0
9          0.0  0.0      0.0
10         0.0  0.0      0.0
11         0.0  0.0      0.0
12         0.0  0.0      0.0
13         0.0  0.0      0.0
14         0.0  0.0      0.0
15         0.0  0.0      0.0
16         0.0  0.0      0.0
17         0.0  0.0      0.0
18         0.0  0.0      0.0
19         0.0  0.0      0.0
20         0.0  0.0      0.0
21         0.0  0.0      0.0
22         0.0  0.0      0.0
23         0.0  0.0      0.0
24         0.0  0.0      0.0
25         0.0  0.0      0.0
26         0.0  0.0      0.0
27         0.0  0.0      0.0
28         0.0  0.0      0.0
29         0.0  0.0      0.0
...        ...  ...      ...
3853       0.0  0.0      0.0
3854       0.0  0.0      0.0
3855       0.0  0.0      0.0
3856       0.0  0.0      0.0
3857       0.0  0.0      0.0
3858       0.0  0.0      0.0
3859       0.0  0.0      0.0
3860       0.0  0.0      0.0
3861       0.0  0.0      0.0
3862       0.0  0.0      0.0
3863       0.0  0.0      0.0
3864       0.0  0.0      0.0
3865       0.0  0.0      0.0
3866       0.0  0.0      0.0
3867       0.0  0.0      0.0
3868       0.0  0.0      0.0
3869       0.0  0.0      0.0
3870       0.0  0.0      0.0
3871       0.0  0.0      0.0
3872       0.0  0.0      0.0
3873       0.0  0.0      0.0
3874       0.0  0.0      0.0
3875       0.0  0.0      0.0
3876       0.0  0.0      0.0
3877       0.0  0.0      0.0
3878       0.0  0.0      0.0
3879       0.0  0.0      0.0
3880       0.0  0.0      0.0
3881       0.0  0.0      0.0
3882       0.0  0.0      0.0

[3883 rows x 18 columns]

for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1

movies_windic = movies.join(dummies.add_prefix('Genre_'))
print(movies_windic.loc[0])

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object

# 3
np.random.seed(12345)
values = np.random.rand(10)
print(values)

[ 0.92961609  0.31637555  0.18391881  0.20456028  0.56772503  0.5955447
  0.96451452  0.6531771   0.74890664  0.65356987]

bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

	(0.0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1.0]
0	0	0	0	0	1
1	0	1	0	0	0
2	1	0	0	0	0
3	0	1	0	0	0
4	0	0	1	0	0
5	0	0	1	0	0
6	0	0	0	0	1
7	0	0	0	1	0
8	0	0	0	1	0
9	0	0	0	1	0

属性构造

# 参数初始化
inputfile = 'data/electricity_data.xls'  # 供入供出电量数据
outputfile = 'data/electricity_data.xls'  # 属性构造后数据文件

data = pd.read_excel(inputfile)  # 读入数据
data[u'线损率'] = (data[u'供入电量'] - data[u'供出电量']) / data[u'供入电量']

data.to_excel(outputfile, index=False)  # 保存结果

字符串对象方法

val = 'a,b,  guido'
val.split(',')

['a', 'b', '  guido']

pieces = [x.strip() for x in val.split(',')]
print(pieces)

['a', 'b', 'guido']

first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

'::'.join(pieces)

'a::b::guido'

'guido' in val

True

val.index(',')

1

val.find(':')

-1

val.index(':')

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-138-280f8b2856ce> in <module>()
----> 1 val.index(':')

ValueError: substring not found

val.count('a')

1

val.replace(',', '::')

'a::b::  guido'

val.replace(',', '')

'ab  guido'

正则表达式

# 1
import re

text = "foo    bar\t baz  \tqux"
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

regex.findall(text)

['    ', '\t ', '  \t']

# 2
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE 的作用是使正则表达式对大小写不敏感
regex = re.compile(pattern, flags=re.IGNORECASE)

regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

m = regex.search(text)
print(m)

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

text[m.start():m.end()]

'dave@google.com'

print(regex.match(text))

None

print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED

# 3
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

regex.findall(text)

[('dave', 'google', 'com'),('steve', 'gmail', 'com'),('rob', 'gmail', 'com'),('ryan', 'yahoo', 'com')]

print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com

# 4
regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE | re.VERBOSE)

m = regex.match('wesm@bright.net')
m.groupdict()

{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

pandas中矢量化的字符串函数

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com','Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)
print(data)

Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

data.isnull()

Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool

data.str.contains('gmail')

Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object

data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object

matches = data.str.match(pattern, flags=re.IGNORECASE)
print(matches)

Dave     True
Rob      True
Steve    True
Wes       NaN
dtype: object

matches.str.get(1)

Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64

matches.str[0]

Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64

data.str[:5]

Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object

示例：USDA食品数据库

'''
{
  "id": 21441,
  "description": "KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,
Wing, meat and skin with breading",
  "tags": ["KFC"],
  "manufacturer": "Kentucky Fried Chicken",
  "group": "Fast Foods",
  "portions": [
    {"amount": 1, "unit": "wing, with skin", "grams": 68.0},
    ...
  ],
  "nutrients": [
    {"value": 20.8, "units": "g", "description": "Protein", "group": "Composition"},
    ...
  ]
}
'''
import json

db = json.load(open('data/foods-2011-10-03.json'))
len(db)

db[0].keys()

dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])

db[0]['nutrients'][0]

{'description': 'Protein','group': 'Composition','units': 'g','value': 25.18}

nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]

	description	group	units	value
0	Protein	Composition	g	25.18
1	Total lipid (fat)	Composition	g	29.20
2	Carbohydrate, by difference	Composition	g	3.06
3	Ash	Other	g	3.28
4	Energy	Energy	kcal	376.00
5	Water	Composition	g	39.28
6	Energy	Energy	kJ	1573.00

info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)

info[:5]

	description	group	id
0	Cheese, caraway	Dairy and Egg Products	1008
1	Cheese, cheddar	Dairy and Egg Products	1009
2	Cheese, edam	Dairy and Egg Products	1018
3	Cheese, feta	Dairy and Egg Products	1019
4	Cheese, mozzarella, part skim milk	Dairy and Egg Products	1028

pd.value_counts(info.group)[:10]

Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Fast Foods                           365
Legumes and Legume Products          365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: group, dtype: int64

nutrients = []

for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)

print(nutrients)

                               description        group    units     value  \
0                                  Protein  Composition        g    25.180
1                        Total lipid (fat)  Composition        g    29.200
2              Carbohydrate, by difference  Composition        g     3.060
3                                      Ash        Other        g     3.280
4                                   Energy       Energy     kcal   376.000
5                                    Water  Composition        g    39.280
6                                   Energy       Energy       kJ  1573.000
7                     Fiber, total dietary  Composition        g     0.000
8                              Calcium, Ca     Elements       mg   673.000
9                                 Iron, Fe     Elements       mg     0.640
10                           Magnesium, Mg     Elements       mg    22.000
11                           Phosphorus, P     Elements       mg   490.000
12                            Potassium, K     Elements       mg    93.000
13                              Sodium, Na     Elements       mg   690.000
14                                Zinc, Zn     Elements       mg     2.940
15                              Copper, Cu     Elements       mg     0.024
16                           Manganese, Mn     Elements       mg     0.021
17                            Selenium, Se     Elements      mcg    14.500
18                           Vitamin A, IU     Vitamins       IU  1054.000
19                                 Retinol     Vitamins      mcg   262.000
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000
22                                 Thiamin     Vitamins       mg     0.031
23                              Riboflavin     Vitamins       mg     0.450
24                                  Niacin     Vitamins       mg     0.180
25                        Pantothenic acid     Vitamins       mg     0.190
26                             Vitamin B-6     Vitamins       mg     0.074
27                           Folate, total     Vitamins      mcg    18.000
28                            Vitamin B-12     Vitamins      mcg     0.270
29                              Folic acid     Vitamins      mcg     0.000
...                                    ...          ...      ...       ...
389325                        Selenium, Se     Elements      mcg     1.100
389326                       Vitamin A, IU     Vitamins       IU     5.000
389327                             Retinol     Vitamins      mcg     0.000
389328                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000
389329                      Carotene, beta     Vitamins      mcg     2.000
389330                     Carotene, alpha     Vitamins      mcg     2.000
389331        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250
389332                           Vitamin D     Vitamins       IU     0.000
389333                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000
389334                 Cryptoxanthin, beta     Vitamins      mcg     0.000
389335                            Lycopene     Vitamins      mcg     0.000
389336                 Lutein + zeaxanthin     Vitamins      mcg    20.000
389337      Vitamin C, total ascorbic acid     Vitamins       mg    21.900
389338                             Thiamin     Vitamins       mg     0.020
389339                          Riboflavin     Vitamins       mg     0.060
389340                              Niacin     Vitamins       mg     0.540
389341                         Vitamin B-6     Vitamins       mg     0.260
389342                       Folate, total     Vitamins      mcg    17.000
389343                        Vitamin B-12     Vitamins      mcg     0.000
389344                      Choline, total     Vitamins       mg     4.100
389345           Vitamin K (phylloquinone)     Vitamins      mcg     0.500
389346                          Folic acid     Vitamins      mcg     0.000
389347                        Folate, food     Vitamins      mcg    17.000
389348                         Folate, DFE     Vitamins  mcg_DFE    17.000
389349                    Vitamin E, added     Vitamins       mg     0.000
389350                 Vitamin B-12, added     Vitamins      mcg     0.000
389351                         Cholesterol        Other       mg     0.000
389352        Fatty acids, total saturated        Other        g     0.072
389353  Fatty acids, total monounsaturated        Other        g     0.028
389354  Fatty acids, total polyunsaturated        Other        g     0.041

           id
0        1008
1        1008
2        1008
3        1008
4        1008
5        1008
6        1008
7        1008
8        1008
9        1008
10       1008
11       1008
12       1008
13       1008
14       1008
15       1008
16       1008
17       1008
18       1008
19       1008
20       1008
21       1008
22       1008
23       1008
24       1008
25       1008
26       1008
27       1008
28       1008
29       1008
...       ...
389325  43546
389326  43546
389327  43546
389328  43546
389329  43546
389330  43546
389331  43546
389332  43546
389333  43546
389334  43546
389335  43546
389336  43546
389337  43546
389338  43546
389339  43546
389340  43546
389341  43546
389342  43546
389343  43546
389344  43546
389345  43546
389346  43546
389347  43546
389348  43546
389349  43546
389350  43546
389351  43546
389352  43546
389353  43546
389354  43546

[389355 rows x 5 columns]

# The nutrient table contains fully duplicated rows; count how many there are.
nutrients.duplicated(keep='first').sum()

# Keep only the first occurrence of every duplicated row.
nutrients = nutrients.drop_duplicates(keep='first')

# Give the food-info columns unambiguous names so they do not collide with
# the identically named nutrient columns when the two tables are merged.
col_mapping = {'description': 'food', 'group': 'fgroup'}
# `copy=False` is omitted: the keyword is deprecated in recent pandas, and the
# result is rebound to `info` anyway, so no caller observes the difference.
info = info.rename(columns=col_mapping)
print(info)

                                                   food  \
0                                       Cheese, caraway
1                                       Cheese, cheddar
2                                          Cheese, edam
3                                          Cheese, feta
4                    Cheese, mozzarella, part skim milk
5      Cheese, mozzarella, part skim milk, low moisture
6                                        Cheese, romano
7                                     Cheese, roquefort
8     Cheese spread, pasteurized process, american, ...
9                           Cream, fluid, half and half
10    Sour dressing, non-butterfat, cultured, filled...
11    Milk, filled, fluid, with blend of hydrogenate...
12    Cream substitute, liquid, with lauric acid oil...
13                           Cream substitute, powdered
14                  Milk, producer, fluid, 3.7% milkfat
15    Milk, reduced fat, fluid, 2% milkfat, with add...
16    Milk, reduced fat, fluid, 2% milkfat, with add...
17    Milk, reduced fat, fluid, 2% milkfat, protein ...
18    Milk, lowfat, fluid, 1% milkfat, with added vi...
19    Milk, lowfat, fluid, 1% milkfat, with added no...
20    Milk, lowfat, fluid, 1% milkfat, protein forti...
21    Milk, nonfat, fluid, with added vitamin A and ...
22    Milk, nonfat, fluid, with added nonfat milk so...
23    Milk, nonfat, fluid, protein fortified, with a...
24            Milk, buttermilk, fluid, cultured, lowfat
25                              Milk, low sodium, fluid
26               Milk, dry, whole, with added vitamin D
27    Milk, dry, nonfat, regular, without added vita...
28    Milk, dry, nonfat, instant, with added vitamin...
29                   Milk, dry, nonfat, calcium reduced
...                                                 ...
6606  Beef, tenderloin, steak, separable lean only, ...
6607  Beef, top sirloin, steak, separable lean only,...
6608  Beef, short loin, top loin, steak, separable l...
6609  Beef, chuck, arm pot roast, separable lean onl...
6610  Beef, brisket, flat half, separable lean only,...
6611  Beef, chuck, arm pot roast, separable lean onl...
6612  Beef, brisket, flat half, separable lean only,...
6613  Beef, round, eye of round, roast, separable le...
6614  Beef, round, top round, steak, separable lean ...
6615  Beef, round, bottom round, roast, separable le...
6616  Beef, rib, small end (ribs 10-12), separable l...
6617  CAMPBELL Soup Company, CAMPBELL'S Red and Whit...
6618  CAMPBELL Soup Company, CAMPBELL's Red and Whit...
6619  CAMPBELL Soup Company, CAMPBELL'S SELECT Soups...
6620  CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND...
6621  CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND...
6622  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...
6623  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...
6624  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...
6625  CAMPBELL Soup Company, CAMPBELL'S Red and Whit...
6626  CAMPBELL Soup Company, V8 Vegetable Juice, Ess...
6627  CAMPBELL Soup Company, V8 Vegetable Juice, Spi...
6628  CAMPBELL Soup Company, PACE, Jalapenos Nacho S...
6629  CAMPBELL Soup Company, V8 60% Vegetable Juice,...
6630  CAMPBELL Soup Company, V8 Vegetable Juice, Low...
6631                             Bologna, beef, low fat
6632  Turkey and pork sausage, fresh, bulk, patty or...
6633                              Babyfood, juice, pear
6634         Babyfood, dessert, banana yogurt, strained
6635              Babyfood, banana no tapioca, strained

                                 fgroup     id       manufacturer
0                Dairy and Egg Products   1008
1                Dairy and Egg Products   1009
2                Dairy and Egg Products   1018
3                Dairy and Egg Products   1019
4                Dairy and Egg Products   1028
5                Dairy and Egg Products   1029
6                Dairy and Egg Products   1038
7                Dairy and Egg Products   1039
8                Dairy and Egg Products   1048
9                Dairy and Egg Products   1049
10               Dairy and Egg Products   1058
11               Dairy and Egg Products   1059
12               Dairy and Egg Products   1068
13               Dairy and Egg Products   1069
14               Dairy and Egg Products   1078
15               Dairy and Egg Products   1079               None
16               Dairy and Egg Products   1080
17               Dairy and Egg Products   1081
18               Dairy and Egg Products   1082
19               Dairy and Egg Products   1083
20               Dairy and Egg Products   1084
21               Dairy and Egg Products   1085
22               Dairy and Egg Products   1086
23               Dairy and Egg Products   1087
24               Dairy and Egg Products   1088
25               Dairy and Egg Products   1089
26               Dairy and Egg Products   1090
27               Dairy and Egg Products   1091
28               Dairy and Egg Products   1092
29               Dairy and Egg Products   1093
...                                 ...    ...                ...
6606                      Beef Products  23628
6607                      Beef Products  23629
6608                      Beef Products  23630
6609                      Beef Products  23631
6610                      Beef Products  23632
6611                      Beef Products  23633
6612                      Beef Products  23634
6613                      Beef Products  23635
6614                      Beef Products  23636
6615                      Beef Products  23637
6616                      Beef Products  23638
6617         Soups, Sauces, and Gravies  27015  Campbell Soup Co.
6618         Soups, Sauces, and Gravies  27016  Campbell Soup Co.
6619         Soups, Sauces, and Gravies  27021  Campbell Soup Co.
6620         Soups, Sauces, and Gravies  27022  Campbell Soup Co.
6621         Soups, Sauces, and Gravies  27023  Campbell Soup Co.
6622         Soups, Sauces, and Gravies  27024  Campbell Soup Co.
6623         Soups, Sauces, and Gravies  27025  Campbell Soup Co.
6624         Soups, Sauces, and Gravies  27026  Campbell Soup Co.
6625         Soups, Sauces, and Gravies  27032  Campbell Soup Co.
6626  Vegetables and Vegetable Products  31010  Campbell Soup Co.
6627  Vegetables and Vegetable Products  31013  Campbell Soup Co.
6628  Vegetables and Vegetable Products  31014  Campbell Soup Co.
6629  Vegetables and Vegetable Products  31016  Campbell Soup Co.
6630  Vegetables and Vegetable Products  31017  Campbell Soup Co.
6631        Sausages and Luncheon Meats  42161
6632        Sausages and Luncheon Meats  42173
6633                         Baby Foods  43408               None
6634                         Baby Foods  43539               None
6635                         Baby Foods  43546               None

[6636 rows x 4 columns]

# Give the nutrient columns unambiguous names so they do not collide with
# the food-info columns of the same name when the two tables are merged.
col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
# `copy=False` is omitted: the keyword is deprecated in recent pandas, and the
# result is rebound to `nutrients` anyway, so no caller observes the difference.
nutrients = nutrients.rename(columns=col_mapping)
print(nutrients)

                                  nutrient     nutgroup    units     value  \
0                                  Protein  Composition        g    25.180
1                        Total lipid (fat)  Composition        g    29.200
2              Carbohydrate, by difference  Composition        g     3.060
3                                      Ash        Other        g     3.280
4                                   Energy       Energy     kcal   376.000
5                                    Water  Composition        g    39.280
6                                   Energy       Energy       kJ  1573.000
7                     Fiber, total dietary  Composition        g     0.000
8                              Calcium, Ca     Elements       mg   673.000
9                                 Iron, Fe     Elements       mg     0.640
10                           Magnesium, Mg     Elements       mg    22.000
11                           Phosphorus, P     Elements       mg   490.000
12                            Potassium, K     Elements       mg    93.000
13                              Sodium, Na     Elements       mg   690.000
14                                Zinc, Zn     Elements       mg     2.940
15                              Copper, Cu     Elements       mg     0.024
16                           Manganese, Mn     Elements       mg     0.021
17                            Selenium, Se     Elements      mcg    14.500
18                           Vitamin A, IU     Vitamins       IU  1054.000
19                                 Retinol     Vitamins      mcg   262.000
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000
22                                 Thiamin     Vitamins       mg     0.031
23                              Riboflavin     Vitamins       mg     0.450
24                                  Niacin     Vitamins       mg     0.180
25                        Pantothenic acid     Vitamins       mg     0.190
26                             Vitamin B-6     Vitamins       mg     0.074
27                           Folate, total     Vitamins      mcg    18.000
28                            Vitamin B-12     Vitamins      mcg     0.270
29                              Folic acid     Vitamins      mcg     0.000
...                                    ...          ...      ...       ...
389325                        Selenium, Se     Elements      mcg     1.100
389326                       Vitamin A, IU     Vitamins       IU     5.000
389327                             Retinol     Vitamins      mcg     0.000
389328                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000
389329                      Carotene, beta     Vitamins      mcg     2.000
389330                     Carotene, alpha     Vitamins      mcg     2.000
389331        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250
389332                           Vitamin D     Vitamins       IU     0.000
389333                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000
389334                 Cryptoxanthin, beta     Vitamins      mcg     0.000
389335                            Lycopene     Vitamins      mcg     0.000
389336                 Lutein + zeaxanthin     Vitamins      mcg    20.000
389337      Vitamin C, total ascorbic acid     Vitamins       mg    21.900
389338                             Thiamin     Vitamins       mg     0.020
389339                          Riboflavin     Vitamins       mg     0.060
389340                              Niacin     Vitamins       mg     0.540
389341                         Vitamin B-6     Vitamins       mg     0.260
389342                       Folate, total     Vitamins      mcg    17.000
389343                        Vitamin B-12     Vitamins      mcg     0.000
389344                      Choline, total     Vitamins       mg     4.100
389345           Vitamin K (phylloquinone)     Vitamins      mcg     0.500
389346                          Folic acid     Vitamins      mcg     0.000
389347                        Folate, food     Vitamins      mcg    17.000
389348                         Folate, DFE     Vitamins  mcg_DFE    17.000
389349                    Vitamin E, added     Vitamins       mg     0.000
389350                 Vitamin B-12, added     Vitamins      mcg     0.000
389351                         Cholesterol        Other       mg     0.000
389352        Fatty acids, total saturated        Other        g     0.072
389353  Fatty acids, total monounsaturated        Other        g     0.028
389354  Fatty acids, total polyunsaturated        Other        g     0.041

           id
0        1008
1        1008
2        1008
3        1008
4        1008
5        1008
6        1008
7        1008
8        1008
9        1008
10       1008
11       1008
12       1008
13       1008
14       1008
15       1008
16       1008
17       1008
18       1008
19       1008
20       1008
21       1008
22       1008
23       1008
24       1008
25       1008
26       1008
27       1008
28       1008
29       1008
...       ...
389325  43546
389326  43546
389327  43546
389328  43546
389329  43546
389330  43546
389331  43546
389332  43546
389333  43546
389334  43546
389335  43546
389336  43546
389337  43546
389338  43546
389339  43546
389340  43546
389341  43546
389342  43546
389343  43546
389344  43546
389345  43546
389346  43546
389347  43546
389348  43546
389349  43546
389350  43546
389351  43546
389352  43546
389353  43546
389354  43546

[375176 rows x 5 columns]

# Outer-join every nutrient record with the info of the food it belongs to,
# matching rows on the shared `id` column.
ndata = nutrients.merge(info, how='outer', on='id')
print(ndata)

                                  nutrient     nutgroup    units     value  \
0                                  Protein  Composition        g    25.180
1                        Total lipid (fat)  Composition        g    29.200
2              Carbohydrate, by difference  Composition        g     3.060
3                                      Ash        Other        g     3.280
4                                   Energy       Energy     kcal   376.000
5                                    Water  Composition        g    39.280
6                                   Energy       Energy       kJ  1573.000
7                     Fiber, total dietary  Composition        g     0.000
8                              Calcium, Ca     Elements       mg   673.000
9                                 Iron, Fe     Elements       mg     0.640
10                           Magnesium, Mg     Elements       mg    22.000
11                           Phosphorus, P     Elements       mg   490.000
12                            Potassium, K     Elements       mg    93.000
13                              Sodium, Na     Elements       mg   690.000
14                                Zinc, Zn     Elements       mg     2.940
15                              Copper, Cu     Elements       mg     0.024
16                           Manganese, Mn     Elements       mg     0.021
17                            Selenium, Se     Elements      mcg    14.500
18                           Vitamin A, IU     Vitamins       IU  1054.000
19                                 Retinol     Vitamins      mcg   262.000
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000
22                                 Thiamin     Vitamins       mg     0.031
23                              Riboflavin     Vitamins       mg     0.450
24                                  Niacin     Vitamins       mg     0.180
25                        Pantothenic acid     Vitamins       mg     0.190
26                             Vitamin B-6     Vitamins       mg     0.074
27                           Folate, total     Vitamins      mcg    18.000
28                            Vitamin B-12     Vitamins      mcg     0.270
29                              Folic acid     Vitamins      mcg     0.000
...                                    ...          ...      ...       ...
375146                        Selenium, Se     Elements      mcg     1.100
375147                       Vitamin A, IU     Vitamins       IU     5.000
375148                             Retinol     Vitamins      mcg     0.000
375149                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000
375150                      Carotene, beta     Vitamins      mcg     2.000
375151                     Carotene, alpha     Vitamins      mcg     2.000
375152        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250
375153                           Vitamin D     Vitamins       IU     0.000
375154                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000
375155                 Cryptoxanthin, beta     Vitamins      mcg     0.000
375156                            Lycopene     Vitamins      mcg     0.000
375157                 Lutein + zeaxanthin     Vitamins      mcg    20.000
375158      Vitamin C, total ascorbic acid     Vitamins       mg    21.900
375159                             Thiamin     Vitamins       mg     0.020
375160                          Riboflavin     Vitamins       mg     0.060
375161                              Niacin     Vitamins       mg     0.540
375162                         Vitamin B-6     Vitamins       mg     0.260
375163                       Folate, total     Vitamins      mcg    17.000
375164                        Vitamin B-12     Vitamins      mcg     0.000
375165                      Choline, total     Vitamins       mg     4.100
375166           Vitamin K (phylloquinone)     Vitamins      mcg     0.500
375167                          Folic acid     Vitamins      mcg     0.000
375168                        Folate, food     Vitamins      mcg    17.000
375169                         Folate, DFE     Vitamins  mcg_DFE    17.000
375170                    Vitamin E, added     Vitamins       mg     0.000
375171                 Vitamin B-12, added     Vitamins      mcg     0.000
375172                         Cholesterol        Other       mg     0.000
375173        Fatty acids, total saturated        Other        g     0.072
375174  Fatty acids, total monounsaturated        Other        g     0.028
375175  Fatty acids, total polyunsaturated        Other        g     0.041

           id                                   food                  fgroup  \
0        1008                        Cheese, caraway  Dairy and Egg Products
1        1008                        Cheese, caraway  Dairy and Egg Products
2        1008                        Cheese, caraway  Dairy and Egg Products
3        1008                        Cheese, caraway  Dairy and Egg Products
4        1008                        Cheese, caraway  Dairy and Egg Products
5        1008                        Cheese, caraway  Dairy and Egg Products
6        1008                        Cheese, caraway  Dairy and Egg Products
7        1008                        Cheese, caraway  Dairy and Egg Products
8        1008                        Cheese, caraway  Dairy and Egg Products
9        1008                        Cheese, caraway  Dairy and Egg Products
10       1008                        Cheese, caraway  Dairy and Egg Products
11       1008                        Cheese, caraway  Dairy and Egg Products
12       1008                        Cheese, caraway  Dairy and Egg Products
13       1008                        Cheese, caraway  Dairy and Egg Products
14       1008                        Cheese, caraway  Dairy and Egg Products
15       1008                        Cheese, caraway  Dairy and Egg Products
16       1008                        Cheese, caraway  Dairy and Egg Products
17       1008                        Cheese, caraway  Dairy and Egg Products
18       1008                        Cheese, caraway  Dairy and Egg Products
19       1008                        Cheese, caraway  Dairy and Egg Products
20       1008                        Cheese, caraway  Dairy and Egg Products
21       1008                        Cheese, caraway  Dairy and Egg Products
22       1008                        Cheese, caraway  Dairy and Egg Products
23       1008                        Cheese, caraway  Dairy and Egg Products
24       1008                        Cheese, caraway  Dairy and Egg Products
25       1008                        Cheese, caraway  Dairy and Egg Products
26       1008                        Cheese, caraway  Dairy and Egg Products
27       1008                        Cheese, caraway  Dairy and Egg Products
28       1008                        Cheese, caraway  Dairy and Egg Products
29       1008                        Cheese, caraway  Dairy and Egg Products
...       ...                                    ...                     ...
375146  43546  Babyfood, banana no tapioca, strained              Baby Foods
375147  43546  Babyfood, banana no tapioca, strained              Baby Foods
375148  43546  Babyfood, banana no tapioca, strained              Baby Foods
375149  43546  Babyfood, banana no tapioca, strained              Baby Foods
375150  43546  Babyfood, banana no tapioca, strained              Baby Foods
375151  43546  Babyfood, banana no tapioca, strained              Baby Foods
375152  43546  Babyfood, banana no tapioca, strained              Baby Foods
375153  43546  Babyfood, banana no tapioca, strained              Baby Foods
375154  43546  Babyfood, banana no tapioca, strained              Baby Foods
375155  43546  Babyfood, banana no tapioca, strained              Baby Foods
375156  43546  Babyfood, banana no tapioca, strained              Baby Foods
375157  43546  Babyfood, banana no tapioca, strained              Baby Foods
375158  43546  Babyfood, banana no tapioca, strained              Baby Foods
375159  43546  Babyfood, banana no tapioca, strained              Baby Foods
375160  43546  Babyfood, banana no tapioca, strained              Baby Foods
375161  43546  Babyfood, banana no tapioca, strained              Baby Foods
375162  43546  Babyfood, banana no tapioca, strained              Baby Foods
375163  43546  Babyfood, banana no tapioca, strained              Baby Foods
375164  43546  Babyfood, banana no tapioca, strained              Baby Foods
375165  43546  Babyfood, banana no tapioca, strained              Baby Foods
375166  43546  Babyfood, banana no tapioca, strained              Baby Foods
375167  43546  Babyfood, banana no tapioca, strained              Baby Foods
375168  43546  Babyfood, banana no tapioca, strained              Baby Foods
375169  43546  Babyfood, banana no tapioca, strained              Baby Foods
375170  43546  Babyfood, banana no tapioca, strained              Baby Foods
375171  43546  Babyfood, banana no tapioca, strained              Baby Foods
375172  43546  Babyfood, banana no tapioca, strained              Baby Foods
375173  43546  Babyfood, banana no tapioca, strained              Baby Foods
375174  43546  Babyfood, banana no tapioca, strained              Baby Foods
375175  43546  Babyfood, banana no tapioca, strained              Baby Foods

        manufacturer
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
...             ...
375146         None
375147         None
375148         None
375149         None
375150         None
375151         None
375152         None
375153         None
375154         None
375155         None
375156         None
375157         None
375158         None
375159         None
375160         None
375161         None
375162         None
375163         None
375164         None
375165         None
375166         None
375167         None
375168         None
375169         None
375170         None
375171         None
375172         None
375173         None
375174         None
375175         None

[375176 rows x 8 columns]

# Inspect a single merged record: the row whose index label is 30000.
ndata.loc[30000]

nutrient                                       Glycine
nutgroup                                   Amino Acids
units                                                g
value                                             0.04
id                                                6158
food            Soup, tomato bisque, canned, condensed
fgroup                      Soups, Sauces, and Gravies
manufacturer
Name: 30000, dtype: object

# Median (0.5 quantile) of `value` for every (nutrient, food group) pair.
values_by_nutrient_and_group = ndata.groupby(['nutrient', 'fgroup'])['value']
result = values_by_nutrient_and_group.quantile(0.5)
print(result)

nutrient          fgroup
Adjusted Protein  Sweets                               12.9000
                  Vegetables and Vegetable Products     2.1800
Alanine           Baby Foods                            0.0850
                  Baked Products                        0.2480
                  Beef Products                         1.5500
                  Beverages                             0.0030
                  Breakfast Cereals                     0.3110
                  Cereal Grains and Pasta               0.3730
                  Dairy and Egg Products                0.2710
                  Ethnic Foods                          1.2900
                  Fast Foods                            0.5140
                  Fats and Oils                         0.0000
                  Finfish and Shellfish Products        1.2180
                  Fruits and Fruit Juices               0.0270
                  Lamb, Veal, and Game Products         1.4080
                  Legumes and Legume Products           0.4100
                  Meals, Entrees, and Sidedishes        0.3270
                  Nut and Seed Products                 0.7345
                  Pork Products                         1.3070
                  Poultry Products                      1.3940
                  Restaurant Foods                      0.4650
                  Sausages and Luncheon Meats           0.9420
                  Snacks                                0.4335
                  Soups, Sauces, and Gravies            0.0650
                  Spices and Herbs                      0.5550
                  Sweets                                0.1020
                  Vegetables and Vegetable Products     0.0840
Alcohol, ethyl    Baby Foods                            0.0000
                  Baked Products                        0.0000
                  Beef Products                         0.0000
                                                         ...
Water             Snacks                                3.5200
                  Soups, Sauces, and Gravies           85.9000
                  Spices and Herbs                     43.6700
                  Sweets                                9.0500
                  Vegetables and Vegetable Products    89.1950
Zinc, Zn          Baby Foods                            0.5900
                  Baked Products                        0.6600
                  Beef Products                         5.3900
                  Beverages                             0.0400
                  Breakfast Cereals                     2.8850
                  Cereal Grains and Pasta               1.0900
                  Dairy and Egg Products                1.3900
                  Ethnic Foods                          1.0450
                  Fast Foods                            1.2500
                  Fats and Oils                         0.0200
                  Finfish and Shellfish Products        0.6700
                  Fruits and Fruit Juices               0.1000
                  Lamb, Veal, and Game Products         3.9400
                  Legumes and Legume Products           1.1400
                  Meals, Entrees, and Sidedishes        0.6300
                  Nut and Seed Products                 3.2900
                  Pork Products                         2.3200
                  Poultry Products                      2.5000
                  Restaurant Foods                      0.8000
                  Sausages and Luncheon Meats           2.1300
                  Snacks                                1.4700
                  Soups, Sauces, and Gravies            0.2000
                  Spices and Herbs                      2.7500
                  Sweets                                0.3600
                  Vegetables and Vegetable Products     0.3300
Name: value, Length: 2246, dtype: float64

# Horizontal bar chart of the median zinc content per food group, sorted
# ascending. Series.order() raises AttributeError here ("'Series' object has
# no attribute 'order'"), so sort_values() is used instead.
zinc_by_group = result['Zinc, Zn'].sort_values()
zinc_by_group.plot.barh()

<matplotlib.axes._subplots.AxesSubplot at 0x2156ce3c748>

# Group the merged table by nutrient group and nutrient so that each group
# holds one nutrient's measurements across all foods.
by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])


def get_maximum(x):
    """Return the row of group *x* whose ``value`` is the largest."""
    return x.xs(x.value.idxmax())


def get_minimum(x):
    """Return the row of group *x* whose ``value`` is the smallest."""
    return x.xs(x.value.idxmin())


# For every nutrient, keep the peak value and the food that provides it.
max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
print(max_foods)

                                                 value  \
nutgroup    nutrient
Amino Acids Alanine                              8.009
            Arginine                             7.436
            Aspartic acid                       10.203
            Cystine                              1.307
            Glutamic acid                       17.452
            Glycine                             19.049
            Histidine                            2.999
            Hydroxyproline                       0.803
            Isoleucine                           4.300
            Leucine                              7.200
            Lysine                               6.690
            Methionine                           1.859
            Phenylalanine                        4.600
            Proline                             12.295
            Serine                               4.600
            Threonine                            3.300
            Tryptophan                           1.600
            Tyrosine                             3.300
            Valine                               4.500
Composition Adjusted Protein                    12.900
            Carbohydrate, by difference        100.000
            Fiber, total dietary                79.000
            Protein                             88.320
            Sugars, total                       99.800
            Total lipid (fat)                  100.000
            Water                              100.000
Elements    Calcium, Ca                       7364.000
            Copper, Cu                          15.050
            Fluoride, F                        584.000
            Iron, Fe                            87.470
...                                                ...
Vitamins    Cryptoxanthin, beta               6186.000
            Dihydrophylloquinone               103.800
            Folate, DFE                       2630.000
            Folate, food                      2340.000
            Folate, total                     2340.000
            Folic acid                        1538.000
            Lutein + zeaxanthin              39550.000
            Lycopene                         46260.000
            Menaquinone-4                       33.200
            Niacin                              97.000
            Pantothenic acid                    35.000
            Retinol                          30000.000
            Riboflavin                          14.300
            Thiamin                             20.000
            Tocopherol, beta                     6.490
            Tocopherol, delta                   30.880
            Tocopherol, gamma                  100.880
            Vitamin A, IU                   100000.000
            Vitamin A, RAE                   30000.000
            Vitamin B-12                        98.890
            Vitamin B-12, added                 24.000
            Vitamin B-6                         12.000
            Vitamin C, total ascorbic acid    2400.000
            Vitamin D                        10000.000
            Vitamin D (D2 + D3)                250.000
            Vitamin D2 (ergocalciferol)         28.100
            Vitamin D3 (cholecalciferol)        27.400
            Vitamin E (alpha-tocopherol)       149.400
            Vitamin E, added                    46.550
            Vitamin K (phylloquinone)         1714.500

                                                                                         food
nutgroup    nutrient
Amino Acids Alanine                                         Gelatins, dry powder, unsweetened
            Arginine                                             Seeds, sesame flour, low-fat
            Aspartic acid                                                 Soy protein isolate
            Cystine                              Seeds, cottonseed flour, low fat (glandless)
            Glutamic acid                                                 Soy protein isolate
            Glycine                                         Gelatins, dry powder, unsweetened
            Histidine                              Whale, beluga, meat, dried (Alaska Native)
            Hydroxyproline                  KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
            Isoleucine                      Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
            Leucine                         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
            Lysine                          Seal, bearded (Oogruk), meat, dried (Alaska Na...
            Methionine                                  Fish, cod, Atlantic, dried and salted
            Phenylalanine                   Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
            Proline                                         Gelatins, dry powder, unsweetened
            Serine                          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
            Threonine                       Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
            Tryptophan                       Sea lion, Steller, meat with fat (Alaska Native)
            Tyrosine                        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
            Valine                          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Composition Adjusted Protein                           Baking chocolate, unsweetened, squares
            Carbohydrate, by difference           Sweeteners, tabletop, fructose, dry, powder
            Fiber, total dietary                                             Corn bran, crude
            Protein                         Soy protein isolate, potassium type, crude pro...
            Sugars, total                                                  Sugars, granulated
            Total lipid (fat)                                                 Oil, wheat germ
            Water                                               Water, bottled, POLAND SPRING
Elements    Calcium, Ca                     Leavening agents, baking powder, double-acting...
            Copper, Cu                      Veal, variety meats and by-products, liver, co...
            Fluoride, F                     Tea, instant, sweetened with sugar, lemon-flav...
            Iron, Fe                            Salad dressing, russian dressing, low calorie
...                                                                                       ...
Vitamins    Cryptoxanthin, beta                                               Spices, paprika  Dihydrophylloquinone            Margarine, 80% fat, stick, includes regular an...  Folate, DFE                            Cereals ready-to-eat, QUAKER, CAP'N CRUNCH  Folate, food                         Leavening agents, yeast, baker's, active dry  Folate, total                        Leavening agents, yeast, baker's, active dry  Folic acid                             Cereals ready-to-eat, QUAKER, CAP'N CRUNCH  Lutein + zeaxanthin                                                     Kale, raw  Lycopene                                                            Tomato powder  Menaquinone-4                   Chicken, broilers or fryers, drumstick, meat a...  Niacin                                                       Yeast extract spread  Pantothenic acid                Cereals ready-to-eat, KELLOGG, KELLOGG'S Compl...  Retinol                                                       Fish oil, cod liver  Riboflavin                                                   Yeast extract spread  Thiamin                         MORNINGSTAR FARMS Hot and Spicy Veggie Sausage...  Tocopherol, beta                Yellow pond lily, Wocas, dried seeds (Pacific ...  Tocopherol, delta                 Oil, cooking and salad, ENOVA, 80% diglycerides  Tocopherol, gamma                 Oil, cooking and salad, ENOVA, 80% diglycerides  Vitamin A, IU                                                 Fish oil, cod liver  Vitamin A, RAE                                                Fish oil, cod liver  Vitamin B-12                    Mollusks, clam, mixed species, cooked, moist heat  Vitamin B-12, added             Cereals ready-to-eat, KELLOGG, KELLOGG'S ALL-B...  Vitamin B-6                     Cereals ready-to-eat, KELLOGG, KELLOGG'S ALL-B...  Vitamin C, total ascorbic acid  Orange-flavor drink, breakfast type, low calor...  
Vitamin D                                                     Fish oil, cod liver  Vitamin D (D2 + D3)                                           Fish oil, cod liver  Vitamin D2 (ergocalciferol)                               Mushrooms, maitake, raw  Vitamin D3 (cholecalciferol)                        Fish, halibut, Greenland, raw  Vitamin E (alpha-tocopherol)                                      Oil, wheat germ  Vitamin E, added                Cereals ready-to-eat, GENERAL MILLS, Multi-Gra...  Vitamin K (phylloquinone)                                    Spices, sage, ground  [94 rows x 2 columns]

max_foods.loc['Amino Acids']['food']

nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine            Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                    Fish, cod, Atlantic, dried and salted
Phenylalanine     Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                           Gelatins, dry powder, unsweetened
Serine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan         Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object

参考资料：炼数成金Python数据分析课程

Python数据分析_第06课：数据清洗与初步分析_笔记相关推荐

【Python数据分析】数据预处理1——数据清洗
在数据挖掘过程中，数据预处理是占比很大的一部分工作。数据预处理过程主要有以下几个部分：1. 数据清洗 → 2. 数据集成 → 3. 数据变换 → 4. 数据规约。本文介绍数据清洗部分。一、缺失值分析与处理 1. ...
Python数据分析实战，美国总统大选数据可视化分析[基于pandas]
目录：前言；一、任务详情；二、数据集来源；三、实现过程；四、运行代码。前言：学习Python数据分析的过程离不开实战。今天给大家带来数据分析可视化经典项目——美国总统大选数据可视化分析，希望 ...
python数据分析工资_python3对拉勾数据进行可视化分析的方法详解
前言上回说到我们如何把拉勾的数据抓取下来的,既然获取了数据,就别放着不动,把它拿出来分析一下,看看这些数据里面都包含了什么信息. 下面话不多说了,来一起看看详细的介绍吧一.前期准备由于上次抓的数 ...
python数据分析之产品销量时序分析与商品关联分析
这是我们之前的课后作业：根据自己的想法对这个数据进行分析，只要求写出五个点就可以了，因此我就对这些数据分析了一番。涉及的python知识点还是挺多的，包括了python连接数据库、SQL提取数 ...
python数据分析与可视化清华大学_【官方正版】 Python数据分析与可视化微课视频版清华大学出版社魏伟一李晓红软件工具程序设计...
第1章数据分析与可视化概述 1．1数据分析 1．2数据可视化 1．3数据分析与可视化常用工具 1．4为何选用Python进行数据分析与可视化 1．5Python数据分析与可视化常用类库 1．6Jupy ...
python数据分析与人工智能_正版 Python数据分析与可视化微课视频版魏伟一李晓红大数据与人工智能技术丛书程序源码...
第1章数据分析与可视化概述 1．1数据分析 1．2数据可视化 1．3数据分析与可视化常用工具 1．4为何选用Python进行数据分析与可视化 1．5Python数据分析与可视化常用类库 1．6Jupy ...
Python数据分析之数据预处理（数据清洗、数据合并、数据重塑、数据转换）学习笔记
文章目录 1. 数据清洗 1.1 空值和缺失值的处理 1.1.1 使用isnull()和notnull()函数 1.1.1.1 isnull()语法格式: 1.1.1.2 notnull()语法格式: ...
Python数据分析第十二课：单变量、双变量及多变量分析图
一.单变量分析绘图什么是单变量分析? 单变量其实就是我们通常接触到的数据集中的一列数据. 单变量分析是数据分析中最简单的形式,其中被分析的数据只包含一个变量.因为它是一个单一的变量,它不处理原因或关 ...
python数据分析、整理、汇总展示_python-数据分析与展示（Numpy、matplotlib、pandas）---2...
笔记内容整理自mooc上北京理工大学嵩天老师python系列课程数据分析与展示,本人小白一枚,如有不对,多加指正 1.python自带的图像库PIL 1.1常用API Image.open() Ima ...

Python数据分析_第06课：数据清洗与初步分析_笔记

文章目录

缺失值处理——拉格朗日插值法

dataframe合并

索引上的合并

轴向连接

合并重叠数据

重塑层次化索引

长宽格式的转换

移除重复数据

利用函数或映射进行数据转换

数据标准化

最小-最大规范化

零-均值规范化

小数定标规范化

替换值

重命名轴索引

离散化与面元划分

检测和过滤异常值

排列与随机采样

计算指标与哑变量

属性构造

字符串对象方法

正则表达式

pandas中矢量化的字符串函数

示例：USDA食品数据库

Python数据分析_第06课：数据清洗与初步分析_笔记相关推荐

最新文章

热门文章

	data1	key	data2
0	0.0	b	1.0
1	1.0	b	1.0
2	6.0	b	1.0
3	2.0	a	0.0
4	4.0	a	0.0
5	5.0	a	0.0
6	3.0	c	NaN
7	NaN	d	2.0

	data1	key	data2
0	0	b	1.0
1	0	b	3.0
2	1	b	1.0
3	1	b	3.0
4	2	a	0.0
5	2	a	2.0
6	3	c	NaN
7	4	a	0.0
8	4	a	2.0
9	5	b	1.0
10	5	b	3.0

	key1	key2	lval	rval
0	foo	one	1.0	4.0
1	foo	one	1.0	5.0
2	foo	two	2.0	NaN
3	bar	one	3.0	6.0
4	bar	two	NaN	7.0

	key1	key2_x	lval	key2_y	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

	key1	key2_left	lval	key2_right	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

	one	two	three
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

	(0.0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1.0]
0	0	0	0	0	1
1	0	1	0	0	0
2	1	0	0	0	0
3	0	1	0	0	0
4	0	0	1	0	0
5	0	0	1	0	0
6	0	0	0	0	1
7	0	0	0	1	0
8	0	0	0	1	0
9	0	0	0	1	0

	data1	key	data2
0	0.0	b	1.0
1	1.0	b	1.0
2	6.0	b	1.0
3	2.0	a	0.0
4	4.0	a	0.0
5	5.0	a	0.0
6	3.0	c	NaN
7	NaN	d	2.0

	data1	key	data2
0	0	b	1.0
1	0	b	3.0
2	1	b	1.0
3	1	b	3.0
4	2	a	0.0
5	2	a	2.0
6	3	c	NaN
7	4	a	0.0
8	4	a	2.0
9	5	b	1.0
10	5	b	3.0

	key1	key2	lval	rval
0	foo	one	1.0	4.0
1	foo	one	1.0	5.0
2	foo	two	2.0	NaN
3	bar	one	3.0	6.0
4	bar	two	NaN	7.0

	key1	key2_x	lval	key2_y	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

	key1	key2_left	lval	key2_right	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

	one	two	three
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

	(0.0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1.0]
0	0	0	0	0	1
1	0	1	0	0	0
2	1	0	0	0	0
3	0	1	0	0	0
4	0	0	1	0	0
5	0	0	1	0	0
6	0	0	0	0	1
7	0	0	0	1	0
8	0	0	0	1	0
9	0	0	0	1	0

	data1	key	data2
0	0.0	b	1.0
1	1.0	b	1.0
2	6.0	b	1.0
3	2.0	a	0.0
4	4.0	a	0.0
5	5.0	a	0.0
6	3.0	c	NaN
7	NaN	d	2.0

	data1	key	data2
0	0	b	1.0
1	0	b	3.0
2	1	b	1.0
3	1	b	3.0
4	2	a	0.0
5	2	a	2.0
6	3	c	NaN
7	4	a	0.0
8	4	a	2.0
9	5	b	1.0
10	5	b	3.0

	key1	key2	lval	rval
0	foo	one	1.0	4.0
1	foo	one	1.0	5.0
2	foo	two	2.0	NaN
3	bar	one	3.0	6.0
4	bar	two	NaN	7.0

	key1	key2_x	lval	key2_y	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

	key1	key2_left	lval	key2_right	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

	one	two	three
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

	(0.0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1.0]
0	0	0	0	0	1
1	0	1	0	0	0
2	1	0	0	0	0
3	0	1	0	0	0
4	0	0	1	0	0
5	0	0	1	0	0
6	0	0	0	0	1
7	0	0	0	1	0
8	0	0	0	1	0
9	0	0	0	1	0