数据清洗
1.处理缺失数据
import pandas as pdlog_data = pd.read_csv('log.csv')
log_data
|
time
|
user
|
video
|
playback position
|
paused
|
volume
|
0
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
1
|
1469974454
|
cheryl
|
intro.html
|
6
|
NaN
|
NaN
|
2
|
1469974544
|
cheryl
|
intro.html
|
9
|
NaN
|
NaN
|
3
|
1469974574
|
cheryl
|
intro.html
|
10
|
NaN
|
NaN
|
4
|
1469977514
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
5
|
1469977544
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
6
|
1469977574
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
7
|
1469977604
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
8
|
1469974604
|
cheryl
|
intro.html
|
11
|
NaN
|
NaN
|
9
|
1469974694
|
cheryl
|
intro.html
|
14
|
NaN
|
NaN
|
10
|
1469974724
|
cheryl
|
intro.html
|
15
|
NaN
|
NaN
|
11
|
1469974454
|
sue
|
advanced.html
|
24
|
NaN
|
NaN
|
12
|
1469974524
|
sue
|
advanced.html
|
25
|
NaN
|
NaN
|
13
|
1469974424
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
14
|
1469974554
|
sue
|
advanced.html
|
26
|
NaN
|
NaN
|
15
|
1469974624
|
sue
|
advanced.html
|
27
|
NaN
|
NaN
|
16
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
17
|
1469974724
|
sue
|
advanced.html
|
29
|
NaN
|
NaN
|
18
|
1469974484
|
cheryl
|
intro.html
|
7
|
NaN
|
NaN
|
19
|
1469974514
|
cheryl
|
intro.html
|
8
|
NaN
|
NaN
|
20
|
1469974754
|
sue
|
advanced.html
|
30
|
NaN
|
NaN
|
21
|
1469974824
|
sue
|
advanced.html
|
31
|
NaN
|
NaN
|
22
|
1469974854
|
sue
|
advanced.html
|
32
|
NaN
|
NaN
|
23
|
1469974924
|
sue
|
advanced.html
|
33
|
NaN
|
NaN
|
24
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
25
|
1469977454
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
26
|
1469977484
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
27
|
1469977634
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
28
|
1469977664
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
29
|
1469974634
|
cheryl
|
intro.html
|
12
|
NaN
|
NaN
|
30
|
1469974664
|
cheryl
|
intro.html
|
13
|
NaN
|
NaN
|
31
|
1469977694
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
32
|
1469977724
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
log_data.isnull()
|
time
|
user
|
video
|
playback position
|
paused
|
volume
|
0
|
False
|
False
|
False
|
False
|
False
|
False
|
1
|
False
|
False
|
False
|
False
|
True
|
True
|
2
|
False
|
False
|
False
|
False
|
True
|
True
|
3
|
False
|
False
|
False
|
False
|
True
|
True
|
4
|
False
|
False
|
False
|
False
|
True
|
True
|
5
|
False
|
False
|
False
|
False
|
True
|
True
|
6
|
False
|
False
|
False
|
False
|
True
|
True
|
7
|
False
|
False
|
False
|
False
|
True
|
True
|
8
|
False
|
False
|
False
|
False
|
True
|
True
|
9
|
False
|
False
|
False
|
False
|
True
|
True
|
10
|
False
|
False
|
False
|
False
|
True
|
True
|
11
|
False
|
False
|
False
|
False
|
True
|
True
|
12
|
False
|
False
|
False
|
False
|
True
|
True
|
13
|
False
|
False
|
False
|
False
|
False
|
False
|
14
|
False
|
False
|
False
|
False
|
True
|
True
|
15
|
False
|
False
|
False
|
False
|
True
|
True
|
16
|
False
|
False
|
False
|
False
|
True
|
False
|
17
|
False
|
False
|
False
|
False
|
True
|
True
|
18
|
False
|
False
|
False
|
False
|
True
|
True
|
19
|
False
|
False
|
False
|
False
|
True
|
True
|
20
|
False
|
False
|
False
|
False
|
True
|
True
|
21
|
False
|
False
|
False
|
False
|
True
|
True
|
22
|
False
|
False
|
False
|
False
|
True
|
True
|
23
|
False
|
False
|
False
|
False
|
True
|
True
|
24
|
False
|
False
|
False
|
False
|
False
|
False
|
25
|
False
|
False
|
False
|
False
|
True
|
True
|
26
|
False
|
False
|
False
|
False
|
True
|
True
|
27
|
False
|
False
|
False
|
False
|
True
|
True
|
28
|
False
|
False
|
False
|
False
|
True
|
True
|
29
|
False
|
False
|
False
|
False
|
True
|
True
|
30
|
False
|
False
|
False
|
False
|
True
|
True
|
31
|
False
|
False
|
False
|
False
|
True
|
True
|
32
|
False
|
False
|
False
|
False
|
True
|
True
|
log_data['paused'].isnull()
0 False
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 False
14 True
15 True
16 True
17 True
18 True
19 True
20 True
21 True
22 True
23 True
24 False
25 True
26 True
27 True
28 True
29 True
30 True
31 True
32 True
Name: paused, dtype: bool
# 取出volume不为空的数据
log_data[log_data['volume'].notnull()]
|
time
|
user
|
video
|
playback position
|
paused
|
volume
|
0
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
13
|
1469974424
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
16
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
24
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
log_data.set_index(['time', 'user'], inplace=True)
log_data.sort_index(inplace=True)
log_data
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
NaN
|
NaN
|
sue
|
advanced.html
|
24
|
NaN
|
NaN
|
1469974484
|
cheryl
|
intro.html
|
7
|
NaN
|
NaN
|
1469974514
|
cheryl
|
intro.html
|
8
|
NaN
|
NaN
|
1469974524
|
sue
|
advanced.html
|
25
|
NaN
|
NaN
|
1469974544
|
cheryl
|
intro.html
|
9
|
NaN
|
NaN
|
1469974554
|
sue
|
advanced.html
|
26
|
NaN
|
NaN
|
1469974574
|
cheryl
|
intro.html
|
10
|
NaN
|
NaN
|
1469974604
|
cheryl
|
intro.html
|
11
|
NaN
|
NaN
|
1469974624
|
sue
|
advanced.html
|
27
|
NaN
|
NaN
|
1469974634
|
cheryl
|
intro.html
|
12
|
NaN
|
NaN
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
NaN
|
NaN
|
1469974694
|
cheryl
|
intro.html
|
14
|
NaN
|
NaN
|
1469974724
|
cheryl
|
intro.html
|
15
|
NaN
|
NaN
|
sue
|
advanced.html
|
29
|
NaN
|
NaN
|
1469974754
|
sue
|
advanced.html
|
30
|
NaN
|
NaN
|
1469974824
|
sue
|
advanced.html
|
31
|
NaN
|
NaN
|
1469974854
|
sue
|
advanced.html
|
32
|
NaN
|
NaN
|
1469974924
|
sue
|
advanced.html
|
33
|
NaN
|
NaN
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977484
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977514
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977544
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977574
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977604
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977634
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977664
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977694
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977724
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
log_data.fillna(0)
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
0
|
0.0
|
sue
|
advanced.html
|
24
|
0
|
0.0
|
1469974484
|
cheryl
|
intro.html
|
7
|
0
|
0.0
|
1469974514
|
cheryl
|
intro.html
|
8
|
0
|
0.0
|
1469974524
|
sue
|
advanced.html
|
25
|
0
|
0.0
|
1469974544
|
cheryl
|
intro.html
|
9
|
0
|
0.0
|
1469974554
|
sue
|
advanced.html
|
26
|
0
|
0.0
|
1469974574
|
cheryl
|
intro.html
|
10
|
0
|
0.0
|
1469974604
|
cheryl
|
intro.html
|
11
|
0
|
0.0
|
1469974624
|
sue
|
advanced.html
|
27
|
0
|
0.0
|
1469974634
|
cheryl
|
intro.html
|
12
|
0
|
0.0
|
1469974654
|
sue
|
advanced.html
|
28
|
0
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
0
|
0.0
|
1469974694
|
cheryl
|
intro.html
|
14
|
0
|
0.0
|
1469974724
|
cheryl
|
intro.html
|
15
|
0
|
0.0
|
sue
|
advanced.html
|
29
|
0
|
0.0
|
1469974754
|
sue
|
advanced.html
|
30
|
0
|
0.0
|
1469974824
|
sue
|
advanced.html
|
31
|
0
|
0.0
|
1469974854
|
sue
|
advanced.html
|
32
|
0
|
0.0
|
1469974924
|
sue
|
advanced.html
|
33
|
0
|
0.0
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977484
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977514
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977544
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977574
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977604
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977634
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977664
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977694
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977724
|
bob
|
intro.html
|
1
|
0
|
0.0
|
log_data.dropna(subset=['volume'])
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
log_data
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
NaN
|
NaN
|
sue
|
advanced.html
|
24
|
NaN
|
NaN
|
1469974484
|
cheryl
|
intro.html
|
7
|
NaN
|
NaN
|
1469974514
|
cheryl
|
intro.html
|
8
|
NaN
|
NaN
|
1469974524
|
sue
|
advanced.html
|
25
|
NaN
|
NaN
|
1469974544
|
cheryl
|
intro.html
|
9
|
NaN
|
NaN
|
1469974554
|
sue
|
advanced.html
|
26
|
NaN
|
NaN
|
1469974574
|
cheryl
|
intro.html
|
10
|
NaN
|
NaN
|
1469974604
|
cheryl
|
intro.html
|
11
|
NaN
|
NaN
|
1469974624
|
sue
|
advanced.html
|
27
|
NaN
|
NaN
|
1469974634
|
cheryl
|
intro.html
|
12
|
NaN
|
NaN
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
NaN
|
NaN
|
1469974694
|
cheryl
|
intro.html
|
14
|
NaN
|
NaN
|
1469974724
|
cheryl
|
intro.html
|
15
|
NaN
|
NaN
|
sue
|
advanced.html
|
29
|
NaN
|
NaN
|
1469974754
|
sue
|
advanced.html
|
30
|
NaN
|
NaN
|
1469974824
|
sue
|
advanced.html
|
31
|
NaN
|
NaN
|
1469974854
|
sue
|
advanced.html
|
32
|
NaN
|
NaN
|
1469974924
|
sue
|
advanced.html
|
33
|
NaN
|
NaN
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977484
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977514
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977544
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977574
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977604
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977634
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977664
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977694
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977724
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
log_data.ffill()
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
False
|
10.0
|
sue
|
advanced.html
|
24
|
False
|
10.0
|
1469974484
|
cheryl
|
intro.html
|
7
|
False
|
10.0
|
1469974514
|
cheryl
|
intro.html
|
8
|
False
|
10.0
|
1469974524
|
sue
|
advanced.html
|
25
|
False
|
10.0
|
1469974544
|
cheryl
|
intro.html
|
9
|
False
|
10.0
|
1469974554
|
sue
|
advanced.html
|
26
|
False
|
10.0
|
1469974574
|
cheryl
|
intro.html
|
10
|
False
|
10.0
|
1469974604
|
cheryl
|
intro.html
|
11
|
False
|
10.0
|
1469974624
|
sue
|
advanced.html
|
27
|
False
|
10.0
|
1469974634
|
cheryl
|
intro.html
|
12
|
False
|
10.0
|
1469974654
|
sue
|
advanced.html
|
28
|
False
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
False
|
5.0
|
1469974694
|
cheryl
|
intro.html
|
14
|
False
|
5.0
|
1469974724
|
cheryl
|
intro.html
|
15
|
False
|
5.0
|
sue
|
advanced.html
|
29
|
False
|
5.0
|
1469974754
|
sue
|
advanced.html
|
30
|
False
|
5.0
|
1469974824
|
sue
|
advanced.html
|
31
|
False
|
5.0
|
1469974854
|
sue
|
advanced.html
|
32
|
False
|
5.0
|
1469974924
|
sue
|
advanced.html
|
33
|
False
|
5.0
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977484
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977514
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977544
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977574
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977604
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977634
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977664
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977694
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977724
|
bob
|
intro.html
|
1
|
True
|
10.0
|
log_data.bfill()
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
True
|
5.0
|
sue
|
advanced.html
|
24
|
True
|
5.0
|
1469974484
|
cheryl
|
intro.html
|
7
|
True
|
5.0
|
1469974514
|
cheryl
|
intro.html
|
8
|
True
|
5.0
|
1469974524
|
sue
|
advanced.html
|
25
|
True
|
5.0
|
1469974544
|
cheryl
|
intro.html
|
9
|
True
|
5.0
|
1469974554
|
sue
|
advanced.html
|
26
|
True
|
5.0
|
1469974574
|
cheryl
|
intro.html
|
10
|
True
|
5.0
|
1469974604
|
cheryl
|
intro.html
|
11
|
True
|
5.0
|
1469974624
|
sue
|
advanced.html
|
27
|
True
|
5.0
|
1469974634
|
cheryl
|
intro.html
|
12
|
True
|
5.0
|
1469974654
|
sue
|
advanced.html
|
28
|
True
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
True
|
10.0
|
1469974694
|
cheryl
|
intro.html
|
14
|
True
|
10.0
|
1469974724
|
cheryl
|
intro.html
|
15
|
True
|
10.0
|
sue
|
advanced.html
|
29
|
True
|
10.0
|
1469974754
|
sue
|
advanced.html
|
30
|
True
|
10.0
|
1469974824
|
sue
|
advanced.html
|
31
|
True
|
10.0
|
1469974854
|
sue
|
advanced.html
|
32
|
True
|
10.0
|
1469974924
|
sue
|
advanced.html
|
33
|
True
|
10.0
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977484
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977514
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977544
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977574
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977604
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977634
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977664
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977694
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977724
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
2.数据变形
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],'k2': [1, 1, 2, 3, 3, 4, 4]})
data
|
k1
|
k2
|
0
|
one
|
1
|
1
|
two
|
1
|
2
|
one
|
2
|
3
|
two
|
3
|
4
|
one
|
3
|
5
|
two
|
4
|
6
|
two
|
4
|
# 判断数据是否重复
data.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
# 去除重复数据
data.drop_duplicates()
|
k1
|
k2
|
0
|
one
|
1
|
1
|
two
|
1
|
2
|
one
|
2
|
3
|
two
|
3
|
4
|
one
|
3
|
5
|
two
|
4
|
data['v1'] = range(7)
data
|
k1
|
k2
|
v1
|
0
|
one
|
1
|
0
|
1
|
two
|
1
|
1
|
2
|
one
|
2
|
2
|
3
|
two
|
3
|
3
|
4
|
one
|
3
|
4
|
5
|
two
|
4
|
5
|
6
|
two
|
4
|
6
|
# 去除指定列的重复数据
data.drop_duplicates(['k1'])
|
k1
|
k2
|
v1
|
0
|
one
|
1
|
0
|
1
|
two
|
1
|
1
|
data.drop_duplicates(['k1', 'k2'], keep='last')
|
k1
|
k2
|
v1
|
0
|
one
|
1
|
0
|
1
|
two
|
1
|
1
|
2
|
one
|
2
|
2
|
3
|
two
|
3
|
3
|
4
|
one
|
3
|
4
|
6
|
two
|
4
|
6
|
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
|
food
|
ounces
|
0
|
bacon
|
4.0
|
1
|
pulled pork
|
3.0
|
2
|
bacon
|
12.0
|
3
|
Pastrami
|
6.0
|
4
|
corned beef
|
7.5
|
5
|
Bacon
|
8.0
|
6
|
pastrami
|
3.0
|
7
|
honey ham
|
5.0
|
8
|
nova lox
|
6.0
|
# 添加一列,用于指定食物的来源
meat_to_animal = {'bacon': 'pig','pulled pork': 'pig','pastrami': 'cow','corned beef': 'cow','honey ham': 'pig','nova lox': 'salmon'
}
# 使用map()
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
data
|
food
|
ounces
|
animal
|
0
|
bacon
|
4.0
|
pig
|
1
|
pulled pork
|
3.0
|
pig
|
2
|
bacon
|
12.0
|
pig
|
3
|
Pastrami
|
6.0
|
cow
|
4
|
corned beef
|
7.5
|
cow
|
5
|
Bacon
|
8.0
|
pig
|
6
|
pastrami
|
3.0
|
cow
|
7
|
honey ham
|
5.0
|
pig
|
8
|
nova lox
|
6.0
|
salmon
|
# 使用方法
data['animal2'] = data['food'].map(lambda x: meat_to_animal[x.lower()])
data
|
food
|
ounces
|
animal
|
animal2
|
0
|
bacon
|
4.0
|
pig
|
pig
|
1
|
pulled pork
|
3.0
|
pig
|
pig
|
2
|
bacon
|
12.0
|
pig
|
pig
|
3
|
Pastrami
|
6.0
|
cow
|
cow
|
4
|
corned beef
|
7.5
|
cow
|
cow
|
5
|
Bacon
|
8.0
|
pig
|
pig
|
6
|
pastrami
|
3.0
|
cow
|
cow
|
7
|
honey ham
|
5.0
|
pig
|
pig
|
8
|
nova lox
|
6.0
|
salmon
|
salmon
|
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
import numpy as np# 将-999替换为空值
data.replace(-999, np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
# 将-999,-1000都替换为空值
data.replace([-999, -1000], np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
# 将-999,-1000分别替换为空值和0
data.replace([-999, -1000], [np.nan, 0])
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
data.replace({-999: np.nan, -1000: 0})
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
import pandas as pd
# 年龄数据
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]# 分箱的边界
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(type(cats))
<class 'pandas.core.arrays.categorical.Categorical'>
# Categorical对象
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# 获取分箱编码
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
# 返回分箱边界索引
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]closed='right',dtype='interval[int64]')
# 统计箱中元素的个数
pd.value_counts(cats)
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
# 带标签的分箱
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats = pd.cut(ages, bins, labels=group_names)
cats.get_values()
array(['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', 'Youth','MiddleAged', 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged','YoungAdult'], dtype=object)
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
df
|
key
|
data1
|
0
|
b
|
0
|
1
|
b
|
1
|
2
|
a
|
2
|
3
|
c
|
3
|
4
|
a
|
4
|
5
|
b
|
5
|
pd.get_dummies(df['key'])
|
a
|
b
|
c
|
0
|
0
|
1
|
0
|
1
|
0
|
1
|
0
|
2
|
1
|
0
|
0
|
3
|
0
|
0
|
1
|
4
|
1
|
0
|
0
|
5
|
0
|
1
|
0
|
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Wes NaN
dtype: object
data.str.contains('gmail')
Dave False
Steve True
Rob True
Wes NaN
dtype: object
data.str[:5]
Dave dave@
Steve steve
Rob rob@g
Wes NaN
dtype: object
split_df = data.str.split('@', expand=True)
split_df
|
0
|
1
|
Dave
|
dave
|
google.com
|
Steve
|
steve
|
gmail.com
|
Rob
|
rob
|
gmail.com
|
Wes
|
NaN
|
NaN
|
split_df[0].str.cat(split_df[1], sep='@')
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Wes NaN
Name: 0, dtype: object
lect02_codes02_pandas相关推荐
最新文章
- 安装 并且使用_房子装地暖还是墙暖暖气?使用过的人建议这样去选择!
- TP5.0 PHPExcel 数据表格导出导入(引)
- (Mark)操作系统原理
- Artech的MVC4框架学习——第三章controller的激活
- php+jquery实现图片上传预览_和拖动位置值,PHP教程:thinkphp jquery实现图片上传和预览效果...
- html5 手机拍视频滤镜,用canvas实现图片滤镜效果附演示_html5教程技巧
- 模拟器不全屏_puNES 适用于 Windows 和 Linux 的开源 NES 模拟器
- 使用 Caddy 替代 Nginx,全站升级 https,配置更加简单
- mybitsplus的idworker应java什么类型_MyBatisPlus学习整理(一)
- CentOS 7安装docker环境
- Linux之进程通信20160720
- springboot json 嵌套_Java Bean Validation 2.0 (一): 对Spring Boot应用的数据验证
- DataTable序列化为JSON字符串
- 信息类产品检测培训讲义-EN55022
- 新手零基础一小时学会小程序开发制作教程
- 串联型与并联型电压基准的区别
- 线性规划——产销平衡
- 移动端cpu天梯图2020_笔记本CPU排行天梯图2020
- 学生信息管理系统Java版(信息自动生成)
- 港科百创|【未磁科技】勇夺百万大奖,2021年度总决赛圆满收官!香港科大-越秀集团百万奖金国际创业大赛年度总决赛成功举办!...
热门文章
- [体感游戏]关于体感游戏的一些思考(二) --- POV和基本场景
- [Vue-Treeselect Warning] Unloaded branch node detected. “loadOptions“ prop is required to load its c
- #### mysql联合索引 注意事项 ####
- 爬虫(04)cookie+session+正则+字典快速生成2020-12-18
- CentOS下安装XAMPP详细教程
- Linux 如何检测硬盘坏道?
- mysql 查询半径范围内经纬度坐标
- 热门Java开发工具IDEA入门指南——IntelliJ IDEA概述(下)
- Ubuntu 安装 nvm
- 分布式与集群的区别是什么