包含序列Series和数据框DataFrame两种数据结构

Series类似numpy的一维数组

一维数组创建

字典创建

由DataFrame的某一行或某一列创建

DataFrame类似numpy的二维数组

二维数组创建

字典创建

数据框创建

index属性获取序列的索引

import pandas as  pd
s=pd.Series([1,2,3,4,5])

s.index#看索引

RangeIndex(start=0, stop=5, step=1)

s.index=['a','b','c','d','e']#索引赋值

s.iloc[0]#first element by position; the index now holds string labels, so a bare integer key would rely on the deprecated positional fallback

s['a']

s[:3]#前三个，位置获取

a    1
b    2
c    3
dtype: int64

s['c':]#索引获取

c    3
d    4
e    5
dtype: int64

# 自动化对齐

Pandas查询数据

# Demo frame: columns 'a', 'b' and 'c' each hold the integers 1..7.
data=pd.DataFrame({label:list(range(1,8)) for label in ('a','b','c')})

data

	a	b	c
0	1	1	1
1	2	2	2
2	3	3	3
3	4	4	4
4	5	5	5
5	6	6	6
6	7	7	7

#查看前五行，末尾五行
data.head()
data.tail()
#查看指定的行，两层括号
data.iloc[[0,1]]

	a	b	c
0	1	1	1
1	2	2	2

#查看指定的列,两层括号
data[['a','b']]

	a	b
0	1	1
1	2	2
2	3	3
3	4	4
4	5	5
5	6	6
6	7	7

#同时查看行与列
data.loc[[0,1],['a','b']]

	a	b
0	1	1
1	2	2

#具体值的多个条件查询
data[(data['a']>1)&(data['b']>1)][['a','b']]

	a	b
1	2	2
2	3	3
3	4	4
4	5	5
5	6	6
6	7	7

统计分析

import numpy as np
a=np.random.normal(size=15)#15 draws from the standard normal distribution
da=pd.Series(2*a+3)#rescaled sample: mean 3, standard deviation 2
db=np.random.f(2,4,size=15)#15 draws from an F distribution with (2, 4) degrees of freedom
dc=np.random.randint(1,100,size=15)#15 random integers in [1, 100)
print(da)
print(db)
print(dc)

0     1.470286
1     0.533744
2     2.600050
3    -3.209468
4     2.574417
5     0.155033
6     7.886463
7     0.723382
8     3.801481
9     3.328130
10    1.369691
11    5.334218
12    1.345225
13    2.156444
14    4.545433
dtype: float64
[1.33910328 1.49237007 1.34909029 3.22248552 0.59518615 0.18117865
 6.8445107  0.37093249 0.52197335 0.12504141 0.25438051 0.17209044
 2.19681018 0.42410786 0.34586842]
[39 28 38 90 29 61 83 61  7 92 59 87 18 91 24]

da.count()#非空元素计算

da.min()#最小值

-3.2094683534682567

da.idxmin()#最小值的位置

da.quantile(1/3)#分位数

1.3615359728014402

da.sum()#求和

34.614527439491056

da.mean()#均值

2.307635162632737

da.median()#中位数

2.156444001142751

da.mode()#众数

0    -3.209468
1     0.155033
2     0.533744
3     0.723382
4     1.345225
5     1.369691
6     1.470286
7     2.156444
8     2.574417
9     2.600050
10    3.328130
11    3.801481
12    4.545433
13    5.334218
14    7.886463
dtype: float64

da.var()#方差

6.569970412166845

da.std()#标准差

2.563195351932202

(da-da.mean()).abs().mean()#mean absolute deviation around the mean; Series.mad() was removed in pandas 2.0

1.8555660211821798

da.skew()#偏度

0.13327016461191873

da.kurt()#峰度

1.4497970301567644

da.describe()#一次性描绘多个统计指标

count    15.000000
mean      2.307635
std       2.563195
min      -3.209468
25%       1.034304
50%       2.156444
75%       3.564806
max       7.886463
dtype: float64

df=pd.DataFrame(np.array([da,db,dc]).T,columns=['x1','x2','x3'])#构造数据框

print(df.head())

         x1        x2    x3
0  1.470286  1.339103  39.0
1  0.533744  1.492370  28.0
2  2.600050  1.349090  38.0
3 -3.209468  3.222486  90.0
4  2.574417  0.595186  29.0

#Collect a set of descriptive statistics for one column into a single Series
def state(x):
    """Return twelve summary statistics of the numeric Series ``x``.

    The result is a Series labelled count, min, idxmin, median, mean, max,
    idxmax, mad, var, std, skew and kurt, so ``DataFrame.apply(state)``
    produces one summary column per input column.
    """
    # Series.mad() no longer exists in pandas 2.x, so the mean absolute
    # deviation around the mean is computed explicitly.
    mean_abs_dev=(x-x.mean()).abs().mean()
    values=[
        x.count(),
        x.min(),
        x.idxmin(),
        x.median(),
        x.mean(),
        x.max(),
        x.idxmax(),
        mean_abs_dev,
        x.var(),
        x.std(),
        x.skew(),
        x.kurt(),
    ]
    labels=['count','min','idxmin','median','mean','max','idxmax','mad','var','std','skew','kurt']
    return pd.Series(values,index=labels)

df.apply(state)#函数应用

	x1	x2	x3
count	15.000000	15.000000	15.000000
min	-3.209468	0.125041	7.000000
idxmin	3.000000	9.000000	8.000000
median	2.156444	0.521973	59.000000
mean	2.307635	1.295675	53.800000
max	7.886463	6.844511	92.000000
idxmax	6.000000	6.000000	9.000000
mad	1.855566	1.156042	25.813333
var	6.569970	3.137363	882.028571
std	2.563195	1.771260	29.698966
skew	0.133270	2.522569	0.008105
kurt	1.449797	7.075194	-1.527611

df.describe()#数值型数据描述，离散型也可同样方法描述

	x1	x2	x3
count	15.000000	15.000000	15.000000
mean	2.307635	1.295675	53.800000
std	2.563195	1.771260	29.698966
min	-3.209468	0.125041	7.000000
25%	1.034304	0.300124	28.500000
50%	2.156444	0.521973	59.000000
75%	3.564806	1.420730	85.000000
max	7.886463	6.844511	92.000000

#Pairwise correlation matrix; method can be 'pearson' (the default), 'kendall' or 'spearman'
df.corr()

	x1	x2	x3
x1	1.000000	0.251691	0.009402
x2	0.251691	1.000000	0.190504
x3	0.009402	0.190504	1.000000

#数值型变量之间的协方差矩阵
df.cov()

	x1	x2	x3
x1	6.569970	1.142701	0.715703
x2	1.142701	3.137363	10.021379
x3	0.715703	10.021379	882.028571

Pandas实现Sql操作

对数据的增删改查

# Two small student tables with identical columns; they are stacked with
# pd.concat below. The column data is passed straight to DataFrame so the
# builtin name `dict` is not overwritten.
student1=pd.DataFrame({
    'name':['liu an','shu  li'],
    'sex':['m','f'],
    'age':[23,55],
    'height':[168,178],
    'weight':[62,67],
})
student2=pd.DataFrame({
    'name':['liu','shu'],
    'sex':['m','f'],
    'age':[23,55],
    'height':[168,178],
    'weight':[62,67],
})

student1

	age	height	name	sex	weight
0	23	168	liu an	m	62
1	55	178	shu li	f	67

student2

	age	height	name	sex	weight
0	23	168	liu	m	62
1	55	178	shu	f	67

#Stack the two tables vertically. ignore_index must be the boolean True (not
#the string 'True'): it discards the original row labels and renumbers 0..n-1.
student3=pd.concat([student1,student2],ignore_index=True)
student3

	age	height	name	sex	weight
0	23	168	liu an	m	62
1	55	178	shu li	f	67
2	23	168	liu	m	62
3	55	178	shu	f	67

student3['score']=[0,0,0,0]
student3

	age	height	name	sex	weight	score
0	23	168	liu an	m	62	0
1	55	178	shu li	f	67	0
2	23	168	liu	m	62	0
3	55	178	shu	f	67	0

#删除整个dataframe
del student2
student2

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-90-9281b2e2ebb6> in <module>()
      1 #删除整个dataframe
      2 del student2
----> 3 student2

NameError: name 'student2' is not defined

#删除指定的行
student3.drop([0])

	age	height	name	sex	weight
1	55	178	shu li	f	67
2	23	168	liu	m	62
3	55	178	shu	f	67

#Boolean mask: this SELECTS the rows with age < 25, it does not delete them.
#To drop those rows instead, keep the complement: student3[student3['age']>=25]
student3[student3['age']<25]

	age	height	name	sex	weight	score
0	23	168	liu an	m	62	0
2	23	168	liu	m	62	0

#删除指定的列
student3.drop(['height','weight'],axis=1)
#axis=0,删除行；axis=1，删除列

	age	name	sex
0	23	liu an	m
1	55	shu li	f
2	23	liu	m
3	55	shu	f

更改表中的数据

student3

	age	height	name	sex	weight
0	23	168	liu an	m	62
1	55	178	shu li	f	67
2	23	168	liu	m	62
3	55	178	shu	f	67

# 修改liu的身高为176
student3.loc[student3['name']=='liu','height']=176
student3

	age	height	name	sex	weight
0	23	168	liu an	m	62
1	55	178	shu li	f	67
2	23	176	liu	m	62
3	55	178	shu	f	67

#查数据

#Aggregation with groupby()
#Average of the numeric columns per (sex, age) group; numeric_only=True skips
#the string column 'name', which cannot be averaged.
student3.groupby(['sex','age']).mean(numeric_only=True)

		height	weight	score
sex	age
f	55	178	67	0
m	23	172	62	0

#Several statistics per group: drop 'age' and the non-numeric 'name' column,
#then compute the mean and median of every remaining column for each sex.
student3.drop(['age','name'],axis=1).groupby('sex').agg(['mean','median'])

	height		weight		score
	mean	median	mean	median	mean	median
sex
f	178	178	67	67	0	0
m	172	172	62	62	0	0

#Sorting
series=pd.Series(np.array(np.random.randint(1,20,10)))#10 random integers drawn from [1, 20); the upper bound 20 is excluded

series.sort_values()#默认升序排列

7     3
0     5
5     5
6     7
9     7
2     8
4    12
1    14
3    16
8    19
dtype: int32

series.sort_values(ascending=False)#降序

8    19
3    16
1    14
4    12
2     8
9     7
6     7
5     5
0     5
7     3
dtype: int32

#按照特定值排列
student3.sort_values(by=['sex','age'])

	age	height	name	sex	weight
1	55	178	shu li	f	67
3	55	178	shu	f	67
0	23	168	liu an	m	62
2	23	176	liu	m	62

#多表连接-merge

# Rebuild the two student tables used by the merge example below. The column
# data is passed straight to DataFrame so the builtin name `dict` is not
# overwritten.
student1=pd.DataFrame({
    'name':['liu an','shu  li'],
    'sex':['m','f'],
    'age':[23,55],
    'height':[168,178],
    'weight':[62,67],
})
student2=pd.DataFrame({
    'name':['liu','shu'],
    'sex':['m','f'],
    'age':[23,55],
    'height':[168,178],
    'weight':[62,67],
})

student1

	age	height	name	sex	weight
0	23	168	liu an	m	62
1	55	178	shu li	f	67

student2

	age	height	name	sex	weight
0	23	168	liu	m	62
1	55	178	shu	f	67

#连接两个表
a=pd.merge(student1,student2,on='age')

	age	height_x	name_x	sex_x	weight_x	height_y	name_y	sex_y	weight_y
0	23	168	liu an	m	62	168	liu	m	62
1	55	178	shu li	f	67	178	shu	f	67

#how参数指定连接方式：left为左连接，right为右连接，outer为外连接（默认inner为内连接）

缺失值处理

删除：dropna

替补法：fillna

连续型变量近似正态分布时，用均值替代缺失值
连续型变量呈偏态分布时，用中位数替代缺失值

离散型变量，众数替代缺失值

删除法

当数据中某个变量大部分缺失时，可以删除该变量；
当缺失值随机分布，且数量较少，可以删除缺失的行

pd.isnull(s).sum()#number of missing values in s (pandas has no isnum function)

#直接删除缺失值,默认删除含有缺失值的所有行
s.dropna()

# 4x4 demo frame for the missing-value examples: NaN counts grow from x1 to x4
# and the last row is entirely NaN.
df=pd.DataFrame({
    'x1':[1,2,3,np.nan],
    'x2':[2,3,4,np.nan],
    'x3':[3,np.nan,np.nan,np.nan],
    'x4':[np.nan]*4,
})

df

	x1	x2	x3	x4
0	1.0	2.0	3.0	NaN
1	2.0	3.0	NaN	NaN
2	3.0	4.0	NaN	NaN
3	NaN	NaN	NaN	NaN

df.dropna()

	x1	x2	x3	x4

df.dropna(how='any')#drop every row that contains at least one NaN

	x1	x2	x3	x4

df.dropna(how='all')#删除行全为缺失值的观测

	x1	x2	x3	x4
0	1.0	2.0	3.0	NaN
1	2.0	3.0	NaN	NaN
2	3.0	4.0	NaN	NaN

df.dropna(how='all',axis=1)#drop every column whose values are all NaN

	x1	x2	x3
0	1.0	2.0	3.0
1	2.0	3.0	NaN
2	3.0	4.0	NaN
3	NaN	NaN	NaN

#  利用thresh，保留一些为nan的值

df.dropna(thresh=3)#保留至少三个非nan的项

	x1	x2	x3	x4
0	1.0	2.0	3.0	NaN

df.dropna(thresh=3,axis=1)#保留列至少三个非nan的项

	x1	x2
0	1.0	2.0
1	2.0	3.0
2	3.0	4.0
3	NaN	NaN

替补法

fillna

value:值
method:插值方式
axis：轴向，默认axis=0
inplace:修改不产生副本
limit:连续填充的最大数量

df

	x1	x2	x3	x4
0	1.0	2.0	3.0	NaN
1	2.0	3.0	NaN	NaN
2	3.0	4.0	NaN	NaN
3	NaN	NaN	NaN	NaN

df.fillna(0)#用0填充

	x1	x2	x3	x4
0	1.0	2.0	3.0	0.0
1	2.0	3.0	0.0	0.0
2	3.0	4.0	0.0	0.0
3	0.0	0.0	0.0	0.0

#Forward fill: each NaN takes the last valid value above it in the same column
#(fillna(method='ffill') is deprecated in favour of ffill())
df.ffill()

	x1	x2	x3	x4
0	1.0	2.0	3.0	NaN
1	2.0	3.0	3.0	NaN
2	3.0	4.0	3.0	NaN
3	3.0	4.0	3.0	NaN

#Backward fill: each NaN takes the next valid value below it in the same column;
#trailing NaNs have nothing below them and stay missing
#(fillna(method='bfill') is deprecated in favour of bfill())
df.bfill()

	x1	x2	x3	x4
0	1.0	2.0	3.0	NaN
1	2.0	3.0	NaN	NaN
2	3.0	4.0	NaN	NaN
3	NaN	NaN	NaN	NaN

#Median of each column (NaN values are skipped); these are the fill values used below
x1_m=df['x1'].median()
x2_m=df['x2'].median()
x3_m=df['x3'].median()
x4_m=df['x4'].median()

G:\anaconda\envs\tensorflow\lib\site-packages\numpy\lib\nanfunctions.py:1113: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)

#Fill the NaNs of each column with that column's own median, via a column -> value mapping
df.fillna({'x1':x1_m,'x2':x2_m,'x3':x3_m,'x4':x4_m})

	x1	x2	x3	x4
0	1.0	2.0	3.0	NaN
1	2.0	3.0	3.0	NaN
2	3.0	4.0	3.0	NaN
3	2.0	3.0	3.0	NaN

多层索引与透视表参考官方文档

Pandas相关笔记整理相关推荐

Spring相关笔记整理
Spring Boot Spring Boot @SpringApplicationConfiguration 不能导入 Unable to establish loopback connection ...
深度学习总结——CS231n课程深度学习（机器视觉相关）笔记整理
深度学习笔记整理说明基本知识点一:模型的设置(基本) 1. 激活函数的设置 2. 损失函数的设置 (1) 分类问题 (2) 属性问题 (3) 回归问题 3. 正则化方式的设置 (1) 损失函数添加 ...
Google机器学习速成课程 - 视频笔记整理汇总 - 基础篇核心部分
Google机器学习速成课程 - 视频笔记整理 - 基础篇核心部分课程网址: https://developers.google.com/machine-learning/crash-course/ ...
Python-pandas 笔记整理
个人笔记整理,仅供自学使用 Reference: https://vitu.ai/course/65599145397748608 https://vitu.ai/course/65598890065 ...
epoll相关资料整理
http://www.cppblog.com/converse/archive/2008/10/13/63928.html epoll相关资料整理学习epoll有一段时间了,最近终于有一个服务器采用 ...
《繁凡的深度学习笔记》前言、目录大纲一文让你完全弄懂深度学习所有基础（DL笔记整理系列）
<繁凡的深度学习笔记>前言.目录大纲 (DL笔记整理系列) 一文弄懂深度学习所有基础 ! 3043331995@qq.com https://fanfansann.blog.csdn.ne ...
一文让你完全弄懂逻辑回归和分类问题实战《繁凡的深度学习笔记》第 3 章分类问题与信息论基础（上）（DL笔记整理系列）
好吧,只好拆分为上下两篇发布了>_< 终于肝出来了,今天就是除夕夜了,祝大家新快乐!^q^ <繁凡的深度学习笔记>第 3 章分类问题与信息论基础 (上)(逻辑回归.Softm ...
一文让你完全弄懂回归问题、激活函数、梯度下降和神经元模型实战《繁凡的深度学习笔记》第 2 章回归问题与神经元模型（DL笔记整理系列）
<繁凡的深度学习笔记>第 2 章回归问题与神经元模型(DL笔记整理系列) 3043331995@qq.com https://fanfansann.blog.csdn.net/ http ...
机器学习总结——机器学习课程笔记整理
机器学习笔记整理说明基础点整理 1. 基础数学知识 (1) 一些零七八碎的基础知识 (2) 最优化相关问题 (3) 概率论相关问题 (4) 矩阵相关问题 2. 回归(线性回归.Logistic回归 ...

Pandas相关笔记整理