Pandas 学习笔记一

文章目录

Pandas 学习
- 核心数据结构
- - DataFrame
  - - 属性
  - Panel
  - Series
- 基本数据操作
- 运算

Pandas 学习

核心数据结构

DataFrame

import numpy as np
data = np.random.normal(0,1,(10,5))
data

array([[ 1.21218494, -0.30678674, -0.66781485,  0.13414137,  0.79799153],[-1.35105539,  1.26378017,  0.04932303, -0.93626636, -0.1161889 ],[-0.01031587, -0.5311115 ,  0.17573352,  1.44961339,  0.9510395 ],[-0.38485007, -0.11271772,  0.86289105, -0.7046349 , -0.67357859],[-0.23342328, -1.15661425,  1.729237  , -1.1405832 , -0.06817234],[-1.12947171, -0.28125216, -0.71016417, -0.22660793, -1.88541989],[-0.02421679,  0.45015635, -0.57010218,  1.00306539,  0.49455672],[ 1.12410909, -0.59830918, -0.96323314,  1.2907916 ,  1.0353455 ],[-1.5828247 , -0.60952617,  1.10117806,  1.57818777,  1.69498839],[-0.02737215,  0.51650423, -0.12322063,  1.84409656, -1.20511615]])

import pandas as pd
pd.DataFrame(data)

	0	1	2	3	4
0	1.212185	-0.306787	-0.667815	0.134141	0.797992
1	-1.351055	1.263780	0.049323	-0.936266	-0.116189
2	-0.010316	-0.531111	0.175734	1.449613	0.951039
3	-0.384850	-0.112718	0.862891	-0.704635	-0.673579
4	-0.233423	-1.156614	1.729237	-1.140583	-0.068172
5	-1.129472	-0.281252	-0.710164	-0.226608	-1.885420
6	-0.024217	0.450156	-0.570102	1.003065	0.494557
7	1.124109	-0.598309	-0.963233	1.290792	1.035346
8	-1.582825	-0.609526	1.101178	1.578188	1.694988
9	-0.027372	0.516504	-0.123221	1.844097	-1.205116

# 添加行索引
num = ["num{}".format(i) for i in range(10)]
pd.DataFrame(data,index=num)

	0	1	2	3	4
num0	1.212185	-0.306787	-0.667815	0.134141	0.797992
num1	-1.351055	1.263780	0.049323	-0.936266	-0.116189
num2	-0.010316	-0.531111	0.175734	1.449613	0.951039
num3	-0.384850	-0.112718	0.862891	-0.704635	-0.673579
num4	-0.233423	-1.156614	1.729237	-1.140583	-0.068172
num5	-1.129472	-0.281252	-0.710164	-0.226608	-1.885420
num6	-0.024217	0.450156	-0.570102	1.003065	0.494557
num7	1.124109	-0.598309	-0.963233	1.290792	1.035346
num8	-1.582825	-0.609526	1.101178	1.578188	1.694988
num9	-0.027372	0.516504	-0.123221	1.844097	-1.205116

# 添加列索引
num2 = ["index{}".format(i) for i in range(5)]
data = pd.DataFrame(data,index=num,columns=num2)
data

	index0	index1	index2	index3	index4
num0	1.212185	-0.306787	-0.667815	0.134141	0.797992
num1	-1.351055	1.263780	0.049323	-0.936266	-0.116189
num2	-0.010316	-0.531111	0.175734	1.449613	0.951039
num3	-0.384850	-0.112718	0.862891	-0.704635	-0.673579
num4	-0.233423	-1.156614	1.729237	-1.140583	-0.068172
num5	-1.129472	-0.281252	-0.710164	-0.226608	-1.885420
num6	-0.024217	0.450156	-0.570102	1.003065	0.494557
num7	1.124109	-0.598309	-0.963233	1.290792	1.035346
num8	-1.582825	-0.609526	1.101178	1.578188	1.694988
num9	-0.027372	0.516504	-0.123221	1.844097	-1.205116

属性

data.shape

(10, 5)

data.index

Index(['num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8','num9'],dtype='object')

data.columns

Index(['index0', 'index1', 'index2', 'index3', 'index4'], dtype='object')

data.values

array([[ 1.21218494, -0.30678674, -0.66781485,  0.13414137,  0.79799153],[-1.35105539,  1.26378017,  0.04932303, -0.93626636, -0.1161889 ],[-0.01031587, -0.5311115 ,  0.17573352,  1.44961339,  0.9510395 ],[-0.38485007, -0.11271772,  0.86289105, -0.7046349 , -0.67357859],[-0.23342328, -1.15661425,  1.729237  , -1.1405832 , -0.06817234],[-1.12947171, -0.28125216, -0.71016417, -0.22660793, -1.88541989],[-0.02421679,  0.45015635, -0.57010218,  1.00306539,  0.49455672],[ 1.12410909, -0.59830918, -0.96323314,  1.2907916 ,  1.0353455 ],[-1.5828247 , -0.60952617,  1.10117806,  1.57818777,  1.69498839],[-0.02737215,  0.51650423, -0.12322063,  1.84409656, -1.20511615]])

data.T

	num0	num1	num2	num3	num4	num5	num6	num7	num8	num9
index0	1.212185	-1.351055	-0.010316	-0.384850	-0.233423	-1.129472	-0.024217	1.124109	-1.582825	-0.027372
index1	-0.306787	1.263780	-0.531111	-0.112718	-1.156614	-0.281252	0.450156	-0.598309	-0.609526	0.516504
index2	-0.667815	0.049323	0.175734	0.862891	1.729237	-0.710164	-0.570102	-0.963233	1.101178	-0.123221
index3	0.134141	-0.936266	1.449613	-0.704635	-1.140583	-0.226608	1.003065	1.290792	1.578188	1.844097
index4	0.797992	-0.116189	0.951039	-0.673579	-0.068172	-1.885420	0.494557	1.035346	1.694988	-1.205116

data.head()

	index0	index1	index2	index3	index4
num0	1.212185	-0.306787	-0.667815	0.134141	0.797992
num1	-1.351055	1.263780	0.049323	-0.936266	-0.116189
num2	-0.010316	-0.531111	0.175734	1.449613	0.951039
num3	-0.384850	-0.112718	0.862891	-0.704635	-0.673579
num4	-0.233423	-1.156614	1.729237	-1.140583	-0.068172

data.tail()

	index0	index1	index2	index3	index4
num5	-1.129472	-0.281252	-0.710164	-0.226608	-1.885420
num6	-0.024217	0.450156	-0.570102	1.003065	0.494557
num7	1.124109	-0.598309	-0.963233	1.290792	1.035346
num8	-1.582825	-0.609526	1.101178	1.578188	1.694988
num9	-0.027372	0.516504	-0.123221	1.844097	-1.205116

# 重设索引
data.reset_index()

	index	index0	index1	index2	index3	index4
0	num0	1.212185	-0.306787	-0.667815	0.134141	0.797992
1	num1	-1.351055	1.263780	0.049323	-0.936266	-0.116189
2	num2	-0.010316	-0.531111	0.175734	1.449613	0.951039
3	num3	-0.384850	-0.112718	0.862891	-0.704635	-0.673579
4	num4	-0.233423	-1.156614	1.729237	-1.140583	-0.068172
5	num5	-1.129472	-0.281252	-0.710164	-0.226608	-1.885420
6	num6	-0.024217	0.450156	-0.570102	1.003065	0.494557
7	num7	1.124109	-0.598309	-0.963233	1.290792	1.035346
8	num8	-1.582825	-0.609526	1.101178	1.578188	1.694988
9	num9	-0.027372	0.516504	-0.123221	1.844097	-1.205116

# 设置新索引
df = pd.DataFrame({'month': [1, 4, 7, 10],'year': [2012, 2014, 2013, 2014],'sale':[55, 40, 84, 31]})
df

	month	year	sale
0	1	2012	55
1	4	2014	40
2	7	2013	84
3	10	2014	31

# 以月份设置新的索引
df.set_index("month", drop=True)

	year	sale
month
1	2012	55
4	2014	40
7	2013	84
10	2014	31

# 设置多个索引，以年和月份
new_df = df.set_index(["year", "month"])
new_df.index

MultiIndex(levels=[[2012, 2013, 2014], [1, 4, 7, 10]],labels=[[0, 2, 1, 2], [0, 1, 2, 3]],names=['year', 'month'])

new_df.index.names # levels的名称

FrozenList(['year', 'month'])

new_df.index.levels # 每个level的元组值

FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])

Panel

# panel 三维数据，需要从不同维度访问
p = pd.Panel(np.arange(24).reshape(4,3,2),items=list('ABCD'),major_axis=pd.date_range('20130101', periods=3),minor_axis=['first', 'second'])
p

C:\Users\28599\AppData\Roaming\Python\Python37\site-packages\IPython\core\interactiveshell.py:3418: FutureWarning:
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.exec(code_obj, self.user_global_ns, self.user_ns)<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 3 (major_axis) x 2 (minor_axis)
Items axis: A to D
Major_axis axis: 2013-01-01 00:00:00 to 2013-01-03 00:00:00
Minor_axis axis: first to second

p["A"]

	first	second
2013-01-01	0	1
2013-01-02	2	3
2013-01-03	4	5

p.major_xs("2013-01-03")

	A	B	C	D
first	4	10	16	22
second	5	11	17	23

p.minor_xs("first")

	A	B	C	D
2013-01-01	0	6	12	18
2013-01-02	2	8	14	20
2013-01-03	4	10	16	22

Series

# Series 带索引的一维数组
sr = data.iloc[1,:]
sr

index0   -1.351055
index1    1.263780
index2    0.049323
index3   -0.936266
index4   -0.116189
Name: num1, dtype: float64

sr.index

Index(['index0', 'index1', 'index2', 'index3', 'index4'], dtype='object')

sr.values

array([-1.35105539,  1.26378017,  0.04932303, -0.93626636, -0.1161889 ])

pd.Series(np.arange(10))

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

pd.Series(np.arange(3, 9, 2), index=["a", "b", "c"])

a    3
b    5
c    7
dtype: int32

基本数据操作

data = pd.read_csv("./stock_day.csv")
data.head()

	open	high	close	low	volume	price_change	p_change	ma5	ma10	ma20	v_ma5	v_ma10	v_ma20	turnover
2018-02-27	23.53	25.88	24.16	23.53	95578.03	0.63	2.68	22.942	22.142	22.875	53782.64	46738.65	55576.11	2.39
2018-02-26	22.80	23.78	23.53	22.80	60985.11	0.69	3.02	22.406	21.955	22.942	40827.52	42736.34	56007.50	1.53
2018-02-23	22.88	23.37	22.82	22.71	52914.01	0.54	2.42	21.938	21.929	23.022	35119.58	41871.97	56372.85	1.32
2018-02-22	22.25	22.76	22.28	22.02	36105.01	0.36	1.64	21.446	21.909	23.137	35397.58	39904.78	60149.60	0.90
2018-02-14	21.49	21.99	21.92	21.48	23331.04	0.44	2.05	21.366	21.923	23.253	33590.21	42935.74	61716.11	0.58

data = data.drop(["ma5","ma10","ma20","v_ma5","v_ma10","v_ma20"], axis=1)
data.head()

	open	high	close	low	volume	price_change	p_change	turnover
2018-02-27	23.53	25.88	24.16	23.53	95578.03	0.63	2.68	2.39
2018-02-26	22.80	23.78	23.53	22.80	60985.11	0.69	3.02	1.53
2018-02-23	22.88	23.37	22.82	22.71	52914.01	0.54	2.42	1.32
2018-02-22	22.25	22.76	22.28	22.02	36105.01	0.36	1.64	0.90
2018-02-14	21.49	21.99	21.92	21.48	23331.04	0.44	2.05	0.58

# 先列后行
data["open"]["2018-02-26"]

22.8

# 按名字索引
data.loc["2018-02-26"]["open"]
# 同 data.loc["2018-02-26","open"]

22.8

# 按数字索引
data.iloc[1,0]

22.8

# 组合索引
# 获取行第1天到第4天，['open', 'close', 'high', 'low']这个四个指标的结果
# data.ix[:4, ['open', 'close', 'high', 'low']] # 将要废弃，不推荐
data.loc[data.index[0:4], ['open', 'close', 'high', 'low']]
# 同 data.iloc[0:4,data.columns.get_indexer(['open', 'close', 'high', 'low'])]

	open	close	high	low
2018-02-27	23.53	24.16	25.88	23.53
2018-02-26	22.80	23.53	23.78	22.80
2018-02-23	22.88	22.82	23.37	22.71
2018-02-22	22.25	22.28	22.76	22.02

# 赋值操作 (先索引再赋值即可)
data.open=100
data.head()

	open	high	close	low	volume	price_change	p_change	turnover
2018-02-27	100	25.88	24.16	23.53	95578.03	0.63	2.68	2.39
2018-02-26	100	23.78	23.53	22.80	60985.11	0.69	3.02	1.53
2018-02-23	100	23.37	22.82	22.71	52914.01	0.54	2.42	1.32
2018-02-22	100	22.76	22.28	22.02	36105.01	0.36	1.64	0.90
2018-02-14	100	21.99	21.92	21.48	23331.04	0.44	2.05	0.58

# 对内容排序
# ascending=True表示升序 high相同就看p_change
data.sort_values(by=['high','p_change'],ascending=True).head()

	open	high	close	low	volume	price_change	p_change	turnover
2015-03-02	100	12.67	12.52	12.20	96291.73	0.32	2.62	3.30
2015-03-04	100	12.92	12.90	12.61	67075.44	0.20	1.57	2.30
2015-03-03	100	13.06	12.70	12.52	139071.61	0.18	1.44	4.76
2015-09-07	100	13.38	12.77	12.63	52490.04	0.37	2.98	1.80
2015-03-05	100	13.45	13.16	12.87	93180.39	0.26	2.02	3.19

data.sort_index().head()

	open	high	close	low	volume	price_change	p_change	turnover
2015-03-02	100	12.67	12.52	12.20	96291.73	0.32	2.62	3.30
2015-03-03	100	13.06	12.70	12.52	139071.61	0.18	1.44	4.76
2015-03-04	100	12.92	12.90	12.61	67075.44	0.20	1.57	2.30
2015-03-05	100	13.45	13.16	12.87	93180.39	0.26	2.02	3.19
2015-03-06	100	14.48	14.28	13.13	179831.72	1.12	8.51	6.16

运算

# 算术运算
data["open"].sub(3).head()
# data.open + 3

2018-02-27    97
2018-02-26    97
2018-02-23    97
2018-02-22    97
2018-02-14    97
Name: open, dtype: int64

# 逻辑运算
data[data['p_change'] > 2].head()

	open	high	close	low	volume	price_change	p_change	turnover
2018-02-27	100	25.88	24.16	23.53	95578.03	0.63	2.68	2.39
2018-02-26	100	23.78	23.53	22.80	60985.11	0.69	3.02	1.53
2018-02-23	100	23.37	22.82	22.71	52914.01	0.54	2.42	1.32
2018-02-14	100	21.99	21.92	21.48	23331.04	0.44	2.05	0.58
2018-02-12	100	21.40	21.19	20.63	32445.39	0.82	4.03	0.81

data[(data["p_change"] > 2) & (data["low"] > 15)].head()

	open	high	close	low	volume	price_change	p_change	turnover
2018-02-27	100	25.88	24.16	23.53	95578.03	0.63	2.68	2.39
2018-02-26	100	23.78	23.53	22.80	60985.11	0.69	3.02	1.53
2018-02-23	100	23.37	22.82	22.71	52914.01	0.54	2.42	1.32
2018-02-14	100	21.99	21.92	21.48	23331.04	0.44	2.05	0.58
2018-02-12	100	21.40	21.19	20.63	32445.39	0.82	4.03	0.81

# 逻辑运算函数
data.query("p_change > 2 & low > 15").head()

	open	high	close	low	volume	price_change	p_change	turnover
2018-02-27	100	25.88	24.16	23.53	95578.03	0.63	2.68	2.39
2018-02-26	100	23.78	23.53	22.80	60985.11	0.69	3.02	1.53
2018-02-23	100	23.37	22.82	22.71	52914.01	0.54	2.42	1.32
2018-02-14	100	21.99	21.92	21.48	23331.04	0.44	2.05	0.58
2018-02-12	100	21.40	21.19	20.63	32445.39	0.82	4.03	0.81

# 判断'turnover'是否为4.19, 2.39
data[data["turnover"].isin([4.19, 2.39])]

	open	high	close	low	volume	price_change	p_change	turnover
2018-02-27	100	25.88	24.16	23.53	95578.03	0.63	2.68	2.39
2017-07-25	100	24.20	23.70	22.64	167489.48	0.67	2.91	4.19
2016-09-28	100	20.98	20.86	19.71	95580.75	0.98	4.93	2.39
2015-04-07	100	17.98	17.54	16.50	122471.85	0.88	5.28	4.19

# 统计运算
data.describe()

	open	high	close	low	volume	price_change	p_change	turnover
count	643.0	643.000000	643.000000	643.000000	643.000000	643.000000	643.000000	643.000000
mean	100.0	21.900513	21.336267	20.771835	99905.519114	0.018802	0.190280	2.936190
std	0.0	4.077578	3.942806	3.791968	73879.119354	0.898476	4.079698	2.079375
min	100.0	12.670000	12.360000	12.200000	1158.120000	-3.520000	-10.030000	0.040000
25%	100.0	19.500000	19.045000	18.525000	48533.210000	-0.390000	-1.850000	1.360000
50%	100.0	21.970000	21.450000	20.980000	83175.930000	0.050000	0.260000	2.500000
75%	100.0	24.065000	23.415000	22.850000	127580.055000	0.455000	2.305000	3.915000
max	100.0	36.350000	35.210000	34.010000	501915.410000	3.030000	10.030000	12.560000

# 累计统计函数 (类似前缀和)
data["p_change"].cumsum().head()

2018-02-27     2.68
2018-02-26     5.70
2018-02-23     8.12
2018-02-22     9.76
2018-02-14    11.81
Name: p_change, dtype: float64

data.max(axis=0)

open               100.00
high                36.35
close               35.21
low                 34.01
volume          501915.41
price_change         3.03
p_change            10.03
turnover            12.56
dtype: float64

data.idxmax(axis=0)

open            2018-02-27
high            2015-06-10
close           2015-06-12
low             2015-06-12
volume          2017-10-26
price_change    2015-06-09
p_change        2015-08-28
turnover        2017-10-26
dtype: object

# 累计统计函数
data["p_change"].sort_index().cumsum().plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1959a1c6b38>

# 自定义计算
data.apply(lambda x: x.max() - x.min()) # 每一列最大值减最小值

open                 0.00
high                23.68
close               22.85
low                 21.81
volume          500757.29
price_change         6.55
p_change            20.06
turnover            12.52
dtype: float64

data["volume"].max() - data["volume"].min() # volume列最大值减最小值

500757.29

data.plot(x="volume", y="turnover", kind="scatter")

<matplotlib.axes._subplots.AxesSubplot at 0x1959a228438>

data.plot(x="high", y="low", kind="scatter")

<matplotlib.axes._subplots.AxesSubplot at 0x19597e1e3c8>

Pandas 学习笔记一相关推荐

pandas学习笔记：pandas.Dataframe.rename()函数用法
pandas学习笔记:pandas.Dataframe.rename()函数用法 pandas.Dataframe.rename()函数主要是用来修改Dataframe数据的行名和列名. 主要用到的参 ...
Pandas学习笔记(一)
Pandas学习笔记一 Pandas数组读取读取csv.tsv.txt文件读取excel文件读取mysql数据表 Pandas数据结构创建Series的几种方法根据标签查询Series数据 ...
pandas学习笔记之DateFrame
pandas学习笔记之DateFrame 文章目录 pandas学习笔记之DateFrame 1.DateFrame的创建 1)认识DataFrame对象 2)由二维列表创建(默认index和colu ...
数据分析之pandas学习笔记（六）（层次化索引、重塑、轴向旋转、行列变换、合并表数据）
数据分析之Pandas学习笔记(六)(层次化索引.重塑.轴向旋转.行列变换.合并表数据) level层次化索引 unstack()与stack()进行重塑,即:行列索引变换 swaplevel()交换 ...
[Pandas 学习笔记] - No.1 pandas学习笔记
pandas学习笔记 pandas是基于numpy开发出的数据分析包,用于高效地操作大型数据集.pandas的数据结构有三种分别为 series,dataframe和panel,对应一维,二维,三维 ...
pandas学习笔记之Series
pandas学习笔记之Series 文章目录 pandas学习笔记之Series pandas中Series的创建 1)用python中的列表list创建: 2)用numpy数组创建 3)用pytho ...
pandas学习笔记（三）：数据的变换与数据的管理
注:学习笔记基于文彤老师的pandas的系列课程课程链接:https://study.163.com/course/courseMain.htm?courseId=1005124008&sh ...
pandas 每一列相加_Python3 numpy amp; pandas 学习笔记
写在前面在用python做一些ML和DL的工作前,先看莫烦大佬的视频学习一下numpy和pandas这两个数据处理包,学习中记了一些笔记,便于自己日后查阅,同时发布到知乎希望能够帮助到其他小伙伴! ...
Python科学计算-Numpy和Pandas学习笔记(一)-安装、Numpy属性和常用方法(chaochao`s Blog)
1 前言由于项目的原因,最近开始学习机器学习的相关内容,在写文字笔记的时候发现我写的字确实很丑.很难看,并且记笔记的速度也很慢,由此萌生了编写自己博客的想法.从今天开始,我会将学习的笔记以博客的方式 ...

Pandas 学习笔记一

文章目录

Pandas 学习

核心数据结构

DataFrame

属性

Panel

Series

基本数据操作

运算

Pandas 学习笔记一相关推荐

最新文章

热门文章