文章目录

  • Pandas 学习
    • 核心数据结构
      • DataFrame
        • 属性
      • Panel
      • Series
    • 基本数据操作
    • 运算

Pandas 学习

核心数据结构

DataFrame
import numpy as np
data = np.random.normal(0,1,(10,5))
data
array([[ 1.21218494, -0.30678674, -0.66781485,  0.13414137,  0.79799153],[-1.35105539,  1.26378017,  0.04932303, -0.93626636, -0.1161889 ],[-0.01031587, -0.5311115 ,  0.17573352,  1.44961339,  0.9510395 ],[-0.38485007, -0.11271772,  0.86289105, -0.7046349 , -0.67357859],[-0.23342328, -1.15661425,  1.729237  , -1.1405832 , -0.06817234],[-1.12947171, -0.28125216, -0.71016417, -0.22660793, -1.88541989],[-0.02421679,  0.45015635, -0.57010218,  1.00306539,  0.49455672],[ 1.12410909, -0.59830918, -0.96323314,  1.2907916 ,  1.0353455 ],[-1.5828247 , -0.60952617,  1.10117806,  1.57818777,  1.69498839],[-0.02737215,  0.51650423, -0.12322063,  1.84409656, -1.20511615]])
import pandas as pd
pd.DataFrame(data)
0 1 2 3 4
0 1.212185 -0.306787 -0.667815 0.134141 0.797992
1 -1.351055 1.263780 0.049323 -0.936266 -0.116189
2 -0.010316 -0.531111 0.175734 1.449613 0.951039
3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579
4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172
5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420
6 -0.024217 0.450156 -0.570102 1.003065 0.494557
7 1.124109 -0.598309 -0.963233 1.290792 1.035346
8 -1.582825 -0.609526 1.101178 1.578188 1.694988
9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
# 添加行索引
num = ["num{}".format(i) for i in range(10)]
pd.DataFrame(data,index=num)
0 1 2 3 4
num0 1.212185 -0.306787 -0.667815 0.134141 0.797992
num1 -1.351055 1.263780 0.049323 -0.936266 -0.116189
num2 -0.010316 -0.531111 0.175734 1.449613 0.951039
num3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579
num4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172
num5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420
num6 -0.024217 0.450156 -0.570102 1.003065 0.494557
num7 1.124109 -0.598309 -0.963233 1.290792 1.035346
num8 -1.582825 -0.609526 1.101178 1.578188 1.694988
num9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
# 添加列索引
num2 = ["index{}".format(i) for i in range(5)]
data = pd.DataFrame(data,index=num,columns=num2)
data
index0 index1 index2 index3 index4
num0 1.212185 -0.306787 -0.667815 0.134141 0.797992
num1 -1.351055 1.263780 0.049323 -0.936266 -0.116189
num2 -0.010316 -0.531111 0.175734 1.449613 0.951039
num3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579
num4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172
num5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420
num6 -0.024217 0.450156 -0.570102 1.003065 0.494557
num7 1.124109 -0.598309 -0.963233 1.290792 1.035346
num8 -1.582825 -0.609526 1.101178 1.578188 1.694988
num9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
属性
data.shape
(10, 5)
data.index
Index(['num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9'], dtype='object')
data.columns
Index(['index0', 'index1', 'index2', 'index3', 'index4'], dtype='object')
data.values
array([[ 1.21218494, -0.30678674, -0.66781485,  0.13414137,  0.79799153],[-1.35105539,  1.26378017,  0.04932303, -0.93626636, -0.1161889 ],[-0.01031587, -0.5311115 ,  0.17573352,  1.44961339,  0.9510395 ],[-0.38485007, -0.11271772,  0.86289105, -0.7046349 , -0.67357859],[-0.23342328, -1.15661425,  1.729237  , -1.1405832 , -0.06817234],[-1.12947171, -0.28125216, -0.71016417, -0.22660793, -1.88541989],[-0.02421679,  0.45015635, -0.57010218,  1.00306539,  0.49455672],[ 1.12410909, -0.59830918, -0.96323314,  1.2907916 ,  1.0353455 ],[-1.5828247 , -0.60952617,  1.10117806,  1.57818777,  1.69498839],[-0.02737215,  0.51650423, -0.12322063,  1.84409656, -1.20511615]])
data.T
num0 num1 num2 num3 num4 num5 num6 num7 num8 num9
index0 1.212185 -1.351055 -0.010316 -0.384850 -0.233423 -1.129472 -0.024217 1.124109 -1.582825 -0.027372
index1 -0.306787 1.263780 -0.531111 -0.112718 -1.156614 -0.281252 0.450156 -0.598309 -0.609526 0.516504
index2 -0.667815 0.049323 0.175734 0.862891 1.729237 -0.710164 -0.570102 -0.963233 1.101178 -0.123221
index3 0.134141 -0.936266 1.449613 -0.704635 -1.140583 -0.226608 1.003065 1.290792 1.578188 1.844097
index4 0.797992 -0.116189 0.951039 -0.673579 -0.068172 -1.885420 0.494557 1.035346 1.694988 -1.205116
data.head()
index0 index1 index2 index3 index4
num0 1.212185 -0.306787 -0.667815 0.134141 0.797992
num1 -1.351055 1.263780 0.049323 -0.936266 -0.116189
num2 -0.010316 -0.531111 0.175734 1.449613 0.951039
num3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579
num4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172
data.tail()
index0 index1 index2 index3 index4
num5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420
num6 -0.024217 0.450156 -0.570102 1.003065 0.494557
num7 1.124109 -0.598309 -0.963233 1.290792 1.035346
num8 -1.582825 -0.609526 1.101178 1.578188 1.694988
num9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
# 重设索引
data.reset_index()
index index0 index1 index2 index3 index4
0 num0 1.212185 -0.306787 -0.667815 0.134141 0.797992
1 num1 -1.351055 1.263780 0.049323 -0.936266 -0.116189
2 num2 -0.010316 -0.531111 0.175734 1.449613 0.951039
3 num3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579
4 num4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172
5 num5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420
6 num6 -0.024217 0.450156 -0.570102 1.003065 0.494557
7 num7 1.124109 -0.598309 -0.963233 1.290792 1.035346
8 num8 -1.582825 -0.609526 1.101178 1.578188 1.694988
9 num9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
# 设置新索引
df = pd.DataFrame({'month': [1, 4, 7, 10],'year': [2012, 2014, 2013, 2014],'sale':[55, 40, 84, 31]})
df
month year sale
0 1 2012 55
1 4 2014 40
2 7 2013 84
3 10 2014 31
# 以月份设置新的索引
df.set_index("month", drop=True)
year sale
month
1 2012 55
4 2014 40
7 2013 84
10 2014 31
# 设置多个索引,以年和月份
new_df = df.set_index(["year", "month"])
new_df.index
MultiIndex(levels=[[2012, 2013, 2014], [1, 4, 7, 10]],
           labels=[[0, 2, 1, 2], [0, 1, 2, 3]],
           names=['year', 'month'])
new_df.index.names # levels的名称
FrozenList(['year', 'month'])
new_df.index.levels # 每个level的元组值
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])
Panel
# panel 三维数据,需要从不同维度访问
p = pd.Panel(np.arange(24).reshape(4,3,2),items=list('ABCD'),major_axis=pd.date_range('20130101', periods=3),minor_axis=['first', 'second'])
p
C:\Users\28599\AppData\Roaming\Python\Python37\site-packages\IPython\core\interactiveshell.py:3418: FutureWarning:
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.
  exec(code_obj, self.user_global_ns, self.user_ns)
<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 3 (major_axis) x 2 (minor_axis)
Items axis: A to D
Major_axis axis: 2013-01-01 00:00:00 to 2013-01-03 00:00:00
Minor_axis axis: first to second
p["A"]
first second
2013-01-01 0 1
2013-01-02 2 3
2013-01-03 4 5
p.major_xs("2013-01-03")
A B C D
first 4 10 16 22
second 5 11 17 23
p.minor_xs("first")
A B C D
2013-01-01 0 6 12 18
2013-01-02 2 8 14 20
2013-01-03 4 10 16 22
Series
# Series 带索引的一维数组
sr = data.iloc[1,:]
sr
index0   -1.351055
index1    1.263780
index2    0.049323
index3   -0.936266
index4   -0.116189
Name: num1, dtype: float64
sr.index
Index(['index0', 'index1', 'index2', 'index3', 'index4'], dtype='object')
sr.values
array([-1.35105539,  1.26378017,  0.04932303, -0.93626636, -0.1161889 ])
pd.Series(np.arange(10))
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32
pd.Series(np.arange(3, 9, 2), index=["a", "b", "c"])
a    3
b    5
c    7
dtype: int32

基本数据操作

data = pd.read_csv("./stock_day.csv")
data.head()
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58
data = data.drop(["ma5","ma10","ma20","v_ma5","v_ma10","v_ma20"], axis=1)
data.head()
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 2.39
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 1.53
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 1.32
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 0.90
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
# 先列后行
data["open"]["2018-02-26"]
22.8
# 按名字索引
data.loc["2018-02-26"]["open"]
# 同 data.loc["2018-02-26","open"]
22.8
# 按数字索引
data.iloc[1,0]
22.8
# 组合索引
# 获取前4行(第1天到第4天)中 ['open', 'close', 'high', 'low'] 这四个指标的结果
# data.ix[:4, ['open', 'close', 'high', 'low']] # .ix 已弃用(pandas 1.0 起已移除),不推荐使用
data.loc[data.index[0:4], ['open', 'close', 'high', 'low']]
# 同 data.iloc[0:4,data.columns.get_indexer(['open', 'close', 'high', 'low'])]
open close high low
2018-02-27 23.53 24.16 25.88 23.53
2018-02-26 22.80 23.53 23.78 22.80
2018-02-23 22.88 22.82 23.37 22.71
2018-02-22 22.25 22.28 22.76 22.02
# 赋值操作 (先索引再赋值即可)
data.open=100
data.head()
open high close low volume price_change p_change turnover
2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39
2018-02-26 100 23.78 23.53 22.80 60985.11 0.69 3.02 1.53
2018-02-23 100 23.37 22.82 22.71 52914.01 0.54 2.42 1.32
2018-02-22 100 22.76 22.28 22.02 36105.01 0.36 1.64 0.90
2018-02-14 100 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
# 对内容排序
# ascending=True 表示升序;high 相同时再按 p_change 排序
data.sort_values(by=['high','p_change'],ascending=True).head()
open high close low volume price_change p_change turnover
2015-03-02 100 12.67 12.52 12.20 96291.73 0.32 2.62 3.30
2015-03-04 100 12.92 12.90 12.61 67075.44 0.20 1.57 2.30
2015-03-03 100 13.06 12.70 12.52 139071.61 0.18 1.44 4.76
2015-09-07 100 13.38 12.77 12.63 52490.04 0.37 2.98 1.80
2015-03-05 100 13.45 13.16 12.87 93180.39 0.26 2.02 3.19
data.sort_index().head()
open high close low volume price_change p_change turnover
2015-03-02 100 12.67 12.52 12.20 96291.73 0.32 2.62 3.30
2015-03-03 100 13.06 12.70 12.52 139071.61 0.18 1.44 4.76
2015-03-04 100 12.92 12.90 12.61 67075.44 0.20 1.57 2.30
2015-03-05 100 13.45 13.16 12.87 93180.39 0.26 2.02 3.19
2015-03-06 100 14.48 14.28 13.13 179831.72 1.12 8.51 6.16

运算

# 算术运算
data["open"].sub(3).head()
# 等价于 data.open - 3
2018-02-27    97
2018-02-26    97
2018-02-23    97
2018-02-22    97
2018-02-14    97
Name: open, dtype: int64
# 逻辑运算
data[data['p_change'] > 2].head()
open high close low volume price_change p_change turnover
2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39
2018-02-26 100 23.78 23.53 22.80 60985.11 0.69 3.02 1.53
2018-02-23 100 23.37 22.82 22.71 52914.01 0.54 2.42 1.32
2018-02-14 100 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
2018-02-12 100 21.40 21.19 20.63 32445.39 0.82 4.03 0.81
data[(data["p_change"] > 2) & (data["low"] > 15)].head()
open high close low volume price_change p_change turnover
2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39
2018-02-26 100 23.78 23.53 22.80 60985.11 0.69 3.02 1.53
2018-02-23 100 23.37 22.82 22.71 52914.01 0.54 2.42 1.32
2018-02-14 100 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
2018-02-12 100 21.40 21.19 20.63 32445.39 0.82 4.03 0.81
# 逻辑运算函数
data.query("p_change > 2 & low > 15").head()
open high close low volume price_change p_change turnover
2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39
2018-02-26 100 23.78 23.53 22.80 60985.11 0.69 3.02 1.53
2018-02-23 100 23.37 22.82 22.71 52914.01 0.54 2.42 1.32
2018-02-14 100 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
2018-02-12 100 21.40 21.19 20.63 32445.39 0.82 4.03 0.81
# 判断'turnover'是否为4.19, 2.39
data[data["turnover"].isin([4.19, 2.39])]
open high close low volume price_change p_change turnover
2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39
2017-07-25 100 24.20 23.70 22.64 167489.48 0.67 2.91 4.19
2016-09-28 100 20.98 20.86 19.71 95580.75 0.98 4.93 2.39
2015-04-07 100 17.98 17.54 16.50 122471.85 0.88 5.28 4.19
# 统计运算
data.describe()
open high close low volume price_change p_change turnover
count 643.0 643.000000 643.000000 643.000000 643.000000 643.000000 643.000000 643.000000
mean 100.0 21.900513 21.336267 20.771835 99905.519114 0.018802 0.190280 2.936190
std 0.0 4.077578 3.942806 3.791968 73879.119354 0.898476 4.079698 2.079375
min 100.0 12.670000 12.360000 12.200000 1158.120000 -3.520000 -10.030000 0.040000
25% 100.0 19.500000 19.045000 18.525000 48533.210000 -0.390000 -1.850000 1.360000
50% 100.0 21.970000 21.450000 20.980000 83175.930000 0.050000 0.260000 2.500000
75% 100.0 24.065000 23.415000 22.850000 127580.055000 0.455000 2.305000 3.915000
max 100.0 36.350000 35.210000 34.010000 501915.410000 3.030000 10.030000 12.560000
# 累计统计函数 (类似前缀和)
data["p_change"].cumsum().head()
2018-02-27     2.68
2018-02-26     5.70
2018-02-23     8.12
2018-02-22     9.76
2018-02-14    11.81
Name: p_change, dtype: float64
data.max(axis=0)
open               100.00
high                36.35
close               35.21
low                 34.01
volume          501915.41
price_change         3.03
p_change            10.03
turnover            12.56
dtype: float64
data.idxmax(axis=0)
open            2018-02-27
high            2015-06-10
close           2015-06-12
low             2015-06-12
volume          2017-10-26
price_change    2015-06-09
p_change        2015-08-28
turnover        2017-10-26
dtype: object
# 累计统计函数
data["p_change"].sort_index().cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1959a1c6b38>

# 自定义计算
data.apply(lambda x: x.max() - x.min()) # 每一列最大值减最小值
open                 0.00
high                23.68
close               22.85
low                 21.81
volume          500757.29
price_change         6.55
p_change            20.06
turnover            12.52
dtype: float64
data["volume"].max() - data["volume"].min() # volume列最大值减最小值
500757.29
data.plot(x="volume", y="turnover", kind="scatter")
<matplotlib.axes._subplots.AxesSubplot at 0x1959a228438>

data.plot(x="high", y="low", kind="scatter")
<matplotlib.axes._subplots.AxesSubplot at 0x19597e1e3c8>



Pandas 学习笔记一相关推荐

  1. pandas学习笔记:pandas.Dataframe.rename()函数用法

    pandas学习笔记:pandas.Dataframe.rename()函数用法 pandas.Dataframe.rename()函数主要是用来修改Dataframe数据的行名和列名. 主要用到的参 ...

  2. Pandas学习笔记(一)

    Pandas学习笔记一 Pandas数组读取 读取csv.tsv.txt文件 读取excel文件 读取mysql数据表 Pandas数据结构 创建Series的几种方法 根据标签查询Series数据 ...

  3. pandas学习笔记之DateFrame

    pandas学习笔记之DateFrame 文章目录 pandas学习笔记之DateFrame 1.DateFrame的创建 1)认识DataFrame对象 2)由二维列表创建(默认index和colu ...

  4. 数据分析之pandas学习笔记(六)(层次化索引、重塑、轴向旋转、行列变换、合并表数据)

    数据分析之Pandas学习笔记(六)(层次化索引.重塑.轴向旋转.行列变换.合并表数据) level层次化索引 unstack()与stack()进行重塑,即:行列索引变换 swaplevel()交换 ...

  5. [Pandas 学习笔记] - No.1 pandas学习笔记

    pandas学习笔记 pandas是基于numpy开发出的数据分析包,用于高效地操作大型数据集.pandas的数据结构有三种 分别为 series,dataframe和panel,对应一维,二维,三维 ...

  6. pandas学习笔记之Series

    pandas学习笔记之Series 文章目录 pandas学习笔记之Series pandas中Series的创建 1)用python中的列表list创建: 2)用numpy数组创建 3)用pytho ...

  7. pandas学习笔记(三):数据的变换与数据的管理

    注:学习笔记基于文彤老师的pandas的系列课程 课程链接:https://study.163.com/course/courseMain.htm?courseId=1005124008&sh ...

  8. pandas 每一列相加_Python3 numpy amp; pandas 学习笔记

    写在前面 在用python做一些ML和DL的工作前,先看莫烦大佬的视频学习一下numpy和pandas这两个数据处理包,学习中记了一些笔记,便于自己日后查阅,同时发布到知乎希望能够帮助到其他小伙伴! ...

  9. Python科学计算-Numpy和Pandas学习笔记(一)-安装、Numpy属性和常用方法(chaochao`s Blog)

    1 前言 由于项目的原因,最近开始学习机器学习的相关内容,在写文字笔记的时候发现我写的字确实很丑.很难看,并且记笔记的速度也很慢,由此萌生了编写自己博客的想法.从今天开始,我会将学习的笔记以博客的方式 ...

最新文章

  1. elasticsearch: 权威指南_你还不会Elasticsearch的CUD?
  2. javascript:正则表达式、一个表单验证的例子
  3. SpringBoot - 优雅的实现【自定义参数校验】高级进阶
  4. Bounce 弹飞绵羊
  5. Neo4j简单的样例
  6. esri geometry-api-java的maven创建
  7. 使用HttpClient消费ASP.NET Web API服务
  8. 设计师分享社区,展示风采平台
  9. css文字不被点击,css 让文字不被选中之-moz-user-select 属性介绍
  10. Oracle完全手册,Oracle_11g+Oracle Sqldeveloper 安装完全手册(for win 7 64x)
  11. dedecms自定义表单提交成功后提示信息修改和跳转链接修改
  12. 关系抽取之远程监督算法:别再跟我提知识图谱(下篇)
  13. java 访问https 证书_Java+SSL证书访问Https站点
  14. 随机计数器java_Java高效计数器
  15. Java语言课程设计——简易计算器(详细含报告和源代码)
  16. 学习笔记:微波遥感反演土壤水分——理论模型和经验模型(1)
  17. AUTOCAD——线宽设置
  18. python bytes
  19. 三大指数快速入门和应用
  20. 安卓的网络视频播放器(简易版)

热门文章

  1. Win10怎么关闭自带杀毒软件?2招彻底关闭Windows Defender方法
  2. Excel 自动生成序号公式
  3. Generic patch v2.6 for TabsStudio and DevArt Entity Developer -屈指可数的更新版
  4. C语言——函数和数组的简单介绍
  5. 豆瓣:长在畅销榜边上
  6. Unity 灯光Shader
  7. android微信的抢红包插件
  8. 扫地机器人黑色耐脏吗_石头扫地机器人极夜黑评测丨石头扫地机器人极夜黑怎么样_什么值得买...
  9. const指针用法总结
  10. unity在打开项目时就完成相应平台的转换