Saving memory with pandas DataFrames
##############################Method 1###################################################
[1] Code as follows (this version can lose data precision):
import numpy as np

def memory_usage_mb(df, *args, **kwargs):
    """DataFrame memory usage in MB."""
    return df.memory_usage(*args, **kwargs).sum() / 1024**2

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Downcast integers to the smallest type whose range fits.
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Floats: only go to float32 if range and precision allow it.
                c_prec = df[col].apply(lambda x: np.finfo(x).precision).max()
                if (c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max
                        and c_prec == np.finfo(np.float32).precision):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
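The core of the trick above is easy to see on a toy frame: when a column's value range fits a smaller integer type, casting it shrinks memory proportionally. A minimal sketch (the column name "a" and the 100-row size are made up for illustration):

```python
import numpy as np
import pandas as pd

# Hypothetical toy frame: 100 small integers stored as int64.
df = pd.DataFrame({"a": np.arange(100, dtype="int64")})
before = df.memory_usage().sum()

# Same range check the function uses before downcasting.
c_min, c_max = df["a"].min(), df["a"].max()
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
    df["a"] = df["a"].astype(np.int8)

after = df.memory_usage().sum()
print(df["a"].dtype, before, after)
```

Each int64 value takes 8 bytes and each int8 value takes 1, so the column itself shrinks by 8x (the index overhead stays the same).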
##############################Method 2###################################################
[2] Code as follows (this version does not lose data precision):
import pandas as pd

def memory_usage_mb(df, *args, **kwargs):
    """DataFrame memory usage in MB."""
    return df.memory_usage(*args, **kwargs).sum() / 1024**2

def reduce_memory_usage(df, deep=True, verbose=True, categories=True):
    # All types that we want to change for "lighter" ones.
    # int8 and float16 are not included because we cannot reduce
    # those data types any further.
    # float32 is not included because float16 has too low precision.
    numeric2reduce = ["int16", "int32", "int64", "float64"]
    start_mem = 0
    if verbose:
        start_mem = memory_usage_mb(df, deep=deep)

    for col, col_type in df.dtypes.items():  # iteritems() was removed in pandas 2.0
        best_type = None
        if col_type == "object":
            df[col] = df[col].astype("category")
            best_type = "category"
        elif col_type in numeric2reduce:
            downcast = "integer" if "int" in str(col_type) else "float"
            df[col] = pd.to_numeric(df[col], downcast=downcast)
            best_type = df[col].dtype.name
        # Log the conversion performed.
        if verbose and best_type is not None and best_type != str(col_type):
            print(f"Column '{col}' converted from {col_type} to {best_type}")

    if verbose:
        end_mem = memory_usage_mb(df, deep=deep)
        diff_mem = start_mem - end_mem
        percent_mem = 100 * diff_mem / start_mem
        print(f"Memory usage decreased from"
              f" {start_mem:.2f}MB to {end_mem:.2f}MB"
              f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")
    return df
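The reason this method is lossless is that pd.to_numeric with downcast only picks a smaller dtype when every value survives the round trip; a small sketch of the behavior:

```python
import pandas as pd

# downcast="integer" picks the smallest integer dtype that holds the values.
s = pd.Series([1, 2, 3], dtype="int64")
s_small = pd.to_numeric(s, downcast="integer")
print(s_small.dtype)

# downcast="float" goes down to float32 at most (never float16).
f = pd.Series([0.5, 1.5], dtype="float64")
f_small = pd.to_numeric(f, downcast="float")
print(f_small.dtype)
```

If a value would not fit (say, 10**10 in the series), to_numeric simply keeps the original dtype rather than truncating.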
A bug in the code above is this line:
df[col] = df[col].astype("category")
After this conversion, fillna raises an error when the fill value is not one of the existing categories: fillna works fine on object columns, but a categorical column only accepts values that are already registered as categories.
So any NA-filling must be done before the memory-reduction step.
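The pitfall can be reproduced in a few lines (the placeholder value "missing" is made up for illustration; the exact exception type varies across pandas versions, hence the broad except):

```python
import pandas as pd

s = pd.Series(["a", "b", None]).astype("category")

# Filling with a value that is not a registered category raises.
try:
    s.fillna("missing")
except (TypeError, ValueError) as exc:
    print("fillna failed:", exc)

# Workarounds: fill before converting to category, or register the
# new category first and then fill.
s_ok = s.cat.add_categories("missing").fillna("missing")
print(s_ok.tolist())
```

Either workaround leaves the column categorical while keeping fillna usable.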
Usage:
import datatable as dt

train = dt.fread(folder + "train.csv")
train = train.to_pandas()
train = reduce_memory_usage(train)
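If datatable is not available, much of the saving can also be obtained at load time by passing dtypes straight to pandas' read_csv, so the oversized 64-bit columns are never materialized at all. A sketch (the column names "id"/"score" and the in-memory CSV are made up for illustration):

```python
import io
import pandas as pd

# Hypothetical two-column CSV; in practice this would be folder + "train.csv".
csv = io.StringIO("id,score\n1,0.5\n2,0.75\n")

# dtype mapping applied during parsing, before any DataFrame is built.
df = pd.read_csv(csv, dtype={"id": "int32", "score": "float32"})
print(df.dtypes.to_dict())
```

This complements the post-hoc reduction functions: read with compact dtypes where the ranges are known in advance, and run the reducer only on the remaining columns.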
##############################Method 3###################################################
Code from [3]:
def get_stats(df):
    stats = pd.DataFrame(index=df.columns,
                         columns=['na_count', 'n_unique', 'type', 'memory_usage'])
    for col in df.columns:
        stats.loc[col] = [df[col].isna().sum(),
                          df[col].nunique(dropna=False),
                          df[col].dtype,
                          df[col].memory_usage(deep=True, index=False) / 1024**2]
    stats.loc['Overall'] = [stats['na_count'].sum(),
                            stats['n_unique'].sum(),
                            None,
                            df.memory_usage(deep=True).sum() / 1024**2]
    return stats

def print_header():
    print('col        conversion       dtype   na uniq   size')
    print()

def print_values(name, conversion, col):
    template = '{:10} {:16} {:>7} {:2} {:6} {:1.2f}MB'
    print(template.format(name, conversion, str(col.dtype),
                          col.isna().sum(), col.nunique(dropna=False),
                          col.memory_usage(deep=True, index=False) / 1024**2))
# safe downcast
def sd(col, max_loss_limit=0.001, avg_loss_limit=0.001, na_loss_limit=0,
       n_uniq_loss_limit=0, fillna=0):
    """
    max_loss_limit - don't allow any float to lose more precision than this value.
        Any values are ok for GBT algorithms as long as you don't lose unique values.
        See https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Precision_limitations_on_decimal_values_in_[0,_1]
    avg_loss_limit - same, but calculated as the average over the series.
    na_loss_limit - not really useful.
    n_uniq_loss_limit - very important parameter. If you have a float field with very
        high cardinality, you can set this to something like n_records * 0.01 to allow
        some relaxing of the field.
    """
    is_float = str(col.dtype)[:5] == 'float'
    na_count = col.isna().sum()
    n_uniq = col.nunique(dropna=False)
    try_types = ['float16', 'float32']

    if na_count <= na_loss_limit:
        try_types = ['int8', 'int16', 'float16', 'int32', 'float32']

    for dtype_name in try_types:
        col_tmp = col

        # float-to-int conversion => round first to minimize the casting error
        if is_float and dtype_name[:3] == 'int':
            col_tmp = col_tmp.copy().fillna(fillna).round()

        col_tmp = col_tmp.astype(dtype_name)
        max_loss = (col_tmp - col).abs().max()
        avg_loss = (col_tmp - col).abs().mean()
        na_loss = np.abs(na_count - col_tmp.isna().sum())
        n_uniq_loss = np.abs(n_uniq - col_tmp.nunique(dropna=False))

        if (max_loss <= max_loss_limit and avg_loss <= avg_loss_limit
                and na_loss <= na_loss_limit and n_uniq_loss <= n_uniq_loss_limit):
            return col_tmp

    # the field can't be converted
    return col

def reduce_mem_usage_sd(df, deep=True, verbose=False, obj_to_cat=False):
    numerics = ['int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64',
                'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=deep).sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        # collect stats
        na_count = df[col].isna().sum()
        n_uniq = df[col].nunique(dropna=False)

        # numerics
        if col_type in numerics:
            df[col] = sd(df[col])

        # strings
        if (col_type == 'object') and obj_to_cat:
            df[col] = df[col].astype('category')

        if verbose:
            print(f'Column {col}: {col_type} -> {df[col].dtype}, '
                  f'na_count={na_count}, n_uniq={n_uniq}')
        new_na_count = df[col].isna().sum()
        if na_count != new_na_count:
            print(f'Warning: column {col}, {col_type} -> {df[col].dtype} lost na values. '
                  f'Before: {na_count}, after: {new_na_count}')
        new_n_uniq = df[col].nunique(dropna=False)
        if n_uniq != new_n_uniq:
            print(f'Warning: column {col}, {col_type} -> {df[col].dtype} lost unique values. '
                  f'Before: {n_uniq}, after: {new_n_uniq}')

    end_mem = df.memory_usage(deep=deep).sum() / 1024**2
    percent = 100 * (start_mem - end_mem) / start_mem
    if verbose:
        print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'
              .format(start_mem, end_mem, percent))
    return df
Usage:
print("train stats before reduction")
stats = get_stats(train)
print(stats)
train= reduce_mem_usage_sd(train, verbose=True)
print("train stats after reduction")
stats = get_stats(train)
print(stats)
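The loss check at the heart of sd() can be illustrated on its own: for values in [0, 1], float16 keeps roughly three decimal digits, so the round-trip error stays under the default max_loss_limit of 0.001 (the 1000-point series here is a made-up example):

```python
import numpy as np
import pandas as pd

# 1000 evenly spaced floats in [0, 1].
col = pd.Series(np.linspace(0, 1, 1000))

# Same loss measurement sd() performs after a candidate cast.
col16 = col.astype("float16")
max_loss = (col16 - col).abs().max()
avg_loss = (col16 - col).abs().mean()
print(max_loss, avg_loss)
```

With wider value ranges or stricter limits the float16 candidate fails the check and sd() falls back to float32 or keeps the original dtype, which is exactly why this variant is called a "safe" downcast.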
Code from:
[1]https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
[2]https://www.kaggle.com/c/ieee-fraud-detection/discussion/107653#latest-619384