小方法函数

# 显示数据的缺失程度
def miss_value_table(df):
    """Summarize the missing values of a DataFrame, column by column.

    Prints how many columns ``df`` has and how many of them contain at least
    one missing value, then returns the per-column summary.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with two columns,
        ``'Missing Value'`` (number of nulls) and ``'Missing Percent'``
        (nulls as a percentage of the row count). Only columns that contain
        missing values are listed, sorted by ``'Missing Percent'`` in
        descending order and rounded to one decimal place.
    """
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * (mis_val / len(df))
    mis_val_table = pd.DataFrame(
        {'Missing Value': mis_val, 'Missing Percent': mis_val_percent}
    )
    # Filter on the count, not the percentage: for a zero-row frame the
    # percentage is 0/0 = NaN, and NaN != 0 would wrongly keep every column.
    mis_val_table = (
        mis_val_table[mis_val_table['Missing Value'] > 0]
        .sort_values('Missing Percent', ascending=False)
        .round(1)
    )
    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {mis_val_table.shape[0]} columns that have missing values."
    )
    return mis_val_table

# 记录运行所耗时间
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Call it without an argument to obtain a start timestamp, then pass that
    timestamp back in to print how long the work in between took.

    Args:
        start_time: Falsy (default ``None``) to start timing, otherwise the
            ``datetime`` returned by an earlier call.

    Returns:
        The current ``datetime`` when timing is started; ``None`` after the
        elapsed time has been printed.
    """
    if not start_time:
        return datetime.now()

    elapsed = (datetime.now() - start_time).total_seconds()
    thour, temp_sec = divmod(elapsed, 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
    return None

# 降低数据的内存占用（对数值列做向下类型转换）
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that covers their range.

    Every column whose dtype is one of int16/int32/int64/float16/float32/
    float64 is inspected. Integer columns are cast to the narrowest of
    int8/int16/int32/int64 that contains the column's min and max; float
    columns are cast to float16 or float32 when their range fits, otherwise
    float64. Other columns are left untouched.

    The float check looks only at the value range, so casting to
    float16/float32 can reduce precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object, with the downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose extreme value equals the
            # limit of a type (e.g. 127 for int8) still fits in that type.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN min/max (e.g. an all-NaN column) fails every comparison and
            # therefore falls through to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

# .agg 的部分用法：上面为一个 df 的两列截取，如果要求每个不同 card_id 的 authorized_flag 平均值和总和：
# Mean and sum of ``authorized_flag`` for every distinct ``card_id``.
agg_fun = {'authorized_flag': ['mean', 'sum']}
auth_mean = df.groupby(['card_id']).agg(agg_fun)
# Flatten the two-level (column, statistic) header into single names such as
# 'authorized_flag_mean' so the result displays with plain column labels.
auth_mean.columns = ['_'.join(col).strip() for col in auth_mean.columns.values]
auth_mean.reset_index(inplace=True)

# .map用法
def get(x):
    """Return 1 when ``x`` is greater than 0.5, otherwise 0."""
    return 1 if x > 0.5 else 0


# Apply the 0.5 threshold to every element of ``oof`` (defined elsewhere;
# presumably out-of-fold predictions -- confirm against the caller).
y = list(map(get, oof))

#apply

作图分析

def plot_feature_distribution(df1, df2, label1, label2, features):
    """Overlay the density curves of two DataFrames, one subplot per feature.

    Args:
        df1: First DataFrame; must contain every column named in ``features``.
        df2: Second DataFrame; must contain the same columns.
        label1: Label attached to the curves drawn from ``df1``.
        label2: Label attached to the curves drawn from ``df2``.
        features: Sequence of column names to plot, in order.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    n_cols = 10
    # Keep the 10x10 layout for up to 100 features and add rows beyond that,
    # instead of failing when a subplot index past 100 is requested.
    n_rows = max(10, -(-len(features) // n_cols))

    sns.set_style('whitegrid')
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 2.2 * n_rows))
    for ax, feature in zip(axes.flat, features):
        # kdeplot replaces the deprecated distplot(..., hist=False).
        sns.kdeplot(df1[feature], label=label1, ax=ax)
        sns.kdeplot(df2[feature], label=label2, ax=ax)
        ax.set_xlabel(feature, fontsize=9)
        ax.set_ylabel('')  # keep the small panels uncluttered
        ax.tick_params(axis='x', which='major', labelsize=6, pad=-6)
        ax.tick_params(axis='y', which='major', labelsize=6)
    plt.show()


# Compare the per-feature distributions of the rows with target 0 and target 1.
t0 = train_df.loc[train_df['target'] == 0]
t1 = train_df.loc[train_df['target'] == 1]
features = train_df.columns.values[2:102]
plot_feature_distribution(t0, t1, '0', '1', features)

查看趋势

利用分位数查看整体股票趋势走向

data = []  # one line trace per quantile of the closing price
# market_train_df['close'] = market_train_df['close'] / 20
# The grouping does not depend on the quantile, so build it once.
close_by_time = market_train_df.groupby('time')['close']
for i in [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]:
    price_df = close_by_time.quantile(i).reset_index()
    data.append(
        go.Scatter(
            x=price_df['time'].dt.strftime(date_format='%Y-%m-%d').values,
            y=price_df['close'].values,
            name=f'{i} quantile',
        )
    )
layout = go.Layout(
    dict(
        title="Trends of closing prices by quantiles",
        xaxis=dict(title='Month'),
        yaxis=dict(title='Price (USD)'),
    ),
    legend=dict(orientation="h"),
)
py.iplot(dict(data=data, layout=layout), filename='basic-line')

某些事件使所有股票都在下跌，且有可能贫富差距越来越大（低的越低，高的越高）。

数据预处理

0、特征工程流程：https://www.cnblogs.com/jasonfreak/p/5448385.html 其中的特征选择方法相当重要
1、查看特征值分布情况，计算偏度(连续特征)，左偏或右偏的情况下考虑使用log或是Box-Cox变换
2、数据中心化或标准化 (对于某些模型很有必要）
3、利用回归线查看特征(连续)与结果变量是否是线性的或非线性的，考虑增加某些变量的二次项。

特征选择

1.简易的利用树模型来对特征重要性排列：
https://blog.csdn.net/m0_37477175/article/details/80567010
2.利用null importances (target permutation)进行特征选择，
论文来源：https://academic.oup.com/bioinformatics/article/26/10/1340/193348
kaggle案例代码为：https://www.kaggle.com/ogrellier/feature-selection-with-null-importances
使用方法 2 的原因：树模型的特征重要性会偏向取值个数多的类别特征。

调参

贝叶斯超参优化 (介绍)：https://blog.csdn.net/Snail_Ren/article/details/79005069 ，https://blog.csdn.net/buptgshengod/article/details/81906225

来源kaggle: https://www.kaggle.com/fabiendaniel/hyperparameter-tuning

# 以 lightGBM 为例：
def LGB_CV(max_depth, num_leaves, min_data_in_leaf, feature_fraction, bagging_fraction, lambda_l1):
    """Objective for Bayesian optimisation: negative 5-fold CV RMSE of LightGBM.

    Reads the module-level ``train``, ``target``, ``features`` and
    ``categorical_feats`` defined elsewhere in the file.

    Args:
        max_depth: Tree depth; truncated to ``int`` before use.
        num_leaves: Number of leaves; truncated to ``int`` before use.
        min_data_in_leaf: Minimum samples per leaf; truncated to ``int``.
        feature_fraction: Passed through as ``feature_fraction``.
        bagging_fraction: Passed through as ``bagging_fraction``.
        lambda_l1: Passed through as ``lambda_l1``.

    Returns:
        The negated RMSE of the out-of-fold predictions. The sign is flipped
        because the optimiser maximises its objective.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(train.shape[0])

    # The parameters are the same for every fold, so build them once.
    param = {
        'num_leaves': int(num_leaves),
        'min_data_in_leaf': int(min_data_in_leaf),
        'objective': 'regression',
        'max_depth': int(max_depth),
        'learning_rate': 0.01,
        "boosting": "gbdt",
        "feature_fraction": feature_fraction,
        "bagging_freq": 1,
        "bagging_fraction": bagging_fraction,
        "bagging_seed": 11,
        "metric": 'rmse',
        "lambda_l1": lambda_l1,
        "verbosity": -1,
    }

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(
            train.iloc[trn_idx][features],
            label=target.iloc[trn_idx],
            categorical_feature=categorical_feats,
        )
        val_data = lgb.Dataset(
            train.iloc[val_idx][features],
            label=target.iloc[val_idx],
            categorical_feature=categorical_feats,
        )
        # NOTE(review): ``verbose_eval`` / ``early_stopping_rounds`` keyword
        # arguments depend on the installed LightGBM version -- confirm.
        clf = lgb.train(
            param,
            trn_data,
            10000,
            valid_sets=[trn_data, val_data],
            verbose_eval=500,
            early_stopping_rounds=200,
        )
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
        # Test-set prediction, if needed:
        # predictions_ridge += clf.predict(test_data) / folds.n_splits
        del clf, trn_idx, val_idx
        gc.collect()

    return -mean_squared_error(oof, target) ** 0.5


# Search ranges for each hyper-parameter.
LGB_BO = BayesianOptimization(LGB_CV, {
    'max_depth': (4, 10),
    'num_leaves': (5, 130),
    'min_data_in_leaf': (10, 150),
    'feature_fraction': (0.7, 1.0),
    'bagging_fraction': (0.7, 1.0),
    'lambda_l1': (0, 6),
})

start_time = timer(None)
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    # NOTE(review): ``acq`` / ``xi`` keyword arguments depend on the installed
    # bayes_opt version -- confirm.
    LGB_BO.maximize(init_points=2, n_iter=20, acq='ei', xi=0.0)
timer(start_time)

处理离群标签

使用outlier labeling 和分层抽样:https://www.kaggle.com/chauhuynh/my-first-kernel-3-699

Stacking

将训练好的所有基模型对整个训练集进行预测，第j个基模型对第i个训练样本的预测值将作为新的训练集中第i个样本的第j个特征值，最后基于新的训练集进行训练。预测同理。
https://www.kaggle.com/tunguz/eloda-with-feature-engineering-and-stacking

# xgboost，被融合的一个模型
# 5-fold stratified CV with XGBoost: collects out-of-fold predictions for the
# training set and fold-averaged predictions for the test set.
# NOTE(review): 'reg:linear' and 'silent' may be deprecated in the installed
# XGBoost version -- confirm before changing them.
xgb_params = {
    'eta': 0.005,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': True,
}
oof_xgb_3 = np.zeros(len(train))
predictions_xgb_3 = np.zeros(len(test))
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
# The test matrix is identical for every fold, so build it once.
test_dmatrix = xgb.DMatrix(test[train_columns])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['outliers'].values)):
    print('-')
    print("Fold {}".format(fold_ + 1))
    trn_data = xgb.DMatrix(data=train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(data=train.iloc[val_idx][train_columns], label=target.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    print("xgb " + str(fold_) + "-" * 50)
    num_round = 10000
    xgb_model = xgb.train(
        xgb_params,
        trn_data,
        num_round,
        watchlist,
        early_stopping_rounds=50,
        verbose_eval=1000,
    )
    oof_xgb_3[val_idx] = xgb_model.predict(
        xgb.DMatrix(train.iloc[val_idx][train_columns]),
        ntree_limit=xgb_model.best_ntree_limit + 50,
    )
    predictions_xgb_3 += xgb_model.predict(
        test_dmatrix,
        ntree_limit=xgb_model.best_ntree_limit + 50,
    ) / folds.n_splits

np.save('oof_xgb_3', oof_xgb_3)
np.save('predictions_xgb_3', predictions_xgb_3)
# Out-of-fold RMSE (shown as the cell result when run in a notebook).
np.sqrt(mean_squared_error(target.values, oof_xgb_3))

# stacking
# Second-level features: one column per base model, holding that model's
# out-of-fold predictions (train) or fold-averaged predictions (test).
train_stack = np.vstack([oof_lgb_3, oof_xgb_3]).transpose()
test_stack = np.vstack([predictions_lgb_3, predictions_xgb_3]).transpose()

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, train['outliers'].values)):
    print("fold n°{}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    # Ridge regression is the meta-model fitted on the base models' outputs.
    clf = Ridge(alpha=1)
    clf.fit(trn_data, trn_y)

    oof[val_idx] = clf.predict(val_data)
    predictions += clf.predict(test_stack) / folds.n_splits

# Out-of-fold RMSE of the stacked model (shown as the cell result in a notebook).
np.sqrt(mean_squared_error(target.values, oof))

数据分析 kaggle 天池竞赛常用函数整理相关推荐

mysql数据库常用函数_《MySQL数据库》常用函数整理
原标题:<MySQL数据库>常用函数整理以下内容,是我整理出来的比较常用的字符串函数,数值函数,日期函数. 第一类:字符串函数 1.conv(n,from_base,to_base):对 ...
pandas 常用函数整理
pandas常用函数整理,作为个人笔记. 仅标记函数大概用途做索引用,具体使用方式请参照pandas官方技术文档. 约定 from pandas import Series, DataFrame im ...
DL4J的矩阵处理模块ND4J的常用函数整理
ND4J矩阵变换的常用函数整理由本人阅读nd4j源码整理,nd4j是dl4j为了矩阵运算整出来的一套工具.对应python里的numpy,但是并没有numpy那么普及,不过至少使用dl4j搭建神经网 ...
opencv常用函数整理
opencv常用函数整理一.常见函数整理 1.1 cv2.resize函数(图像size重定义) 1.2 cv2.getStructuringElement函数(构造卷积核) 1.3.cv2.put ...
oracle有哪些常用函数,Oracle常用函数整理
点击关注上方"SQL数据库开发", 设为"置顶或星标",第一时间送达干货之前已经给小伙伴们整理了SQL Server和Mysql的常用函数,还没有看的可以戳下 ...
C++-string常用函数整理（建议收藏）
作者:翟天保Steven 版权声明:著作权归作者所有,商业转载请联系作者获得授权,非商业转载请注明出处最近刷笔试题,涉及到许多字符串相关的题目,所以将一些常用的函数整理一下,便于后面查看.本文后续持 ...
STM32F103标准固件库寄存器及常用函数整理
学习STM32其实就是学习它的寄存器以及函数的使用,能用单片机的资源实现自己想要的功能,从最开始的点亮一个led,到使用按键,串口,ADC/DAC,定时器计时计数,输出PWM,驱动电机,使用IIC,S ...
mysql常用函数整理
mysql常用函数: 1⃣️concat(str1,str2,str3-) 同一条记录中多个字段进行连接 2⃣️group_concat(字段名) 一对多关联查询时,如果返回一条记录对应另外一张表多条 ...
最详细的HIve常用函数整理及案例演示
Hive常用函数一.测试数据集 1.1 测试数据集: 1.2 结果展示二.常用函数 2.1 关系运算 2.1.1 常见关系运算符 2.1.2 空值判断 2.1.3 非空判断 2.1.4 LIKE ...

数据分析 kaggle 天池竞赛常用函数整理

小方法函数

作图分析

查看趋势

数据预处理

特征选择

调参

处理离群标签

Stacking

数据分析 kaggle 天池竞赛常用函数整理相关推荐

最新文章

热门文章

数据分析 kaggle 天池 竞赛常用函数整理

小方法函数

作图分析

查看趋势

数据预处理

特征选择

调参

处理离群标签

Stacking

数据分析 kaggle 天池 竞赛常用函数整理相关推荐

最新文章

热门文章

数据分析 kaggle 天池竞赛常用函数整理

数据分析 kaggle 天池竞赛常用函数整理相关推荐