Microsoft_Malware_Prediction_kaggle_2nd
2024-05-10 21:05:39
GitHub原文
该解决方案的核心思路是:先进行数据预处理,然后构建并训练两个模型,最后对两个模型的预测结果取平均值。
使用的算法:LightGBM
数据处理
减少内存使用
对训练集和测试集进行数据清理
# Explicit per-column dtypes, chosen to keep memory usage low.
# NOTE(review): int8 only holds -128..127 and float16 cannot represent every
# integer above 2048 exactly -- confirm that the identifier columns mapped to
# these types really fit before relying on their exact values.
dtypes = {
    'MachineIdentifier': 'category',
    'ProductName': 'category',
    'EngineVersion': 'category',
    'AppVersion': 'category',
    'AvSigVersion': 'category',
    'IsBeta': 'int8',
    'RtpStateBitfield': 'float16',
    'IsSxsPassiveMode': 'int8',
    'DefaultBrowsersIdentifier': 'float16',
    'AVProductStatesIdentifier': 'float32',
    'AVProductsInstalled': 'float16',
    'AVProductsEnabled': 'float16',
    'HasTpm': 'int8',
    'CountryIdentifier': 'int16',
    'CityIdentifier': 'float32',
    'OrganizationIdentifier': 'float16',
    'GeoNameIdentifier': 'float16',
    'LocaleEnglishNameIdentifier': 'int8',
    'Platform': 'category',
    'Processor': 'category',
    'OsVer': 'category',
    'OsBuild': 'int16',
    'OsSuite': 'int16',
    'OsPlatformSubRelease': 'category',
    'OsBuildLab': 'category',
    'SkuEdition': 'category',
    'IsProtected': 'float16',
    'AutoSampleOptIn': 'int8',
    'PuaMode': 'category',
    'SMode': 'float16',
    'IeVerIdentifier': 'float16',
    'SmartScreen': 'category',
    'Firewall': 'float16',
    'UacLuaenable': 'float32',
    'Census_MDC2FormFactor': 'category',
    'Census_DeviceFamily': 'category',
    'Census_OEMNameIdentifier': 'float16',
    'Census_OEMModelIdentifier': 'float32',
    'Census_ProcessorCoreCount': 'float16',
    'Census_ProcessorManufacturerIdentifier': 'float16',
    'Census_ProcessorModelIdentifier': 'float16',
    'Census_ProcessorClass': 'category',
    'Census_PrimaryDiskTotalCapacity': 'float32',
    'Census_PrimaryDiskTypeName': 'category',
    'Census_SystemVolumeTotalCapacity': 'float32',
    'Census_HasOpticalDiskDrive': 'int8',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_ChassisTypeName': 'category',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
    'Census_PowerPlatformRoleName': 'category',
    'Census_InternalBatteryType': 'category',
    'Census_InternalBatteryNumberOfCharges': 'float32',
    'Census_OSVersion': 'category',
    'Census_OSArchitecture': 'category',
    'Census_OSBranch': 'category',
    'Census_OSBuildNumber': 'int16',
    'Census_OSBuildRevision': 'int32',
    'Census_OSEdition': 'category',
    'Census_OSSkuName': 'category',
    'Census_OSInstallTypeName': 'category',
    'Census_OSInstallLanguageIdentifier': 'float16',
    'Census_OSUILocaleIdentifier': 'int16',
    'Census_OSWUAutoUpdateOptionsName': 'category',
    'Census_IsPortableOperatingSystem': 'int8',
    'Census_GenuineStateName': 'category',
    'Census_ActivationChannel': 'category',
    'Census_IsFlightingInternal': 'float16',
    'Census_IsFlightsDisabled': 'float16',
    'Census_FlightRing': 'category',
    'Census_ThresholdOptIn': 'float16',
    'Census_FirmwareManufacturerIdentifier': 'float16',
    'Census_FirmwareVersionIdentifier': 'float32',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsWIMBootEnabled': 'float16',
    'Census_IsVirtualDevice': 'float16',
    'Census_IsTouchEnabled': 'int8',
    'Census_IsPenCapable': 'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
    'Wdft_IsGamer': 'float16',
    'Wdft_RegionIdentifier': 'float16',
    'HasDetections': 'int8',
}
def convert_types(df):
    """Downcast the columns of *df* to smaller dtypes to reduce memory usage.

    * ``object`` columns become ``category``.
    * ``int16``/``int32``/``int64`` columns are cast to the narrowest signed
      integer type whose range contains the column's minimum and maximum.
    * ``float16``/``float32``/``float64`` columns are cast to the narrowest
      float type whose range contains the column's minimum and maximum,
      falling back to ``float64``.

    Columns of any other dtype (including ``int8``) are left untouched.

    The frame is modified in place and also returned for convenience.

    Note: only the value *range* is checked. Casting to ``float16`` or
    ``float32`` can still round values that the narrower type cannot
    represent exactly.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target types, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for c in df:
        col_type = str(df[c].dtypes)

        # Convert objects to category
        if col_type == 'object':
            df[c] = df[c].astype('category')
            continue

        if col_type not in numerics:
            continue

        c_min = df[c].min()
        c_max = df[c].max()

        if col_type[:3] == 'int':
            # Bounds are inclusive: a value equal to the type's limit still
            # fits. If no candidate matches (e.g. empty column, min/max are
            # NaN) the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[c] = df[c].astype(candidate)
                    break
        else:
            chosen = np.float64  # fallback when no narrower type fits
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[c] = df[c].astype(chosen)

    return df
首先将存在太多缺失值的特征删去,再将特征分为:numerical,categorical, binary三类。
数值特征(numerical):将NaN值设置为“-1”
# Numerical features: mark missing values with the sentinel -1 and remember
# every processed feature in list_nan_minus1.
for feature in numerical_features:
    tmp1 = len(df)
    df[feature] = df[feature].fillna(-1)
    list_nan_minus1.append(feature)
二进制类特征(binary):用最常见的值替换NaN值,并将其添加到列表中,以便稍后对测试数据执行相同的操作
# Binary features: replace missing values with the most frequent value and
# record which value was used (0 -> list_nan_0, anything else -> list_nan_1).
for feature in binary_features:
    # Compute the mode once; filling NaN with the mode cannot change it, so
    # there is no need to scan the column a second time afterwards.
    most_frequent = df[feature].mode()[0]
    df[feature] = df[feature].fillna(most_frequent)
    if most_frequent == 0:
        list_nan_0.append(feature)
    else:
        list_nan_1.append(feature)
category类型特征:将所有取值统一转换为小写,避免同一取值因大小写不同而被当作不同的类别。
# Categorical features: lower-case every value so that spellings differing
# only in case end up as the same value.
for feature in categorical_features:
    lowered = df[feature].str.lower()
    df[feature] = lowered
然后手工对一些值进行修改,并将一些很少出现的值更改为“unknown”;之后再用“unknown”替换NaN值。
# Census_ActivationChannel: cast to str, apply the renaming helper and
# convert back to a categorical column.
df['Census_ActivationChannel'] = df['Census_ActivationChannel'].astype(str)
df['Census_ActivationChannel'] = df['Census_ActivationChannel'].apply(rename_Census_ActivationChannel)
df['Census_ActivationChannel'] = df['Census_ActivationChannel'].astype('category')


def rename_Census_ChassisTypeName(x):
    """Normalise one chassis type name.

    The value is lower-cased; anything containing ``'laptop'`` is mapped to
    ``'notebook'`` and anything containing ``'other'`` to ``'unknown'``.
    All other values are returned lower-cased.
    """
    x = x.lower()
    if 'laptop' in x:
        # Return the lower-case spelling: every other value leaving this
        # function is lower-case, so 'Notebook' would be a separate category
        # from rows that already read 'notebook'.
        return 'notebook'
    if 'other' in x:
        return 'unknown'
    return x


# Census_ChassisTypeName: fill missing values, normalise the names and drop
# categories that no longer occur after renaming.
df['Census_ChassisTypeName'] = df['Census_ChassisTypeName'].fillna('unknown')
df['Census_ChassisTypeName'] = df['Census_ChassisTypeName'].astype(str)
df['Census_ChassisTypeName'] = df['Census_ChassisTypeName'].apply(rename_Census_ChassisTypeName)
df['Census_ChassisTypeName'] = df['Census_ChassisTypeName'].astype('category')
df['Census_ChassisTypeName'] = df['Census_ChassisTypeName'].cat.remove_unused_categories()
对训练集进行以上的数据清理后,再对测试集也进行相同的处理
构建模型并训练
接着在构建模型一之前对一些基本的标签进行编码。
# Features that are encoded by frequency rank
list_frequency_encoding = [
    'AppVersion',
    'AvSigVersion',
    'Census_OSVersion',
    'EngineVersion',
    'OsBuildLab',
]

# Candidate features for label encoding
correct_feature_by_hand = [
    'AppVersion',
    'AvSigVersion',
    'Census_ActivationChannel',
    'Census_ChassisTypeName',
    'Census_DeviceFamily',
    'Census_FlightRing',
    'Census_GenuineStateName',
    'Census_MDC2FormFactor',
    'Census_OSArchitecture',
    'Census_OSBranch',
    'Census_OSEdition',
    'Census_OSInstallTypeName',
    'Census_OSSkuName',
    'Census_OSVersion',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_PowerPlatformRoleName',
    'Census_PrimaryDiskTypeName',
    'EngineVersion',
    'MachineIdentifier',
    'OsBuildLab',
    'OsPlatformSubRelease',
    'OsVer',
    'Platform',
    'Processor',
    'SkuEdition',
    'SmartScreen',
    'Census_MDC2FormFactor_new',
]

# Label-encode everything that is not already frequency-encoded
list_label_encoding = list(set(correct_feature_by_hand).difference(list_frequency_encoding))
list_label_encoding.remove(data_id)  # We don't want to encode this feature
# Create a function for frequency-based encoding
def frequency_encoding(feature, data=None):
    """Map each value of column *feature* to its frequency rank.

    Values are ranked by descending frequency (``0`` = most frequent). All
    values that occur only once share one common label: the number of values
    occurring two or more times.

    Args:
        feature: name of the column to encode.
        data: DataFrame holding the column. Defaults to the module-level
            ``df``, which keeps the original one-argument call working.

    Returns:
        dict mapping each observed value to its integer label. The dict is
        empty when the column holds no values.
    """
    frame = df if data is None else data

    # value_counts() sorts by descending frequency, so values seen at least
    # twice occupy positions 0..repeated-1 and the singletons follow them.
    counts = frame[feature].value_counts()
    repeated = int((counts.to_numpy() > 1).sum())

    # Capping the position at `repeated` gives every singleton the shared
    # label. This does not depend on the column names produced by
    # reset_index(), which differ between pandas versions, and it stays
    # well-defined (label 0) when every value occurs exactly once.
    labels = np.minimum(np.arange(len(counts)), repeated)
    return dict(zip(counts.index, labels.tolist()))
# Encode every feature listed in 'list_frequency_encoding'
for feature in tqdm(list_frequency_encoding):
    freq_enc_dict = frequency_encoding(feature)
    # Values without an entry in the dictionary become NaN
    df[feature] = df[feature].map(lambda value: freq_enc_dict.get(value, np.nan))
    df[feature] = df[feature].astype('int64')

# Encode every feature listed in 'list_label_encoding'
for feature in tqdm(list_label_encoding):
    le = preprocessing.LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    df[feature] = df[feature].astype('int64')
# Restore the train data
# Rows with a known target form the training set
train = df.loc[df[target].notna()]
train.shape

# Restore the test data (rows without a target)
test = df.loc[df[target].isna()]
test.shape

# ## Safe cleaned and encoded Train and Test data file
# Save the encoded data-files
train.to_csv('../data/train_M1.csv', index=False)
test.to_csv('../data/test_M1.csv', index=False)
使用LightGBM算法并设置超参数来训练模型,然后对测试集进行预测,最后列出按重要性排序的全部特征。
# labels from train file, as a flat int8 array
train_labels = np.array(train[target].astype(np.int8)).reshape(-1)

# remove target and data_id for model training
train = train.drop(columns=[target, data_id])

# save data_id for submission file
test_ids = list(test[data_id])

# remove target and data_id for prediction
test = test.drop(columns=[target, data_id])
# ## Set parameters for LightGBM algorithm
# Categorical features for LighGBM parameter
# Columns LightGBM should treat as categorical
categorical_feature = [
    'OsPlatformSubRelease',
    'Census_MDC2FormFactor_new',
    'Census_FlightRing',
    'Census_PrimaryDiskTypeName',
    'Census_OSSkuName',
    'Census_OSBranch',
    'OsVer',
    'SkuEdition',
    'Census_OSArchitecture',
    'Census_OSEdition',
    'Census_GenuineStateName',
    'Processor',
    'SmartScreen',
    'Census_OSInstallTypeName',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_ChassisTypeName',
    'Census_MDC2FormFactor',
    'Platform',
    'Census_DeviceFamily',
    'Census_ActivationChannel',
    'Census_PowerPlatformRoleName',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_ProcessorCoreCount',
    'Census_HasOpticalDiskDrive',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'Census_IsPenCapable',
    'Census_IsPortableOperatingSystem',
    'Census_IsSecureBootEnabled',
    'Census_IsTouchEnabled',
    'Census_IsVirtualDevice',
    'Firewall',
    'HasTpm',
    'IsProtected',
    'IsSxsPassiveMode',
    'SMode',
    'Wdft_IsGamer',
]

# Hyper-parameters passed to the LightGBM classifier
best_hyp = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree': 0.6110437067662637,
    'learning_rate': 0.0106,
    'min_child_samples': 295,
    'num_leaves': 160,
    'reg_alpha': 0.6321152748961743,
    'reg_lambda': 0.6313659622714517,
    'subsample_for_bin': 80000,
    'subsample': 0.8202307264855064,
}
estimators = 12000

# ## Train model
train_set = lgb.Dataset(train, label=train_labels)

# Re-create the best model and train on full training data
model = lgb.LGBMClassifier(n_estimators=estimators, n_jobs=-1, objective='binary', random_state=50, **best_hyp)
model.fit(train, train_labels, categorical_feature=categorical_feature)
# time: 1h 52min 39s

# ## save model
# Use a context manager so the file is flushed and closed even if
# pickling fails.
with open("../models/model_M1.p", "wb") as model_file:
    pickle.dump(model, model_file)
#model = pickle.load(open("model/model_M1.p", "rb"))

# # predict on Test set
# Keep only the probability of the positive class (second column)
preds = model.predict_proba(test)[:, 1]
# time: 40min 42s

# create submission
submission = pd.DataFrame({data_id: test_ids, target: preds})
submission.shape
# rows=7.853.253
submission.head()

# save submission file
submission.to_csv('../submissions/Submission_M1.csv', index=False)
# # Feature Importance
# list of all features
feature_names = list(train.columns)

# create feature importance
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})

# sort for most important
feature_importances = feature_importances.sort_values('importance', ascending=False).reset_index(drop=True)

# calculate normalized and cumulative feature importance
importance_total = feature_importances['importance'].sum()
feature_importances['normalized_importance'] = feature_importances['importance'] / importance_total
feature_importances['cumulative_importance'] = feature_importances['normalized_importance'].cumsum()
feature_importances
feature_importances.to_csv('../feature_importances/FeatureImportance_M1.csv', index=False)
来自模型1的预处理数据扩展了新的特征(‘monitor_dims’, ‘SmartScreen_AVProductsInstalled’),用扩展后的训练集对模型二进行训练。
在训练模型之前仍需对特征进行处理
# Build the feature-engineering function
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator with undefined results replaced by -1.

    Division by zero yields +/-inf rather than NaN, which a plain
    ``fillna(-1)`` would leave in place, so infinities are treated as
    missing as well.
    """
    import numpy as np

    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(-1)


def featureengineering(df):
    """Add the engineered features to *df* in place and return it.

    New numeric columns (missing results are set to -1):
      primary_drive_c_ratio, non_primary_drive_MB, aspect_ratio, Screen_Area,
      ram_per_processor, ProcessorCoreCount_DisplaySizeInInches.
    New categorical columns:
      monitor_dims ("<horizontal>*<vertical>" resolution) and
      SmartScreen_AVProductsInstalled (string concatenation of both columns).

    Side effect: ``SmartScreen`` and ``AVProductsInstalled`` are converted to
    string-valued categorical columns.
    """
    horizontal = df['Census_InternalPrimaryDisplayResolutionHorizontal']
    vertical = df['Census_InternalPrimaryDisplayResolutionVertical']
    diagonal = df['Census_InternalPrimaryDiagonalDisplaySizeInInches']

    # Share of the primary disk taken by the system volume, and the rest of it
    df['primary_drive_c_ratio'] = _safe_ratio(
        df['Census_SystemVolumeTotalCapacity'], df['Census_PrimaryDiskTotalCapacity'])
    df['non_primary_drive_MB'] = (
        df['Census_PrimaryDiskTotalCapacity'] - df['Census_SystemVolumeTotalCapacity'])
    df['non_primary_drive_MB'] = df['non_primary_drive_MB'].fillna(-1)

    # Display geometry
    df['aspect_ratio'] = _safe_ratio(horizontal, vertical)
    df['monitor_dims'] = horizontal.astype(str) + '*' + vertical.astype('str')
    df['monitor_dims'] = df['monitor_dims'].fillna('unknown')
    df['monitor_dims'] = df['monitor_dims'].astype('category')
    # Uses the already filled aspect_ratio column, as the original code did
    df['Screen_Area'] = (df['aspect_ratio'] * (diagonal ** 2)) / (df['aspect_ratio'] ** 2 + 1)
    df['Screen_Area'] = df['Screen_Area'].fillna(-1)

    # Processor related features
    df['ram_per_processor'] = _safe_ratio(
        df['Census_TotalPhysicalRAM'], df['Census_ProcessorCoreCount'])
    df['ProcessorCoreCount_DisplaySizeInInches'] = df['Census_ProcessorCoreCount'] * diagonal
    df['ProcessorCoreCount_DisplaySizeInInches'] = (
        df['ProcessorCoreCount_DisplaySizeInInches'].fillna(-1))

    # Combined SmartScreen / installed-AV-products category
    df['SmartScreen'] = df['SmartScreen'].astype(str)
    df['AVProductsInstalled'] = df['AVProductsInstalled'].astype(str)
    df['SmartScreen_AVProductsInstalled'] = df['SmartScreen'] + df['AVProductsInstalled']
    df['SmartScreen_AVProductsInstalled'] = df['SmartScreen_AVProductsInstalled'].fillna('unknown')
    df['SmartScreen_AVProductsInstalled'] = df['SmartScreen_AVProductsInstalled'].astype('category')
    df['SmartScreen'] = df['SmartScreen'].astype('category')
    df['AVProductsInstalled'] = df['AVProductsInstalled'].astype('category')
    return df
# add feature to encoding list
# Register each engineered categorical feature in the label-encoding list and
# in the list of categorical model parameters
for engineered_feature in ('monitor_dims', 'SmartScreen_AVProductsInstalled'):
    new_features_labelencode.append(engineered_feature)
    new_features_category.append(engineered_feature)

# create features for train data
train = featureengineering(train)

# create features for test data
test = featureengineering(test)
new_features_labelencode
new_features_category

# ## save Data sets
train.to_csv('../data/train_featureengineering_M2.csv', index=False)
test.to_csv('../data/test_featureengineering_M2.csv', index=False)
将处理好的训练集进行基于频率和标签的编码,为训练模型二做准备
# Build up a list with all the features, which should be encoded via frequency
list_frequency_encoding = [
    'AppVersion',
    'AvSigVersion',
    'Census_OSVersion',
    'EngineVersion',
    'OsBuildLab',
]

# Build up a list with all the features, which should be encoded by label (part 1)
correct_feature_by_hand = [
    'AppVersion',
    'AvSigVersion',
    'Census_ActivationChannel',
    'Census_ChassisTypeName',
    'Census_DeviceFamily',
    'Census_FlightRing',
    'Census_GenuineStateName',
    'Census_MDC2FormFactor',
    'Census_OSArchitecture',
    'Census_OSBranch',
    'Census_OSEdition',
    'Census_OSInstallTypeName',
    'Census_OSSkuName',
    'Census_OSVersion',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_PowerPlatformRoleName',
    'Census_PrimaryDiskTypeName',
    'EngineVersion',
    'MachineIdentifier',
    'OsBuildLab',
    'OsPlatformSubRelease',
    'OsVer',
    'Platform',
    'Processor',
    'SkuEdition',
    'SmartScreen',
    'Census_MDC2FormFactor_new',
]

# new features from feature engineering
new_features_labelencode = ['monitor_dims', 'SmartScreen_AVProductsInstalled']

# Build up a list with all the features, which should be encoded by label (part 2)
list_label_encoding = list(set(correct_feature_by_hand).difference(list_frequency_encoding))
list_label_encoding.remove(data_id)
list_label_encoding.extend(new_features_labelencode)
# Create a function for frequency encoding
def frequency_encoding(feature, data=None):
    """Map each value of column *feature* to its frequency rank.

    Values are ranked by descending frequency (``0`` = most frequent). All
    values that occur only once share one common label: the number of values
    occurring two or more times.

    Args:
        feature: name of the column to encode.
        data: DataFrame holding the column. Defaults to the module-level
            ``df``, which keeps the original one-argument call working.

    Returns:
        dict mapping each observed value to its integer label. The dict is
        empty when the column holds no values.
    """
    frame = df if data is None else data

    # value_counts() sorts by descending frequency, so values seen at least
    # twice occupy positions 0..repeated-1 and the singletons follow them.
    counts = frame[feature].value_counts()
    repeated = int((counts.to_numpy() > 1).sum())

    # Capping the position at `repeated` gives every singleton the shared
    # label. This does not depend on the column names produced by
    # reset_index(), which differ between pandas versions, and it stays
    # well-defined (label 0) when every value occurs exactly once.
    labels = np.minimum(np.arange(len(counts)), repeated)
    return dict(zip(counts.index, labels.tolist()))
# Creating a dictionary for storing the encoder
# Encoders are collected per feature so they can be saved afterwards
enc_dict = dict()

# Encode all the features in 'list_frequency_encoding'
for feature in tqdm(list_frequency_encoding):
    freq_enc_dict = frequency_encoding(feature)
    # Values without an entry in the dictionary become NaN
    df[feature] = df[feature].map(lambda x: freq_enc_dict.get(x, np.nan))
    df[feature] = df[feature].astype('int64')
    # Save the freq_enc_dict
    enc_dict[feature] = freq_enc_dict

# Encode all the features in 'list_label_encoding'
for feature in tqdm(list_label_encoding):
    le = preprocessing.LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    df[feature] = df[feature].astype('int64')
    # Save the fitted label encoder
    enc_dict[feature] = le

# Save the encoding dictionary; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("../data/encoding_dictionary.p", "wb") as encoder_file:
    pickle.dump(enc_dict, encoder_file)
# Restore the train data
# Rows with a known target form the training set
train = df.loc[df[target].notna()]
train.shape

# Restore the test data (rows without a target)
test = df.loc[df[target].isna()]
test.shape

# ## Safe cleaned and encoded Train and Test data file
# Save the encoded data-files
train.to_csv('../data/train_M2.csv', index=False)
test.to_csv('../data/test_M2.csv', index=False)
提取一些重要的特征进行模型二的训练,这个过程类似于模型一,同样运用的是LightGBM算法。
为了获得拟合效果更好的模型,对超参数进行了调优。
# Columns removed from both data sets before training the second model
unimportant_features = [
    'Census_IsTouchEnabled',
    'HasTpm',
    'Census_IsVirtualDevice',
    'SMode',
    'Census_IsPenCapable',
    'Platform',
    'Census_ProcessorManufacturerIdentifier',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'Census_IsPortableOperatingSystem',
    'OsVer',
    'Census_DeviceFamily',
    'index',
]

train = train.drop(unimportant_features, axis=1)
train.shape

test = test.drop(unimportant_features, axis=1)
test.shape
# rows=7.853.253

# # Model Training
# ## prepare files for training and prediction
# labels from train file, as a flat int8 array
train_labels = np.array(train[target].astype(np.int8)).reshape(-1)

# remove target and data_id for model training
train = train.drop(columns=[target, data_id])

# save data_id for submission file
test_ids = list(test[data_id])

# remove target and data_id for prediction
test = test.drop(columns=[target, data_id])
# ## Set parameters for LightGBM algorithm
# Categorical features for LighGBM parameter
# including Feature Selection
categorical_feature = [
    'OsPlatformSubRelease',
    'Census_MDC2FormFactor_new',
    'Census_FlightRing',
    'Census_PrimaryDiskTypeName',
    'Census_OSSkuName',
    'Census_OSBranch',
    'SkuEdition',
    'Census_OSArchitecture',
    'Census_OSEdition',
    'Census_GenuineStateName',
    'Processor',
    'SmartScreen',
    'Census_OSInstallTypeName',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_ChassisTypeName',
    'Census_MDC2FormFactor',
    'Census_ActivationChannel',
    'Census_PowerPlatformRoleName',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_ProcessorCoreCount',
    'Census_HasOpticalDiskDrive',
    'Census_IsSecureBootEnabled',
    'Firewall',
    'IsProtected',
    'IsSxsPassiveMode',
    'Wdft_IsGamer',
    'monitor_dims',
    'SmartScreen_AVProductsInstalled',
]

# Hyper-parameters passed to the LightGBM classifier
best_hyp = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree': 0.6027132059774907,
    'learning_rate': 0.010899921631042043,
    'min_child_samples': 145,
    'num_leaves': 156,
    'reg_alpha': 0.45996805852518485,
    'reg_lambda': 0.7336912016500579,
    'subsample_for_bin': 440000,
    'subsample': 0.5512957111882841,
}
estimators = 20000

# ## Train model
train_set = lgb.Dataset(train, label=train_labels)

# Re-create the best model and train on full training data
model = lgb.LGBMClassifier(n_estimators=estimators, n_jobs=-1, objective='binary', random_state=50, **best_hyp)
model.fit(train, train_labels, categorical_feature=categorical_feature)
# time: 2h 29min 34s

# ## save model
# Use a context manager so the file is flushed and closed even if
# pickling fails.
with open("../models/model_M2.p", "wb") as model_file:
    pickle.dump(model, model_file)
#model = pickle.load(open("../models/model_M2.p", "rb"))

# # predict on Test set
# Keep only the probability of the positive class (second column)
preds = model.predict_proba(test)[:, 1]
# time: 1h 6min 53s

# create submission
submission = pd.DataFrame({data_id: test_ids, target: preds})
submission.shape
# rows=7.853.253
submission.head()

# save submission file
submission.to_csv('../submissions/Submission_M2.csv', index=False)
# # Feature Importance
# list of all features
feature_names = list(train.columns)

# create feature importance
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})

# sort for most important
feature_importances = feature_importances.sort_values('importance', ascending=False).reset_index(drop=True)

# calculate normalized and cumulative feature importance
importance_total = feature_importances['importance'].sum()
feature_importances['normalized_importance'] = feature_importances['importance'] / importance_total
feature_importances['cumulative_importance'] = feature_importances['normalized_importance'].cumsum()
feature_importances
feature_importances.to_csv('../feature_importances/FeatureImportance_M2.csv', index=False)
最后,对模型1和模型2的预测取平均值(权重分别为0.5和0.5)
sub1.shape
sub2.shape

# ## Blending
# Work on a copy: a plain `submission = sub1` would only alias the frame, so
# writing the blended predictions would silently overwrite sub1 as well.
submission = sub1.copy()
submission.shape

# add predictions
pred = np.array(sub1['HasDetections']) + np.array(sub2['HasDetections'])
pred.shape

# divide predictions by number of predictions (equal weights of 0.5 each)
submission[target] = pred / 2
submission.shape
# rows=7.853.253
submission.to_csv('../submissions/Submission_Solution.csv', index=False)
Microsoft_Malware_Prediction_kaggle_2nd相关推荐
最新文章
- 《正则表达式必知必会》读书笔记
- java plc通讯_树莓派+西门子PLC+Aliyun
- VisDA-2020亚军技术方案分享
- 分组查询最晚一条数据(ORACLE)
- iOS--百度地图相关功能的实现
- Vue强制绑定class和style_使用字符串_对象_素组的方式实现强制绑定---vue工作笔记0007
- 搜狗输入法语音转文字的体验点
- DRBD 管理、故障处理部分
- Atitit.数据库分区的设计 attilax 总结
- Cookie、LocalStorage 与 SessionStorage的区别
- 机器视觉在工业互联网中的应用
- hdu 5336 XYZ and Drops
- ERROR】Unable to open underlying table which is differently defined or of non-MyISAM type or ...
- 存折和银行卡的区别?
- java+javascript获得两个日期之间的所有月份
- Matlab绘制区域图
- 广东省内免费试用 | 医院绩效考核病案首页系统,解决医院上报难题的智能助手
- 大白话说 Reactor 模型
- 微信SDK中含有的支付功能怎么去掉?
- 附件的文件夹超过了服务器,邮件附件太大发不了 这3种方式了解一下