My own standalone approach reached a loss of about 0.78, while my senior's method got the loss down to about 0.207.

The provided data consists of, for each malware sample, an asm file (the disassembly with the PE header removed) and the corresponding PE file. The samples are also unevenly distributed across families, so appropriate class/sample weights need to be applied when training the model.
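Because of this imbalance, per-sample weights can be derived directly from the label array. A minimal sketch using scikit-learn's compute_sample_weight; the label array here is a made-up example, not competition data:

import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

# hypothetical, heavily skewed label array for the malware families
y = np.array([7] * 900 + [1] * 300 + [2] * 10 + [3] * 25)

# 'balanced' gives each class the weight n_samples / (n_classes * class_count),
# so rare families contribute as much to the total loss as common ones
sample_weight = compute_sample_weight('balanced', y)
print(sample_weight[:3], sample_weight[-3:])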

My own approach was as follows:
Extract two groups of features. The first group is: per-window entropy of the file + the file size + the count of each byte value 0-255 when the file is read as binary + occurrence counts of common opcodes + the number of strings, the maximum string length and the average string length. This group is a one-dimensional vector of length 2500.
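A condensed sketch of how this kind of static vector can be assembled. The window size, the padded length and the way opcode counts and strings are passed in are illustrative; the full version used in the competition is in the code further down:

import numpy as np
from collections import Counter
from scipy import stats

def static_vector(pe_path, strings, opcode_counts, length=2500):
    """Byte histogram + sliding-window entropy + file size + string stats, padded to a fixed length."""
    data = np.fromfile(pe_path, dtype=np.ubyte)
    hist = [int(c) for c in np.bincount(data, minlength=256)[:256]]   # counts of byte values 0-255
    window = 1024
    entropies = []
    for start in range(0, len(data), window):                         # per-window entropy
        chunk = data[start:start + window]
        probs = np.bincount(chunk, minlength=256) / len(chunk)
        entropies.append(float(stats.entropy(probs, base=2)))
    str_lens = [len(s) for s in strings] or [0]
    str_stats = [len(strings), max(str_lens), int(np.mean(str_lens))]  # count, max length, mean length
    vec = [len(data)] + hist + entropies + list(opcode_counts.values()) + str_stats
    vec = vec[:length] + [0] * max(0, length - len(vec))               # truncate / zero-pad to fixed length
    return np.array(vec, dtype=np.float32)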

The second group takes the first 300,000 lines of the file content, collects the lines that start with six spaces, filters out lines beginning with db or dd, and keeps the first 10,000, the middle 10,000 and the last 10,000 of those lines. The tokens in each line are split into common strings and mapped to integers, with each line terminated by a 0, giving a feature of length 40,000.
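A simplified sketch of that content feature. The token vocabulary, the line limits and the fixed length are placeholders; in the real pipeline the mapping dictionary is built from a word-count file:

import re

def content_feature(asm_path, vocab, max_lines=300_000, keep=10_000, length=40_000):
    """Collect opcode lines, drop db/dd data lines, keep head/middle/tail, map tokens to integer ids."""
    rows = []
    with open(asm_path, encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if i >= max_lines:
                break
            if not line.startswith("      "):            # only lines starting with six spaces
                continue
            tokens = re.findall(r'\b\w+\b', line.lower())
            if not tokens or tokens[0] in ('db', 'dd'):   # skip raw data definitions
                continue
            rows.append(tokens)
    mid = len(rows) // 2
    picked = rows[:keep] + rows[mid - keep // 2: mid + keep // 2] + rows[-keep:]
    feature = []
    for tokens in picked:
        feature += [vocab.get(t, 0) for t in tokens] + [0]   # a 0 terminates each line
    return (feature + [0] * length)[:length]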

Finally, LightGBM combined with 5-fold cross-validation is used for the prediction.
My senior's method improves on mine in two ways. First, it adds packer analysis, i.e. extracting the section names from the asm file. Second, during the 5-fold validation it brings in several other models (CatBoost, XGBoost, RandomForest, ExtraTrees) for a combined prediction, whereas I used only LightGBM.
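The section-name idea works because packed samples often carry telltale section names (for example UPX0/UPX1) instead of the usual .text/.data. A hedged sketch of pulling them out of an IDA-style asm dump; the regex, the line format and the list of known sections are my assumptions, not the senior's exact code:

import re
from collections import Counter

KNOWN_SECTIONS = ['.text', '.data', '.rdata', '.rsrc', '.reloc', 'UPX0', 'UPX1', '.aspack', '.themida']

def section_feature(asm_path):
    """Count occurrences of known section names as a small packer-analysis feature."""
    counts = Counter()
    with open(asm_path, encoding='utf-8', errors='ignore') as f:
        for line in f:
            # IDA-style dumps usually prefix every line with "<section>:<address>"
            m = re.match(r'([.\w]+):[0-9A-Fa-f]+', line)
            if m:
                counts[m.group(1)] += 1
    return [counts.get(name, 0) for name in KNOWN_SECTIONS]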

import re
import json
from math import log
from collections import Counter
from scipy import stats
import tensorflow as tf
import codecs
from tensorflow.keras import models, layers
from lightgbm import LGBMRegressor
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, \
    GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate, StratifiedKFold
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
import pickle
import os
from sklearn.utils import compute_sample_weight
import csv
words = open("words-by-frequency.txt").read().split()
wordcost = dict((k, log((i + 1) * log(len(words)))) for i, k in enumerate(words))
maxword = max(len(x) for x in words)


def do_mymlp(train_x, train_y):
    # MLP on the 2500-dimensional static feature
    tf.keras.backend.clear_session()
    print("do mymlp")
    # inputs = layers.Input(shape=max_features)
    inputs = layers.Input(shape=2500)
    x = layers.Dense(1000, activation='tanh')(inputs)
    x = layers.Dense(500, activation='sigmoid')(x)
    # x = layers.Dropout(0.2)(x)
    x = layers.Dense(250, activation='sigmoid')(x)
    x = layers.Dense(100, activation='sigmoid')(x)
    x = layers.Dense(50, activation='sigmoid')(x)
    outputs = layers.Dense(10, activation='softmax')(x)
    model = models.Model(inputs=inputs, outputs=outputs)
    model.summary()
    import datetime
    from pathlib import Path  # pathlib keeps the log path portable across operating systems
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = str(Path('./data/autograph/' + stamp))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    # binary_crossentropy on a 10-way softmax; categorical_crossentropy would be the more usual choice
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.binary_crossentropy,
                  metrics=["accuracy", "Recall", "Precision", "AUC"])
    history = model.fit(train_x, train_y, epochs=30, validation_split=0.1,
                        callbacks=[tensorboard_callback], workers=4, shuffle=True)
    return model


def cnn_mlp(train_x1, train_x2, train_y):
    # Combine a CNN branch and an MLP branch:
    # x1 is the 40000-length content feature (reshaped to 200x200), x2 is the 2500-length file feature
    tf.keras.backend.clear_session()
    # train_x1 = np.reshape(train_x1, (-1, 200, 200))
    input1 = layers.Input(shape=(200, 200))
    input2 = layers.Input(shape=2500)
    x1 = layers.Conv1D(16, kernel_size=5, name="conv_1", activation="relu")(input1)
    x1 = layers.MaxPool1D(name="maxpool1")(x1)
    x1 = layers.Conv1D(128, kernel_size=2, name="conv_2", activation="relu")(x1)
    x1 = layers.MaxPool1D(name="maxpool2")(x1)
    x1 = layers.Flatten()(x1)
    x2 = layers.Dense(1200, activation='tanh')(input2)
    c = layers.concatenate([x1, x2], axis=1)
    c = layers.Dense(600, activation='sigmoid')(c)
    c = layers.Dense(300, activation='sigmoid')(c)
    c = layers.Dense(150, activation='sigmoid')(c)
    c = layers.Dense(60, activation='sigmoid')(c)
    outputs = layers.Dense(10, activation='softmax')(c)
    model = models.Model(inputs=[input1, input2], outputs=outputs)
    model.summary()
    import datetime
    from pathlib import Path
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = str(Path('./data/autograph/' + stamp))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.categorical_crossentropy,
                  metrics=["accuracy", "Recall", "Precision", "AUC"])
    history = model.fit([train_x1, train_x2], train_y, epochs=30, validation_split=0.2,
                        callbacks=[tensorboard_callback], workers=4, shuffle=True)
    return model


def load_numpy_data(x_name, y_name):
    # Load previously saved numpy arrays
    return np.load(x_name), np.load(y_name)


def infer_spaces(s):
    # Split a string with no spaces into words
    """Uses dynamic programming to infer the location of spaces in a string
    without spaces."""

    # Find the best match for the i first characters, assuming cost has
    # been built for the i-1 first characters.
    # Returns a pair (match_cost, match_length).
    def best_match(i):
        candidates = enumerate(reversed(cost[max(0, i - maxword):i]))
        return min((c + wordcost.get(s[i - k - 1:i], 9e999), k + 1) for k, c in candidates)

    # Build the cost array.
    cost = [0]
    for i in range(1, len(s) + 1):
        c, k = best_match(i)
        cost.append(c)

    # Backtrack to recover the minimal-cost string.
    out = []
    i = len(s)
    while i > 0:
        c, k = best_match(i)
        assert c == cost[i]
        out.append(s[i - k:i])
        i -= k
    return " ".join(reversed(out))


def write_dict2json(dictdata, save_path="test_dict.json"):
    # Save a dict as a json file
    try:
        json_str = json.dumps(dictdata)
        if dictdata == {}:
            return
        with open(save_path, 'w') as json_file:
            json_file.write(json_str)
    except Exception as e:
        print("write_dict2json Error")


def read_json2dict(save_path="test_dict.json"):
    # Load a dict from a json file
    try:
        with open(save_path, encoding="utf-8") as file:
            one_dict = json.load(file)
        return one_dict
    except Exception as e:
        print("read_json2dict Error", e)
        return {}


def readf(filename, dict_collect, type):
    # Read an asm file and build its features
    token_pattern = r'\b\w+\b'
    total_line_num = 0
    op_line_num = 0
    string_pattern1 = re.compile(r'["](.*?)["]', re.S)
    string_pattern2 = re.compile(r'[\'](.*?)[\']', re.S)
    str_collect = []
    cmdfeature = []
    file_md5 = filename.split('\\')[-1][:-4]
    pe_file_path = "D:\\pythonWorkspace\\data_fountaion\\" + type + "\\" + type + "\\pe" + "\\" + file_md5
    filesize = os.path.getsize(filename) // 1000
    print(filesize)
    for line in open(filename, encoding='utf-8', errors='ignore'):
        total_line_num = total_line_num + 1
        if total_line_num % 10000 == 0:
            print(total_line_num)
        if total_line_num >= 1000000:
            break
        if line.startswith("        "):
            op_line_num = op_line_num + 1
            line = line.lower()
            if '"' in line:
                string_result = re.findall(string_pattern1, line)
                if string_result:
                    str_collect = str_collect + string_result
            else:
                if "'" in line:
                    string_result = re.findall(string_pattern2, line)
                    if string_result:
                        str_collect = str_collect + string_result
            cmdfeature = cmdfeature + generate_command_fature(line, dict_collect)
    cmdfeature_length = len(cmdfeature)
    if cmdfeature_length > 250000:
        # keep the head, middle and tail of an overly long content feature
        fixed_feature = cmdfeature[:62500] + \
            cmdfeature[cmdfeature_length // 2 - 62500:cmdfeature_length // 2 + 62500] + \
            cmdfeature[-62500:]
    else:
        fixed_feature = cmdfeature + [0] * (250000 - cmdfeature_length)
    str_feature = np.array(generate_str_feature(str_collect))
    byte_feature = generate_byte_feature(pe_file_path)
    count_feature = np.array(turn_op_count2feature(dict_collect))
    file_feature = np.array([filesize, op_line_num])
    static_feature = np.hstack((file_feature, count_feature, str_feature, byte_feature))
    content_feature = np.array(fixed_feature)
    print(static_feature.shape)
    return static_feature, content_feature


def check_asm(filename, word_dict={}):
    # Check whether an asm file contains characters that cannot be decoded
    file = open(filename)
    flag = 0
    while True:
        try:
            line = file.readline()
            print(line)
            if not line:  # stop at end of file
                break
        except Exception as e:
            line = file.readlines()


def get_data_path(dir="D:\\pythonWorkspace\\data_fountaion\\train\\train\\asm"):
    # Return a list with the full path of every asm file under the given directory
    g = os.walk(dir)
    result_ls = []
    for path, d, filelist in g:
        for filename in filelist:
            if filename.endswith('.asm'):
                final_path = os.path.join(path, filename)
                result_ls.append(final_path)
    return result_ls


def dict_filter(dictdata={}):
    # Drop words that appear only once from the dict
    for each_key in list(dictdata.keys()):
        if dictdata.get(each_key) == 1:
            dictdata.pop(each_key, None)


def generate_dict_from_ls(lsdata=[]):
    # Build a word-to-index mapping dict from a list
    count = 1
    result_dict = {}
    for each in lsdata:
        result_dict.update({each: count})
        count = count + 1
    return result_dict


def generate_dict_from_count(file_name='wordcount.json'):
    # Build the mapping word list from a word-count file
    dictdata = read_json2dict(file_name)
    final_word_ls = []
    for each_word in dictdata.keys():
        if dictdata.get(each_word) >= 500 and len(each_word) > 2:
            final_word_ls.append(each_word)
    return final_word_ls


def mix_list(lsa=[], lsb=[]):
    # Merge two lists, dropping duplicated elements
    final_ls = lsa
    for each in lsb:
        if each not in final_ls:
            final_ls.append(each)
    return final_ls


def Entropy(labels, base=2):
    # Empirical probability distribution
    probs = pd.Series(labels).value_counts() / len(labels)
    # Entropy with the given base
    en = stats.entropy(probs, base=base)
    return en


def generate_byte_feature(file_name):
    # Byte-histogram plus sliding-window entropy feature, of length feature_length
    feature_length = 50000
    with open(file_name, mode='rb') as f:
        file_rb = np.fromfile(f, dtype=np.ubyte)
    byte_laymap = Counter(file_rb)
    file_length = len(file_rb)
    byte_layout = []  # counts of byte values 0-255
    for byte in range(0, 256):
        byte_layout.append(byte_laymap[byte])
    current_loc = 0  # sliding window for the entropy
    step = 1024  # bytes the window moves each step
    entro_list = []
    while current_loc <= file_length:
        if current_loc + step > file_length:
            end = file_length
        else:
            end = current_loc + step
        entro_list.append(Entropy(file_rb[current_loc:end]))
        current_loc = current_loc + step
    final_result = byte_layout + entro_list
    result_length = len(final_result)
    print(result_length)
    if len(final_result) < feature_length:
        final_result = np.array(final_result)
        final_result = list(np.pad(final_result, (0, feature_length - len(final_result)),
                                   'constant', constant_values=(0, 0)))
        return final_result[0:feature_length]
    else:
        # keep the head, middle and tail quarters of an overly long result
        return final_result[0:result_length // 4] + \
            final_result[result_length // 2 - result_length // 4:result_length // 2 + result_length // 4] + \
            final_result[-(result_length // 4):]


def generate_str_feature(str_ls):
    # String features from a list of strings: per-character counts,
    # plus the number of strings, the average length and the maximum length
    charsum = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    specialsum = '!?@#$%~^&*()_+=[];:.'
    charactersum = charsum + specialsum
    chara_count = []
    max_len = -1
    str_num = max(len(str_ls), 1)
    total_len = 0
    fullcontent = ""
    for each_str in str_ls:
        current_length = len(each_str)
        fullcontent = fullcontent + each_str
        total_len = total_len + current_length
        if current_length > max_len:
            max_len = current_length
    avelen = total_len // str_num
    for each in list(charactersum):
        current_count = fullcontent.count(each)
        chara_count.append(current_count)
    # the returned feature has length 85
    return [str_num, avelen, max_len] + chara_count


def generate_command_fature(cmdline="        db 0E2h ; ?     db 0FCh ; ?     db  3Fh ; ?", dict_collect={}):
    token_pattern = r'\b\w+\b'
    # dict_collect holds two dicts: one counts common opcodes,
    # the other maps tokens to integer ids
    op_count_dict = dict_collect.get('op_count_dict')
    feature_dict = dict_collect.get('feature_dict')
    word_ls = re.findall(token_pattern, cmdline)
    cmdlinefeature = []
    if word_ls == []:
        return []
    if word_ls[0] == 'dd' or word_ls[0] == 'db':
        # these two appear far too often, so they are only counted, not vectorised
        for each in word_ls:
            if each in op_count_dict.keys():
                op_count_dict.update({each: 1 + op_count_dict.get(each)})
        return []
    else:
        for each in word_ls:
            if each in feature_dict.keys():
                cmdlinefeature.append(feature_dict.get(each))
            if each in op_count_dict.keys():
                op_count_dict.update({each: 1 + op_count_dict.get(each)})
        cmdlinefeature.append(0)
        return cmdlinefeature


def turn_op_count2feature(dict_collect={}):
    # Turn the opcode-count dict into a feature vector
    feature_ls = []
    op_count_dict = dict_collect.get('op_count_dict')
    for each_word in op_count_dict.keys():
        feature_ls.append(op_count_dict.get(each_word))
    # currently of length 68
    return feature_ls


def generate_op_count_dict():
    # Build a dict that counts occurrences of common opcodes/tokens
    op_list = ["add", "ax", "arg", "al", "align", "bx", "byte", "call", "cmp", "dword", "ds", "dq", "dw", "db", "eax",
               "ebx", "ebp", "ecx", "edx", "edi", "esi", "esp", "extrn", "fnstenv", "fnstcw", "fst", "fxc", "fld",
               "fc", "fs",
               "jmp", "jb", "jnz", "ja", "jz", "jl", "inc", "lea", "loc", "mov", "near", "not", "offset", "push", "pop",
               "ptr",
               "psr", "psu", "retn", "rax", "rbx", "rsi", "rsp", "rdi", "rbp", "rcx", "rdx", "rva", "sleep", "stmxcsr",
               "second", "sub", "short",
               "test", "unk", "var", "wait", "xor"]
    finaldict = {}
    for each_word in op_list:
        finaldict.update({each_word: 0})
    return finaldict


def get_one_hot_from_num(labelint):
    # Given a digit 0-9, return its one-hot vector
    onehot_label = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    onehot_label[int(labelint)] = 1
    return onehot_label


def get_nearlyonehot_from_num(labelint):
    # Given a digit 0-9, return a softened one-hot vector (non-target entries kept non-zero)
    onehot_label = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.1, 0.1, 0.1]
    onehot_label[int(labelint)] = 1
    return onehot_label


def do_mycnn(train_x, train_y):
    tf.keras.backend.clear_session()
    print("do mycnn")
    inputs = layers.Input(shape=(200, 200))
    # x = Self_Attention(128)(inputs)
    x = layers.Conv1D(16, kernel_size=5, name="conv_1", activation="relu")(inputs)
    x = layers.MaxPool1D(name="maxpool1")(x)
    x = layers.Conv1D(128, kernel_size=2, name="conv_2", activation="relu")(x)
    x = layers.MaxPool1D(name="maxpool2")(x)
    x = layers.Flatten()(x)
    outputs = layers.Dense(10, activation='softmax')(x)
    model = models.Model(inputs=inputs, outputs=outputs)
    model.summary()
    import datetime
    from pathlib import Path
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = str(Path('./data/autograph/' + stamp))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.categorical_crossentropy,
                  metrics=["accuracy", "Recall", "Precision", "AUC"])
    history = model.fit(train_x, train_y, epochs=20, validation_split=0.2,
                        callbacks=[tensorboard_callback], workers=4, shuffle=True)
    return model


def generate_label_np(type='train'):
    # Build the labels in the order the files appear in the directory and save them
    data_dir = "D:\\pythonWorkspace\\data_fountaion\\" + type + "\\" + type + "\\asm"
    all_file_path = get_data_path(data_dir)
    label_dict = read_json2dict('label_dict.json')
    label_ls = []
    for each_file_path in all_file_path:
        onehot_label = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        file_md5 = each_file_path.split('\\')[-1][:-4]
        file_type = label_dict.get(file_md5)
        if file_type is not None:
            # print(each_file_path, file_md5, file_type)
            onehot_label[int(file_type)] = 1.0
            # label_ls.append(onehot_label)
            label_ls.append(file_type)
        else:
            print(file_md5)
            print("None")
    print(label_ls)
    label_array = np.array(label_ls)
    print(label_array.shape)
    np.save(type + 'numlabel', label_array)


def check_result(result_np, real_label, model):
    # Post-process the model predictions
    count = 0
    correct_count = 0
    real_label = list(real_label)
    for each_result in result_np:
        each_result = list(each_result)
        max_loc = each_result.index(max(each_result))
        guess_label = get_one_hot_from_num(max_loc)
        if get_one_hot_from_num(max_loc) == list(real_label[count]):
            correct_count = correct_count + 1
        else:
            print(max_loc, "----", real_label[count])
        count = count + 1
    print("correct", correct_count)
    if correct_count >= 5800:
        # save models with high accuracy
        model.save('tf_model' + str(correct_count), save_format="tf")


def generate_answer_data(predict_result):
    # Build the rows of the submission file
    type = 'test'
    label_dict = read_json2dict('label_dict.json')
    predict_result = list(predict_result)
    count = 0
    answer_count = 0
    answer_data = []
    testfile = get_data_path("D:\\pythonWorkspace\\data_fountaion\\" + type + "\\" + type + "\\asm")
    for each_file in testfile:
        filesize = os.path.getsize(each_file) // 1000
        file_md5 = each_file.split('\\')[-1][:-4]
        print(file_md5)
        max_loc = list(predict_result[count]).index(max(list(predict_result[count])))
        label_one_hot = tuple([file_md5] + list(predict_result[count]))
        if file_md5 in label_dict.keys():
            # if the answer is already known, copy it directly
            print(file_md5)
            answer_count = answer_count + 1
            true_answer = label_dict.get(file_md5)
            print("answer:", true_answer, max_loc, "\n")
            max_loc = true_answer
            label_one_hot = tuple([file_md5] + get_one_hot_from_num(max_loc))
        answer_data.append(label_one_hot)
        count = count + 1
    print(answer_count)
    return answer_data


def turn_prob_to_label(prob_data):
    # Convert predicted probabilities into one-dimensional class labels
    prob_data = list(prob_data)
    result = []
    for each_data in prob_data:
        max_loc = list(each_data).index(max(list(each_data)))
        result.append(str(max_loc))
    return result


def lightgbm(x, y):
    class_weight = {'0': 5, '1': 0.783, '2': 29.23, '3': 20.242, '4': 5, '5': 5,
                    '6': 0.753, '7': 0.4347, '8': 1.01, '9': 0.502}
    model = LGBMRegressor(max_depth=25, objective='multiclass', num_class=10, class_weight=class_weight)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.3, random_state=1)
    model.fit(x_train, y_train, early_stopping_rounds=10,
              eval_set=[(x_train, y_train), (x_test_valid, y_test_valid)],
              eval_metric="multi_error", verbose=True)
    # model.fit(x, y, early_stopping_rounds=20, eval_set=[(x, y)],
    #           eval_metric="multi_logloss", verbose=True)
    # make prediction
    preds = model.predict(x)
    test_result = np.array(turn_prob_to_label(preds))
    print(test_result[0:10])
    print(y[0:10])
    test_accuracy = accuracy_score(test_result, y)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    pickle.dump(model, open("lgboostmodel_250.pickle.dat", "wb"))
    # loaded_model = pickle.load(open("pima.pickle.dat", "rb"))


def write_csv(answer_data=[(0, 0)]):
    # Write the submission csv file
    data_head = [("filename", "family_0", "family_1", "family_2", "family_3", "family_4", "family_5", "family_6",
                  "family_7", "family_8", "family_9"), ]
    data = data_head + answer_data
    f = codecs.open('submit928.csv', 'w', 'gbk')
    writer = csv.writer(f)
    for i in data:
        writer.writerow(i)
    f.close()


def balance_weight(ydata):
    # Given the label array, derive a sample_weight value for each sample
    class_weight = {0: 1.364, 1: 0.783, 2: 29.23, 3: 2.242, 4: 1.82, 5: 3.23,
                    6: 0.753, 7: 0.4347, 8: 1.01, 9: 0.502}
    # look up the per-sample weight for each label
    return np.array([class_weight[int(label)] for label in ydata])


def xgboost(x, y):
    class_weight = {'0': 1.364, '1': 0.783, '2': 29.23, '3': 2.242, '4': 1.82, '5': 3.23,
                    '6': 0.753, '7': 0.4347, '8': 1.01, '9': 0.502}
    model = XGBClassifier(max_depth=30, objective='multi:softprob', num_class=10)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.002, random_state=1)
    # sample_weight must have one entry per training row
    model.fit(x_train, y_train, early_stopping_rounds=10,
              eval_set=[(x_train, y_train), (x_test_valid, y_test_valid)],
              eval_metric="mlogloss", verbose=True,
              sample_weight=compute_sample_weight('balanced', y_train))
    # make prediction
    preds = model.predict(x_test_valid)
    print(preds[0:10])
    prob_pre = model.predict_proba(x_test_valid)
    print(prob_pre[0:10])
    test_accuracy = accuracy_score(y_test_valid, preds)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    if test_accuracy > 0.999:
        pickle.dump(model, open("xgboostmodel2.pickle.dat", "wb"))
    # loaded_model = pickle.load(open("pima.pickle.dat", "rb"))


def lgb_with_kfold(x, y):
    # LightGBM combined with stratified k-fold
    class_weight = {'0': 1.364, '1': 0.783, '2': 29.23, '3': 12.242, '4': 1.82, '5': 3.23,
                    '6': 0.753, '7': 0.4347, '8': 1.01, '9': 0.502}
    skf = StratifiedKFold(n_splits=5)
    model = LGBMRegressor(max_depth=30, objective='multiclass', num_class=10, class_weight=class_weight)
    count = 1
    for train_index, test_index in skf.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train, early_stopping_rounds=10,
                  eval_set=[(x_train, y_train), (x_test, y_test)],
                  eval_metric="multi_logloss", verbose=True)
        prob_pre = model.predict(x)
        print(prob_pre[0:10])
        test_accuracy = accuracy_score(turn_prob_to_label(prob_pre), y)
        print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))


def xgb_with_kfold(x, y):
    # XGBoost combined with stratified k-fold
    skf = StratifiedKFold(n_splits=5)
    model = XGBClassifier(max_depth=25, objective='multi:softprob', num_class=10)
    for train_index, test_index in skf.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train, early_stopping_rounds=10,
                  eval_set=[(x_train, y_train), (x_test, y_test)],
                  eval_metric="mlogloss", verbose=True)
        preds = model.predict(x)
        print(preds[0:10])
        prob_pre = model.predict_proba(x)
        print(prob_pre[0:10])
        test_accuracy = accuracy_score(preds, y)
        print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))


def turn_feature_2_length(filename):
    # Trim or pad every row of a saved feature matrix to a fixed length
    npfeature = list(np.load(filename))
    final_feature = []
    linecount = 0
    max_featur_length = 100000
    for feature_row in npfeature:
        linecount = linecount + 1
        print(linecount)
        feature_row = list(feature_row)
        count = len(feature_row) - 1
        while feature_row[count] == 0:
            count = count - 1
        feature_row = feature_row[:count + 1]
        current_length = len(feature_row)
        if current_length > max_featur_length:
            # keep the head, middle and tail quarters
            feature_row = feature_row[:max_featur_length // 4] + \
                feature_row[max_featur_length // 2 - max_featur_length // 4:
                            max_featur_length // 2 + max_featur_length // 4] + \
                feature_row[-(max_featur_length // 4):]
        if current_length < max_featur_length:
            feature_row = feature_row + [0] * (max_featur_length - current_length)
        final_feature.append(feature_row)
    final_feature = np.array(final_feature)
    np.save(str(max_featur_length) + filename, final_feature)
def get_onehot(labels):
    # One-hot encode an array of integer labels
    n_samples = len(labels)
    n_classes = 10
    onehot_labels = np.zeros((n_samples, n_classes))
    onehot_labels[np.arange(n_samples), labels] = 1
    return onehot_labels


def calc_log_loss(y_true, y_pred):
    y_true = list(y_true)
    y_true = get_onehot(y_true)
    return log_loss(y_true, y_pred)


def train_five_model(train_X, train_Y, test_X):
    # Stack five models and average their fold-wise test predictions
    gbm = lgb.LGBMClassifier(n_jobs=-1, objective='multiclass', metric='multi_logloss')
    xgbc = XGBClassifier(n_jobs=-1, objective='multi:softprob')
    cat = CatBoostClassifier(verbose=0, loss_function='MultiClass')
    rf = RandomForestClassifier(n_jobs=-1)
    ext = ExtraTreesClassifier(n_jobs=-1)
    stack_estimators = [('cat', cat), ('xgbc', xgbc), ('gbm', gbm), ('rf', rf), ('ext', ext)]
    sta = StackingClassifier(estimators=stack_estimators, final_estimator=SVC(probability=True), cv=5,
                             n_jobs=-1, verbose=1)
    skf = StratifiedKFold(n_splits=5)
    total_res = []
    total_log = []
    for train_index, test_index in skf.split(train_X, train_Y):
        X_train, X_test = train_X[train_index], train_X[test_index]
        y_train, y_test = train_Y[train_index], train_Y[test_index]
        sample_weight = compute_sample_weight('balanced', y_train)
        sta.fit(X_train, y_train, sample_weight=sample_weight)
        res = sta.predict_proba(X_test)
        fold_log_loss = calc_log_loss(y_test, res)
        print("log_loss:%f" % fold_log_loss)
        total_log.append(fold_log_loss)
        res = sta.predict_proba(test_X)
        total_res.append(res)
    print(np.mean(total_log))
    avg_proba = []
    # print(total_res)
    for line in range(total_res[0].shape[0]):
        # average the five folds' probabilities for this test sample
        temp_proba = np.zeros((5, 10))
        for i in range(len(total_res)):
            temp = total_res[i][line]
            temp_proba[i] = temp
        # print(temp_proba)
        # print(temp_proba.shape)
        x = np.mean(temp_proba, axis=0)
        # print(x)
        avg_proba.append(x)
    write_csv(generate_answer_data(avg_proba))
    return avg_proba


def generate_file_feature(type, dict_collect={}):
    # Build and save the features: a static feature vector made of several small parts,
    # plus the long content feature built from the opcodes
    data_dir = "D:\\pythonWorkspace\\data_fountaion\\" + type + "\\" + type + "\\asm"
    all_file_path = get_data_path(data_dir)
    if os.path.isfile("static_feature_2" + type + ".npy"):
        # resume from previously saved partial results
        static_feature_collect = np.load("static_feature_2" + type + ".npy")
        content_feature_collect = np.load("content_feature_2" + type + ".npy")
        count = static_feature_collect.shape[0]
        all_file_path = all_file_path[count:]
        print("continue:", count)
    else:
        count = 0
    for each_file_path in all_file_path:
        count = count + 1
        print(each_file_path, count)
        static_feature, content_feature = readf(each_file_path, dict_collect, type)
        if count == 1:
            static_feature_collect = static_feature
            content_feature_collect = content_feature
        else:
            static_feature_collect = np.vstack((static_feature_collect, static_feature))
            content_feature_collect = np.vstack((content_feature_collect, content_feature))
        # reset the opcode counters after every file
        dict_collect.update({'op_count_dict': generate_op_count_dict()})
    print(static_feature_collect.shape)
    print(content_feature_collect.shape)
    np.save("static_feature_2" + type, static_feature_collect)
    np.save("content_feature_2" + type, content_feature_collect)


# module-level copy of the common opcode/token list
op_list = ["add", "ax", "arg", "al", "align", "bx", "byte", "call", "cmp", "dword", "ds", "dq", "dw", "db", "eax",
           "ebx", "ebp", "ecx", "edx", "edi", "esi", "esp", "extrn", "fnstenv", "fnstcw", "fst", "fxc", "fld",
           "fc", "fs",
           "jmp", "jb", "jnz", "ja", "jz", "jl", "inc", "lea", "loc", "mov", "near", "not", "offset", "push", "pop",
           "ptr",
           "psr", "psu", "retn", "rax", "rbx", "rsi", "rsp", "rdi", "rbp", "rcx", "rdx", "rva", "sleep", "stmxcsr",
           "second", "sub", "short",
           "test", "unk", "var", "wait", "xor"]
