"""
1.特征:linearsvm-tfidf(word)+lr-tfidf(article) / doc2vec_word
2.模型:bayes"""
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import pickle
import time
import random
from scipy.stats import randint as sp_randint
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,BaseDiscreteNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import scipy


def get_time_diff(start_time, str):
    """Print and return the wall-clock time elapsed since ``start_time``.

    Args:
        start_time: Reference timestamp, as returned by ``time.time()``.
        str: Label printed in front of the elapsed time. The name shadows the
            builtin; it is kept unchanged so existing callers keep working.

    Returns:
        The elapsed time in seconds (the printed figure is in minutes).
    """
    # Sample the clock once so the printed and the returned duration agree.
    elapsed = time.time() - start_time
    print("%s 用时:%f min" % (str, elapsed / 60))
    return elapsed


# Every backslash is escaped explicitly: the previous literal relied on
# invalid escape sequences (e.g. "\D", "\d"), which newer Python versions
# warn about. The resulting string is unchanged.
DATA_PATH = '..\\达观杯_特征\\feature_word\\Doc2vec_word\\Doc2vec_word\\data_doc2vec.pkl'
SAVE_PATH = './models/bayes/HashingVectorizer_word'


def read_data(path):
    """Load the pickled feature set and split off a validation subset.

    Args:
        path: Path of a pickle file holding ``(x_train, y_train, x_test)``.

    Returns:
        ``(X_train, X_val, Y_train, Y_val, x_test)`` where 25% of the
        training data is held out for validation.
    """
    # NOTE(review): pickle.load can execute arbitrary code stored in the
    # file; only load feature dumps from a trusted source.
    with open(path, 'rb') as f:
        x_train, y_train, x_test = pickle.load(f)
    # No random_state is passed, so the split differs from run to run.
    X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.25)
    return X_train, X_val, Y_train, Y_val, x_test


def model_search_random(X_train, Y_train, param_dist, n_iter_search=20, clf=None):
    """Fit a ``RandomizedSearchCV`` over ``param_dist``.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        param_dist: Parameter distributions handed to the search.
        n_iter_search: Number of parameter settings to sample.
        clf: Estimator to tune; a fresh ``MultinomialNB()`` when omitted.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    if clf is None:
        clf = MultinomialNB()
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search)
    random_search.fit(X_train, Y_train)
    return random_search


def model_search_grid(X_train, Y_train, param_dist, clf=None):
    """Fit a ``GridSearchCV`` over ``param_dist``.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        param_dist: Parameter grid handed to the search.
        clf: Estimator to tune; a fresh ``MultinomialNB()`` when omitted.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    if clf is None:
        clf = MultinomialNB()
    grid_search = GridSearchCV(estimator=clf, param_grid=param_dist)
    grid_search.fit(X_train, Y_train)
    return grid_search


def model_best(model_search):
    """Print and return the results of a fitted parameter search.

    Args:
        model_search: A fitted search object exposing ``cv_results_``,
            ``best_score_``, ``best_params_`` and ``best_estimator_``.

    Returns:
        ``(cv_results_, best_score_, best_params_, best_estimator_)``.
    """
    print('随机搜索-度量记录:', model_search.cv_results_)  # details of every fit
    print('随机搜索-最佳度量值:', model_search.best_score_)  # best score found
    print('随机搜索-最佳参数:', model_search.best_params_)  # dict of the best parameters
    print('随机搜索-最佳模型:', model_search.best_estimator_)  # estimator with the best score
    return (
        model_search.cv_results_,
        model_search.best_score_,
        model_search.best_params_,
        model_search.best_estimator_,
    )


def predict(X_val, Y_val, x_test, model):
    """Score ``model`` on the validation set and predict the test set.

    Args:
        X_val: Validation features.
        Y_val: Validation labels.
        x_test: Test features.
        model: Fitted classifier with ``predict`` and ``predict_proba``.

    Returns:
        ``(df_result, df_proba, score)``: a frame with columns ``id`` and
        ``class``, a frame with columns ``id`` and ``proba``, and the
        validation accuracy.
    """
    score = accuracy_score(Y_val, model.predict(X_val))

    # Predicted labels are shifted by +1 before being written out
    # (presumably back to 1-based class ids; confirm against the data prep).
    y_test = [label + 1 for label in model.predict(x_test).tolist()]
    # Derive the ids from the number of predictions instead of a hard-coded
    # test-set size, so any test set works.
    ids = range(len(y_test))
    df_result = pd.DataFrame({'id': ids, 'class': y_test})

    y_test_proba = model.predict_proba(x_test).tolist()
    df_proba = pd.DataFrame({'id': ids, 'proba': y_test_proba})
    return df_result, df_proba, score


def save(score, path_save, df_result, df_proba, model, alpha=None):
    """Persist the model, class predictions and class probabilities.

    File names embed the validation score and, when given, ``alpha``.

    Args:
        score: Validation accuracy, formatted into the file names.
        path_save: Output directory; created if it does not exist.
        df_result: Frame of predicted classes, written as CSV.
        df_proba: Frame of predicted probabilities, written as CSV.
        model: Fitted model, written with ``pickle``.
        alpha: Optional smoothing value, formatted into the file names.
    """
    import os

    if alpha is not None:
        prefix = '/bayes_%0.2f_%s' % (alpha, "data_tf_word")
    else:
        prefix = '/bayes_%s' % ("data_tf_word",)
    class_name, proba_name, model_name = (
        '%s_%s_%0.4f.csv' % (prefix, kind, score)
        for kind in ('class', 'proba', 'model')
    )

    # Writing used to fail when the target directory was missing.
    os.makedirs(path_save, exist_ok=True)

    # NOTE(review): the pickled model keeps its historical ".csv" suffix so
    # existing consumers still find the file; it is not a CSV.
    with open(path_save + model_name, 'wb') as f:
        pickle.dump(model, f)
    df_result.to_csv(path_save + class_name, index=False)
    df_proba.to_csv(path_save + proba_name, index=False)


def test():
    """Run the pipeline: load data, search alpha, fit, predict and save."""
    total_start = time.time()  # start of the whole run
    _start = total_start
    X_train, X_val, Y_train, Y_val, x_test = read_data(DATA_PATH)
    get_time_diff(_start, "读取数据")

    _start = time.time()
    params = {"alpha": np.arange(0.75, 0.86, 0.01)}
    # model_search_random(...) can be substituted here for a sampled search.
    search_model = model_search_grid(X_train, Y_train, param_dist=params, clf=MultinomialNB())
    get_time_diff(_start, "模型搜索")

    _start = time.time()
    cv_results_, best_score_, best_params_, model_best_estimator_ = model_best(search_model)
    alpha = best_params_['alpha']
    # The wrapped estimator is passed positionally: its keyword was renamed
    # from ``base_estimator`` to ``estimator`` across scikit-learn releases.
    model = CalibratedClassifierCV(MultinomialNB(alpha=alpha))
    model.fit(X_train, Y_train)
    get_time_diff(_start, "最佳模型")

    _start = time.time()
    df_res, df_prob, score = predict(X_val, Y_val, x_test, model)
    get_time_diff(_start, "预测结果")
    print("验证集分数:%0.4f" % score)

    _start = time.time()
    save(score, SAVE_PATH, df_res, df_prob, model, alpha=alpha)
    get_time_diff(_start, "保存模型")
    get_time_diff(total_start, "共用")


if __name__ == '__main__':
    test()

达观杯_构建模型(四)贝叶斯相关推荐

  1. 达观杯_构建模型(一)linearSVM

    特征:tfidf(word)+tfidf(article) """ 1.特征:tfidf(word)+tfidf(article) 2.模型:linearsvm 3.参数 ...

  2. 达观杯_构建模型(三)lightGBM

    countvector(a)+doc(a)+hash(a) """ 1.特征:countvector(a)+doc(a)+hash(a) 2.模型:lgb """ ...

  3. 达观杯_构建模型(二)逻辑回归

    特征:tfidf(word+article) """ 1.特征:tfidf(word+article) 2.模型:lr 3.参数:C=120 """ ...

  4. 决策树模型 朴素贝叶斯模型_有关决策树模型的概述

    决策树模型 朴素贝叶斯模型 Decision Trees are one of the highly interpretable models and can perform both classif ...

  5. 如何用python进行建模_用 Python 进行贝叶斯模型建模(1)

    本系列: 第1节:估计模型参数 在这一节,我们将讨论贝叶斯方法是如何思考数据的,我们怎样通过 MCMC 技术估计模型参数. 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...

  6. 判别两棵树是否相等 设计算法_机器学习算法-朴素贝叶斯

    一.概述 概率分类器 在许多分类算法应用中,特征和标签之间的关系并非是决定性的.比如说,我们想预测一个人究竟是否会在泰坦尼克号海难中生存下来,那我们可以建一棵决策树来学习我们的训练集.在训练中,其中一 ...

  7. pymc3 贝叶斯线性回归_使用PyMC3进行贝叶斯媒体混合建模,带来乐趣和收益

    pymc3 贝叶斯线性回归 Michael Johns, Zhenyu Wang, Bruno Dupont, and Luca Fiaschi 迈克尔·约翰斯,王振宇,布鲁诺·杜邦和卢卡·菲亚斯基 ...

  8. 【NLP】经典分类模型朴素贝叶斯解读

    贝叶斯分类器在早期的自然语言处理任务中有着较多实际的应用,例如大部分的垃圾邮件处理都是用的贝叶斯分类器.贝叶斯分类器的理论对于理解后续的NLP模型有很大的进益,感兴趣的小伙伴一定要好好看看,本文会详细 ...

  9. PGM:有向图模型:贝叶斯网络

    http://blog.csdn.net/pipisorry/article/details/52489270 为什么用贝叶斯网络 联合分布的显式表示 Note: n个变量的联合分布,每个x对应两个值 ...

最新文章

  1. 综述:光流估计从传统方法到深度学习
  2. Ubuntu安装yum失败-2
  3. SpringBoot注解自动扫描-底层实现
  4. Debian下Cannot set LC_CTYPE to default locale: No such file or directory解决方法
  5. K 近邻法(K-Nearest Neighbor, K-NN)
  6. 利用BI搭建零售业数据信息平台
  7. [.Net] 一句话Linq(递归查询)
  8. LibreOJ β Round #2 E. 数论只会 GCD
  9. 你想要的宏基因组-微生物组知识全在这(2021.8)
  10. 桌面计算机图标带虚线框,桌面图标出现虚线框,win10桌面图标带有虚线方框
  11. led灯光衰怎么解决_揭秘LED灯具光衰原因
  12. SCRATCH编程与科学——简单电路
  13. 智慧校园,用“智慧”培育“未来之花”
  14. python制作微信个人二维码_无聊的脚本,python生成随机二维码,手机微信自动扫脚本。...
  15. python预测股票价格论文_基于机器学习的股票分析与预测模型研究
  16. 【校招面试】紫光展锐 - 应用软件工程师面经(一面)
  17. Generative Adversarial Zero-shot Learning via Knowledge Graphs翻译
  18. winform 分页打印实例
  19. java并发编程之再学习
  20. python可视化库matplotlib_Python可视化库matplotlib(基础整理)

热门文章

  1. C++ 常用函数方法
  2. 别人总结的批处理技巧
  3. 编写和调试Shader程序(1)
  4. 创建ASP.NET WEB自定义控件——例程2
  5. 题目 1471:【蓝桥杯】【入门题】【基础练习VIP】矩形面积交
  6. 如何利用 C# 爬取Gate.io交易所的公告!
  7. threshold 二值化的实现
  8. 【CTF】实验吧 奇怪的短信
  9. 超 40W 奖金池等你来战!第二届“长沙银行杯”腾讯云启创新大赛火热来袭!...
  10. 机器学习和计算机视觉的前20个图像数据集