前言

当时并没有发现什么问题。后来老师指出，这个检测的目的不够明确：有的新闻并不是虚假的，只是措辞过于偏激，就很容易被误判。因此可以改为对新闻的性质进行判断，例如是否危害社会稳定、是否传播不良信息。

一、数据处理

import pandas as pd
import numpy as np
import re
from gensim.models import word2vec
import jieba
import os
import pickle# 过滤分词列表中的停用词
def stopwords_filter(stopwords_list, seg_list):
    """Return the words of ``seg_list`` that are not stop words.

    Args:
        stopwords_list: Collection of stop words. Either an iterable of
            strings or a NumPy array of any shape (``sentence_seg`` passes the
            2-D ``.values`` array of a DataFrame).
        seg_list: Iterable of segmented words; a generator is accepted.

    Returns:
        A list holding the words of ``seg_list`` that are not stop words, in
        their original order.
    """
    if hasattr(stopwords_list, "ravel"):
        # NumPy array: flatten it so every cell becomes one plain Python value.
        candidates = stopwords_list.ravel().tolist()
    else:
        candidates = stopwords_list
    try:
        # Hash-based lookup: constant time per word instead of a full scan.
        stopwords = frozenset(candidates)
    except TypeError:
        # Unhashable entries: fall back to the container's own membership test.
        stopwords = stopwords_list
    return [word for word in seg_list if word not in stopwords]


# Segment a Chinese paragraph into a word list (stop words filtered out)
def sentence_seg(sentence):
    """Segment a Chinese text into words with stop words removed.

    Every run of characters outside the range U+4E00..U+9FA5 is deleted
    before the remaining text is segmented with ``jieba.cut``; the resulting
    words are then passed through ``stopwords_filter``.

    Args:
        sentence: The text to segment (a string).

    Returns:
        Whatever ``stopwords_filter`` returns for the segmented words.
    """
    pattern = re.compile("[^\u4e00-\u9fa5]+")
    sentence = pattern.sub('', sentence)
    return stopwords_filter(_load_stopwords(), jieba.cut(sentence))


_STOPWORDS_PATH = 'dataset/cn_stopwords.txt'
_stopwords_cache = {}


def _load_stopwords(path=_STOPWORDS_PATH):
    """Read the stop-word table once per path and reuse it on later calls."""
    if path not in _stopwords_cache:
        # Same value the file was always read into: the DataFrame's 2-D array.
        _stopwords_cache[path] = pd.read_table(path, header=None).iloc[:, :].values
    return _stopwords_cache[path]


# Convert a news CSV into feature vectors and labels
def csv2vec(csv_path, is_train=True):
    """Convert a news CSV file into feature vectors and labels.

    The CSV must contain the columns ``Unnamed: 0`` (dropped), ``content`` and
    ``comment_all``, plus ``label`` when ``is_train`` is true. Both text
    columns are cleaned and segmented with ``sentence_seg``. Each row's
    feature is the mean word vector of its content followed by the mean word
    vector of its comments.

    Args:
        csv_path: Path of the CSV file to read.
        is_train: When true, a Word2Vec model is trained on the segmented text
            and pickled to ``model/wv.model``; otherwise that file is loaded.

    Returns:
        A dict with ``'X'`` (array built from one feature row per CSV row,
        each row holding 2 * 50 values) and ``'y'`` (the ``label`` column as
        an array when ``is_train`` is true, otherwise an empty array).
    """
    df = pd.read_csv(csv_path)

    # Data cleaning.
    df.drop(axis=1, inplace=True, columns=["Unnamed: 0"])  # drop the index column
    df = df.replace(re.compile(r'\[.*?\]'), " ", regex=True)  # remove [xxx]
    df = df.replace(re.compile(r'@.*?:'), " ", regex=True)  # remove "@xxx:"
    # regex=True replaces the text wherever it occurs inside a cell. With
    # regex=False only cells equal to the whole string were replaced, so tabs
    # and the "网页链接" marker embedded in a text were never removed.
    df = df.replace("\t", " ", regex=True)
    df = df.replace("网页链接", " ", regex=True)
    df['content'] = df['content'].str.strip()  # trim leading/trailing whitespace
    df = df.fillna(value=' ')  # fill missing values

    # Segment the news content and the comments.
    df['content'] = df['content'].apply(lambda x: ' '.join(sentence_seg(x)))
    df['comment_all'] = df['comment_all'].apply(lambda x: ' '.join(sentence_seg(x)))
    df = df.fillna(value=' ')

    # split() rather than split(' '): an empty text must give an empty word
    # list, not [''], otherwise the empty string is learned as a word.
    content_seglist = [x.split() for x in df['content']]
    comment_seglist = [x.split() for x in df['comment_all']]

    # Build (or load) the word-vector model.
    wv_size = 50
    if is_train:
        wv_model = word2vec.Word2Vec(content_seglist + comment_seglist, vector_size=wv_size, min_count=1)
        os.makedirs('model', exist_ok=True)  # the save below needs the directory
        with open('model/wv.model', 'wb') as outfile:
            pickle.dump(wv_model, outfile)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load a model file
        # produced by a trusted training run.
        with open('model/wv.model', 'rb') as infile:
            wv_model = pickle.load(infile)

    # One feature row per news item: [mean content vector, mean comment vector].
    keyed_vectors = wv_model.wv
    feature = []
    for content_words, comment_words in zip(content_seglist, comment_seglist):
        feature_vec = np.concatenate((
            _mean_word_vector(content_words, keyed_vectors, wv_size),
            _mean_word_vector(comment_words, keyed_vectors, wv_size),
        ))
        feature.append(feature_vec.tolist())

    # Labels are only returned for the training set.
    label = list(df['label']) if is_train else []
    return {'X': np.array(feature), 'y': np.array(label)}


def _mean_word_vector(words, keyed_vectors, size):
    """Average the vectors of the in-vocabulary words; zeros if there are none."""
    total = np.zeros(shape=[size], dtype='float32')
    count = 0
    for word in words:
        if keyed_vectors.has_index_for(word):
            total += keyed_vectors[word]
            count += 1
    return total / count if count != 0 else total


# Preprocessing
# Convert both CSV files. The training set goes first because that call saves
# the word-vector model which the test-set call loads.
train_set = csv2vec('dataset/train.csv', is_train=True)
test_set = csv2vec('dataset/test.csv', is_train=False)

# Save the results, one pickle file per data set.
for pkl_path, dataset in (('dataset/train.pkl', train_set), ('dataset/test.pkl', test_set)):
    with open(pkl_path, 'wb') as file:
        pickle.dump(dataset, file)

二、模型训练

import pickle
import joblib
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# MLP classifier with one hidden layer of 100 units.
clf = MLPClassifier(max_iter=500, hidden_layer_sizes=(100,), solver='adam', alpha=0.0002)

# Load the pickled training features; the file is closed before training starts.
with open('dataset/train.pkl', 'rb') as file:
    train_set = pickle.load(file)

# Standardize the features (the MLP is sensitive to feature scale) and save
# the fitted scaler so that prediction can apply the same transformation.
scaler = StandardScaler()
train_data = scaler.fit_transform(train_set['X'])
joblib.dump(scaler, 'model/scaler.model')

# Train and save the model.
train_label = train_set['y']
clf.fit(train_data, train_label)
joblib.dump(clf, 'model/mlp.model')

三、调参

import pickle
import joblib
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

clf = MLPClassifier()

# Search range for the MLP hyper-parameters.
mlp_clf_tuned_parameters = {
    "hidden_layer_sizes": [(100,), (100, 30), (100, 30, 30)],
    "solver": ['adam', 'sgd', 'lbfgs'],
    "max_iter": [20],  # kept small so the best parameters are found quickly
    "alpha": np.linspace(0.0001, 0.0005, 5),
}
opt = GridSearchCV(clf, mlp_clf_tuned_parameters)  # automatic parameter search

# Load the pickled training features; the file is closed before the search runs.
with open('dataset/train.pkl', 'rb') as file:
    train_set = pickle.load(file)

# Standardize the features (the MLP is sensitive to feature scale).
scaler = StandardScaler()
train_data = scaler.fit_transform(train_set['X'])
train_label = train_set['y']

opt.fit(train_data, train_label)
print(opt.get_params().keys())
print(opt.best_params_)

四、模型预测

import joblib
import pickle
from sklearn.neural_network import MLPClassifier#0为真实新闻，1为虚假新闻，-1为评论
# Load the trained classifier and the scaler fitted on the training data.
clf = joblib.load('model/mlp.model')
scaler = joblib.load('model/scaler.model')

# Load the pickled test features; the file is closed before predicting.
with open('dataset/test.pkl', 'rb') as infile:
    test_set = pickle.load(infile)

# Apply the training-time scaling, then predict one label per row.
pred = clf.predict(scaler.transform(test_set['X']))

# Write the predictions, one per line.
with open('mlp_pred.txt', 'w') as outfile:
    outfile.writelines(str(x) + '\n' for x in pred)

基于word2vec的虚假新闻检测系统相关推荐

《基于区块链技术的虚假新闻检测方法》文献阅读笔记+总结
<基于区块链技术的虚假新闻检测方法>文献阅读笔记+总结关键词:区块链.智能合约.虚假新闻.新闻网站.博弈论来源题目时间作者中国学术期刊网络版 <基于区块链技术的虚假新闻检 ...
SIGIR 2021 | 基于用户偏好感知的虚假新闻检测
©PaperWeekly 原创 · 作者 | 金金单位 | 阿里巴巴研究实习生研究方向 | 推荐系统简介近年来,虚假信息和假新闻对个人和社会造成了不利影响,引起了对假新闻检测的广泛关注.大多数 ...
独家 | 基于NLP的COVID-19虚假新闻检测（附代码）
作者:Susan Li 翻译:杨毅远校对:吴金笛本文长度为4400字,建议阅读8分钟本文为大家介绍了基于自然语言处理的COVID-19虚假新闻检测方法以及可视化方法,并结合真实的新闻数据集与完整 ...
基于NLP的COVID-19虚假新闻检测
基于NLP的COVID-19虚假新闻检测摘要全文约2400字,建议阅读时间7分钟.本文为大家介绍了基于自然语言处理的COVID-19虚假新闻检测方法以及可视化方法,并结合真实的新闻数据集以及完整的 ...
基于元路径的利用多级社会背景信息的虚假新闻检测
原文 <Meta-Path-based Fake News Detection Leveraging Multi-level Social Context Information> 一论 ...
【NLP】万字长文带你解读『虚假新闻检测』最新进展
NewBeeNLP原创出品公众号专栏作者 @byn blog | https://blog.csdn.net/byn12345 互联网时代,假新闻铺天盖地,而且极具迷惑性,因此假新闻检测任务对逻辑的 ...
虚假新闻检测的论文阅读笔记——sigir2021：User Preference-aware Fake News Detection
文章目录 1.虚假新闻检测的相关简介 2.本篇论文引言 3.模型介绍 3.1.内生偏好编码器 3.2.外生内容编码器 3.3.二者信息融合 4.实验 4.1.各模型的实验结果 4.2.消融实验 5.结 ...
虚假新闻检测论文调研
虚假新闻检测论文调研 Evidence Inference Networks for Interpretable Claim Verification 基本信息发表刊物和年份:2021 AAAI 摘 ...
虚假新闻检测挑战赛落幕，探寻获奖团队背后的故事
2019年11月16日,智源论坛:虚假新闻检测暨2019虚假新闻检测挑战赛颁奖仪式召开.本次挑战赛由北京智源人工智能研究院和中国科学院计算技术研究所共同举办,旨在促进互联网虚假新闻检测技术的发展,营造 ...

基于word2vec的虚假新闻检测系统

前言

二、模型训练

三、调参

四、模型预测

基于word2vec的虚假新闻检测系统相关推荐

最新文章

热门文章