1.训练前准备

指定训练和预测使用的 GPU

from torch.utils.data import DataLoader,TensorDataset
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import torch

# Device used for training; falls back to the CPU when CUDA is unavailable.
device0 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
# Device used for evaluation / prediction. It is the same GPU as training
# here; change the index to run evaluation on a different card.
device1 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")

读取数据、分析数据


# Load the tab-separated corpus: column 0 is the sentence, column 1 its label.
data = pd.read_table('./data/train.txt', header=None)
data.columns = ['text', 'label']

# Plain Python lists of the two columns (``text`` feeds the tokenizer later).
text = list(data['text'])
label = list(data['label'])

# Class distribution: number of rows per distinct label value.
df2 = data['label'].value_counts()
print(df2)

构造训练数据

class SentimentDataset(Dataset):
    """Map-style dataset over a DataFrame of tokenised sentences.

    The frame must provide the columns ``text``, ``label``, ``input_ids``
    and ``attention_mask``. Each item is a dict with exactly those keys.
    """

    def __init__(self, df):
        # Samplers hand out positions 0..len-1, while ``__getitem__`` looks
        # rows up by index label. Normalising the index here keeps the two in
        # agreement even when ``df`` is a sampled/filtered slice whose index
        # was not reset. ``reset_index`` returns a new frame, so the caller's
        # frame is left untouched.
        self.dataset = df.reset_index(drop=True)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict of its four fields."""
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        input_ids = self.dataset.loc[idx, "input_ids"]
        attention_mask = self.dataset.loc[idx, "attention_mask"]
        sample = {
            "text": text,
            "label": label,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return sample


print('text2token')
from transformers import AutoTokenizer, AutoModel
# added_token=['##char##']
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese",additional_special_tokens=added_token)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")


def text2token(text, tokenizer, max_length=100):
    """Tokenise ``text`` and return ``(input_ids, attention_mask)`` as nested lists.

    Every sequence is padded or truncated to exactly ``max_length`` tokens,
    so both returned lists hold one ``max_length``-long list per input.
    """
    encoded = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    # Convert to plain lists so each row can be stored in a DataFrame cell.
    return encoded["input_ids"].tolist(), encoded["attention_mask"].tolist()


input_ids, attention_mask = text2token(text, tokenizer, max_length=100)
data['input_ids'] = input_ids
data['attention_mask'] = attention_mask

# Random 80/20 split: sample 80% of the rows for training; the rows whose
# index was not drawn form the test set.
train_data = data.sample(frac=0.8)
test_data = data.loc[~data.index.isin(train_data.index)]
print(len(train_data), len(test_data))

# Re-number both frames from 0 so they can be indexed by position.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
print('DataLoader')
# Split the data into mini-batches of ``batch_size`` samples.
batch_size = 16
# Training batches are reshuffled every epoch; test batches keep file order.
train_loader = DataLoader(
    SentimentDataset(train_data),
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)
test_loader = DataLoader(
    SentimentDataset(test_data),
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)
import pickle
# Persist both loaders so the tokenisation step can be skipped next time.
for _path, _loader in (
    ('train_loader.pkl', train_loader),
    ('test_loader.pkl', test_loader),
):
    with open(_path, 'wb') as f:
        pickle.dump(_loader, f)

如果之前已经保存过 DataLoader（train_loader.pkl / test_loader.pkl），可以用下面的代码直接读取数据，跳过上面的数据构造步骤：

import pickle
# Restore the loaders written by the save step above.
# NOTE(review): unpickling executes arbitrary code; only load .pkl files that
# were produced locally by this script.
with open("train_loader.pkl", 'rb') as f:
    train_loader = pickle.load(f)
with open("test_loader.pkl", 'rb') as f:
    test_loader = pickle.load(f)

2.模型定义、训练和测试代码

定义模型

from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")


class fn_cls(nn.Module):
    """Pretrained encoder followed by a one-logit linear head.

    ``forward`` returns raw logits with one column per sample; callers apply
    the sigmoid themselves.

    Args:
        device: device the whole module (encoder and head) is moved to.
    """

    def __init__(self, device):
        super(fn_cls, self).__init__()
        # NOTE(review): "bert" looks like a local checkpoint directory, while
        # the tokenizer is loaded from "bert-base-chinese" - confirm they match.
        self.model = AutoModel.from_pretrained("bert")
        # Keep the embedding table the same size as the tokenizer vocabulary
        # (this matters once extra special tokens are added to the tokenizer).
        self.model.resize_token_embeddings(len(tokenizer))
        # Optional regularisation, disabled in the original:
        # self.dropout = nn.Dropout(0.3)
        # Read the encoder width from its config instead of hard-coding it;
        # 768 is kept as the fallback value used by the original code.
        hidden_size = getattr(self.model.config, "hidden_size", 768)
        self.l1 = nn.Linear(hidden_size, 1)
        # Move encoder and head together so they always share a device.
        self.to(device)

    def forward(self, x, attention_mask=None):
        """Return one logit per input sequence.

        Args:
            x: token ids for a batch of sequences.
            attention_mask: optional mask forwarded to the encoder.
        """
        outputs = self.model(x, attention_mask=attention_mask)
        # Shapes observed by the original author for a batch of 8 sequences
        # of 100 tokens:
        #   outputs[0]          -> torch.Size([8, 100, 768])
        #   outputs[1]          -> torch.Size([8, 768])
        #   outputs[0][:, 0, :] -> torch.Size([8, 768])
        # outputs[1] is presumably the pooled sentence representation.
        x = outputs[1]
        # x = self.dropout(x)
        x = self.l1(x)
        return x
# cls = fn_cls(device0)# from torch import optim
# optimizer = optim.Adam(cls.parameters(), lr=1e-4)
# Turns the model's raw logit into a probability in (0, 1).
sigmoid = nn.Sigmoid()
# Binary cross-entropy on probabilities, averaged over the batch.
# Pass a tensor as ``weight`` to re-weight individual samples.
criterion = nn.BCELoss(weight=None, reduction="mean")

测试代码

from sklearn import metrics
import numpy as np
from tqdm import tqdm


def test(device_test):
    """Evaluate the module-level ``cls`` model on the module-level ``test_loader``.

    Relies on the globals ``cls``, ``test_loader``, ``sigmoid``, ``criterion``
    and ``metrics``. Progress is printed every five batches; at the end a
    classification report and the accuracy are printed.

    Args:
        device_test: device the model and every batch are moved to.

    Returns:
        Tuple ``(acc, epoch_loss)``: the fraction of correctly classified
        samples (0..1) and the sum of the per-batch loss values as a float.

    Raises:
        ValueError: if ``test_loader`` yields no samples.
    """
    cls.to(device_test)
    cls.eval()
    epoch_loss = 0.0
    total = 0
    correct = 0
    acc = 0.0
    ave_loss = 0.0
    output_all = []
    label_all = []
    # Evaluation never needs gradients, so disable them once for the loop.
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            label = batch['label'].to(device_test).float().view(-1, 1)  # batch size * 1
            label_all.append(label)
            # Stack the collated sequence of tensors and transpose it to
            # batch size * 100.
            input_ids = torch.stack(batch['input_ids']).t().to(device_test)
            attention_mask = torch.stack(batch['attention_mask']).t().to(device_test)
            # Forward pass: logits -> probabilities, both batch size * 1.
            output = cls(input_ids, attention_mask=attention_mask)
            output = sigmoid(output)
            total += len(output)
            # Accumulate the loss as a Python float rather than as a tensor.
            loss = criterion(output, label)
            epoch_loss += loss.item()
            # Sum of the per-batch losses divided by the samples seen so far.
            ave_loss = epoch_loss / total
            # Threshold the probabilities at 0.5 by rounding.
            output = output.round()
            output_all.append(output)
            correct += (output == label).sum().item()
            acc = correct / total
            if batch_idx % 5 == 0:
                # ``acc`` is a fraction; scale it so the "%" in the message is right.
                print('[{}/{} ({:.0f}%)]\t正确分类的样本数：{}，样本总数：{}，准确率：{:.2f}%，ave_loss：{}'.format(
                    batch_idx, len(test_loader), 100. * batch_idx / len(test_loader),
                    correct, total, 100. * acc, ave_loss), end="\r")
    if total == 0:
        raise ValueError("test_loader yielded no samples; nothing to evaluate")
    # Final summary line.
    print('正确分类的样本数：{}，样本总数：{}，准确率：{:.2f}%，ave_loss：{}'.format(
        correct, total, 100. * acc, ave_loss))
    # Tensors must be copied to host memory before converting to numpy.
    output_all = torch.cat(output_all, 0)
    label_all = torch.cat(label_all, 0)
    output_all = np.array(output_all.cpu())
    label_all = np.array(label_all.cpu())
    acc_score = metrics.accuracy_score(label_all, output_all)
    print(metrics.classification_report(label_all, output_all))
    print("准确率:", acc_score)
    return acc, epoch_loss

训练代码

# Per-epoch history, appended to by ``train_one_epoch``.
train_acc_l = []
train_epoch_loss_l = []
test_acc_l = []
test_epoch_loss_l = []


def train_one_epoch(device_train, epoch_num):
    """Train the module-level ``cls`` model for one pass over ``train_loader``.

    Relies on the globals ``cls``, ``train_loader``, ``optimizer``,
    ``sigmoid``, ``criterion``, ``test`` and ``device1``. After the pass the
    model is evaluated with ``test(device1)`` and the train/test accuracy and
    loss are appended to the module-level history lists.

    Args:
        device_train: device the model and every batch are moved to.
        epoch_num: epoch index, used only in the printed banners.

    Returns:
        The test loss reported by ``test(device1)`` for this epoch.

    Raises:
        ValueError: if ``train_loader`` yields no samples.
    """
    print("______________________________________________")
    print("______________________________________________")
    print("_______________",epoch_num,"start_______________")
    print("______________________________________________")
    print("______________________________________________")
    cls.to(device_train)
    cls.train()
    epoch_loss = 0.0
    total = 0
    correct = 0
    acc = 0.0
    ave_loss = 0.0
    for batch_idx, batch in enumerate(train_loader):
        label = batch['label'].to(device_train).float().view(-1, 1)  # batch size * 1
        # Stack the collated sequence of tensors and transpose it to
        # batch size * 100.
        input_ids = torch.stack(batch['input_ids']).t().to(device_train)
        attention_mask = torch.stack(batch['attention_mask']).t().to(device_train)
        # Forward pass: logits -> probabilities, both batch size * 1.
        output = cls(input_ids, attention_mask=attention_mask)
        output = sigmoid(output)
        # Backward pass and parameter update; gradients are cleared afterwards
        # so the next batch starts from zero.
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Bookkeeping only - no gradients needed from here on.
        with torch.no_grad():
            predicted = output.round()  # threshold at 0.5
            total += len(predicted)
            # Accumulate the loss as a Python float rather than as a tensor.
            epoch_loss += loss.item()
            # Sum of the per-batch losses divided by the samples seen so far.
            ave_loss = epoch_loss / total
            correct += (predicted == label).sum().item()
            acc = correct / total
        if batch_idx % 5 == 0:
            # ``acc`` is a fraction; scale it so the "%" in the message is right.
            print('[{}/{} ({:.0f}%)]\t正确分类的样本数：{}，样本总数：{}，准确率：{:.2f}%，ave_loss：{}'.format(
                batch_idx, len(train_loader), 100. * batch_idx / len(train_loader),
                correct, total, 100. * acc, ave_loss), end="\r")
    if total == 0:
        raise ValueError("train_loader yielded no samples; nothing to train on")
    # Final summary line.
    print('正确分类的样本数：{}，样本总数：{}，准确率：{:.2f}%，ave_loss：{}'.format(
        correct, total, 100. * acc, ave_loss))
    # The original also gathered every output and label to compute an sklearn
    # accuracy that was never used; that dead work has been dropped.
    test_acc, test_epoch_loss = test(device1)
    print('train_acc:', acc, 'train_epoch_loss:', epoch_loss, 'test_acc:', test_acc, 'test_epoch_loss:', test_epoch_loss)
    train_acc_l.append(acc)
    train_epoch_loss_l.append(epoch_loss)
    test_acc_l.append(test_acc)
    test_epoch_loss_l.append(test_epoch_loss)
    print("______________________________________________")
    print("______________________________________________")
    print("_______________",epoch_num,"end_______________")
    print("______________________________________________")
    print("______________________________________________")
    return test_epoch_loss

3.微调

import time

cls = fn_cls(device0)

from torch import optim

# To resume from a saved checkpoint instead:
# cls=torch.load("./data/yxl_best.model",map_location=device0)
optimizer = optim.Adam(cls.parameters(), lr=1e-4)

# Baseline evaluation before any fine-tuning.
test(device1)

# Keep training while the test loss improves. The model is saved at the top
# of each iteration, i.e. before the epoch whose loss may turn out worse, so
# the file on disk always holds the weights that produced ``now_loss``.
now_loss, pre_epoch_loss, epoch = 999, 9999, 0
while True:
    if not (now_loss < pre_epoch_loss):
        break
    torch.save(cls, "./data/yxl_best.model")
    pre_epoch_loss = now_loss
    now_loss = train_one_epoch(device0, epoch)
    epoch += 1

4.预测、批量预测

def predict(device, s_l, cls):
    """Classify the sentence(s) ``s_l`` in a single forward pass.

    Uses the module-level ``tokenizer`` and ``sigmoid``.

    Args:
        device: device the model and inputs are moved to.
        s_l: text input accepted by the tokenizer (e.g. a list of sentences).
        cls: the classifier to run.

    Returns:
        Tuple ``(probabilities, rounded)``: the sigmoid outputs and the same
        values rounded to 0/1.
    """
    cls.to(device)
    cls.eval()
    with torch.no_grad():
        encoded = tokenizer(
            s_l,
            max_length=100,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        logits = cls(
            encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        )
        probabilities = sigmoid(logits)
        return probabilities, probabilities.round()
from tqdm import tqdm
def run(device, s_l, cls, bs):
    """Classify ``s_l`` in chunks of ``bs`` sentences, with a progress bar.

    Uses the module-level ``tokenizer`` and ``sigmoid``.

    Args:
        device: device the model and inputs are moved to.
        s_l: sequence of sentences to classify.
        cls: the classifier to run.
        bs: batch size, i.e. how many sentences go through the model at once.

    Returns:
        Tuple ``(rounded, probabilities)`` of nested Python lists holding one
        entry per input sentence, in input order.
    """
    cls.to(device)
    cls.eval()
    n_items = len(s_l)
    probabilities = []
    rounded = []
    with torch.no_grad():
        for start in tqdm(range(0, n_items, bs)):
            # Slicing clamps at the end, so the last chunk may be shorter.
            chunk = s_l[start:start + bs]
            encoded = tokenizer(
                chunk,
                max_length=100,
                padding='max_length',
                truncation=True,
                return_tensors="pt",
            )
            logits = cls(
                encoded["input_ids"].to(device),
                attention_mask=encoded["attention_mask"].to(device),
            )
            probs = sigmoid(logits)
            probabilities.extend(probs.tolist())
            rounded.extend(probs.round().tolist())
    return rounded, probabilities

预测实例：

# Two toy sentences; only the rounded 0/1 predictions are printed.
s = [
    '好好好好好好好',
    '坏坏坏坏坏坏坏坏',
]
_, rounded_predictions = predict(device1, s, cls)
print(rounded_predictions)

【实战】使用Bert微调完成文本二分类相关推荐

使用google的bert结合哈工大预训练模型进行中文/英文文本二分类，基于pytorch和transformer
使用bert的哈工大预训练模型进行中文/英文文本二分类,基于pytorch和transformer 前提简要介绍开始导入必要的包和环境准备并读取数据导入模型的tokenizer 对数据进行t ...
bert中文情感分析二分类任务详解
查看GPU版本和使用情况 import torch if torch.cuda.is_available():device = torch.device("cuda")print( ...
如何用Python和BERT做中文文本二元分类？| 程序员硬核评测
点击上方↑↑↑蓝字关注我们~ 「2019 Python开发者日」全日程揭晓,请扫码咨询 ↑↑↑ 作者 | 王树义来源 | 王树芝兰(ID:nkwangshuyi) 兴奋去年, Google 的 B ...
天池小布助手对话短文本语义匹配-文本二分类实践（pytorch）
目标:对句子二分类,检测两个句子是否表达是同一个意思,模型数据来自天池全球人工智能技术创新大赛[赛道三]详情数据格式: 模型:textcnn,lstm,lstm+attention最后选择用最后一种 ...
python深度神经网络文本二分类代码_如何用Python和深度神经网络识别图像？
只需要10几行Python代码,你就能自己构建机器视觉模型,对图片做出准确辨识和分类.快来试试吧! 视觉进化的作用,让人类对图像的处理非常高效. 这里,我给你展示一张照片. 如果我这样问你: 你能否 ...
Bert中文文本多分类与传统BOW+tfidf+LR中文文本多分类对比
最近在重温bert,对bert的中文文本多分类的效果很好奇,并将其与传统的非pre-train模型进行对比,除此之外,由于选用的是12层的base版的bert,还从第0层开始到12层,对每一层的输出进 ...
使用BERT做中文文本相似度计算与文本分类
转载请注明出处,原文地址: https://terrifyzhao.github.io/2018/11/29/使用BERT做中文文本相似度计算.html 简介最近Google推出了NLP大杀器BER ...
深度学习实战案例：电影评论二分类
第一个深度学习实战案例:电影评论分类公众号:机器学习杂货店作者:Peter 编辑:Peter 大家好,我是Peter~ 这里是机器学习杂货店 Machine Learning Grocery~ 本 ...
天池零基础入门NLP竞赛实战：Task4-基于深度学习的文本分类3-基于Bert预训练和微调进行文本分类
Task4-基于深度学习的文本分类3-基于Bert预训练和微调进行文本分类因为天池这个比赛的数据集是脱敏的,无法利用其它已经预训练好的模型,所以需要针对这个数据集自己从头预训练一个模型. 我们利用H ...

【实战】使用Bert微调完成文本二分类

使用Bert微调完成文本二分类

1.训练前准备

指定训练和预测的gpu

读取数据、分析数据

构造训练数据

2.模型定义、训练和测试代码

定义模型

测试代码

训练代码

3.微调

4.预测、批量预测

【实战】使用Bert微调完成文本二分类相关推荐

最新文章

热门文章