import sys

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn import metrics
from torchsampler import ImbalancedDatasetSampler

from model import BiRNN
from userdataset import UserDataset
from config import Config

sys.path.append('../')
from common.log_utils import logFactory

logger = logFactory("train_main").log
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
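
# Config.adjust_learning_rate(optimizer, epoch), used in the training loop below,
# lives in config.py and is expected to return the optimizer together with the
# learning rate now in effect. A plausible step-decay implementation might look
# like the following (an assumption for illustration, not the project's actual code):
#
#   @staticmethod
#   def adjust_learning_rate(optimizer, epoch, decay=0.5, every=10):
#       lr = Config.learning_rate * (decay ** (epoch // every))
#       for param_group in optimizer.param_groups:
#           param_group['lr'] = lr
#       return optimizer, lr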
def train(train_loader, test_loader):
    model = BiRNN(input_size=Config.input_size, hidden_dim=Config.hidden_dim, label_size=Config.label_size,
                  batch_size=Config.batch_size, num_layers=Config.num_layers, use_gpu=Config.use_gpu)
    if Config.use_gpu:
        model = model.cuda()
    # CrossEntropyLoss holds no parameters, so .to(device) is a no-op on CPU
    loss_function = nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=Config.learning_rate)
    train_loss_ = []
    test_loss_ = []
    train_acc_ = []
    test_acc_ = []
    logger.info(f"Starting training, {Config.epochs} epochs in total")
    for epoch in range(Config.epochs):
        optimizer, lr = Config.adjust_learning_rate(optimizer, epoch)
        logger.info(f"epoch-{epoch}, current lr: {lr}")

        # train epoch
        model.train()
        total_acc = 0.0
        total_loss = 0.0
        total = 0.0
        for index, (contents, labels) in enumerate(tqdm(train_loader)):
            try:
                train_inputs = contents
                train_labels = torch.squeeze(labels)
                if Config.use_gpu:
                    train_inputs, train_labels = train_inputs.cuda(), train_labels.cuda()

                model.zero_grad()
                model.batch_size = len(train_labels)
                model.hidden = model.init_hidden()
                output = model(train_inputs)

                loss = loss_function(output, train_labels)
                loss.backward()
                optimizer.step()

                # calc training acc
                _, predicted = torch.max(output.detach(), 1)
                total_acc += (predicted == train_labels).sum().item()
                total += len(train_labels)
                total_loss += loss.item()

                if index % 64 == 0:
                    logger.info(f"Epoch {epoch} : batch {index}, LOSS = {loss.item():.4f}")
                    result = metrics.classification_report(train_labels.cpu().numpy(), predicted.cpu().numpy())
                    logger.info('in train epoch')
                    logger.info(result)
            except Exception as e:
                logger.warning(e)
                continue
        train_loss_.append(total_loss / total)
        train_acc_.append(total_acc / total)
        # test epoch
        model.eval()
        total_acc = 0.0
        total_loss = 0.0
        total = 0.0
        with torch.no_grad():
            for index, (contents, labels) in enumerate(test_loader):
                try:
                    test_inputs = contents
                    test_labels = torch.squeeze(labels)
                    if Config.use_gpu:
                        test_inputs, test_labels = test_inputs.cuda(), test_labels.cuda()

                    model.batch_size = len(test_labels)
                    model.hidden = model.init_hidden()
                    output = model(test_inputs)

                    # .to() is not in-place; the cast must be assigned back
                    test_labels = test_labels.to(torch.int64)
                    loss = loss_function(output, test_labels)

                    # calc testing acc
                    _, predicted = torch.max(output, 1)
                    total_acc += (predicted == test_labels).sum().item()
                    total += len(test_labels)
                    total_loss += loss.item()

                    if index % 64 == 0:
                        logger.info(f"Epoch {epoch} : batch {index}, LOSS = {loss.item():.4f}")
                        result = metrics.classification_report(test_labels.cpu().numpy(), predicted.cpu().numpy())
                        logger.info('in test epoch')
                        logger.info(result)
                except Exception as e:
                    logger.warning(e)
                    continue
        test_loss_.append(total_loss / total)
        test_acc_.append(total_acc / total)
        logger.info('[Epoch: %3d/%3d] Training Loss: %.3f, Testing Loss: %.3f, Training Acc: %.3f, Testing Acc: %.3f'
                    % (epoch, Config.epochs, train_loss_[epoch], test_loss_[epoch], train_acc_[epoch], test_acc_[epoch]))

    result = {
        'train loss': train_loss_,
        'test loss': test_loss_,
        'train acc': train_acc_,
        'test acc': test_acc_,
    }
    if Config.use_plot:
        import PlotFigure as PF
        PF.PlotFigure(result, Config.plot_save)
    if Config.use_save:
        torch.save(model, "./model")
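        # Note: torch.save(model, "./model") pickles the whole model object and
        # ties the checkpoint to this exact class/module layout. The more portable
        # pattern is to save only the weights and rebuild the model at load time:
        #   torch.save(model.state_dict(), "./model_state.pt")
        #   model.load_state_dict(torch.load("./model_state.pt"))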
def start():
    train_data_set = UserDataset("../data/csv_lstm_data/final_train_dict.txt", "train_data")
    # shuffle must be left off when a sampler is supplied: DataLoader raises a
    # ValueError if both shuffle=True and sampler are set.
    train_loader = DataLoader(dataset=train_data_set, batch_size=Config.batch_size,
                              sampler=ImbalancedDatasetSampler(train_data_set))
    # alternative without resampling:
    # train_loader = DataLoader(dataset=train_data_set, batch_size=Config.batch_size,
    #                           shuffle=Config.shuffle_train_data, num_workers=8)
    test_data_set = UserDataset("../data/csv_lstm_data/final_test_dict.txt", "test_data")
    test_loader = DataLoader(dataset=test_data_set, batch_size=Config.batch_size,
                             sampler=ImbalancedDatasetSampler(test_data_set))
    # alternative without resampling:
    # test_loader = DataLoader(dataset=test_data_set, batch_size=Config.batch_size,
    #                          shuffle=Config.shuffle_train_data, num_workers=8)
    train(train_loader, test_loader)
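
# Note: ImbalancedDatasetSampler rebalances batches by oversampling the minority
# class; applying it to the *test* loader (as above) changes the class mix the
# reported test metrics are computed on. Depending on the torchsampler version,
# the dataset must expose its labels, e.g. via a get_labels() method or a
# callback_get_label argument. The sketch below of the interface UserDataset
# presumably provides is an illustration, not the real class from userdataset.py:
#
#   class UserDataset(torch.utils.data.Dataset):
#       def __init__(self, dict_path, name):
#           # each line of dict_path is "<pickle_path> <label>"
#           ...
#       def __len__(self): ...
#       def __getitem__(self, idx): ...  # -> (features, label) tensors
#       def get_labels(self): ...        # -> labels for ImbalancedDatasetSampler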
if __name__ == "__main__":
    start()
# --- one-time data preparation (kept for reference) ---
# Step 1: read path_dict.txt ("<path> <label>" per line) and hold out 1/5 of
# each class for testing. Needs: import random.
# f = open("../data/csv_lstm_data/path_dict.txt", "r")
# lines = f.read().split("\n")
# label_dict = {}
# pos = []
# neg = []
# for line in lines:
#     if line == "":
#         continue
#     file_path = line.split(" ")[0]
#     label_value = line.split(" ")[1]
#     if label_value == "0":
#         pos.append(file_path)
#     else:
#         neg.append(file_path)
# pos_test = random.sample(pos, int(len(pos) / 5))
# pos_train = [elem for elem in pos if elem not in pos_test]
#
# neg_test = random.sample(neg, int(len(neg) / 5))
# neg_train = [elem for elem in neg if elem not in neg_test]
#
# with open("../data/csv_lstm_data/train_dict.txt", "a") as f:
#     for e in pos_train:
#         f.write(e + " " + "0" + "\n")
#     for e in neg_train:
#         f.write(e + " " + "1" + "\n")
#
# with open("../data/csv_lstm_data/test_dict.txt", "a") as f:
#     for e in pos_test:
#         f.write(e + " " + "0" + "\n")
#     for e in neg_test:
#         f.write(e + " " + "1" + "\n")

# Step 2: keep only the samples whose pickled DataFrame has length 2.
# Needs: import pandas as pd.
# train_final = []
# test_final = []
# for i, path in enumerate(["../data/csv_lstm_data/train_dict.txt", "../data/csv_lstm_data/test_dict.txt"]):
#     with open(path, "r") as f:
#         lines = f.read()
#         line = lines.split("\n")
#         for l in line:
#             path = "." + l.split(" ")[0]
#             label = l.split(" ")[1]
#             df = pd.read_pickle(path)
#             if len(df) == 2:
#                 if i == 0:
#                     train_final.append((path, label))
#                 else:
#                     test_final.append((path, label))
#
# with open('../data/csv_lstm_data/final_train_dict.txt', "a") as f:
#     for e in train_final:
#         f.write(e[0] + " " + str(e[1]) + "\n")
#
# with open('../data/csv_lstm_data/final_test_dict.txt', "a") as f:
#     for e in test_final:
#         f.write(e[0] + " " + str(e[1]) + "\n")
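
# The preparation above can be expressed as two self-contained helpers. This is
# a sketch under the same assumptions as the commented code (space-separated
# "<path> <label>" lines, a 1/5 hold-out per class, keeping only samples whose
# pickled DataFrame has length 2); the function names are illustrative and not
# part of the original project.

def split_path_dict(path_dict="../data/csv_lstm_data/path_dict.txt",
                    train_out="../data/csv_lstm_data/train_dict.txt",
                    test_out="../data/csv_lstm_data/test_dict.txt"):
    import random
    pos, neg = [], []
    with open(path_dict, "r") as f:
        for line in f.read().split("\n"):
            if line == "":
                continue
            parts = line.split(" ")
            file_path, label_value = parts[0], parts[1]
            (pos if label_value == "0" else neg).append(file_path)
    with open(train_out, "a") as train_f, open(test_out, "a") as test_f:
        for samples, label in ((pos, "0"), (neg, "1")):
            held_out = set(random.sample(samples, len(samples) // 5))
            for elem in samples:
                (test_f if elem in held_out else train_f).write(f"{elem} {label}\n")


def filter_final_dicts():
    import pandas as pd
    pairs = (("../data/csv_lstm_data/train_dict.txt",
              "../data/csv_lstm_data/final_train_dict.txt"),
             ("../data/csv_lstm_data/test_dict.txt",
              "../data/csv_lstm_data/final_test_dict.txt"))
    for src, dst in pairs:
        kept = []
        with open(src, "r") as f:
            for l in f.read().split("\n"):
                if l == "":
                    continue
                parts = l.split(" ")
                # paths in the intermediate dicts are relative; prepend "."
                path, label = "." + parts[0], parts[1]
                if len(pd.read_pickle(path)) == 2:
                    kept.append((path, label))
        with open(dst, "a") as f:
            for path, label in kept:
                f.write(f"{path} {label}\n")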
|