local_process_deep.py

from common.log_utils import logFactory
from common.database_utils import database_util
from common import constant
import pandas as pd
from tqdm import tqdm
from random import shuffle
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import torch
import torch.nn as nn
import torch.utils.data as data_utils
from torch.utils.data import Dataset, DataLoader

logger = logFactory("local preprocess data").log
# click_client = database_util.get_client()
# # Get the number of positive/negative samples and prepare random ids
# tb_pos = constant.insert_pos_tb_name
# tb_neg = constant.insert_neg_tb_name
# positive_sql = f"select count(1) from {tb_pos}"
# positive_sql_result = click_client.execute(positive_sql)[0][0]
# positive_ids = range(0, positive_sql_result)
# negative_sql = f"select count(1) from {tb_neg}"
# negative_sql_result = click_client.execute(negative_sql)[0][0]
# negative_ids = range(0, negative_sql_result)
#
# # Number of partitions
batch_num_pos = 10
batch_num_neg = 150

# Compute each batch's row_ids from the amount of data per batch
def partition_preserve_order(list_in, n):
    indices = list(range(len(list_in)))
    shuffle(indices)
    index_partitions = [sorted(indices[i::n]) for i in range(n)]
    return [[list_in[i] for i in index_partition]
            for index_partition in index_partitions]
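# Quick illustration of the partitioning (hypothetical values; the actual output
# varies with the shuffle): splitting 10 row ids into n=3 groups keeps each group
# in ascending order while membership is random, e.g.
#   partition_preserve_order(list(range(10)), 3)
#   -> [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]   # one possible outcome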
def gen_train_tuple(pos_row_ids, neg_row_ids):
    result = []
    for i, neg_ids in enumerate(neg_row_ids):
        pos_index = i % batch_num_pos
        result.append((pos_row_ids[pos_index], neg_ids))
    return result
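# Pairing illustration (hypothetical sizes, not the actual settings): with 2
# positive partitions and 3 negative partitions, the positive partitions are
# reused round-robin, giving (pos[0], neg[0]), (pos[1], neg[1]), (pos[0], neg[2]),
# so every negative partition is matched with some positive partition.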
# batch_num_pos = 10
# batch_num_neg = 5
total_pos = pd.read_pickle("data_pos_2013.pkl")
total_neg = pd.read_pickle("data_neg_2013.pkl")
# 1862547
positive_ids = range(0, total_pos.shape[0])
# 28500000
negative_ids = range(0, total_neg.shape[0])
pos_row_ids = partition_preserve_order(positive_ids, batch_num_pos)
neg_row_ids = partition_preserve_order(negative_ids, batch_num_neg)
train_tuple = gen_train_tuple(pos_row_ids, neg_row_ids)
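# With the settings above this yields batch_num_neg = 150 (positive partition,
# negative partition) pairs, i.e. 150 optimization steps in start_train below.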
'''
Build the neural network:
the input layer has 2 nodes, both hidden layers have 5 nodes, and the output
layer has 2 nodes (one probability per class).
'''
net = nn.Sequential(
    nn.Linear(2, 5),     # fully connected: input layer (2) -> first hidden layer (5)
    torch.nn.Sigmoid(),  # sigmoid activation for the first hidden layer
    nn.Linear(5, 5),     # fully connected: first hidden layer (5) -> second hidden layer (5)
    torch.nn.Sigmoid(),  # sigmoid activation for the second hidden layer
    nn.Linear(5, 2),     # fully connected: second hidden layer (5) -> output layer (2)
    nn.Softmax(dim=1)    # normalize the two outputs into class probabilities
)
# Configure the loss function and the optimizer
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)  # stochastic gradient descent over the network parameters
loss_func = torch.nn.CrossEntropyLoss()  # cross-entropy loss
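# Note (not in the original script): CrossEntropyLoss applies log-softmax to its
# input internally, so feeding it the Softmax output above normalizes twice. It
# still trains, but dropping the final Softmax (or using NLLLoss on
# log-probabilities) is the usual pairing in PyTorch.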
def start_train():
    for i, data_tuple in enumerate(tqdm(train_tuple)):
        step_num = str(i)
        logger.info(f"Starting round {step_num}")
        pos_ids = data_tuple[0]
        neg_ids = data_tuple[1]
        # .copy() avoids pandas' SettingWithCopyWarning when adding the label column
        train_data_frame_pos = total_pos[total_pos.row_id.isin(pos_ids)].copy()
        train_data_frame_neg = total_neg[total_neg.row_id.isin(neg_ids)].copy()
        train_data_frame_pos['mark'] = 0
        train_data_frame_neg['mark'] = 1
        total_train_data = pd.concat([train_data_frame_pos, train_data_frame_neg], axis=0)
        # Split into training and test sets
        data_train, data_test = train_test_split(total_train_data, train_size=0.7)
        train_y_data = data_train[['mark']]
        train_x_data = data_train.drop(columns=['row_id', 'month', 'mark'])
        # Cast features to float32 to match the network's parameter dtype
        x_train = torch.from_numpy(train_x_data.values).float()
        y_train = torch.from_numpy(train_y_data.values)
        # Held-out split (not evaluated inside this loop)
        test_x_data = data_test.drop(columns=['row_id', 'month', 'mark'])
        test_y_data = data_test[['mark']]
        y_p = net(x_train)
        # CrossEntropyLoss expects class indices of shape (N,), hence the squeeze
        loss = loss_func(y_p, y_train.long().squeeze(1))  # compute the loss
        optimizer.zero_grad()  # clear the accumulated gradients
        loss.backward()        # back-propagate the error to compute gradients
        optimizer.step()       # update the network parameters with the gradients
        if i % 1000 == 0:
            print('step: {}, loss: {}'.format(i, loss.data.item()))
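# A minimal evaluation sketch (not part of the original script): the held-out
# split produced by train_test_split above is never scored, so a helper along
# these lines could report accuracy on it. The frame layout (a 'mark' label
# column plus 'row_id'/'month' metadata columns) is assumed from the training
# code; the function is defined only, never called.
def evaluate_sketch(data_test):
    x_test = torch.from_numpy(data_test.drop(columns=['row_id', 'month', 'mark']).values).float()
    y_test = torch.from_numpy(data_test['mark'].values).long()
    with torch.no_grad():           # no gradients needed for scoring
        probs = net(x_test)         # shape (N, 2) class probabilities
        preds = probs.argmax(dim=1) # predicted class per row
    accuracy = (preds == y_test).float().mean().item()
    logger.info(f"held-out accuracy: {accuracy:.4f}")
    return accuracy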
if __name__ == "__main__":
    start_train()
# data1 = pd.read_pickle("./data_neg_part_1_2013.pkl")
# data2 = pd.read_pickle("./data_neg_part_2_2013.pkl")
# data3 = pd.read_pickle("./data_neg_part_3_2013.pkl")
# data4 = pd.read_pickle("./data_neg_part_4_2013.pkl")
# data5 = pd.read_pickle("./data_neg_part_5_2013.pkl")
# data_all = pd.concat([data1, data2, data3, data4], axis=0)
# data_all.to_pickle("./data_neg_2013.pkl")
pass