# local_train.py

from common.log_utils import logFactory
import pandas as pd
from common.process_util import ProcessUtil
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN

logger = logFactory("local train").log

# bookkeeping-only columns that must not be fed to the model
drop_columns = ['uuid', 'row_id', 'mark']
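

# train() fits a single LightGBM booster incrementally: each round draws one
# batch of positive rows and one batch of negative rows (paired up in
# train_tuple) and continues boosting from the booster produced by the
# previous round via init_model / keep_training_booster.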
def train(total_pos, total_neg, train_tuple, model_name):
    lgb_model = None
    train_params = {
        'task': 'train',
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 40,
        'tree_learner': 'serial',
        'metric': {'binary_logloss', 'auc', 'average_precision'},  # note: l1 = MAE, l2 = MSE are regression aliases, unused here
        'max_bin': 80,  # a smaller max_bin trains faster; a larger one improves accuracy
        'max_depth': 6,
        # 'min_child_samples': 5,
        # "min_data_in_leaf": 10,
        "bagging_fraction": 0.9,  # row subsampling ratio (as in XGBoost); lower it to curb overfitting and speed up training
        "feature_fraction": 0.9,  # feature subsampling ratio (as in XGBoost); lower it to curb overfitting and speed up training
        "n_jobs": 8,
        "boost_from_average": False,
        'seed': 2022,
        "lambda_l1": 1e-5,
        "lambda_l2": 1e-5,
    }
    for i, data_tuple in enumerate(tqdm(train_tuple)):
        step_num = str(i)
        logger.info(f"Starting round {step_num}")
        pos_ids = data_tuple[0]
        neg_ids = data_tuple[1]
        # .copy() avoids pandas' SettingWithCopyWarning when the mark column is added
        train_data_frame_pos = total_pos[total_pos.row_id.isin(pos_ids)].copy()
        train_data_frame_neg = total_neg[total_neg.row_id.isin(neg_ids)].copy()
        train_data_frame_pos['mark'] = 0
        train_data_frame_neg['mark'] = 1
        total_train_data = pd.concat([train_data_frame_pos, train_data_frame_neg], axis=0)
        # split into training and validation sets
        data_train, data_test = train_test_split(total_train_data, train_size=0.8)
        train_y_data = data_train[['mark']]
        train_x_data = data_train.drop(columns=drop_columns)
        # train_x_data = data_train[['diff_EVENT_FLUX_V', 'diff_EVENT_CONSUM_V', 'diff_EVENT_VIDEO_FLUX_V', 'diff_kuaishou_use']]
        test_y_data = data_test[['mark']]
        test_x_data = data_test.drop(columns=drop_columns)
        # test_x_data = data_test[['diff_EVENT_FLUX_V', 'diff_EVENT_CONSUM_V', 'diff_EVENT_VIDEO_FLUX_V', 'diff_kuaishou_use']]
        # build the LightGBM datasets
        lgb_train = lgb.Dataset(train_x_data, train_y_data.values, silent=True)
        lgb_eval = lgb.Dataset(test_x_data, test_y_data.values, reference=lgb_train, silent=True)
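        # Passing the previous round's booster as init_model continues training
        # instead of starting over; keep_training_booster=True keeps the returned
        # booster in a trainable state so the next round can extend it again.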
        lgb_model = lgb.train(params=train_params, train_set=lgb_train, num_boost_round=6000, valid_sets=lgb_eval,
                              init_model=lgb_model, feature_name=train_x_data.columns.tolist(),
                              early_stopping_rounds=20,
                              verbose_eval=False, keep_training_booster=True)
        # log the model's evaluation scores
        score_train = str(dict([(s[1], s[2]) for s in lgb_model.eval_train()]))
        score_valid = str(dict([(s[1], s[2]) for s in lgb_model.eval_valid()]))
        logger.info(f"Results for round {step_num}:")
        logger.info(f"On the training set: {score_train}")
        logger.info(f"On the validation set: {score_valid}")
        if i == len(train_tuple) - 1:
            # after the final round, report detailed metrics and feature importance
            test_predict = lgb_model.predict(test_x_data)
            test_result = [0 if x < 0.5 else 1 for x in test_predict]
            result = metrics.classification_report(test_y_data.values, test_result)
            logger.info(result)
            ProcessUtil.draw_roc_auc(test_y_data.values.reshape([-1, ]), test_predict)
            sns.set(rc={'figure.figsize': (50, 50)})
            sns.barplot(y=train_x_data.columns, x=lgb_model.feature_importance())
            plt.show()
    lgb_model.save_model(model_name)
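

# ProcessUtil.draw_roc_auc is a project helper whose source is not shown in
# this file. A minimal sketch of what such a helper might look like, built on
# sklearn and matplotlib (the name and exact behavior below are assumptions,
# not the project's actual implementation):
def _draw_roc_auc_sketch(y_true, y_score):
    # compute ROC curve points and the area under the curve
    fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
    auc_value = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"ROC (AUC = {auc_value:.3f})")
    plt.plot([0, 1], [0, 1], linestyle="--")  # chance diagonal
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.legend()
    plt.show()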


def test(model_path, test_pos_path, test_neg_path):
    logger.info("Loading validation data")
    test_pos = pd.read_pickle(test_pos_path)
    test_neg = pd.read_pickle(test_neg_path)
    test_pos['mark'] = 0
    test_neg['mark'] = 1
    test_all_data = pd.concat([test_pos, test_neg], axis=0)
    test_y_data = test_all_data[['mark']]
    # drop() without inplace, so test_all_data and test_y_data stay untouched
    test_x_data = test_all_data.drop(columns=['uuid', 'mark'])
    # test_x_data = test_x_data[['diff_EVENT_FLUX_V', 'diff_EVENT_CONSUM_V', 'diff_EVENT_VIDEO_FLUX_V', 'diff_kuaishou_use']]
    lgb_model = lgb.Booster(model_file=model_path)
    test_predict = lgb_model.predict(test_x_data, num_iteration=lgb_model.best_iteration)
    # a threshold of 0.2 (instead of 0.5) trades precision for recall on the
    # minority class
    test_result = [1 if x > 0.2 else 0 for x in test_predict]
    result = metrics.classification_report(test_y_data.values, test_result)
    logger.info(result)
    ProcessUtil.draw_confusion_matrix(test_y_data.values, test_result)
    ProcessUtil.draw_roc_auc(test_y_data.values.reshape([-1, ]), test_predict)
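

# Likewise, ProcessUtil.draw_confusion_matrix is project-internal; a minimal
# sketch of an equivalent (assumed name and layout, not the actual helper):
def _draw_confusion_matrix_sketch(y_true, y_pred):
    # rows = actual class, columns = predicted class
    cm = metrics.confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()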


def dbscan_data(df_list):
    df_all = pd.concat(df_list, axis=0)
    dbscan = DBSCAN(eps=0.8, min_samples=1)
    dbscan.fit(df_all[[
        'EVENT_FLUX_C_0', 'EVENT_FLUX_C_1', 'EVENT_FLUX_C_2', 'EVENT_FLUX_C_3',
        'EVENT_FLUX_C_4', 'EVENT_FLUX_C_5', 'EVENT_FLUX_C_6', 'EVENT_FLUX_C_7',
        'EVENT_FLUX_C_8',
        'EVENT_CONSUM_C_0', 'EVENT_CONSUM_C_1', 'EVENT_CONSUM_C_2', 'EVENT_CONSUM_C_3',
        'EVENT_CONSUM_C_4', 'EVENT_CONSUM_C_5', 'EVENT_CONSUM_C_6', 'EVENT_CONSUM_C_7',
        'EVENT_VIDEO_FLUX_C_0', 'EVENT_VIDEO_FLUX_C_1', 'EVENT_VIDEO_FLUX_C_2', 'EVENT_VIDEO_FLUX_C_3',
        'EVENT_VIDEO_FLUX_C_4', 'EVENT_VIDEO_FLUX_C_5', 'EVENT_VIDEO_FLUX_C_6', 'EVENT_VIDEO_FLUX_C_7',
        'app_use_kuaishou_0', 'app_use_kuaishou_1', 'app_use_kuaishou_2', 'app_use_kuaishou_3',
        'app_use_kuaishou_4', 'app_use_kuaishou_5', 'app_use_kuaishou_6', 'app_use_kuaishou_7',
        'EVENT_ORDER_MONTH_C_0', 'EVENT_ORDER_MONTH_C_1', 'EVENT_ORDER_MONTH_C_2', 'EVENT_ORDER_MONTH_C_3',
        'EVENT_ORDER_MONTH_C_4', 'EVENT_ORDER_MONTH_C_5', 'EVENT_ORDER_MONTH_C_6',
    ]])
    # labels_ holds one cluster id per row; with min_samples=1 every point is a
    # core point, so DBSCAN never assigns the noise label -1 here
    label_pred = dbscan.labels_
    logger.info(label_pred)
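

# ProcessUtil.partition_preserve_order and ProcessUtil.gen_train_tuple are
# project helpers not shown in this file. Judging by how they are used below,
# the first likely splits an id range into n contiguous, order-preserving
# chunks, and the second likely pairs every negative chunk with a positive
# chunk, cycling through the positives. Minimal sketches under those
# assumptions (names and behavior are guesses, not the actual implementation):
def _partition_preserve_order_sketch(ids, n):
    ids = list(ids)
    size = (len(ids) + n - 1) // n  # ceiling division so no id is dropped
    return [ids[i * size:(i + 1) * size] for i in range(n)]


def _gen_train_tuple_sketch(pos_row_ids, neg_row_ids, batch_num_pos):
    # one (pos_ids, neg_ids) pair per negative batch; positive batches are
    # reused cyclically since there are far fewer of them
    return [(pos_row_ids[i % batch_num_pos], neg_ids)
            for i, neg_ids in enumerate(neg_row_ids)]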


if __name__ == "__main__":
    batch_num_pos = 10
    batch_num_neg = 100
    total_pos = pd.read_pickle("./data/pkl/train_neg_all.pkl")
    total_neg = pd.read_pickle("./data/pkl/train_pos_all.pkl")
    positive_ids = range(0, total_pos.shape[0])
    negative_ids = range(0, total_neg.shape[0])
    pos_row_ids = ProcessUtil.partition_preserve_order(positive_ids, batch_num_pos)
    neg_row_ids = ProcessUtil.partition_preserve_order(negative_ids, batch_num_neg)
    train_tuple = ProcessUtil.gen_train_tuple(pos_row_ids, neg_row_ids, batch_num_pos)
    # add an auto-incrementing row id, used to select each round's batch rows
    total_pos['row_id'] = range(0, len(total_pos))
    total_neg['row_id'] = range(0, len(total_neg))
    train(total_pos, total_neg, train_tuple, "./data/model/temp_model030901.model")
    # test("./data/model/temp_model030802.model", './data/pkl/valid_neg_all.pkl', './data/pkl/valid_pos_all.pkl')
    # dbscan_data([pd.read_pickle('./data/pkl/process_dbscan.pkl')])