from common.log_utils import logFactory
from common.database_utils import database_util
from common import constant
import pandas as pd
from common.process_util import ProcessUtil
from tqdm import tqdm
from random import shuffle
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN

logger = logFactory("local train").log

# identifier/label columns that must not be used as features
drop_columns = ['uuid', 'row_id', 'mark']


def train(total_pos, total_neg, train_tuple, model_name):
    # train_tuple is passed in explicitly instead of being read from a global
    lgb_model = None
    train_params = {
        'task': 'train',
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 40,
        'tree_learner': 'serial',
        'metric': {'binary_logloss', 'auc', 'average_precision'},  # l1: mae, l2: mse
        'max_bin': 80,  # a smaller max_bin trains faster; a larger value can improve accuracy
        'max_depth': 6,
        # 'min_child_samples': 5,
        # "min_data_in_leaf": 10,
        "bagging_fraction": 0.9,  # row subsampling ratio (as in XGBoost); lower it to curb overfitting and speed up training
        "feature_fraction": 0.9,  # feature subsampling ratio (as in XGBoost); lower it to curb overfitting and speed up training
        "n_jobs": 8,
        "boost_from_average": False,
        'seed': 2022,
        "lambda_l1": 1e-5,
        "lambda_l2": 1e-5,
    }
    for i, data_tuple in enumerate(tqdm(train_tuple)):
        step_num = str(i)
        logger.info(f"Starting round {step_num}")
        pos_ids = data_tuple[0]
        neg_ids = data_tuple[1]
        # .copy() avoids SettingWithCopyWarning when assigning 'mark' below
        train_data_frame_pos = total_pos[total_pos.row_id.isin(pos_ids)].copy()
        train_data_frame_neg = total_neg[total_neg.row_id.isin(neg_ids)].copy()
        train_data_frame_pos['mark'] = 0
        train_data_frame_neg['mark'] = 1
        total_train_data = pd.concat([train_data_frame_pos, train_data_frame_neg], axis=0)
        # split into training and test sets
        data_train, data_test = train_test_split(total_train_data, train_size=0.8)
        train_y_data = data_train[['mark']]
        train_x_data = data_train.drop(columns=drop_columns)
        # train_x_data = data_train[['diff_EVENT_FLUX_V', 'diff_EVENT_CONSUM_V', 'diff_EVENT_VIDEO_FLUX_V', 'diff_kuaishou_use']]
        test_y_data = data_test[['mark']]
        test_x_data = data_test.drop(columns=drop_columns)
        # test_x_data = data_test[['diff_EVENT_FLUX_V', 'diff_EVENT_CONSUM_V', 'diff_EVENT_VIDEO_FLUX_V', 'diff_kuaishou_use']]
        # build the LightGBM datasets
        lgb_train = lgb.Dataset(train_x_data, train_y_data.values, silent=True)
        lgb_eval = lgb.Dataset(test_x_data, test_y_data.values, reference=lgb_train, silent=True)
        # continue boosting from the previous round's booster via init_model
        lgb_model = lgb.train(params=train_params,
                              train_set=lgb_train,
                              num_boost_round=6000,
                              valid_sets=lgb_eval,
                              init_model=lgb_model,
                              feature_name=train_x_data.columns.tolist(),
                              early_stopping_rounds=20,
                              verbose_eval=False,
                              keep_training_booster=True)
        # log the model's evaluation scores
        score_train = str(dict([(s[1], s[2]) for s in lgb_model.eval_train()]))
        score_valid = str(dict([(s[1], s[2]) for s in lgb_model.eval_valid()]))
        logger.info(f"Round {step_num} results:")
        logger.info(f"training set: {score_train}")
        logger.info(f"test set: {score_valid}")
        if i == len(train_tuple) - 1:
            # on the final round, report a classification summary and plots
            test_predict = lgb_model.predict(test_x_data)
            test_result = [0 if x < 0.5 else 1 for x in test_predict]
            result = metrics.classification_report(test_y_data.values, test_result)
            logger.info(result)
            ProcessUtil.draw_roc_auc(test_y_data.values.reshape([-1, ]), test_predict)
            sns.set(rc={'figure.figsize': (50, 50)})
            sns.barplot(y=train_x_data.columns, x=lgb_model.feature_importance())
            plt.show()
    lgb_model.save_model(model_name)
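
# ---------------------------------------------------------------------------
# Illustration (not part of the original pipeline): train() grows a single
# booster incrementally by feeding each round's booster back in through
# `init_model` with `keep_training_booster=True`, so later batches continue
# boosting from trees fitted on earlier batches. The minimal sketch below
# demonstrates the same continuation pattern on synthetic data;
# `_demo_incremental_training` is a hypothetical helper and is never called
# by this script.
# ---------------------------------------------------------------------------
def _demo_incremental_training(n_batches=3):
    import numpy as np
    rng = np.random.default_rng(2022)
    booster = None
    params = {'objective': 'binary', 'metric': 'auc', 'verbosity': -1}
    for _ in range(n_batches):
        x = rng.normal(size=(500, 4))
        y = (x[:, 0] + rng.normal(scale=0.5, size=500) > 0).astype(int)
        # each call resumes boosting from the previous batch's booster
        booster = lgb.train(params, lgb.Dataset(x, y), num_boost_round=50,
                            init_model=booster, keep_training_booster=True)
    return booster
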
def test(model_path, test_pos_path, test_neg_path):
    logger.info("Loading validation data")
    test_pos = pd.read_pickle(test_pos_path)
    test_neg = pd.read_pickle(test_neg_path)
    test_pos['mark'] = 0
    test_neg['mark'] = 1
    test_all_data = pd.concat([test_pos, test_neg], axis=0)
    test_y_data = test_all_data[['mark']]
    test_x_data = test_all_data.drop(columns=['uuid', 'mark'])
    # test_x_data = test_x_data[['diff_EVENT_FLUX_V', 'diff_EVENT_CONSUM_V', 'diff_EVENT_VIDEO_FLUX_V', 'diff_kuaishou_use']]
    lgb_model = lgb.Booster(model_file=model_path)
    test_predict = lgb_model.predict(test_x_data, num_iteration=lgb_model.best_iteration)
    # a 0.2 threshold (vs. the 0.5 used during training) trades precision for recall on class 1
    test_result = [1 if x > 0.2 else 0 for x in test_predict]
    result = metrics.classification_report(test_y_data.values, test_result)
    logger.info(result)
    ProcessUtil.draw_confusion_matrix(test_y_data.values, test_result)
    ProcessUtil.draw_roc_auc(test_y_data.values.reshape([-1, ]), test_predict)


def dbscan_data(df_list):
    df_all = pd.concat(df_list, axis=0)
    dbscan = DBSCAN(eps=0.8, min_samples=1)
    # cluster on the bucketed usage features
    feature_columns = (
        [f'EVENT_FLUX_C_{i}' for i in range(9)]
        + [f'EVENT_CONSUM_C_{i}' for i in range(8)]
        + [f'EVENT_VIDEO_FLUX_C_{i}' for i in range(8)]
        + [f'app_use_kuaishou_{i}' for i in range(8)]
        + [f'EVENT_ORDER_MONTH_C_{i}' for i in range(7)]
    )
    dbscan.fit(df_all[feature_columns])
    label_pred = dbscan.labels_
    print(label_pred)


if __name__ == "__main__":
    batch_num_pos = 10
    batch_num_neg = 100
    # note: the "pos" frame is loaded from train_neg_all.pkl and vice versa;
    # this inversion is consistent with the mark assignment in train()
    # (pos -> 0, neg -> 1) and with the commented-out test() call below
    total_pos = pd.read_pickle("./data/pkl/train_neg_all.pkl")
    total_neg = pd.read_pickle("./data/pkl/train_pos_all.pkl")
    positive_ids = range(0, total_pos.shape[0])
    negative_ids = range(0, total_neg.shape[0])
    pos_row_ids = ProcessUtil.partition_preserve_order(positive_ids, batch_num_pos)
    neg_row_ids = ProcessUtil.partition_preserve_order(negative_ids, batch_num_neg)
    train_tuple = ProcessUtil.gen_train_tuple(pos_row_ids, neg_row_ids, batch_num_pos)
    # add an auto-incrementing row id used to select each batch
    total_pos['row_id'] = range(0, len(total_pos))
    total_neg['row_id'] = range(0, len(total_neg))
    train(total_pos, total_neg, train_tuple, "./data/model/temp_model030901.model")
    # test("./data/model/temp_model030802.model", './data/pkl/valid_neg_all.pkl', './data/pkl/valid_pos_all.pkl')
    # dbscan_data([pd.read_pickle('./data/pkl/process_dbscan.pkl')])
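
# ---------------------------------------------------------------------------
# Assumption: ProcessUtil.partition_preserve_order and
# ProcessUtil.gen_train_tuple live in common/process_util.py and are not
# shown in this file. Judging only from how they are used above, a minimal
# hypothetical sketch could look like the two functions below: split the id
# range into contiguous chunks, then pair one positive chunk with each
# negative chunk so every round sees a roughly fixed pos/neg ratio. These
# are illustrative stand-ins, not the project's actual implementations,
# and are never called by this script.
# ---------------------------------------------------------------------------
def _sketch_partition_preserve_order(ids, num_batches):
    ids = list(ids)
    size, extra = divmod(len(ids), num_batches)
    chunks, start = [], 0
    for i in range(num_batches):
        end = start + size + (1 if i < extra else 0)  # spread the remainder
        chunks.append(ids[start:end])
        start = end
    return chunks


def _sketch_gen_train_tuple(pos_row_ids, neg_row_ids, batch_num_pos):
    # reuse positive chunks cyclically: one (pos_ids, neg_ids) pair per round
    return [(pos_row_ids[i % batch_num_pos], neg_ids)
            for i, neg_ids in enumerate(neg_row_ids)]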