"""Exploratory plots of feature columns, split by the sample ``mark``.

The script loads two pickled sample frames (positive and negative), tags each
with a ``mark`` column, stacks them into one frame and draws seaborn count
plots of selected columns, one panel per column.
"""
from random import shuffle

import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from common import constant
from common.database_utils import database_util
from common.log_utils import logFactory

# Number of batches (groups) for the positive / negative samples.
batch_num_pos = 1
batch_num_neg = 13

# Number of copies of the positive frame stacked into the combined frame.
# NOTE(review): presumably this offsets class imbalance -- confirm.
pos_repeat_count = 15


def partition_preserve_order(list_in, n):
    """Randomly split ``list_in`` into ``n`` partitions, keeping element order.

    The positions of ``list_in`` are shuffled and dealt into ``n`` strided
    groups; each group of positions is then sorted, so the elements inside
    every partition appear in the same relative order as in ``list_in``.

    Args:
        list_in: Indexable sequence to split (e.g. a list of row ids).
        n: Number of partitions to produce.

    Returns:
        A list of ``n`` lists whose sizes differ by at most one.
    """
    indices = list(range(len(list_in)))
    shuffle(indices)
    # Sorting each strided slice restores the original relative order.
    index_partitions = [sorted(indices[start::n]) for start in range(n)]
    return [[list_in[i] for i in partition] for partition in index_partitions]


def gen_train_tuple(pos_row_ids, neg_row_ids):
    """Pair every negative batch with a positive batch, round-robin.

    Args:
        pos_row_ids: Sequence of positive batches; entry ``i % batch_num_pos``
            is used for the ``i``-th negative batch.
        neg_row_ids: Sequence of negative batches.

    Returns:
        A list of ``(positive_batch, negative_batch)`` tuples, one per entry
        of ``neg_row_ids`` and in the same order.

    Raises:
        IndexError: If ``pos_row_ids`` is shorter than the selected index.
    """
    return [
        (pos_row_ids[i % batch_num_pos], neg_batch)
        for i, neg_batch in enumerate(neg_row_ids)
    ]


def _plot_count_grid(data, columns, titles, figsize=(20, 5)):
    """Draw one count plot per column in a single row, coloured by ``mark``.

    Args:
        data: DataFrame holding ``columns`` and a ``mark`` column.
        columns: Column names to plot, one panel each (left to right).
        titles: Panel titles, aligned with ``columns``.
        figsize: Figure size handed to ``plt.subplots``.

    Returns:
        ``(figure, axes)`` where ``axes`` is the 1-D row of panel axes.
    """
    # squeeze=False keeps a 2-D axes array even for a single column.
    fig, axes = plt.subplots(1, len(columns), figsize=figsize, squeeze=False)
    row = axes[0]
    for ax, column, title in zip(row, columns, titles):
        sns.countplot(x=column, hue='mark', data=data, ax=ax)
        ax.set_title(title)
    return fig, row


logger = logFactory("local data analysis new").log

total_pos = pd.read_pickle("data_pos_0218_06.pkl")
total_neg = pd.read_pickle("data_neg_part_1_0218_06.pkl")

# These are aliases, not copies: adding ``mark`` below also adds the column to
# ``total_pos`` / ``total_neg``.
train_data_frame_pos = total_pos
train_data_frame_neg = total_neg

# NOTE(review): the positive frame gets mark 0 and the negative frame mark 1 --
# confirm this is the intended encoding.
train_data_frame_pos['mark'] = 0
train_data_frame_neg['mark'] = 1

# Stack the repeated positive frame on top of the negative frame.
total_train_data = pd.concat(
    [train_data_frame_pos] * pos_repeat_count + [train_data_frame_neg],
    axis=0,
)

# EVENT_CONSUM_V: keep rows with a value of at most 200 and reshape to long
# form (one row per value) for plotting.
t0 = total_train_data[['EVENT_CONSUM_V', 'mark']]
t0 = t0[t0.EVENT_CONSUM_V <= 200]
all_data = pd.melt(t0, id_vars='mark', var_name="Features",
                   value_name="Values")

# --- Disabled views; uncomment to draw them. --------------------------------
# Violin plot of the melted frame built above:
# sns.violinplot(x="Features", y="Values", hue="mark", data=all_data,
#                split=False, palette='muted')
# plt.show()
#
# Same violin plot for other columns (column, upper bound):
# for column, upper in (('EVENT_FLUX_V', 20000), ('EVENT_VIDEO_FLUX_V', 20)):
#     t0 = total_train_data[[column, 'mark']]
#     t0 = t0[t0[column] <= upper]
#     all_data = pd.melt(t0, id_vars='mark', var_name="Features",
#                        value_name="Values")
#     sns.violinplot(x="Features", y="Values", hue="mark", data=all_data,
#                    split=False, palette='muted')
#     plt.show()
#
# app_use_kuaishou_7 .. app_use_kuaishou_0:
# _plot_count_grid(
#     total_train_data,
#     ['app_use_kuaishou_{}'.format(i) for i in range(7, -1, -1)],
#     ['label{}'.format(i) for i in range(7, -1, -1)],
# )
#
# TAG_PROVINCE_C_0 .. TAG_PROVINCE_C_31:
# sns.set(rc={'figure.figsize': (50, 50)})
# _plot_count_grid(
#     total_train_data,
#     ['TAG_PROVINCE_C_{}'.format(i) for i in range(32)],
#     ['PROVINCE_C_{}'.format(i) for i in range(32)],
# )
# -----------------------------------------------------------------------------

# Active view: the three EVENT_IS_ACCT_C_* columns.
f, (ax1, ax2, ax3) = _plot_count_grid(
    total_train_data,
    ['EVENT_IS_ACCT_C_0', 'EVENT_IS_ACCT_C_1', 'EVENT_IS_ACCT_C_2'],
    ['acct0', 'acct1', 'acct2'],
)

# --- More disabled views. ----------------------------------------------------
# TAG_NETTYPE_C_0 .. TAG_NETTYPE_C_4:
# _plot_count_grid(
#     total_train_data,
#     ['TAG_NETTYPE_C_{}'.format(i) for i in range(5)],
#     ['net type{}'.format(i) for i in range(5)],
# )
#
# TAG_INTIME_C_0 .. TAG_INTIME_C_5:
# _plot_count_grid(
#     total_train_data,
#     ['TAG_INTIME_C_{}'.format(i) for i in range(6)],
#     ['INTIME{}'.format(i) for i in range(1, 7)],
# )
# -----------------------------------------------------------------------------

plt.show()