"""Exploratory analysis of the two pickled sample sets.

Loads ``store_process_0.pkl`` and ``store_process_1.pkl``, labels their rows
with ``mark`` 0 / 1, filters out-of-range values, min-max scales the value
columns and shows histograms of a few selected features.
"""
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from common import constant
from common.database_utils import database_util
from common.log_utils import logFactory

click_client = database_util.get_client()
logger = logFactory("data analysis").log

# Exclusive (lower, upper) bounds a row must satisfy for each value column.
_VALUE_BOUNDS = {
    "EVENT_FLUX_V": (0, 100000),
    "EVENT_CONSUM_V": (0, 200),
    "EVENT_VIDEO_FLUX_V": (0, 50),
}

_APP_USE_COLUMNS = [
    "app_use_tencent",
    "app_use_mangguo",
    "app_use_youku",
    "app_use_iqiyi",
    "app_use_bilibili",
    "app_use_kuaishou",
]

# Rows are kept only when every app-usage column is strictly above this value.
# NOTE(review): presumably this removes a missing-value sentinel (an earlier
# variant of this script compared against -9999) -- confirm with the data owner.
_APP_USE_FLOOR = -1000

_HIST_COLUMNS = [
    "app_use_kuaishou",
    "TAG_INTIME_C",
    "EVENT_CONSUM_V",
    "EVENT_FLUX_V",
    "EVENT_VIDEO_FLUX_V",
]


def _export_table_to_pickle(path):
    """Dump the table named by ``constant.insert_origin_tb_name`` to *path*.

    This is how the ``store_process_*.pkl`` inputs were produced; it is not
    called during a normal analysis run.
    """
    tb_name = constant.insert_origin_tb_name
    sql = f"select * from {tb_name}"
    rows = click_client.execute(sql)
    pd.DataFrame(rows, columns=constant.process_column_names).to_pickle(path)


def _load_labelled_data():
    """Return both pickled frames stacked, with a ``mark`` column of 0 / 1."""
    # NOTE: pickle files can execute arbitrary code when loaded; only read
    # files that were generated locally by this project.
    frames = []
    for mark in (0, 1):
        frame = pd.read_pickle(f"store_process_{mark}.pkl")
        frame["mark"] = mark
        frames.append(frame)
    # ignore_index avoids duplicate row labels coming from the two sources.
    return pd.concat(frames, axis=0, ignore_index=True)


def _filter_value_bounds(frame):
    """Return a copy of *frame* restricted to rows inside ``_VALUE_BOUNDS``."""
    keep = pd.Series(True, index=frame.index)
    for column, (lower, upper) in _VALUE_BOUNDS.items():
        keep &= (frame[column] > lower) & (frame[column] < upper)
    # Explicit copy so later column assignments do not write into a view.
    return frame[keep].copy()


def _min_max_scale(series):
    """Scale *series* linearly to the range [0, 1].

    A constant column has no spread to scale by; it is mapped to zeros
    instead of dividing by zero and producing NaN.
    """
    low = series.min()
    span = series.max() - low
    if not span > 0:
        return series - low
    return (series - low) / span


def _plot_violin(frame, features):
    """Show split violin plots of *features*, comparing the two ``mark`` groups.

    Replaces the block that used to be copy-pasted once per feature.
    """
    long_form = pd.melt(
        frame[[*features, "mark"]],
        id_vars="mark",
        var_name="Features",
        value_name="Values",
    )
    sns.violinplot(
        x="Features",
        y="Values",
        hue="mark",
        data=long_form,
        split=True,
        palette="muted",
    )
    plt.show()


def _plot_correlation(frame):
    """Show an annotated heatmap of the pairwise correlations in *frame*."""
    corr = frame.corr()
    labels = list(corr.index)
    plt.rcParams["axes.unicode_minus"] = False
    plt.figure(figsize=(60, 60))
    ax = plt.subplot(1, 1, 1)
    sns.heatmap(
        corr,
        annot=True,
        cmap="rainbow",
        ax=ax,
        linewidths=0.5,
        annot_kws={"size": 10, "weight": "bold", "color": "blue"},
    )
    ax.set_xticklabels(labels, rotation=35, fontsize=15)
    ax.set_yticklabels(labels, rotation=0, fontsize=15)
    plt.show()


def main():
    """Run the histogram analysis on the filtered, scaled data."""
    bounded = _filter_value_bounds(_load_labelled_data())

    # Scaling happens before the app-usage filter, so the min / max of each
    # column are taken over rows that the filter below may still remove.
    for column in _VALUE_BOUNDS:
        bounded[column] = _min_max_scale(bounded[column])

    usable = (bounded[_APP_USE_COLUMNS] > _APP_USE_FLOOR).all(axis=1)
    bounded.loc[usable, _HIST_COLUMNS].hist(figsize=(12, 10), bins=20)
    plt.tight_layout()
    plt.show()

    # Other views explored with this data set (call the helpers as needed):
    #   _plot_violin(data, [feature]) for EVENT_CATEGORYNAME_C,
    #       EVENT_CHANNEL_BELONGTO_C, TAG_INTIME_C, EVENT_FLUX_C, EVENT_FLUX_V,
    #       EVENT_CONSUM_C, EVENT_CONSUM_V, EVENT_VIDEO_FLUX_C,
    #       EVENT_VIDEO_FLUX_V, TAG_GENDER_C, TAG_AGE_C
    #   _plot_violin(data, _APP_USE_COLUMNS)      # app usage
    #   _plot_violin(data, list(_VALUE_BOUNDS))   # comparison of scaled values
    #   _plot_correlation(data)                   # correlation


if __name__ == "__main__":
    main()