123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274 |
"""Exploratory plots comparing two pickled samples, labelled ``mark`` 0 and 1.

Run as a script to draw histograms of a few columns after range filtering
and min-max scaling.  Importing the module only creates ``click_client`` and
``logger``; no data is read and no figure is shown.
"""
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from common import constant
from common.database_utils import database_util
from common.log_utils import logFactory

click_client = database_util.get_client()
logger = logFactory("data analysis").log

# Pickled frames to compare; the key is the value written to the "mark" column.
PICKLE_BY_MARK = {0: "store_process_0.pkl", 1: "store_process_1.pkl"}

# Exclusive (low, high) bounds a row must satisfy in each column to be kept.
VALUE_BOUNDS = {
    "EVENT_FLUX_V": (0, 100000),
    "EVENT_CONSUM_V": (0, 200),
    "EVENT_VIDEO_FLUX_V": (0, 50),
}

APP_USE_COLUMNS = [
    "app_use_tencent",
    "app_use_mangguo",
    "app_use_youku",
    "app_use_iqiyi",
    "app_use_bilibili",
    "app_use_kuaishou",
]
# Rows are kept only when every app_use_* value is strictly above this.
# NOTE(review): presumably screens out placeholder values (an earlier draft
# tested ``app_use_kuaishou != -9999``) -- confirm against the data source.
APP_USE_FLOOR = -1000

HIST_COLUMNS = [
    "app_use_kuaishou",
    "TAG_INTIME_C",
    "EVENT_CONSUM_V",
    "EVENT_FLUX_V",
    "EVENT_VIDEO_FLUX_V",
]


def _snapshot_table_to_pickle(pickle_path, table_name=None):
    """Dump ``select *`` of a table to *pickle_path* as a DataFrame.

    Defaults to ``constant.insert_origin_tb_name``; an earlier draft of this
    script used exactly that to write ``store_process_1.pkl``.  The table
    name is interpolated into the SQL text, so pass trusted names only.
    """
    if table_name is None:
        table_name = constant.insert_origin_tb_name
    rows = click_client.execute(f"select * from {table_name}")
    pd.DataFrame(rows, columns=constant.process_column_names).to_pickle(pickle_path)


def _load_marked_frames():
    """Read every pickle in ``PICKLE_BY_MARK``, tag it, and stack the rows."""
    frames = []
    for mark, path in PICKLE_BY_MARK.items():
        # Unpickling can execute arbitrary code: only load files this
        # project produced itself.
        frame = pd.read_pickle(path)
        frame["mark"] = mark
        frames.append(frame)
    # ignore_index avoids duplicate row labels across the two sources.
    return pd.concat(frames, axis=0, ignore_index=True)


def _keep_within(frame, bounds):
    """Return a copy of the rows lying strictly inside every (low, high) bound.

    A fresh copy is returned so callers can assign columns on the result
    without writing through a slice of the input frame.
    """
    for column, (low, high) in bounds.items():
        values = frame[column]
        frame = frame[(values < high) & (values > low)]
    return frame.copy()


def _min_max_scale(values):
    """Rescale a Series linearly onto [0, 1].

    A constant Series maps to zeros instead of NaN (0 / 0), and an empty
    Series is returned empty.
    """
    low = values.min()
    span = values.max() - low
    shifted = values - low
    if not span > 0:  # constant column, or NaN span from an empty column
        return shifted.astype(float)
    return shifted / span


def _keep_valid_app_use(frame):
    """Keep rows whose app_use_* columns are all above ``APP_USE_FLOOR``."""
    valid = (frame[APP_USE_COLUMNS] > APP_USE_FLOOR).all(axis=1)
    return frame[valid]


def _violin_by_mark(frame, features):
    """Show split violin plots of *features*, one half per ``mark`` value.

    *frame* is not modified, so several views can be drawn from the same
    data.  Feature sets explored in earlier drafts of this script:

    * one at a time: EVENT_CATEGORYNAME_C, EVENT_CHANNEL_BELONGTO_C,
      TAG_INTIME_C, EVENT_FLUX_C, EVENT_CONSUM_C, EVENT_VIDEO_FLUX_C,
      TAG_GENDER_C, TAG_AGE_C (the last one sat under a "TAG_NETTYPE_C"
      heading);
    * EVENT_FLUX_V, EVENT_CONSUM_V and EVENT_VIDEO_FLUX_V, each after its
      own ``VALUE_BOUNDS`` filter (one draft used 100 as the upper bound
      for EVENT_CONSUM_V);
    * the six ``APP_USE_COLUMNS`` after ``_keep_valid_app_use``;
    * comparison view: the three ``*_V`` columns together, after
      ``_keep_within`` and ``_min_max_scale``.
    """
    long_form = pd.melt(
        frame[list(features) + ["mark"]],
        id_vars="mark",
        var_name="Features",
        value_name="Values",
    )
    sns.violinplot(
        x="Features",
        y="Values",
        hue="mark",
        data=long_form,
        split=True,
        palette="muted",
    )
    plt.show()


def _correlation_heatmap(frame):
    """Show an annotated heatmap of pairwise correlations of numeric columns.

    The earlier draft applied ``_keep_within(frame, VALUE_BOUNDS)`` before
    calling this.  Non-numeric columns are excluded explicitly so the result
    does not depend on the pandas version's ``numeric_only`` default.
    """
    corr = frame.select_dtypes(include=np.number).corr()
    labels = list(corr.index)
    plt.rcParams["axes.unicode_minus"] = False
    plt.figure(figsize=(60, 60))
    axes = plt.subplot(1, 1, 1)
    sns.heatmap(
        corr,
        annot=True,
        cmap="rainbow",
        ax=axes,
        linewidths=.5,
        annot_kws={"size": 10, "weight": "bold", "color": "blue"},
    )
    axes.set_xticklabels(labels, rotation=35, fontsize=15)
    axes.set_yticklabels(labels, rotation=0, fontsize=15)
    plt.show()


def main():
    """Draw histograms of ``HIST_COLUMNS`` for the combined, filtered data."""
    frame = _keep_within(_load_marked_frames(), VALUE_BOUNDS)
    # Scale before the app_use filter so min/max are taken over every
    # range-filtered row, matching the original statement order.
    for column in VALUE_BOUNDS:
        frame[column] = _min_max_scale(frame[column])
    frame = _keep_valid_app_use(frame)

    frame[HIST_COLUMNS].hist(figsize=(12, 10), bins=20)
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    main()
|