data_analysis.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. import pandas as pd
  2. from common.log_utils import logFactory
  3. from common.database_utils import database_util
  4. from common import constant
  5. import pickle
  6. import numpy as np
  7. import matplotlib.pyplot as plt
  8. import seaborn as sns
  9. click_client = database_util.get_client()
  10. logger = logFactory("data analysis").log
  11. if __name__:
  12. # tb_name = constant.insert_origin_tb_name
  13. # sql = f"select * from {tb_name}"
  14. # all_data = click_client.execute(sql)
  15. # dataf = pd.DataFrame(all_data, columns=constant.process_column_names)
  16. # dataf.to_pickle("store_process_1.pkl")
  17. dataf_0 = pd.read_pickle("store_process_0.pkl")
  18. dataf_1 = pd.read_pickle("store_process_1.pkl")
  19. dataf_0['mark'] = 0
  20. dataf_1['mark'] = 1
  21. all_data = pd.concat([dataf_0, dataf_1], axis=0)
  22. # EVENT_CATEGORYNAME_C
  23. # all_data = pd.melt(all_data[['EVENT_CATEGORYNAME_C', 'mark']], id_vars='mark', var_name="Features", value_name="Values")
  24. # sns.violinplot(
  25. # x="Features",
  26. # y="Values",
  27. # hue="mark",
  28. # data=all_data,
  29. # split=True,
  30. # palette='muted'
  31. # )
  32. # plt.show()
  33. #
  34. # EVENT_CHANNEL_BELONGTO_C
  35. # all_data = pd.melt(all_data[['EVENT_CHANNEL_BELONGTO_C', 'mark']], id_vars='mark', var_name="Features",
  36. # value_name="Values")
  37. # sns.violinplot(
  38. # x="Features",
  39. # y="Values",
  40. # hue="mark",
  41. # data=all_data,
  42. # split=True,
  43. # palette='muted'
  44. # )
  45. # plt.show()
  46. # TAG_INTIME_C
  47. # all_data = pd.melt(all_data[['TAG_INTIME_C', 'mark']], id_vars='mark', var_name="Features",
  48. # value_name="Values")
  49. # sns.violinplot(
  50. # x="Features",
  51. # y="Values",
  52. # hue="mark",
  53. # data=all_data,
  54. # split=True,
  55. # palette='muted'
  56. # )
  57. # plt.show()
  58. # EVENT_FLUX_C
  59. # all_data = pd.melt(all_data[['EVENT_FLUX_C', 'mark']], id_vars='mark', var_name="Features",
  60. # value_name="Values")
  61. # sns.violinplot(
  62. # x="Features",
  63. # y="Values",
  64. # hue="mark",
  65. # data=all_data,
  66. # split=True,
  67. # palette='muted'
  68. # )
  69. # plt.show()
  70. # all_data = all_data[(all_data.EVENT_FLUX_V < 100000) & (all_data.EVENT_FLUX_V > 0)]
  71. # all_data = pd.melt(all_data[['EVENT_FLUX_V', 'mark']], id_vars='mark', var_name="Features",
  72. # value_name="Values")
  73. # sns.violinplot(
  74. # x="Features",
  75. # y="Values",
  76. # hue="mark",
  77. # data=all_data,
  78. # split=True,
  79. # palette='muted'
  80. # )
  81. # plt.show()
  82. # EVENT_CONSUM_C
  83. # all_data = all_data[(all_data.EVENT_FLUX_V < 100000) & (all_data.EVENT_FLUX_V > 0)]
  84. # all_data = pd.melt(all_data[['EVENT_CONSUM_C', 'mark']], id_vars='mark', var_name="Features",
  85. # value_name="Values")
  86. # sns.violinplot(
  87. # x="Features",
  88. # y="Values",
  89. # hue="mark",
  90. # data=all_data,
  91. # split=True,
  92. # palette='muted'
  93. # )
  94. # plt.show()
  95. # all_data = all_data[(all_data.EVENT_CONSUM_V < 200) & (all_data.EVENT_CONSUM_V > 0)]
  96. # all_data = pd.melt(all_data[['EVENT_CONSUM_V', 'mark']], id_vars='mark', var_name="Features",
  97. # value_name="Values")
  98. # sns.violinplot(
  99. # x="Features",
  100. # y="Values",
  101. # hue="mark",
  102. # data=all_data,
  103. # split=True,
  104. # palette='muted'
  105. # )
  106. # plt.show()
  107. # EVENT_VIDEO_FLUX_C/ENVENT_VIDEO_FLUX_V
  108. # # all_data = all_data[(all_data.EVENT_CONSUM_V < 200) & (all_data.EVENT_CONSUM_V > 0)]
  109. # all_data = pd.melt(all_data[['EVENT_VIDEO_FLUX_C', 'mark']], id_vars='mark', var_name="Features",
  110. # value_name="Values")
  111. # sns.violinplot(
  112. # x="Features",
  113. # y="Values",
  114. # hue="mark",
  115. # data=all_data,
  116. # split=True,
  117. # palette='muted'
  118. # )
  119. # plt.show()
  120. # all_data = all_data[(all_data.EVENT_VIDEO_FLUX_V < 50) & (all_data.EVENT_VIDEO_FLUX_V > 0)]
  121. # all_data = pd.melt(all_data[['EVENT_VIDEO_FLUX_V', 'mark']], id_vars='mark', var_name="Features",
  122. # value_name="Values")
  123. # sns.violinplot(
  124. # x="Features",
  125. # y="Values",
  126. # hue="mark",
  127. # data=all_data,
  128. # split=True,
  129. # palette='muted'
  130. # )
  131. # plt.show()
  132. # TAG_GENDER_C
  133. # all_data = pd.melt(all_data[['TAG_GENDER_C', 'mark']], id_vars='mark', var_name="Features",
  134. # value_name="Values")
  135. # sns.violinplot(
  136. # x="Features",
  137. # y="Values",
  138. # hue="mark",
  139. # data=all_data,
  140. # split=True,
  141. # palette='muted'
  142. # )
  143. # plt.show()
  144. # TAG_NETTYPE_C
  145. # all_data = pd.melt(all_data[['TAG_AGE_C', 'mark']], id_vars='mark', var_name="Features",
  146. # value_name="Values")
  147. # sns.violinplot(
  148. # x="Features",
  149. # y="Values",
  150. # hue="mark",
  151. # data=all_data,
  152. # split=True,
  153. # palette='muted'
  154. # )
  155. # plt.show()
  156. # APP USE
  157. # all_data = all_data[
  158. # (all_data.app_use_tencent > -1000) &
  159. # (all_data.app_use_mangguo > -1000) &
  160. # (all_data.app_use_youku > -1000) &
  161. # (all_data.app_use_iqiyi > -1000) &
  162. # (all_data.app_use_bilibili > -1000) &
  163. # (all_data.app_use_kuaishou > -1000)
  164. # ]
  165. #
  166. # all_data = pd.melt(all_data[['app_use_tencent','app_use_mangguo','app_use_youku','app_use_iqiyi','app_use_bilibili','app_use_kuaishou', 'mark']], id_vars='mark', var_name="Features",
  167. # value_name="Values")
  168. # sns.violinplot(
  169. # x="Features",
  170. # y="Values",
  171. # hue="mark",
  172. # data=all_data,
  173. # split=True,
  174. # palette='muted'
  175. # )
  176. # plt.show()
  177. # 对比
  178. # max_min_scaler = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
  179. # all_data = all_data[(all_data.EVENT_FLUX_V < 100000) & (all_data.EVENT_FLUX_V > 0)]
  180. # all_data = all_data[(all_data.EVENT_CONSUM_V < 200) & (all_data.EVENT_CONSUM_V > 0)]
  181. # all_data = all_data[(all_data.EVENT_VIDEO_FLUX_V < 50) & (all_data.EVENT_VIDEO_FLUX_V > 0)]
  182. # all_data['EVENT_CONSUM_V'] = all_data[['EVENT_CONSUM_V']].apply(max_min_scaler)
  183. # all_data['EVENT_FLUX_V'] = all_data[['EVENT_FLUX_V']].apply(max_min_scaler)
  184. # all_data['EVENT_VIDEO_FLUX_V'] = all_data[['EVENT_VIDEO_FLUX_V']].apply(max_min_scaler)
  185. #
  186. # all_data = pd.melt(all_data[['EVENT_CONSUM_V', 'EVENT_FLUX_V', 'EVENT_VIDEO_FLUX_V', 'mark']], id_vars='mark',
  187. # var_name="Features",
  188. # value_name="Values")
  189. # sns.violinplot(
  190. # x="Features",
  191. # y="Values",
  192. # hue="mark",
  193. # data=all_data,
  194. # split=True,
  195. # palette='muted'
  196. # )
  197. # plt.show()
  198. # 相关性
  199. # all_data = all_data[(all_data.EVENT_FLUX_V < 100000) & (all_data.EVENT_FLUX_V > 0)]
  200. # all_data = all_data[(all_data.EVENT_CONSUM_V < 200) & (all_data.EVENT_CONSUM_V > 0)]
  201. # all_data = all_data[(all_data.EVENT_VIDEO_FLUX_V < 50) & (all_data.EVENT_VIDEO_FLUX_V > 0)]
  202. # corr = all_data.corr()
  203. # xticks = list(corr.index)
  204. # yticks = list(corr.index)
  205. # plt.rcParams['axes.unicode_minus'] = False
  206. # plt.figure(figsize=(60, 60))
  207. # ax1 = plt.subplot(1, 1, 1)
  208. # sns.heatmap(corr, annot=True, cmap="rainbow", ax=ax1, linewidths=.5,
  209. # annot_kws={'size': 10, 'weight': 'bold', 'color': 'blue'})
  210. # ax1.set_xticklabels(xticks, rotation=35, fontsize=15)
  211. # ax1.set_yticklabels(yticks, rotation=0, fontsize=15)
  212. #
  213. # plt.show()
  214. all_data = all_data[(all_data.EVENT_FLUX_V < 100000) & (all_data.EVENT_FLUX_V > 0)]
  215. all_data = all_data[(all_data.EVENT_CONSUM_V < 200) & (all_data.EVENT_CONSUM_V > 0)]
  216. all_data = all_data[(all_data.EVENT_VIDEO_FLUX_V < 50) & (all_data.EVENT_VIDEO_FLUX_V > 0)]
  217. max_min_scaler = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
  218. all_data['EVENT_CONSUM_V'] = all_data[['EVENT_CONSUM_V']].apply(max_min_scaler)
  219. all_data['EVENT_FLUX_V'] = all_data[['EVENT_FLUX_V']].apply(max_min_scaler)
  220. all_data['EVENT_VIDEO_FLUX_V'] = all_data[['EVENT_VIDEO_FLUX_V']].apply(max_min_scaler)
  221. all_data = all_data[
  222. (all_data.app_use_tencent > -1000) &
  223. (all_data.app_use_mangguo > -1000) &
  224. (all_data.app_use_youku > -1000) &
  225. (all_data.app_use_iqiyi > -1000) &
  226. (all_data.app_use_bilibili > -1000) &
  227. (all_data.app_use_kuaishou > -1000)
  228. ]
  229. data = all_data[["app_use_kuaishou","TAG_INTIME_C","EVENT_CONSUM_V","EVENT_FLUX_V","EVENT_VIDEO_FLUX_V"]]
  230. data.hist(figsize=(12, 10), bins=20)
  231. plt.tight_layout()
  232. plt.show()
  233. # max_min_scaler = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
  234. # # all_data['EVENT_CONSUM_V'] = all_data[['EVENT_CONSUM_V']].apply(max_min_scaler)
  235. # all_data['EVENT_VIDEO_FLUX_V'] = all_data[['EVENT_VIDEO_FLUX_V']].apply(max_min_scaler)
  236. # drop_columns = ['row_id', 'month']
  237. # all_data.drop(columns=drop_columns, inplace=True)
  238. #
  239. # all_data = all_data[(all_data.EVENT_CONSUM_V < 100) & (all_data.EVENT_CONSUM_V > 0)]
  240. # # all_data = all_data[all_data.app_use_kuaishou != -9999]
  241. #
  242. # all_data = pd.melt(all_data[['EVENT_CONSUM_V', 'mark']], id_vars='mark', var_name="Features", value_name="Values")
  243. # # all_data = pd.melt(all_data[['app_use_kuaishou', 'mark']], id_vars='mark', var_name="Features", value_name="Values")
  244. # # fig, ax = plt.subplots(1, 2, figsize=(15, 5))
  245. # sns.violinplot(
  246. # x="Features",
  247. # y="Values",
  248. # hue="mark",
  249. # data=all_data,
  250. # split=True,
  251. # palette='muted'
  252. # )
  253. # plt.show()
  254. pass