data_analysis_new.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. from common.log_utils import logFactory
  2. from common.database_utils import database_util
  3. from common import constant
  4. import pandas as pd
  5. from tqdm import tqdm
  6. from random import shuffle
  7. from sklearn.model_selection import train_test_split
  8. import lightgbm as lgb
  9. from sklearn import metrics
  10. import matplotlib.pyplot as plt
  11. import seaborn as sns
  # Number of groups (batches) for each class.
  # batch_num_pos is the modulus gen_train_tuple uses to cycle through the
  # positive batches; batch_num_neg is not read by any code visible in this
  # part of the file.
  batch_num_pos = 1
  batch_num_neg = 13
  # Compute the row_ids belonging to each batch from the amount of data
  # assigned to each batch.
  def partition_preserve_order(list_in, n):
      """Randomly split ``list_in`` into ``n`` partitions, keeping input order.

      Positions of ``list_in`` are shuffled with the module-level
      ``random.shuffle`` and dealt round-robin into ``n`` groups; each group
      is then sorted by position, so elements inside a partition appear in
      the same relative order as in ``list_in``.

      Args:
          list_in: Indexable sequence supporting ``len()``.
          n: Number of partitions to produce.

      Returns:
          A list of ``n`` lists whose sizes differ by at most one and which
          together contain every element of ``list_in`` exactly once.
          Returns ``[]`` when ``n <= 0``.

      Note:
          Membership of each partition is random; call ``random.seed``
          beforehand if a reproducible split is needed.
      """
      indices = list(range(len(list_in)))
      shuffle(indices)
      # Stride slicing deals the shuffled positions round-robin; sorting each
      # slice restores the original ordering inside the partition.
      index_partitions = [sorted(indices[i::n]) for i in range(n)]
      return [
          [list_in[i] for i in index_partition]
          for index_partition in index_partitions
      ]
  def gen_train_tuple(pos_row_ids, neg_row_ids):
      """Pair every negative batch with a positive batch chosen round-robin.

      The i-th entry of ``neg_row_ids`` is paired with
      ``pos_row_ids[i % batch_num_pos]``, where ``batch_num_pos`` is the
      module-level constant (not ``len(pos_row_ids)``).

      Args:
          pos_row_ids: Indexable collection of positive batches; only the
              first ``batch_num_pos`` entries are ever read.
          neg_row_ids: Iterable of negative batches.

      Returns:
          A list of ``(positive_entry, negative_entry)`` tuples, one per
          entry of ``neg_row_ids`` and in the same order.

      Raises:
          IndexError: If ``pos_row_ids`` has fewer than ``batch_num_pos``
              entries and a missing index is reached.
          ZeroDivisionError: If ``batch_num_pos`` is 0 and ``neg_row_ids``
              is non-empty.
      """
      return [
          (pos_row_ids[i % batch_num_pos], neg_entry)
          for i, neg_entry in enumerate(neg_row_ids)
      ]
  # Logger obtained from the project's logFactory helper.
  logger = logFactory("local data analysis new").log

  # NOTE(review): pd.read_pickle unpickles arbitrary objects; only load
  # files that come from a trusted source.
  total_pos = pd.read_pickle("data_pos_0218_06.pkl")
  total_neg = pd.read_pickle("data_neg_part_1_0218_06.pkl")

  # These are aliases, not copies: adding the 'mark' column below also adds
  # it to total_pos / total_neg.
  train_data_frame_pos = total_pos
  train_data_frame_neg = total_neg
  # NOTE(review): the "pos" frame is labelled 0 and the "neg" frame 1 --
  # confirm this is the intended encoding.
  train_data_frame_pos['mark'] = 0
  train_data_frame_neg['mark'] = 1

  # Stack POS_REPEAT copies of the positive frame on top of the negative
  # frame (presumably to offset class imbalance in the plots -- TODO confirm).
  # The original row index is kept, so index labels repeat.
  POS_REPEAT = 15
  total_train_data = pd.concat(
      [train_data_frame_pos] * POS_REPEAT + [train_data_frame_neg],
      axis=0,
  )

  # EVENT_CONSUM_V
  # Keep rows with EVENT_CONSUM_V <= 200 and reshape to long format
  # (columns: mark, Features, Values). all_data is not read by any active
  # code visible in this part of the file.
  t0 = total_train_data[['EVENT_CONSUM_V', 'mark']]
  t0 = t0[t0.EVENT_CONSUM_V <= 200]
  all_data = pd.melt(t0, id_vars='mark', var_name="Features",
                     value_name="Values")
  41. # sns.violinplot(
  42. # x="Features",
  43. # y="Values",
  44. # hue="mark",
  45. # data=all_data,
  46. # split=False,
  47. # palette='muted'
  48. # )
  49. # plt.show()
  50. #
  51. # # EVENT_FLUX_V
  52. # t0 = total_train_data[['EVENT_FLUX_V', 'mark']]
  53. # t0 = t0[t0.EVENT_FLUX_V <= 20000]
  54. # all_data = pd.melt(t0, id_vars='mark', var_name="Features",
  55. # value_name="Values")
  56. # sns.violinplot(
  57. # x="Features",
  58. # y="Values",
  59. # hue="mark",
  60. # data=all_data,
  61. # split=False,
  62. # palette='muted'
  63. # )
  64. # plt.show()
  65. #
  66. # # EVENT_VIDEO_FLUX_V
  67. # t0 = total_train_data[['EVENT_VIDEO_FLUX_V', 'mark']]
  68. # t0 = t0[t0.EVENT_VIDEO_FLUX_V <= 20]
  69. # all_data = pd.melt(t0, id_vars='mark', var_name="Features",
  70. # value_name="Values")
  71. # sns.violinplot(
  72. # x="Features",
  73. # y="Values",
  74. # hue="mark",
  75. # data=all_data,
  76. # split=False,
  77. # palette='muted'
  78. # )
  79. # plt.show()
  80. #
  81. # f, [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8] = plt.subplots(1, 8, figsize=(20, 5))
  82. # sns.countplot(x='app_use_kuaishou_7', hue='mark', data=total_train_data, ax=ax1)
  83. # sns.countplot(x='app_use_kuaishou_6', hue='mark', data=total_train_data, ax=ax2)
  84. # sns.countplot(x='app_use_kuaishou_5', hue='mark', data=total_train_data, ax=ax3)
  85. # sns.countplot(x='app_use_kuaishou_4', hue='mark', data=total_train_data, ax=ax4)
  86. # sns.countplot(x='app_use_kuaishou_3', hue='mark', data=total_train_data, ax=ax5)
  87. # sns.countplot(x='app_use_kuaishou_2', hue='mark', data=total_train_data, ax=ax6)
  88. # sns.countplot(x='app_use_kuaishou_1', hue='mark', data=total_train_data, ax=ax7)
  89. # sns.countplot(x='app_use_kuaishou_0', hue='mark', data=total_train_data, ax=ax8)
  90. # ax1.set_title('label7')
  91. # ax2.set_title('label6')
  92. # ax3.set_title('label5')
  93. # ax4.set_title('label4')
  94. # ax5.set_title('label3')
  95. # ax6.set_title('label2')
  96. # ax7.set_title('label1')
  97. # ax8.set_title('label0')
  98. # sns.set(rc={'figure.figsize': (50, 50)})
  99. # f, [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9, ax10,
  100. # ax11, ax12, ax13, ax14, ax15, ax16, ax17, ax18, ax19, ax20,
  101. # ax21, ax22, ax23, ax24, ax25, ax26, ax27, ax28, ax29, ax30, ax31, ax32] = plt.subplots(1, 32, figsize=(20, 5))
  102. # sns.countplot(x="TAG_PROVINCE_C_0", hue='mark', data=total_train_data, ax=ax1)
  103. # sns.countplot(x="TAG_PROVINCE_C_1", hue='mark', data=total_train_data, ax=ax2)
  104. # sns.countplot(x="TAG_PROVINCE_C_2", hue='mark', data=total_train_data, ax=ax3)
  105. # sns.countplot(x="TAG_PROVINCE_C_3", hue='mark', data=total_train_data, ax=ax4)
  106. # sns.countplot(x="TAG_PROVINCE_C_4", hue='mark', data=total_train_data, ax=ax5)
  107. # sns.countplot(x="TAG_PROVINCE_C_5", hue='mark', data=total_train_data, ax=ax6)
  108. # sns.countplot(x="TAG_PROVINCE_C_6", hue='mark', data=total_train_data, ax=ax7)
  109. # sns.countplot(x="TAG_PROVINCE_C_7", hue='mark', data=total_train_data, ax=ax8)
  110. # sns.countplot(x="TAG_PROVINCE_C_8", hue='mark', data=total_train_data, ax=ax9)
  111. # sns.countplot(x="TAG_PROVINCE_C_9", hue='mark', data=total_train_data, ax=ax10)
  112. # sns.countplot(x="TAG_PROVINCE_C_10", hue='mark', data=total_train_data, ax=ax11)
  113. # sns.countplot(x="TAG_PROVINCE_C_11", hue='mark', data=total_train_data, ax=ax12)
  114. # sns.countplot(x="TAG_PROVINCE_C_12", hue='mark', data=total_train_data, ax=ax13)
  115. # sns.countplot(x="TAG_PROVINCE_C_13", hue='mark', data=total_train_data, ax=ax14)
  116. # sns.countplot(x="TAG_PROVINCE_C_14", hue='mark', data=total_train_data, ax=ax15)
  117. # sns.countplot(x="TAG_PROVINCE_C_15", hue='mark', data=total_train_data, ax=ax16)
  118. # sns.countplot(x="TAG_PROVINCE_C_16", hue='mark', data=total_train_data, ax=ax17)
  119. # sns.countplot(x="TAG_PROVINCE_C_17", hue='mark', data=total_train_data, ax=ax18)
  120. # sns.countplot(x="TAG_PROVINCE_C_18", hue='mark', data=total_train_data, ax=ax19)
  121. # sns.countplot(x="TAG_PROVINCE_C_19", hue='mark', data=total_train_data, ax=ax20)
  122. # sns.countplot(x="TAG_PROVINCE_C_20", hue='mark', data=total_train_data, ax=ax21)
  123. # sns.countplot(x="TAG_PROVINCE_C_21", hue='mark', data=total_train_data, ax=ax22)
  124. # sns.countplot(x="TAG_PROVINCE_C_22", hue='mark', data=total_train_data, ax=ax23)
  125. # sns.countplot(x="TAG_PROVINCE_C_23", hue='mark', data=total_train_data, ax=ax24)
  126. # sns.countplot(x="TAG_PROVINCE_C_24", hue='mark', data=total_train_data, ax=ax25)
  127. # sns.countplot(x="TAG_PROVINCE_C_25", hue='mark', data=total_train_data, ax=ax26)
  128. # sns.countplot(x="TAG_PROVINCE_C_26", hue='mark', data=total_train_data, ax=ax27)
  129. # sns.countplot(x="TAG_PROVINCE_C_27", hue='mark', data=total_train_data, ax=ax28)
  130. # sns.countplot(x="TAG_PROVINCE_C_28", hue='mark', data=total_train_data, ax=ax29)
  131. # sns.countplot(x="TAG_PROVINCE_C_29", hue='mark', data=total_train_data, ax=ax30)
  132. # sns.countplot(x="TAG_PROVINCE_C_30", hue='mark', data=total_train_data, ax=ax31)
  133. # sns.countplot(x="TAG_PROVINCE_C_31", hue='mark', data=total_train_data, ax=ax32)
  134. # ax1.set_title("PROVINCE_C_0")
  135. # ax2.set_title("PROVINCE_C_1")
  136. # ax3.set_title("PROVINCE_C_2")
  137. # ax4.set_title("PROVINCE_C_3")
  138. # ax5.set_title("PROVINCE_C_4")
  139. # ax6.set_title("PROVINCE_C_5")
  140. # ax7.set_title("PROVINCE_C_6")
  141. # ax8.set_title("PROVINCE_C_7")
  142. # ax9.set_title("PROVINCE_C_8")
  143. # ax10.set_title("PROVINCE_C_9")
  144. # ax11.set_title("PROVINCE_C_10")
  145. # ax12.set_title("PROVINCE_C_11")
  146. # ax13.set_title("PROVINCE_C_12")
  147. # ax14.set_title("PROVINCE_C_13")
  148. # ax15.set_title("PROVINCE_C_14")
  149. # ax16.set_title("PROVINCE_C_15")
  150. # ax17.set_title("PROVINCE_C_16")
  151. # ax18.set_title("PROVINCE_C_17")
  152. # ax19.set_title("PROVINCE_C_18")
  153. # ax20.set_title("PROVINCE_C_19")
  154. # ax21.set_title("PROVINCE_C_20")
  155. # ax22.set_title("PROVINCE_C_21")
  156. # ax23.set_title("PROVINCE_C_22")
  157. # ax24.set_title("PROVINCE_C_23")
  158. # ax25.set_title("PROVINCE_C_24")
  159. # ax26.set_title("PROVINCE_C_25")
  160. # ax27.set_title("PROVINCE_C_26")
  161. # ax28.set_title("PROVINCE_C_27")
  162. # ax29.set_title("PROVINCE_C_28")
  163. # ax30.set_title("PROVINCE_C_29")
  164. # ax31.set_title("PROVINCE_C_30")
  165. # ax32.set_title("PROVINCE_C_31")
  # One count plot per EVENT_IS_ACCT_C_<k> column (k = 0..2), side by side,
  # with bars split by the 'mark' label.
  f, [ax1, ax2, ax3] = plt.subplots(1, 3, figsize=(20, 5))
  for idx, ax in enumerate((ax1, ax2, ax3)):
      sns.countplot(
          x='EVENT_IS_ACCT_C_{}'.format(idx),
          hue='mark',
          data=total_train_data,
          ax=ax,
      )
      ax.set_title('acct{}'.format(idx))
  173. #
  174. # f, [ax1, ax2, ax3, ax4, ax5] = plt.subplots(1, 5, figsize=(20, 5))
  175. # sns.countplot(x='TAG_NETTYPE_C_0', hue='mark', data=total_train_data, ax=ax1)
  176. # sns.countplot(x='TAG_NETTYPE_C_1', hue='mark', data=total_train_data, ax=ax2)
  177. # sns.countplot(x='TAG_NETTYPE_C_2', hue='mark', data=total_train_data, ax=ax3)
  178. # sns.countplot(x='TAG_NETTYPE_C_3', hue='mark', data=total_train_data, ax=ax4)
  179. # sns.countplot(x='TAG_NETTYPE_C_4', hue='mark', data=total_train_data, ax=ax5)
  180. # ax1.set_title('net type0')
  181. # ax2.set_title('net type1')
  182. # ax3.set_title('net type2')
  183. # ax4.set_title('net type3')
  184. # ax5.set_title('net type4')
  185. #
  186. # f, [ax1, ax2, ax3, ax4, ax5, ax6] = plt.subplots(1, 6, figsize=(20, 5))
  187. # sns.countplot(x='TAG_INTIME_C_0', hue='mark', data=total_train_data, ax=ax1)
  188. # sns.countplot(x='TAG_INTIME_C_1', hue='mark', data=total_train_data, ax=ax2)
  189. # sns.countplot(x='TAG_INTIME_C_2', hue='mark', data=total_train_data, ax=ax3)
  190. # sns.countplot(x='TAG_INTIME_C_3', hue='mark', data=total_train_data, ax=ax4)
  191. # sns.countplot(x='TAG_INTIME_C_4', hue='mark', data=total_train_data, ax=ax5)
  192. # sns.countplot(x='TAG_INTIME_C_5', hue='mark', data=total_train_data, ax=ax6)
  193. # ax1.set_title('INTIME1')
  194. # ax2.set_title('INTIME2')
  195. # ax3.set_title('INTIME3')
  196. # ax4.set_title('INTIME4')
  197. # ax5.set_title('INTIME5')
  198. # ax6.set_title('INTIME6')
  # Display the figure(s) built above.
  plt.show()