data_analysize.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. import pandas as pd
  2. from common.log_utils import logFactory
  3. from common.database_utils import database_util
  4. from common import constant
  5. import pickle
  6. import numpy as np
  7. import matplotlib.pyplot as plt
  8. import seaborn as sns
  9. click_client = database_util.get_client()
  10. logger = logFactory("data analysis").log
  11. if __name__:
  12. # train_pos_data = pd.read_pickle('./data/pkl/train_pos_all.pkl')
  13. # train_neg_data = pd.read_pickle('./data/pkl/train_neg_all.pkl')
  14. # train_neg_data = pd.concat(
  15. # [train_neg_data, train_neg_data, train_neg_data, train_neg_data, train_neg_data, train_neg_data, train_neg_data,
  16. # train_neg_data, train_neg_data, train_neg_data], axis=0)
  17. #
  18. # train_neg_data['mark'] = 0
  19. # train_pos_data['mark'] = 1
  20. #
  21. # all_data = pd.concat([train_pos_data, train_neg_data], axis=0)
  22. all_data = pd.read_pickle('./data/pkl/analysize_all.pkl')
  23. # first_EVENT_FLUX_V
  24. # second_EVENT_FLUX_V
  25. # temp = pd.melt(all_data[['first_EVENT_FLUX_V', 'second_EVENT_FLUX_V', 'mark']], id_vars='mark', var_name="Features",
  26. # value_name="Values")
  27. # sns.violinplot(
  28. # x="Features",
  29. # y="Values",
  30. # hue="mark",
  31. # data=temp,
  32. # split=True,
  33. # palette='muted'
  34. # )
  35. # plt.show()
  36. #
  37. # # first_EVENT_CONSUM_V
  38. # # second_EVENT_CONSUM_V
  39. # # all_data = all_data[(all_data.EVENT_FLUX_V < 100000) & (all_data.EVENT_FLUX_V > 0)]
  40. # temp = all_data[(all_data.first_EVENT_CONSUM_V < 300) & (all_data.second_EVENT_CONSUM_V < 300)]
  41. # temp = pd.melt(temp[['first_EVENT_CONSUM_V', 'second_EVENT_CONSUM_V', 'mark']], id_vars='mark',
  42. # var_name="Features",
  43. # value_name="Values")
  44. # sns.violinplot(
  45. # x="Features",
  46. # y="Values",
  47. # hue="mark",
  48. # data=temp,
  49. # split=True,
  50. # palette='muted'
  51. # )
  52. # plt.show()
  53. # temp = all_data[(all_data.first_EVENT_VIDEO_FLUX_V < 100) & (all_data.second_EVENT_VIDEO_FLUX_V < 100)]
  54. # temp = pd.melt(temp[['first_EVENT_VIDEO_FLUX_V', 'second_EVENT_VIDEO_FLUX_V', 'mark']], id_vars='mark',
  55. # var_name="Features",
  56. # value_name="Values")
  57. # sns.violinplot(
  58. # x="Features",
  59. # y="Values",
  60. # hue="mark",
  61. # data=temp,
  62. # split=True,
  63. # palette='muted'
  64. # )
  65. # plt.show()
  66. # temp = pd.melt(all_data[['first_MAvg_TOTAL_FLUX_1_3_zs', 'second_MAvg_TOTAL_FLUX_1_3_zs',
  67. # 'first_MPer1_TOTAL_FLUX_zs', 'second_MPer1_TOTAL_FLUX_zs',
  68. # 'mark']], id_vars='mark',
  69. # var_name="Features",
  70. # value_name="Values")
  71. # sns.violinplot(
  72. # x="Features",
  73. # y="Values",
  74. # hue="mark",
  75. # data=temp,
  76. # split=True,
  77. # palette='muted'
  78. # )
  79. # sns.set(rc={'figure.figsize': (30, 30)})
  80. # plt.show()
  81. #
  82. # temp = pd.melt(all_data[[
  83. # 'first_MAvg_TOTAL_VIDEO_FLUX_1_3_zs', 'second_MAvg_TOTAL_VIDEO_FLUX_1_3_zs',
  84. # 'first_MPer1_TOTAL_VIDEO_FLUX_zs', 'second_MPer1_TOTAL_VIDEO_FLUX_zs',
  85. # 'mark']], id_vars='mark',
  86. # var_name="Features",
  87. # value_name="Values")
  88. # sns.violinplot(
  89. # x="Features",
  90. # y="Values",
  91. # hue="mark",
  92. # data=temp,
  93. # split=True,
  94. # palette='muted'
  95. # )
  96. # sns.set(rc={'figure.figsize': (30, 30)})
  97. # plt.show()
  98. #
  99. # temp = pd.melt(all_data[[
  100. # 'first_MAvg_Flow_kuaishou_1_3_zs', 'second_MAvg_Flow_kuaishou_1_3_zs',
  101. # 'first_MPer1_Flow_kuaishou_zs', 'second_MPer1_Flow_kuaishou_zs',
  102. # 'first_Div_kuaishou_vFlux_1_3', 'second_Div_kuaishou_vFlux_1_3',
  103. # 'mark']], id_vars='mark',
  104. # var_name="Features",
  105. # value_name="Values")
  106. # sns.violinplot(
  107. # x="Features",
  108. # y="Values",
  109. # hue="mark",
  110. # data=temp,
  111. # split=True,
  112. # palette='muted'
  113. # )
  114. # sns.set(rc={'figure.figsize': (30, 30)})
  115. # plt.show()
  116. #, 'diff_EVENT_CONSUM_V',
  117. # 'diff_EVENT_VIDEO_FLUX_V', 'diff_kuaishou_use',
  118. temp = all_data[(all_data.diff_EVENT_FLUX_V < 50000) & (all_data.diff_EVENT_FLUX_V < 50000)]
  119. temp = pd.melt(all_data[[
  120. 'diff_EVENT_FLUX_V',
  121. 'mark']], id_vars='mark',
  122. var_name="Features",
  123. value_name="Values")
  124. sns.violinplot(
  125. x="Features",
  126. y="Values",
  127. hue="mark",
  128. data=temp,
  129. split=True,
  130. palette='muted'
  131. )
  132. plt.show()
  133. temp = all_data[(all_data.diff_EVENT_VIDEO_FLUX_V < 50) & (all_data.diff_EVENT_VIDEO_FLUX_V < 50)]
  134. temp = pd.melt(all_data[[
  135. 'diff_EVENT_VIDEO_FLUX_V',
  136. 'mark']], id_vars='mark',
  137. var_name="Features",
  138. value_name="Values")
  139. sns.violinplot(
  140. x="Features",
  141. y="Values",
  142. hue="mark",
  143. data=temp,
  144. split=True,
  145. palette='muted'
  146. )
  147. plt.show()
  148. temp = pd.melt(all_data[[
  149. 'diff_kuaishou_use',
  150. 'mark']], id_vars='mark',
  151. var_name="Features",
  152. value_name="Values")
  153. sns.violinplot(
  154. x="Features",
  155. y="Values",
  156. hue="mark",
  157. data=temp,
  158. split=True,
  159. palette='muted'
  160. )
  161. plt.show()
  162. pass