process_util.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. from common import value_dict
  2. from sklearn.preprocessing import OneHotEncoder
  3. import numpy as np
  4. import pandas as pd
  5. from random import shuffle
  6. from sklearn import metrics
  7. import matplotlib.pyplot as plt
  8. import seaborn as sns
  9. class ProcessUtil:
  10. np_cates = list(np.array([
  11. [x for x in range(0, 3)],
  12. [x for x in range(0, 5)],
  13. [x for x in range(0, 6)],
  14. [x for x in range(0, 6)],
  15. [x for x in range(0, 3)],
  16. [x for x in range(0, 9)],
  17. [x for x in range(0, 8)],
  18. [x for x in range(0, 8)],
  19. [x for x in range(0, 32)],
  20. [x for x in range(0, 8)],
  21. [x for x in range(0, 8)],
  22. [x for x in range(0, 8)],
  23. [x for x in range(0, 8)],
  24. [x for x in range(0, 8)],
  25. [x for x in range(0, 8)],
  26. # [x for x in range(0,7)]
  27. ]))
  28. enc = OneHotEncoder(categories=np_cates)
  29. one_hot_feature_columns = [
  30. "TAG_GENDER_C",
  31. "TAG_NETTYPE_C",
  32. "TAG_AGE_C",
  33. "TAG_INTIME_C",
  34. "EVENT_IS_ACCT_C",
  35. "EVENT_FLUX_C",
  36. "EVENT_CONSUM_C",
  37. "EVENT_VIDEO_FLUX_C",
  38. "TAG_PROVINCE_C",
  39. 'app_use_tencent', 'app_use_mangguo', 'app_use_youku', 'app_use_iqiyi', 'app_use_bilibili',
  40. 'app_use_kuaishou',
  41. # 'EVENT_ORDER_MONTH_C'
  42. ]
  43. one_hot_feature_columns2 = [
  44. "TAG_GENDER_C",
  45. "TAG_NETTYPE_C",
  46. "TAG_AGE_C",
  47. "TAG_INTIME_C",
  48. "EVENT_IS_ACCT_C",
  49. "EVENT_FLUX_C",
  50. "EVENT_CONSUM_C",
  51. "EVENT_VIDEO_FLUX_C",
  52. "TAG_PROVINCE_C",
  53. 'app_use_tencent', 'app_use_mangguo', 'app_use_youku', 'app_use_iqiyi', 'app_use_bilibili']
  54. @classmethod
  55. def convert_province(cls, value):
  56. return value_dict.area_dict[value]
  57. @classmethod
  58. def convert_intime(cls, value):
  59. if value == '1年-2年':
  60. value = 1
  61. elif value == '2年-5年':
  62. value = 2
  63. elif value == '5年-10年':
  64. value = 3
  65. elif value == '10年-20年':
  66. value = 4
  67. elif value == '20年以上':
  68. value = 5
  69. else:
  70. value = 0
  71. return value
  72. @classmethod
  73. def is_number(cls, s):
  74. try:
  75. float(s)
  76. return True
  77. except ValueError:
  78. pass
  79. try:
  80. import unicodedata
  81. unicodedata.numeric(s)
  82. return True
  83. except (TypeError, ValueError):
  84. pass
  85. return False
  86. @classmethod
  87. def convert_flux_value(cls, value):
  88. if value is None or str(value) == "nan":
  89. value = 0
  90. elif not ProcessUtil.is_number(value):
  91. value = 0
  92. return value
  93. @classmethod
  94. def convert_flux(cls, value):
  95. if value == '0-0.5G':
  96. value = 2
  97. elif value == '0.5G-2G':
  98. value = 3
  99. elif value == '2G-5G':
  100. value = 4
  101. elif value == '5G-10G':
  102. value = 5
  103. elif value == '10G-15G':
  104. value = 6
  105. elif value == '15-30G':
  106. value = 7
  107. elif value == '30G以上':
  108. value = 8
  109. elif value == '小于等于0':
  110. value = 1
  111. else:
  112. value = 0
  113. return value
  114. @classmethod
  115. def convert_consume(cls, value):
  116. if value == '10元以下':
  117. value = 1
  118. elif value == '10-20元':
  119. value = 2
  120. elif value == '20-60元':
  121. value = 3
  122. elif value == '60-100元':
  123. value = 4
  124. elif value == '100-150元':
  125. value = 5
  126. elif value == '150-300元':
  127. value = 6
  128. elif value == '300元以上':
  129. value = 7
  130. else:
  131. value = 0
  132. return value
  133. @classmethod
  134. def convert_consume_value(cls, value):
  135. if value is None or str(value) == "nan":
  136. value = 0
  137. elif not ProcessUtil.is_number(value):
  138. value = 0
  139. return value
  140. @classmethod
  141. def convert_event_order_c(cls,value):
  142. if value=='订购1月内':
  143. value=1
  144. elif value == '1-3月':
  145. value=2
  146. elif value == '3-6月':
  147. value=3
  148. elif value=='6-12月':
  149. value=4
  150. elif value=='12-24月':
  151. value=5
  152. elif value=='24月以上':
  153. value=6
  154. else:
  155. value=0
  156. return value
  157. @classmethod
  158. def convert_video_flux(cls, value):
  159. if value == '0-0.5G':
  160. value = 1
  161. elif value == '0.5-2G':
  162. value = 2
  163. elif value == '2-5G':
  164. value = 3
  165. elif value == '5-10G':
  166. value = 4
  167. elif value == '10-15G':
  168. value = 5
  169. elif value == '15-30G':
  170. value = 6
  171. elif value == '30G以上':
  172. value = 7
  173. else:
  174. value = 0
  175. return value
  176. @classmethod
  177. def convert_video_flux_value(cls, value):
  178. if value is None or str(value) == "nan":
  179. value = 0
  180. elif not ProcessUtil.is_number(value):
  181. value = 0
  182. return value
  183. @classmethod
  184. def convert_gender(cls, value):
  185. if value == '男':
  186. value = 0
  187. elif value == '女':
  188. value = 1
  189. else:
  190. value = 2
  191. return value
  192. @classmethod
  193. def convert_nettype(cls, value):
  194. if value == '2G':
  195. value = 1
  196. elif value == '3G':
  197. value = 2
  198. elif value == '4G':
  199. value = 3
  200. elif value == '5G':
  201. value = 4
  202. else:
  203. value = 0
  204. return value
  205. @classmethod
  206. def convert_age(cls, value):
  207. if value == '70前':
  208. value = 1
  209. elif value == '70后':
  210. value = 2
  211. elif value == '80后':
  212. value = 3
  213. elif value == '90后':
  214. value = 4
  215. elif value == '00后':
  216. value = 5
  217. else:
  218. value = 0
  219. return value
  220. @classmethod
  221. def convert_acct(cls, value):
  222. if value == '已出账':
  223. value = 0
  224. elif value == '未出账':
  225. value = 1
  226. else:
  227. value = 2
  228. return value
  229. @classmethod
  230. def convert_appuse(cls, df):
  231. app_use_columns = ['app_use_tencent', 'app_use_mangguo', 'app_use_youku', 'app_use_iqiyi', 'app_use_bilibili',
  232. 'app_use_kuaishou']
  233. df[app_use_columns] = df.apply(ProcessUtil.convert_app_use_info, axis=1, result_type="expand")
  234. @classmethod
  235. def convert_app_use_info(cls, value):
  236. value = value.tolist()[-9]
  237. tencent_app_use = 7
  238. kuaishou_app_use = 7
  239. mangguo_app_use = 7
  240. youku_app_use = 7
  241. iqiyi_app_use = 7
  242. bilibili_app_use = 7
  243. if str(value).lower() == 'nan':
  244. pass
  245. else:
  246. use_list = value
  247. for apptype_range in use_list:
  248. if len(apptype_range.split("_")) > 1:
  249. apptype = apptype_range.split("_")[0]
  250. apprange = apptype_range.split("_")[1]
  251. if "腾讯视频" in apptype:
  252. tencent_app_use = ProcessUtil.get_range_value(apprange)
  253. if "哔哩哔哩" in apptype:
  254. bilibili_app_use = ProcessUtil.get_range_value(apprange)
  255. if "爱奇艺" in apptype:
  256. iqiyi_app_use = ProcessUtil.get_range_value(apprange)
  257. if "快手" in apptype:
  258. kuaishou_app_use = ProcessUtil.get_range_value(apprange)
  259. if "优酷" in apptype:
  260. youku_app_use = ProcessUtil.get_range_value(apprange)
  261. if "芒果" in apptype:
  262. mangguo_app_use = ProcessUtil.get_range_value(apprange)
  263. return int(tencent_app_use), int(mangguo_app_use), int(youku_app_use), int(iqiyi_app_use), int(
  264. bilibili_app_use), int(kuaishou_app_use)
  265. @classmethod
  266. def get_range_value(cls, range_value):
  267. if range_value == '0.1G-0.5G':
  268. value = 1
  269. elif range_value == '0.5G-2G':
  270. value = 2
  271. elif range_value == '2G-5G':
  272. value = 3
  273. elif range_value == '5G-10G':
  274. value = 4
  275. elif range_value == '10G-20G':
  276. value = 5
  277. elif range_value == '20G以上':
  278. value = 6
  279. else:
  280. value = 0
  281. return value
  282. @classmethod
  283. def convert_onehot(cls, df):
  284. ProcessUtil.enc.fit(df[ProcessUtil.one_hot_feature_columns])
  285. after_onehot_features = ProcessUtil.enc.get_feature_names(ProcessUtil.one_hot_feature_columns)
  286. one_hot_dataframe = pd.DataFrame(
  287. ProcessUtil.enc.fit_transform(df[ProcessUtil.one_hot_feature_columns]).toarray().astype("int"),
  288. columns=after_onehot_features)
  289. df.drop(columns=ProcessUtil.one_hot_feature_columns2, inplace=True)
  290. df = pd.concat([df, one_hot_dataframe], axis=1)
  291. return df
  292. @classmethod
  293. def partition_preserve_order(cls, list_in, n):
  294. indices = list(range(len(list_in)))
  295. shuffle(indices)
  296. index_partitions = [sorted(indices[i::n]) for i in range(n)]
  297. return [[list_in[i] for i in index_partition]
  298. for index_partition in index_partitions]
  299. @classmethod
  300. def gen_train_tuple(cls, pos_row_ids, neg_row_ids, batch_num_pos):
  301. result = []
  302. for i, e, in enumerate(neg_row_ids):
  303. pos_index = i % batch_num_pos
  304. result.append((pos_row_ids[pos_index], neg_row_ids[i]))
  305. return result
  306. @classmethod
  307. def draw_roc_auc(cls, y_label, y_test):
  308. # ROC曲线绘制
  309. fpr, tpr, thresholds = metrics.roc_curve(y_label, y_test)
  310. ##计算曲线下面积
  311. roc_auc = metrics.auc(fpr, tpr)
  312. ##绘图
  313. plt.clf()
  314. plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
  315. plt.plot([0, 1], [0, 1], 'k--')
  316. plt.xlim([0.0, 1.0])
  317. plt.ylim([0.0, 1.0])
  318. plt.legend(loc="lower right")
  319. plt.show()
  320. @classmethod
  321. def draw_confusion_matrix(cls, y_label, y_test):
  322. # 画出混淆矩阵
  323. confusion_data = metrics.confusion_matrix(y_label, y_test)
  324. print(confusion_data)
  325. sns.heatmap(confusion_data, cmap="Greens", annot=True)
  326. plt.xlabel("Predicted labels")
  327. plt.ylabel("True labels")
  328. plt.tight_layout()
  329. plt.show()