example_data.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. import pandas as pd
  2. from common.log_utils import logFactory
  3. from common import constant
  4. from common import value_dict
  5. from common.database_utils import database_util
  6. from tqdm import tqdm
  7. from sklearn.preprocessing import OneHotEncoder
  8. import numpy as np
  9. import math
  10. click_client = database_util.get_client()
  11. logger = logFactory("preprocess data").log
  12. column_names = [
  13. "THEMONTH_V",
  14. "uuid",
  15. "TAG_AREA_C",
  16. "EVENT_CPNAME_C",
  17. "EVENT_SPNAME_C",
  18. "EVENT_CATEGORYNAME_C",
  19. "EVENT_CHANNEL_C",
  20. "EVENT_CHANNEL_BELONGTO_C",
  21. "EVENT_ORDER_MONTH_C",
  22. "EVENT_ORDER_MONTH_V",
  23. "EVENT_CANCEL_DIFF_C",
  24. "EVENT_CANCEL_DIFF_V",
  25. "EVENT_ORDER_PRICE_C",
  26. "EVENT_ORDER_PRICE_V",
  27. "EVENT_ORDER_FLOWP_C",
  28. "EVENT_ORDER_FLOWP_V",
  29. "EVENT_ORDER_STATE_C",
  30. "EVENT_ORDER_SUMMONTH_C",
  31. "EVENT_ORDER_SUMMONTH_V",
  32. "TAG_INTIME_C",
  33. "TAG_INTIME_V",
  34. "TAG_PACKAGE_C",
  35. "TAG_PACKAGE_PIC_C",
  36. "EVENT_FLUX_C",
  37. "EVENT_FLUX_V",
  38. "EVENT_CONSUM_C",
  39. "EVENT_CONSUM_V",
  40. "EVENT_PHONETYPE_C",
  41. "EVENT_VIDEO_FLUX_C",
  42. "EVENT_VIDEO_FLUX_V",
  43. "EVENT_IS_ACCT_C",
  44. "TAG_GENDER_C",
  45. "TAG_PROVINCE_C",
  46. "TAG_NETTYPE_C",
  47. "TAG_AGE_C",
  48. "EVENT_APP_USE.C",
  49. "EVENT_APP_USE.V",
  50. "EVENT_USER_OSTATE_C"
  51. ]
  52. drop_columns = [
  53. "THEMONTH_V",
  54. "uuid",
  55. "EVENT_CPNAME_C",
  56. "EVENT_SPNAME_C",
  57. "EVENT_CATEGORYNAME_C",
  58. "EVENT_CHANNEL_C",
  59. "EVENT_CHANNEL_BELONGTO_C",
  60. "EVENT_ORDER_MONTH_C",
  61. "EVENT_ORDER_MONTH_V",
  62. "EVENT_CANCEL_DIFF_C",
  63. "EVENT_CANCEL_DIFF_V",
  64. "EVENT_ORDER_PRICE_C",
  65. "EVENT_ORDER_PRICE_V",
  66. "EVENT_ORDER_FLOWP_C",
  67. "EVENT_ORDER_FLOWP_V",
  68. "EVENT_ORDER_STATE_C",
  69. "EVENT_ORDER_SUMMONTH_C",
  70. "EVENT_ORDER_SUMMONTH_V",
  71. "TAG_INTIME_V",
  72. "TAG_PACKAGE_C",
  73. "TAG_PACKAGE_PIC_C",
  74. "EVENT_PHONETYPE_C",
  75. "TAG_AREA_C",
  76. "EVENT_APP_USE.V",
  77. "I_appuse"
  78. ]
  79. fetch_size = 50000
  80. def convert_product_type(value):
  81. if value == "无限畅视":
  82. value = 10
  83. elif value == "电商卡":
  84. value = 9
  85. elif value == "畅视后向":
  86. value = 8
  87. elif value == "畅视流量":
  88. value = 7
  89. elif value == "沃畅视":
  90. value = 6
  91. elif value == "畅视随心选":
  92. value = 5
  93. elif value == "沃视频":
  94. value = 4
  95. elif value == "小娱八斗":
  96. value = 3
  97. elif value == "畅视互娱":
  98. value = 2
  99. elif value == "其它":
  100. value = 1
  101. else:
  102. value = 0
  103. return value
  104. # 渠道大类
  105. def convert_channel(value):
  106. if value == '互联网渠道':
  107. value = 6
  108. elif value == '集团渠道':
  109. value = 5
  110. elif value == '自有线上渠道':
  111. value = 4
  112. elif value == '省分线下渠道':
  113. value = 3
  114. elif value == '社会代理和其他渠道':
  115. value = 2
  116. elif value == '省分线上渠道':
  117. value = 1
  118. else:
  119. value = 0
  120. return value
  121. # 订购月份
  122. def convert_order_month(value):
  123. if value == '订购1月内':
  124. value = 1
  125. elif value == '1-3月':
  126. value = 2
  127. elif value == '3-6月':
  128. value = 3
  129. elif value == '6-12月':
  130. value = 4
  131. elif value == '12-24月':
  132. value = 5
  133. elif value == '24月以上':
  134. value = 6
  135. else:
  136. value = 0
  137. return value
  138. # 退订距今时长
  139. def convert_cancel_diff(value):
  140. if value == '退订1月内':
  141. value = 1
  142. elif value == '退订1-3月':
  143. value = 2
  144. elif value == '退订3-6月':
  145. value = 3
  146. elif value == '退订6-12月':
  147. value = 4
  148. elif value == '退订1年以上':
  149. value = 5
  150. elif value == '未退订':
  151. value = 6
  152. else:
  153. value = 0
  154. return value
  155. # 订购价格
  156. def convert_order_price(value):
  157. if value == '0元':
  158. value = 1
  159. elif value == '1-3元':
  160. value = 2
  161. elif value == '3-8元':
  162. value = 3
  163. elif value == '8-15元':
  164. value = 4
  165. elif value == '15-20元':
  166. value = 5
  167. elif value == '20以上':
  168. value = 6
  169. else:
  170. value = 0
  171. return value
  172. # 流量使用比
  173. def convert_order_flow(value):
  174. if str(value) in ['5%以下']:
  175. value = 1
  176. elif value == '5-20%':
  177. value = 2
  178. elif value == '20-50%':
  179. value = 3
  180. elif value == '50-80%':
  181. value = 4
  182. elif value == '80%以上':
  183. value = 5
  184. else:
  185. value = 0
  186. return value
  187. # 产品订购总时长
  188. def convert_order_sum_month(value):
  189. if value == '0-1月':
  190. value = 1
  191. elif value == '2-6月':
  192. value = 2
  193. elif value == '6-12月':
  194. value = 3
  195. elif value == '12-24月':
  196. value = 4
  197. elif value == '24月以上':
  198. value = 5
  199. else:
  200. value = 0
  201. return value
  202. # 入网时长
  203. def convert_intime(value):
  204. if value == '1年-2年':
  205. value = 1
  206. elif value == '2年-5年':
  207. value = 2
  208. elif value == '5年-10年':
  209. value = 3
  210. elif value == '10年-20年':
  211. value = 4
  212. elif value == '20年以上':
  213. value = 5
  214. else:
  215. value = 0
  216. return value
  217. # 当月使用流量
  218. def convert_flux(value):
  219. if value == '0-0.5G':
  220. value = 2
  221. elif value == '0.5G-2G':
  222. value = 3
  223. elif value == '2G-5G':
  224. value = 4
  225. elif value == '5G-10G':
  226. value = 5
  227. elif value == '10G-15G':
  228. value = 6
  229. elif value == '15-30G':
  230. value = 7
  231. elif value == '30G以上':
  232. value = 8
  233. elif value == '小于等于0':
  234. value = 1
  235. else:
  236. value = 0
  237. return value
  238. # 是否出账
  239. def convert_acct(value):
  240. if value == '已出账':
  241. value = 0
  242. elif value == '未出账':
  243. value = 1
  244. else:
  245. value = 2
  246. return value
  247. # 月消费
  248. def convert_consume(value):
  249. if value == '10元以下':
  250. value = 1
  251. elif value == '10-20元':
  252. value = 2
  253. elif value == '20-60元':
  254. value = 3
  255. elif value == '60-100元':
  256. value = 4
  257. elif value == '100-150元':
  258. value = 5
  259. elif value == '150-300元':
  260. value = 6
  261. elif value == '300元以上':
  262. value = 7
  263. else:
  264. value = 0
  265. return value
  266. def convert_flux_value(value):
  267. if value is None or str(value) == "nan":
  268. value = 0
  269. elif not constant.is_number(value):
  270. value = 0
  271. return value
  272. def convert_consume_value(value):
  273. if value is None or str(value) == "nan":
  274. value = 0
  275. elif not constant.is_number(value):
  276. value = 0
  277. return value
  278. def convert_video_flux_value(value):
  279. if value is None or str(value) == "nan":
  280. value = 0
  281. elif not constant.is_number(value):
  282. value = 0
  283. return value
  284. # 月视频使用流量
  285. def convert_video_flux(value):
  286. if value == '0-0.5G':
  287. value = 1
  288. elif value == '0.5-2G':
  289. value = 2
  290. elif value == '2-5G':
  291. value = 3
  292. elif value == '5-10G':
  293. value = 4
  294. elif value == '10-15G':
  295. value = 5
  296. elif value == '15-30G':
  297. value = 6
  298. elif value == '30G以上':
  299. value = 7
  300. else:
  301. value = 0
  302. return value
  303. # 性别
  304. def convert_gender(value):
  305. if value == '男':
  306. value = 0
  307. elif value == '女':
  308. value = 1
  309. else:
  310. value = 2
  311. return value
  312. # 网络类型
  313. def convert_nettype(value):
  314. if value == '2G':
  315. value = 1
  316. elif value == '3G':
  317. value = 2
  318. elif value == '4G':
  319. value = 3
  320. elif value == '5G':
  321. value = 4
  322. else:
  323. value = 0
  324. return value
  325. # 年龄
  326. def convert_age(value):
  327. if value == '70前':
  328. value = 1
  329. elif value == '70后':
  330. value = 2
  331. elif value == '80后':
  332. value = 3
  333. elif value == '90后':
  334. value = 4
  335. elif value == '00后':
  336. value = 5
  337. else:
  338. value = 0
  339. return value
  340. def get_range_value(range_value):
  341. if range_value == '0.1G-0.5G':
  342. value = 1
  343. elif range_value == '0.5G-2G':
  344. value = 2
  345. elif range_value == '2G-5G':
  346. value = 3
  347. elif range_value == '5G-10G':
  348. value = 4
  349. elif range_value == '10G-20G':
  350. value = 5
  351. elif range_value == '20G以上':
  352. value = 6
  353. else:
  354. value = 0
  355. return value
  356. def convert_app_use_info(value):
  357. value = value.tolist()[-2]
  358. tencent_app_use = 7
  359. kuaishou_app_use = 7
  360. mangguo_app_use = 7
  361. youku_app_use = 7
  362. iqiyi_app_use = 7
  363. bilibili_app_use = 7
  364. if str(value).lower() == 'nan':
  365. pass
  366. else:
  367. use_list = value
  368. for apptype_range in use_list:
  369. if len(apptype_range.split("_")) > 1:
  370. apptype = apptype_range.split("_")[0]
  371. apprange = apptype_range.split("_")[1]
  372. if "腾讯视频" in apptype:
  373. tencent_app_use = get_range_value(apprange)
  374. if "哔哩哔哩" in apptype:
  375. bilibili_app_use = get_range_value(apprange)
  376. if "爱奇艺" in apptype:
  377. iqiyi_app_use = get_range_value(apprange)
  378. if "快手" in apptype:
  379. kuaishou_app_use = get_range_value(apprange)
  380. if "优酷" in apptype:
  381. youku_app_use = get_range_value(apprange)
  382. if "芒果" in apptype:
  383. mangguo_app_use = get_range_value(apprange)
  384. return int(tencent_app_use), int(mangguo_app_use), int(youku_app_use), int(iqiyi_app_use), int(
  385. bilibili_app_use), int(kuaishou_app_use)
  386. # APP使用情况
  387. def convert_appuse(df):
  388. app_use_columns = ['app_use_tencent', 'app_use_mangguo', 'app_use_youku', 'app_use_iqiyi', 'app_use_bilibili',
  389. 'app_use_kuaishou']
  390. df[app_use_columns] = df.apply(convert_app_use_info, axis=1, result_type="expand")
  391. def convert_province(value):
  392. return value_dict.area_dict[value]
  393. def convert_label(data):
  394. logger.info("开始预处理")
  395. # data['EVENT_CATEGORYNAME_C'] = data['EVENT_CATEGORYNAME_C'].map(convert_product_type)
  396. # data['EVENT_CATEGORYNAME_C'] = data['EVENT_CATEGORYNAME_C'].astype('int')
  397. # data['EVENT_CHANNEL_BELONGTO_C'] = data['EVENT_CHANNEL_BELONGTO_C'].map(convert_channel)
  398. # data['EVENT_CHANNEL_BELONGTO_C'] = data['EVENT_CHANNEL_BELONGTO_C'].astype('int')
  399. # data['EVENT_ORDER_MONTH_C'] = data['EVENT_ORDER_MONTH_C'].map(convert_order_month)
  400. # data['EVENT_CANCEL_DIFF_C'] = data['EVENT_CANCEL_DIFF_C'].map(convert_cancel_diff)
  401. # data['EVENT_CANCEL_DIFF_C'] = data['EVENT_CANCEL_DIFF_C'].astype('int')
  402. # data['EVENT_ORDER_PRICE_C'] = data['EVENT_ORDER_PRICE_C'].map(convert_order_price)
  403. # data['EVENT_ORDER_FLOWP_C'] = data['EVENT_ORDER_FLOWP_C'].map(convert_order_flow)
  404. # data['EVENT_ORDER_SUMMONTH_C'] = data['EVENT_ORDER_SUMMONTH_C'].map(convert_order_sum_month)
  405. data['TAG_PROVINCE_C'] = data['TAG_PROVINCE_C'].map(convert_province)
  406. data['TAG_INTIME_C'] = data['TAG_INTIME_C'].map(convert_intime)
  407. data['TAG_INTIME_C'] = data['TAG_INTIME_C'].astype('int')
  408. data['EVENT_FLUX_C'] = data['EVENT_FLUX_C'].map(convert_flux)
  409. data['EVENT_FLUX_V'] = data['EVENT_FLUX_V'].map(convert_flux_value)
  410. data['EVENT_FLUX_V'] = data['EVENT_FLUX_V'].astype('float')
  411. data['EVENT_FLUX_C'] = data['EVENT_FLUX_C'].astype('int')
  412. data['EVENT_CONSUM_C'] = data['EVENT_CONSUM_C'].map(convert_consume)
  413. data['EVENT_CONSUM_V'] = data['EVENT_CONSUM_V'].map(convert_consume_value)
  414. data['EVENT_CONSUM_C'] = data['EVENT_CONSUM_C'].astype('int')
  415. data['EVENT_CONSUM_V'] = data['EVENT_CONSUM_V'].astype('float')
  416. data['EVENT_VIDEO_FLUX_C'] = data['EVENT_VIDEO_FLUX_C'].map(convert_video_flux)
  417. data['EVENT_VIDEO_FLUX_V'] = data['EVENT_VIDEO_FLUX_V'].map(convert_video_flux_value)
  418. data['EVENT_VIDEO_FLUX_V'] = data['EVENT_VIDEO_FLUX_V'].astype('float')
  419. data['EVENT_VIDEO_FLUX_C'] = data['EVENT_VIDEO_FLUX_C'].astype('int')
  420. data['TAG_GENDER_C'] = data['TAG_GENDER_C'].map(convert_gender)
  421. data['TAG_GENDER_C'] = data['TAG_GENDER_C'].astype('int')
  422. data['TAG_NETTYPE_C'] = data['TAG_NETTYPE_C'].map(convert_nettype)
  423. data['TAG_NETTYPE_C'] = data['TAG_NETTYPE_C'].astype('int')
  424. data['TAG_AGE_C'] = data['TAG_AGE_C'].map(convert_age)
  425. data['TAG_AGE_C'] = data['TAG_AGE_C'].astype('int')
  426. data['EVENT_IS_ACCT_C'] = data['EVENT_IS_ACCT_C'].map(convert_acct)
  427. data['EVENT_IS_ACCT_C'] = data['EVENT_IS_ACCT_C'].astype('int')
  428. # app使用
  429. convert_appuse(data)
  430. data.drop(columns=['EVENT_APP_USE.C'], inplace=True)
  431. data.drop(columns=['EVENT_USER_OSTATE_C'], inplace=True)
  432. # 性别:0,1,2
  433. # 网络类型: 1,2,3,4,5
  434. np_cates = list(np.array([
  435. [x for x in range(0, 3)],
  436. [x for x in range(0, 5)],
  437. [x for x in range(0, 6)],
  438. [x for x in range(0, 6)],
  439. [x for x in range(0, 3)],
  440. [x for x in range(0, 9)],
  441. [x for x in range(0, 8)],
  442. [x for x in range(0, 8)],
  443. [x for x in range(0, 32)],
  444. [x for x in range(0, 8)],
  445. [x for x in range(0, 8)],
  446. [x for x in range(0, 8)],
  447. [x for x in range(0, 8)],
  448. [x for x in range(0, 8)],
  449. [x for x in range(0, 8)]
  450. ]))
  451. enc = OneHotEncoder(categories=np_cates)
  452. one_hot_feature_columns = [
  453. "TAG_GENDER_C",
  454. "TAG_NETTYPE_C",
  455. "TAG_AGE_C",
  456. "TAG_INTIME_C",
  457. "EVENT_IS_ACCT_C",
  458. "EVENT_FLUX_C",
  459. "EVENT_CONSUM_C",
  460. "EVENT_VIDEO_FLUX_C",
  461. "TAG_PROVINCE_C",
  462. 'app_use_tencent', 'app_use_mangguo', 'app_use_youku', 'app_use_iqiyi', 'app_use_bilibili',
  463. 'app_use_kuaishou'
  464. ]
  465. def convert_onehot(df):
  466. logger.info("开始onehot编码")
  467. enc.fit(df[one_hot_feature_columns])
  468. after_onehot_features = enc.get_feature_names(one_hot_feature_columns)
  469. one_hot_dataframe = pd.DataFrame(enc.fit_transform(df[one_hot_feature_columns]).toarray().astype("int"),
  470. columns=after_onehot_features)
  471. df.drop(columns=one_hot_feature_columns, inplace=True)
  472. df = pd.concat([df, one_hot_dataframe], axis=1)
  473. return df
  474. def insert_to_database(dataf, row_index):
  475. nums = str(len(dataf))
  476. logger.info(f"开始写入数据库,共{nums}条")
  477. # 增加自增列
  478. next_index = row_index + len(dataf)
  479. dataf['row_id'] = range(row_index, next_index)
  480. dataf['month'] = '202107'
  481. # tb_name = 'lu_target_data'
  482. tb_name = constant.insert_neg_tb_name
  483. cols = ",".join(dataf.columns.tolist())
  484. data = dataf.to_dict("records")
  485. click_client.execute(f"INSERT INTO {tb_name} ({cols}) VALUES ", data, types_check=True)
  486. return next_index
  487. # 760
  488. # range(0,190) row_index = 0
  489. # range(190,380) row_index = 9500000
  490. # range(380,570) row_index = 19000000
  491. # range(570,760) row_index = 28500000
  492. def get_data(row_index, range_start, range_end, part):
  493. logger.info("开始处理数据")
  494. # for_nums = int(2000000 / fetch_size)
  495. # for_nums = math.ceil(36700000 / fetch_size)
  496. # for_nums = math.ceil(38000000 / fetch_size)
  497. row_index = row_index
  498. data_all = []
  499. try:
  500. for i in tqdm(range(range_start, range_end)):
  501. step = str(i)
  502. logger.info(f"第{step}次循环开始处理")
  503. start = i * fetch_size
  504. # sql = f"select t.* from Z_USER_TAG_FLAT_out t where EVENT_CPNAME_C!='快手' and EVENT_USER_OSTATE_C!='在订' and EVENT_CONSUM_C!='10元以下' and EVENT_VIDEO_FLUX_C!='未知' and TAG_NETTYPE_C!='2G' and position(arrayStringConcat(EVENT_APP_USE.C,','),'快手')!=0 limit {start},{fetch_size} "
  505. sql = f"select t.* from l_neg_origin_07 t order by t.uuid limit {fetch_size} offset {start}"
  506. all_data = click_client.execute(sql)
  507. dataf = pd.DataFrame(all_data, columns=constant.origin_column_names)
  508. dataf.drop(columns=drop_columns, inplace=True)
  509. # 预处理
  510. convert_label(dataf)
  511. # onehot编码
  512. dataf = convert_onehot(dataf)
  513. # 写入数据库
  514. row_index = insert_to_database(dataf, row_index)
  515. logger.info(f"第{step}次循环处理完毕")
  516. data_all.append(dataf)
  517. except Exception as e:
  518. logger.error(str(e))
  519. data_last = pd.concat(data_all, axis=0)
  520. data_last.to_pickle(f"data_neg_part_{part}_0218_07.pkl")
  521. data_last = pd.concat(data_all, axis=0)
  522. data_last.to_pickle(f"data_neg_part_{part}_0218_07.pkl")
  523. if __name__ == "__main__":
  524. get_data(0, 0, 27, "1")
  525. # get_data(9500000, 190, 285, "2")
  526. # get_data(14250000, 285, 380, "3")
  527. # get_data(9500000, 190, 380, "4")
  528. # get_data(19000000, 380, 475, "5")
  529. # get_data(19000000, 475, 570, "6")
  530. # get_data(28500000, 570, 760)