123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356 |
- from common import value_dict
- from sklearn.preprocessing import OneHotEncoder
- import numpy as np
- import pandas as pd
- from random import shuffle
- from sklearn import metrics
- import matplotlib.pyplot as plt
- import seaborn as sns
class ProcessUtil:
    """Helpers for encoding user-tag features, building train pairs and plotting.

    Everything is exposed as class-level attributes and ``@classmethod``s, so
    the class is used as a namespace and never instantiated.
    """

    # Integer codes accepted by the one-hot encoder, one entry per column and
    # positionally matched to ``one_hot_feature_columns``.
    # Built as a plain list of lists: wrapping ragged lists in ``np.array``
    # raises ValueError on NumPy >= 1.24 and yielded these same lists before.
    np_cates = [
        list(range(3)),   # TAG_GENDER_C
        list(range(5)),   # TAG_NETTYPE_C
        list(range(6)),   # TAG_AGE_C
        list(range(6)),   # TAG_INTIME_C
        list(range(3)),   # EVENT_IS_ACCT_C
        list(range(9)),   # EVENT_FLUX_C
        list(range(8)),   # EVENT_CONSUM_C
        list(range(8)),   # EVENT_VIDEO_FLUX_C
        list(range(32)),  # TAG_PROVINCE_C (presumably the codes in value_dict.area_dict - confirm)
        list(range(8)),   # app_use_tencent
        list(range(8)),   # app_use_mangguo
        list(range(8)),   # app_use_youku
        list(range(8)),   # app_use_iqiyi
        list(range(8)),   # app_use_bilibili
        list(range(8)),   # app_use_kuaishou
        # NOTE: EVENT_ORDER_MONTH_C (codes 0-6) is currently disabled, both
        # here and in ``one_hot_feature_columns``.
    ]

    # Shared encoder; ``convert_onehot`` re-fits it on every call.
    enc = OneHotEncoder(categories=np_cates)

    # Columns fed to the encoder (order must match ``np_cates``).
    one_hot_feature_columns = [
        "TAG_GENDER_C",
        "TAG_NETTYPE_C",
        "TAG_AGE_C",
        "TAG_INTIME_C",
        "EVENT_IS_ACCT_C",
        "EVENT_FLUX_C",
        "EVENT_CONSUM_C",
        "EVENT_VIDEO_FLUX_C",
        "TAG_PROVINCE_C",
        'app_use_tencent', 'app_use_mangguo', 'app_use_youku', 'app_use_iqiyi', 'app_use_bilibili',
        'app_use_kuaishou',
    ]

    # Columns dropped from the frame after encoding. Same as the list above
    # except that 'app_use_kuaishou' is absent, so that raw column survives.
    # NOTE(review): confirm the omission is intentional.
    one_hot_feature_columns2 = [
        "TAG_GENDER_C",
        "TAG_NETTYPE_C",
        "TAG_AGE_C",
        "TAG_INTIME_C",
        "EVENT_IS_ACCT_C",
        "EVENT_FLUX_C",
        "EVENT_CONSUM_C",
        "EVENT_VIDEO_FLUX_C",
        "TAG_PROVINCE_C",
        'app_use_tencent', 'app_use_mangguo', 'app_use_youku', 'app_use_iqiyi', 'app_use_bilibili',
    ]

    # ------------------------------------------------------------------
    # Label -> integer code tables (private; labels are matched exactly).
    # ------------------------------------------------------------------
    _INTIME_CODES = {
        '1年-2年': 1,
        '2年-5年': 2,
        '5年-10年': 3,
        '10年-20年': 4,
        '20年以上': 5,
    }
    # NOTE(review): '15-30G' has no "G" after 15, unlike its neighbours; the
    # label is kept exactly as it was - confirm against the real data.
    _FLUX_CODES = {
        '小于等于0': 1,
        '0-0.5G': 2,
        '0.5G-2G': 3,
        '2G-5G': 4,
        '5G-10G': 5,
        '10G-15G': 6,
        '15-30G': 7,
        '30G以上': 8,
    }
    _CONSUME_CODES = {
        '10元以下': 1,
        '10-20元': 2,
        '20-60元': 3,
        '60-100元': 4,
        '100-150元': 5,
        '150-300元': 6,
        '300元以上': 7,
    }
    _EVENT_ORDER_CODES = {
        '订购1月内': 1,
        '1-3月': 2,
        '3-6月': 3,
        '6-12月': 4,
        '12-24月': 5,
        '24月以上': 6,
    }
    _VIDEO_FLUX_CODES = {
        '0-0.5G': 1,
        '0.5-2G': 2,
        '2-5G': 3,
        '5-10G': 4,
        '10-15G': 5,
        '15-30G': 6,
        '30G以上': 7,
    }
    _GENDER_CODES = {'男': 0, '女': 1}
    _NETTYPE_CODES = {'2G': 1, '3G': 2, '4G': 3, '5G': 4}
    _AGE_CODES = {'70前': 1, '70后': 2, '80后': 3, '90后': 4, '00后': 5}
    _ACCT_CODES = {'已出账': 0, '未出账': 1}
    _APP_RANGE_CODES = {
        '0.1G-0.5G': 1,
        '0.5G-2G': 2,
        '2G-5G': 3,
        '5G-10G': 4,
        '10G-20G': 5,
        '20G以上': 6,
    }

    # Output columns of ``convert_appuse`` and, in the same order, the tuple
    # slots returned by ``convert_app_use_info``.
    _APP_USE_COLUMNS = ['app_use_tencent', 'app_use_mangguo', 'app_use_youku',
                        'app_use_iqiyi', 'app_use_bilibili', 'app_use_kuaishou']
    # (substring searched in the app name, slot in the result tuple)
    _APP_KEYWORDS = (
        ("腾讯视频", 0),
        ("芒果", 1),
        ("优酷", 2),
        ("爱奇艺", 3),
        ("哔哩哔哩", 4),
        ("快手", 5),
    )
    # Code used for an app with no usage entry in the row.
    _APP_USE_DEFAULT = 7
    # The app-usage cell is read by position, counted from the END of the row.
    # NOTE(review): this breaks silently if the column layout changes.
    _APP_USE_ROW_POSITION = -9

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _lookup(table, value, default=0):
        """Return ``table[value]``, or *default* for unknown or unhashable values."""
        try:
            return table.get(value, default)
        except TypeError:
            # Unhashable input (e.g. a list) can never equal a label.
            return default

    @classmethod
    def _numeric_or_zero(cls, value):
        """Return *value* unchanged when it looks numeric, otherwise ``0``.

        ``None``, anything whose ``str()`` is ``"nan"`` and anything rejected
        by ``is_number`` become ``0``. The value is NOT converted, so a
        numeric string is returned as the same string.
        """
        if value is None or str(value) == "nan" or not cls.is_number(value):
            return 0
        return value

    # ------------------------------------------------------------------
    # Single-value converters
    # ------------------------------------------------------------------
    @classmethod
    def convert_province(cls, value):
        """Return ``value_dict.area_dict[value]``.

        An unknown *value* propagates whatever the lookup raises
        (``KeyError`` if ``area_dict`` is a plain dict).
        """
        return value_dict.area_dict[value]

    @classmethod
    def convert_intime(cls, value):
        """Encode a tenure label ('1年-2年' ... '20年以上') as 1-5; anything else is 0."""
        return cls._lookup(cls._INTIME_CODES, value)

    @classmethod
    def is_number(cls, s):
        """Return True if *s* can be read as a number.

        A value counts as numeric when ``float(s)`` succeeds (this includes
        ``'nan'`` and ``'inf'``) or when it is a single character with a
        Unicode numeric value, e.g. ``'五'``.

        Non-string, non-numeric input such as ``None`` or a list returns
        False; it previously escaped as an uncaught ``TypeError``.
        """
        try:
            float(s)
            return True
        except (TypeError, ValueError):
            pass
        try:
            import unicodedata
            unicodedata.numeric(s)
            return True
        except (TypeError, ValueError):
            pass
        return False

    @classmethod
    def convert_flux_value(cls, value):
        """Return *value* if it is numeric, else 0 (see ``_numeric_or_zero``)."""
        return cls._numeric_or_zero(value)

    @classmethod
    def convert_flux(cls, value):
        """Encode a data-usage bucket label as 1-8; anything else is 0.

        '小于等于0' maps to 1 and the size buckets '0-0.5G' ... '30G以上'
        map to 2-8.
        """
        return cls._lookup(cls._FLUX_CODES, value)

    @classmethod
    def convert_consume(cls, value):
        """Encode a spend bucket label ('10元以下' ... '300元以上') as 1-7; else 0."""
        return cls._lookup(cls._CONSUME_CODES, value)

    @classmethod
    def convert_consume_value(cls, value):
        """Return *value* if it is numeric, else 0 (see ``_numeric_or_zero``)."""
        return cls._numeric_or_zero(value)

    @classmethod
    def convert_event_order_c(cls, value):
        """Encode an order-age label ('订购1月内' ... '24月以上') as 1-6; else 0."""
        return cls._lookup(cls._EVENT_ORDER_CODES, value)

    @classmethod
    def convert_video_flux(cls, value):
        """Encode a video-usage bucket label ('0-0.5G' ... '30G以上') as 1-7; else 0."""
        return cls._lookup(cls._VIDEO_FLUX_CODES, value)

    @classmethod
    def convert_video_flux_value(cls, value):
        """Return *value* if it is numeric, else 0 (see ``_numeric_or_zero``)."""
        return cls._numeric_or_zero(value)

    @classmethod
    def convert_gender(cls, value):
        """Encode gender: '男' -> 0, '女' -> 1, anything else -> 2."""
        return cls._lookup(cls._GENDER_CODES, value, default=2)

    @classmethod
    def convert_nettype(cls, value):
        """Encode network type '2G'/'3G'/'4G'/'5G' as 1-4; anything else is 0."""
        return cls._lookup(cls._NETTYPE_CODES, value)

    @classmethod
    def convert_age(cls, value):
        """Encode an age-cohort label ('70前' ... '00后') as 1-5; anything else is 0."""
        return cls._lookup(cls._AGE_CODES, value)

    @classmethod
    def convert_acct(cls, value):
        """Encode billing status: '已出账' -> 0, '未出账' -> 1, anything else -> 2."""
        return cls._lookup(cls._ACCT_CODES, value, default=2)

    # ------------------------------------------------------------------
    # App-usage features
    # ------------------------------------------------------------------
    @classmethod
    def convert_appuse(cls, df):
        """Add the six ``app_use_*`` columns to *df* in place.

        Each row is passed to ``convert_app_use_info`` and the resulting
        6-tuple is expanded into the columns. Nothing is returned.
        """
        df[cls._APP_USE_COLUMNS] = df.apply(
            cls.convert_app_use_info, axis=1, result_type="expand")

    @classmethod
    def convert_app_use_info(cls, value):
        """Extract per-app usage codes from one DataFrame row.

        Args:
            value: A row exposing ``tolist()``, as passed by
                ``DataFrame.apply(axis=1)``. The app-usage cell is taken from
                the 9th position from the end and is iterated as a collection
                of ``"<app name>_<usage range>"`` strings.

        Returns:
            A 6-tuple of ints ordered (tencent, mangguo, youku, iqiyi,
            bilibili, kuaishou). Each element is the ``get_range_value`` code
            of the last matching entry, or 7 when the app has no entry.
            A missing cell (``None`` or NaN) yields all 7s.
        """
        usage_cell = value.tolist()[cls._APP_USE_ROW_POSITION]
        codes = [cls._APP_USE_DEFAULT] * len(cls._APP_KEYWORDS)

        # ``None`` used to fall through to the loop and raise TypeError.
        if usage_cell is None or str(usage_cell).lower() == 'nan':
            return tuple(codes)

        # NOTE(review): a plain string cell is iterated per character and so
        # matches nothing; the cell is presumably a list - confirm upstream.
        for entry in usage_cell:
            parts = entry.split("_")
            if len(parts) < 2:
                continue  # no "<app>_<range>" separator: ignore the entry
            app_name, usage_range = parts[0], parts[1]
            for keyword, slot in cls._APP_KEYWORDS:
                if keyword in app_name:
                    codes[slot] = cls.get_range_value(usage_range)
        return tuple(int(code) for code in codes)

    @classmethod
    def get_range_value(cls, range_value):
        """Encode an app-usage range label ('0.1G-0.5G' ... '20G以上') as 1-6; else 0."""
        return cls._lookup(cls._APP_RANGE_CODES, range_value)

    # ------------------------------------------------------------------
    # One-hot encoding
    # ------------------------------------------------------------------
    @classmethod
    def convert_onehot(cls, df):
        """One-hot encode the categorical columns of *df*.

        The shared encoder is fitted on ``one_hot_feature_columns``; the
        columns in ``one_hot_feature_columns2`` are then dropped from *df*
        IN PLACE (the caller's frame is modified) and the encoded columns are
        appended.

        Returns:
            A new DataFrame: the remaining columns of *df* followed by the
            integer one-hot columns.
        """
        encoder = cls.enc
        source_columns = cls.one_hot_feature_columns

        # A single fit_transform replaces the former fit + fit_transform pair.
        encoded = encoder.fit_transform(df[source_columns]).toarray().astype("int")

        # ``get_feature_names`` was removed in scikit-learn 1.2; prefer the
        # replacement and keep the old call for older installations.
        if hasattr(encoder, "get_feature_names_out"):
            feature_names = encoder.get_feature_names_out(source_columns)
        else:
            feature_names = encoder.get_feature_names(source_columns)

        # Reuse df's index so concat lines rows up even when df does not have
        # a default RangeIndex (e.g. after filtering).
        one_hot_dataframe = pd.DataFrame(encoded, columns=feature_names, index=df.index)

        df.drop(columns=cls.one_hot_feature_columns2, inplace=True)
        return pd.concat([df, one_hot_dataframe], axis=1)

    # ------------------------------------------------------------------
    # Sampling helpers
    # ------------------------------------------------------------------
    @classmethod
    def partition_preserve_order(cls, list_in, n):
        """Randomly split *list_in* into *n* partitions, keeping relative order.

        Positions are shuffled and dealt out with stride *n*, so partition
        sizes differ by at most one. Inside each partition the elements
        appear in their original order.

        Returns:
            A list of *n* lists.
        """
        positions = list(range(len(list_in)))
        shuffle(positions)
        partitions = []
        for offset in range(n):
            chosen = sorted(positions[offset::n])
            partitions.append([list_in[pos] for pos in chosen])
        return partitions

    @classmethod
    def gen_train_tuple(cls, pos_row_ids, neg_row_ids, batch_num_pos):
        """Pair every negative id with a positive id, cycling the positives.

        The i-th negative is paired with ``pos_row_ids[i % batch_num_pos]``.

        Returns:
            A list of ``(pos_id, neg_id)`` tuples, one per negative id.

        Raises:
            ZeroDivisionError: If *batch_num_pos* is 0 and *neg_row_ids* is
                not empty.
        """
        return [
            (pos_row_ids[i % batch_num_pos], neg_id)
            for i, neg_id in enumerate(neg_row_ids)
        ]

    # ------------------------------------------------------------------
    # Plotting
    # ------------------------------------------------------------------
    @classmethod
    def draw_roc_auc(cls, y_label, y_test):
        """Plot the ROC curve and show its AUC in the legend.

        *y_label* is passed to ``metrics.roc_curve`` as the true labels and
        *y_test* as the scores. The current figure is cleared first and the
        plot is displayed with ``plt.show()``.
        """
        # Compute the ROC curve points.
        fpr, tpr, _ = metrics.roc_curve(y_label, y_test)
        # Area under the curve.
        roc_auc = metrics.auc(fpr, tpr)
        # Draw the curve plus the diagonal chance line.
        plt.clf()
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.legend(loc="lower right")
        plt.show()

    @classmethod
    def draw_confusion_matrix(cls, y_label, y_test):
        """Print the confusion matrix and show it as an annotated heatmap.

        *y_label* is passed to ``metrics.confusion_matrix`` as the true labels
        and *y_test* as the predicted labels.
        """
        confusion_data = metrics.confusion_matrix(y_label, y_test)
        print(confusion_data)
        sns.heatmap(confusion_data, cmap="Greens", annot=True)
        plt.xlabel("Predicted labels")
        plt.ylabel("True labels")
        plt.tight_layout()
        plt.show()
|