import unicodedata
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

from common import value_dict


class ProcessUtil:
    """Feature-encoding, sampling and plotting helpers.

    Every method is a classmethod; the class only serves as a namespace
    that also holds the shared ``OneHotEncoder`` and the column lists.
    """

    # Number of integer codes (0 .. size - 1) per one-hot feature, in the
    # same order as ``one_hot_feature_columns``.
    _CATEGORY_SIZES = (3, 5, 6, 6, 3, 9, 8, 8, 32, 8, 8, 8, 8, 8, 8)

    # Built as a plain list of lists: wrapping ragged lists in ``np.array``
    # raises ValueError on NumPy >= 1.24 unless dtype=object is given.
    np_cates = [list(range(size)) for size in _CATEGORY_SIZES]

    enc = OneHotEncoder(categories=np_cates)

    # Columns fed to the encoder.
    one_hot_feature_columns = [
        "TAG_GENDER_C",
        "TAG_NETTYPE_C",
        "TAG_AGE_C",
        "TAG_INTIME_C",
        "EVENT_IS_ACCT_C",
        "EVENT_FLUX_C",
        "EVENT_CONSUM_C",
        "EVENT_VIDEO_FLUX_C",
        "TAG_PROVINCE_C",
        'app_use_tencent',
        'app_use_mangguo',
        'app_use_youku',
        'app_use_iqiyi',
        'app_use_bilibili',
        'app_use_kuaishou',
    ]

    # Columns dropped from the frame after encoding.
    # NOTE(review): 'app_use_kuaishou' is encoded but not listed here, so
    # the raw column stays in the output - confirm this is intended.
    one_hot_feature_columns2 = [
        "TAG_GENDER_C",
        "TAG_NETTYPE_C",
        "TAG_AGE_C",
        "TAG_INTIME_C",
        "EVENT_IS_ACCT_C",
        "EVENT_FLUX_C",
        "EVENT_CONSUM_C",
        "EVENT_VIDEO_FLUX_C",
        "TAG_PROVINCE_C",
        'app_use_tencent',
        'app_use_mangguo',
        'app_use_youku',
        'app_use_iqiyi',
        'app_use_bilibili',
    ]

    # ---- label -> integer code tables (unknown labels use a default) ----
    _INTIME_CODES = {
        '1年-2年': 1,
        '2年-5年': 2,
        '5年-10年': 3,
        '10年-20年': 4,
        '20年以上': 5,
    }
    # NOTE(review): '15-30G' lacks the 'G' on its lower bound unlike the
    # neighbouring labels - verify against the actual data values.
    _FLUX_CODES = {
        '0-0.5G': 2,
        '0.5G-2G': 3,
        '2G-5G': 4,
        '5G-10G': 5,
        '10G-15G': 6,
        '15-30G': 7,
        '30G以上': 8,
        '小于等于0': 1,
    }
    _CONSUME_CODES = {
        '10元以下': 1,
        '10-20元': 2,
        '20-60元': 3,
        '60-100元': 4,
        '100-150元': 5,
        '150-300元': 6,
        '300元以上': 7,
    }
    _EVENT_ORDER_CODES = {
        '订购1月内': 1,
        '1-3月': 2,
        '3-6月': 3,
        '6-12月': 4,
        '12-24月': 5,
        '24月以上': 6,
    }
    _VIDEO_FLUX_CODES = {
        '0-0.5G': 1,
        '0.5-2G': 2,
        '2-5G': 3,
        '5-10G': 4,
        '10-15G': 5,
        '15-30G': 6,
        '30G以上': 7,
    }
    _GENDER_CODES = {'男': 0, '女': 1}
    _NETTYPE_CODES = {'2G': 1, '3G': 2, '4G': 3, '5G': 4}
    _AGE_CODES = {'70前': 1, '70后': 2, '80后': 3, '90后': 4, '00后': 5}
    _ACCT_CODES = {'已出账': 0, '未出账': 1}
    _APP_RANGE_CODES = {
        '0.1G-0.5G': 1,
        '0.5G-2G': 2,
        '2G-5G': 3,
        '5G-10G': 4,
        '10G-20G': 5,
        '20G以上': 6,
    }

    # Substrings identifying each app, in the order convert_app_use_info
    # returns them (tencent, mangguo, youku, iqiyi, bilibili, kuaishou).
    _APP_KEYWORDS = ("腾讯视频", "芒果", "优酷", "爱奇艺", "哔哩哔哩", "快手")

    @staticmethod
    def _lookup(table, value, default):
        """Return ``table[value]``, or ``default`` for unknown/unhashable values."""
        try:
            return table.get(value, default)
        except TypeError:
            # Unhashable input can never equal a string label.
            return default

    @classmethod
    def _numeric_or_zero(cls, value):
        """Return ``value`` unchanged when it looks numeric, otherwise 0.

        ``None``, anything whose ``str()`` is ``"nan"`` and anything
        ``is_number`` rejects are replaced with 0.
        """
        if value is None or str(value) == "nan":
            return 0
        if not cls.is_number(value):
            return 0
        return value

    @classmethod
    def convert_province(cls, value):
        """Map a province value through ``value_dict.area_dict``.

        Raises:
            KeyError: if ``value`` is not a key of ``area_dict``.
        """
        return value_dict.area_dict[value]

    @classmethod
    def convert_intime(cls, value):
        """Encode the in-network duration label as 1-5 (0 if unknown)."""
        return cls._lookup(cls._INTIME_CODES, value, 0)

    @classmethod
    def is_number(cls, s):
        """Return True if ``s`` can be read as a number.

        Tries ``float(s)`` first, then ``unicodedata.numeric(s)`` for
        single numeric Unicode characters. Non-string, non-numeric input
        such as ``None`` yields False instead of raising.
        """
        try:
            float(s)
            return True
        except (TypeError, ValueError):
            # TypeError covers None / containers that float() cannot take.
            pass

        try:
            unicodedata.numeric(s)
            return True
        except (TypeError, ValueError):
            pass

        return False

    @classmethod
    def convert_flux_value(cls, value):
        """Return the flux value, replacing missing/non-numeric input with 0."""
        return cls._numeric_or_zero(value)

    @classmethod
    def convert_flux(cls, value):
        """Encode the flux range label as 1-8 (0 if unknown)."""
        return cls._lookup(cls._FLUX_CODES, value, 0)

    @classmethod
    def convert_consume(cls, value):
        """Encode the consumption range label as 1-7 (0 if unknown)."""
        return cls._lookup(cls._CONSUME_CODES, value, 0)

    @classmethod
    def convert_consume_value(cls, value):
        """Return the consumption value, replacing missing/non-numeric input with 0."""
        return cls._numeric_or_zero(value)

    @classmethod
    def convert_event_order_c(cls, value):
        """Encode the order-age label as 1-6 (0 if unknown)."""
        return cls._lookup(cls._EVENT_ORDER_CODES, value, 0)

    @classmethod
    def convert_video_flux(cls, value):
        """Encode the video flux range label as 1-7 (0 if unknown)."""
        return cls._lookup(cls._VIDEO_FLUX_CODES, value, 0)

    @classmethod
    def convert_video_flux_value(cls, value):
        """Return the video flux value, replacing missing/non-numeric input with 0."""
        return cls._numeric_or_zero(value)

    @classmethod
    def convert_gender(cls, value):
        """Encode gender: '男' -> 0, '女' -> 1, anything else -> 2."""
        return cls._lookup(cls._GENDER_CODES, value, 2)

    @classmethod
    def convert_nettype(cls, value):
        """Encode network type '2G'..'5G' as 1-4 (0 if unknown)."""
        return cls._lookup(cls._NETTYPE_CODES, value, 0)

    @classmethod
    def convert_age(cls, value):
        """Encode the birth-decade label as 1-5 (0 if unknown)."""
        return cls._lookup(cls._AGE_CODES, value, 0)

    @classmethod
    def convert_acct(cls, value):
        """Encode billing status: '已出账' -> 0, '未出账' -> 1, else 2."""
        return cls._lookup(cls._ACCT_CODES, value, 2)

    @classmethod
    def convert_appuse(cls, df):
        """Add the six ``app_use_*`` columns to ``df`` in place.

        Each row is passed to ``convert_app_use_info`` and the returned
        tuple is expanded into the new columns. Returns None.
        """
        app_use_columns = [
            'app_use_tencent',
            'app_use_mangguo',
            'app_use_youku',
            'app_use_iqiyi',
            'app_use_bilibili',
            'app_use_kuaishou',
        ]
        df[app_use_columns] = df.apply(
            cls.convert_app_use_info, axis=1, result_type="expand")

    @classmethod
    def convert_app_use_info(cls, value):
        """Extract per-app usage codes from one DataFrame row.

        Args:
            value: a row object exposing ``tolist()``; the app-usage field
                is read from position ``-9``.
                NOTE(review): the -9 offset depends on the caller's column
                layout - confirm against the DataFrame that is passed in.

        Returns:
            A 6-tuple of ints ordered (tencent, mangguo, youku, iqiyi,
            bilibili, kuaishou). An app without an entry keeps the
            default code 7.
        """
        app_field = value.tolist()[-9]
        usage = {keyword: 7 for keyword in cls._APP_KEYWORDS}

        if str(app_field).lower() != 'nan':
            for entry in app_field:
                parts = entry.split("_")
                if len(parts) <= 1:
                    # No "<app>_<range>" separator: nothing to decode.
                    continue
                app_name, usage_range = parts[0], parts[1]
                for keyword in cls._APP_KEYWORDS:
                    if keyword in app_name:
                        usage[keyword] = cls.get_range_value(usage_range)

        return tuple(int(usage[keyword]) for keyword in cls._APP_KEYWORDS)

    @classmethod
    def get_range_value(cls, range_value):
        """Encode an app traffic range label as 1-6 (0 if unknown)."""
        return cls._lookup(cls._APP_RANGE_CODES, range_value, 0)

    @classmethod
    def convert_onehot(cls, df):
        """One-hot encode ``one_hot_feature_columns`` and append the result.

        The columns in ``one_hot_feature_columns2`` are dropped from ``df``
        in place, then the encoded columns are concatenated on axis 1.

        Args:
            df: DataFrame containing every column in
                ``one_hot_feature_columns``.

        Returns:
            A new DataFrame: the reduced ``df`` plus the one-hot columns.
        """
        source_columns = cls.one_hot_feature_columns
        # A single fit_transform both fits the encoder and encodes the data.
        encoded = cls.enc.fit_transform(df[source_columns])

        # get_feature_names was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older releases.
        name_getter = getattr(cls.enc, "get_feature_names_out", None)
        if name_getter is None:
            name_getter = cls.enc.get_feature_names
        encoded_columns = name_getter(source_columns)

        one_hot_dataframe = pd.DataFrame(
            encoded.toarray().astype("int"),
            columns=encoded_columns,
            # Reuse df's index so concat aligns row-for-row even when df
            # does not carry a default RangeIndex.
            index=df.index,
        )
        df.drop(columns=cls.one_hot_feature_columns2, inplace=True)
        return pd.concat([df, one_hot_dataframe], axis=1)

    @classmethod
    def partition_preserve_order(cls, list_in, n):
        """Randomly split ``list_in`` into ``n`` parts, keeping relative order.

        Indices are shuffled and dealt round-robin into ``n`` groups; each
        group is sorted so its elements appear in their original order.

        Returns:
            A list of ``n`` lists.
        """
        indices = list(range(len(list_in)))
        shuffle(indices)
        index_partitions = [sorted(indices[start::n]) for start in range(n)]
        return [[list_in[idx] for idx in part] for part in index_partitions]

    @classmethod
    def gen_train_tuple(cls, pos_row_ids, neg_row_ids, batch_num_pos):
        """Pair every negative id with a positive id, cycling the positives.

        The i-th negative is paired with ``pos_row_ids[i % batch_num_pos]``.

        Returns:
            A list of ``(positive_id, negative_id)`` tuples, one per
            negative id.

        Raises:
            ZeroDivisionError: if ``batch_num_pos`` is 0 and
                ``neg_row_ids`` is non-empty.
        """
        return [
            (pos_row_ids[position % batch_num_pos], neg_id)
            for position, neg_id in enumerate(neg_row_ids)
        ]

    @classmethod
    def draw_roc_auc(cls, y_label, y_test):
        """Plot the ROC curve for ``y_test`` scores against ``y_label``."""
        # Compute the ROC curve.
        fpr, tpr, thresholds = metrics.roc_curve(y_label, y_test)
        # Area under the curve.
        roc_auc = metrics.auc(fpr, tpr)
        # Draw the plot.
        plt.clf()
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.legend(loc="lower right")
        plt.show()

    @classmethod
    def draw_confusion_matrix(cls, y_label, y_test):
        """Print the confusion matrix and show it as a heatmap."""
        confusion_data = metrics.confusion_matrix(y_label, y_test)
        print(confusion_data)
        sns.heatmap(confusion_data, cmap="Greens", annot=True)
        plt.xlabel("Predicted labels")
        plt.ylabel("True labels")
        plt.tight_layout()
        plt.show()