123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
import datetime
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
def printlog(info):
    """Print a timestamped banner followed by a progress message.

    Writes a blank line, an 80-character ``=`` rule with the current local
    time (``YYYY-mm-dd HH:MM:SS``) appended to it, and then ``info`` followed
    by ``...`` and two extra newlines.

    Args:
        info: Message text. It is concatenated with a ``str``, so passing a
            non-string raises ``TypeError``.
    """
    nowtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print("\n" + "=" * 80 + nowtime)
    print(info + "...\n\n")
# ================================================================================
# 1. Load the data
# ================================================================================
printlog("step1: reading data...")

# Build a single DataFrame from the scikit-learn breast cancer dataset,
# replacing spaces in the feature names with underscores.
breast = datasets.load_breast_cancer()
df = pd.DataFrame(
    breast.data,
    columns=[name.replace(" ", "_") for name in breast.feature_names],
)
df["label"] = breast.target

# These two columns are truncated to integers so they can be declared as
# categorical features below. A vectorised cast replaces the per-element
# ``apply(lambda x: int(x))`` and yields the same values.
categorical_features = ["mean_radius", "mean_texture"]
df = df.astype({col: "int64" for col in categorical_features})

# Split into the train / test frames used by the rest of the script.
dftrain, dftest = train_test_split(df)

lgb_train = lgb.Dataset(
    dftrain.drop(["label"], axis=1),
    label=dftrain["label"],
    categorical_feature=categorical_features,
)
# The validation set references the training set (``reference=lgb_train``).
lgb_valid = lgb.Dataset(
    dftest.drop(["label"], axis=1),
    label=dftest["label"],
    categorical_feature=categorical_features,
    reference=lgb_train,
)
# ================================================================================
# 2. Set the parameters
# ================================================================================
printlog("step2: setting parameters...")

# Number of boosting rounds and early-stopping patience handed to training.
boost_round = 50
early_stop_rounds = 10

# Booster configuration passed as the first argument of ``lgb.train``.
params = dict(
    boosting_type="gbdt",
    objective="binary",
    metric=["auc"],
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)
# ================================================================================
# 3. Train the model
# ================================================================================
printlog("step3: training model...")

# Filled in by the ``record_evaluation`` callback with the per-iteration
# metric values of every entry in ``valid_sets``.
results = {}

# The ``early_stopping_rounds=`` and ``evals_result=`` keyword arguments were
# removed from ``lgb.train`` in LightGBM 4.0; the equivalent callbacks work on
# both old and new releases.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=boost_round,
    valid_sets=[lgb_valid, lgb_train],
    valid_names=["validate", "train"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=early_stop_rounds),
        lgb.record_evaluation(results),
    ],
)
# ================================================================================
# 4. Evaluate the model
# ================================================================================
printlog("step4: evaluating model ...")

# Predict on both splits using the booster's best iteration.
train_features = dftrain.drop('label', axis=1)
test_features = dftest.drop('label', axis=1)
best_iter = gbm.best_iteration
y_pred_train = gbm.predict(train_features, num_iteration=best_iter)
y_pred_test = gbm.predict(test_features, num_iteration=best_iter)

# Predictions are thresholded at 0.5 to obtain class labels for accuracy.
train_accuracy = accuracy_score(dftrain['label'], y_pred_train > 0.5)
print('train accuracy: {:.5} '.format(train_accuracy))
valid_accuracy = accuracy_score(dftest['label'], y_pred_test > 0.5)
print('valid accuracy: {:.5} \n'.format(valid_accuracy))

# Plot the recorded metric history and the gain-based feature importance.
lgb.plot_metric(results)
lgb.plot_importance(gbm, importance_type="gain")
# ================================================================================
# 5. Save the model
# ================================================================================
printlog("step5: saving model ...")

# Despite its name, ``model_dir`` is the path of the model file itself.
model_dir = "data/gbm.model"
print("model_dir: %s" % model_dir)

# Create the parent directory first so saving does not fail on a fresh
# checkout, and reuse ``model_dir`` rather than repeating the literal path.
model_parent = os.path.dirname(model_dir)
if model_parent:
    os.makedirs(model_parent, exist_ok=True)
gbm.save_model(model_dir)

printlog("task end...")
# Draw the recorded AUC history and the gain-based feature importance.
lgb.plot_metric(booster=results, metric="auc")
lgb.plot_importance(booster=gbm, importance_type="gain")
|