# test.py — LightGBM binary-classification demo on the sklearn breast-cancer dataset.
# (Cleaned of pasted web-viewer line numbering.)
  1. import datetime
  2. import numpy as np
  3. import pandas as pd
  4. import lightgbm as lgb
  5. from sklearn import datasets
  6. from sklearn.model_selection import train_test_split
  7. from sklearn.metrics import accuracy_score
  8. def printlog(info):
  9. nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  10. print("\n" + "==========" * 8 + "%s" % nowtime)
  11. print(info + '...\n\n')
  12. # ================================================================================
  13. # 一,读取数据
  14. # ================================================================================
  15. printlog("step1: reading data...")
  16. # 读取dftrain,dftest
  17. breast = datasets.load_breast_cancer()
  18. df = pd.DataFrame(breast.data, columns=[x.replace(' ', '_') for x in breast.feature_names])
  19. df['label'] = breast.target
  20. df['mean_radius'] = df['mean_radius'].apply(lambda x: int(x))
  21. df['mean_texture'] = df['mean_texture'].apply(lambda x: int(x))
  22. dftrain, dftest = train_test_split(df)
  23. categorical_features = ['mean_radius', 'mean_texture']
  24. lgb_train = lgb.Dataset(dftrain.drop(['label'], axis=1), label=dftrain['label'],
  25. categorical_feature=categorical_features)
  26. lgb_valid = lgb.Dataset(dftest.drop(['label'], axis=1), label=dftest['label'],
  27. categorical_feature=categorical_features,
  28. reference=lgb_train)
  29. # ================================================================================
  30. # 二,设置参数
  31. # ================================================================================
  32. printlog("step2: setting parameters...")
  33. boost_round = 50
  34. early_stop_rounds = 10
  35. params = {
  36. 'boosting_type': 'gbdt',
  37. 'objective': 'binary',
  38. 'metric': ['auc'],
  39. 'num_leaves': 31,
  40. 'learning_rate': 0.05,
  41. 'feature_fraction': 0.9,
  42. 'bagging_fraction': 0.8,
  43. 'bagging_freq': 5,
  44. 'verbose': 0
  45. }
  46. # ================================================================================
  47. # 三,训练模型
  48. # ================================================================================
  49. printlog("step3: training model...")
  50. results = {}
  51. gbm = lgb.train(params,
  52. lgb_train,
  53. num_boost_round=boost_round,
  54. valid_sets=(lgb_valid, lgb_train),
  55. valid_names=('validate', 'train'),
  56. early_stopping_rounds=early_stop_rounds,
  57. evals_result=results)
  58. # ================================================================================
  59. # 四,评估模型
  60. # ================================================================================
  61. printlog("step4: evaluating model ...")
  62. y_pred_train = gbm.predict(dftrain.drop('label', axis=1), num_iteration=gbm.best_iteration)
  63. y_pred_test = gbm.predict(dftest.drop('label', axis=1), num_iteration=gbm.best_iteration)
  64. print('train accuracy: {:.5} '.format(accuracy_score(dftrain['label'], y_pred_train > 0.5)))
  65. print('valid accuracy: {:.5} \n'.format(accuracy_score(dftest['label'], y_pred_test > 0.5)))
  66. lgb.plot_metric(results)
  67. lgb.plot_importance(gbm, importance_type="gain")
  68. # ================================================================================
  69. # 五,保存模型
  70. # ================================================================================
  71. printlog("step5: saving model ...")
  72. model_dir = "data/gbm.model"
  73. print("model_dir: %s" % model_dir)
  74. gbm.save_model("data/gbm.model")
  75. printlog("task end...")
  76. ###
  77. ##
  78. #
  79. lgb.plot_metric(results, metric="auc")
  80. lgb.plot_importance(gbm, importance_type="gain")