{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 机器学习与社会科学应用"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第五章 集成算法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font face=\"宋体\" >郭峰    \n",
    "    教授、博士生导师  \n",
    "上海财经大学公共经济与管理学院  \n",
    "邮箱：guofengsfi@163.com</font> "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font face=\"宋体\" >本章目录：  \n",
    "第二节  随机森林算法  \n",
    "第三节  梯度提升树算法  \n",
    "第四节  XGBoost算法</font> "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第一节 随机森林算法 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.1 导入第三方模块，调用数据集train，并查看数据集结构及前10行信息 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入相关第三方库\n",
    "import pandas as pd\n",
    "import time\n",
    "from collections import defaultdict\n",
    "# from sklearn.externals import joblib\n",
    "import joblib\n",
    "import datetime\n",
    "from sklearn.tree import DecisionTreeClassifier \n",
    "from sklearn.ensemble import RandomForestClassifier  \n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.metrics import accuracy_score\n",
    "# from sklearn.grid_search import GridSearchCV\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from numpy import *\n",
    "from sklearn.model_selection import train_test_split\n",
    "starttime = datetime.datetime.now()\n",
    "\n",
    "#导入数据\n",
    "path = \"D:/python/机器学习与社会科学应用/演示数据/05集成算法/name_and_gender/\"\n",
    "f = open(path+'train.txt',encoding='utf-8')\n",
    "data = pd.read_csv(f,header=0,sep=',') #一个竞赛网站12万样本\n",
    "data['name'] = data['name'].astype(str)\n",
    "data['gender'] = data['gender'].astype(int)\n",
    "print(data.shape)\n",
    "data.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.2 将data前1000行作为示例数据，并对其划分训练集和测试集 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#样本量太大的情况下，运行效率会比较低\n",
    "data = data[0:1000]\n",
    "# 将数据分出一部分，作为测试集，剩下的用于建模\n",
    "data_train, data_test = train_test_split(data, test_size=0.3, random_state=666)  \n",
    "print(\"随机挑选一部分进行建模：\", data_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  1.3 提取数据中关于name的相关信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 特征x是姓名用字，需要将x转换为一个数字化的向量\n",
    "# 所有姓名合并在一起，去重，构造一个姓名用字池向量\n",
    "name_vec_total = list(data_train['name'])  \n",
    "name_vec_total = list(''.join(name_vec_total))\n",
    "#print(name_vec_total[0:20])\n",
    "print(\"语料库原始总字数：\", len(name_vec_total))\n",
    "print(\"不重复字样本量：\", len(set(name_vec_total)))\n",
    "freq = defaultdict(int)\n",
    "for w in name_vec_total:\n",
    "    freq[w] += 1\n",
    "name_vec_total = [w  for w in name_vec_total if freq[w]>5]\n",
    "name_vec_total = list(set(name_vec_total)) #去重后再转换成列表\n",
    "print(\"剔除稀缺字后不重复字样本量：\",len(set(name_vec_total)))\n",
    "print(\"不重复姓名用字举例:\",name_vec_total[0:20])\n",
    "\n",
    "f = open(path+'name_vec_total_rf.txt','w',encoding='utf8')\n",
    "f.write(';'.join(name_vec_total))\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  1.4 把具体某个姓名用字用上述姓名用字池向量来表示"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 把具体某个姓名(如“建国”)的用字用上述姓名用字池向量来表示\n",
    "def words2vec(inputSet): #inputSet是待定义姓名,这个函数基于上文得到的name_total\n",
    "    returnVec = [0] * len(name_vec_total)    #获得所有单词等长的0列表\n",
    "    for word in inputSet:\n",
    "        if word in name_vec_total:\n",
    "            returnVec[name_vec_total.index(word)] += 1   #对应单词位置加1\n",
    "    return returnVec\n",
    "\n",
    "# 这个方式是在dataframe中计算\n",
    "# data_train['name_vec']=data_train['name'].apply(words2vec)\n",
    "# print(data_train['name'][11],data_train['name_vec'][11])\n",
    "\n",
    "# 也可以先转换成list后再计算\n",
    "name = list(data_train['name'])\n",
    "print(\"姓名举例:\",name[0:20])\n",
    "name_vec = [words2vec(n) for n in name]   #特征x是用向量表示的姓名，这是一个嵌套列表，会占用内存超级多\n",
    "# print(name_vec[0:2])\n",
    "\n",
    "# print(name_vec[0:5])\n",
    "# 相应y为gender，\n",
    "gender_vec = list(data_train['gender'])\n",
    "# print(gender_vec[0:5])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.5 对参数进行网格搜索和调参，并检验训练集预测准确率 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对n_estimators进行网格搜索\n",
    "param_test1 = {'n_estimators':list(range(3,50,2))}\n",
    "gsearch1 = GridSearchCV(estimator = RandomForestClassifier(oob_score=True, random_state=33), \n",
    "                       param_grid = param_test1, scoring='roc_auc',cv=5,n_jobs=-1)\n",
    "gsearch1.fit(name_vec,gender_vec)\n",
    "print(gsearch1.best_params_)\n",
    "\n",
    "#接着我们对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索\n",
    "param_test2 = {'max_depth':list(range(1,14,2)), 'min_samples_split':list(range(5,201,20))}\n",
    "gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=27,oob_score=True, random_state=33),\n",
    "   param_grid = param_test2, scoring='roc_auc',cv=5,n_jobs=-1)\n",
    "gsearch2.fit(name_vec,gender_vec)\n",
    "print(gsearch2.best_params_)\n",
    "\n",
    "#对于内部节点再划分所需最小样本数min_samples_split，我们暂时不能一起定下来，因为这个还和决策树其他的参数存在关联。\n",
    "#下面我们再对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。\n",
    "#最优min_samples_split为10，最优min_samples_split为140\n",
    "param_test3 = {'min_samples_split':list(range(80,150,20)), 'min_samples_leaf':list(range(10,60,10))}\n",
    "gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=13, max_depth=9,\n",
    "                                  max_features='sqrt', oob_score=True, random_state=10),\n",
    "                                  param_grid = param_test3, scoring='roc_auc',cv=5,n_jobs=-1)\n",
    "gsearch3.fit(name_vec,gender_vec)\n",
    "print(gsearch3.best_params_)\n",
    "\n",
    "#最后我们再对最大特征数max_features做调参: 基本上也是越大越好，但差别不大，取11\n",
    "param_test4 = {'max_features':list(range(3,20,2))}\n",
    "gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=20, max_depth=13, min_samples_split=140,\n",
    "                                      min_samples_leaf=10 ,oob_score=True, random_state=10),param_grid = param_test4, scoring='roc_auc',cv=5,n_jobs=-1)\n",
    "gsearch4.fit(name_vec,gender_vec)\n",
    "print(gsearch4.best_params_)\n",
    "\n",
    "rf_clf = RandomForestClassifier(n_estimators=40, max_depth=13, min_samples_split=140,\n",
    "                                  min_samples_leaf=10,max_features=11,oob_score=True, random_state=10)\n",
    "rf_clf.fit(name_vec,gender_vec)\n",
    "print(\"验证集预测准确率:\",rf_clf.oob_score_)\n",
    "\n",
    "joblib.dump(rf_clf,path+'random_forest'+'.model')  #模型的保存"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.6 在测试集中进行测试 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#测试集测试\n",
    "data_test, data_test2 = train_test_split(data_test, test_size=0.95) #测试集可能会太大了\n",
    "name_new = list(data_test['name']) \n",
    "x_test = [words2vec(n) for n in name_new]   \n",
    "y_test = list(data_test['gender'])\n",
    "y_pred_new = rf_clf.predict(x_test)\n",
    "print(\"随机森林测试集正确率 {:05.2f}%\" .format(100*(1-(sum(array(y_pred_new)!=array(y_test))/len(y_test)))))\n",
    "\n",
    "endtime = datetime.datetime.now()\n",
    "print(\"运行时间:\",(endtime - starttime).seconds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.7 网格搜索调参"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入相关第三方库\n",
    "import pandas as pd\n",
    "import time\n",
    "from collections import defaultdict\n",
    "#from sklearn.externals import joblib\n",
    "import joblib\n",
    "import datetime\n",
    "from sklearn.tree import DecisionTreeClassifier \n",
    "from sklearn.ensemble import RandomForestClassifier  \n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.metrics import accuracy_score\n",
    "#from sklearn.grid_search import GridSearchCV\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from numpy import *\n",
    "from sklearn.model_selection import train_test_split\n",
    "starttime = datetime.datetime.now()\n",
    "\n",
    "#导入数据\n",
    "path = \"D:/python/机器学习与社会科学应用/演示数据/05集成算法/name_and_gender/\"\n",
    "f = open(path+'train.txt',encoding='utf-8')\n",
    "data = pd.read_csv(f,header=0,sep=',') #一个竞赛网站12万样本\n",
    "data['name'] = data['name'].astype(str)\n",
    "data['gender'] = data['gender'].astype(int)\n",
    "print(data.shape)\n",
    "data.head(10)\n",
    "\n",
    "###通过这里调节样本数量，测试运行效率\n",
    "data = data[0:1000]\n",
    "#将数据分出一部分，作为测试集，剩下的用于建模\n",
    "data_train,data_test = train_test_split(data,test_size=0.3,random_state=666)  \n",
    "print(\"随机挑选一部分进行建模：\",data_train.shape)\n",
    "\n",
    "#特征x是姓名用字，需要将x转换为一个数字化的向量\n",
    "#所有姓名合并在一起，去重，构造一个姓名用字池向量\n",
    "name_vec_total = list(data_train['name'])  \n",
    "name_vec_total = list(''.join(name_vec_total))\n",
    "#print(name_vec_total[0:20])\n",
    "print(\"语料库原始总字数：\",len(name_vec_total))\n",
    "print(\"不重复字样本量：\",len(set(name_vec_total)))\n",
    "freq = defaultdict(int)\n",
    "for w in name_vec_total:\n",
    "    freq[w] += 1\n",
    "name_vec_total = [w  for w in name_vec_total if freq[w]>5]\n",
    "name_vec_total = list(set(name_vec_total)) #去重后再转换成列表\n",
    "print(\"剔除稀缺字后不重复字样本量：\",len(set(name_vec_total)))\n",
    "print(\"不重复姓名用字举例:\",name_vec_total[0:20])\n",
    "\n",
    "f = open(path+'name_vec_total_rf.txt','w',encoding='utf8')\n",
    "f.write(';'.join(name_vec_total))\n",
    "f.close()\n",
    "\n",
    "#把具体某个姓名(如“建国”)的用字用上述姓名用字池向量来表示\n",
    "def words2vec(inputSet): #inputSet是待定义姓名,这个函数基于上文得到的name_total\n",
    "    returnVec = [0] * len(name_vec_total)    #获得所有单词等长的0列表\n",
    "    for word in inputSet:\n",
    "        if word in name_vec_total:\n",
    "            returnVec[name_vec_total.index(word)] += 1   #对应单词位置加1\n",
    "    return returnVec\n",
    "\n",
    "#这个方式是在dataframe中计算\n",
    "#data_train['name_vec']=data_train['name'].apply(words2vec)\n",
    "#print(data_train['name'][11],data_train['name_vec'][11])\n",
    "\n",
    "#也可以先转换成list后再计算\n",
    "name = list(data_train['name'])\n",
    "print(\"姓名举例:\",name[0:20])\n",
    "name_vec = [words2vec(n) for n in name]   #特征x是用向量表示的姓名，这是一个嵌套列表，会占用内存超级多\n",
    "#print(name_vec[0:2])\n",
    "\n",
    "#print(name_vec[0:5])\n",
    "#相应y为gender，\n",
    "gender_vec = list(data_train['gender'])\n",
    "#print(gender_vec[0:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#网格搜索，时间消耗的太久了\n",
    "param_test6 = {'n_estimators':list(range(3,50,2)),'max_depth':list(range(1,14,2)), 'min_samples_split':list(range(5,201,10)),\n",
    "              'min_samples_leaf':list(range(10,60,10)),'max_features':list(range(3,20,2))}\n",
    "gsearch6 = GridSearchCV(estimator = RandomForestClassifier(oob_score=True, random_state=33), \n",
    "                       param_grid = param_test6, scoring='roc_auc',cv=5,n_jobs=-1)\n",
    "gsearch6.fit(name_vec,gender_vec)\n",
    "print(gsearch6.best_params_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第二节 梯度提升树 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.1 导入第三方模块和数据 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#数据说明:\n",
    "#新能源汽车充电桩的故障检测问题，提供85500条训练数据（标签：0代表充电桩正常，1代表充电桩有故障）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读入数据\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import datetime\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "\n",
    "path = \"D:/python/机器学习与社会科学应用/演示数据/05集成算法/\"\n",
    "charging_pile = pd.read_csv(path+\"charging_pile.csv\",encoding='utf-8')\n",
    "#charging_pile = pd.read_csv(f,header=0,sep=',')\n",
    "print(charging_pile.shape)\n",
    "charging_pile.head()\n",
    "#s1-s6的含义参阅：https://blog.csdn.net/gb4215287/article/details/105184238/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.2 定义特征变量和响应变量，划分测试集和训练集，并进行训练 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#区分x和y\n",
    "x_columns = []\n",
    "for x in charging_pile.columns:\n",
    "    if x not in ['id', 'label']:\n",
    "        x_columns.append(x)\n",
    "X = charging_pile[x_columns]\n",
    "y = charging_pile['label']\n",
    "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n",
    "print(x_train.shape)\n",
    "print(x_test.shape)\n",
    "\n",
    "# 模型训练，使用GBDT算法\n",
    "gbr = GradientBoostingClassifier(n_estimators=3000, max_depth=2, min_samples_split=2, learning_rate=0.1)\n",
    "gbr.fit(x_train, y_train.ravel())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.3 查看训练集和测试集的准确率 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#训练和验证的准确率\n",
    "y_gbr = gbr.predict(x_train)\n",
    "y_gbr1 = gbr.predict(x_test)\n",
    "acc_train = gbr.score(x_train, y_train)\n",
    "acc_test = gbr.score(x_test, y_test)\n",
    "print(acc_train)\n",
    "print(acc_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.4 GBDT算法参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#GBDT分类算法参数\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "GradientBoostingClassifier()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "主要的几个参数：\n",
    "1.criterion参数：指特征选择的标准，我们就选择默认即可。\n",
    "2.init参数：指是否用该参数提供的弱分类器来进行预测，默认为None，即使用原始样本集来进行预测。\n",
    "3.learning_rate：习率，指弱分类器的系数。\n",
    "4.loss：指损失函数的类型，默认为deviance，即使用对数似然函数；也可以选择exponential，即指数损失函数。\n",
    "5.subsample：指采样的比例，在0-1之间，默认为1，即不采样，使用全部样本；小于1，意味着只有一部分参与了模型的拟合。\n",
    "6.n_estimators：指弱分类器的个数，默认为100."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#GBDT回归模型的参数\n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "GradientBoostingRegressor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#回归模型的参数大部分与分类相同，只有损失函数采用的不同\n",
    "#回归模型的损失函数主要有4种，默认为ls（标准差函数）。还有三种分别是lad（绝对损失函数）、huber和quantile（分位损失函数）。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第三节 XGBoost "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.1 导入第三方模块，并且导入数据 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "import matplotlib as mlp\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import time\n",
    "import os\n",
    "\n",
    "import xgboost as xgb #导入成功则说明安装正确\n",
    "xgb.__version__\n",
    "\n",
    "from xgboost import XGBRegressor\n",
    "from sklearn.model_selection import cross_validate, KFold\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "path = \"D:/python/机器学习与社会科学应用/演示数据/05集成算法/\"\n",
    "data = pd.read_csv(path+\"house.csv\",index_col=0)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.2 指定特征变量与响应变量，并设置训练集和测试集 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data.iloc[:,:-1]\n",
    "y = data.iloc[:,-1]\n",
    "\n",
    "x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=14)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3 使用XGBoost算法和XGBoost交叉验证算法 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# XGBoost算法：回归\n",
    "xgb = XGBRegressor(random_state=12)\n",
    "xgb.fit(x_train,y_train)\n",
    "xgb.score(x_test,y_test) #默认指标R2\n",
    "\n",
    "# XGBoost交叉验证算法\n",
    "cv = KFold(n_splits=5,shuffle=True,random_state=14)\n",
    "\n",
    "cv_xgb = cross_validate(xgb,X,y,cv=cv,scoring=\"neg_root_mean_squared_error\",return_train_score=True,verbose=True,n_jobs=-1)\n",
    "\n",
    "cv_xgb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.4  由于XGBoost算法不稳定、过拟合严重等问题，通过限制Max_depth进行缓解"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def RMSE(result,name):\n",
    "    return abs(result[name].mean())\n",
    "\n",
    "# 训练集上RMSE\n",
    "RMSE(cv_xgb,\"train_score\")\n",
    "\n",
    "# 测试集上RMSE\n",
    "RMSE(cv_xgb,\"test_score\")\n",
    "\n",
    "xgb_depth = XGBRegressor(max_depth=5,random_state=14) #实例化\n",
    "cv_xgb_depth = cross_validate(xgb_depth,X,y,cv=cv\n",
    "                               ,scoring=\"neg_root_mean_squared_error\" #负根均方误差\n",
    "                               ,return_train_score=True\n",
    "                               ,verbose=True\n",
    "                               ,n_jobs=-1)\n",
    "\n",
    "RMSE(cv_xgb_depth,\"train_score\")\n",
    "\n",
    "RMSE(cv_xgb_depth,\"test_score\")\n",
    "\n",
    "xgb_depth = XGBRegressor(max_depth=5,random_state=14).fit(X,y)\n",
    "#查看特征重要性\n",
    "xgb_depth.feature_importances_\n",
    "\n",
    "#获取每一个参数的取值\n",
    "xgb_depth.get_params()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.5 二分类问题：乳腺癌数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score as ACC\n",
    "from sklearn.metrics import log_loss as logloss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "from sklearn.datasets import load_breast_cancer, load_digits\n",
    "# 二分类问题\n",
    "X = load_breast_cancer().data\n",
    "y = load_breast_cancer().target\n",
    "data_binary = xgb.DMatrix(X,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 二分类参数用logloss交叉熵损失\n",
    "params1 = {\"seed\":14, \"objective\":\"binary:logistic\", \"eval_metric\":\"logloss\"}\n",
    "clf_binary = xgb.train(params1, data_binary, num_boost_round=100)\n",
    "\n",
    "y_pred_binary = clf_binary.predict(data_binary)\n",
    "\n",
    "# 二分类返回概率，可以转换成0、1变量\n",
    "y_pred_binary[:20] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(y_pred_binary > 0.5).astype(\"int\")\n",
    "\n",
    "# 数据较简单，分类准确率100%\n",
    "ACC(y,(y_pred_binary > 0.5).astype(int))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.6 多分类问题：手写数字识别"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 多分类问题\n",
    "X = load_digits().data\n",
    "y = load_digits().target\n",
    "data_multi = xgb.DMatrix(X, y)\n",
    "\n",
    "# 多分类参数用mlogloss交叉熵损失，num_class是分类类别\n",
    "params2 = {\"seed\":1412, \"objective\":\"multi:softmax\", \"eval_metric\":\"mlogloss\" ,\"num_class\":10}\n",
    "clf_multi = xgb.train(params2, data_multi, num_boost_round=100)\n",
    "\n",
    "y_pred_multi = clf_multi.predict(data_multi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_multi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 数据较简单，分类准确率100%\n",
    "ACC(y, y_pred_multi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 本章结束"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
