{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 机器学习与社会科学应用"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第六章 无监督学习算法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第二节 K-means聚类"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font face=\"宋体\" >郭峰    \n",
    "    教授、博士生导师  \n",
    "上海财经大学公共经济与管理学院  \n",
    "上海财经大学数实融合与智能治理实验室    \n",
    "    邮箱：guofengsfi@163.com</font> "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font face=\"宋体\" >本节目录  \n",
    "2.1.演示性案例  \n",
    "2.2.手写数字识别  \n",
    "2.3.K-meas参数 </font> "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.1. 演示性案例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "# 加载鸢尾花数据集\n",
    "iris = load_iris()\n",
    "X = iris.data  # 特征矩阵\n",
    "X[0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 创建K-means模型\n",
    "kmeans = KMeans(n_clusters=3, random_state=0)\n",
    "\n",
    "# 拟合模型到数据\n",
    "kmeans.fit(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 获取聚类中心点的坐标\n",
    "cluster_centers = kmeans.cluster_centers_\n",
    "print(\"Cluster centers:\")\n",
    "print(cluster_centers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 预测每个数据点的簇标签\n",
    "labels = kmeans.labels_\n",
    "print(\"Labels:\")\n",
    "print(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 可视化聚类结果\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels)\n",
    "plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], marker='x', s=200, linewidths=3, color='r')\n",
    "plt.xlabel(\"Feature 1\")\n",
    "plt.ylabel(\"Feature 2\")\n",
    "plt.title(\"K-means Clustering\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.2 手写数字识别"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy import *\n",
    "import os\n",
    "from os import listdir\n",
    "import operator\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn import metrics\n",
    "\n",
    "#步骤1：路径与标签\n",
    "path1='D:/python/机器学习与社会科学应用/演示数据/06无监督学习/digits/trainingDigits/'\n",
    "path2='D:/python/机器学习与社会科学应用/演示数据/06无监督学习/digits/testDigits/'\n",
    "print(path1)\n",
    "#将训练数据存储到一个矩阵中1024维，并存储对应的标签\n",
    "trainName=listdir(path1)\n",
    "trainNum=len(trainName)\n",
    "trainNumpy = zeros((trainNum,1024))\n",
    "#print(\"trainNum=%d\"%trainNum)\n",
    "#对文件名进行分析，训练文本对应的标签\n",
    "print(trainNum)\n",
    "print(trainName[0])\n",
    "\n",
    "\n",
    "#将测试数据存储到一个矩阵中1024维，并存储对应的标签\n",
    "testName=listdir(path2)\n",
    "testNum=len(testName)\n",
    "testNumpy = zeros((testNum,1024))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#步骤2：将像素数据转换成向量数据\n",
    "#这个函数是1024长度的向量\n",
    "def img2vector(filename):\n",
    "    vect=zeros((1,1024))\n",
    "    f=open(filename)\n",
    "    for i in range(32):\n",
    "        line=f.readline()\n",
    "        for j in range(32):\n",
    "            vect[0,32*i+j]=int(line[j])\n",
    "    return vect\n",
    "handlabel=[]\n",
    "\n",
    "#训练集\n",
    "for i in range(trainNum):\n",
    "    filename=trainName[i]#文件名\n",
    "    filestr=filename.split('.')[0]#不带后缀的文件名\n",
    "    filelabel=int(filestr.split('_')[0])#文件的标签,0,1,2,....,9\n",
    "    #将标签添加至handlabel中\n",
    "    handlabel.append(filelabel)\n",
    "    trainNumpy[i,:]=img2vector(path1+str(filename))#转成1024\n",
    "    #print(handlabel[:20])\n",
    "\n",
    "print(trainNumpy[0][0:40])\n",
    "print(trainNumpy.shape[0])\n",
    "print(handlabel[0:40])\n",
    "print(len(handlabel))\n",
    "\n",
    "\n",
    "handlabel_test=[]\n",
    "for i in range(testNum):\n",
    "    filename_test=testName[i]#文件名\n",
    "    filestr_test=filename_test.split('.')[0]#不带后缀的文件名\n",
    "    filelabel_test=int(filestr_test.split('_')[0])#文件的标签,0,1,2,....,9\n",
    "    #将标签添加至handlabel中\n",
    "    handlabel_test.append(filelabel_test)\n",
    "    testNumpy[i,:]=img2vector(path1+str(filename_test))#转成1024"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train=handlabel\n",
    "x_train=trainNumpy  #列表，1934个元素，每个元素又是一个1024长度的列表\n",
    "print(len(x_train))\n",
    "print(len(y_train))\n",
    "kmeans = KMeans(n_clusters=10, random_state=100)\n",
    "kmeans.fit(x_train)\n",
    "#测试集\n",
    "print(y_train[0:189])\n",
    "y_pred = kmeans.predict(x_train)\n",
    "print(y_pred[0:189])\n",
    "print(1-len([y for y in y_pred[0:189] if y!=8])/len(y_train[0:189]))\n",
    "#print(adjusted_rand_score(y_train, y_pred))\n",
    "#随机试验中，把0可能会随机分配给某个\"标签\"\n",
    "\n",
    "#结果评价：根据已知分类标签\n",
    "for num in range(0,10):\n",
    "    x_train_temp=[x_train[j] for j in [i for i,y in enumerate(y_train) if y==num]]\n",
    "    y_train_temp=[y for y in y_train if y==num]\n",
    "    y_pred_temp=kmeans.predict(x_train_temp)\n",
    "    y_pred_dict=dict((a,list(y_pred_temp).count(a)) for a in y_pred_temp)\n",
    "    y_pred_value=[k for k,v in y_pred_dict.items() if max(y_pred_dict.values())==v]\n",
    "    print(\"数值%d对应的预测\\\"聚类代码\\\"为:%d,准确率为：%.4f\"\n",
    "          %(num,y_pred_value[0],sum(y_pred_temp==y_pred_value[0])/len(y_train_temp)))\n",
    "#3和9没有区分开\n",
    "#1和8没有区分开"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans.get_params()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#结果评价：兰德指数\n",
    "#https://blog.csdn.net/sinat_26917383/article/details/70577710"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for k in range(2,15):\n",
    "    kmeans = KMeans(n_clusters=k)\n",
    "    kmeans.fit(x_train)\n",
    "    y_pred = kmeans.predict(x_train)\n",
    "    print(k,metrics.adjusted_rand_score(y_train, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#轮廓系数\n",
    "for k in range(2,15):\n",
    "    kmeans = KMeans(n_clusters=k)\n",
    "    kmeans.fit(x_train)\n",
    "    y_pred = kmeans.predict(x_train)\n",
    "    print(k,metrics.silhouette_score(x_train, y_pred, metric='euclidean'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.3. K-Means参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# n_clusters：整型，缺省值=8 ，生成的聚类数。\n",
    "# max_iter：整型，缺省值=300 。执行一次k-means算法所进行的最大迭代数。 \n",
    "# n_init：整型，缺省值=10 。用不同的聚类中心初始化值运行算法的次数，最终解是在inertia意义下选出的最优结果。 \n",
    "# init：有三个可选值：’k-means++’， ‘random’，或者传递一个ndarray向量。 此参数指定初始化方法，默认值为 ‘k-means++’。 \n",
    "# 　　（１）‘k-means++’ 用一种特殊的方法选定初始聚类中发，可加速迭代过程的收敛。\n",
    "# 　　（２）‘random’ 随机从训练数据中选取初始质心。 \n",
    "# 　　（３）如果传递的是一个ndarray，则应该形如 (n_clusters, n_features) 并给出初始质心。 \n",
    "# precompute_distances：三个可选值，‘auto’，True 或者 False。 预计算距离，计算速度更快但占用更多内存。 \n",
    "# 　　（１）‘auto’：如果 样本数乘以聚类数大于 12million 的话则不预计算距离。\n",
    "# 　　（２）True：总是预先计算距离。 \n",
    "# 　　（３）False：永远不预先计算距离。 \n",
    "# tol：float类型，默认值= 1e-4　与inertia结合来确定收敛条件。 \n",
    "# n_jobs：整形数。　指定计算所用的进程数。内部原理是同时进行n_init指定次数的计算。 \n",
    "# 　　（１）若值为 -1，则用所有的CPU进行运算。若值为1，则不进行并行运算。\n",
    "# 　　（２）若值小于-1，则用到的CPU数为(n_cpus + 1 + n_jobs)。因此如果 n_jobs值为-2，则用到的CPU数为总CPU数减1。 \n",
    "# random_state：整型或 numpy.RandomState 类型，可选 用于初始化质心的生成器（generator）。如果值为一个整数，则确定一个seed。此参数默认值为numpy的随机数生成器。 \n",
    "# copy_x：布尔型，默认值=True 　　\n",
    "# 当我们precomputing distances时，将数据中心化会得到更准确的结果。如果把此参数值设为True，则原始数据不会被改变。如果是False，则会直接在原始数据 \n",
    "# 上做修改并在函数返回值时将其还原。但是在计算过程中由于有对数据均值的加减运算，所以数据返回后，原始数据和计算前可能会有细小差别。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font face=\"微软雅黑\" size=3>本节结束</font> "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
