{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 机器学习与社会科学应用"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第六章 无监督学习算法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第三节 PCA降维"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font face=\"宋体\" >郭峰    \n",
    "    教授、博士生导师  \n",
    "上海财经大学公共经济与管理学院  \n",
    "上海财经大学数实融合与智能治理实验室    \n",
    "    邮箱：guofengsfi@163.com</font> "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入库\n",
    "# 用于3D可视化\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "# 用于可视化图表\n",
    "import matplotlib.pyplot as plt\n",
    "# 用于做科学计算\n",
    "import numpy as np\n",
    "# 用于做数据分析\n",
    "import pandas as pd\n",
    "# 用于加载数据或生成数据等\n",
    "from sklearn import datasets\n",
    "# 导入PCA库\n",
    "from sklearn.decomposition import PCA\n",
    "%matplotlib inline "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入数据集\n",
    "iris = datasets.load_iris()\n",
    "iris_X = iris.data   ##获得数据集中的输入\n",
    "iris_y = iris.target ##获得数据集中的输出，即标签(也就是类别)\n",
    "print(iris_X.shape)\n",
    "print(iris.feature_names)\n",
    "print(iris.target_names)\n",
    "\n",
    "# iris数据集共有150个样本，每个样本有四个特征(四维)\n",
    "# 分别是萼片长度(sepal length)，萼片宽度(sepal width)，花瓣长度(petal length)，花瓣宽度(petal width)。 \n",
    "# 标签有三种，分别是setosa，versicolor和virginica\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PCA降维\n",
    "# 加载PCA模型并训练、降维\n",
    "pca = PCA(n_components=3)\n",
    "X_pca = pca.fit(iris_X).transform(iris_X)\n",
    "print(iris_X.shape)\n",
    "print(iris_X[0:5])\n",
    "print(X_pca.shape)\n",
    "print(X_pca[0:5])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 四维的样本变为了三维的。让我们分别看看四维和三维时的方差分布\n",
    "# 4维时\n",
    "pca = PCA(n_components=4)\n",
    "X_pca = pca.fit(iris_X).transform(iris_X)\n",
    "print(\"各主成分方向：\\n\",pca.components_)\n",
    "print(\"各主成分的方差值：\",pca.explained_variance_)\n",
    "print(\"各主成分的方差值与总方差之比：\",pca.explained_variance_ratio_)\n",
    "#print(\"奇异值分解后得到的特征值：\",pca.singular_values_)\n",
    "print(\"主成分数：\",pca.n_components_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 三维时\n",
    "pca = PCA(n_components=3)\n",
    "X_pca = pca.fit(iris_X).transform(iris_X)\n",
    "print(\"降维后各主成分方向：\\n\",pca.components_)\n",
    "print(\"降维后各主成分的方差值：\",pca.explained_variance_)\n",
    "print(\"降维后各主成分的方差值与总方差之比：\",pca.explained_variance_ratio_)\n",
    "#print(\"奇异值分解后得到的特征值：\",pca.singular_values_)\n",
    "print(\"降维后主成分数：\",pca.n_components_)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 从四维降到三维，也就是将四维时，主成分方差值(方差值与总方差之比)最小的那个成分给去掉了。选取的是前三个最大的特征值。\n",
    "# 我们可以看看用图来看看三维的点的情况\n",
    "fig = plt.figure(figsize=(10,8))\n",
    "ax = Axes3D(fig,rect=[0, 0, 1, 1], elev=30, azim=20)\n",
    "ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], marker='o',c=iris_y)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 也可以通过固定elev的值，改变azim的值来看看将这些点投影到各个平面的情况\n",
    "fig = plt.figure(figsize=(10,8))\n",
    "# 固定elev=0，改变azim为0，90，180，270\n",
    "ax = Axes3D(fig,rect=[0, 0, 1, 1], elev=90, azim=0)\n",
    "ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], marker='o',c=iris_y)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(10,8))\n",
    "# 固定elev=90，改变azim为0，90，180，270\n",
    "ax = Axes3D(fig,rect=[0, 0, 1, 1], elev=90, azim=0)\n",
    "ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], marker='o',c=iris_y)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 还可以在看看降维到二维的情况\n",
    "pca = PCA(n_components=2)\n",
    "X_pca =pca.fit(iris_X).transform(iris_X)\n",
    "print(\"降维后各主成分方向：\\n\",pca.components_)\n",
    "print(\"降维后各主成分的方差值：\",pca.explained_variance_)\n",
    "print(\"降维后各主成分的方差值与总方差之比：\",pca.explained_variance_ratio_)\n",
    "#print(\"奇异值分解后得到的特征值：\",pca.singular_values_)\n",
    "print(\"降维后主成分数：\",pca.n_components_)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 降到二维，其实就是取了方差值(方差值与总方差之比)最大的前两个主成分\n",
    "fig = plt.figure(figsize=(10,8))\n",
    "plt.scatter(X_pca[:, 0], X_pca[:, 1],marker='o',c=iris_y)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font face=\"微软雅黑\" size=3>本节结束</font> "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
