波士顿住房数据集(bostonhousing.csv)包含美国人口普查局收集的美国马萨诸塞州波士顿住房价格的有关信息,数据字段及具体含义如下:
## load required libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
# Silence all warnings (e.g. deprecation notices from seaborn/sklearn)
warnings.simplefilter("ignore")
# Load the Boston housing dataset (506 rows x 14 columns, per the info() output below)
data = pd.read_csv('./bostonhousing.csv')
data.head()
| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
# Inspect dtypes and null counts; every column has 506 non-null values
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CRIM 506 non-null float64 1 ZN 506 non-null float64 2 INDUS 506 non-null float64 3 CHAS 506 non-null int64 4 NOX 506 non-null float64 5 RM 506 non-null float64 6 AGE 506 non-null float64 7 DIS 506 non-null float64 8 RAD 506 non-null int64 9 TAX 506 non-null int64 10 PTRATIO 506 non-null float64 11 B 506 non-null float64 12 LSTAT 506 non-null float64 13 MEDV 506 non-null float64 dtypes: float64(11), int64(3) memory usage: 55.5 KB
# Plot the distribution of every column to inspect skewness and outliers.
graph_by_variables = data.columns
plt.figure(figsize = (15, 18))
# Iterate the columns directly instead of a hard-coded range(0, 14),
# so the cell keeps working if the column set changes.
for i, col in enumerate(graph_by_variables):
    plt.subplot(5, 3, i + 1)
    # sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
    # histplot(kde=True) is the modern equivalent (histogram + KDE curve).
    sns.histplot(data[col], kde = True)
    plt.title(col)
plt.tight_layout()
查看变量之间相关性
# Heatmap of pairwise Pearson correlations between all 14 variables.
fig, axis = plt.subplots(figsize = (15, 15))
sns.heatmap(data.corr(), annot = True, linewidths = 0.5, fmt = '.1f', ax = axis)
<Axes: >
部分变量之间存在较强的线性相关性,比如变量RAD与变量TAX
from sklearn.preprocessing import StandardScaler

# MEDV is the usual regression target for this dataset, so cluster on the
# remaining 13 explanatory variables only.
scaler = StandardScaler()
data_s = data.drop(['MEDV'], axis = 1)
graph_by_variables = data_s.columns
# Standardize to zero mean / unit variance so no single feature dominates
# the Euclidean distances used by the clustering algorithms below.
data_s = scaler.fit_transform(data_s)
data_s[:2]
array([[-0.41978194, 0.28482986, -1.2879095 , -0.27259857, -0.14421743,
0.41367189, -0.12001342, 0.1402136 , -0.98284286, -0.66660821,
-1.45900038, 0.44105193, -1.0755623 ],
[-0.41733926, -0.48772236, -0.59338101, -0.27259857, -0.74026221,
0.19427445, 0.36716642, 0.55715988, -0.8678825 , -0.98732948,
-0.30309415, 0.44105193, -0.49243937]])
from sklearn.cluster import KMeans

# Initial K-means run with an arbitrary choice of k = 4.
k = 4
kmeans = KMeans(n_clusters = k, random_state = 2021)
km_labels = kmeans.fit_predict(data_s)

# Histogram of cluster membership: how many houses fall into each cluster.
plt.hist(km_labels, bins = range(k + 1))
plt.title('Houses per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Houses')
plt.show()
# Elbow method: plot total within-cluster SSE (inertia) for k = 1..15 and
# look for the point where the curve bends.
cluster_list = range(1, 16)
sse = [
    KMeans(n_clusters = k, random_state = 2021).fit(data_s).inertia_
    for k in cluster_list
]
plt.plot(cluster_list, sse)
plt.title('Elbow Method')
plt.xlabel('Clusters')
plt.ylabel('SSE')
plt.show()
无法观测到非常明显的肘部
from sklearn.metrics import silhouette_score

# Silhouette score for k = 2..15; higher means better-separated clusters.
cluster_list = range(2, 16)
s = [
    silhouette_score(
        data_s,
        KMeans(n_clusters = k, random_state = 2021).fit_predict(data_s),
    )
    for k in cluster_list
]
# Compare the scores in a bar chart.
plt.bar(cluster_list, s)
plt.xlabel('Number of clusters', fontsize = 10)
plt.ylabel('Silhouette Score', fontsize = 10)
plt.show()
聚类数为2时,轮廓系数最大,可作为最优聚类数
# Final K-means fit with the silhouette-optimal k = 2.
kmeans = KMeans(n_clusters = 2, random_state = 2021)
labels = kmeans.fit_predict(data_s)

# Cluster sizes.
plt.hist(labels, bins = range(3))
plt.title('Houses per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Houses')
plt.show()

# Keep the assignment as a DataFrame column for the pair plots below.
data["cluster"] = labels
聚类中心点可视化
# Compare the two cluster centers feature-by-feature (standardized scale).
plt.subplots(figsize = (10, 10))
centers = kmeans.cluster_centers_
print(centers)
# Derive bar positions from the fitted centers instead of hard-coding 13,
# so this cell stays correct if the feature set changes.
idx = np.arange(centers.shape[1])
plt.bar(idx, centers[0], color = 'b', width = 0.25, tick_label = graph_by_variables)
plt.bar(idx + 0.25, centers[1], color = 'r', width = 0.25)
plt.xticks(rotation = 90)
plt.show()
[[-0.39012396 0.26239167 -0.62036759 0.00291182 -0.58467512 0.24331476 -0.43510819 0.45722226 -0.58380115 -0.63145993 -0.28580826 0.32645106 -0.44642061] [ 0.72514566 -0.48772236 1.15311264 -0.00541237 1.086769 -0.45226302 0.80876041 -0.8498651 1.0851445 1.1737306 0.53124811 -0.60679321 0.82978746]]
重要变量上的聚类结果
# Pairwise scatter of all features, colored by the K-means cluster column.
imp_cols = [*graph_by_variables, "cluster"]
sns.pairplot(data[imp_cols], hue = "cluster")
<seaborn.axisgrid.PairGrid at 0x1c412deeed0>
from sklearn.cluster import AgglomerativeClustering

# Silhouette scores for agglomerative (hierarchical) clustering, k = 2..10.
cluster_list = range(2, 11)
s = [
    silhouette_score(data_s, AgglomerativeClustering(n_clusters = k).fit_predict(data_s))
    for k in cluster_list
]
# Compare the scores in a bar chart.
plt.bar(cluster_list, s)
plt.xlabel('Number of Clusters', fontsize = 10)
plt.ylabel('Silhouette Score', fontsize = 10)
plt.show()
# Refit hierarchical clustering with the best k = 2 and visualize the result.
hc = AgglomerativeClustering(n_clusters = 2)
data["cluster"] = hc.fit_predict(data_s)
imp_cols = ['CRIM', 'ZN', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV', 'cluster']
sns.pairplot(data[imp_cols], hue = "cluster")
<seaborn.axisgrid.PairGrid at 0x1c41d399210>
import scipy.cluster.hierarchy as sch

# Dendrogram of the Ward-linkage hierarchy over the standardized features.
plt.figure(figsize = (10, 6))
linked = sch.linkage(data_s, method = 'ward')
dendrogram = sch.dendrogram(linked)
plt.title('Dendrogram')
# Fixed copy-paste label: the samples here are houses, not customers.
plt.xlabel('Houses')
plt.ylabel('Euclidean Distances')
plt.show()
from sklearn.neighbors import NearestNeighbors

# k-distance plot (k = 2): sorted distance from each point to its nearest
# neighbor, used to choose a sensible eps for DBSCAN below.
neigh = NearestNeighbors(n_neighbors = 2)
nbrs = neigh.fit(data_s)
distances, indices = nbrs.kneighbors(data_s)
# Column 0 is each point's distance to itself (zero); keep column 1 sorted.
distances = np.sort(distances, axis = 0)[:, 1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x1c429de8750>]
Up to roughly the 500th sample, the sorted nearest-neighbor distance (the candidate eps) stays below 2.0
from sklearn.cluster import DBSCAN

# Grid-search DBSCAN over eps in [1.0, 2.0) and min_samples in 1..9,
# scoring each parameter combination by silhouette.
s = []
for eps in np.arange(1.0, 2.0, 0.1):
    for min_sample in range(1, 10):
        dbscan = DBSCAN(eps = eps, min_samples = min_sample)
        dbscan.fit(data_s)
        n_found = len(set(dbscan.labels_))
        # silhouette_score raises ValueError when DBSCAN puts every point
        # into a single cluster/noise label, so skip degenerate combinations.
        if n_found < 2:
            continue
        sil_score = silhouette_score(data_s, dbscan.labels_, metric = 'euclidean')
        s.append((eps, min_sample, sil_score, n_found))
df_dbscan = pd.DataFrame(s, columns = ['eps', 'min_samples', 'sil_score', 'number_of_clusters'])
df_dbscan.sort_values('sil_score', ascending = False).head()
| | eps | min_samples | sil_score | number_of_clusters |
|---|---|---|---|---|
| 65 | 1.7 | 3 | 0.222789 | 13 |
| 56 | 1.6 | 3 | 0.222019 | 13 |
| 66 | 1.7 | 4 | 0.219773 | 13 |
| 57 | 1.6 | 4 | 0.219012 | 13 |
| 67 | 1.7 | 5 | 0.214790 | 11 |
# Final DBSCAN fit with the best grid-search parameters (eps=1.7, min_samples=3).
best_dbscan = DBSCAN(eps = 1.7, min_samples = 3)
labels = best_dbscan.fit_predict(data_s)
data["cluster"] = labels
# Note: label -1 marks noise points that DBSCAN assigns to no cluster.
print(data["cluster"].value_counts())
cluster 0 276 9 84 -1 35 10 27 2 14 5 14 3 11 4 11 6 11 1 7 8 7 11 5 7 4 Name: count, dtype: int64