email数据集(email.csv)整理自某邮箱账户2012年前3个月收到的所有邮件,邮件按是否为垃圾邮件进行标注,邮件内容也已进行预处理与特征提取。数据字段及具体含义如下:
## load required libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")
# Load the preprocessed email dataset from the working directory.
data = pd.read_csv("./email.csv")
# Preview the first rows, then the column dtypes and non-null counts.
data.head()
data.info()
将二分类变量映射为1/0,例如将yes映射为1、no映射为0
# Encode the binary yes/no columns as 1/0 so the models can consume them.
yes_no_map = {
    'yes': 1,
    'no': 0,
}
# All columns that use the yes/no encoding; mapping them in a loop avoids
# repeating the identical `.map(yes_no_map)` statement thirteen times.
binary_columns = [
    'to_multiple', 'from', 'cc', 'sent_email', 'image', 'attach',
    'dollar', 'winner', 'inherit', 'password', 're_subj',
    'exclaim_subj', 'urgent_subj',
]
for col in binary_columns:
    # NOTE(review): values outside {'yes', 'no'} become NaN under .map —
    # verify every listed column really is yes/no in email.csv.
    data[col] = data[col].map(yes_no_map)
# Inspect the distribution of the 'format' column, then binarize it:
# HTML messages become 1, plain-text messages become 0.
data['format'].value_counts()
format_map = {'HTML': 1, 'Plain': 0}
data['format'] = data['format'].map(format_map)
# number
# Inspect the category frequencies of the multi-class 'number' column
# before replacing it with dummy variables below.
data['number'].value_counts()
使用哑变量表示多类别变量
# Expand the multi-category 'number' column into one-hot indicator
# columns prefixed with 'number_', then discard the original column.
number_dummies = pd.get_dummies(data['number'], prefix='number')
data = data.join(number_dummies).drop(['number'], axis=1)
data.head()
# Check the label values, then count how many emails fall in each spam
# class and visualize the class (im)balance with a bar chart.
print(data['spam'].unique())
data_label_count = (
    data.groupby('spam')['cc']
        .count()
        .sort_values(ascending=False)
)
print(data_label_count)
data_label_count.plot.bar()
plt.show()
import seaborn as sns

# Plot the marginal distribution of the first 21 columns on a 7x3 grid.
# NOTE(review): the 21 here is hard-coded — confirm it still matches the
# column count after the dummy expansion above.
graph_by_variables = data.columns
plt.figure(figsize=(15, 18))
for idx, column in enumerate(graph_by_variables[:21]):
    plt.subplot(7, 3, idx + 1)
    sns.distplot(data[column])
    plt.title(column)
plt.tight_layout()

# Pairwise correlation heatmap over all (now numeric) columns.
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(data.corr(), annot=True, linewidths=0.5, fmt='.1f', ax=ax)
from sklearn.model_selection import train_test_split

# Separate features from the 'spam' label and hold out 20% of the rows
# as a test set (an 80/20, i.e. 4:1, split); the fixed random_state
# makes the split reproducible.
X = data.drop(['spam'], axis=1)
y = data['spam']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=628
)
print(len(X_train), len(X_test))
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression baseline with default hyperparameters.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Predict on the held-out test set and report the classification metrics.
y_hat_test = logreg.predict(X_test)

# NOTE(review): AUC computed from hard 0/1 predictions is not a true
# ROC AUC — use predict_proba scores for that.
print("Logistic regression score for test set:")
# Fixed typo in the output label: "Predicion" -> "Precision".
print("Precision: {:.3f}".format(precision_score(y_test, y_hat_test)),
      "Recall: {:.3f}".format(recall_score(y_test, y_hat_test)))
print("F1 score: {:.3f}".format(f1_score(y_test, y_hat_test)))
print("AUC score: {:.3f}".format(roc_auc_score(y_test, y_hat_test)))
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes classifier (suited to the 0/1-encoded
# features) with default parameters.
clf = BernoulliNB()
clf.fit(X_train, y_train)

# Predict on the held-out test set and report the classification metrics.
y_hat_test = clf.predict(X_test)
print("Naive Bayes score for test set:")
# Fixed typo in the output label: "Predicion" -> "Precision".
print("Precision: {:.3f}".format(precision_score(y_test, y_hat_test)),
      "Recall: {:.3f}".format(recall_score(y_test, y_hat_test)))
print("F1 score: {:.3f}".format(f1_score(y_test, y_hat_test)))
print("AUC score: {:.3f}".format(roc_auc_score(y_test, y_hat_test)))
from sklearn.tree import DecisionTreeClassifier

# Fit an unconstrained decision tree; with default parameters it grows
# until leaves are pure, which tends to overfit (see the depth-limited
# variant below).
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Predict on the held-out test set and report the classification metrics.
y_hat_test = dt.predict(X_test)
print("Decision tree score for test set:")
# Fixed typo in the output label: "Predicion" -> "Precision".
print("Precision: {:.3f}".format(precision_score(y_test, y_hat_test)),
      "Recall: {:.3f}".format(recall_score(y_test, y_hat_test)))
print("F1 score: {:.3f}".format(f1_score(y_test, y_hat_test)))
print("AUC score: {:.3f}".format(roc_auc_score(y_test, y_hat_test)))

from sklearn import tree
import graphviz

# Render the fitted tree; class_names=True labels leaves with the string
# form of the class values.
dot_data = tree.export_graphviz(dt, feature_names=X.columns, filled=True,
                                class_names=True, out_file=None)
graph = graphviz.Source(dot_data)
graph
from sklearn.tree import DecisionTreeClassifier

# Refit the decision tree with max_depth=4 to limit overfitting and keep
# the rendered tree readable.
dt = DecisionTreeClassifier(max_depth=4)
dt.fit(X_train, y_train)

# Predict on the held-out test set and report the classification metrics.
y_hat_test = dt.predict(X_test)
print("Decision tree score for test set:")
# Fixed typo in the output label: "Predicion" -> "Precision".
print("Precision: {:.3f}".format(precision_score(y_test, y_hat_test)),
      "Recall: {:.3f}".format(recall_score(y_test, y_hat_test)))
print("F1 score: {:.3f}".format(f1_score(y_test, y_hat_test)))
print("AUC score: {:.3f}".format(roc_auc_score(y_test, y_hat_test)))

from sklearn import tree
import graphviz

# Render the depth-limited tree for inspection.
dot_data = tree.export_graphviz(dt, feature_names=X.columns, filled=True,
                                class_names=True, out_file=None)
graph = graphviz.Source(dot_data)
graph