商务案例: 在线用户评论分析¶
1. 数据集介绍¶
Yelp数据集整理自官方公开 (https://www.yelp.com/dataset) 的商户、点评和用户数据。本实验使用所有位于多伦多的餐馆截至2017年7月的评论数据(review_res.txt)开展文本挖掘,数据字段及具体含义如下:
- user_id: 用户ID
- business_id: 商户ID
- date: 用户评论日期
- text: 用户评论内容
- stars: 用户评分星级,1星到5星
In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")
3.1. 载入预处理后的数据¶
In [2]:
# Load the preprocessed review corpus and report the sentiment class balance.
reviews = pd.read_csv("./data_preprocessed.csv")
n_reviews = reviews.shape[0]
print("# of reviews: ", n_reviews)
# Per-class row counts (sentiment 0 = negative, 1 = positive per the outputs below).
reviews_senti_count = reviews.groupby("sentiment").count()
print(reviews_senti_count)
reviews.head()
# of reviews: 276883
review
sentiment
0 107728
1 169155
Out[2]:
| review | sentiment | |
|---|---|---|
| 0 | leave table feel ambivalent meh others would p... | 0 |
| 1 | time worth stick magical number three okay let... | 0 |
| 2 | love place boyfriend celebrate anniversary gla... | 1 |
| 3 | lovely even last night table great selection w... | 1 |
| 4 | table classic bistro good menu wine list cockt... | 0 |
3.2. 划分训练集与测试集¶
In [3]:
# Feature list X: the raw review strings; label vector y: the sentiment column.
# A plain column read via .tolist() replaces the original row-by-row
# iterrows() append loop, which is needlessly slow for extracting one column
# and produces the identical list in the identical order.
X = reviews["review"].tolist()
y = reviews["sentiment"]
from sklearn.model_selection import train_test_split
# Splitting into train and test sets: hold out 20%; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 628)
print(len(X_train), len(X_test))
221506 55377
In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term-frequency features: the vocabulary is learned on the
# training split only, then reused to encode the test split so both share
# one feature space.
tf_vec = CountVectorizer()
X_train_tf = tf_vec.fit_transform(X_train)
X_test_tf = tf_vec.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape)
(221506, 132590) (55377, 132590)
In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline classifier: logistic regression with default hyperparameters,
# trained on the term-frequency features.
logreg = LogisticRegression()
logreg.fit(X_train_tf, y_train)

# Evaluate on the held-out test split.
y_hat_test = logreg.predict(X_test_tf)
test_accuracy = round(logreg.score(X_test_tf, y_test), 5)
print("Logistic regression score for test set:", test_accuracy)
print("\n Classification report:")
print(classification_report(y_test, y_hat_test))
Logistic regression score for test set: 0.85136
Classification report:
precision recall f1-score support
0 0.83 0.77 0.80 21508
1 0.86 0.90 0.88 33869
accuracy 0.85 55377
macro avg 0.85 0.84 0.84 55377
weighted avg 0.85 0.85 0.85 55377
4.1.2. TF-IDF向量 + 逻辑回归¶
In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: same fit-on-train / transform-test protocol as the
# plain term-frequency encoding above, but with IDF re-weighting.
tfidf_vec = TfidfVectorizer()
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)
print(X_train_tfidf.shape, X_test_tfidf.shape)
(221506, 132590) (55377, 132590)
In [7]:
# Same default logistic regression, now on the TF-IDF features.
logreg = LogisticRegression()
logreg.fit(X_train_tfidf, y_train)

# Predict on the test split and report accuracy plus per-class metrics.
y_hat_test = logreg.predict(X_test_tfidf)
test_accuracy = round(logreg.score(X_test_tfidf, y_test), 5)
print("Logistic regression score for test set:", test_accuracy)
print("\nClassification report:")
print(classification_report(y_test, y_hat_test))
Logistic regression score for test set: 0.85953
Classification report:
precision recall f1-score support
0 0.84 0.79 0.81 21508
1 0.87 0.91 0.89 33869
accuracy 0.86 55377
macro avg 0.86 0.85 0.85 55377
weighted avg 0.86 0.86 0.86 55377
4.2. 评论主题提取 - 主题模型¶
In [8]:
# Distribution of review lengths (in whitespace-split tokens) over the
# whole corpus. A comprehension replaces the original append loop — same
# result, idiomatic and faster.
doc_len = [len(review.split(' ')) for review in X]
plt.hist(doc_len, bins = 20)
plt.show()
应用Latent Dirichlet Allocation模型¶
In [9]:
from gensim import corpora, models

# Tokenise on single spaces (the reviews were already cleaned upstream).
X_train_s = [text.split(' ') for text in X_train]
X_test_s = [text.split(' ') for text in X_test]

# Build the word<->id dictionary on the training split and encode both
# splits as bag-of-words corpora.
dictionary = corpora.Dictionary(X_train_s)
corpus_train = [dictionary.doc2bow(text) for text in X_train_s]
corpus_test = [dictionary.doc2bow(text) for text in X_test_s]

# Fit a 50-topic LDA model; small alpha/eta give sparse document-topic
# and topic-word distributions.
topic_n = 50
lda = models.LdaModel(corpus=corpus_train, id2word=dictionary,
                      num_topics=topic_n, passes=10, alpha=0.1, eta=0.01)

# Print ten of the learned topics.
topic_list = lda.print_topics(10)
for topic in topic_list:
    print(topic)
(13, '0.103*"box" + 0.085*"middle" + 0.045*"par" + 0.040*"gourmet" + 0.038*"signature" + 0.037*"falafel" + 0.037*"bento" + 0.030*"car" + 0.028*"turkey" + 0.027*"aroma"') (21, '0.107*"lobster" + 0.097*"seafood" + 0.088*"wrap" + 0.059*"crab" + 0.043*"market" + 0.039*"shrimp" + 0.037*"mussel" + 0.021*"clam" + 0.020*"kensington" + 0.019*"plastic"') (44, '0.161*"free" + 0.089*"vegan" + 0.079*"husband" + 0.063*"option" + 0.061*"juice" + 0.034*"gluten" + 0.027*"ate" + 0.025*"load" + 0.024*"veg" + 0.023*"muffin"') (10, '0.034*"tea" + 0.032*"cream" + 0.028*"sweet" + 0.028*"dessert" + 0.025*"ice" + 0.024*"chocolate" + 0.018*"flavour" + 0.017*"like" + 0.016*"taste" + 0.014*"try"') (3, '0.038*"order" + 0.026*"ask" + 0.026*"come" + 0.021*"server" + 0.017*"service" + 0.017*"time" + 0.016*"get" + 0.016*"food" + 0.014*"say" + 0.014*"one"') (33, '0.036*"steak" + 0.025*"main" + 0.022*"bread" + 0.020*"dish" + 0.020*"good" + 0.019*"cook" + 0.018*"appetizer" + 0.018*"dessert" + 0.018*"sauce" + 0.017*"mushroom"') (43, '0.108*"cheese" + 0.055*"mac" + 0.054*"view" + 0.032*"khao" + 0.031*"sport" + 0.029*"road" + 0.021*"san" + 0.018*"sprout" + 0.016*"tower" + 0.016*"squash"') (47, '0.055*"time" + 0.050*"best" + 0.045*"place" + 0.037*"try" + 0.032*"toronto" + 0.031*"love" + 0.030*"one" + 0.021*"ever" + 0.021*"always" + 0.018*"great"') (38, '0.116*"chef" + 0.043*"bone" + 0.041*"fall" + 0.038*"north" + 0.029*"apart" + 0.026*"york" + 0.023*"win" + 0.021*"gras" + 0.019*"foie" + 0.018*"staple"') (8, '0.085*"drink" + 0.066*"night" + 0.037*"place" + 0.035*"bar" + 0.031*"great" + 0.027*"friend" + 0.027*"music" + 0.019*"cool" + 0.018*"cocktail" + 0.018*"fun"')
LDA主题可视化¶
In [12]:
#! pip install WordCloud
from wordcloud import WordCloud

# Render all 50 LDA topics as word clouds on a 5x10 grid.
cloud = WordCloud(background_color = 'white', width = 200, height = 150, max_words = 20, color_func = lambda *args, **kwargs: 'black')
topics = lda.show_topics(formatted = False, num_topics = 50)
fig, axes = plt.subplots(5, 10, figsize = (37.5, 20), sharex = True, sharey = True)
for i, ax in enumerate(axes.flatten()):
    # Draw on each Axes directly. The original called fig.add_subplot(ax)
    # (passing an existing Axes), which is deprecated and removed in modern
    # Matplotlib, and then drew via plt.gca() — relying on that side effect.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size = 64)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict = dict(size=12))
    ax.axis('off')
plt.subplots_adjust(wspace = 0, hspace = 0)
plt.margins(x = 0, y = 0)
plt.tight_layout()
plt.show()
In [13]:
# load pre-trained English word vectors
# https://fasttext.cc/docs/en/english-vectors.html
# Corpus vocabulary as a real set (the original used a dict with dummy
# 1-values as a makeshift set; a set gives the same O(1) membership test).
voca = set()
for review in X:
    voca.update(review.split(' '))
print("# of words", len(voca))

# Keep only pretrained vectors for words that actually occur in the corpus.
# `with` guarantees the file is closed even on error, and iterating the
# file object line-by-line replaces the manual readline()/break loop.
data = {}
with open("./wiki-news-300d-1M.vec", "r") as file:
    for line in file:
        segs = line.split(" ")
        if segs[0] in voca:
            data[segs[0]] = np.array([float(seg) for seg in segs[1:]])
def text2vec(text):
    """Return the 300-d mean of the pretrained vectors of the words in *text*.

    Words absent from the module-level `data` lookup are skipped; when
    *text* is empty or no word is covered, a zero vector is returned.
    """
    acc = np.zeros(300)
    if len(text) == 0:
        return acc
    matched = 0
    for token in text.split(' '):
        vec = data.get(token)
        if vec is not None:
            acc += vec
            matched += 1
    # Average over the matched words only; zeros if nothing matched.
    return acc / matched if matched else acc
# Embed every train/test review as the average of its word vectors.
X_train_word2vec = np.array(list(map(text2vec, X_train)))
X_test_word2vec = np.array(list(map(text2vec, X_test)))
print(X_train_word2vec.shape, X_test_word2vec.shape)
# of words 150544 (221506, 300) (55377, 300)
In [14]:
# Same default logistic regression, now on the averaged word-vector features.
logreg = LogisticRegression()
logreg.fit(X_train_word2vec, y_train)

# Predict on the test split and report accuracy plus per-class metrics.
y_hat_test = logreg.predict(X_test_word2vec)
test_accuracy = round(logreg.score(X_test_word2vec, y_test), 5)
print("Logistic regression score for test set:", test_accuracy)
print("\nClassification report:")
print(classification_report(y_test, y_hat_test))
Logistic regression score for test set: 0.81991
Classification report:
precision recall f1-score support
0 0.79 0.73 0.76 21508
1 0.84 0.87 0.86 33869
accuracy 0.82 55377
macro avg 0.81 0.80 0.81 55377
weighted avg 0.82 0.82 0.82 55377
In [ ]: