In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Load the tab-separated restaurant reviews dataset (columns: Review, Liked).
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df = pd.read_csv('/Users/spark/Downloads/Restaurant_Reviews.tsv',sep='\t')
In [3]:
# Preview the first rows to confirm the file parsed correctly.
df.head()
Out[3]:
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Review | Liked | |
---|---|---|
0 | Wow... Loved this place. | 1 |
1 | Crust is not good. | 0 |
2 | Not tasty and the texture was just nasty. | 0 |
3 | Stopped by during the late May bank holiday of... | 1 |
4 | The selection on the menu was great and so wer... | 1 |
In [4]:
# Summary statistics — per the output below, Liked is a perfectly balanced
# 0/1 label over 1000 rows (mean 0.5).
df.describe()
Out[4]:
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Liked | |
---|---|
count | 1000.00000 |
mean | 0.50000 |
std | 0.50025 |
min | 0.00000 |
25% | 0.00000 |
50% | 0.50000 |
75% | 1.00000 |
max | 1.00000 |
In [5]:
# Column dtypes: Review is text (object), Liked is an integer label.
df.dtypes
Out[5]:
Review object
Liked int64
dtype: object
In [6]:
# df['text_length'] = df.Review.map(len)
# Word count per review. Use split() with no argument so runs of whitespace
# do not yield empty tokens — split(' ') over-counts reviews containing
# double spaces or leading/trailing spaces.
df['word_length'] = df.Review.map(lambda x: len(x.split()))
In [7]:
# Correlation between the numeric columns only. numeric_only=True makes the
# intent explicit: pandas >= 2.0 raises on the non-numeric 'Review' column
# unless it is excluded (requires pandas >= 1.5 for this parameter).
df.corr(numeric_only=True)
Out[7]:
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Liked | word_length | |
---|---|---|
Liked | 1.000000 | -0.096573 |
word_length | -0.096573 | 1.000000 |
In [8]:
# Distribution of review word counts, faceted by sentiment label.
word_count_grid = sns.FacetGrid(data=df, col='Liked')
word_count_grid.map(plt.hist, 'word_length', bins=50)
Out[8]:
<seaborn.axisgrid.FacetGrid at 0x10e6e0d30>
In [9]:
# Box plot of word counts per class — the two distributions look similar,
# matching the weak correlation seen above.
sns.boxplot(x='Liked', y='word_length', data=df)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1108c8278>
可以看出,是否喜欢和文字长度几乎没有相关性(相关系数约为 -0.10,非常弱)
In [10]:
# NLP tooling: NLTK stopword list + scikit-learn bag-of-words vectorizer.
# NOTE(review): imports scattered mid-notebook — consider consolidating them
# into the top import cell; keep only the one-time nltk.download() here.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
[nltk_data] Downloading package stopwords to /Users/spark/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
In [11]:
import string

# Built once at definition time instead of on every token:
# stopwords.words('english') returns a fresh list each call, and membership
# tests against a list are O(m) per word — a set lookup is O(1).
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_STOPWORDS = set(stopwords.words('english'))

def text_process(text):
    '''
    Tokenize a review for the bag-of-words model.

    Processing steps:
    1. Strip all punctuation characters.
    2. Split on whitespace.
    3. Drop English stop words (matched case-insensitively).

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Remaining tokens, original casing preserved.
    '''
    no_punct = text.translate(_PUNCT_TABLE)
    return [word for word in no_punct.split() if word.lower() not in _STOPWORDS]
In [12]:
X = df.Review  # raw review text
y = df.Liked   # binary sentiment label (1 = liked)
# Learn the vocabulary using the custom tokenizer, then encode every review
# as a sparse bag-of-words count vector.
# NOTE(review): X is rebound from a Series to a sparse matrix here —
# a distinct name (e.g. X_bow) would avoid stale-state confusion on re-runs.
bow_transformer = CountVectorizer(analyzer=text_process).fit(X)
X = bow_transformer.transform(X)
In [13]:
from sklearn.model_selection import train_test_split
# 70/30 train/test split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
In [14]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes — a standard baseline classifier for
# word-count features.
nb = MultinomialNB()
nb.fit(X_train, y_train)
Out[14]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [15]:
# Predict labels for the held-out test set (evaluated below).
preds = nb.predict(X_test)
In [16]:
# Sanity check: run a hand-written negative-sounding review through the
# fitted vectorizer + model; expect a negative (0) prediction.
sample_review = 'room is bad'
sample_vector = bow_transformer.transform([sample_review])
nb.predict(sample_vector)[0]
Out[16]:
0
In [17]:
# Another probe: per the output below, the model predicts 0 (negative)
# for a review mentioning 'expensive'.
my_test_review = 'room is expensive'
my_test_review_transformed = bow_transformer.transform([my_test_review])
nb.predict(my_test_review_transformed)[0]
Out[17]:
0
In [18]:
# NOTE(review): 'suprise' is a typo for 'surprise' — the misspelled token is
# likely absent from the fitted vocabulary, so the count vector may be
# (nearly) empty and the prediction would then reflect the class prior
# rather than the word's sentiment. Confirm whether the typo is intentional.
my_test_review = 'suprise me'
my_test_review_transformed = bow_transformer.transform([my_test_review])
nb.predict(my_test_review_transformed)[0]
Out[18]:
0
In [19]:
# A single strong positive word — per the output below, predicted 1 (positive).
my_test_review = 'amazing'
my_test_review_transformed = bow_transformer.transform([my_test_review])
nb.predict(my_test_review_transformed)[0]
Out[19]:
1
准确率约为 73%((96+123)/300 = 0.73,见下方混淆矩阵)
In [20]:
from sklearn.metrics import confusion_matrix, classification_report
# Confusion matrix: rows = true label, columns = predicted label.
print(confusion_matrix(y_test, preds))
print('\n')
# Per-class precision / recall / F1 plus averaged scores.
print(classification_report(y_test, preds))
[[ 96 54]
[ 27 123]]
precision recall f1-score support
0 0.78 0.64 0.70 150
1 0.69 0.82 0.75 150
micro avg 0.73 0.73 0.73 300
macro avg 0.74 0.73 0.73 300
weighted avg 0.74 0.73 0.73 300