Amazon phone reviews sentiment analysis

In [47]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from time import time
%matplotlib inline
In [48]:
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
df.shape
Out[48]:
(413840, 6)
In [49]:
pos = df[df['Rating']==5][['Rating','Reviews']].copy()
pos['Reviews'] = pos['Reviews'].fillna('')  # assign back instead of inplace-on-a-slice
pos.shape
Out[49]:
(223605, 2)
In [50]:
neg = df[df['Rating']<4][['Rating','Reviews']].copy()
neg['Reviews'] = neg['Reviews'].fillna('')

neg.shape
Out[50]:
(128843, 2)

customizing stop words

Keep negation words! Dropping words like "not", "no", or "never" would flip the meaning of a negative review, so the custom stop list has to leave them in.
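A minimal sketch of that filtering, assuming stop.txt holds one word per line (the NEGATION set is illustrative, not the author's list):

In [ ]:
# hypothetical: drop negation words from a generic stop list before using it
NEGATION = {'not', 'no', 'never', 'nor', 'neither', 'cannot'}

with open('stop.txt') as f:
    raw_stop = [line.strip() for line in f if line.strip()]

stop_list_safe = [w for w in raw_stop if w not in NEGATION]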

In [51]:
stop = pd.read_fwf('stop.txt',names=['words'])
In [52]:
stop_list = list(stop['words'].values)
# add corpus-specific noise tokens to the generic stop list
stop_list = stop_list + ['lot','phone','wa','im','ha','doe','able','unlocked','buy']

why HashingVectorizer

HashingVectorizer hashes each token (or n-gram) straight into a fixed-width feature space (2**20 columns by default), so it never builds or stores a vocabulary: memory stays flat and transform is fast on 350k reviews. The trade-off is that the mapping is one-way (a column cannot be traced back to a word) and rare hash collisions can merge two n-grams.

n-gram

ngram_range=(1, 3) keeps unigrams, bigrams, and trigrams, so phrases such as "not good" survive as single features instead of being shredded into isolated words.
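A tiny illustration of both points (toy sentences, not from the dataset):

In [ ]:
from sklearn.feature_extraction.text import HashingVectorizer

demo = HashingVectorizer(ngram_range=(1, 3), norm=None)
X = demo.transform(['not good', 'good'])  # stateless: no fit, no stored vocabulary
print(X.shape)   # (2, 1048576): width is fixed regardless of corpus size
print(X[0].nnz)  # 3 features: 'not', 'good', and the bigram 'not good'
print(X[1].nnz)  # 1 feature: 'good'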

In [53]:
# TfidfVectorizer?  (help lookup; needs the import in the next cell first)
In [54]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer

token = HashingVectorizer(ngram_range=(1, 3), stop_words=stop_list)
token
Out[54]:
HashingVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, n_features=1048576, ngram_range=(1, 3),
         non_negative=False, norm='l2', preprocessor=None,
         stop_words=['a', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'an', 'and', 'another', 'any', 'anybody', 'anyone', 'anything', 'anywhere', 'are', 'area', 'areas', 'around', 'as', 'ask', 'asked', 'asking',... 'yourself', 'yourselves', 'lot', 'phone', 'wa', 'im', 'ha', 'doe', 'able', 'unlocked', 'buy'],
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)
In [55]:
start = time()

# pd.concat replaces the deprecated Series.append; hashing makes fit_transform a pure transform
all_token = token.fit_transform(pd.concat([pos['Reviews'], neg['Reviews']]))

print ('used: {:.2f}s'.format(time()-start))
used: 16.12s
In [56]:
all_token
Out[56]:
<352448x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 13597189 stored elements in Compressed Sparse Row format>
In [57]:
pos['Rating'] = 'good'  # relabel 5-star reviews
neg['Rating'] = 'bad'   # relabel 1-3 star reviews
In [58]:
prep = pd.concat([pos, neg])  # same row order as all_token above
prep.shape
Out[58]:
(352448, 2)

modeling

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(all_token, prep['Rating'], random_state = 777, test_size=.25)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[59]:
((264336, 1048576), (88112, 1048576), (264336,), (88112,))
In [60]:
from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
In [66]:
# SGDClassifier?
In [67]:
# OneVsRestClassifier is redundant for a binary good/bad target, but harmless
model = OneVsRestClassifier(SGDClassifier())
model
Out[67]:
OneVsRestClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
          n_jobs=1)
In [68]:
start = time()

model.fit(X_train, y_train)

print ('used: {:.2f}s'.format(time()-start))
used: 2.01s
In [69]:
model.score(X_test, y_test)
Out[69]:
0.89473624477937175
In [70]:
test_prediction = model.predict(X_test)
In [71]:
test_prediction
Out[71]:
array(['good', 'bad', 'good', ..., 'bad', 'good', 'good'], 
      dtype='<U4')
In [72]:
y_test.head()
Out[72]:
105881    good
18317      bad
88844     good
109977    good
350220    good
Name: Rating, dtype: object

trying out other models

LogisticRegression

In [73]:
model = OneVsRestClassifier(LogisticRegression(verbose = 1))

start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
model.score(X_test, y_test)
[LibLinear]used: 13.67s
accuracy
Out[73]:
0.93882785545669145

PassiveAggressiveClassifier

In [74]:
model = OneVsRestClassifier(PassiveAggressiveClassifier(n_iter=10))

start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
model.score(X_test, y_test)
used: 2.56s
accuracy
Out[74]:
0.96305838024332668

Testing K-fold cross-validation

In [75]:
# PassiveAggressiveClassifier?
In [76]:
start = time()

model0 = PassiveAggressiveClassifier(n_iter=1)

cv_result = cross_val_score(model0, all_token, prep['Rating'], cv=10)

print ('used: {:.2f}s'.format(time()-start))
print(cv_result)
print ('min  is %.4f'%cv_result.min())
print ('max  is %.4f'%cv_result.max())
print ('mean is %.4f'%cv_result.mean())
print ()
used: 9.70s
[ 0.93573739  0.9327867   0.91357885  0.91204426  0.91692439  0.91354557
  0.88979684  0.91703552  0.92580297  0.90324594]
min  is 0.8898
max  is 0.9357
mean is 0.9160

LinearSVC

In [91]:
model = LinearSVC(verbose = 1)

start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy (test, then train)')
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))
[LibLinear]used: 7.09s
accuracy (test, then train)
0.962229889232
0.987005175232

use full data

Retrain on all 352k reviews so the saved model sees every example.

In [92]:
model99 = LinearSVC(verbose = 1)

start = time()
model99.fit(all_token, prep['Rating'])
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy on train dataset')
model99.score(all_token, prep['Rating'])
[LibLinear]used: 15.76s
accuracy on train dataset
Out[92]:
0.98699666333757041
In [93]:
# caveat: X_test rows were part of the full training data, so this is not a held-out score
print(model99.score(X_test, y_test))
0.986630651898
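If a genuinely held-out number is wanted for the full-data model, a fresh split has to be carved out before fitting; a minimal sketch (variable names are hypothetical):

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# keep 10% that the final model never sees
X_fit, X_hold, y_fit, y_hold = train_test_split(
    all_token, prep['Rating'], random_state=777, test_size=0.10)

final_model = LinearSVC()
final_model.fit(X_fit, y_fit)
print(final_model.score(X_hold, y_hold))  # honest generalization estimate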

save model

In [82]:
from sklearn.externals import joblib
# joblib?
In [83]:
# protocol=2 keeps the pickles loadable from Python 2 as well
joblib.dump(token, 'token.pkl', protocol=2)
joblib.dump(model99, 'token_SVM.pkl', protocol=2)
Out[83]:
['token_SVM.pkl']
In [83]:
print(classification_report(y_test, model.predict(X_test), target_names=['Negative','Positive']))
             precision    recall  f1-score   support

   Negative       0.96      0.91      0.93     32170
   Positive       0.95      0.98      0.96     55942

avg / total       0.95      0.95      0.95     88112

In [78]:
print(confusion_matrix(y_test, model.predict(X_test)))
[[29888  2282]
 [ 1046 54896]]
In [79]:
# plot_confusion_matrix is the helper defined at the bottom of this notebook
plot_confusion_matrix(confusion_matrix(y_test, model.predict(X_test)), 
                      classes=['Negative','Positive'],
                      title='Confusion matrix, without normalization')
Confusion matrix, without normalization
[[29888  2282]
 [ 1046 54896]]

SVM never stops

Kernel SVC does not scale to 352k samples with a million sparse features; training effectively never finishes. The same goes for trees, forests, and the boosting/bagging ensembles imported above.

  • dimensionality reduction should come first: TruncatedSVD (the PCA variant for sparse input) or SelectKBest, as sketched after the commented-out cell below
In [38]:
# model = OneVsRestClassifier(SVC(verbose = 1))

# start = time()
# model.fit(X_train, y_train)
# print ('used: {:.2f}s'.format(time()-start))
# print ('accuracy')
# model.score(X_test, y_test)
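A minimal sketch of the reduce-then-fit idea from the bullet above (the component count and K are illustrative, not tuned):

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

# option 1: project the hashed features onto a few hundred dense components
# (TruncatedSVD is the PCA variant that accepts sparse input)
svd_svc = make_pipeline(TruncatedSVD(n_components=200), SVC())

# option 2: keep only the K columns most associated with the label
# (f_classif tolerates signed values; chi2 would require non-negative features)
kbest_svc = make_pipeline(SelectKBest(f_classif, k=20000), SVC())

# either pipeline is then fit and scored like the models above:
# svd_svc.fit(X_train, y_train); svd_svc.score(X_test, y_test)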

RidgeClassifier

In [39]:
model = OneVsRestClassifier(RidgeClassifier())

start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
model.score(X_test, y_test)
used: 15.60s
accuracy
Out[39]:
0.94840657345197021

predicting new comments

In [108]:
def pred_new(text):
    # the hashing vectorizer is stateless, so raw text transforms directly
    vec = token.transform([text])
    result = model99.predict(vec)
    return 'Negative' if result[0] == 'bad' else 'Positive'
In [109]:
pred_new('I have a pineapple, sucks')
Out[109]:
'Negative'
In [110]:
pred_new('煎饼果子来一套, I love')  # mixed-language input: "煎饼果子来一套" means "one jianbing guozi (savory crepe), please"
Out[110]:
'Positive'
In [111]:
pred_new('bad')
Out[111]:
'Negative'

load model

In [112]:
new_model = joblib.load('token_SVM.pkl')
new_token = joblib.load('token.pkl')
In [113]:
def pred2_new(text):
    # same as pred_new, but with the reloaded vectorizer and model
    vec = new_token.transform([text])
    result = new_model.predict(vec)
    return 'Negative' if result[0] == 'bad' else 'Positive'
In [114]:
pred2_new('bad')
Out[114]:
'Negative'
In [116]:
pred2_new('yo whatever')
Out[116]:
'Positive'
In [9]:
# graphing helper, adapted from the scikit-learn documentation example
import numpy as np
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
