Amazon phone reviews sentiment analysis

In [47]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from time import time
%matplotlib inline
In [48]:
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
df.shape
Out[48]:
(413840, 6)
In [49]:
pos = df[df['Rating']==5][['Rating','Reviews']].copy()
pos['Reviews'] = pos['Reviews'].fillna('')  # assign back instead of inplace-on-a-slice
pos.shape
Out[49]:
(223605, 2)
In [50]:
neg = df[df['Rating']<4][['Rating','Reviews']].copy()
neg['Reviews'] = neg['Reviews'].fillna('')

neg.shape
Out[50]:
(128843, 2)

customizing stop words

Keep negation words! Dropping words like "not", "no", or "never" would flip the meaning of a negative review, so the custom stop list has to leave them in.
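A minimal sketch of that filtering, assuming stop.txt holds one word per line (the NEGATION set is illustrative, not the author's list):

In [ ]:
# hypothetical: drop negation words from a generic stop list before using it
NEGATION = {'not', 'no', 'never', 'nor', 'neither', 'cannot'}

with open('stop.txt') as f:
    raw_stop = [line.strip() for line in f if line.strip()]

stop_list_safe = [w for w in raw_stop if w not in NEGATION]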

In [51]:
stop = pd.read_fwf('stop.txt',names=['words'])
In [52]:
stop_list = list(stop['words'].values)
# add corpus-specific noise tokens to the generic stop list
stop_list = stop_list + ['lot','phone','wa','im','ha','doe','able','unlocked','buy']

why HashingVectorizer

HashingVectorizer hashes each token (or n-gram) straight into a fixed-width feature space (2**20 columns by default), so it never builds or stores a vocabulary: memory stays flat and transform is fast on 350k reviews. The trade-off is that the mapping is one-way (a column cannot be traced back to a word) and rare hash collisions can merge two n-grams.

n-gram

ngram_range=(1, 3) keeps unigrams, bigrams, and trigrams, so phrases such as "not good" survive as single features instead of being shredded into isolated words.
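A tiny illustration of both points (toy sentences, not from the dataset):

In [ ]:
from sklearn.feature_extraction.text import HashingVectorizer

demo = HashingVectorizer(ngram_range=(1, 3), norm=None)
X = demo.transform(['not good', 'good'])  # stateless: no fit, no stored vocabulary
print(X.shape)   # (2, 1048576): width is fixed regardless of corpus size
print(X[0].nnz)  # 3 features: 'not', 'good', and the bigram 'not good'
print(X[1].nnz)  # 1 feature: 'good'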

In [53]:
# TfidfVectorizer?  (help lookup; needs the import in the next cell first)
In [54]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer

token = HashingVectorizer(ngram_range=(1, 3), stop_words=stop_list)
token
Out[54]:
HashingVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, n_features=1048576, ngram_range=(1, 3),
         non_negative=False, norm='l2', preprocessor=None,
         stop_words=['a', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'an', 'and', 'another', 'any', 'anybody', 'anyone', 'anything', 'anywhere', 'are', 'area', 'areas', 'around', 'as', 'ask', 'asked', 'asking',... 'yourself', 'yourselves', 'lot', 'phone', 'wa', 'im', 'ha', 'doe', 'able', 'unlocked', 'buy'],
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)
In [55]:
start = time()

# pd.concat replaces the deprecated Series.append; hashing makes fit_transform a pure transform
all_token = token.fit_transform(pd.concat([pos['Reviews'], neg['Reviews']]))

print ('used: {:.2f}s'.format(time()-start))
used: 16.12s
In [56]:
all_token
Out[56]:
<352448x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 13597189 stored elements in Compressed Sparse Row format>
In [57]:
pos['Rating'] = 'good'  # relabel 5-star reviews
neg['Rating'] = 'bad'   # relabel 1-3 star reviews
In [58]:
prep = pd.concat([pos, neg])  # same row order as all_token above
prep.shape
Out[58]:
(352448, 2)

modeling

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(all_token, prep['Rating'], random_state = 777, test_size=.25)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[59]:
((264336, 1048576), (88112, 1048576), (264336,), (88112,))
In [60]:
from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
In [66]:
# SGDClassifier?
In [67]:
# OneVsRestClassifier is redundant for a binary good/bad target, but harmless
model = OneVsRestClassifier(SGDClassifier())
model
Out[67]:
OneVsRestClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
          n_jobs=1)
In [68]:
start = time()

model.fit(X_train, y_train)

print ('used: {:.2f}s'.format(time()-start))
used: 2.01s
In [69]:
model.score(X_test, y_test)
Out[69]:
0.89473624477937175
In [70]:
test_prediction = model.predict(X_test)
In [71]:
test_prediction
Out[71]:
array(['good', 'bad', 'good', ..., 'bad', 'good', 'good'], 
      dtype='<U4')
In [72]:
y_test.head()
Out[72]:
105881    good
18317      bad
88844     good
109977    good
350220    good
Name: Rating, dtype: object

trying out other models

LogisticRegression

In [73]:
model = OneVsRestClassifier(LogisticRegression(verbose = 1))

start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
model.score(X_test, y_test)
[LibLinear]used: 13.67s
accuracy
Out[73]:
0.93882785545669145

PassiveAggressiveClassifier

In [74]:
model = OneVsRestClassifier(PassiveAggressiveClassifier(n_iter=10))

start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
model.score(X_test, y_test)
used: 2.56s
accuracy
Out[74]:
0.96305838024332668

Testing K-fold cross-validation

In [75]:
# PassiveAggressiveClassifier?
In [76]:
start = time()

model0 = PassiveAggressiveClassifier(n_iter=1)

cv_result = cross_val_score(model0, all_token, prep['Rating'], cv=10)

print ('used: {:.2f}s'.format(time()-start))
print(cv_result)
print ('min  is %.4f'%cv_result.min())
print ('max  is %.4f'%cv_result.max())
print ('mean is %.4f'%cv_result.mean())
print ()
used: 9.70s
[ 0.93573739  0.9327867   0.91357885  0.91204426  0.91692439  0.91354557
  0.88979684  0.91703552  0.92580297  0.90324594]
min  is 0.8898
max  is 0.9357
mean is 0.9160

LinearSVC

In [91]:
model = LinearSVC(verbose = 1)

start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy (test, then train)')
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))
[LibLinear]used: 7.09s
accuracy (test, then train)
0.962229889232
0.987005175232

use full data

Retrain on all 352k reviews so the saved model sees every example.

In [92]:
model99 = LinearSVC(verbose = 1)

start = time()
model99.fit(all_token, prep['Rating'])
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy on train dataset')
model99.score(all_token, prep['Rating'])
[LibLinear]used: 15.76s
accuracy on train dataset
Out[92]:
0.98699666333757041
In [93]:
# caveat: X_test rows were part of the full training data, so this is not a held-out score
print(model99.score(X_test, y_test))
0.986630651898
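If a genuinely held-out number is wanted for the full-data model, a fresh split has to be carved out before fitting; a minimal sketch (variable names are hypothetical):

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# keep 10% that the final model never sees
X_fit, X_hold, y_fit, y_hold = train_test_split(
    all_token, prep['Rating'], random_state=777, test_size=0.10)

final_model = LinearSVC()
final_model.fit(X_fit, y_fit)
print(final_model.score(X_hold, y_hold))  # honest generalization estimate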

save model

In [82]:
from sklearn.externals import joblib
# joblib?
In [83]:
# protocol=2 keeps the pickles loadable from Python 2 as well
joblib.dump(token, 'token.pkl', protocol=2)
joblib.dump(model99, 'token_SVM.pkl', protocol=2)
Out[83]:
['token_SVM.pkl']
In [83]:
print(classification_report(y_test, model.predict(X_test), target_names=['Negative','Positive']))
             precision    recall  f1-score   support

   Negative       0.96      0.91      0.93     32170
   Positive       0.95      0.98      0.96     55942

avg / total       0.95      0.95      0.95     88112

In [78]:
print(confusion_matrix(y_test, model.predict(X_test)))
[[29888  2282]
 [ 1046 54896]]
In [79]:
# plot_confusion_matrix is the helper defined at the bottom of this notebook
plot_confusion_matrix(confusion_matrix(y_test, model.predict(X_test)), 
                      classes=['Negative','Positive'],
                      title='Confusion matrix, without normalization')
Confusion matrix, without normalization
[[29888  2282]
 [ 1046 54896]]

SVM never stops

Kernel SVC does not scale to 352k samples with a million sparse features; training effectively never finishes. The same goes for trees, forests, and the boosting/bagging ensembles imported above.

  • dimensionality reduction should come first: TruncatedSVD (the PCA variant for sparse input) or SelectKBest, as sketched after the commented-out cell below
In [38]:
# model = OneVsRestClassifier(SVC(verbose = 1))

# start = time()
# model.fit(X_train, y_train)
# print ('used: {:.2f}s'.format(time()-start))
# print ('accuracy')
# model.score(X_test, y_test)
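A minimal sketch of the reduce-then-fit idea from the bullet above (the component count and K are illustrative, not tuned):

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

# option 1: project the hashed features onto a few hundred dense components
# (TruncatedSVD is the PCA variant that accepts sparse input)
svd_svc = make_pipeline(TruncatedSVD(n_components=200), SVC())

# option 2: keep only the K columns most associated with the label
# (f_classif tolerates signed values; chi2 would require non-negative features)
kbest_svc = make_pipeline(SelectKBest(f_classif, k=20000), SVC())

# either pipeline is then fit and scored like the models above:
# svd_svc.fit(X_train, y_train); svd_svc.score(X_test, y_test)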

RidgeClassifier

In [39]:
model = OneVsRestClassifier(RidgeClassifier())

start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
model.score(X_test, y_test)
used: 15.60s
accuracy
Out[39]:
0.94840657345197021

predicting new comments

In [108]:
def pred_new(text):
    # the hashing vectorizer is stateless, so raw text transforms directly
    vec = token.transform([text])
    result = model99.predict(vec)
    return 'Negative' if result[0] == 'bad' else 'Positive'
In [109]:
pred_new('I have a pineapple, sucks')
Out[109]:
'Negative'
In [110]:
pred_new('煎饼果子来一套, I love')  # mixed-language input: "煎饼果子来一套" means "one jianbing guozi (savory crepe), please"
Out[110]:
'Positive'
In [111]:
pred_new('bad')
Out[111]:
'Negative'

load model

In [112]:
new_model = joblib.load('token_SVM.pkl')
new_token = joblib.load('token.pkl')
In [113]:
def pred2_new(text):
    # same as pred_new, but with the reloaded vectorizer and model
    vec = new_token.transform([text])
    result = new_model.predict(vec)
    return 'Negative' if result[0] == 'bad' else 'Positive'
In [114]:
pred2_new('bad')
Out[114]:
'Negative'
In [116]:
pred2_new('yo whatever')
Out[116]:
'Positive'
In [9]:
# graphing helper, adapted from the scikit-learn documentation example
import numpy as np
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
