import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from time import time
%matplotlib inline
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
df.shape
pos = df[df['Rating']==5][['Rating','Reviews']].copy()
pos['Reviews'] = pos['Reviews'].fillna('')
pos.shape
neg = df[df['Rating']<4][['Rating','Reviews']].copy()
neg['Reviews'] = neg['Reviews'].fillna('')
neg.shape
stop = pd.read_fwf('stop.txt',names=['words'])
stop_list = list(stop['words'].values)
stop_list = stop_list + ['lot','phone','wa','im','ha','doe','able','unlocked','buy']
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
# TfidfVectorizer?
token = HashingVectorizer(ngram_range=(1, 3), stop_words=stop_list)
token
start = time()
all_token = token.fit_transform(pd.concat([pos['Reviews'], neg['Reviews']]))
print ('used: {:.2f}s'.format(time()-start))
all_token
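# TfidfVectorizer is imported above but never used. As a sketch (not part of the
# original pipeline), the same corpus could be vectorized with tf-idf weighting
# instead of feature hashing; this learns a vocabulary, so it is slower and heavier
# in memory, but the features stay interpretable. min_df=5 is an assumed choice.
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_list, min_df=5)
tfidf_token = tfidf.fit_transform(pd.concat([pos['Reviews'], neg['Reviews']]))
tfidf_token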
pos['Rating'] = 'good'
neg['Rating'] = 'bad'
prep = pd.concat([pos, neg])
prep.shape
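# Quick sanity check (assumed addition, not an original cell): the good/bad split is
# unlikely to be balanced, which is worth knowing before reading plain accuracy scores.
prep['Rating'].value_counts()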
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(all_token, prep['Rating'], random_state = 777, test_size=.25)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
# SGDClassifier?
model = OneVsRestClassifier(SGDClassifier())
model
start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
model.score(X_test, y_test)
test_prediction = model.predict(X_test)
test_prediction
y_test.head()
model = OneVsRestClassifier(LogisticRegression(verbose = 1))
start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
model.score(X_test, y_test)
model = OneVsRestClassifier(PassiveAggressiveClassifier(max_iter=10))
start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
model.score(X_test, y_test)
# PassiveAggressiveClassifier?
start = time()
model0 = PassiveAggressiveClassifier(max_iter=1)
cv_result = cross_val_score(model0, all_token, prep['Rating'], cv=10)
print ('used: {:.2f}s'.format(time()-start))
print(cv_result)
print ('min is %.4f'%cv_result.min())
print ('max is %.4f'%cv_result.max())
print ('mean is %.4f'%cv_result.mean())
print ()
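# Accuracy can be optimistic on an imbalanced label split. As an assumed extension of
# the cross-validation above (not in the original run), the same model can be scored
# with macro F1, which weights both classes equally.
start = time()
cv_f1 = cross_val_score(PassiveAggressiveClassifier(max_iter=10),
                        all_token, prep['Rating'], cv=5, scoring='f1_macro')
print ('used: {:.2f}s'.format(time()-start))
print ('macro F1 mean is %.4f'%cv_f1.mean())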
model = LinearSVC(verbose = 1)
start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))
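# LinearSVC is used with its default C above. A small grid search is a natural next
# step; this is a sketch (assumed values for the C grid, not an original cell).
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(LinearSVC(), param_grid={'C': [0.1, 1.0, 10.0]}, cv=3)
start = time()
grid.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print (grid.best_params_, grid.best_score_)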
model99 = LinearSVC(verbose = 1)
start = time()
model99.fit(all_token, prep['Rating'])
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy on train dataset (model99 was fit on all data)')
print(model99.score(all_token, prep['Rating']))
# X_test is contained in model99's training data here, so this score is optimistic
print(model99.score(X_test, y_test))
import joblib
# joblib?
joblib.dump(token, 'token.pkl', protocol=2)
joblib.dump(model99, 'token_SVM.pkl', protocol=2)
print(classification_report(y_test, model.predict(X_test), target_names=['Negative','Positive']))
print(confusion_matrix(y_test, model.predict(X_test)))
# plot_confusion_matrix is defined at the bottom of this notebook; run that cell first
plot_confusion_matrix(confusion_matrix(y_test, model.predict(X_test)),
                      classes=['Negative','Positive'],
                      title='Confusion matrix, without normalization')
# model = OneVsRestClassifier(SVC(verbose = 1))
# start = time()
# model.fit(X_train, y_train)
# print ('used: {:.2f}s'.format(time()-start))
# print ('accuracy')
# model.score(X_test, y_test)
model = OneVsRestClassifier(RidgeClassifier())
start = time()
model.fit(X_train, y_train)
print ('used: {:.2f}s'.format(time()-start))
print ('accuracy')
model.score(X_test, y_test)
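# Several of the classifiers imported above are never tried. As a rough, assumed
# comparison (defaults only, not an original cell), fit each on the same split and
# report test accuracy; all of these handle the sparse hashed features.
for clf in [BernoulliNB(), SGDClassifier(), RidgeClassifier(), LinearSVC()]:
    start = time()
    clf.fit(X_train, y_train)
    print ('{}: {:.4f} ({:.2f}s)'.format(clf.__class__.__name__,
                                          clf.score(X_test, y_test),
                                          time()-start))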
def pred_new(text):
    text = token.transform([text])
    result = model99.predict(text)
    # final = result[0]
    # final = 'bad' if result[0]==1 else 'good'
    final = 'Negative' if result[0]=='bad' else 'Positive'
    return final
pred_new('I have a pineapple, sucks')
pred_new('煎饼果子来一套, I love')  # mixed Chinese/English input: "a serving of jianbing guozi, I love"
pred_new('bad')
new_model = joblib.load('token_SVM.pkl')
new_token = joblib.load('token.pkl')
def pred2_new(text):
    text = new_token.transform([text])
    result = new_model.predict(text)
    # final = result[0]
    # final = 'bad' if result[0]==1 else 'good'
    final = 'Negative' if result[0]=='bad' else 'Positive'
    return final
pred2_new('bad')
pred2_new('yo whatever')
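# A batch variant of pred2_new (assumed helper, not in the original): vectorize a list
# of reviews in one call and map the raw labels to readable ones.
def pred2_batch(texts):
    results = new_model.predict(new_token.transform(texts))
    return ['Negative' if r == 'bad' else 'Positive' for r in results]
pred2_batch(['bad', 'great phone, battery lasts all day'])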
# graphing
import numpy as np
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # normalize before plotting so the image matches the printed matrix
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
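# Example use of the helper with row-normalized counts (assumed follow-up cell): each
# row then sums to 1, so the two classes stay comparable even if one dominates.
plt.figure()
plot_confusion_matrix(confusion_matrix(y_test, model.predict(X_test)),
                      classes=['Negative','Positive'],
                      normalize=True,
                      title='Normalized confusion matrix')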