import numpy as np
import pandas as pd
from time import time, ctime
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
# Dataset description: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt
# !ls ./Boltzmann_Machines/ml-100k/   (IPython shell escape -- not valid in a plain script)

# movies.dat rows look like  MovieID::Title::Genres  (three '::'-separated fields).
# NOTE: the original names list was ['MovieID, Title', 'Genres'] -- two names for
# three columns, with the first two fused into one string; fixed to three names.
movies = pd.read_csv('./Boltzmann_Machines/ml-1m/movies.dat',
                     sep='::', header=None,
                     engine='python', encoding='latin-1',
                     names=['MovieID', 'Title', 'Genres'])
print(movies.shape)
movies.head()
# users.dat rows: UserID::Gender::Age::Occupation::Zip (see ml-1m README).
users = pd.read_csv('./Boltzmann_Machines/ml-1m/users.dat',
                    sep='::', header=None,
                    engine='python', encoding='latin-1',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip'])
print(users.shape)  # was a Python 2 print statement
users.head()
# ratings.dat rows: UserID::MovieID::Rating::Timestamp (see ml-1m README).
ratings = pd.read_csv('./Boltzmann_Machines/ml-1m/ratings.dat',
                      sep='::', header=None,
                      engine='python', encoding='latin-1',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(ratings.shape)  # was a Python 2 print statement
ratings.head()
from datetime import datetime

# %%time  (IPython cell magic -- invalid in a plain script, commented out)
# Add a human-readable datetime column derived from the raw epoch timestamp.
# NOTE(review): datetime.fromtimestamp is local-timezone dependent -- confirm
# whether UTC was intended here.
ratings['Time'] = ratings['Timestamp'].apply(datetime.fromtimestamp)
datetime.fromtimestamp(int("874965758")).strftime('%Y-%m-%d %H:%M:%S')
ratings.tail()
# ml-100k split files are tab-separated: UserID, MovieID, Rating, Timestamp.
training_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.base',
                           delimiter='\t',
                           names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(training_set.shape)  # was a Python 2 print statement
training_set.head()
t_df = training_set.copy()  # keep a DataFrame copy for the pandas-based route below
training_set = np.array(training_set, dtype='int')
training_set[:2]
# Same layout as u1.base: tab-separated UserID, MovieID, Rating, Timestamp.
test_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.test',
                       delimiter='\t',
                       names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(test_set.shape)  # was a Python 2 print statement
test_set.head()
e_df = test_set.copy()  # DataFrame copy kept for sanity checks
test_set = np.array(test_set, dtype='int')
test_set[:2]
# Dimensions of the dense user-item matrix: the largest ID seen in either split.
# Use numpy's column .max() instead of Python-level max(max(...)) iteration.
nb_users = int(max(training_set[:, 0].max(), test_set[:, 0].max()))
nb_movies = int(max(training_set[:, 1].max(), test_set[:, 1].max()))
nb_users, nb_movies
# C=lambda X: (len(t_df[X].unique()), len(e_df[X].unique()))
# CC=lambda X: (t_df[X].max(), e_df[X].max())
# C('UserID')
# Count of distinct user / movie IDs across both splits (IDs can have gaps,
# so these can be smaller than nb_users / nb_movies).
len(set(np.concatenate((test_set[:, 0], training_set[:, 0])))), len(set(np.concatenate((test_set[:, 1], training_set[:, 1]))))
# How many all-zero rows the raw test array has (sanity check).
pd.DataFrame(test_set)[pd.DataFrame(test_set).sum(1) == 0].shape
def convert(data, n_users=None, n_movies=None):
    """Convert a (UserID, MovieID, Rating, ...) integer array into a dense
    per-user rating matrix.

    Parameters
    ----------
    data : np.ndarray, shape (n_ratings, >=3)
        Columns 0..2 are UserID (1-based), MovieID (1-based), Rating.
    n_users, n_movies : int, optional
        Matrix dimensions. Default to the module-level ``nb_users`` /
        ``nb_movies`` computed from the train+test splits, preserving the
        original call signature ``convert(data)``.

    Returns
    -------
    list of lists, shape (n_users, n_movies)
        Row u-1 holds user u's ratings; 0 marks "not rated".
    """
    if n_users is None:
        n_users = nb_users
    if n_movies is None:
        n_movies = nb_movies
    new_data = []
    for uid in range(1, n_users + 1):
        mask = data[:, 0] == uid          # compute the per-user mask once, not twice
        movie_ids = data[:, 1][mask]
        movie_ratings = data[:, 2][mask]
        row = np.zeros(n_movies)
        row[movie_ids - 1] = movie_ratings  # MovieIDs are 1-based
        new_data.append(list(row))
    return new_data
# %%time  (IPython cell magic -- invalid in a plain script, commented out)
# Densify both splits into user x movie rating matrices.
training_set = convert(training_set)
test_set = convert(test_set)
print(pd.DataFrame(training_set).shape)  # was a Python 2 print statement
len(training_set), len(training_set[0])
pd.DataFrame(training_set).head()
t_df.tail()
# Alternative pandas route: per-user {MovieID: Rating} dicts via groupby.
tmp = t_df.groupby('UserID').apply(lambda x: zip(list(x['MovieID']), list(x['Rating'])))
tmp.head()
tmp = tmp.apply(lambda x: {i[0]: i[1] for i in x})
tmp.head()
pd.DataFrame(tmp.tolist()).head()
from collections import defaultdict

# Pre-seed a default-0 dict with every movie ID 1..1682: merely reading a
# missing key from a defaultdict(int) materializes it with value 0.
d = defaultdict(int)
for movie_id in range(1, 1683):
    _ = d[movie_id]
# def helper(x):
#     d = defaultdict(int)
#     for i in range(1, 1683): d[i]
#     d[x['MovieID']] = x['Rating']
#     return d
# %%time  (IPython cell magic -- invalid in a plain script, commented out)
df = t_df.copy()
# df.columns=['user','movie','rating','time']
# Build per-user {MovieID: Rating} dicts, then expand to a dense frame whose
# columns cover ALL movie IDs 1..nb_movies; missing ratings become 0.
tmp = df.groupby('UserID').apply(lambda x: zip(list(x['MovieID']), list(x['Rating'])))
tmp = tmp.apply(lambda x: {i[0]: i[1] for i in x})
mapped = pd.DataFrame(tmp.tolist(), columns=range(1, nb_movies + 1)).fillna(0)
mapped.shape
mapped.head()
mapped.values.shape
type(mapped.values)
# Sanity check: the pandas route reproduces convert()'s output exactly.
np.array_equal(mapped.values, np.array(training_set))
mapped.values.tolist() == training_set
train_list = training_set[:]
training_set = train_list[:]
training_set = np.array(training_set)
training_set
# Recode ratings for the Bernoulli RBM:
#   0 (unrated) -> -1,  1-2 (disliked) -> 0,  >=3 (liked) -> 1.
# Order matters: the 0 -> -1 rule must run before 1/2 -> 0 so newly written
# zeros are not re-mapped.
training_set[training_set == 0] = -1
training_set
training_set[training_set == 1] = 0
training_set[training_set == 2] = 0
training_set[training_set >= 3] = 1
training_set
from sklearn.neural_network import BernoulliRBM
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# BernoulliRBM?  (IPython help syntax -- invalid in a plain script, commented out)
# NOTE(review): BernoulliRBM expects inputs in [0, 1]; the recoded matrix
# contains -1 for "unrated" -- confirm this is intentional.
rbm = BernoulliRBM(verbose=1, learning_rate=0.00003, n_components=1000)
rbm
rbm.fit(training_set)
rbm.transform(training_set)
training_set
def conv(x):
    """Binarize a rating matrix the same way the training set was recoded:
    0 (unrated) -> -1, ratings 1-2 (disliked) -> 0, ratings >= 3 (liked) -> 1.

    BUGFIX: the original mapped ratings 1 and 2 to 1, which disagreed with the
    training-set recode above (1 and 2 -> 0), so test data was binarized
    differently from the data the RBM was fit on.

    Parameters
    ----------
    x : array-like of non-negative integer ratings

    Returns
    -------
    np.ndarray of the same shape with values in {-1, 0, 1}
    """
    x = np.array(x)
    out = np.full_like(x, -1)           # default: unrated (original value 0)
    out[(x == 1) | (x == 2)] = 0        # disliked
    out[x >= 3] = 1                     # liked
    return out
# Transform (a slice of) the binarized test set through the fitted RBM.
rbm.transform(conv(test_set[10:])).shape
from sklearn.linear_model import LogisticRegression
a = LogisticRegression()
# a.  (dangling attribute access from the notebook -- syntax error, commented out)