import numpy as np
import pandas as pd
from time import time, ctime
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
# Dataset description: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt
# (BUGFIX: the bare URL and the `!ls` IPython shell magic were SyntaxErrors
# in a plain .py file; kept here as comments.)
# !ls ./Boltzmann_Machines/ml-100k/
# Load the MovieLens 1M movie catalogue: MovieID::Title::Genres.
movies = pd.read_csv('./Boltzmann_Machines/ml-1m/movies.dat',
                     sep='::', header=None,
                     engine='python', encoding='latin-1',
                     # BUGFIX: 'MovieID, Title' was a single fused string,
                     # leaving the 3-column file with only 2 names.
                     names=['MovieID', 'Title', 'Genres'])
print(movies.shape)  # BUGFIX: Python-2 print statement -> function call
movies.head()
# Load the user table: UserID::Gender::Age::Occupation::Zip-code.
users = pd.read_csv('./Boltzmann_Machines/ml-1m/users.dat',
                    sep='::', header=None,
                    engine='python', encoding='latin-1',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip'])
print(users.shape)  # BUGFIX: Python-2 print statement -> function call
users.head()
# Load the 1M ratings table: UserID::MovieID::Rating::Timestamp.
ratings = pd.read_csv('./Boltzmann_Machines/ml-1m/ratings.dat',
                      sep='::', header=None,
                      engine='python', encoding='latin-1',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(ratings.shape)  # BUGFIX: Python-2 print statement -> function call
ratings.head()
from datetime import datetime
# %%time  (BUGFIX: IPython cell magic -- invalid syntax in a plain .py file)
# Derive a human-readable datetime column from the unix Timestamp.
ratings['Time'] = ratings['Timestamp'].apply(datetime.fromtimestamp)
# Spot-check of the conversion on one raw timestamp value.
datetime.fromtimestamp(int("874965758")).strftime('%Y-%m-%d %H:%M:%S')
ratings.tail()
# Switch to the smaller ml-100k split: u1.base is the tab-separated
# training fold (UserID, MovieID, Rating, Timestamp).
training_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.base',
                           delimiter='\t',
                           names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(training_set.shape)  # BUGFIX: Python-2 print statement -> function call
training_set.head()
t_df = training_set.copy()  # keep a DataFrame copy before the array cast
training_set = np.array(training_set, dtype='int')
training_set[:2]
# u1.test is the matching held-out test fold, same column layout.
test_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.test',
                       delimiter='\t',
                       names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(test_set.shape)  # BUGFIX: Python-2 print statement -> function call
test_set.head()
e_df = test_set.copy()  # keep a DataFrame copy before the array cast
test_set = np.array(test_set, dtype='int')
test_set[:2]
# Dimensions of the dense user x movie matrix: the largest ID seen in
# either split (MovieLens IDs are 1-based, so max ID == row/column count
# needed to index every user and movie).
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))
nb_users, nb_movies
# C=lambda X: (len(t_df[X].unique()), len(e_df[X].unique()))
# CC=lambda X: (t_df[X].max(), e_df[X].max())
# C('UserID')
# Exploratory sanity checks: distinct user/movie IDs across both splits
# (may be smaller than the max ID if IDs have gaps), and how many raw
# test rows sum to zero.
len(set(np.concatenate((test_set[:,0], training_set[:,0])))), len(set(np.concatenate((test_set[:,1], training_set[:,1]))))
pd.DataFrame(test_set)[pd.DataFrame(test_set).sum(1)==0].shape
def convert(data, num_users=None, num_movies=None):
    """Convert a (UserID, MovieID, Rating, ...) integer array into a dense
    per-user ratings list-of-lists.

    Parameters
    ----------
    data : np.ndarray
        Rows of [UserID, MovieID, Rating, ...]; both IDs are 1-based.
    num_users, num_movies : int, optional
        Output dimensions. Default to the module-level ``nb_users`` /
        ``nb_movies`` so the original zero-argument call sites still work.

    Returns
    -------
    list[list[float]]
        ``num_users`` rows of length ``num_movies``; 0.0 marks "not rated".
    """
    if num_users is None:
        num_users = nb_users
    if num_movies is None:
        num_movies = nb_movies
    new_data = []
    for user_id in range(1, num_users + 1):
        user_rows = data[:, 0] == user_id
        movie_ids = data[:, 1][user_rows]
        user_ratings = data[:, 2][user_rows]
        row = np.zeros(num_movies)
        row[movie_ids - 1] = user_ratings  # shift 1-based IDs to 0-based index
        new_data.append(list(row))
    return new_data
# %%time  (BUGFIX: IPython cell magic -- invalid syntax in a plain .py file)
# Replace the raw arrays with dense per-user rating matrices.
training_set = convert(training_set)
test_set = convert(test_set)
print(pd.DataFrame(training_set).shape)  # BUGFIX: py2 print statement -> call
len(training_set), len(training_set[0])
pd.DataFrame(training_set).head()
t_df.tail()
# BUGFIX: `df` was not defined until later in the file, so this cell raised
# NameError when run top-to-bottom; build it from the saved copy first.
df = t_df.copy()
# Build {MovieID: Rating} per user via groupby, as a cross-check of convert().
tmp = df.groupby('UserID').apply(lambda x: zip(list(x['MovieID']), list(x['Rating'])))
tmp.head()
tmp = tmp.apply(lambda x: {i[0]: i[1] for i in x})
tmp.head()
pd.DataFrame(tmp.tolist()).head()
from collections import defaultdict
# Scratch experiment: reading a missing key from a defaultdict(int)
# materialises it with value 0, so this loop pre-creates zero entries for
# every movie id 1..1682.
d = defaultdict(int)
for i in range(1, 1683): d[i]
# def helper(x):
#     d = defaultdict(int)
#     for i in range(1, 1683): d[i]
#     d[x['MovieID']] = x['Rating']
#     return d
# %%time  (BUGFIX: IPython cell magic -- invalid syntax in a plain .py file)
# Pandas-only construction of the same user x movie matrix, used to
# cross-check convert()'s output.
df = t_df.copy()
# df.columns=['user','movie','rating','time']
tmp = df.groupby('UserID').apply(lambda x: zip(list(x['MovieID']), list(x['Rating'])))
tmp = tmp.apply(lambda x: {i[0]: i[1] for i in x})
# One row per user, one column per movie id; NaN (unrated) -> 0.
mapped = pd.DataFrame(tmp.tolist(), columns=range(1, nb_movies + 1)).fillna(0)
mapped.shape
mapped.head()
mapped.values.shape
type(mapped.values)
# Cross-checks against the convert() result built above.
np.array_equal(mapped.values, np.array(training_set))
mapped.values.tolist() == training_set
# Move the dense matrices into torch tensors.
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)
# Recode ratings for the Bernoulli RBM:
#   0 (not rated) -> -1, ratings 1-2 -> 0 (disliked), 3-5 -> 1 (liked).
# Order matters: 0 must become -1 before 1/2 are mapped onto 0.
# (BUGFIX: the original applied `training_set[training_set==0]=-1` twice
# and echoed intermediate tensors; the duplicate no-op lines are removed.)
training_set[training_set == 0] = -1
training_set[training_set == 1] = 0
training_set[training_set == 2] = 0
training_set[training_set >= 3] = 1
test_set[test_set == 0] = -1
test_set[test_set == 1] = 0
test_set[test_set == 2] = 0
test_set[test_set >= 3] = 1
# --- Scratch cells: small torch API experiments (mm, t, expand_as,
# --- sigmoid, bernoulli, sum) used to prototype the RBM class below.
torch.randn(2, 4)
torch.randn(1, 7)
torch.randn(2, 4)
x = torch.IntTensor([[8, 2, 3, 5], [6, 2, 7, 9]])
x
y = torch.IntTensor([[66, 22, 11], [77, 33, 99], [5, 5, 11], [13, 31, 2]])
y
torch.mm(x, y)          # (2,4) @ (4,3) -> (2,3)
x
x.t()                   # transpose: (2,4) -> (4,2)
torch.mm(x, x.t())      # Gram-style products in both orders
torch.mm(x.t(), x)
q = torch.randn(1, 4)
print(type(q))  # BUGFIX: Python-2 print statement -> function call
q
q.expand_as(torch.rand(5, 4))   # broadcast a (1,4) row to (5,4)
aaa = torch.randn(3, 4)
aaa
torch.sigmoid(aaa)
aa = torch.rand(2, 6)
aa
# torch.bernoulli?
torch.bernoulli(aa)     # sample 0/1 with the given probabilities
cc = torch.rand(2, 4)
cc
torch.sum(cc)           # total, column sums, row sums
torch.sum(cc, 0)
torch.sum(cc, 1)
class RBM():
def __init__(self, nv, nh):
self.W = torch.randn(nh, nv)
self.a = torch.randn(1, nh) # one bias for each hidden node, with nh hidden nodes
self.b = torch.randn(1, nv) # one bias for each visible node
# sampling the hidden nodes according to the probabilities p(h) given v
def sample_h(self, x):
wx = torch.mm(x, self.W.t())
activation = wx + self.a.expand_as(wx)
p_h_given_v = torch.sigmoid(activation)
return p_h_given_v, torch.bernoulli(p_h_given_v)
def sample_v(self, y):
wy = torch.mm(y, self.W)
activation = wy + self.b.expand_as(wy)
p_v_given_h = torch.sigmoid(activation)
return p_v_given_h, torch.bernoulli(p_v_given_h)
def train(self, v0, vk, ph0, phk):
self.W += torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)
self.b += torch.sum((v0 - vk), 0)
self.a += torch.sum((ph0 - phk), 0)
def __repr__(self):
string= "RBM object\nnv: {}\nnh: {}".format(nv, nh)
return string
# RBM hyper-parameters.
nv = len(training_set[0])  # visible nodes = number of movies
nh = 100                   # hidden nodes: free modelling choice
print(nv, nh)  # BUGFIX: Python-2 print statement -> function call
batch_size = 64
rbm = RBM(nv, nh)
nb_epoch = 12
rbm
# %%time  (BUGFIX: IPython cell magic -- invalid syntax in a plain .py file)
# Train the RBM with CD-10 over mini-batches of users.
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.
    # NOTE(review): this range skips the final partial batch of users.
    for id_user in range(0, nb_users - batch_size, batch_size):
        vk = training_set[id_user:id_user + batch_size]  # Gibbs chain state
        v0 = training_set[id_user:id_user + batch_size]  # fixed target batch
        ph0, _ = rbm.sample_h(v0)
        for k in range(10):  # 10 alternating Gibbs sampling steps
            _, hk = rbm.sample_h(vk)
            _, vk = rbm.sample_v(hk)
            vk[v0 < 0] = v0[v0 < 0]  # keep unrated entries clamped at -1
        phk, _ = rbm.sample_h(vk)
        rbm.train(v0, vk, ph0, phk)
        # NOTE(review): masks with v0>0 (liked only), while the test loop
        # below uses >=0 (all rated) -- confirm which is intended.
        train_loss += torch.mean(torch.abs(v0[v0 > 0] - vk[v0 > 0]))
        s += 1.
    # BUGFIX: Python-2 print statement -> function call
    print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss / s))
training_set
# Testing the RBM: take one Gibbs step from each user's training vector
# and score the reconstruction against that user's held-out test ratings.
test_loss = 0
s = 0.
for user_idx in range(nb_users):
    visible = training_set[user_idx:user_idx + 1]
    target = test_set[user_idx:user_idx + 1]
    rated = target >= 0
    if len(target[rated]) == 0:
        continue  # this user has no test ratings to score
    _, hidden = rbm.sample_h(visible)
    _, visible = rbm.sample_v(hidden)
    test_loss += torch.mean(torch.abs(target[rated] - visible[rated]))
    s += 1.
print('test loss: '+str(test_loss/s))