In [1]:
import numpy as np
import pandas as pd
from time import time, ctime
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
In [2]:
!ls ./Boltzmann_Machines/ml-100k/
README	   u.data   u.item	  u1.base  u2.test  u4.base  u5.test  ub.base
allbut.pl  u.genre  u.occupation  u1.test  u3.base  u4.test  ua.base  ub.test
mku.sh	   u.info   u.user	  u2.base  u3.test  u5.base  ua.test
In [3]:
movies = pd.read_csv('./Boltzmann_Machines/ml-1m/movies.dat',
                     sep='::', header=None,
                     engine='python', encoding='latin-1',
                     names=['MovieID', 'Title', 'Genres'])
print(movies.shape)
movies.head()
(3883, 3)
Out[3]:
MovieID Title Genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy

UserID::Gender::Age::Occupation::Zip-code

In [4]:
users = pd.read_csv('./Boltzmann_Machines/ml-1m/users.dat',
                    sep='::', header=None,
                    engine='python', encoding='latin-1',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip'])
print(users.shape)
users.head()
(6040, 5)
Out[4]:
UserID Gender Age Occupation Zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455

UserID::MovieID::Rating::Timestamp

In [5]:
ratings = pd.read_csv('./Boltzmann_Machines/ml-1m/ratings.dat',
                      sep='::', header=None,
                      engine='python', encoding='latin-1',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(ratings.shape)
ratings.head()
(1000209, 4)
Out[5]:
UserID MovieID Rating Timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
In [6]:
from datetime import datetime
In [7]:
%%time
ratings['Time']=ratings['Timestamp'].apply(datetime.fromtimestamp)
CPU times: user 1.3 s, sys: 436 ms, total: 1.73 s
Wall time: 1.73 s
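
An equivalent vectorized form (an aside, not from the original run): pd.to_datetime with unit='s' performs the same conversion in one call, though it yields UTC-based datetimes while datetime.fromtimestamp uses the local timezone, so the two columns can differ by the UTC offset.

    ratings['Time'] = pd.to_datetime(ratings['Timestamp'], unit='s')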
In [8]:
datetime.fromtimestamp(int("874965758")).strftime('%Y-%m-%d %H:%M:%S')
Out[8]:
'1997-09-22 18:02:38'
In [9]:
ratings.tail()
Out[9]:
UserID MovieID Rating Timestamp Time
1000204 6040 1091 1 956716541 2000-04-25 22:35:41
1000205 6040 1094 5 956704887 2000-04-25 19:21:27
1000206 6040 562 5 956704746 2000-04-25 19:19:06
1000207 6040 1096 4 956715648 2000-04-25 22:20:48
1000208 6040 1097 4 956715569 2000-04-25 22:19:29

Training and test sets

  • from the 100k dataset
In [10]:
training_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.base',
                           delimiter='\t', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(training_set.shape)
training_set.head()
(80000, 4)
Out[10]:
UserID MovieID Rating Timestamp
0 1 1 5 874965758
1 1 2 3 876893171
2 1 3 4 878542960
3 1 4 3 876893119
4 1 5 3 889751712
In [11]:
t_df = training_set.copy()  # keep a DataFrame copy for the pandas approach below
training_set = np.array(training_set, dtype='int')
training_set[:2]
Out[11]:
array([[        1,         1,         5, 874965758],
       [        1,         2,         3, 876893171]])
In [12]:
test_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.test',
                       delimiter='\t', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(test_set.shape)
test_set.head()
(20000, 4)
Out[12]:
UserID MovieID Rating Timestamp
0 1 6 5 887431973
1 1 10 3 875693118
2 1 12 5 878542960
3 1 14 5 874965706
4 1 17 3 875073198
In [13]:
e_df = test_set.copy()  # DataFrame copy of the test split
test_set = np.array(test_set, dtype='int')
test_set[:2]
Out[13]:
array([[        1,         6,         5, 887431973],
       [        1,        10,         3, 875693118]])
In [14]:
nb_users  = int(max(max(training_set[:,0]), max(test_set[:,0])))   # highest user id in either split
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))   # highest movie id in either split
nb_users, nb_movies
Out[14]:
(943, 1682)
In [15]:
# C=lambda X: (len(t_df[X].unique()), len(e_df[X].unique()))
# CC=lambda X: (t_df[X].max(), e_df[X].max())
# C('UserID')
In [16]:
# counts of distinct user and movie ids across both splits; they match the max ids above
(len(set(np.concatenate((test_set[:,0], training_set[:,0])))),
 len(set(np.concatenate((test_set[:,1], training_set[:,1])))))
Out[16]:
(943, 1682)
In [17]:
# sanity check: the raw test array contains no all-zero rows
pd.DataFrame(test_set)[pd.DataFrame(test_set).sum(1)==0].shape
Out[17]:
(0, 4)
In [18]:
def convert(data):
    """Convert the long-format (user, movie, rating) rows into a list of
    per-user rating vectors of length nb_movies, with 0 for unrated."""
    new_data = []
    for id_users in range(1, nb_users + 1):
        id_movies  = data[:, 1][data[:, 0] == id_users]   # movies this user rated
        id_ratings = data[:, 2][data[:, 0] == id_users]   # the corresponding ratings
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings               # movie ids are 1-based
        new_data.append(list(ratings))
    return new_data
In [19]:
%%time
training_set = convert(training_set)
test_set = convert(test_set)
CPU times: user 316 ms, sys: 20 ms, total: 336 ms
Wall time: 333 ms
In [20]:
print(pd.DataFrame(training_set).shape)
len(training_set), len(training_set[0])
(943, 1682)
Out[20]:
(943, 1682)
In [21]:
pd.DataFrame(training_set).head()
Out[21]:
0 1 2 3 4 5 6 7 8 9 ... 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681
0 5.0 3.0 4.0 3.0 3.0 0.0 4.0 1.0 5.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1682 columns

Alternative: build the same matrix with pandas

In [22]:
t_df.tail()
Out[22]:
UserID MovieID Rating Timestamp
79995 943 1067 2 875501756
79996 943 1074 4 888640250
79997 943 1188 3 888640250
79998 943 1228 3 888640275
79999 943 1330 3 888692465
In [24]:
tmp = t_df.groupby('UserID').apply(lambda x: list(zip(x['MovieID'], x['Rating'])))
In [25]:
tmp.head()
Out[25]:
UserID
1    [(1, 5), (2, 3), (3, 4), (4, 3), (5, 3), (7, 4...
2    [(1, 4), (10, 2), (14, 4), (25, 4), (100, 5), ...
3    [(181, 4), (258, 2), (260, 4), (268, 3), (271,...
4    [(11, 4), (210, 3), (258, 5), (271, 4), (300, ...
5    [(21, 3), (25, 3), (29, 4), (50, 4), (63, 1), ...
dtype: object
In [26]:
tmp = tmp.apply(lambda x: {movie: rating for movie, rating in x})
In [27]:
tmp.head()
Out[27]:
UserID
1    {1: 5, 2: 3, 3: 4, 4: 3, 5: 3, 7: 4, 8: 1, 9: ...
2    {1: 4, 258: 3, 10: 2, 269: 4, 14: 4, 272: 5, 2...
3    {258: 2, 260: 4, 268: 3, 271: 3, 288: 2, 302: ...
4    {258: 5, 359: 5, 324: 5, 358: 2, 327: 5, 328: ...
5    {21: 3, 25: 3, 29: 4, 50: 4, 63: 1, 66: 1, 70:...
dtype: object
In [28]:
pd.DataFrame(tmp.tolist()).head()  # only movies rated in this split become columns, hence 1650 of them
Out[28]:
1 2 3 4 5 6 7 8 9 10 ... 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682
0 5.0 3.0 4.0 3.0 3.0 NaN 4.0 1.0 5.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 4.0 NaN NaN NaN NaN NaN NaN NaN NaN 2.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 1650 columns

In [29]:
from collections import defaultdict
In [30]:
# pre-populate a dict with every movie id; defaultdict(int) yields 0 on first access
d = defaultdict(int)
for i in range(1, 1683): d[i]
In [31]:
# def helper(x):
#     d = defaultdict(int)
#     for i in range(1, 1683): d[i]

#     d[x['MovieID']]=x['Rating']
#     return d
In [32]:
%%time
df = t_df.copy()
# df.columns=['user','movie','rating','time']
tmp = df.groupby('UserID').apply(lambda x: list(zip(x['MovieID'], x['Rating'])))
tmp = tmp.apply(lambda x: {movie: rating for movie, rating in x})

mapped = pd.DataFrame(tmp.tolist(), columns=range(1, nb_movies + 1)).fillna(0)
mapped.shape
CPU times: user 636 ms, sys: 4 ms, total: 640 ms
Wall time: 640 ms
In [33]:
mapped.head()
Out[33]:
1 2 3 4 5 6 7 8 9 10 ... 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682
0 5.0 3.0 4.0 3.0 3.0 0.0 4.0 1.0 5.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1682 columns

In [34]:
mapped.values.shape
Out[34]:
(943, 1682)
In [35]:
type(mapped.values)
Out[35]:
numpy.ndarray

The two methods yield the same result

In [36]:
np.array_equal(mapped.values, np.array(training_set))
Out[36]:
True
In [37]:
mapped.values.tolist()==training_set
Out[37]:
True
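
A more direct pandas idiom (a sketch, assuming every user id appears in the training split and that t_df still holds the raw long-format ratings) builds the same matrix with a single pivot:

    pivot = (t_df.pivot(index='UserID', columns='MovieID', values='Rating')
                 .reindex(columns=range(1, nb_movies + 1))   # add columns for movies unrated in this split
                 .fillna(0))
    pivot.shape   # (943, 1682); values identical to mapped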
In [53]:
train_list = training_set[:]   # stash the list-of-lists version (shallow copy) before mutating
In [59]:
training_set = train_list[:]
training_set = np.array(training_set)
In [60]:
training_set
Out[60]:
array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])
In [61]:
training_set[training_set == 0] = -1   # 0 meant "unrated"; mark those entries as -1
training_set
Out[61]:
array([[ 5.,  3.,  4., ..., -1., -1., -1.],
       [ 4., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       ..., 
       [ 5., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1.,  5., -1., ..., -1., -1., -1.]])
In [62]:
training_set[training_set == 1] = 0    # ratings 1-2 -> 0 (not liked)
training_set[training_set == 2] = 0
training_set[training_set >= 3] = 1    # ratings 3-5 -> 1 (liked)
training_set
Out[62]:
array([[ 1.,  1.,  1., ..., -1., -1., -1.],
       [ 1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       ..., 
       [ 1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1.,  1., -1., ..., -1., -1., -1.]])
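
A caveat before fitting (an aside not raised in the original notebook): scikit-learn's BernoulliRBM models binary visible units and expects inputs in the [0, 1] range, so the -1 "unrated" markers fall outside what it is designed for. A hedged workaround is to fit on a copy with the unrated entries mapped back to 0:

    X = training_set.copy()
    X[X == -1] = 0        # treat "unrated" as 0 so every value lies in {0, 1}
    assert ((X == 0) | (X == 1)).all()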
In [68]:
from sklearn.neural_network import BernoulliRBM
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
In [76]:
BernoulliRBM?
In [116]:
rbm = BernoulliRBM(verbose=1, learning_rate=0.00003, n_components=1000)
rbm
Out[116]:
BernoulliRBM(batch_size=10, learning_rate=3e-05, n_components=1000, n_iter=10,
       random_state=None, verbose=1)
In [117]:
rbm.fit(training_set)
[BernoulliRBM] Iteration 1, pseudo-likelihood = -164.56, time = 7.33s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -285.08, time = 8.30s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -440.42, time = 8.22s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -669.67, time = 8.35s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -594.80, time = 8.45s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -730.89, time = 8.42s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -1090.68, time = 8.44s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -1078.60, time = 8.30s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -1338.02, time = 9.17s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -1509.13, time = 9.35s
Out[117]:
BernoulliRBM(batch_size=10, learning_rate=3e-05, n_components=1000, n_iter=10,
       random_state=None, verbose=1)
In [118]:
rbm.transform(training_set)
Out[118]:
array([[ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       ..., 
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.]])
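
transform() returns the hidden-unit activation probabilities P(h = 1 | v); rows of all 1s suggest the hidden units have saturated, consistent with the steadily worsening pseudo-likelihood during fitting. To inspect reconstructions instead, one Gibbs step can resample the visible units and be scored only on rated entries (a sketch, using the [0, 1]-clipped input discussed above):

    v = training_set.copy()
    v[v == -1] = 0                    # keep inputs in {0, 1}
    v_recon = rbm.gibbs(v)            # one Gibbs sampling step: v -> h -> v
    mask = training_set != -1         # score only entries the user actually rated
    print((v_recon[mask] == training_set[mask]).mean())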
In [83]:
training_set
Out[83]:
array([[ 1.,  1.,  1., ..., -1., -1., -1.],
       [ 1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       ..., 
       [ 1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1.,  1., -1., ..., -1., -1., -1.]])
In [85]:
def conv(x):
    # apply the same binarization used for the training set above:
    # 0 (unrated) -> -1, ratings 1-2 -> 0, ratings 3-5 -> 1
    # (the original mapped 1 and 2 to 1, inconsistent with the training set)
    x = np.array(x)
    x[x == 0] = -1
    x[x == 1] = 0
    x[x == 2] = 0
    x[x >= 3] = 1
    return x
In [104]:
rbm.transform(conv(test_set[10:])).shape  # note: this Out predates the n_components=1000 fit (In [104] vs In [116]), hence 100 columns
Out[104]:
(933, 100)
In [114]:
from sklearn.linear_model import LogisticRegression
In [115]:
a = LogisticRegression()
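
For reference (a sketch, not the author's code; parameters illustrative): the usual way BernoulliRBM and LogisticRegression are combined in scikit-learn is a feature-extraction pipeline, where the RBM's hidden activations feed the classifier. The label vector y below is hypothetical, since this ratings matrix carries no classification target of its own.

    from sklearn.pipeline import Pipeline

    clf = Pipeline([
        ('rbm', BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10)),
        ('logistic', LogisticRegression()),
    ])
    # clf.fit(X, y)   # y: hypothetical labels for a supervised task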