-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbai23_ContentBasedRecommenderSys.py
107 lines (85 loc) · 3.24 KB
/
bai23_ContentBasedRecommenderSys.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding: utf-8 -*-
"""
Created on Sun May 3 15:58:40 2020
@author: phamk
"""
import pandas as pd
# Load the user table; the file is pipe-separated and the column names are
# supplied here through `names`.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)
n_users = len(users)  # one row per user
print('Number of users:', n_users)
# users.head()  # uncomment to preview the first few rows
# Read the rating files ('ua' train/test split). Both are tab-separated and
# share the same four columns, whose names are supplied through `names`.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
# Plain arrays are used from here on; column order follows r_cols.
rate_train = ratings_base.values
rate_test = ratings_test.values
print('Number of training rates:', rate_train.shape[0])
print('Number of test rates:', rate_test.shape[0])
# Load the item (movie) table. The column list is built from the descriptive
# columns followed by the genre columns, so the width of the genre slice
# below is derived from the list instead of being written out as a number.
info_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL']
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = info_cols + genre_cols
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')
n_items = len(items)  # one row per item
print('Number of items:', n_items)
X0 = items.values
# Keep only the trailing genre columns as the raw feature matrix.
X_train_counts = X0[:, -len(genre_cols):]
# print X0
#tfidf
from sklearn.feature_extraction.text import TfidfTransformer
# Turn the per-item genre columns into L2-normalised TF-IDF features and
# densify the result so it can be indexed and multiplied as a plain array.
transformer = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf_sparse = transformer.fit_transform(X_train_counts.tolist())
tfidf = tfidf_sparse.toarray()
import numpy as np
def get_items_rated_by_user(rate_matrix, user_id):
    """Return the items rated by one user together with the ratings given.

    Each row of ``rate_matrix`` is read as (user id, item id, rating, ...);
    only the first three columns are used. The ids stored in the matrix are
    treated as 1-based, while ``user_id`` and the returned item indices are
    0-based.

    Returns a tuple ``(item_ids, scores)`` of equal-length arrays, in the
    order the matching rows appear in ``rate_matrix``.
    """
    # Rows of the matrix belonging to this user (stored id is user_id + 1).
    rows = np.flatnonzero(rate_matrix[:, 0] == user_id + 1)
    # Shift the stored item ids down by one to get 0-based indices.
    return (rate_matrix[rows, 1] - 1, rate_matrix[rows, 2])
from sklearn.linear_model import Ridge
from sklearn import linear_model
# Fit one ridge-regression model per user on the TF-IDF features of the
# items that user rated; column `user` of W / b holds that user's model.
d = tfidf.shape[1]  # number of features per item
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))
for user in range(n_users):
    rated_ids, ratings = get_items_rated_by_user(rate_train, user)
    model = Ridge(alpha=0.01, fit_intercept=True)
    model.fit(tfidf[rated_ids, :], ratings)
    W[:, user] = model.coef_
    b[0, user] = model.intercept_
# Predicted scores for every (item, user) pair: rows are items, columns users.
Yhat = tfidf @ W + b
# Compare predictions with the held-out ratings of a single user.
n = 10  # 0-based user index; get_items_rated_by_user matches stored id n + 1
np.set_printoptions(precision=2) # 2 digits after .
ids, scores = get_items_rated_by_user(rate_test, n)
print('Rated movies ids :', ids )
print('True ratings :', scores)
# Yhat is indexed as [item, user] (it is tfidf.dot(W) + b), so the user's
# predictions are the rows `ids` of column `n`.
print('Predicted ratings:', Yhat[ids, n])
import math
def evaluate(Yhat, rates, W, b):
    """Return the root-mean-square error of ``Yhat`` against ``rates``.

    ``Yhat`` is indexed as ``Yhat[item_ids, user]``. ``rates`` is handed to
    ``get_items_rated_by_user`` once per user index in
    ``range(n_users)``, where ``n_users`` is the module-level user count.
    ``W`` and ``b`` are accepted but not used by the computation.
    """
    squared_error = 0
    n_ratings = 0
    for user in range(n_users):
        item_ids, truth = get_items_rated_by_user(rates, user)
        residual = truth - Yhat[item_ids, user]
        squared_error += (residual * residual).sum(axis=0)
        n_ratings += residual.size
    return math.sqrt(squared_error / n_ratings)
# Report the error on the training split, then on the held-out split.
rmse_train = evaluate(Yhat, rate_train, W, b)
print('RMSE for training:', rmse_train)
rmse_test = evaluate(Yhat, rate_test, W, b)
print('RMSE for test :', rmse_test)