# Mount Google Drive; needed only when running on Google Colab (comment out the two lines below otherwise)

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installing surprise library, only do it for first time
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
     |████████████████████████████████| 771 kB 4.8 MB/s 
Requirement already satisfied: joblib>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-surprise->surprise) (1.2.0)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.8/dist-packages (from scikit-surprise->surprise) (1.21.6)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.8/dist-packages (from scikit-surprise->surprise) (1.7.3)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... done
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626480 sha256=6da15ab2bdf42251d488dcd9c1b97ca19f9bfa68ccd80768ba6dde0e0f9c25bd
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


# Used to ignore the warning given as output of the code
import warnings                                 
warnings.filterwarnings('ignore')

# Basic libraries of python for numeric and dataframe computations
import numpy as np                              
import pandas as pd

# Basic library for data visualization
import matplotlib.pyplot as plt     

# Slightly advanced library for data visualization            
import seaborn as sns                           

# A dictionary output that does not raise a key error
from collections import defaultdict             

# Performance metrics (e.g. RMSE) provided by surprise
from surprise import accuracy

# Class is used to parse a file containing ratings, data should be in structure - user ; item ; rating
from surprise.reader import Reader

# Class for loading datasets
from surprise.dataset import Dataset

# For tuning model hyper-parameters
from surprise.model_selection import GridSearchCV

# For splitting the rating data in train and test dataset
from surprise.model_selection import train_test_split

# For implementing similarity based recommendation system
from surprise.prediction_algorithms.knns import KNNBasic

# For implementing matrix factorization based recommendation system
from surprise.prediction_algorithms.matrix_factorization import SVD

# For implementing cross validation
from surprise.model_selection import KFold


# Import the dataset.
# The Google Drive path below is the active one (Google Colab). For a local run,
# comment it out and uncomment the relative-path line instead.
#rating = pd.read_csv('ratings.csv')
rating = pd.read_csv('/content/drive/MyDrive/Colab/ratings.csv') # Google Colab path


rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


# Dropping timestamp column
rating = rating.drop(['timestamp'], axis=1)


# Printing the top 5 rows of the dataset Hint: use .head()

# Remove _______and complete the code
rating.head()


plt.figure(figsize = (12, 4))

# Remove _______and complete the code
sns.countplot(x='rating',data=rating)

plt.tick_params(labelsize = 10)
plt.title("Distribution of Ratings ", fontsize = 10)
plt.xlabel("Ratings", fontsize = 10)
plt.ylabel("Number of Ratings", fontsize = 10)
plt.show()


# Finding number of unique users
#remove _______ and complete the code
rating['userId'].nunique()

671


# Finding number of unique movies
# Remove _______ and complete the code

rating['movieId'].nunique()

9066


rating.groupby(['userId', 'movieId']).count()


rating.groupby(['userId', 'movieId']).count()['rating'].sum()

100004


# Remove _______ and complete the code
rating['movieId'].value_counts()

356       341
296       324
318       311
593       304
260       291
         ... 
98604       1
103659      1
104419      1
115927      1
6425        1
Name: movieId, Length: 9066, dtype: int64


# Plotting distributions of ratings for 341 interactions with movieid 356 
plt.figure(figsize=(7,7))

rating[rating['movieId'] == 356]['rating'].value_counts().plot(kind='bar')

plt.xlabel('Rating')

plt.ylabel('Count')

plt.show()


# Remove _______ and complete the code
rating['userId'].value_counts()

547    2391
564    1868
624    1735
15     1700
73     1610
       ... 
296      20
289      20
249      20
221      20
1        20
Name: userId, Length: 671, dtype: int64


# Finding user-movie interactions distribution: number of rated movies per user.
# The 'movieId' column is selected before aggregating so that only this column
# is counted (instead of counting every column and discarding the rest).
count_interactions = rating.groupby('userId')['movieId'].count()
count_interactions

userId
1       20
2       76
3       51
4      204
5      100
      ... 
667     68
668     20
669     37
670     31
671    115
Name: movieId, Length: 671, dtype: int64


# Plotting user-movie interactions distribution

plt.figure(figsize=(15,7))
# Remove _______ and complete the code

sns.histplot(count_interactions)

plt.xlabel('Number of Interactions by Users')

plt.show()


# Per-movie rating statistics.
# Group once and aggregate only the 'rating' column; averaging every column
# (e.g. userId) and throwing the result away is wasted work.
ratings_by_movie = rating.groupby('movieId')['rating']

# Calculating average ratings
average_rating = ratings_by_movie.mean()

# Calculating the count of ratings
count_rating = ratings_by_movie.count()

# Making a dataframe with the count and average of ratings
final_rating = pd.DataFrame({'avg_rating': average_rating, 'rating_count': count_rating})


final_rating.head()


def top_n_movies(data, n, min_interaction=100):
    """Return the ids of the n best-rated movies that have enough ratings.

    data: DataFrame indexed by movie id with the columns 'rating_count'
        and 'avg_rating'.
    n: maximum number of movie ids to return.
    min_interaction: minimum 'rating_count' a movie needs to be considered.

    Returns the index labels of the qualifying rows, ordered by
    'avg_rating' from highest to lowest and cut to at most n entries.
    """
    # Keep only the movies that were rated often enough
    enough_ratings = data['rating_count'] >= min_interaction
    popular = data.loc[enough_ratings]

    # Best average rating first
    ranked = popular.sort_values('avg_rating', ascending=False)

    return ranked.index[:n]


# Remove _______ and complete the code
list(top_n_movies(final_rating,5,min_interaction=50))

[858, 318, 969, 913, 1221]


# Remove _______ and complete the code
list(top_n_movies(final_rating,5,min_interaction=100))

[858, 318, 1221, 50, 527]


# Remove _______ and complete the code
list(top_n_movies(final_rating,5,min_interaction=200))

[858, 318, 50, 527, 608]


# Instantiating Reader scale with expected rating scale
reader = Reader(rating_scale=(0, 5))

# Loading the rating dataset
data = Dataset.load_from_df(rating[['userId', 'movieId', 'rating']], reader)

# Splitting the data into train and test dataset
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


# Remove _______ and complete the code
sim_options = {'name': 'cosine',
               'user_based': True}
# Defining Nearest neighbour algorithm
algo_knn_user = KNNBasic(sim_options=sim_options,verbose=False)

# Train the algorithm on the trainset or fitting the model on train dataset 
algo_knn_user.fit(trainset)

# Predict ratings for the testset
predictions = algo_knn_user.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.9925

0.9924509041520163


# Remove _______ and complete the code
algo_knn_user.predict(4,10, r_ui=4, verbose=True)

user: 4          item: 10         r_ui = 4.00   est = 3.62   {'actual_k': 40, 'was_impossible': False}

Prediction(uid=4, iid=10, r_ui=4, est=3.6244912065910952, details={'actual_k': 40, 'was_impossible': False})


# Remove _______ and complete the code
algo_knn_user.predict(4,3,verbose=True)

user: 4          item: 3          r_ui = None   est = 3.20   {'actual_k': 40, 'was_impossible': False}

Prediction(uid=4, iid=3, r_ui=None, est=3.202703552548654, details={'actual_k': 40, 'was_impossible': False})


# Remove _______ and complete the code

# Setting up parameter grid to tune the hyperparameters
param_grid = {'k': [20, 30, 40], 'min_k': [3, 6, 9],
              'sim_options': {'name': ['msd', 'cosine'],
                              'user_based': [True]}
             }

# Performing 3-fold cross validation to tune the hyperparameters
grid_obj = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

# Fitting the data
grid_obj.fit(data)

# Best RMSE score
print(grid_obj.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print(grid_obj.best_params['rmse'])

0.9653356985061953
{'k': 20, 'min_k': 3, 'sim_options': {'name': 'msd', 'user_based': True}}


results_df = pd.DataFrame.from_dict(grid_obj.cv_results)
results_df.head()


# Remove _______ and complete the code
sim_options = {'name': 'msd',
               'user_based': True}
# Using the optimal similarity measure for user-user based collaborative filtering
# Creating an instance of KNNBasic with optimal hyperparameter values
similarity_algo_optimized_user = KNNBasic(sim_options=sim_options, k=20, min_k=3,verbose=False)

# Training the algorithm on the trainset
similarity_algo_optimized_user.fit(trainset)

# Predicting ratings for the testset
predictions = similarity_algo_optimized_user.test(testset)

# Computing RMSE on testset
accuracy.rmse(predictions)

RMSE: 0.9571

0.9571445417153293


# Predicted rating of userId=4 for movieId=10 (actual rating r_ui=4) with the tuned user-user model.
# movieId 0 does not exist in the data, so predicting for it only returns the
# default estimate with was_impossible=True.
similarity_algo_optimized_user.predict(4, 10, r_ui=4, verbose=True)

user: 4          item: 0          r_ui = 4.00   est = 3.55   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Prediction(uid=4, iid=0, r_ui=4, est=3.5459045285801785, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})


# Remove _______ and complete the code
similarity_algo_optimized_user.predict(4,3, verbose=True)

user: 4          item: 3          r_ui = None   est = 3.72   {'actual_k': 20, 'was_impossible': False}

Prediction(uid=4, iid=3, r_ui=None, est=3.7228745701935386, details={'actual_k': 20, 'was_impossible': False})


# get_neighbors() takes and returns Surprise *inner* ids, not raw userIds.
# Translate raw userId=4 to its inner id, then map the neighbours back to raw userIds.
user_trainset = similarity_algo_optimized_user.trainset
inner_uid = user_trainset.to_inner_uid(4)
[user_trainset.to_raw_uid(neighbor) for neighbor in similarity_algo_optimized_user.get_neighbors(inner_uid, k=5)]

[665, 417, 647, 654, 260]


def get_recommendations(data, user_id, top_n, algo):
    """Return the top_n movies user_id has not rated yet, best predicted rating first.

    data: DataFrame with the columns 'userId', 'movieId' and 'rating'.
    user_id: id of the user to recommend for; must occur in data['userId'].
    top_n: maximum number of recommendations to return.
    algo: object whose predict(user_id, item_id) result has an `est` attribute.

    Returns a list of (movieId, predicted_rating) tuples sorted by predicted
    rating in descending order; equal predictions keep ascending movieId order.

    Raises KeyError if user_id does not occur in data.

    Only the user's own rows are inspected instead of pivoting the whole frame
    into a dense user x movie matrix, so duplicate (userId, movieId) rows are
    tolerated as well.
    """
    user_rows = data['userId'] == user_id
    if not user_rows.any():
        raise KeyError(user_id)

    # Movies the user has already rated (a missing rating does not count as an interaction)
    rated_movies = set(data.loc[user_rows & data['rating'].notna(), 'movieId'].tolist())

    # Every other movie of the dataset, in ascending movieId order
    non_interacted_movies = [
        item_id
        for item_id in sorted(data['movieId'].unique().tolist())
        if item_id not in rated_movies
    ]

    # Predicting the rating of each movie the user has not interacted with yet
    recommendations = [
        (item_id, algo.predict(user_id, item_id).est)
        for item_id in non_interacted_movies
    ]

    # Highest predicted rating first; the sort is stable, so ties stay in movieId order
    recommendations.sort(key=lambda x: x[1], reverse=True)

    return recommendations[:top_n]


#remove _______ and complete the code
recommendations = get_recommendations(rating,4,5,similarity_algo_optimized_user)


recommendations

[(309, 5),
 (3038, 5),
 (6273, 4.928202652354184),
 (98491, 4.863224466679252),
 (2721, 4.845513973527148)]


# Remove _______ and complete the code

# Defining similarity measure
sim_options = {'name': 'cosine',
               'user_based': False}

# Defining Nearest neighbour algorithm
algo_knn_item = KNNBasic(sim_options=sim_options,verbose=False)

# Train the algorithm on the trainset or fitting the model on train dataset 
algo_knn_item.fit(trainset)

# Predict ratings for the testset
predictions = algo_knn_item.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 1.0032

1.003221450633729


# Predicted rating of userId=4 for movieId=10 (actual rating r_ui=4) with the baseline item-item model.
# movieId 0 does not exist in the data, so predicting for it only returns the
# default estimate with was_impossible=True.
algo_knn_item.predict(4, 10, r_ui=4, verbose=True)

user: 4          item: 0          r_ui = 4.00   est = 3.55   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Prediction(uid=4, iid=0, r_ui=4, est=3.5459045285801785, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})


# Remove _______ and complete the code
algo_knn_item.predict(4,3,verbose=True)

user: 4          item: 3          r_ui = None   est = 4.07   {'actual_k': 40, 'was_impossible': False}

Prediction(uid=4, iid=3, r_ui=None, est=4.071601862880049, details={'actual_k': 40, 'was_impossible': False})


# Remove _______ and complete the code

# Setting up parameter grid to tune the hyperparameters
param_grid = {'k': [20, 30,40], 'min_k': [3,6,9],
              'sim_options': {'name': ['msd', 'cosine'],
                              'user_based': [False]}
              }

# Performing 3-fold cross validation to tune the hyperparameters
grid_obj = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

# Fitting the data
grid_obj.fit(data)

# Best RMSE score
print(grid_obj.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print(grid_obj.best_params['rmse'])

0.9399929753224469
{'k': 40, 'min_k': 3, 'sim_options': {'name': 'msd', 'user_based': False}}


results_df = pd.DataFrame.from_dict(grid_obj.cv_results)
results_df.head()


# Remove _______ and complete the code
# Creating an instance of KNNBasic with optimal hyperparameter values

similarity_algo_optimized_item = KNNBasic(sim_options= {'name': 'msd','user_based': False}, k=40, min_k=3,verbose=False)

# Training the algorithm on the trainset
similarity_algo_optimized_item.fit(trainset)

# Predicting ratings for the testset
predictions = similarity_algo_optimized_item.test(testset)

# Computing RMSE on testset
accuracy.rmse(predictions)

RMSE: 0.9433

0.9433184999641279


# Predicted rating of userId=4 for movieId=10 (actual rating r_ui=4) with the tuned item-item model.
# movieId 0 does not exist in the data, so predicting for it only returns the
# default estimate with was_impossible=True.
similarity_algo_optimized_item.predict(4, 10, r_ui=4, verbose=True)

user: 4          item: 0          r_ui = 4.00   est = 3.55   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Prediction(uid=4, iid=0, r_ui=4, est=3.5459045285801785, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})


# Remove _______ and complete the code
similarity_algo_optimized_item.predict(4, 3, verbose=True)

user: 4          item: 3          r_ui = None   est = 3.87   {'actual_k': 40, 'was_impossible': False}

Prediction(uid=4, iid=3, r_ui=None, est=3.865175609312417, details={'actual_k': 40, 'was_impossible': False})


# get_neighbors() takes and returns Surprise *inner* ids; for an item-based model these are item ids.
# Translate raw movieId=3 to its inner id, then map the neighbours back to raw movieIds.
item_trainset = similarity_algo_optimized_item.trainset
inner_iid = item_trainset.to_inner_iid(3)
[item_trainset.to_raw_iid(neighbor) for neighbor in similarity_algo_optimized_item.get_neighbors(inner_iid, k=5)]


# Remove _______ and complete the code
recommendations = get_recommendations(rating, 4, 5, similarity_algo_optimized_item)


recommendations


# Remove _______ and complete the code

# Using SVD matrix factorization
algo_svd = SVD()

# Training the algorithm on the trainset
algo_svd.fit(trainset)

# Predicting ratings for the testset
predictions = algo_svd.test(testset)

# Computing RMSE on the testset
accuracy.rmse(predictions)

RMSE: 0.9034

0.9034198535037269


# Remove _______ and complete the code
algo_svd.predict(4, 10, r_ui=4, verbose=True)

user: 4          item: 10         r_ui = 4.00   est = 4.15   {'was_impossible': False}

Prediction(uid=4, iid=10, r_ui=4, est=4.1542603434778105, details={'was_impossible': False})


# Remove _______ and complete the code
algo_svd.predict(4, 3, verbose=True)

user: 4          item: 3          r_ui = None   est = 3.53   {'was_impossible': False}

Prediction(uid=4, iid=3, r_ui=None, est=3.5325352777024848, details={'was_impossible': False})


# Remove _______ and complete the code

# Set the parameter space to tune
param_grid = {'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01],
              'reg_all': [0.2, 0.4, 0.6]}

# Performing 3-fold gridsearch cross validation
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

# Fitting data
gs.fit(data)

# Best RMSE score
print(gs.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.8951985726464798
{'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.2}


results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df.head()


# Remove _______ and complete the code

# Building the optimized SVD model using optimal hyperparameter search
svd_algo_optimized = SVD(n_epochs= 30,lr_all= 0.01, reg_all= 0.2)

# Training the algorithm on the trainset
svd_algo_optimized.fit(trainset)

# Predicting ratings for the testset
predictions = svd_algo_optimized.test(testset)

# Computing RMSE
accuracy.rmse(predictions)

RMSE: 0.8952

0.8951788601358034


# Remove _______ and complete the code
svd_algo_optimized.predict(4, 10, r_ui=4, verbose=True)

user: 4          item: 10         r_ui = 4.00   est = 3.99   {'was_impossible': False}

Prediction(uid=4, iid=10, r_ui=4, est=3.989244579160853, details={'was_impossible': False})


# Remove _______ and complete the code
svd_algo_optimized.predict(4, 3, verbose=True)

user: 4          item: 3          r_ui = None   est = 3.63   {'was_impossible': False}

Prediction(uid=4, iid=3, r_ui=None, est=3.6293926107395005, details={'was_impossible': False})


# Remove _______ and complete the code
get_recommendations(rating, 4, 5, svd_algo_optimized)

[(1192, 4.992219812811331),
 (116, 4.961040017697183),
 (926, 4.957147548937894),
 (1948, 4.927934951241887),
 (3310, 4.922485554135631)]


def predict_already_interacted_ratings(data, user_id, algo):
    """Compare actual and predicted ratings for the movies user_id has already rated.

    data: DataFrame with the columns 'userId', 'movieId' and 'rating'.
    user_id: id of the user to inspect; must occur in data['userId'].
    algo: object whose predict(user_id, item_id) result has an `est` attribute.

    Returns a DataFrame with the columns 'movieId', 'actual_rating' and
    'predicted_rating', one row per rated movie, ordered by the ACTUAL rating
    in descending order (equal ratings keep ascending movieId order).

    Raises KeyError if user_id does not occur in data.

    Only the user's own rows are read instead of pivoting the whole frame into
    a dense user x movie matrix; with duplicate (userId, movieId) rows every
    rating row yields one output row.
    """
    user_rows = data['userId'] == user_id
    if not user_rows.any():
        raise KeyError(user_id)

    # Rows of movies the user has actually rated, in ascending movieId order
    rated = data.loc[user_rows & data['rating'].notna(), ['movieId', 'rating']]
    rated = rated.sort_values('movieId', kind='stable')

    # Pair every actual rating with the rating the model predicts for that movie
    comparisons = [
        (item_id, actual_rating, algo.predict(user_id, item_id).est)
        for item_id, actual_rating in zip(rated['movieId'].tolist(), rated['rating'].tolist())
    ]

    # Highest actual rating first (x[1] is the actual rating, not the prediction)
    comparisons.sort(key=lambda x: x[1], reverse=True)

    return pd.DataFrame(comparisons, columns=['movieId', 'actual_rating', 'predicted_rating'])


predicted_ratings_for_interacted_movies = predict_already_interacted_ratings(rating, 7, similarity_algo_optimized_item)
df = predicted_ratings_for_interacted_movies.melt(id_vars='movieId', value_vars=['actual_rating', 'predicted_rating'])
sns.displot(data=df, x='value', hue='variable', kde=True);


predicted_ratings_for_interacted_movies = predict_already_interacted_ratings(rating, 7, svd_algo_optimized)
df = predicted_ratings_for_interacted_movies.melt(id_vars='movieId', value_vars=['actual_rating', 'predicted_rating'])
sns.displot(data=df, x='value', hue='variable', kde=True);


# Instantiating Reader scale with expected rating scale
reader = Reader(rating_scale=(0, 5))

# Loading the rating dataset
data = Dataset.load_from_df(rating[['userId', 'movieId', 'rating']], reader)

# Splitting the data into train and test dataset
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


# Function can be found on surprise documentation FAQs
# Adapted from the surprise documentation FAQs
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k separately for every user.

    predictions: iterable of 5-tuples (uid, iid, true_r, est, details).
    k: number of top-estimated items considered per user.
    threshold: a rating (true or estimated) at or above this value counts
        as relevant / recommended.

    Returns two dicts keyed by uid: (precisions, recalls). A metric whose
    denominator is zero is reported as 0.
    """
    # Group the (estimated, true) rating pairs by user
    ratings_by_user = defaultdict(list)
    for uid, _iid, actual, estimated, _details in predictions:
        ratings_by_user[uid].append((estimated, actual))

    precisions = {}
    recalls = {}
    for uid, pairs in ratings_by_user.items():
        # Rank this user's items by estimated rating, best first
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items among everything the user rated
        relevant_total = sum((actual >= threshold) for (_, actual) in ranked)

        # Items inside the top k that would be recommended
        recommended_in_k = sum((estimated >= threshold) for (estimated, _) in top_k)

        # Items inside the top k that are both relevant and recommended
        hits_in_k = sum(
            ((actual >= threshold) and (estimated >= threshold))
            for (estimated, actual) in top_k
        )

        # Precision@k: share of recommended items that are relevant (0 when nothing is recommended)
        if recommended_in_k != 0:
            precisions[uid] = hits_in_k / recommended_in_k
        else:
            precisions[uid] = 0

        # Recall@k: share of relevant items that are recommended (0 when nothing is relevant)
        if relevant_total != 0:
            recalls[uid] = hits_in_k / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls


# A basic cross-validation iterator. The fixed seed makes every kf.split(data)
# call yield the same folds, so all models are scored on identical splits and
# the numbers are reproducible between runs.
kf = KFold(n_splits=5, random_state=42)

# Make list of k values
K = [5, 10]

# Make list of models, plus a parallel list of labels: four of the six models
# are KNNBasic instances, so the class name alone cannot tell them apart.
models = [algo_knn_user, similarity_algo_optimized_user, algo_knn_item, similarity_algo_optimized_item, algo_svd, svd_algo_optimized]
model_labels = ['user-user baseline', 'user-user tuned', 'item-item baseline', 'item-item tuned', 'SVD baseline', 'SVD tuned']

# fold_scores[k][label] = (per-fold mean precisions, per-fold mean recalls)
fold_scores = {k: {label: ([], []) for label in model_labels} for k in K}

for label, model in zip(model_labels, models):
    # Dedicated loop names keep the global trainset/testset split untouched.
    # Note: each model ends up fitted on the training part of the last fold.
    for fold_trainset, fold_testset in kf.split(data):
        model.fit(fold_trainset)
        fold_predictions = model.test(fold_testset, verbose=False)

        # The predictions do not depend on k, so every fold is fitted once
        # and then scored for each k.
        for k in K:
            precisions, recalls = precision_recall_at_k(fold_predictions, k=k, threshold=3.5)
            p, r = fold_scores[k][label]

            # Precision and recall can then be averaged over all users
            p.append(sum(precisions.values()) / len(precisions))
            r.append(sum(recalls.values()) / len(recalls))

# Report in the same order as before: all models for k=5, then all models for k=10
for k in K:
    for label, model in zip(model_labels, models):
        p, r = fold_scores[k][label]
        print('> k={}, model={} ({})'.format(k, model.__class__.__name__, label))
        print('-----> Precision: ', round(sum(p) / len(p), 3))
        print('-----> Recall: ', round(sum(r) / len(r), 3))

> k=5, model=KNNBasic
-----> Precision:  0.769
-----> Recall:  0.413
> k=5, model=KNNBasic
-----> Precision:  0.773
-----> Recall:  0.416
> k=5, model=KNNBasic
-----> Precision:  0.605
-----> Recall:  0.326
> k=5, model=KNNBasic
-----> Precision:  0.683
-----> Recall:  0.355
> k=5, model=SVD
-----> Precision:  0.754
-----> Recall:  0.386
> k=5, model=SVD
-----> Precision:  0.748
-----> Recall:  0.384
> k=10, model=KNNBasic
-----> Precision:  0.75
-----> Recall:  0.545
> k=10, model=KNNBasic
-----> Precision:  0.752
-----> Recall:  0.559
> k=10, model=KNNBasic
-----> Precision:  0.594
-----> Recall:  0.471
> k=10, model=KNNBasic
-----> Precision:  0.664
-----> Recall:  0.508
> k=10, model=SVD
-----> Precision:  0.739
-----> Recall:  0.522
> k=10, model=SVD
-----> Precision:  0.725
-----> Recall:  0.524

	userId	movieId	rating
0	1	31	2.5
1	1	1029	3.0
2	1	1061	3.0
3	1	1129	2.0
4	1	1172	4.0

		rating
userId	movieId
1	31	1
	1029	1
	1061	1
	1129	1
	1172	1
...	...	...
671	6268	1
	6269	1
	6365	1
	6385	1
	6565	1

	split0_test_rmse	split1_test_rmse	split2_test_rmse	mean_test_rmse	std_test_rmse	rank_test_rmse	split0_test_mae	split1_test_mae	split2_test_mae	mean_test_mae	std_test_mae	rank_test_mae	mean_fit_time	std_fit_time	mean_test_time	std_test_time	params	param_k	param_min_k	param_sim_options
0	0.958158	0.950637	0.943976	0.950924	0.005794	7	0.739567	0.734922	0.727845	0.734111	0.004820	7	5.021798	1.125042	16.716629	1.318054	{'k': 20, 'min_k': 3, 'sim_options': {'name': ...	20	3	{'name': 'msd', 'user_based': False}
1	1.021201	1.011993	1.005091	1.012762	0.006599	16	0.796983	0.790784	0.782162	0.789976	0.006077	16	6.838112	1.777080	13.569611	2.775545	{'k': 20, 'min_k': 3, 'sim_options': {'name': ...	20	3	{'name': 'cosine', 'user_based': False}
2	0.958155	0.950605	0.944098	0.950953	0.005744	8	0.739654	0.734957	0.728037	0.734216	0.004771	8	2.813337	0.281329	11.369127	0.072564	{'k': 20, 'min_k': 6, 'sim_options': {'name': ...	20	6	{'name': 'msd', 'user_based': False}
3	1.021404	1.012055	1.005280	1.012913	0.006610	17	0.797123	0.790878	0.782375	0.790125	0.006044	17	3.878078	0.134031	13.151879	1.024427	{'k': 20, 'min_k': 6, 'sim_options': {'name': ...	20	6	{'name': 'cosine', 'user_based': False}
4	0.959678	0.950621	0.944010	0.951436	0.006422	9	0.740640	0.735047	0.728180	0.734622	0.005095	9	2.792980	0.271339	15.215239	2.727743	{'k': 20, 'min_k': 9, 'sim_options': {'name': ...	20	9	{'name': 'msd', 'user_based': False}

	split0_test_rmse	split1_test_rmse	split2_test_rmse	mean_test_rmse	std_test_rmse	rank_test_rmse	split0_test_mae	split1_test_mae	split2_test_mae	mean_test_mae	std_test_mae	rank_test_mae	mean_fit_time	std_fit_time	mean_test_time	std_test_time	params	param_n_epochs	param_lr_all	param_reg_all
0	0.940377	0.945866	0.944982	0.943742	0.002406	25	0.737076	0.739329	0.738857	0.738420	0.000970	25	0.800517	0.047832	0.459344	0.027451	{'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.2}	10	0.001	0.2
1	0.945583	0.949379	0.949643	0.948202	0.001855	26	0.742707	0.743683	0.743896	0.743429	0.000518	26	0.782589	0.020786	0.436198	0.006444	{'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.4}	10	0.001	0.4
2	0.951009	0.954499	0.954042	0.953183	0.001549	27	0.748806	0.749643	0.749080	0.749176	0.000348	27	0.785872	0.034149	0.425623	0.003899	{'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.6}	10	0.001	0.6
3	0.905806	0.909565	0.908088	0.907820	0.001546	10	0.702849	0.703889	0.702569	0.703103	0.000568	9	0.817078	0.030645	0.474087	0.047315	{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.2}	10	0.005	0.2
4	0.913282	0.916795	0.914698	0.914925	0.001443	15	0.710566	0.711731	0.710063	0.710787	0.000698	15	0.792537	0.030592	0.430651	0.011269	{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}	10	0.005	0.4

	avg_rating	rating_count
movieId
1	3.872470	247
2	3.401869	107
3	3.161017	59
4	2.384615	13
5	3.267857	56

Project - Recommendation Systems: Movie Recommendation System¶

Marks: 40¶

Context¶

Objective¶

Dataset¶

Importing the necessary libraries and overview of the dataset¶

Loading the data¶

Question 1: Exploring the dataset (7 Marks)¶

Q 1.1 Print the top 5 rows of the dataset (1 Mark)¶

Q 1.2 Describe the distribution of ratings. (1 Mark)¶

Q 1.3 What is the total number of unique users and unique movies? (1 Mark)¶

Q 1.4 Is there a movie in which the same user interacted with it more than once? (1 Mark)¶

Q 1.5 Which is the most interacted movie in the dataset? (1 Mark)¶

Q 1.6 Which user interacted the most with any movie in the dataset? (1 Mark)¶

Q 1.7 What is the distribution of the user-movie interactions in this dataset? (1 Mark)¶

As we have now explored the data, let's start building Recommendation systems¶

Question 2: Create Rank-Based Recommendation System (3 Marks)¶

Model 1: Rank-Based Recommendation System¶

Recommending top 5 movies with 50 minimum interactions based on popularity¶

Recommending top 5 movies with 100 minimum interactions based on popularity¶

Recommending top 5 movies with 200 minimum interactions based on popularity¶

Model 2: User based Collaborative Filtering Recommendation System (7 Marks)¶

Building Similarity/Neighborhood based Collaborative Filtering¶

Building a baseline user-user similarity based recommendation system¶

Making the dataset into surprise dataset and splitting it into train and test set¶

Build the first baseline similarity based recommendation system using cosine similarity and KNN¶

Q 3.1 What is the RMSE for baseline user based collaborative filtering recommendation system? (1 Mark)¶

Q 3.2 What is the Predicted rating for an user with userId=4 and for movieId=10 and movieId=3? (1 Mark)¶

Improving user-user similarity based recommendation system by tuning its hyper-parameters¶

Q 3.3 Perform hyperparameter tuning for the baseline user based collaborative filtering recommendation system and find the RMSE for tuned user based collaborative filtering recommendation system? (3 Marks)¶

Q 3.4 What is the Predicted rating for an user with userId =4 and for movieId= 10 and movieId=3 using tuned user based collaborative filtering? (1 Mark)¶

Identifying similar users to a given user (nearest neighbors)¶

Implementing the recommendation algorithm based on optimized KNNBasic model¶

Predicted top 5 movies for userId=4 with similarity based recommendation system¶

Q 3.5 Predict the top 5 movies for userId=4 with similarity based recommendation system (1 Mark)¶

Model 3: Item based Collaborative Filtering Recommendation System (7 Marks)¶

Q 4.1 What is the RMSE for baseline item based collaborative filtering recommendation system ?(1 Mark)¶

Q 4.2 What is the Predicted rating for an user with userId =4 and for movieId= 10 and movieId=3? (1 Mark)¶

Q 4.3 Perform hyperparameter tuning for the baseline item based collaborative filtering recommendation system and find the RMSE for tuned item based collaborative filtering recommendation system? (3 Marks)¶

Q 4.4 What is the Predicted rating for a user with userId=4 and for movieId=10 and movieId=3 using tuned item based collaborative filtering? (1 Mark)¶

Identifying similar items to a given item (nearest neighbors)¶

Predicted top 5 movies for userId=4 with similarity based recommendation system¶

Q 4.5 Predict the top 5 movies for userId=4 with similarity based recommendation system (1 Mark)¶

Model 4: Model Based Collaborative Filtering - Matrix Factorization using SVD (7 Marks)¶

Singular Value Decomposition (SVD)¶

U-matrix¶

Sigma-matrix¶

V-transpose matrix¶

Build a baseline matrix factorization recommendation system¶

Q 5.1 What is the RMSE for baseline SVD based collaborative filtering recommendation system? (1 Mark)¶

Q 5.2 What is the Predicted rating for an user with userId =4 and for movieId= 10 and movieId=3? (1 Mark)¶

Improving matrix factorization based recommendation system by tuning its hyper-parameters¶

Q 5.3 Perform hyperparameter tuning for the baseline SVD based collaborative filtering recommendation system and find the RMSE for tuned SVD based collaborative filtering recommendation system? (3 Marks)¶

Q 5.4 What is the Predicted rating for an user with userId =4 and for movieId= 10 and movieId=3 using SVD based collaborative filtering? (1 Mark)¶

Q 5.5 Predict the top 5 movies for userId=4 with SVD based recommendation system?(1 Mark)¶

Predicting ratings for already interacted movies¶

Precision and Recall @ k¶

Question 6: Compute the precision and recall, for each of the 6 models, at k = 5 and 10. This is 6 x 2 = 12 numerical values. (4 Marks)¶

Question 7 ( 5 Marks)¶

Conclusions¶