From 97c891f129fe56b9455fd3944d785aa16fb0a4af Mon Sep 17 00:00:00 2001 From: VemulapalliMukesh27 Date: Wed, 16 Oct 2024 21:20:05 +0530 Subject: [PATCH] #10 netflix-analysis-recommendation --- Ml-Ds/netflixRecommendation/Mukku27.ipynb | 667 ++++++++++++++++++++++ 1 file changed, 667 insertions(+) create mode 100644 Ml-Ds/netflixRecommendation/Mukku27.ipynb diff --git a/Ml-Ds/netflixRecommendation/Mukku27.ipynb b/Ml-Ds/netflixRecommendation/Mukku27.ipynb new file mode 100644 index 0000000..697839e --- /dev/null +++ b/Ml-Ds/netflixRecommendation/Mukku27.ipynb @@ -0,0 +1,667 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Netflix Movie Dataset Analysis and Recommendation System\n", + "\n", + "In this notebook, we will:\n", + "1. Download the Netflix movie dataset.\n", + "2. Perform exploratory data analysis (EDA).\n", + "3. Visualize data trends.\n", + "4. Draw deductions and insights.\n", + "5. 
Build a movie recommendation system using collaborative filtering.\n", + "\n", + "Let's begin by downloading the dataset and installing the necessary libraries.\n" + ], + "metadata": { + "id": "4MazqJvohoJi" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Dataset Download\n" + ], + "metadata": { + "id": "ZVjNQZqSiXnW" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vo3N5GfIgrdF", + "outputId": "895537ea-0ad9-4545-fda3-d984fa68756f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Downloading from https://www.kaggle.com/api/v1/datasets/download/netflix-inc/netflix-prize-data?dataset_version_number=2...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 683M/683M [00:06<00:00, 113MB/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Extracting files...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Path to dataset files: /root/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2\n" + ] + } + ], + "source": [ + "# Import KaggleHub to download Netflix dataset from Kaggle\n", + "import kagglehub\n", + "\n", + "# Download the Netflix dataset. `path` is the local directory holding the\n", + "# extracted files; the loading cells below build their file names from it.\n", + "path = kagglehub.dataset_download(\"netflix-inc/netflix-prize-data\")\n", + "\n", + "print(\"Path to dataset files:\", path)\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Install Necessary Libraries" + ], + "metadata": { + "id": "k2tLIC4Jih5f" + } + }, + { + "cell_type": "code", + "source": [ + "# Install scikit-surprise, the package that provides the `surprise` module\n", + "# imported below. `%pip` installs into the environment of the running kernel,\n", + "# and the version is pinned to the release this notebook was last run with.\n", + "%pip install scikit-surprise==1.1.4\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vQi8CvOriiMt", + "outputId": "ec6d8b96-e6a1-4d6a-df10-6ea525958e89" + }, + "execution_count": 2, +
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting surprise\n", + " Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)\n", + "Collecting scikit-surprise (from surprise)\n", + " Downloading scikit_surprise-1.1.4.tar.gz (154 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m154.4/154.4 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise->surprise) (1.4.2)\n", + "Requirement already satisfied: numpy>=1.19.5 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise->surprise) (1.26.4)\n", + "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise->surprise) (1.13.1)\n", + "Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)\n", + "Building wheels for collected packages: scikit-surprise\n", + " Building wheel for scikit-surprise (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357269 sha256=d607d3f9006384ce44e05f113b4ccff8aaabfb05fb4da76ac25878ac1e6ecafc\n", + " Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54\n", + "Successfully built scikit-surprise\n", + "Installing collected packages: scikit-surprise, surprise\n", + "Successfully installed scikit-surprise-1.1.4 surprise-0.1\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Import Libraries and Set Up" + ], + "metadata": { + "id": "Hoyetrz6iibB" + } + }, + { + "cell_type": "code", + "source": [ + "# Import necessary libraries for analysis and modeling\n", + "import pandas as pd\n", + "import numpy as np\n", + "import math\n", + "import re\n", + "from scipy.sparse import csr_matrix\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from surprise import Reader, Dataset, SVD\n", + "from surprise.model_selection import cross_validate\n", + "\n", + "# Set seaborn style for better visualization\n", + "sns.set_style(\"darkgrid\")\n" + ], + "metadata": { + "id": "ShetVtMTiioN" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Load and Explore Datasets" + ], + "metadata": { + "id": "YSVGmgG9ii1Y" + } + }, + { + "cell_type": "code", + "source": [ + "# Load the four Netflix rating files and explore their structure\n", + "def load_ratings(file_name):\n", + "    \"\"\"Read one rating file from the download directory into a DataFrame.\n", + "\n", + "    Only the first two comma-separated fields are kept (Cust_Id, Rating).\n", + "    Lines such as '1:' have no second field, so their Rating is NaN.\n", + "    \"\"\"\n", + "    ratings = pd.read_csv(f'{path}/{file_name}', header=None, names=['Cust_Id', 'Rating'], usecols=[0, 1])\n", + "    # Ratings are kept as float so NaN can mark the rows without a rating\n", + "    ratings['Rating'] = ratings['Rating'].astype(float)\n", + "    return ratings\n", + "\n", + "df1 = load_ratings('combined_data_1.txt')\n", + "\n", + "# Display dataset shape and sample rows\n", + "print('Dataset 1 shape: {}'.format(df1.shape))\n", + "print('-Dataset examples-')\n", + "print(df1.iloc[::5000000, :])\n", + "\n",
+ "# Repeat for datasets 2, 3, and 4\n", + "df2 = load_ratings('combined_data_2.txt')\n", + "df3 = load_ratings('combined_data_3.txt')\n", + "df4 = load_ratings('combined_data_4.txt')\n", + "\n", + "# Print shapes of datasets for verification\n", + "print('Dataset 2 shape: {}'.format(df2.shape))\n", + "print('Dataset 3 shape: {}'.format(df3.shape))\n", + "print('Dataset 4 shape: {}'.format(df4.shape))\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1pJFzOZDijF_", + "outputId": "63200ab7-c16b-4780-b8a7-b6281f1af9be" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Dataset 1 shape: (24058263, 2)\n", + "-Dataset examples-\n", + " Cust_Id Rating\n", + "0 1: NaN\n", + "5000000 2560324 4.0\n", + "10000000 2271935 2.0\n", + "15000000 1921803 2.0\n", + "20000000 1933327 3.0\n", + "Dataset 2 shape: (26982302, 2)\n", + "Dataset 3 shape: (22605786, 2)\n", + "Dataset 4 shape: (26851926, 2)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Combine Datasets\n" + ], + "metadata": { + "id": "9W_XL7JFijbV" + } + }, + { + "cell_type": "code", + "source": [ + "# Combine all the datasets into one dataframe. ignore_index=True already gives\n", + "# the result a fresh 0..n-1 index, so no separate index reset is needed.\n", + "df = pd.concat([df1, df2, df3, df4], ignore_index=True)\n", + "\n", + "# Print combined dataset shape and examples\n", + "print('Full dataset shape: {}'.format(df.shape))\n", + "print('-Dataset examples-')\n", + "print(df.iloc[::5000000, :])\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "N6CLyelkijrA", + "outputId": "69deaac1-1c21-432f-f280-a73131210cbb" + }, + "execution_count": 5, +
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Full dataset shape: (100498277, 2)\n", + "-Dataset examples-\n", + " Cust_Id Rating\n", + "0 1: NaN\n", + "5000000 2560324 4.0\n", + "10000000 2271935 2.0\n", + "15000000 1921803 2.0\n", + "20000000 1933327 3.0\n", + "25000000 1465002 3.0\n", + "30000000 961023 4.0\n", + "35000000 1372532 5.0\n", + "40000000 854274 5.0\n", + "45000000 116334 3.0\n", + "50000000 768483 3.0\n", + "55000000 1331144 5.0\n", + "60000000 1609324 2.0\n", + "65000000 1699240 3.0\n", + "70000000 1776418 4.0\n", + "75000000 1643826 5.0\n", + "80000000 932047 4.0\n", + "85000000 2292868 4.0\n", + "90000000 932191 4.0\n", + "95000000 1815101 3.0\n", + "100000000 872339 4.0\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Exploratory Data Analysis (EDA) - Basic Overview\n", + "**Deductions:**\n", + "\n", + "1. Rating Distribution: The bar chart reveals that most ratings given by users are skewed toward higher values, indicating that users tend to give favorable ratings to movies.\n", + "2. 
User Behavior: Customers tend to be less critical and more positive in rating movies, with fewer low ratings being given.\n", + "\n" + ], + "metadata": { + "id": "7nezGI_Aij8J" + } + }, + { + "cell_type": "code", + "source": [ + "# EDA: count how many times each rating value occurs\n", + "rating_counts = df.groupby('Rating')['Rating'].agg(['count'])\n", + "\n", + "# Movie header rows are the rows without a rating, so the number of NaNs in the\n", + "# Rating column is the movie count (selected by column label, not by position)\n", + "movie_count = df['Rating'].isnull().sum()\n", + "\n", + "# Header rows also carry a value in Cust_Id, so subtract them from both totals\n", + "cust_count = df['Cust_Id'].nunique() - movie_count\n", + "rating_count = df['Cust_Id'].count() - movie_count\n", + "\n", + "# Visualize the distribution of ratings on one explicitly created figure.\n", + "# Calling plt.figure() and then DataFrame.plot() without `ax` opens a second\n", + "# figure and leaves the first one empty in the cell output.\n", + "fig, ax = plt.subplots(figsize=(15, 10))\n", + "rating_counts.plot(kind='barh', legend=False, color='skyblue', ax=ax)\n", + "ax.set_title('Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(movie_count, cust_count, rating_count), fontsize=20)\n", + "ax.set_xlabel('Count of Ratings')\n", + "ax.set_ylabel('Rating Value')\n", + "plt.show()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 952 + }, + "id": "0HgeRda5ikMV", + "outputId": "c63abd3a-f527-4e6a-a2a8-ba304a50337e" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Handle Missing Data" + ], + "metadata": { + "id": "6ezIjBwHikrz" + } + }, + { + "cell_type": "code", + "source": [ + "# Rows without a rating are the movie header lines of the raw files (e.g. '1:')\n", + "is_movie_header = df['Rating'].isnull()\n", + "\n", + "# This cell needs the freshly combined frame, whose first row is a header row.\n", + "# If the assertion fails the cell has already been run: re-run the combine cell first.\n", + "assert is_movie_header.iloc[0], 'df must start with a movie header row'\n", + "\n", + "# Each rating belongs to the closest header above it, and headers are numbered\n", + "# 1, 2, 3, ... in order of appearance (same numbering as the previous loop), so\n", + "# the running count of header rows is the movie id. One cumulative sum replaces\n", + "# the per-movie np.append calls, each of which copied the whole array built so far.\n", + "movie_ids = is_movie_header.cumsum()\n", + "\n", + "# Build the cleaned frame column by column from the rating rows only; nothing is\n", + "# assigned into a filtered slice, so pandas has no reason to warn about a copy\n", + "is_rating = ~is_movie_header\n", + "df = pd.DataFrame({\n", + "    'Cust_Id': df.loc[is_rating, 'Cust_Id'].astype(int),\n", + "    'Rating': df.loc[is_rating, 'Rating'],\n", + "    'Movie_Id': movie_ids[is_rating],\n", + "})\n", + "\n", + "# Display dataset with movie IDs added\n", + "print('-Dataset examples-')\n", + "print(df.iloc[::5000000, :])\n" + ], + "metadata": { + "id": "DLRVDpxgilD8" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Advanced EDA - Movie and Customer Statistics" + ], + "metadata": { + "id": "oF4ASo8BilcP" + } + }, + { + "cell_type": "code", + "source": [ + "# Generate summary statistics for each movie and customer\n", + "df_movie_summary = df.groupby('Movie_Id')['Rating'].agg(['count', 'mean'])\n", + "df_movie_summary.index = df_movie_summary.index.map(int)\n", + "movie_benchmark = round(df_movie_summary['count'].quantile(0.7), 0)\n", + "drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index\n", + "\n", + "print('Movie minimum times of review: {}'.format(movie_benchmark))\n", +
"\n", + "df_cust_summary = df.groupby('Cust_Id')['Rating'].agg(['count', 'mean'])\n", + "df_cust_summary.index = df_cust_summary.index.map(int)\n", + "cust_benchmark = round(df_cust_summary['count'].quantile(0.7), 0)\n", + "drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index\n", + "\n", + "print('Customer minimum times of review: {}'.format(cust_benchmark))\n" + ], + "metadata": { + "id": "ziNr8lE4ilpP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Visualizing Movie Ratings Count\n", + "**Deductions:**\n", + "\n", + "1. The histogram shows a right-skewed distribution where most movies receive fewer reviews, while a small number of popular movies attract many reviews. This highlights the significant disparity in movie popularity, a common pattern in recommendation systems.\n" + ], + "metadata": { + "id": "cz5paPBOil4b" + } + }, + { + "cell_type": "code", + "source": [ + "# Visualizing the distribution of movie rating counts\n", + "plt.figure(figsize=(15, 8))\n", + "sns.histplot(df_movie_summary['count'], bins=50, kde=False, color='blue')\n", + "plt.title('Distribution of Movie Rating Counts', fontsize=20)\n", + "plt.xlabel('Number of Ratings')\n", + "plt.ylabel('Frequency')\n", + "plt.show()\n" + ], + "metadata": { + "id": "L_OMPfxkimiC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Visualizing Customer Rating Counts\n", + "**Deductions:**\n", + "\n", + "\n", + "1. Most users rate a limited number of movies, with only a few users rating a large number of movies. This implies that the majority of customers are casual raters, while a small fraction of power users rate more frequently."
+ ], + "metadata": { + "id": "aJsXWmiqinEI" + } + }, + { + "cell_type": "code", + "source": [ + "# Visualizing the distribution of customer rating counts\n", + "plt.figure(figsize=(15, 8))\n", + "sns.histplot(df_cust_summary['count'], bins=50, kde=False, color='green')\n", + "plt.title('Distribution of Customer Rating Counts', fontsize=20)\n", + "plt.xlabel('Number of Ratings')\n", + "plt.ylabel('Frequency')\n", + "plt.show()\n" + ], + "metadata": { + "id": "m_iNgMNVk0Mo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Trimming Dataset Based on Benchmarks" + ], + "metadata": { + "id": "iZwX7h7ClCbv" + } + }, + { + "cell_type": "code", + "source": [ + "# Trim the dataset by removing less frequently reviewed movies and customers\n", + "print('Original Shape: {}'.format(df.shape))\n", + "df = df[~df['Movie_Id'].isin(drop_movie_list)]\n", + "df = df[~df['Cust_Id'].isin(drop_cust_list)]\n", + "print('After Trim Shape: {}'.format(df.shape))\n" + ], + "metadata": { + "id": "qF-jcVXPlCp0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Create Pivot Table" + ], + "metadata": { + "id": "ZBMxZidWlC4b" + } + }, + { + "cell_type": "code", + "source": [ + "# Create a pivot table for movies and customer ratings\n", + "df_p = pd.pivot_table(df, values='Rating', index='Cust_Id', columns='Movie_Id')\n", + "\n", + "# Display pivot table shape\n", + "print(df_p.shape)\n" + ], + "metadata": { + "id": "hPSuWXkVlDGK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Load Movie Titles" + ], + "metadata": { + "id": "nTmRZY-BleZ-" + } + }, + { + "cell_type": "code", + "source": [ + "# Load movie titles from the same download directory as the rating files.\n", + "# Each line is split on its first two commas only (Movie_Id, Year, Name), so a\n", + "# title that itself contains commas stays in one piece.\n", + "with open(f'{path}/movie_titles.csv', encoding=\"ISO-8859-1\") as titles_file:\n", + "    title_rows = [line.rstrip('\\n').split(',', 2) for line in titles_file if line.strip()]\n", + "\n", + "df_title = pd.DataFrame(title_rows, columns=['Movie_Id', 'Year', 'Name'])\n", + "df_title['Movie_Id'] = df_title['Movie_Id'].astype(int)\n", + "# Coerce Year to a number; entries that are not numeric become NaN\n", + "df_title['Year'] = pd.to_numeric(df_title['Year'], errors='coerce')\n", + "df_title = df_title.set_index('Movie_Id')\n", + "\n", + "# Display first 10 rows of movie titles\n", + "print(df_title.head(10))\n" + ], + "metadata": { + "id": "4fNnCXK4lev8" + }, + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "markdown", + "source": [ + "# Collaborative Filtering - Matrix Factorization Using SVD\n" + ], + "metadata": { + "id": "6gUWKnx9lgNh" + } + }, + { + "cell_type": "code", + "source": [ + "# Using collaborative filtering to build a recommendation system\n", + "reader = Reader()\n", + "\n", + "# Load dataset from DataFrame\n", + "data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']], reader)\n", + "\n", + "# Use Singular Value Decomposition (SVD) model\n", + "svd = SVD()\n", + "\n", + "# Perform cross-validation on the dataset\n", + "cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)\n" + ], + "metadata": { + "id": "wFZoSX9Wlggw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Movie Recommendation System" + ], + "metadata": { + "id": "MpEDpZ2ql2Wy" + } + }, + { + "cell_type": "code", + "source": [ + "# Build movie recommendation system based on user input and correlation\n", + "def recommend(movie_title, min_count, top_n=10):\n", + "    \"\"\"Print the movies whose ratings correlate best with `movie_title`.\n", + "\n", + "    movie_title: exact title as stored in df_title['Name'].\n", + "    min_count: only movies with more than this many ratings are listed.\n", + "    top_n: number of rows to print (default 10, the previously fixed value).\n", + "\n", + "    Prints a short message and returns when the title is not in df_title or\n", + "    the movie has no column in the pivot table df_p.\n", + "    \"\"\"\n", + "    matching_ids = df_title.index[df_title['Name'] == movie_title]\n", + "    if len(matching_ids) == 0:\n", + "        print('No movie titled \"{}\" was found in df_title.'.format(movie_title))\n", + "        return\n", + "    movie_id = matching_ids[0]\n", + "\n", + "    if movie_id not in df_p.columns:\n", + "        print('\"{}\" has no column in the pivot table, so no correlations can be computed.'.format(movie_title))\n", + "        return\n", + "\n", + "    # Pearson correlation of every movie column with the target movie's column\n", + "    corr_target = df_p.corrwith(df_p[movie_id]).to_frame('PearsonR').dropna()\n", + "\n", + "    # The target correlates perfectly with itself; it is not a recommendation\n", + "    corr_target = corr_target.drop(index=movie_id, errors='ignore')\n", + "\n", + "    # Add rating counts (for the min_count filter) and titles (for readability)\n", + "    corr_summary = corr_target.join(df_movie_summary['count']).join(df_title['Name'])\n", + "    recommendations = corr_summary[corr_summary['count'] > min_count].sort_values('PearsonR', ascending=False).head(top_n)\n", + "\n", + "    print(\"Top {} recommended movies are:\".format(top_n))\n", + "    print(recommendations)\n", + "\n", + "# Call the recommendation function for a sample movie\n", + "recommend(\"What the #$*! Do We Know!?\", 100)\n"
+ ], + "metadata": { + "id": "iFAeOSbel3DQ" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file