From d75527f7eecc4e2fcdd18ab157412506717c8adb Mon Sep 17 00:00:00 2001 From: navanchauhan Date: Mon, 7 Nov 2022 23:36:11 -0700 Subject: add blog post --- .../2022-05-21-Similar-Movies-Recommender.html | 36 ++++++++++++++-------- 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'docs/posts/2022-05-21-Similar-Movies-Recommender.html') diff --git a/docs/posts/2022-05-21-Similar-Movies-Recommender.html b/docs/posts/2022-05-21-Similar-Movies-Recommender.html index 5d2d6fe..f45b45e 100644 --- a/docs/posts/2022-05-21-Similar-Movies-Recommender.html +++ b/docs/posts/2022-05-21-Similar-Movies-Recommender.html @@ -63,7 +63,8 @@

First, I needed to check the total number of records in Trakt’s database.

-
import requests
+
+
import requests
 import os
 
 trakt_id = os.getenv("TRAKT_ID")
@@ -87,14 +88,16 @@
 res = requests.get(f"{api_base}/search/movie",headers=headers,params=params)
 total_items = res.headers["x-pagination-item-count"]
 print(f"There are {total_items} movies")
-
+
+
There are 333946 movies
 

Next, I needed to declare the database schema (in database.py):

-
import sqlalchemy
+
+
import sqlalchemy
 from sqlalchemy import create_engine
 from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey, PickleType
 from sqlalchemy import insert
@@ -129,13 +132,15 @@
     meta.create_all(engine)
     Session = sessionmaker(bind=engine)
     return engine, Session
-
+
+

In the end, I could have dropped the embeddings field from the table schema as I never got around to using it.

Scripting Time

-
from database import *
+
+
from database import *
 from tqdm import tqdm
 import requests
 import os
@@ -228,7 +233,8 @@
                 except IntegrityError:
                     trans.rollback()
     req_count += 1
-
+
+

(Note: I was well within the rate limit, so I did not have to slow down or implement any other measures.)

@@ -263,7 +269,8 @@ As of writing this post, I did not include any other database except Trakt.

  • Installing the Python module (pinecone-client)

  • -
    import pandas as pd
    +
    +
    import pandas as pd
     import pinecone
     from sentence_transformers import SentenceTransformer
     from tqdm import tqdm 
    @@ -293,7 +300,8 @@ As of writing this post, I did not include any other database except Trakt. 

    str(value), embeddings[idx].tolist() )) index.upsert(to_send) -
    +
    +

    That's it!

    @@ -304,7 +312,8 @@ As of writing this post, I did not include any other database except Trakt.

    To find similar items, we will first have to map the name of the movie to its trakt_id, get the embeddings we have for that id and then perform a similarity search. It is possible that this additional step of mapping could be avoided by storing information as metadata in the index.

    -
    def get_trakt_id(df, title: str):
    +
    +
    def get_trakt_id(df, title: str):
       rec = df[df["title"].str.lower()==movie_name.lower()]
       if len(rec.trakt_id.values.tolist()) > 1:
         print(f"multiple values found... {len(rec.trakt_id.values)}")
    @@ -344,11 +353,13 @@ It is possible that this additional step of mapping could be avoided by storing
           "runtime": df.runtime.values[0],
           "year": df.year.values[0]
       }
    -
    +
    +

    Testing it Out

    -
    movie_name = "Now You See Me"
    +
    +
    movie_name = "Now You See Me"
     
     movie_trakt_id = get_trakt_id(df, movie_name)
     print(movie_trakt_id)
    @@ -360,7 +371,8 @@ It is possible that this additional step of mapping could be avoided by storing
     for trakt_id in movie_ids:
       deets = get_deets_by_trakt_id(df, trakt_id)
       print(f"{deets['title']} ({deets['year']}): {deets['overview']}")
    -
    +
    +

    Output:

    -- cgit v1.2.3