EDA / Statistical Analysis of Engagement for CUBRT on Instagram

Prerequisites

Instaloader was used to download all posts and their metadata from Instagram. Follower counts at 7-day intervals were reconstructed manually in Excel from the current follower count and the growth percentages Instagram reports, using the formula:

past_followers = Current Followers / (1 + (Percentage Growth / 100 ))

Date of Analysis: Thursday, 15th June 2023. Current Followers: 1664.

The `curacingteam` folder contains the data downloaded with Instaloader.

!ls
EDA.html         curacingteam     filename.csv     temp.json
EDA.ipynb        experiments      post_analysis.md
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import glob
import lzma
from sklearn.linear_model import LinearRegression
from datetime import timedelta, datetime

Conversion to Pandas DF

Instaloader downloads each post's metadata as a compressed JSON file. Here, all of the files are loaded and parsed, and a selection of features is extracted from each.

global_df = []  # accumulates one single-row DataFrame per parsed post

_MISSING = object()  # sentinel: distinguishes "key absent" from a legitimate falsy value


def _dig(obj, path, default):
    """Follow a key/index path through nested dicts/lists.

    Returns `default` as soon as any step of `path` is missing (absent key,
    out-of-range index, or a non-subscriptable value), mirroring the
    per-field try/except KeyError/IndexError fallbacks used originally.
    """
    for key in path:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            return default
    return obj


def to_xlsx(file):
    """Parse one Instaloader post-metadata JSON file into a flat dict.

    NOTE(review): despite the legacy name, this writes no .xlsx — it only
    extracts selected fields from the JSON. Missing text-like fields
    default to the string "null"; missing numeric-like fields default to 0,
    matching what the downstream DataFrame construction expects.
    """
    with open(file) as f:
        data = json.load(f)

    info = {
        "post_type_name": _dig(data, ("node", "__typename"), "null"),
        "post_text": _dig(
            data, ("node", "edge_media_to_caption", "edges", 0, "node", "text"), "null"
        ),
        "account_name": _dig(data, ("node", "owner", "id"), "null"),
        "like_count": _dig(data, ("node", "edge_media_preview_like", "count"), 0),
        "comment_count": _dig(data, ("node", "edge_media_to_comment", "count"), 0),
        "video_view_count": _dig(data, ("node", "video_view_count"), 0),
        "comments_disabled": _dig(data, ("node", "comments_disabled"), "null"),
        # AFEGITS  ("additions")
        "full_name": _dig(data, ("node", "owner", "full_name"), "null"),
        "is_professional_account": _dig(
            data, ("node", "owner", "is_professional_account"), "null"
        ),
        "is_business_account": _dig(data, ("node", "owner", "is_business_account"), "null"),
        "is_verified": _dig(data, ("node", "owner", "is_verified"), "null"),
        "is_video": _dig(data, ("node", "is_video"), "null"),
        "category_name": _dig(data, ("node", "owner", "category_name"), "null"),
    }

    # Shortcode becomes a full post URL; timestamp becomes a datetime.
    # Both fall back to 0 when absent, as the original code did.
    sc = _dig(data, ("node", "shortcode"), _MISSING)
    info["shortcode"] = 0 if sc is _MISSING else "https://www.instagram.com/p/" + sc

    ts = _dig(data, ("node", "taken_at_timestamp"), _MISSING)
    info["timestamp"] = 0 if ts is _MISSING else datetime.fromtimestamp(ts)

    return info  # dict with all selected post features
# Locate every compressed Instaloader metadata dump for the account.
json_files = glob.glob("./curacingteam/*.json.xz")  # path to the JSON files
ammount = len(json_files)  # NOTE(review): name kept for compatibility (sp. "amount")
print(f"Found {ammount} posts")
Found 208 posts
for json_file in json_files:
    # Round-trip each .xz archive through a scratch file so to_xlsx can read
    # plain JSON. Context managers guarantee both handles are closed — the
    # original left the lzma handle open (resource leak).
    with lzma.open(json_file, mode='rt', encoding='utf-8') as src, \
            open("./temp.json", "w") as dst:
        dst.write(src.read())

    info = to_xlsx('./temp.json')  # This is the main function

    # One single-row frame per post; index=[1] is arbitrary, so the indices
    # repeat after the concat below (visible in the notebook output).
    df = pd.DataFrame({
        "post_date": info["timestamp"],
        "account_id": info["account_name"],
        "full_name": info["full_name"],
        "text": info["post_text"],
        "post_shortcode":  info["shortcode"],
        "like_count": info["like_count"],
        "comment_count": info["comment_count"],
        "is_video": info["is_video"],
        "video_view_count": info["video_view_count"],
        "comments_policy": info["comments_disabled"],
        "is_professional": info["is_professional_account"],
        "is_business": info["is_business_account"],
        "is_verified": info["is_verified"],
        "person_category": info["category_name"],
        "post_type": info["post_type_name"]
    }, index=[1])

    global_df.append(df)

# Stack all per-post rows into one DataFrame.
final = pd.concat(global_df)
#final.to_csv("filename.csv", index=False) # Your Filename
print("Converted...")
Converted...
final
post_date account_id full_name text post_shortcode like_count comment_count is_video video_view_count comments_policy is_professional is_business is_verified person_category post_type
1 2022-11-02 21:34:27 4312829205 CU Boulder Racing Team Attending Atlanta Motorsports Park for the @ch... https://www.instagram.com/p/CkdvA6oLIAa 147 5 False 0 False True True False Amateur Sports Team GraphSidecar
1 2022-01-26 08:33:50 4312829205 CU Boulder Racing Team Applications are now open for both the Enduran... https://www.instagram.com/p/CZLXGjaLQ4p 79 1 False 0 False True True False Amateur Sports Team GraphImage
1 2021-08-13 04:20:42 4312829205 CU Boulder Racing Team “Scary as s**t but very very fun” - Ryan after... https://www.instagram.com/p/CSfeLQVrQzJ 65 0 False 0 False True True False Amateur Sports Team GraphImage
1 2023-05-21 19:39:40 4312829205 CU Boulder Racing Team The 2023 Formula SAE Competition in Michigan i... https://www.instagram.com/p/Csgg3OWLRIt 902 16 False 0 False True True False Amateur Sports Team GraphSidecar
1 2022-05-10 01:19:24 4312829205 CU Boulder Racing Team Hey folks, we're still running our crowdfundin... https://www.instagram.com/p/CdWYDkcOA0i 64 0 False 0 False True True False Amateur Sports Team GraphImage
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1 2018-04-24 01:06:23 4312829205 CU Boulder Racing Team The anticipation before a race 💨\n•\n•\n•\n#cu... https://www.instagram.com/p/Bh7Md85gSoa 45 2 False 0 False True True False Amateur Sports Team GraphImage
1 2020-12-25 10:12:40 4312829205 CU Boulder Racing Team Merry Christmas and Happy Holidays to all! We ... https://www.instagram.com/p/CJNS09Frl_U -1 3 False 0 False True True False Amateur Sports Team GraphImage
1 2023-01-24 21:06:18 4312829205 CU Boulder Racing Team As we begin to focus our time on the manufactu... https://www.instagram.com/p/CnzZxWuLJDL 87 0 False 0 False True True False Amateur Sports Team GraphImage
1 2023-02-27 21:28:41 4312829205 CU Boulder Racing Team Teamwork is a vital part of any group and our ... https://www.instagram.com/p/CpK_W_prd5j 96 2 False 0 False True True False Amateur Sports Team GraphImage
1 2018-10-17 00:48:46 4312829205 CU Boulder Racing Team The close up! Also, the funding campaign is st... https://www.instagram.com/p/BpAWW0xgp_v 29 1 False 0 False True True False Amateur Sports Team GraphImage

208 rows × 15 columns

Data Interpolation

Since we only have follower counts at 7-day intervals for the past 90 days, we need to interpolate the data. A quick linear regression model is adequate in this case.

# Observed follower counts: day offset (0 = analysis date) -> followers.
# The 2636-day entry anchors the far past, near the account's creation.
dic = {0: 1664, 7: 1661, 14: 1657, 21: 1639, 28: 1589, 30: 1568, 37: 1538,
       44: 1532, 51: 1534, 58: 1529, 65: 1522, 72: 1515, 79: 1511, 90: 1482,
       2636: 10}

# Split the mapping into a feature column (days ago) and a target vector.
day_offsets, follower_counts = zip(*dic.items())
keys = np.array(day_offsets).reshape(-1, 1)
values = np.array(follower_counts)

# Fit a straight line through the measurements.
model = LinearRegression()
model.fit(keys, values)

# Sanity check: predict the follower count for "today" (day 0).
new_day = 0
predicted_followers = model.predict([[new_day]])

print("Predicted number of followers for day", new_day, ":", predicted_followers[0])
Predicted number of followers for day 0 : 1593.2997615182705
df = final

# Exclude rows whose 'post_date' kept the int fallback (0) from parsing.
# .copy() detaches the filtered frame from `final`, so the assignments
# below don't trigger pandas' SettingWithCopyWarning / ambiguous
# chained-assignment writes.
df = df[~df['post_date'].apply(lambda x: isinstance(x, int))].copy()

# Then convert to datetime
df['post_date'] = pd.to_datetime(df['post_date'])

# Let's assume 'today' is the latest date in the dataframe
#today = df['post_date'].max()
# Fixed analysis timestamp (15 June 2023), matching the notebook header.
today = datetime.strptime('15/06/23 15:25:00', '%d/%m/%y %H:%M:%S')

def magic_function(row):
    """Estimate the follower count at the moment `row` was posted.

    Relies on the module-level `model` (linear fit of followers vs. days
    before the analysis date) and `today` (the fixed analysis timestamp).
    """
    age_in_days = (today - row['post_date']).days
    return model.predict([[age_in_days]])[0]

"""# Convert the date to the number of days before today
df['days_before_today'] = (today - df['post_date']).dt.days

# Convert the dictionary to a DataFrame for interpolation
dic_df = pd.DataFrame(list(dic.items()), columns=['days_before_today', 'followers'])

# Sort the DataFrame based on 'days_before_today'
dic_df = dic_df.sort_values('days_before_today')

# Interpolate the followers for the days not present in the dictionary
dic_df = dic_df.set_index('days_before_today').reindex(range(dic_df.days_before_today.min(), dic_df.days_before_today.max() + 1)).interpolate().reset_index()

# Merge the two DataFrames on 'days_before_today'
df = pd.merge(df, dic_df, on='days_before_today', how='left')

df['followers'] = df['followers'].replace(np.nan, 1000)"""

df['followers'] = df.apply(magic_function, axis=1) #df.apply(lambda row : model.predict([[(today - row['post_date']).dt.days]])[0])

# Rename the column to 'followers_at_time'
df = df.rename(columns={'followers': 'followers_at_time'})

Engagement

Engagement is calculated as the sum of likes, video views, and comments, divided by the follower count at the time of posting, with the whole multiplied by 100.

# Engagement % = total interactions / followers at posting time * 100.
interactions = df['like_count'] + df['comment_count'] + df['video_view_count']
df['engagement_percentage'] = interactions / df['followers_at_time'] * 100
processed_df = df

Analysis

By Post Type

# Engagement over time by post type, next to a count of each post type.
fig, (ax_scatter, ax_hist) = plt.subplots(
    1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3])
)
sns.scatterplot(data=processed_df, x="post_date", y="engagement_percentage",
                hue="post_type", ax=ax_scatter)
sns.histplot(data=processed_df, x="post_type", hue="post_type",
             shrink=.8, alpha=.8, legend=False, ax=ax_hist)
fig.tight_layout()

Even though videos are our best-performing post type, we don't post enough of them.

# Raw like counts over time by post type, alongside post-type frequencies.
fig, (ax_scatter, ax_hist) = plt.subplots(
    1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3])
)
sns.scatterplot(data=processed_df, x="post_date", y="like_count",
                hue="post_type", ax=ax_scatter)
sns.histplot(data=processed_df, x="post_type", hue="post_type",
             shrink=.8, alpha=.8, legend=False, ax=ax_hist)
fig.tight_layout()

There hasn't been a huge increase in the number of likes we get per post. Sure, it is steadily increasing, but we have had this account since 2017. Although, our frequency has definitely gotten better since 2022!

There also is a big jump that happened because of the 2023 FSAE Competition. 2022 did not show such a big jump.

# Comment counts over time by post type, alongside post-type frequencies.
fig, (ax_scatter, ax_hist) = plt.subplots(
    1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3])
)
sns.scatterplot(data=processed_df, x="post_date", y="comment_count",
                hue="post_type", ax=ax_scatter)
sns.histplot(data=processed_df, x="post_type", hue="post_type",
             shrink=.8, alpha=.8, legend=False, ax=ax_hist)
fig.tight_layout()

We need to figure out how to get people to comment more, as this is factored in when engagement is calculated