From 3e483ecac6e504ebedd4bb5dc55c5572651adc3e Mon Sep 17 00:00:00 2001 From: navanchauhan Date: Thu, 15 Jun 2023 19:34:09 +0530 Subject: added EDA CUBRT --- Resources/EDA.html | 1456 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1456 insertions(+) create mode 100644 Resources/EDA.html (limited to 'Resources') diff --git a/Resources/EDA.html b/Resources/EDA.html new file mode 100644 index 0000000..7db9c7f --- /dev/null +++ b/Resources/EDA.html @@ -0,0 +1,1456 @@ + + + + + + + + + + + + + + + + CU Boulder Racing Team + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+
+
+ + + +
+ + + + +
+ + + + + + + + +

EDA / Statistical Analysis of Engagement for CUBRT on Instagram

Prerequisites

Instaloader was used to download all the posts along with their metadata from instagram, and followers count at intervals of 7 days were calculated manually using Excel and rate of growth from Instagram using the formula:

+
past_followers = Current Followers / (1 + (Percentage Growth / 100 ))
+
+

Date of Analysis: Thursday, 15th June 2023 +Current Followers: 1664

+ +

curacing team folder contains the data downloaded from Instaloader

+ + + + + + + +
+
!ls
+
+ +
+ +
EDA.html         curacingteam     filename.csv     temp.json
+EDA.ipynb        experiments      post_analysis.md
+
+ + + + + + + + +
+
import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import json
+import glob
+import lzma
+from sklearn.linear_model import LinearRegression
+from datetime import timedelta, datetime
+
+ +
+

Conversion to Pandas DF

Instaloader downloads the metadata as a JSON file and then compresses it. Here, some of the features are selected and all of the files are loaded and parsed

+ + + + + + + +
+
global_df = []
+
+def to_xlsx(file):
+    info = {}
+
+    with open(file) as f:
+        data = json.load(f)
+        try:
+            info["post_type_name"] = data["node"]["__typename"]
+        except KeyError:
+            info["post_type_name"] = "null"
+        try:
+            info["post_text"] = data["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"]
+        except (IndexError, KeyError):
+            info["post_text"] = "null"
+        try:
+            info["account_name"] = data["node"]["owner"]["id"]
+        except KeyError:
+            info["account_name"] = "null"
+        try:
+            info["shortcode"] = "https://www.instagram.com/p/"+data["node"]["shortcode"]
+        except KeyError:
+            info["shortcode"] = 0
+        try:
+            dt = data["node"]["taken_at_timestamp"]
+            info["timestamp"] = datetime.fromtimestamp(dt)
+        except KeyError:
+            info["timestamp"] = 0
+        try:
+            info["like_count"] = data["node"]["edge_media_preview_like"]["count"]
+        except KeyError:
+            info["like_count"] = 0
+        try:
+            info["comment_count"] = data["node"]["edge_media_to_comment"]["count"]
+        except KeyError:
+            info["comment_count"] = 0
+        try:
+            info["video_view_count"] = data["node"]["video_view_count"]
+        except (IndexError, KeyError):
+            info["video_view_count"] = 0
+        try:
+            info["comments_disabled"] = data["node"]["comments_disabled"]
+        except KeyError:
+            info["comments_disabled"] = "null"
+
+        # AFEGITS
+
+        try:
+            info["full_name"] = data["node"]["owner"]["full_name"]
+        except KeyError:
+            info["full_name"] = "null"
+        try:
+            info["is_professional_account"] = data["node"]["owner"]["is_professional_account"]
+        except KeyError:
+            info["is_professional_account"] = "null"
+        try:
+            info["is_business_account"] = data["node"]["owner"]["is_business_account"]
+        except KeyError:
+            info["is_business_account"] = "null"
+        try:
+            info["is_verified"] = data["node"]["owner"]["is_verified"]
+        except KeyError:
+            info["is_verified"] = "null"
+        try:
+            info["is_video"] = data["node"]["is_video"]
+        except KeyError:
+            info["is_video"] = "null"
+        try:
+            info["category_name"] = data["node"]["owner"]["category_name"]
+        except KeyError:
+            info["category_name"] = "null"
+
+        return info # returning the dict with all data
+
+ +
+ + + + + + +
+
json_files = glob.glob("./curacingteam/*.json.xz") # Path To JSON FILES
+ammount = len(json_files)
+print(f"Found {ammount} posts")
+
+ +
+ +
Found 208 posts
+
+ + + + + + + + +
+
for json_file in json_files:
+    with open("./temp.json","w") as f:
+        f.write(lzma.open(json_file,mode='rt',encoding='utf-8').read())
+    info = to_xlsx('./temp.json')  # This is the main function
+    
+    df = pd.DataFrame({
+        "post_date": info["timestamp"],
+        "account_id": info["account_name"],
+        "full_name": info["full_name"],
+        "text": info["post_text"],
+        "post_shortcode":  info["shortcode"],
+        "like_count": info["like_count"],
+        "comment_count": info["comment_count"],
+        "is_video": info["is_video"],
+        "video_view_count": info["video_view_count"],
+        "comments_policy": info["comments_disabled"],
+        "is_professional": info["is_professional_account"],
+        "is_business": info["is_business_account"],
+        "is_verified": info["is_verified"],
+        "person_category": info["category_name"],
+        "post_type": info["post_type_name"]
+    }, index=[1])
+
+    global_df.append(df)
+
+final = pd.concat(global_df)
+#final.to_csv("filename.csv", index=False) # Your Filename
+print("Converted...")
+
+ +
+ +
Converted...
+
+ + + + + + + + +
+
final
+
+ +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
post_dateaccount_idfull_nametextpost_shortcodelike_countcomment_countis_videovideo_view_countcomments_policyis_professionalis_businessis_verifiedperson_categorypost_type
12022-11-02 21:34:274312829205CU Boulder Racing TeamAttending Atlanta Motorsports Park for the @ch...https://www.instagram.com/p/CkdvA6oLIAa1475False0FalseTrueTrueFalseAmateur Sports TeamGraphSidecar
12022-01-26 08:33:504312829205CU Boulder Racing TeamApplications are now open for both the Enduran...https://www.instagram.com/p/CZLXGjaLQ4p791False0FalseTrueTrueFalseAmateur Sports TeamGraphImage
12021-08-13 04:20:424312829205CU Boulder Racing Team“Scary as s**t but very very fun” - Ryan after...https://www.instagram.com/p/CSfeLQVrQzJ650False0FalseTrueTrueFalseAmateur Sports TeamGraphImage
12023-05-21 19:39:404312829205CU Boulder Racing TeamThe 2023 Formula SAE Competition in Michigan i...https://www.instagram.com/p/Csgg3OWLRIt90216False0FalseTrueTrueFalseAmateur Sports TeamGraphSidecar
12022-05-10 01:19:244312829205CU Boulder Racing TeamHey folks, we're still running our crowdfundin...https://www.instagram.com/p/CdWYDkcOA0i640False0FalseTrueTrueFalseAmateur Sports TeamGraphImage
................................................
12018-04-24 01:06:234312829205CU Boulder Racing TeamThe anticipation before a race 💨\n•\n•\n•\n#cu...https://www.instagram.com/p/Bh7Md85gSoa452False0FalseTrueTrueFalseAmateur Sports TeamGraphImage
12020-12-25 10:12:404312829205CU Boulder Racing TeamMerry Christmas and Happy Holidays to all! We ...https://www.instagram.com/p/CJNS09Frl_U-13False0FalseTrueTrueFalseAmateur Sports TeamGraphImage
12023-01-24 21:06:184312829205CU Boulder Racing TeamAs we begin to focus our time on the manufactu...https://www.instagram.com/p/CnzZxWuLJDL870False0FalseTrueTrueFalseAmateur Sports TeamGraphImage
12023-02-27 21:28:414312829205CU Boulder Racing TeamTeamwork is a vital part of any group and our ...https://www.instagram.com/p/CpK_W_prd5j962False0FalseTrueTrueFalseAmateur Sports TeamGraphImage
12018-10-17 00:48:464312829205CU Boulder Racing TeamThe close up! Also, the funding campaign is st...https://www.instagram.com/p/BpAWW0xgp_v291False0FalseTrueTrueFalseAmateur Sports TeamGraphImage
+

208 rows × 15 columns

+
+ + + +

Data Interpolation

Since we only have the followers count for 7 day intervals for the past 90 days, we need to interpolate the data. A quick linear regression model is okay in this case

+ + + + + + + +
+
# Input dictionary
+dic = {0: 1664, 7: 1661, 14: 1657, 21: 1639, 28: 1589, 30: 1568, 37: 1538, 44: 1532, 51: 1534, 58: 1529, 65: 1522, 72: 1515, 79: 1511, 90: 1482, 2636: 10}
+
+# Extract keys and values from the dictionary
+keys = np.array(list(dic.keys())).reshape(-1, 1)
+values = np.array(list(dic.values()))
+
+# Create a LinearRegression model
+model = LinearRegression()
+
+# Fit the model to the data
+model.fit(keys, values)
+
+# Predict the number of followers for a new day
+new_day = 0
+predicted_followers = model.predict([[new_day]])
+
+print("Predicted number of followers for day", new_day, ":", predicted_followers[0])
+
+ +
+ +
Predicted number of followers for day 0 : 1593.2997615182705
+
+ + + + + + + + +
+
df = final
+
+# Exclude rows with 'post_date' of type int
+df = df[~df['post_date'].apply(lambda x: isinstance(x, int))]
+
+# Then convert to datetime
+df['post_date'] = pd.to_datetime(df['post_date'])
+
+# Let's assume 'today' is the latest date in the dataframe
+#today = df['post_date'].max()
+today = datetime.strptime('15/06/23 15:25:00', '%d/%m/%y %H:%M:%S')
+
+def magic_function(row):
+    
+    days = (today - row['post_date']).days
+    return model.predict([[days]])[0]
+
+"""# Convert the date to the number of days before today
+df['days_before_today'] = (today - df['post_date']).dt.days
+
+# Convert the dictionary to a DataFrame for interpolation
+dic_df = pd.DataFrame(list(dic.items()), columns=['days_before_today', 'followers'])
+
+# Sort the DataFrame based on 'days_before_today'
+dic_df = dic_df.sort_values('days_before_today')
+
+# Interpolate the followers for the days not present in the dictionary
+dic_df = dic_df.set_index('days_before_today').reindex(range(dic_df.days_before_today.min(), dic_df.days_before_today.max() + 1)).interpolate().reset_index()
+
+# Merge the two DataFrames on 'days_before_today'
+df = pd.merge(df, dic_df, on='days_before_today', how='left')
+
+df['followers'] = df['followers'].replace(np.nan, 1000)"""
+
+df['followers'] = df.apply(magic_function, axis=1) #df.apply(lambda row : model.predict([[(today - row['post_date']).dt.days]])[0])
+
+# Rename the column to 'followers_at_time'
+df = df.rename(columns={'followers': 'followers_at_time'})
+
+ +
+

Engagement

Engagement is calculated by taking the sum of the likes, video view counts and comments, divided by followers at that time, and then whole multiplied by 100

+ + + + + + + +
+
# Calculate the engagement_percentage
+df['engagement_percentage'] = (df['like_count'] + df['comment_count'] + df['video_view_count']) / df['followers_at_time'] * 100
+
+ +
+ + + + + + +
+
processed_df = df
+
+ +
+

Analysis

+

By Post Type

+ + + + + + +
+
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
+sns.scatterplot(data=processed_df, x="post_date", y="engagement_percentage", hue="post_type", ax=axs[0])
+sns.histplot(data=processed_df, x="post_type", hue="post_type", shrink=.8, alpha=.8, legend=False, ax=axs[1])
+f.tight_layout()
+
+ +
+ + + + + + + +

Even though videos are our most well performing post type, we don't post enough

+ + + + + + + +
+
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
+sns.scatterplot(data=processed_df, x="post_date", y="like_count", hue="post_type", ax=axs[0])
+sns.histplot(data=processed_df, x="post_type", hue="post_type", shrink=.8, alpha=.8, legend=False, ax=axs[1])
+f.tight_layout()
+
+ +
+ + + + + + + +

There hasn't been a huge increase in the number of likes we get per post. Sure, it is steadily increasing, but we have had this account since 2017. Although, our frequency has definitely gotten better since 2022!

+

There also is a big jump that happened because of the 2023 FSAE Competition. 2022 did not show such a big jump.

+ + + + + + + +
+
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
+sns.scatterplot(data=processed_df, x="post_date", y="comment_count", hue="post_type", ax=axs[0])
+sns.histplot(data=processed_df, x="post_type", hue="post_type", shrink=.8, alpha=.8, legend=False, ax=axs[1])
+f.tight_layout()
+
+ +
+ + + + + + + +

We need to figure out how to get people to comment more, as this is factored in when engagement is calculated

+ + + + + + + +
+
 
+
+ +
+ +
+ +
+
+
+ + + + + + + + + + + + -- cgit v1.2.3