!pip install -q whatstk
!pip install -q plotly
!pip install -q kaleido
# Path to the exported WhatsApp chat text file; replace with your own file name.
chat_path = '[filename].txt'
import pandas as pd
from whatstk import WhatsAppChat
# Parse the export and keep the frame it exposes as `.df` for the analysis below.
chat = WhatsAppChat.from_source(chat_path)
df = chat.df
df
# list of current names in the chat
# df['username'].unique()
# change names
# df['username'] = df['username'].replace({'fren 1' : 'Batman',
# 'fren 2' : 'Superman',
# 'fren 3' : 'Plastic Man',
# 'fren 4' : 'Martian Manhunter',
# 'fren 5' : 'The Silent Type',
# 'fren 6' : 'Media Guy'
# })
# change multiple names at once
# for that one person who always has a new number
# df['username'] = df['username'].replace(['num1', 'num2', 'num3', 'num4', 'num5'], 'five number fred')
# just to check if all the names have been changed correctly
# df['username'].unique()
# Build `all_data`: one row per (username, day of week) pair holding the
# number of messages, media placeholders and URL-bearing messages.

# Weekday name of every message, derived from its timestamp.
df['day_of_week'] = df['date'].dt.day_name()

# Subsets reused for the media / URL counts.
only_media = df[df['message'] == '<Media omitted>']
# regex=False: the needle is a literal substring, not a pattern.
# na=False: a missing message counts as "no URL" instead of raising when the
# result is used as a boolean mask.
only_urls = df[df['message'].str.contains('https://', regex=False, na=False)]

# unique() keeps first-appearance order, which fixes the row order below.
day_names = df['day_of_week'].unique()

per_day_rows = []
for name in df['username'].unique():
    # Filter by person once, then slice each of the three frames by day.
    by_name = df[df['username'] == name]
    media_by_name = only_media[only_media['username'] == name]
    urls_by_name = only_urls[only_urls['username'] == name]
    for day in day_names:
        messages = by_name.loc[by_name['day_of_week'] == day, 'message'].count()
        media = media_by_name.loc[media_by_name['day_of_week'] == day, 'message'].count()
        urls = urls_by_name.loc[urls_by_name['day_of_week'] == day, 'message'].count()
        per_day_rows.append([name, day, messages, media, urls])

# Create the frame in one go rather than growing it a row at a time; the
# count columns come out with an integer dtype instead of object.
all_data = pd.DataFrame(per_day_rows,
                        columns=['name', 'day_of_week', 'messages', 'media', 'URLs'])
all_data
Group Stats
Total
Creating a new dataframe to plot
# Collapse `all_data` into one row per statistic with its chat-wide total.
stat_names = ['messages', 'media', 'URLs']
group_count = {
    'stat': stat_names,
    'count': [all_data[stat].sum() for stat in stat_names],
}
group = pd.DataFrame(group_count)
group
import plotly.express as px
# The title is "<graph name> (<first day> to <last day>)", with both dates
# taken from the chat's timestamps and rendered as day, abbreviated month, year.
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
title = f'Chat Stats ({first_day} to {last_day})'
fig0 = px.bar(group, x='stat', y='count', text='count', title=title)
fig0.update_traces(marker_color='springgreen')
fig0.show()
import plotly.graph_objects as go
# Chat-wide totals drawn as a graph_objects bar chart, then exported as a PNG.
x = ['messages', 'media', 'URLs']
y = [all_data[stat].sum() for stat in x]
totals_bar = go.Bar(
    x=x,
    y=y,
    text=y,
    textposition='auto',
    marker_color='springgreen',
)
fig1 = go.Figure(data=[totals_bar])
# Title carries the date range covered by the chat.
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
title = f'Group Stats ({first_day} to {last_day})'
fig1.update_layout(title_text=title)
fig1.show()
# save as image
fig1.write_image('1_groupstats.png')
# Build `individual_data`: one row per username holding that person's total
# number of messages, media placeholders and URL-bearing messages.

# Subsets reused for the media / URL counts.
only_media = df[df['message'] == '<Media omitted>']
# regex=False: the needle is a literal substring, not a pattern.
# na=False: a missing message counts as "no URL" instead of raising when the
# result is used as a boolean mask.
only_urls = df[df['message'].str.contains('https://', regex=False, na=False)]

individual_rows = []
# unique() keeps first-appearance order, which fixes the row order.
for name in df['username'].unique():
    messages = df.loc[df['username'] == name, 'message'].count()
    media = only_media.loc[only_media['username'] == name, 'message'].count()
    urls = only_urls.loc[only_urls['username'] == name, 'message'].count()
    individual_rows.append([name, messages, media, urls])

# Create the frame in one go rather than growing it a row at a time; the
# count columns come out with an integer dtype instead of object.
individual_data = pd.DataFrame(individual_rows,
                               columns=['name', 'messages', 'media', 'URLs'])
individual_data
# Grouped bar chart: for every person, one bar each for messages, media and URLs.
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
title = f'Individual Stats ({first_day} to {last_day})'
people = individual_data['name']
fig2 = go.Figure(data=[
    go.Bar(
        name=stat,
        x=people,
        y=individual_data[stat],
        text=individual_data[stat],
        textposition='auto',
    )
    for stat in ('messages', 'media', 'URLs')
])
# Put the three bars of each person side by side.
fig2.update_layout(barmode='group', title_text=title)
fig2.show()
# save as image
fig2.write_image('2_individualstats.png')
Percentage by each contact
# Pie chart of the message counts in `all_data`, sliced by person.
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
title = f'Total Messages By Percentage ({first_day} to {last_day})'
fig3_a = px.pie(all_data, values='messages', names='name', title=title)
fig3_a.show()
Media
# Pie chart of the media counts in `all_data`, sliced by person.
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
title = f'Total Media By Percentage ({first_day} to {last_day})'
fig3_b = px.pie(all_data, values='media', names='name', title=title)
fig3_b.show()
URLs
# Pie chart of the URL counts in `all_data`, sliced by person.
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
title = f'Total URLs By Percentage ({first_day} to {last_day})'
fig3_c = px.pie(all_data, values='URLs', names='name', title=title)
fig3_c.show()
# Export the three pie charts as PNG files, in the order listed.
pie_exports = {
    '3_percentage_messages.png': fig3_a,
    '4_percentage_media.png': fig3_b,
    '5_percentage_urls.png': fig3_c,
}
for png_name, pie_fig in pie_exports.items():
    pie_fig.write_image(png_name)
Messages by day of the week
# Bar chart of message counts per weekday, coloured by person.
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
title = f'Messages By Day Of The Week ({first_day} to {last_day})'
fig5 = px.bar(all_data, x='day_of_week', y='messages', color='name', title=title)
fig5.show()
# save as image
fig5.write_image('6_messages.png')
Media by day of the week
# Bar chart of media counts per weekday, coloured by person.
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
title = f'Media By Day Of The Week ({first_day} to {last_day})'
fig6 = px.bar(all_data, x='day_of_week', y='media', color='name', title=title)
fig6.show()
# save as image
fig6.write_image('7_media.png')
URLs by day of the week
# Bar chart of URL counts per weekday, coloured by person.
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
title = f'URLs By Day Of The Week ({first_day} to {last_day})'
fig7 = px.bar(all_data, x='day_of_week', y='URLs', color='name', title=title)
fig7.show()
# save as image
fig7.write_image('8_urls.png')
from zipfile import ZipFile
# Bundle every exported chart image into a single zip archive whose file name
# starts with the chat's date range.
file_paths = [
    '1_groupstats.png',
    '2_individualstats.png',
    '3_percentage_messages.png',
    '4_percentage_media.png',
    '5_percentage_urls.png',
    '6_messages.png',
    '7_media.png',
    '8_urls.png',
]
# add dates to the filename
message_dates = df['date'].dt.date
first_day = message_dates.min().strftime('%d %b %Y')
last_day = message_dates.max().strftime('%d %b %Y')
dates = f'{first_day}_to_{last_day}'
# The handle is named `archive`, not `zip`: binding `zip` at this level would
# hide the builtin zip() for all code that runs afterwards.
with ZipFile(dates + '_CoolWhatsappGraphs.zip', 'w') as archive:
    for png_path in file_paths:
        archive.write(png_path)