import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
import seaborn as sns
# Load the scraped RSS-feed backup and parse the 'date' column into datetimes.
# The bare expressions are notebook-style previews of the frame.
feed_df = pd.read_csv('Backup_RSS_Feed-scrap.csv')
feed_df.head()
feed_df['date'] = pd.to_datetime(feed_df['date'])
feed_df.head()
feed_df

# Annotate every row with the number of titles counted for its newspaper,
# then collapse to one row per distinct (Newspaper, count) pair.
feed_df['count'] = feed_df.groupby(['Newspaper'])['title'].transform('count')
nr_posts_published = feed_df.loc[:, ['Newspaper', 'count']].drop_duplicates()
nr_posts_published

# Bar chart: one bar per newspaper, bar height = its post count.
plt.bar(
    x=nr_posts_published['Newspaper'],
    height=nr_posts_published['count'],
)
plt.title('Posts released by Newssources via their RSS-Feed')
plt.show()
# Derive each post's time of day (a Timedelta since midnight, floored to the
# minute) from its publication timestamp.
time_of_day = pd.to_timedelta(feed_df['date'].dt.time.astype(str))
feed_df['time'] = time_of_day.dt.floor('min')

# Work on an explicit copy: adding columns to (and de-duplicating in place) a
# bare column selection of feed_df is what triggers SettingWithCopyWarning.
posts_per_timestamp = feed_df[['time', 'Newspaper']].copy()

# Time of day expressed in whole seconds and whole minutes since midnight.
posts_per_timestamp['seconds'] = posts_per_timestamp['time'].dt.seconds
posts_per_timestamp['minutes'] = (posts_per_timestamp['seconds'] // 60).astype('int32')

# Keep a single row per (minute of day, newspaper) combination.
posts_per_timestamp = posts_per_timestamp.drop_duplicates()
posts_per_timestamp.head()
/opt/venv/lib/python3.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""
/opt/venv/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/opt/venv/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import sys
# One histogram per news source showing at which time of day its posts were
# published, using 48 half-hour bins spanning the full 24 hours.
SECONDS_PER_DAY = 24 * 60 * 60
sources = np.unique(posts_per_timestamp['Newspaper'])

# Size the grid from the data instead of assuming exactly five sources.
# squeeze=False keeps `axs` an array even when there is only one subplot.
fig, axs = plt.subplots(max(len(sources), 1), figsize=(10, 10), squeeze=False)
axs = axs.ravel()

# Tick labels every two hours: '00:00', '02:00', ..., '22:00'.
hours = [dt.time(i*2).strftime('%H:00') for i in range(12)]

for ax, source in zip(axs, sources):
    df_ = posts_per_timestamp[posts_per_timestamp['Newspaper'] == source]
    # A fixed range makes every bin exactly 30 minutes wide and keeps the
    # x-axes of all subplots comparable.
    ax.hist(df_['seconds'], bins=48, range=(0, SECONDS_PER_DAY))
    ax.set_xlim(0, SECONDS_PER_DAY)
    ax.set_title('Post Frequency of ' + source)
    ax.set_xticks([])

# Only the bottom subplot carries the time axis; place each label at the
# second-of-day it actually denotes (every 2 hours = 7200 s).
axs[-1].set_xticks(np.arange(12) * 2 * 60 * 60)
axs[-1].set_xticklabels(hours, rotation=45)
axs[-1].set(xlabel='Time', ylabel='Posts per half-hour')
fig.show()
def find_in_str(series, strings = 'corona|covid|covid19|china-virus|chinavirus|covid-19'):
    """Return a boolean mask marking the entries of *series* that match *strings*.

    Args:
        series: pandas Series of text values; missing entries are allowed.
        strings: Regular expression to search for (alternatives separated by
            ``|``). Matching is case-insensitive.

    Returns:
        A boolean Series aligned with *series*. Missing entries count as
        "no match" (False) instead of propagating NaN, so the result can be
        summed or used directly as a boolean indexer.
    """
    return series.str.contains(strings, case=False, na=False)
# Flag every post whose summary matches the corona-related search pattern.
feed_df['subject_corona'] = find_in_str(feed_df['summary'])

# Start a fresh figure so the bars are not drawn onto the axes of whichever
# figure happens to be current (e.g. the last histogram subplot).
plt.figure()
plt.title('Total News vs. News about Corona')
# Compare the total number of posts with the number of flagged posts.
plt.bar(['Total News', 'News about Corona'], [len(feed_df), feed_df['subject_corona'].sum()])
plt.show()
feed_df['subject_corona']