# ddbapi provides access to newspaper data from the DDB Zeitungsportal.
import pandas as pd
from ddbapi import zp_issues, zp_pages, list_column, filter
# Query newspaper pages: a publication date range (Lucene syntax),
# the place of distribution, and full-text search terms.
df = zp_pages(
    publication_date='[1850-01-01T12:00:00Z TO 1900-12-31T12:00:00Z]',
    place_of_distribution='Berlin',
    plainpagefulltext=["Sansibar", "Wilhelm"]
)
df  # inspect the resulting DataFrame
# Export the results to Excel and to semicolon-separated CSV.
df.to_excel('newspaper.xlsx', index=False)
df.to_csv('newspaper.csv', sep=';', index=False)
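# Minimal sketch (assumption: you reload the CSV in a later session);
# parse_dates restores publication_date as datetime so .dt accessors work.
df = pd.read_csv('newspaper.csv', sep=';', parse_dates=['publication_date'])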
df['zdb_id'].value_counts().reset_index()  # pages per newspaper (ZDB ID)
# Keep only pages published on a Sunday (Monday = 0, Sunday = 6).
df[df['publication_date'].dt.dayofweek == 6]
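# Related sketch: distribution of pages over all weekdays,
# ordered Monday (0) through Sunday (6).
df['publication_date'].dt.dayofweek.value_counts().sort_index()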
df.iloc[1067]['plainpagefulltext'].split()  # tokenize the full text of one page
# Pages whose full text contains both "Marine" and "Mehrheit" as whole words;
# the raw string (r'...') keeps the \W regex escapes intact.
df.query(r'plainpagefulltext.str.contains("\WMarine\W") and plainpagefulltext.str.contains("\WMehrheit\W")', engine='python')
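# Equivalent sketch using boolean masks instead of df.query
# (assumes plainpagefulltext holds one plain string per page).
has_marine = df['plainpagefulltext'].str.contains(r'\WMarine\W', regex=True)
has_mehrheit = df['plainpagefulltext'].str.contains(r'\WMehrheit\W', regex=True)
df[has_marine & has_mehrheit]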
# Print the earliest issue held for each newspaper title.
for title in df['paper_title'].unique():
    print(f"Earliest issue of {title}: {df.query('paper_title == @title')['publication_date'].min()}")
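# The same result without a loop, as an alternative sketch:
# group by title and take the minimum publication date per group.
df.groupby('paper_title')['publication_date'].min()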
df['paper_title'].value_counts().nlargest(10).plot(kind="barh")  # ten most frequent titles
df.query('paper_title.str.contains("ausgabe")', engine="python")  # titles naming an edition ("-ausgabe")
df.query('provider == "Bibliothek der Friedrich-Ebert-Stiftung"')  # pages from this providing institution
# place_of_distribution can hold multiple values per page;
# list_column (from ddbapi) prepares the column for counting.
list_column(df['place_of_distribution']).value_counts()
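# Plain-pandas alternative sketch, assuming the column holds Python lists:
# explode() yields one row per place before counting.
df['place_of_distribution'].explode().value_counts()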
# Concatenate the full text of all pages; joining with a space keeps
# the last word of one page from fusing with the first word of the next.
merged_fulltext = ' '.join(df['plainpagefulltext'])
def count_word_frequency(text):
    """Count how often each word occurs in text, case-insensitively.

    Note: split() keeps punctuation attached, so "Bismarck," and
    "bismarck" are counted as different words.
    """
    counts = dict()
    words = text.split()
    for word in words:
        if word.lower() in counts:
            counts[word.lower()] += 1
        else:
            counts[word.lower()] = 1
    # Sort by frequency, most common first.
    counts = dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))
    return counts
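# Equivalent sketch using collections.Counter, which implements
# the same counting and ordering logic in a single expression.
from collections import Counter

def count_word_frequency_counter(text):
    return dict(Counter(word.lower() for word in text.split()).most_common())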
word_count = count_word_frequency(merged_fulltext)
word_count  # inspect the full frequency table
word_count.get('bismarck', 0)  # occurrences of "bismarck" (0 if the word is absent)
# Pages per publication date; sort_index orders the x-axis chronologically
# (value_counts alone would sort by count instead).
df['publication_date'].value_counts().sort_index().plot(figsize=(20,10), ylabel="Count", xlabel="Date")
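# Coarser sketch: aggregate to counts per year for a smoother curve.
df['publication_date'].dt.year.value_counts().sort_index().plot(
    figsize=(20, 10), ylabel="Count", xlabel="Year"
)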