# ddbapi provides access to newspaper data from the DDB Zeitungsportal.
import pandas as pd
from ddbapi import zp_issues, zp_pages, list_column, filter
# Query newspaper pages: a publication date range (Lucene syntax),
# the place of distribution, and full-text search terms.
df = zp_pages(
    publication_date='[1850-01-01T12:00:00Z TO 1900-12-31T12:00:00Z]',
    place_of_distribution='Berlin',
    plainpagefulltext=["Sansibar", "Wilhelm"]
)
df  # inspect the resulting DataFrame
# Export the results to Excel and to semicolon-separated CSV.
df.to_excel('newspaper.xlsx', index=False)
df.to_csv('newspaper.csv', sep=';', index=False)
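# Minimal sketch (assumption: you reload the CSV in a later session);
# parse_dates restores publication_date as datetime so .dt accessors work.
df = pd.read_csv('newspaper.csv', sep=';', parse_dates=['publication_date'])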
df['zdb_id'].value_counts().reset_index()  # pages per newspaper (ZDB ID)
# Keep only pages published on a Sunday (Monday = 0, Sunday = 6).
df[df['publication_date'].dt.dayofweek == 6]
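# Related sketch: distribution of pages over all weekdays,
# ordered Monday (0) through Sunday (6).
df['publication_date'].dt.dayofweek.value_counts().sort_index()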
df.iloc[1067]['plainpagefulltext'].split()  # tokenize the full text of one page
# Pages whose full text contains both "Marine" and "Mehrheit" as whole words;
# the raw string (r'...') keeps the \W regex escapes intact.
df.query(r'plainpagefulltext.str.contains("\WMarine\W") and plainpagefulltext.str.contains("\WMehrheit\W")', engine='python')
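# Equivalent sketch using boolean masks instead of df.query
# (assumes plainpagefulltext holds one plain string per page).
has_marine = df['plainpagefulltext'].str.contains(r'\WMarine\W', regex=True)
has_mehrheit = df['plainpagefulltext'].str.contains(r'\WMehrheit\W', regex=True)
df[has_marine & has_mehrheit]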
# Print the earliest issue held for each newspaper title.
for title in df['paper_title'].unique():
    print(f"Earliest issue of {title}: {df.query('paper_title == @title')['publication_date'].min()}")
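# The same result without a loop, as an alternative sketch:
# group by title and take the minimum publication date per group.
df.groupby('paper_title')['publication_date'].min()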
df['paper_title'].value_counts().nlargest(10).plot(kind="barh")  # ten most frequent titles
df.query('paper_title.str.contains("ausgabe")', engine="python")  # titles naming an edition ("-ausgabe")
df.query('provider == "Bibliothek der Friedrich-Ebert-Stiftung"')  # pages from this providing institution
# place_of_distribution can hold multiple values per page;
# list_column (from ddbapi) prepares the column for counting.
list_column(df['place_of_distribution']).value_counts()
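# Plain-pandas alternative sketch, assuming the column holds Python lists:
# explode() yields one row per place before counting.
df['place_of_distribution'].explode().value_counts()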
# Concatenate the full text of all pages; joining with a space keeps
# the last word of one page from fusing with the first word of the next.
merged_fulltext = ' '.join(df['plainpagefulltext'])
def count_word_frequency(text):
    """Count how often each word occurs in text, case-insensitively.

    Note: split() keeps punctuation attached, so "Bismarck," and
    "bismarck" are counted as different words.
    """
    counts = dict()
    words = text.split()
    for word in words:
        if word.lower() in counts:
            counts[word.lower()] += 1
        else:
            counts[word.lower()] = 1
    # Sort by frequency, most common first.
    counts = dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))
    return counts
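# Equivalent sketch using collections.Counter, which implements
# the same counting and ordering logic in a single expression.
from collections import Counter

def count_word_frequency_counter(text):
    return dict(Counter(word.lower() for word in text.split()).most_common())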
word_count = count_word_frequency(merged_fulltext)
word_count  # inspect the full frequency table
word_count.get('bismarck', 0)  # occurrences of "bismarck" (0 if the word is absent)
# Pages per publication date; sort_index orders the x-axis chronologically
# (value_counts alone would sort by count instead).
df['publication_date'].value_counts().sort_index().plot(figsize=(20,10), ylabel="Count", xlabel="Date")
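# Coarser sketch: aggregate to counts per year for a smoother curve.
df['publication_date'].dt.year.value_counts().sort_index().plot(
    figsize=(20, 10), ylabel="Count", xlabel="Year"
)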