import pandas as pd
cities = pd.DataFrame({
    "name": ["Athens", "Bratislava", "Copenhagen", "Dublin"],
    "area": [39, 367.6, 86.2, 115],
    "elevation": [170, 152, 14, 20],
    "population": [664046, 429564, 602481, 553165]
})
cities
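# A minimal sketch: a DataFrame created by hand can also be written to disk and
# read back; the file name "cities.csv" is just an illustrative choice.
cities.to_csv("cities.csv", index=False)
pd.read_csv("cities.csv")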
file = open("anscombe.json")
anscombe_json = file.read()
file.close()
print(anscombe_json)
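# A minimal alternative sketch: a "with" block closes the file automatically,
# even if an error occurs while reading.
with open("anscombe.json") as file:
    anscombe_json = file.read()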
import requests
response = requests.get('https://www.tagesschau.de/xml/rss2/')
tagesschau_xml = response.text
print(tagesschau_xml[:1500])  # this displays the first 1,500 characters
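# A minimal sketch: before working with response.text it can help to confirm
# that the request succeeded; raise_for_status() raises an error for 4xx/5xx codes.
print(response.status_code)
response.raise_for_status()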
base_url = "https://hub.culturegraph.org/entityfacts/"
gnd_id = "118564943" # GND identifier found on Wikipedia page for Käthe Kollwitz (scroll to bottom)
gnd_response = requests.get(base_url + gnd_id).text
print(gnd_response[:1000])
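# A minimal sketch: the Entity Facts API returns JSON, so the response can be
# decoded into a Python dict directly; the "preferredName" key is an assumption
# about the structure of the returned record.
gnd_data = requests.get(base_url + gnd_id).json()
gnd_data.get("preferredName")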
kitas = pd.read_csv(
    "https://opendata.potsdam.de/explore/dataset/kitaboerse-20161108/download/",
    sep=";"
)
kitas.head()
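# A minimal sketch: saving a local copy avoids downloading the file again on
# every run; the file name "kitas_potsdam.csv" is just an illustrative choice.
kitas.to_csv("kitas_potsdam.csv", sep=";", index=False)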
import json
anscombe_dict = json.loads(anscombe_json)
anscombe = pd.DataFrame.from_dict(anscombe_dict)
anscombe
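# A minimal sketch: pandas can also read the JSON file directly, without the
# intermediate json.loads() step; whether the result matches the manual version
# exactly depends on the structure of anscombe.json.
pd.read_json("anscombe.json")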
import xml.etree.ElementTree as ET
tagesschau = ET.fromstring(tagesschau_xml)
tagesschau
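# A minimal sketch: listing the direct children of the root element helps to
# understand the structure of the feed before extracting specific items.
for child in tagesschau:
    print(child.tag)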
# create two empty lists
dates = []
titles = []
# go through all item elements in the tree
for item in tagesschau.findall('.//item'):
    # extract date information and titles
    dates.append(item.find('.//pubDate').text)
    titles.append(item.find('.//title').text)
# create a dataframe containing the two columns
tagesschau_df = pd.DataFrame({
    'date': dates,
    'title': titles
})
tagesschau_df
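# A minimal sketch: the pubDate strings can be converted into proper datetime
# values; utc=True normalises the timezone offsets, assuming the feed uses the
# usual RFC 822 date format.
tagesschau_df['date'] = pd.to_datetime(tagesschau_df['date'], utc=True)
tagesschau_df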
import bs4
permalink = "https://en.wikipedia.org/w/index.php?title=List_of_largest_cities&oldid=1219932710"
wiki = requests.get(permalink).text
soup = bs4.BeautifulSoup(wiki, "html.parser")
# extract the table with the class wikitable
table = soup.find('table', class_='wikitable')
# create some empty lists for city names and populations
names = []
pops = []
# iterate over all rows, <tr> is the html element for table rows
rows = table.find_all('tr')
for row in rows:
    # the city name is in the first row-heading cell (th)
    heads = row.find_all('th')
    # we extract the name only if there is at least one th cell
    if len(heads) > 0:
        # extract the text regardless of whether it is wrapped in a link
        name = heads[0].get_text(strip=True)
    # if there is no th cell, we skip and go to the next row
    else:
        continue
    # we gather all regular table cells (td)
    cells = row.find_all('td')
    # the population number is in the 7th td cell, so we need at least 7 cells
    if len(cells) > 6:
        pop = cells[6]
        # we remove commas and any extra whitespace
        pop = pop.text.replace(",", "").strip()
        # if the content of the string is not numeric, we do not include it
        if not pop.isnumeric():
            continue
        # finally add the city population and name
        pops.append(pop)
        names.append(name)
    else:
        continue
# generate a dataframe based on the city names & populations
df = pd.DataFrame({"city": names, "population": pops})
df.population = pd.to_numeric(df.population) # cast string to numeric values
df
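# A minimal alternative sketch: pandas can read all <table> elements from the
# page directly (this requires an HTML parser such as lxml or html5lib to be
# installed); which list entry corresponds to the wikitable depends on the page layout.
tables = pd.read_html(permalink)
len(tables)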
kitas.head()
kitas.describe()
kitas.info()
kitas = kitas.rename(columns={"name_der_kindertagesbetreuungseinrichtung": "name"})
kitas["name"]
kitas[["name", "betrieb_e"]]
kitas[kitas["betrieb_e"].notnull()][["name", "betrieb_e"]]
kitas['name_copy'] = kitas['name']
kitas = kitas.set_index("name_copy")
kitas.iloc[0]
kitas.loc["Kastanienhof, Hort"]
kitas.at["Kastanienhof, Hort", "e_mail"]
kitas.iat[0, 14]
kitas[kitas['trager'].str.contains('AWO')]
import re  # regular expressions module, used here for a case-insensitive query
kitas[kitas['name'].str.contains('Spatzen', flags=re.IGNORECASE, regex=True)]["name"]
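# A minimal sketch: several boolean conditions can be combined with & (and) and
# | (or); each condition needs its own parentheses.
kitas[(kitas['trager'].str.contains('AWO')) & (kitas['betrieb_e'].notnull())][["name", "betrieb_e"]]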