import pandas as pd
import altair as alt
url = "https://gist.github.com/scotthmurray/f71065a5694f22259bf9/raw/ce891b9fe7ec3c5cab3308f4cd0c8eeccc36f6c7/Better%2520Life%2520Index%2520Data.csv"
oecd = pd.read_csv(url)
# to keep things consistent with the other examples, we convert the column names to lowercase
oecd.columns = oecd.columns.str.lower()
geonames_full = pd.read_csv("https://www.geonames.org/countryInfoCSV", sep='\t', keep_default_na=False)
geonames_full.head()
geonames = geonames_full[['name', 'iso alpha3', 'areaInSqKm', 'population']]
geonames.columns = ["country", "code", "area", "population"]
geonames = geonames.set_index("code")
geonames
worldbank_full = pd.read_csv("worldbank_gdp.csv", header=2)
worldbank = worldbank_full[ ["Country Code", "2020"] ]
worldbank.columns = ["code", "gdp"]
worldbank = worldbank.set_index("code")
worldbank
# https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide
ecdc_full = pd.read_csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv")
ecdc = ecdc_full[ ["cases", "deaths", "countryterritoryCode"] ]
ecdc.columns = ["covid19_cases", "covid19_deaths", "code"]
ecdc = ecdc.set_index("code")
ecdc = ecdc.groupby("code").sum()
ecdc
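# optional sanity check of the aggregation: totals for a single country
# (this assumes the ISO-3 code "DEU" appears in the ECDC data)
ecdc.loc["DEU"]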
multiple = geonames.join(worldbank).join(ecdc)
# from here on we no longer need the country codes, so we can remove them:
multiple = multiple.reset_index(drop=True)
multiple
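# before dropping incomplete rows, it can help to see which countries
# have missing values; a small optional check on the joined DataFrame:
multiple[multiple.isna().any(axis=1)]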
multiple = multiple.dropna().reset_index(drop=True)
multiple
import requests
endpoint = "https://query.wikidata.org/sparql"
# triple quotes start and end multi-line strings
sparql = """
SELECT ?countryLabel (MAX(?populations) AS ?population) ?gdp (MAX(?areas) AS ?area)
WHERE {
  # EU member states: sovereign states (Q3624078) that are members of (P463) the EU (Q458)
  ?country wdt:P31 wd:Q3624078;
           wdt:P463 wd:Q458;
           wdt:P1082 ?populations;
           wdt:P8744 ?economy;
           p:P2046/psn:P2046/wikibase:quantityAmount ?areas.
  # nominal GDP (P2131) is recorded on the country's economy item (P8744),
  # together with a point-in-time qualifier (P585)
  ?economy p:P2131 ?gdp_statement.
  ?gdp_statement ps:P2131 ?gdp;
                 pq:P585 ?gdp_date.
  # keep only the most recent GDP value per country
  FILTER NOT EXISTS {
    ?economy p:P2131/pq:P585 ?gdp_date_ .
    FILTER (?gdp_date_ > ?gdp_date)
  }
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
GROUP BY ?countryLabel ?gdp
"""
res = requests.get(endpoint, params = {'format': 'json', 'query': sparql})
response = res.text
print(response[:1000])
import json
# let's define the parsing steps as a function, so that you can reuse it later
def wikidata_to_dataframe(data):
    # parse json data from response and get results:
    results = json.loads(data)["results"]["bindings"]
    # column names we draw from the first result
    cols = [ val for val in results[0] ]
    rows = []
    # to get the values from this, we need to loop through the results:
    for result in results:
        values = [ result[val]["value"] for val in result ]
        rows.append(values)
    # with rows and cols we can create a DataFrame:
    return pd.DataFrame(rows, columns=cols)
wikidata = wikidata_to_dataframe(response)
wikidata = wikidata.rename(columns={'countryLabel':'country'})
wikidata
wikidata.info()
cols = wikidata.columns[1:]
wikidata[cols] = wikidata[cols].apply(pd.to_numeric)
wikidata.info()
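# as an optional alternative to the manual loop above, pandas can flatten
# the nested JSON directly with json_normalize; this sketch keeps only the
# ".value" columns and strips the suffix from their names
flat = pd.json_normalize(json.loads(response)["results"]["bindings"])
flat = flat[[c for c in flat.columns if c.endswith(".value")]]
flat.columns = [c.replace(".value", "") for c in flat.columns]
flat.head()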
sparql = """
# 1. enter your query here
"""
# 2. uncomment the following lines
# res = requests.get(endpoint, params = {'format': 'json', 'query': sparql})
# your_wikidata = wikidata_to_dataframe(res.text)
# your_wikidata
oecd[oecd.columns[1:]].corr()
# import a few components from the machine learning library scikit-learn:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# create a data pipeline, which includes scaling to normalize the data
# followed by a PCA initialized with the number of principal components: 2
pipe = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=2))])
# run the PCA on all columns (except the first which is the country name)
principal_components = pipe.fit_transform(oecd[oecd.columns[1:]])
# let's have a look
principal_components
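# to see how much of the variance the two components retain,
# we can inspect the fitted PCA step of the pipeline
pipe.named_steps['pca'].explained_variance_ratio_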
# first turn principal_components into a DataFrame
pca_positions = pd.DataFrame(principal_components, columns = ['x', 'y'])
# … and combine it with the source dataframe oecd
oecd_pca = pd.concat([oecd, pca_positions], axis = 1)
alt.Chart(oecd_pca).mark_circle().encode(
    x=alt.X('x', axis=alt.Axis(labels=False)),
    y=alt.Y('y', axis=alt.Axis(labels=False)),
    tooltip='country'
).properties(width=400, height=400)
# 1. run PCA on the numeric columns
# 2. combine the positions with original data
# 3. display a scatterplot
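# one possible sketch for the steps above, using the wikidata DataFrame
# (an example choice; the other numeric datasets work the same way):
pca_pipe = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=2))])
components = pca_pipe.fit_transform(wikidata[wikidata.columns[1:]])
wikidata_pca = pd.concat([wikidata, pd.DataFrame(components, columns=['x', 'y'])], axis=1)
alt.Chart(wikidata_pca).mark_circle().encode(
    x=alt.X('x', axis=alt.Axis(labels=False)),
    y=alt.Y('y', axis=alt.Axis(labels=False)),
    tooltip='country'
).properties(width=400, height=400)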
# import the umap library
import umap.umap_ as umap
import warnings
# the umap package generates a number of warnings, which we suppress here
warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
# there are two main parameters that you may need to tweak
reducer = umap.UMAP(
    n_neighbors=15, # balances local versus global structure
    min_dist=.4     # .01 for tight clumps, larger values for looser ones
)
# the datasets have a similar structure; you can replace oecd below with multiple or wikidata:
data = oecd
# we use again the StandardScaler for normalization
# and add the UMAP reducer afterwards into the pipeline
pipe = Pipeline([('scaling', StandardScaler()), ('umap', reducer)])
# start the normalization and reduction steps
embedding = pipe.fit_transform(data[data.columns[1:]])
# turn the resulting embedding into a DataFrame, with the positions as columns x and y
umap_positions = pd.DataFrame(embedding, columns=["x", "y"])
# … and merge it with the original dataset
data_umap = pd.concat([data, umap_positions], axis = 1)
# to center the scatterplot around generated positions,
# we adjust the scales according to the smallest and largest x and y values:
x_domain = [data_umap["x"].min(), data_umap["x"].max() ]
y_domain = [data_umap["y"].min(), data_umap["y"].max() ]
# display scatterplot and pass domains for x and y to scale parameter
alt.Chart(data_umap).mark_circle().encode(
    alt.X('x', scale=alt.Scale(domain=x_domain), axis=alt.Axis(labels=False)),
    alt.Y('y', scale=alt.Scale(domain=y_domain), axis=alt.Axis(labels=False)),
    tooltip="country"
).properties(width=400, height=400)
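# the embedding depends strongly on the two parameters above; as an optional
# experiment (example values, not from the original), re-run the pipeline with
# a tighter setting and rebuild the scatterplot from the new embedding as above
reducer_tight = umap.UMAP(n_neighbors=5, min_dist=.01)
pipe_tight = Pipeline([('scaling', StandardScaler()), ('umap', reducer_tight)])
embedding_tight = pipe_tight.fit_transform(data[data.columns[1:]])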
data = wikidata
cols = ["area", "population", "gdp"]
alt.Chart(data).mark_circle().encode(
    # the data dimensions used for the encoding are specified below under repeat
    # they are all quantitative and we remove the axis labels to avoid clutter
    alt.X(alt.repeat("column"), type='quantitative', axis=alt.Axis(labels=False)),
    alt.Y(alt.repeat("row"), type='quantitative', axis=alt.Axis(labels=False)),
    # we add a tooltip with the country's name
    tooltip="country"
).properties(width=150, height=150).repeat(
    # specify which data columns are used
    column=cols,
    row=cols
)
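# area, population and gdp are heavily skewed, so most points crowd into one
# corner; an optional variant (not part of the original chart) puts the
# repeated axes on a logarithmic scale
alt.Chart(data).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative',
          scale=alt.Scale(type='log'), axis=alt.Axis(labels=False)),
    alt.Y(alt.repeat("row"), type='quantitative',
          scale=alt.Scale(type='log'), axis=alt.Axis(labels=False)),
    tooltip="country"
).properties(width=150, height=150).repeat(column=cols, row=cols)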
corr = oecd[oecd.columns[1:]].corr()
# first we reset the index and call it dim1
corr = corr.reset_index().rename(columns={'index': 'dim1'})
# turn correlation data into long form
corr = pd.melt(corr, id_vars="dim1", var_name='dim2', value_name='corr')
# add a label column for rounded correlation values
corr['label'] = corr['corr'].map('{:.1f}'.format)
corr
# we create a layered chart, with the base taking in the correlation data corr
# and the basic layout based on the dimensions
base = alt.Chart(corr).encode(
    x='dim1:O',
    y='dim2:O'
).properties(width=500, height=500)
# a textual layer displaying rounded correlation values
text = base.mark_text().encode( text='label' )
# heatmap of the correlation values
plot = base.mark_rect().encode(
    color='corr:Q'
)
# both layers are combined
plot + text
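# correlations range from -1 to 1, so an optional variant (not part of the
# original) uses a diverging colour scheme with a fixed domain to make the
# sign of the correlation easier to read
plot_diverging = base.mark_rect().encode(
    color=alt.Color('corr:Q', scale=alt.Scale(scheme='redblue', domain=[-1, 1]))
)
plot_diverging + text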
# to coordinate hover highlights we create a selection
selection = alt.selection_point(on='mouseover', fields=['country'])
# the definitions of the base are used by the three sub-charts
base = alt.Chart(wikidata).mark_bar().encode(
    # adjust opacity based on hover selection
    opacity=alt.condition(selection, alt.value(1), alt.value(.5)),
    x=alt.X("country:O").sort("-y").axis(None),
    tooltip=['country', 'population', 'area', 'gdp']
).properties(
    width=600, height=150
).add_params(selection)
# create a chart for each dimension
pop = base.encode(y = "population")
area = base.encode(y = "area")
gdp = base.encode(y = "gdp")
# combine them with ampersands
pop & area & gdp