import pandas as pd
import altair as alt
url = "https://gist.github.com/scotthmurray/f71065a5694f22259bf9/raw/ce891b9fe7ec3c5cab3308f4cd0c8eeccc36f6c7/Better%2520Life%2520Index%2520Data.csv"
oecd = pd.read_csv(url)
# to keep things consistent with the other examples, we convert the column names to lowercase
oecd.columns = oecd.columns.str.lower()
geonames_full = pd.read_csv("https://www.geonames.org/countryInfoCSV", sep='\t', keep_default_na=False)
geonames_full.head()
geonames = geonames_full[['name', 'iso alpha3', 'areaInSqKm', 'population']]
geonames.columns = ["country", "code", "area", "population"]
geonames = geonames.set_index("code")
geonames
worldbank_full = pd.read_csv("worldbank_gdp.csv", header=2)
worldbank = worldbank_full[ ["Country Code", "2020"] ]
worldbank.columns = ["code", "gdp"]
worldbank = worldbank.set_index("code")
worldbank
# https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide
ecdc_full = pd.read_csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv")
ecdc = ecdc_full[ ["cases", "deaths", "countryterritoryCode"] ]
ecdc.columns = ["covid19_cases", "covid19_deaths", "code"]
ecdc = ecdc.set_index("code")
ecdc = ecdc.groupby("code").sum()
ecdc
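# optional sanity check of the aggregation: totals for a single country
# (this assumes the ISO-3 code "DEU" appears in the ECDC data)
ecdc.loc["DEU"]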
multiple = geonames.join(worldbank).join(ecdc)
# from here on we no longer need the country codes, so we can remove them:
multiple = multiple.reset_index(drop=True)
multiple
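# before dropping incomplete rows, it can help to see which countries
# have missing values; a small optional check on the joined DataFrame:
multiple[multiple.isna().any(axis=1)]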
multiple = multiple.dropna().reset_index(drop=True)
multiple
import requests
endpoint = "https://query.wikidata.org/sparql"
# triple quotes start and end multi-line strings
sparql = """
SELECT ?countryLabel (MAX(?populations) AS ?population) ?gdp (MAX(?areas) AS ?area)
WHERE {
  # EU member states: sovereign states (Q3624078) that are members of (P463) the EU (Q458)
  ?country wdt:P31 wd:Q3624078;
           wdt:P463 wd:Q458;
           wdt:P1082 ?populations;
           wdt:P8744 ?economy;
           p:P2046/psn:P2046/wikibase:quantityAmount ?areas.
  # nominal GDP (P2131) is recorded on the country's economy item (P8744),
  # together with a point-in-time qualifier (P585)
  ?economy p:P2131 ?gdp_statement.
  ?gdp_statement ps:P2131 ?gdp;
                 pq:P585 ?gdp_date.
  # keep only the most recent GDP value per country
  FILTER NOT EXISTS {
    ?economy p:P2131/pq:P585 ?gdp_date_ .
    FILTER (?gdp_date_ > ?gdp_date)
  }
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
GROUP BY ?countryLabel ?gdp
"""
res = requests.get(endpoint, params = {'format': 'json', 'query': sparql})
response = res.text
print(response[:1000])
import json
# let's define the parsing steps as a function, so that you can reuse it later
def wikidata_to_dataframe(data):
    # parse json data from response and get results:
    results = json.loads(data)["results"]["bindings"]
    # column names we draw from the first result
    cols = [ val for val in results[0] ]
    rows = []
    # to get the values from this, we need to loop through the results:
    for result in results:
        values = [ result[val]["value"] for val in result ]
        rows.append(values)
    # with rows and cols we can create a DataFrame:
    return pd.DataFrame(rows, columns=cols)
wikidata = wikidata_to_dataframe(response)
wikidata = wikidata.rename(columns={'countryLabel':'country'})
wikidata
wikidata.info()
cols = wikidata.columns[1:]
wikidata[cols] = wikidata[cols].apply(pd.to_numeric)
wikidata.info()
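# as an optional alternative to the manual loop above, pandas can flatten
# the nested JSON directly with json_normalize; this sketch keeps only the
# ".value" columns and strips the suffix from their names
flat = pd.json_normalize(json.loads(response)["results"]["bindings"])
flat = flat[[c for c in flat.columns if c.endswith(".value")]]
flat.columns = [c.replace(".value", "") for c in flat.columns]
flat.head()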
sparql = """
# 1. enter your query here
"""
# 2. uncomment the following lines
# res = requests.get(endpoint, params = {'format': 'json', 'query': sparql})
# your_wikidata = wikidata_to_dataframe(res.text)
# your_wikidata
oecd[oecd.columns[1:]].corr()
# import a few components from the machine learning library scikit-learn:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# create a data pipeline, which includes scaling to normalize the data
# followed by a PCA initialized with the number of principal components: 2
pipe = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=2))])
# run the PCA on all columns (except the first which is the country name)
principal_components = pipe.fit_transform(oecd[oecd.columns[1:]])
# let's have a look
principal_components
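# to see how much of the variance the two components retain,
# we can inspect the fitted PCA step of the pipeline
pipe.named_steps['pca'].explained_variance_ratio_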
# first turn principal_components into a DataFrame
pca_positions = pd.DataFrame(principal_components, columns = ['x', 'y'])
# … and combine it with the source dataframe oecd
oecd_pca = pd.concat([oecd, pca_positions], axis = 1)
alt.Chart(oecd_pca).mark_circle().encode(
    x=alt.X('x', axis=alt.Axis(labels=False)),
    y=alt.Y('y', axis=alt.Axis(labels=False)),
    tooltip='country'
).properties(width=400, height=400)
# 1. run PCA on the numeric columns
# 2. combine the positions with original data
# 3. display a scatterplot
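# one possible sketch for the steps above, using the wikidata DataFrame
# (an example choice; the other numeric datasets work the same way):
pca_pipe = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=2))])
components = pca_pipe.fit_transform(wikidata[wikidata.columns[1:]])
wikidata_pca = pd.concat([wikidata, pd.DataFrame(components, columns=['x', 'y'])], axis=1)
alt.Chart(wikidata_pca).mark_circle().encode(
    x=alt.X('x', axis=alt.Axis(labels=False)),
    y=alt.Y('y', axis=alt.Axis(labels=False)),
    tooltip='country'
).properties(width=400, height=400)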
# import the umap library
import umap.umap_ as umap
import warnings
# the umap package generates a number of warnings, which we suppress here
warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
# there are two main parameters that you may need to tweak
reducer = umap.UMAP(
    n_neighbors=15, # balances local versus global structure
    min_dist=.4     # .01 for tight clumps, larger values for looser ones
)
# the datasets have a similar structure; you can replace oecd below with multiple or wikidata:
data = oecd
# we use again the StandardScaler for normalization
# and add the UMAP reducer afterwards into the pipeline
pipe = Pipeline([('scaling', StandardScaler()), ('umap', reducer)])
# start the normalization and reduction steps
embedding = pipe.fit_transform(data[data.columns[1:]])
# turn the resulting embedding into a DataFrame, with the positions as columns x and y
umap_positions = pd.DataFrame(embedding, columns=["x", "y"])
# … and merge it with the original dataset
data_umap = pd.concat([data, umap_positions], axis = 1)
# to center the scatterplot around generated positions,
# we adjust the scales according to the smallest and largest x and y values:
x_domain = [data_umap["x"].min(), data_umap["x"].max() ]
y_domain = [data_umap["y"].min(), data_umap["y"].max() ]
# display scatterplot and pass domains for x and y to scale parameter
alt.Chart(data_umap).mark_circle().encode(
    alt.X('x', scale=alt.Scale(domain=x_domain), axis=alt.Axis(labels=False)),
    alt.Y('y', scale=alt.Scale(domain=y_domain), axis=alt.Axis(labels=False)),
    tooltip="country"
).properties(width=400, height=400)
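# the embedding depends strongly on the two parameters above; as an optional
# experiment (example values, not from the original), re-run the pipeline with
# a tighter setting and rebuild the scatterplot from the new embedding as above
reducer_tight = umap.UMAP(n_neighbors=5, min_dist=.01)
pipe_tight = Pipeline([('scaling', StandardScaler()), ('umap', reducer_tight)])
embedding_tight = pipe_tight.fit_transform(data[data.columns[1:]])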
data = wikidata
cols = ["area", "population", "gdp"]
alt.Chart(data).mark_circle().encode(
    # the data dimensions used for the encoding are specified below under repeat
    # they are all quantitative and we remove the axis labels to avoid clutter
    alt.X(alt.repeat("column"), type='quantitative', axis=alt.Axis(labels=False)),
    alt.Y(alt.repeat("row"), type='quantitative', axis=alt.Axis(labels=False)),
    # we add a tooltip with the country's name
    tooltip="country"
).properties(width=150, height=150).repeat(
    # specify which data columns are used
    column=cols,
    row=cols
)
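# area, population and gdp are heavily skewed, so most points crowd into one
# corner; an optional variant (not part of the original chart) puts the
# repeated axes on a logarithmic scale
alt.Chart(data).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative',
          scale=alt.Scale(type='log'), axis=alt.Axis(labels=False)),
    alt.Y(alt.repeat("row"), type='quantitative',
          scale=alt.Scale(type='log'), axis=alt.Axis(labels=False)),
    tooltip="country"
).properties(width=150, height=150).repeat(column=cols, row=cols)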
corr = oecd[oecd.columns[1:]].corr()
# first we reset the index and call it dim1
corr = corr.reset_index().rename(columns={'index': 'dim1'})
# turn correlation data into long form
corr = pd.melt(corr, id_vars="dim1", var_name='dim2', value_name='corr')
# add a label column for rounded correlation values
corr['label'] = corr['corr'].map('{:.1f}'.format)
corr
# we create a layered chart, with the base taking in the correlation data corr
# and the basic layout based on the dimensions
base = alt.Chart(corr).encode(
    x='dim1:O',
    y='dim2:O'
).properties(width=500, height=500)
# a textual layer displaying rounded correlation values
text = base.mark_text().encode( text='label' )
# heatmap of the correlation values
plot = base.mark_rect().encode(
    color='corr:Q'
)
# both layers are combined
plot + text
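# correlations range from -1 to 1, so an optional variant (not part of the
# original) uses a diverging colour scheme with a fixed domain to make the
# sign of the correlation easier to read
plot_diverging = base.mark_rect().encode(
    color=alt.Color('corr:Q', scale=alt.Scale(scheme='redblue', domain=[-1, 1]))
)
plot_diverging + text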
# to coordinate hover highlights we create a selection
selection = alt.selection_point(on='mouseover', fields=['country'])
# the definitions of the base are used by the three sub-charts
base = alt.Chart(wikidata).mark_bar().encode(
    # adjust opacity based on hover selection
    opacity=alt.condition(selection, alt.value(1), alt.value(.5)),
    x=alt.X("country:O").sort("-y").axis(None),
    tooltip=['country', 'population', 'area', 'gdp']
).properties(
    width=600, height=150
).add_params(selection)
# create a chart for each dimension
pop = base.encode(y = "population")
area = base.encode(y = "area")
gdp = base.encode(y = "gdp")
# combine them with ampersands
pop & area & gdp