# Import all the packages we need:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## For statistical tests:
from scipy import stats
## For violin plots:
import seaborn as sns
## For regular expressions:
import re
## Miscellaneous helpers:
import random
from collections import Counter
## Load the dataset into a DataFrame named 'echr'
echr = pd.read_csv('ECtHR.csv', index_col=0)
echr
echr.describe(include='all')
echr['respondent'].unique()
# This shows all respondents, i.e. both cases against a single state and
# cases against multiple states (the latter carry a ';' between the codes).
# Calculate the number of respondents:
# each respondent is a 3-letter code joined by ';', so a string with
# n respondents has length 4n - 1 and (len + 1) // 4 == n
echr['num_respondents'] = (echr['respondent'].str.len() + 1) // 4
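# Sanity check of the length arithmetic above (assuming the 3-letter-code
# convention holds throughout): cross-check against a direct separator count.
mask = echr['respondent'].notna()
assert (echr.loc[mask, 'num_respondents']
        == echr.loc[mask, 'respondent'].str.count(';') + 1).all()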
echr['one_respondent'] = (echr['num_respondents'] == 1)
# Add variable 'key_case' which is true for the most important cases
echr['key_case'] = (echr['importance'] == 'Key case')
# For ease of calculation, add new column 'importance_int' for importance level with type int
# Importance level of 'Key case' corresponds to 0
echr['importance_int'] = echr['importance'].replace('Key case', 0).astype(int)
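# Spot check of the mapping on a hand-made Series (hypothetical values):
print(pd.Series(['Key case', '1', '3']).replace('Key case', 0).astype(int).tolist())
# -> [0, 1, 3]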
# Convert boolean variable 'separateopinion' into float for ease of calculation later on
# (groupby().mean() drops boolean columns)
echr['separateopinion_float'] = echr['separateopinion'].astype(float)
# Calculate the number of individual violations found
echr['num_violations'] = echr['violation'].str.split(';').str.len()
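# Spot check of the split-and-count on hand-made values (hypothetical):
print(pd.Series(['6', '6;13', None]).str.split(';').str.len().tolist())
# -> [1.0, 2.0, nan]  (NaN is preserved where no violation is recorded)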
# Comparing the average importance level for one respondent vs multiple respondents
echr.groupby('one_respondent').mean()[['importance_int', 'separateopinion_float']]
# Comparing the average importance level for any number of respondents
echr.groupby('num_respondents').mean()['importance_int']
# Checking the extremes (comparing the percentage of the most important cases and of the least important cases) for one respondent
echr.loc[echr['one_respondent'], 'importance_int'].value_counts(normalize=True)
# Checking the extremes (comparing the percentage of the most important cases and of the least important cases) for multiple respondents
echr.loc[~echr['one_respondent'], 'importance_int'].value_counts(normalize=True)
# Comparing the percentages for one respondent vs multiple respondents in the following situation:
# given a certain importance level, what is the likelihood that a separateopinion was filed
echr.groupby(['importance_int', 'one_respondent']).mean()['separateopinion_float']
pd.crosstab(index=echr['importance'], columns=echr['num_respondents'], margins=True)
stats.chi2_contingency(
pd.crosstab(index=echr['importance'], columns=echr['num_respondents'], margins=False))
stats.chi2_contingency(
pd.crosstab(index=echr['key_case'], columns=echr['one_respondent'], margins=False),
correction=True) # Yates' correction for 2x2 contingency tables
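# For readability, the same test can be unpacked into its components
# (chi2_contingency returns statistic, p-value, dof, expected frequencies):
chi2, p, dof, expected = stats.chi2_contingency(
    pd.crosstab(index=echr['key_case'], columns=echr['one_respondent']),
    correction=True)
print(f'chi2 = {chi2:.2f}, p = {p:.3g}, dof = {dof}')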
# Number of judgments per year
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').count()['ecli'])
plt.grid()
plt.xlabel('Year')
plt.ylabel('Number of Judgments')
plt.title('Timeline: Number of Judgments per Year')
# Average importance level of cases per year
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').mean()['importance_int'])
plt.ylim(0, 3)
plt.grid(axis='y')
plt.xlabel('Year')
plt.ylabel('Average importance level')
plt.title('Average importance level of cases per year\n(0 most important to 3 least important)')
# Number of cases per year and importance level, stacked
df = echr.groupby(['year', 'importance_int']).count()['ecli'].unstack().fillna(0)[[0, 1, 2, 3]]\
.cumsum(axis=1) # calling cumsum to stack the areas
plt.figure(figsize=(15, 10))
# Make stacked areas
plt.fill_between(df.index, df[2], df[3], color='#CAD593', label='Importance level 3')
plt.fill_between(df.index, df[1], df[2], color='#A1C349', label='Importance level 2')
plt.fill_between(df.index, df[0], df[1], color='#87A330', label='Importance level 1')
plt.fill_between(df.index, 0, df[0], color='#243010', label='Importance level 0')
plt.grid()
plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('Number of cases')
plt.title('Number of cases in each importance level per year')
# When writing our research paper, we need to clarify explicitly whether the data point in 2009 is 1600 or 1100.
# Share of cases per year and importance level, stacked
df = echr.groupby(['year', 'importance_int']).count()['ecli'].unstack().fillna(0)[[0, 1, 2, 3]]
# First dividing each year by total number of cases, then calling cumsum to stack the areas
df = df.divide(df.sum(axis=1), axis=0).cumsum(axis=1)
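# Mini-demo of the divide + cumsum stacking on hypothetical counts:
demo = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
print(demo.divide(demo.sum(axis=1), axis=0).cumsum(axis=1))
# -> 0.1, 0.3, 0.6, 1.0: each fill_between band spans one level's share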
plt.figure(figsize=(15, 10))
# Make stacked areas
plt.fill_between(df.index, df[2], df[3], color='#CAD593', label='Importance level 3')
plt.fill_between(df.index, df[1], df[2], color='#A1C349', label='Importance level 2')
plt.fill_between(df.index, df[0], df[1], color='#87A330', label='Importance level 1')
plt.fill_between(df.index, 0, df[0], color='#243010', label='Importance level 0')
plt.grid()
plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('Share of cases')
plt.title('Share of cases in each importance level per year')
# Some rows do not indicate which articles are concerned in the case
print(echr['article'].isna().sum())
# Try to infer some of the articles from 'conclusion'
echr.loc[echr['article'].isna(), 'conclusion'].unique()
# Use regular expressions to find article mentions in 'conclusion'
echr.loc[echr['article'].isna(), 'conclusion'].str.extract(r"(P?\d[0-9a-z\-]*)", expand=False).unique()
# Fill in gaps in 'article'
echr.loc[echr['article'].isna(), 'article'] \
    = echr.loc[echr['article'].isna(), 'conclusion'].str.extract(r"(P?\d[0-9a-z\-]*)", expand=False)
# Some rows indicate neither violations nor non-violations.
# This may distort some of the following calculations.
# Add boolean variable 'violation_information' for masking
echr['violation_information'] = echr['violation'].notna() | echr['nonviolation'].notna()
print((~echr['violation_information']).sum()) # Number of cases with missing 'violation' and 'nonviolation'
# Check whether 'conclusion' contains information on the cases with missing 'violation' and 'nonviolation'
echr.loc[~echr['violation_information'], 'conclusion'].unique()
# Try to extract missing violation information from 'conclusion' with regular expression
violation_regex = r'Violati?on (?:of|de) *(?:Art\.? *|Article *)?([P\-0-9]+)'
nonviolation_regex = r'No violati?on (?:of|de) *(?:Art\.? *|Article *)?([P\-0-9]+)'
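# Quick check of both patterns on a hand-written sample string (hypothetical,
# not from the dataset). Capitalisation matters: 'Violation' (capital V) does
# not match inside 'No violation', so the two patterns do not overlap.
sample = 'Violation of Art. 6;No violation of Art. 13'
print(re.findall(violation_regex, sample))     # -> ['6']
print(re.findall(nonviolation_regex, sample))  # -> ['13']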
# Apply regular expression to extract all mentions of violations from 'conclusion'
df = echr.loc[~echr['violation_information'], 'conclusion'].str.extractall(
violation_regex).unstack(fill_value=None)
df.columns = [0, 1]
# Join multiple violations with ';' into one string
df.loc[df[1].notna(), 0] += ';' + df.loc[df[1].notna(), 1]
# Feed the retrieved values back into 'echr'
echr.loc[~echr['violation_information'], 'violation'] = df[0]
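# Mini-demo of the extractall -> unstack -> join pipeline on a hand-made
# conclusion string (hypothetical):
demo = pd.Series(['Violation of Art. 6;Violation of Art. 13']).str.extractall(
    violation_regex).unstack()
demo.columns = [0, 1]
demo.loc[demo[1].notna(), 0] += ';' + demo.loc[demo[1].notna(), 1]
print(demo[0].tolist())  # -> ['6;13']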
# Apply regular expression to extract all mentions of non-violations from 'conclusion'
df = echr.loc[~echr['violation_information'], 'conclusion'].str.extractall(
nonviolation_regex).unstack(fill_value=None)
df.columns = [0] # At most one match for non-violations is found
# Feed the retrieved values back into 'echr'
echr.loc[~echr['violation_information'], 'nonviolation'] = df[0]
# Recalculate 'violation_information' after sanitising data
echr['violation_information'] = echr['violation'].notna() | echr['nonviolation'].notna()
print((~echr['violation_information']).sum()) # Number of cases with missing 'violation' and 'nonviolation'
# Check 'conclusion' for the cases with missing violation information
# to see if the regular expression matched all obvious cases
echr.loc[~echr['violation_information'], 'conclusion'].unique()
# Find unique article combinations in 'article'
# Article combinations are indicated with '+', so only split on ';'
df = echr['article'].dropna().str.split(';', expand=True)
combinations = pd.concat([df[col].dropna() for col in df.columns]).unique()
combinations
# Number of unique article combinations
len(combinations)
# Find unique articles in 'article'
# Split on '+' or ';'
df = echr['article'].dropna().str.split(r';|\+', expand=True)
articles = pd.concat([df[col].dropna() for col in df.columns]).unique()
articles
# Find unique article combinations **violated**
df = echr['violation'].dropna().str.split(';', expand=True)
pd.concat([df[col].dropna() for col in df.columns]).unique()
# Find unique article combinations **not violated**
df = echr['nonviolation'].dropna().str.split(';', expand=True)
pd.concat([df[col].dropna() for col in df.columns]).unique()
# Find unique articles **violated** or **not violated**
# Concatenate matches on 'violation' and 'nonviolation'
df = pd.concat(
    [echr['violation'].dropna().str.split(r';|\+', expand=True),
     echr['nonviolation'].dropna().str.split(r';|\+', expand=True)],
axis=1)
# Then, only keep unique values across all columns
pd.concat([df.iloc[:, i].dropna() for i in range(len(df.columns))]).unique()
# Check how many unique articles have been found violated or not violated
# These are far fewer than the articles concerned according to the 'article' column
len(pd.concat([df.iloc[:, i].dropna() for i in range(len(df.columns))]).unique())
# For each article x, set boolean variable 'violation_x' to whether it has been violated in the corresponding case
# Analogously for non-violations
# 'concerned_x' is 'violation_x' OR 'nonviolation_x' to indicate whether x has been at stake in the case
for article in articles:
    echr['concerned_' + article] = echr['article'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', regex=True, na=False)
    echr['violation_' + article] = echr['violation'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', regex=True, na=False)
    echr['nonviolation_' + article] = echr['nonviolation'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', regex=True, na=False)
# Defragment dataframe in memory to improve performance
echr = echr.copy()
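# Quick illustration of the boundary lookarounds on hypothetical strings:
# '6' matches as Art. 6 (also at the head of '6-1') but not inside '16',
# '61', or the protocol sub-article 'P1-6'.
for s in ['6', '6-1', '16', '61', 'P1-6']:
    print(s, bool(re.search(r'(?<!\d|-)6(?!\d)', s)))
# -> 6 True, 6-1 True, 16 False, 61 False, P1-6 False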
# Generate keys for ease of access
violation = 'violation_' + articles
nonviolation = 'nonviolation_' + articles
concerned = 'concerned_' + articles
# Generate descriptive information on the cases concerning each article
# The average 'importance_int', 'num_respondents', 'key_case' where any given article is concerned
article_importance = pd.concat(
[echr.loc[echr[article], ['importance_int', 'num_respondents', 'key_case']].mean()
for article in concerned],
keys=articles, axis=1).T
# The number of cases in which any given article is concerned, violated, and non-violated
article_importance['num_cases'] = echr[concerned].sum().tolist()
article_importance['num_violations'] = echr[violation].sum().tolist()
article_importance['num_nonviolations'] = echr[nonviolation].sum().tolist()
article_importance
# Per article, the number of cases in which it is concerned
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(echr[concerned].sum().sort_values(ascending=False))
# Filter out articles that are at stake in only a few cases
key_articles = articles[echr[concerned].sum() >= 2000]
key_violation = ['violation_' + article for article in key_articles]
key_nonviolation = ['nonviolation_' + article for article in key_articles]
key_concerned = ['concerned_' + article for article in key_articles]
key_articles
# Function to prettify article names
def pretty_article(article_string):
    return 'Art. ' + re.sub(r"-(\w+)", r"(\1)",
        re.sub(r"^P(\d+)-(.*)", r"\2 of Protocol No. \1", article_string))
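# Spot checks (inputs follow the dataset's article notation):
for a in ['6', '6-1', 'P1-1', 'P1-1-1']:
    print(a, '->', pretty_article(a))
# -> Art. 6, Art. 6(1), Art. 1 of Protocol No. 1, Art. 1(1) of Protocol No. 1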
# Number of judgments per year per article concerned
# Note: A case may concern several articles
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').sum()[key_concerned], label=list(map(pretty_article, key_articles)))
plt.grid()
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Judgments')
plt.title('Timeline: Number of Judgments per Article concerned')
# Plot the average importance level of cases concerning each article
df = article_importance.sort_values('importance_int', ascending=False)
# Exclude articles with 500 or fewer cases
df = df[df['num_cases'] > 500]
plt.figure(figsize=(7, 20))
plt.grid(axis='x')
# Horizontal bar plot
plt.barh(
df.index,
df['importance_int'],
height=(df['num_cases'] / df['num_cases'].max()), # Make the height of the bars proportional to the 'num_cases'
label=list(map(pretty_article, df.index)),
zorder=2)
# Draw in the average importance level across all cases
avg_importance = echr['importance_int'].mean()
plt.axvline(avg_importance, color='r')
plt.yticks(ticks=range(len(df)), labels=list(map(pretty_article, df.index)))
#plt.xlabel('ECHR Articles')
plt.xlabel('Importance level')
plt.title('Average importance per ECHR article\n\nBar Width: number of cases relating to this article\nRed line: Average importance level across all cases')
# Plot the average number of respondents of cases concerning each article
df = article_importance.sort_values('num_respondents', ascending=True)
# Exclude articles with 500 or fewer cases
df = df[df['num_cases'] > 500]
plt.figure(figsize=(7, 20))
plt.grid(axis='x')
# Horizontal bar plot
plt.barh(
df.index,
df['num_respondents'],
height=(df['num_cases'] / df['num_cases'].max()), # Make the height of the bars proportional to the 'num_cases'
label=list(map(pretty_article, df.index)),
zorder=2)
# Draw in the average number of respondents across all cases
avg_respondents = echr['num_respondents'].mean()
plt.axvline(avg_respondents, color='r')
plt.yticks(ticks=range(len(df)), labels=list(map(pretty_article, df.index)))
plt.xlim(1, 1.05) # Note: the x-axis starts at 1 because the difference between
                  # the values is too small to be visible otherwise
#plt.xlabel('ECHR Articles')
plt.xlabel('Number of respondents')
plt.title('Average number of respondents per ECHR article\n\nBar Width: number of cases relating to this article\nRed line: Average number of respondents across all cases')
# Number of judgments per year concerning article 14
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').sum()["concerned_14"], label="concerned_14")
plt.grid()
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Judgments')
plt.title('Timeline: Number of Judgments per Year concerning Art. 14 ECHR')
# Number of judgments per year concerning Articles 6, 13, and 14
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').sum()[["concerned_6", "concerned_13", "concerned_14"]],
label=["Article 6", "Article 13", "Article 14"])
plt.grid()
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Judgments')
plt.title('Timeline: Number of Judgments per Year concerning Articles 6, 13, and 14 ECHR')
# Importance of Articles 6, 13, 14 cases over time
plt.figure(figsize=(15, 10))
# Filter for whether each Article is concerned, then
# group by year to calculate average importance level
# Note: the groups of cases concerning different articles are not mutually exclusive
plt.plot(echr[echr['concerned_6']].groupby('year').mean()['importance_int'], label='Article 6')
plt.plot(echr[echr['concerned_13']].groupby('year').mean()['importance_int'], label='Article 13')
plt.plot(echr[echr['concerned_14']].groupby('year').mean()['importance_int'], label='Article 14')
plt.grid()
plt.legend()
plt.ylabel('Importance level')
plt.xlabel('Year')
plt.title('Average importance level of cases per Article concerned by year')
# Article 6
# Independence of importance level
print(pd.crosstab(echr['importance_int'], echr['concerned_6'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['importance_int'], echr['concerned_6'])), "\n\n")
# Independence of number of respondents
print(pd.crosstab(echr['num_respondents'], echr['concerned_6'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['num_respondents'], echr['concerned_6'])))
# Article 13
# Independence of importance level
print(pd.crosstab(echr['importance_int'], echr['concerned_13'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['importance_int'], echr['concerned_13'])), "\n\n")
# Independence of number of respondents
print(pd.crosstab(echr['num_respondents'], echr['concerned_13'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['num_respondents'], echr['concerned_13'])))
# Article 14
# Independence of importance level
print(pd.crosstab(echr['importance_int'], echr['concerned_14'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['importance_int'], echr['concerned_14'])), "\n\n")
# Independence of number of respondents
print(pd.crosstab(echr['num_respondents'], echr['concerned_14'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['num_respondents'], echr['concerned_14'])))
# Article 1(1) of Protocol No. 1
# Independence of importance level
print(pd.crosstab(echr['importance_int'], echr['concerned_P1-1-1'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['importance_int'], echr['concerned_P1-1-1'])), "\n\n")
# Independence of number of respondents
print(pd.crosstab(echr['num_respondents'], echr['concerned_P1-1-1'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['num_respondents'], echr['concerned_P1-1-1'])))
# Compare the number of cases in each importance level which ...
# (blue) concern Article 6
# (orange) concern Article 6(1)
# Note: these sets of cases are not mutually exclusive
plt.figure(figsize=(7.5, 5))
x = np.array([0, 1, 2, 3])
plt.bar(x - 0.22, echr['importance_int'][echr['concerned_6']].value_counts().loc[x], 0.4, label='... Article 6')
plt.bar(x + 0.22, echr['importance_int'][echr['concerned_6-1']].value_counts().loc[x], 0.4, label='... Article 6(1)')
plt.xticks(ticks=x, labels=x)
plt.legend(loc='upper left')
plt.ylabel('Number of cases concerning ...')
plt.xlabel('Importance level')
plt.title('Number of cases in each importance level\nrelating to Article 6 and 6(1)')
# Compare the number of cases in each importance level where ...
# (blue) a non-violation of Article 14 was found
# (orange) a violation of Article 14 was found
# Note: these sets of cases are not mutually exclusive
print((echr['violation_14'] & echr['nonviolation_14']).sum())  # Cases with both a violation and a non-violation of Art. 14
plt.figure(figsize=(7.5, 5))
x = np.array([0, 1, 2, 3])
plt.bar(x - 0.22, echr['importance_int'][echr['violation_14']].value_counts().loc[x], 0.4, label='... violation of Art. 14')
plt.bar(x + 0.22, echr['importance_int'][echr['nonviolation_14']].value_counts().loc[x], 0.4, label='... non-violation of Art. 14')
plt.xticks(ticks=x, labels=x)
plt.legend(loc='upper left')
plt.ylabel('Number of cases with ...')
plt.xlabel('Importance level')
plt.title('Number of cases in each importance level with\nviolation vs. non-violation of Article 14')
# Compare the number of cases in each importance level where ...
# (left) Article 14 is concerned
# (right) Article 14 is not concerned
# Note: these sets of cases *are* mutually exclusive
# Cross-tabulate the importance level and whether 14 is concerned
df = pd.crosstab(echr['concerned_14'], echr['importance_int'])
# Divide by the total to get the share of each importance level
df = df.divide(df.sum(axis=1), axis=0)
plt.figure(figsize=(7.5, 5))
# Plot stacked bars
plt.bar([1, 0], df[3], bottom=df[2] + df[1] + df[0], label='Importance level 3', color='#CAD593')
plt.bar([1, 0], df[2], bottom=df[1] + df[0], label='Importance level 2', color='#A1C349')
plt.bar([1, 0], df[1], bottom=df[0], label='Importance level 1', color='#87A330')
plt.bar([1, 0], df[0], label='Importance level 0', color='#243010')
plt.legend()
plt.title('Distribution of importance levels\nfor Article 14 concerned vs. not concerned')
plt.ylabel('Share of cases per importance level')
plt.xlabel('Article 14...')
plt.xticks(ticks=[0, 1], labels=['...concerned', '...not concerned'])
df
# Extend the previous plot to Articles 6 and 13 for comparison
# Cross-tabulate the importance level and concern for each article
df = pd.concat([
pd.crosstab(echr['concerned_6'], echr['importance_int']),
pd.crosstab(echr['concerned_13'], echr['importance_int']),
pd.crosstab(echr['concerned_14'], echr['importance_int'])],
axis=0)
# Divide by the total to get the share of each importance level
df = df.divide(df.sum(axis=1), axis=0)
plt.figure(figsize=(15, 5))
# Plot stacked bars in groups
# At x = 0 and 1 for Art. 6
#      3 and 4 for Art. 13
#      6 and 7 for Art. 14
plt.bar([1, 0, 4, 3, 7, 6], df[3], bottom=df[2] + df[1] + df[0], label='Importance level 3', color='#CAD593')
plt.bar([1, 0, 4, 3, 7, 6], df[2], bottom=df[1] + df[0], label='Importance level 2', color='#A1C349')
plt.bar([1, 0, 4, 3, 7, 6], df[1], bottom=df[0], label='Importance level 1', color='#87A330')
plt.bar([1, 0, 4, 3, 7, 6], df[0], label='Importance level 0', color='#243010')
#plt.legend(loc=(.2475, .7))
plt.legend(loc='upper left')
plt.title('Distribution of importance levels\nfor Articles 6, 13, 14 concerned vs. not concerned')
plt.ylabel('Share of cases per importance level')
#plt.xlabel('')
plt.xticks(ticks=[0, 1, 3, 4, 6, 7],
labels=['Art. 6 concerned', '...not concerned',
'Art. 13 concerned', '...not concerned',
'Art. 14 concerned', '...not concerned'])
# Overview of the number of violations found per case,
# tabulated against the importance level
pd.crosstab(echr['num_violations'], echr['importance_int'])
## Prepare the data for the violin plots (the required packages are already imported above)
echrNum = echr.copy()
# Recalculate the derived variables; note these duplicate columns use spaces in their names
echrNum['num respondents'] = (echr['respondent'].str.len() + 1) // 4
echrNum['one respondent'] = (echr['num_respondents'] == 1)
echrNum['one respondent_int'] = (echr['num_respondents'] == 1).astype(int)
echrNum['key case'] = (echr['importance'] == 'Key case')
echrNum['importance_int'] = echr['importance'].replace('Key case', 0).astype(int)
echrNum['num violations'] = echr['violation'].str.split(';').str.len()
print(Counter(echrNum['importance_int']))
print(Counter(echrNum['num violations']))
echrNum = echrNum[echrNum['num violations'].notna()]
echrNum['num violations'] = echrNum['num violations'].map(int)
print(Counter(echrNum['num violations']))
print(Counter(echrNum['num respondents']))
## Violin showing the KDE of Importance/Multiple Respondents/Article Violations Distributions
# Note: we suggest bw=0.7, since with 0.6 a dot appears between 1 and 2,
# which would not make sense for article violations.
# N.B.: Our exploratory analysis so far makes clear that the data are heavily discretised.
fig, ax = plt.subplots()
sns.violinplot(x='importance_int', y='num violations', data=echrNum, hue='one respondent',
               bw=0.7, gridsize=200, scale='width', inner='quartile', split=True, cut=0, ax=ax)
ax.set_ylim(1,10)
ax.set_yticks(range(1,11))
fig.set_size_inches(8.5, 6)
plt.xlabel('Importance Level')
plt.ylabel('Article Violations')
plt.title('KDE of Importance/Multiple Respondents/Article Violations Distributions')
## Violin illustrating KDE of Importance/Multiple Respondents/Separate Opinion Distributions
fig, ax = plt.subplots()
sns.violinplot(x='separateopinion', y='importance_int', data=echrNum, hue='one respondent',
               bw=0.5, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(0,3)
ax.set_yticks(range(0,4))
fig.set_size_inches(8.5, 6)
plt.xlabel('Presence of a Separate Opinion')
plt.ylabel('Importance Level')
plt.title('KDE of Importance/Multiple Respondents/Separate Opinion Distributions')
# Absolute and relative frequencies of each value in 'separateopinion'
absfrequency = echr['separateopinion'].value_counts(sort=True)
relfrequency = echr['separateopinion'].value_counts(normalize=True).round(2)
print(absfrequency)
print(relfrequency)
## Violin illustrating KDE of Article Violations/Multiple Respondents/Separate Opinion Distributions
fig, ax = plt.subplots()
sns.violinplot(x='separateopinion', y='num violations', data=echrNum, hue='one respondent',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(1,10)
ax.set_yticks(range(1,11))
fig.set_size_inches(8.5, 6)
plt.xlabel('Presence of a Separate Opinion')
plt.ylabel('Article Violations')
plt.title('KDE of Art. Violations/Multiple Respondents/Separate Opinion Distributions')
## Violin illustrating KDE of Importance/Number of Respondents/Separate Opinion Distributions
fig, ax = plt.subplots()
sns.violinplot(x='num respondents', y='importance_int', data=echrNum, hue='separateopinion',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(0,3)
ax.set_yticks(range(0,4))
fig.set_size_inches(8.5, 6)
plt.xlabel('Number of Respondents')
plt.ylabel('Importance Level')
plt.title('KDE of Importance/Number of Respondents/Separate Opinion Distributions')
## Violin illustrating KDE of Importance/Chamber Category/Separate Opinion Distributions
# First remove the 'Committee' category, as we only want to visualise
# judgments of the Grand Chamber and the Chamber.
categoryRemoved = echrNum[echrNum['doctypebranch'] != 'COMMITTEE']
fig, ax = plt.subplots()
sns.violinplot(x='doctypebranch', y='importance_int', data=categoryRemoved, hue='separateopinion',
               bw=0.5, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(0,3)
ax.set_yticks(range(0,4))
fig.set_size_inches(8.5, 6)
plt.xlabel('Category of Chamber')
plt.ylabel('Importance Level')
plt.title('KDE of Importance/Chamber Category/Separate Opinion Distributions')
## Violin illustrating KDE of Article Violations/Chamber Category/Separate Opinion Distributions
fig, ax = plt.subplots()
sns.violinplot(x='doctypebranch', y='num violations', data=echrNum, hue='separateopinion',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
fig.set_size_inches(8.5, 6)
plt.xlabel('Category of Chamber')
plt.ylabel('Article Violations')
plt.title('KDE of Article Violations/Chamber Category/Separate Opinion Distributions')
# >>> Moved to scrap
O = pd.crosstab(index=echr['key_case'], columns=echr['one_respondent'], margins=True)
O
# >>> Moved to scrap
E = pd.concat(
3 * [O.loc[:, 'All']],
keys=[True, False, 'All'], axis=1
).mul(O.loc['All', :], axis=1) / O.loc['All', 'All']
E
# >>> Moved to scrap
O.plot(kind='bar')
# >>> Moved to scrap
# Calculate chi squared statistic
((O - E)**2 / E).sum().sum()
# >>> Moved to scrap
stats.chi2_contingency(O.iloc[0:2, 0:2], correction=False)
# >>> Moved to scrap
stats.fisher_exact(O.iloc[0:2, 0:2])
# >>> Moved to scrap
stats.barnard_exact(O.iloc[0:2, 0:2])
# >>> Moved to scrap
# Find unique articles violated or not violated
articles = pd.concat(
    tuple(zip(*echr['violation'].dropna().str.split(r';|\+', expand=True).items()))[1] \
    + tuple(zip(*echr['nonviolation'].dropna().str.split(r';|\+', expand=True).items()))[1]
).dropna().unique()
articles
# >>> Moved to scrap
plt.figure(figsize=(15, 10))
plt.scatter(article_importance['num_respondents'], article_importance['importance_int'], s=article_importance['num_cases'], alpha=.3)
plt.xlabel('Average number of respondents')
plt.ylabel('Average importance level')
plt.title('Article importance vs. number of respondents\n(marker size: number of cases)')
# >>> Moved to scrap
# Share of judgments per year concerning article 14
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').mean()["concerned_14"], label="concerned_14")
plt.grid()
plt.legend()
plt.xlabel('Year')
plt.ylabel('Share of Judgments')
plt.title('Timeline: Share of Judgments per Year concerning Art. 14 ECHR')
# >>> Moved to scrap
article_importance['num_respondents'].plot.kde()
plt.xlabel('Average number of respondents per article')
plt.title('KDE of the average number of respondents across articles')
plt.grid()
# >>> Moved to scrap
plt.scatter(echr['num_violations'], echr['importance_int'])
# >>> Moved to scrap
df = echr[['num_violations', 'importance_int']].value_counts().sort_index().reset_index()
plt.scatter(df['num_violations'], df['importance_int'], s=df[0], alpha=.8)
plt.xlabel('Article Violations')
plt.ylabel('Importance Level')
plt.title('Scattered Distribution of Importance/Article Violations')
# >>> Moved to scrap
x = np.array([0, 1, 2, 3])
plt.bar(x - 0.22,
echr['importance_int'][echr['concerned_14']].value_counts().loc[x] / echr['concerned_14'].sum(),
0.4, label='Concerned')
plt.bar(x + 0.22,
echr['importance_int'][~echr['concerned_14']].value_counts().loc[x] / (~echr['concerned_14']).sum(),
0.4, label='Not concerned')
plt.title('Relative importance of Article 14 cases vs. other cases')
plt.legend()
plt.show()
# >>> Moved to scrap
stats.chi2_contingency(pd.crosstab(echr['article'].isna(), echr['importance_int']))
# >>> Moved to scrap
# Average importance of cases with and without violation information
print(echr.loc[echr['violation_information'], 'importance_int'].mean())
print(echr.loc[~echr['violation_information'], 'importance_int'].mean())
pd.crosstab(echr['article'].isna(), echr['importance_int'])
# >>> Moved to scrap
# Find unique individual articles at stake
articles = pd.concat(
    [echr['violation'].dropna().str.split(r';|\+', expand=True)[i] for i in range(22)]
    + [echr['nonviolation'].dropna().str.split(r';|\+', expand=True)[i] for i in range(22)]
).dropna().unique()
article_violation = pd.concat(
    [echr['violation'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', na=False)
for article in articles],
keys=articles, axis=1)
article_nonviolation = pd.concat(
    [echr['nonviolation'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', na=False)
for article in articles],
keys=articles, axis=1)
#Reindex
article_concerned = article_nonviolation | article_violation
article_concerned = article_concerned.reindex(sorted(article_concerned.columns), axis=1)
key_articles = articles[article_concerned.sum() >= 500]
print(key_articles)
# Cleanup & scrap after changing the naming conventions for `article_concerned`,
# `article_violation`, and `article_nonviolation`. With `x` the article number:
# `article_concerned[x]`    => `echr['concerned_x']`
# `article_violation[x]`    => `echr['violation_x']`
# `article_nonviolation[x]` => `echr['nonviolation_x']`
# >>> Violin moved to scrap
# Scrapped: here we were simply testing how Python would react.
fig, ax = plt.subplots()
sns.violinplot(x='importance_int', y='one respondent_int', data=echrNum, hue='separateopinion',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(0,1)
ax.set_yticks(range(0,2))
fig.set_size_inches(8.5, 6)
# >>> Violin moved to scrap
# Scrapped: here we were simply testing how Python would react.
fig, ax = plt.subplots()
sns.violinplot(x='importance_int', y='num respondents', data=echrNum, hue='separateopinion',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
fig.set_size_inches(8.5, 6)