# Import all the packages we need:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## For statistical tests:
from scipy import stats
## For violin plots:
import seaborn as sns
## For regular expressions:
import re
## Miscellaneous helpers:
import random
from collections import Counter
## Load the dataset into a DataFrame named 'echr'
echr = pd.read_csv('ECtHR.csv', index_col=0)
echr
echr.describe(include='all')
echr['respondent'].unique()
# This shows all respondents, i.e. both cases against a single state and
# cases against multiple states (the latter carry a ';' between the codes).
# Calculate the number of respondents:
# each respondent is a 3-letter code joined by ';', so a string with
# n respondents has length 4n - 1 and (len + 1) // 4 == n
echr['num_respondents'] = (echr['respondent'].str.len() + 1) // 4
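# Sanity check of the length arithmetic above (assuming the 3-letter-code
# convention holds throughout): cross-check against a direct separator count.
mask = echr['respondent'].notna()
assert (echr.loc[mask, 'num_respondents']
        == echr.loc[mask, 'respondent'].str.count(';') + 1).all()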
echr['one_respondent'] = (echr['num_respondents'] == 1)
# Add variable 'key_case' which is true for the most important cases
echr['key_case'] = (echr['importance'] == 'Key case')
# For ease of calculation, add new column 'importance_int' for importance level with type int
# Importance level of 'Key case' corresponds to 0
echr['importance_int'] = echr['importance'].replace('Key case', 0).astype(int)
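# Spot check of the mapping on a hand-made Series (hypothetical values):
print(pd.Series(['Key case', '1', '3']).replace('Key case', 0).astype(int).tolist())
# -> [0, 1, 3]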
# Convert boolean variable 'separateopinion' into float for ease of calculation later on
# (groupby().mean() drops boolean columns)
echr['separateopinion_float'] = echr['separateopinion'].astype(float)
# Calculate the number of individual violations found
echr['num_violations'] = echr['violation'].str.split(';').str.len()
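# Spot check of the split-and-count on hand-made values (hypothetical):
print(pd.Series(['6', '6;13', None]).str.split(';').str.len().tolist())
# -> [1.0, 2.0, nan]  (NaN is preserved where no violation is recorded)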
# Comparing the average importance level for one respondent vs multiple respondents
echr.groupby('one_respondent').mean()[['importance_int', 'separateopinion_float']]
# Comparing the average importance level for any number of respondents
echr.groupby('num_respondents').mean()['importance_int']
# Checking the extremes (comparing the percentage of the most important cases and of the least important cases) for one respondent
echr.loc[echr['one_respondent'], 'importance_int'].value_counts(normalize=True)
# Checking the extremes (comparing the percentage of the most important cases and of the least important cases) for multiple respondents
echr.loc[~echr['one_respondent'], 'importance_int'].value_counts(normalize=True)
# Comparing the percentages for one respondent vs multiple respondents in the following situation:
# given a certain importance level, what is the likelihood that a separateopinion was filed
echr.groupby(['importance_int', 'one_respondent']).mean()['separateopinion_float']
pd.crosstab(index=echr['importance'], columns=echr['num_respondents'], margins=True)
stats.chi2_contingency(
pd.crosstab(index=echr['importance'], columns=echr['num_respondents'], margins=False))
stats.chi2_contingency(
pd.crosstab(index=echr['key_case'], columns=echr['one_respondent'], margins=False),
correction=True) # Yates' correction for 2x2 contingency tables
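# For readability, the same test can be unpacked into its components
# (chi2_contingency returns statistic, p-value, dof, expected frequencies):
chi2, p, dof, expected = stats.chi2_contingency(
    pd.crosstab(index=echr['key_case'], columns=echr['one_respondent']),
    correction=True)
print(f'chi2 = {chi2:.2f}, p = {p:.3g}, dof = {dof}')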
# Number of judgments per year
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').count()['ecli'])
plt.grid()
plt.xlabel('Year')
plt.ylabel('Number of Judgments')
plt.title('Timeline: Number of Judgments per Year')
# Average importance level of cases per year
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').mean()['importance_int'])
plt.ylim(0, 3)
plt.grid(axis='y')
plt.xlabel('Year')
plt.ylabel('Average importance level')
plt.title('Average importance level of cases per year\n(0 most important to 3 least important)')
# Number of cases per year and importance level, stacked
df = echr.groupby(['year', 'importance_int']).count()['ecli'].unstack().fillna(0)[[0, 1, 2, 3]]\
.cumsum(axis=1) # calling cumsum to stack the areas
plt.figure(figsize=(15, 10))
# Make stacked areas
plt.fill_between(df.index, df[2], df[3], color='#CAD593', label='Importance level 3')
plt.fill_between(df.index, df[1], df[2], color='#A1C349', label='Importance level 2')
plt.fill_between(df.index, df[0], df[1], color='#87A330', label='Importance level 1')
plt.fill_between(df.index, 0, df[0], color='#243010', label='Importance level 0')
plt.grid()
plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('Number of cases')
plt.title('Number of cases in each importance level per year')
# When writing our research paper, we need to clarify explicitly whether the data point in 2009 is 1600 or 1100.
# Share of cases per year and importance level, stacked
df = echr.groupby(['year', 'importance_int']).count()['ecli'].unstack().fillna(0)[[0, 1, 2, 3]]
# First dividing each year by total number of cases, then calling cumsum to stack the areas
df = df.divide(df.sum(axis=1), axis=0).cumsum(axis=1)
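# Mini-demo of the divide + cumsum stacking on hypothetical counts:
demo = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
print(demo.divide(demo.sum(axis=1), axis=0).cumsum(axis=1))
# -> 0.1, 0.3, 0.6, 1.0: each fill_between band spans one level's share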
plt.figure(figsize=(15, 10))
# Make stacked areas
plt.fill_between(df.index, df[2], df[3], color='#CAD593', label='Importance level 3')
plt.fill_between(df.index, df[1], df[2], color='#A1C349', label='Importance level 2')
plt.fill_between(df.index, df[0], df[1], color='#87A330', label='Importance level 1')
plt.fill_between(df.index, 0, df[0], color='#243010', label='Importance level 0')
plt.grid()
plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('Share of cases')
plt.title('Share of cases in each importance level per year')
# Some rows do not indicate which articles are concerned in the case
print(echr['article'].isna().sum())
# Try to infer some of the articles from 'conclusion'
echr.loc[echr['article'].isna(), 'conclusion'].unique()
# Use regular expressions to find article mentions in 'conclusion'
echr.loc[echr['article'].isna(), 'conclusion'].str.extract(r"(P?\d[0-9a-z\-]*)", expand=False).unique()
# Fill in gaps in 'article'
echr.loc[echr['article'].isna(), 'article'] \
    = echr.loc[echr['article'].isna(), 'conclusion'].str.extract(r"(P?\d[0-9a-z\-]*)", expand=False)
# Some rows indicate neither violations nor non-violations.
# This may distort some of the following calculations.
# Add boolean variable 'violation_information' for masking
echr['violation_information'] = echr['violation'].notna() | echr['nonviolation'].notna()
print((~echr['violation_information']).sum()) # Number of cases with missing 'violation' and 'nonviolation'
# Check whether 'conclusion' contains information on the cases with missing 'violation' and 'nonviolation'
echr.loc[~echr['violation_information'], 'conclusion'].unique()
# Try to extract missing violation information from 'conclusion' with regular expression
violation_regex = r'Violati?on (?:of|de) *(?:Art\.? *|Article *)?([P\-0-9]+)'
nonviolation_regex = r'No violati?on (?:of|de) *(?:Art\.? *|Article *)?([P\-0-9]+)'
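# Quick check of both patterns on a hand-written sample string (hypothetical,
# not from the dataset). Capitalisation matters: 'Violation' (capital V) does
# not match inside 'No violation', so the two patterns do not overlap.
sample = 'Violation of Art. 6;No violation of Art. 13'
print(re.findall(violation_regex, sample))     # -> ['6']
print(re.findall(nonviolation_regex, sample))  # -> ['13']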
# Apply regular expression to extract all mentions of violations from 'conclusion'
df = echr.loc[~echr['violation_information'], 'conclusion'].str.extractall(
violation_regex).unstack(fill_value=None)
df.columns = [0, 1]
# Join multiple violations with ';' into one string
df.loc[df[1].notna(), 0] += ';' + df.loc[df[1].notna(), 1]
# Feed the retrieved values back into 'echr'
echr.loc[~echr['violation_information'], 'violation'] = df[0]
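# Mini-demo of the extractall -> unstack -> join pipeline on a hand-made
# conclusion string (hypothetical):
demo = pd.Series(['Violation of Art. 6;Violation of Art. 13']).str.extractall(
    violation_regex).unstack()
demo.columns = [0, 1]
demo.loc[demo[1].notna(), 0] += ';' + demo.loc[demo[1].notna(), 1]
print(demo[0].tolist())  # -> ['6;13']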
# Apply regular expression to extract all mentions of non-violations from 'conclusion'
df = echr.loc[~echr['violation_information'], 'conclusion'].str.extractall(
nonviolation_regex).unstack(fill_value=None)
df.columns = [0] # At most one match for non-violations is found
# Feed the retrieved values back into 'echr'
echr.loc[~echr['violation_information'], 'nonviolation'] = df[0]
# Recalculate 'violation_information' after sanitising data
echr['violation_information'] = echr['violation'].notna() | echr['nonviolation'].notna()
print((~echr['violation_information']).sum()) # Number of cases with missing 'violation' and 'nonviolation'
# Check 'conclusion' for the cases with missing violation information
# to see if the regular expression matched all obvious cases
echr.loc[~echr['violation_information'], 'conclusion'].unique()
# Find unique article combinations in 'article'
# Article combinations are indicated with '+', so only split on ';'
df = echr['article'].dropna().str.split(';', expand=True)
combinations = pd.concat([df[col].dropna() for col in df.columns]).unique()
combinations
# Number of unique article combinations
len(combinations)
# Find unique articles in 'article'
# Split on '+' or ';'
df = echr['article'].dropna().str.split(r';|\+', expand=True)
articles = pd.concat([df[col].dropna() for col in df.columns]).unique()
articles
# Find unique article combinations **violated**
df = echr['violation'].dropna().str.split(';', expand=True)
pd.concat([df[col].dropna() for col in df.columns]).unique()
# Find unique article combinations **not violated**
df = echr['nonviolation'].dropna().str.split(';', expand=True)
pd.concat([df[col].dropna() for col in df.columns]).unique()
# Find unique articles **violated** or **not violated**
# Concatenate matches on 'violation' and 'nonviolation'
df = pd.concat(
    [echr['violation'].dropna().str.split(r';|\+', expand=True),
     echr['nonviolation'].dropna().str.split(r';|\+', expand=True)],
axis=1)
# Then, only keep unique values across all columns
pd.concat([df.iloc[:, i].dropna() for i in range(len(df.columns))]).unique()
# Check how many unique articles have been found violated or not violated
# These are far fewer than the articles concerned according to the 'article' column
len(pd.concat([df.iloc[:, i].dropna() for i in range(len(df.columns))]).unique())
# For each article x, set boolean variable 'violation_x' to whether it has been violated in the corresponding case
# Analogously for non-violations
# 'concerned_x' is 'violation_x' OR 'nonviolation_x' to indicate whether x has been at stake in the case
for article in articles:
    echr['concerned_' + article] = echr['article'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', regex=True, na=False)
    echr['violation_' + article] = echr['violation'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', regex=True, na=False)
    echr['nonviolation_' + article] = echr['nonviolation'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', regex=True, na=False)
# Defragment dataframe in memory to improve performance
echr = echr.copy()
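# Quick illustration of the boundary lookarounds on hypothetical strings:
# '6' matches as Art. 6 (also at the head of '6-1') but not inside '16',
# '61', or the protocol sub-article 'P1-6'.
for s in ['6', '6-1', '16', '61', 'P1-6']:
    print(s, bool(re.search(r'(?<!\d|-)6(?!\d)', s)))
# -> 6 True, 6-1 True, 16 False, 61 False, P1-6 False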
# Generate keys for ease of access
violation = 'violation_' + articles
nonviolation = 'nonviolation_' + articles
concerned = 'concerned_' + articles
# Generate descriptive information on the cases concerning each article
# The average 'importance_int', 'num_respondents', 'key_case' where any given article is concerned
article_importance = pd.concat(
[echr.loc[echr[article], ['importance_int', 'num_respondents', 'key_case']].mean()
for article in concerned],
keys=articles, axis=1).T
# The number of cases in which any given article is concerned, violated, and non-violated
article_importance['num_cases'] = echr[concerned].sum().tolist()
article_importance['num_violations'] = echr[violation].sum().tolist()
article_importance['num_nonviolations'] = echr[nonviolation].sum().tolist()
article_importance
# Per article, the number of cases in which it is concerned
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(echr[concerned].sum().sort_values(ascending=False))
# Filter out articles that are at stake in only a few cases
key_articles = articles[echr[concerned].sum() >= 2000]
key_violation = ['violation_' + article for article in key_articles]
key_nonviolation = ['nonviolation_' + article for article in key_articles]
key_concerned = ['concerned_' + article for article in key_articles]
key_articles
# Function to prettify article names
def pretty_article(article_string):
    return 'Art. ' + re.sub(r"-(\w+)", r"(\1)",
        re.sub(r"^P(\d+)-(.*)", r"\2 of Protocol No. \1", article_string))
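# Spot checks (inputs follow the dataset's article notation):
for a in ['6', '6-1', 'P1-1', 'P1-1-1']:
    print(a, '->', pretty_article(a))
# -> Art. 6, Art. 6(1), Art. 1 of Protocol No. 1, Art. 1(1) of Protocol No. 1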
# Number of judgments per year per article concerned
# Note: A case may concern several articles
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').sum()[key_concerned], label=list(map(pretty_article, key_articles)))
plt.grid()
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Judgments')
plt.title('Timeline: Number of Judgments per Article concerned')
# Plot the average importance level of cases concerning each article
df = article_importance.sort_values('importance_int', ascending=False)
# Exclude articles with 500 or fewer cases
df = df[df['num_cases'] > 500]
plt.figure(figsize=(7, 20))
plt.grid(axis='x')
# Horizontal bar plot
plt.barh(
df.index,
df['importance_int'],
height=(df['num_cases'] / df['num_cases'].max()), # Make the height of the bars proportional to the 'num_cases'
label=list(map(pretty_article, df.index)),
zorder=2)
# Draw in the average importance level across all cases
avg_importance = echr['importance_int'].mean()
plt.axvline(avg_importance, color='r')
plt.yticks(ticks=range(len(df)), labels=list(map(pretty_article, df.index)))
#plt.xlabel('ECHR Articles')
plt.xlabel('Importance level')
plt.title('Average importance per ECHR article\n\nBar Width: number of cases relating to this article\nRed line: Average importance level across all cases')
# Plot the average number of respondents of cases concerning each article
df = article_importance.sort_values('num_respondents', ascending=True)
# Exclude articles with 500 or fewer cases
df = df[df['num_cases'] > 500]
plt.figure(figsize=(7, 20))
plt.grid(axis='x')
# Horizontal bar plot
plt.barh(
df.index,
df['num_respondents'],
height=(df['num_cases'] / df['num_cases'].max()), # Make the height of the bars proportional to the 'num_cases'
label=list(map(pretty_article, df.index)),
zorder=2)
# Draw in the average number of respondents across all cases
avg_respondents = echr['num_respondents'].mean()
plt.axvline(avg_respondents, color='r')
plt.yticks(ticks=range(len(df)), labels=list(map(pretty_article, df.index)))
plt.xlim(1, 1.05) # Note: the x-axis starts at 1 because the difference between
                  # the values is too small to be visible otherwise
#plt.xlabel('ECHR Articles')
plt.xlabel('Number of respondents')
plt.title('Average number of respondents per ECHR article\n\nBar Width: number of cases relating to this article\nRed line: Average number of respondents across all cases')
# Number of judgments per year concerning article 14
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').sum()["concerned_14"], label="concerned_14")
plt.grid()
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Judgments')
plt.title('Timeline: Number of Judgments per Year concerning Art. 14 ECHR')
# Number of judgments per year concerning Articles 6, 13, and 14
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').sum()[["concerned_6", "concerned_13", "concerned_14"]],
label=["Article 6", "Article 13", "Article 14"])
plt.grid()
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Judgments')
plt.title('Timeline: Number of Judgments per Year concerning Articles 6, 13, and 14 ECHR')
# Importance of Articles 6, 13, 14 cases over time
plt.figure(figsize=(15, 10))
# Filter for whether each Article is concerned, then
# group by year to calculate average importance level
# Note: the groups of cases concerning different articles are not mutually exclusive
plt.plot(echr[echr['concerned_6']].groupby('year').mean()['importance_int'], label='Article 6')
plt.plot(echr[echr['concerned_13']].groupby('year').mean()['importance_int'], label='Article 13')
plt.plot(echr[echr['concerned_14']].groupby('year').mean()['importance_int'], label='Article 14')
plt.grid()
plt.legend()
plt.ylabel('Importance level')
plt.xlabel('Year')
plt.title('Average importance level of cases per Article concerned by year')
# Article 6
# Independence of importance level
print(pd.crosstab(echr['importance_int'], echr['concerned_6'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['importance_int'], echr['concerned_6'])), "\n\n")
# Independence of number of respondents
print(pd.crosstab(echr['num_respondents'], echr['concerned_6'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['num_respondents'], echr['concerned_6'])))
# Article 13
# Independence of importance level
print(pd.crosstab(echr['importance_int'], echr['concerned_13'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['importance_int'], echr['concerned_13'])), "\n\n")
# Independence of number of respondents
print(pd.crosstab(echr['num_respondents'], echr['concerned_13'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['num_respondents'], echr['concerned_13'])))
# Article 14
# Independence of importance level
print(pd.crosstab(echr['importance_int'], echr['concerned_14'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['importance_int'], echr['concerned_14'])), "\n\n")
# Independence of number of respondents
print(pd.crosstab(echr['num_respondents'], echr['concerned_14'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['num_respondents'], echr['concerned_14'])))
# Article 1(1) of Protocol No. 1
# Independence of importance level
print(pd.crosstab(echr['importance_int'], echr['concerned_P1-1-1'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['importance_int'], echr['concerned_P1-1-1'])), "\n\n")
# Independence of number of respondents
print(pd.crosstab(echr['num_respondents'], echr['concerned_P1-1-1'], margins=True), "\n")
print(stats.chi2_contingency(pd.crosstab(echr['num_respondents'], echr['concerned_P1-1-1'])))
# Compare the number of cases in each importance level which ...
# (blue) concern Article 6
# (orange) concern Article 6(1)
# Note: these sets of cases are not mutually exclusive
plt.figure(figsize=(7.5, 5))
x = np.array([0, 1, 2, 3])
plt.bar(x - 0.22, echr['importance_int'][echr['concerned_6']].value_counts().loc[x], 0.4, label='... Article 6')
plt.bar(x + 0.22, echr['importance_int'][echr['concerned_6-1']].value_counts().loc[x], 0.4, label='... Article 6(1)')
plt.xticks(ticks=x, labels=x)
plt.legend(loc='upper left')
plt.ylabel('Number of cases concerning ...')
plt.xlabel('Importance level')
plt.title('Number of cases in each importance level\nrelating to Article 6 and 6(1)')
# Compare the number of cases in each importance level where ...
# (blue) a non-violation of Article 14 was found
# (orange) a violation of Article 14 was found
# Note: these sets of cases are not mutually exclusive
print((echr['violation_14'] & echr['nonviolation_14']).sum())  # Cases with both a violation and a non-violation of Art. 14
plt.figure(figsize=(7.5, 5))
x = np.array([0, 1, 2, 3])
plt.bar(x - 0.22, echr['importance_int'][echr['violation_14']].value_counts().loc[x], 0.4, label='... violation of Art. 14')
plt.bar(x + 0.22, echr['importance_int'][echr['nonviolation_14']].value_counts().loc[x], 0.4, label='... non-violation of Art. 14')
plt.xticks(ticks=x, labels=x)
plt.legend(loc='upper left')
plt.ylabel('Number of cases with ...')
plt.xlabel('Importance level')
plt.title('Number of cases in each importance level with\nviolation vs. non-violation of Article 14')
# Compare the number of cases in each importance level where ...
# (left) Article 14 is concerned
# (right) Article 14 is not concerned
# Note: these sets of cases *are* mutually exclusive
# Cross-tabulate the importance level and whether 14 is concerned
df = pd.crosstab(echr['concerned_14'], echr['importance_int'])
# Divide by the total to get the share of each importance level
df = df.divide(df.sum(axis=1), axis=0)
plt.figure(figsize=(7.5, 5))
# Plot stacked bars
plt.bar([1, 0], df[3], bottom=df[2] + df[1] + df[0], label='Importance level 3', color='#CAD593')
plt.bar([1, 0], df[2], bottom=df[1] + df[0], label='Importance level 2', color='#A1C349')
plt.bar([1, 0], df[1], bottom=df[0], label='Importance level 1', color='#87A330')
plt.bar([1, 0], df[0], label='Importance level 0', color='#243010')
plt.legend()
plt.title('Distribution of importance levels\nfor Article 14 concerned vs. not concerned')
plt.ylabel('Share of cases per importance level')
plt.xlabel('Article 14...')
plt.xticks(ticks=[0, 1], labels=['...concerned', '...not concerned'])
df
# Extend the previous plot to Articles 6 and 13 for comparison
# Cross-tabulate the importance level and concern for each article
df = pd.concat([
pd.crosstab(echr['concerned_6'], echr['importance_int']),
pd.crosstab(echr['concerned_13'], echr['importance_int']),
pd.crosstab(echr['concerned_14'], echr['importance_int'])],
axis=0)
# Divide by the total to get the share of each importance level
df = df.divide(df.sum(axis=1), axis=0)
plt.figure(figsize=(15, 5))
# Plot stacked bars in groups
# At x = 0 and 1 for Art. 6
#      3 and 4 for Art. 13
#      6 and 7 for Art. 14
plt.bar([1, 0, 4, 3, 7, 6], df[3], bottom=df[2] + df[1] + df[0], label='Importance level 3', color='#CAD593')
plt.bar([1, 0, 4, 3, 7, 6], df[2], bottom=df[1] + df[0], label='Importance level 2', color='#A1C349')
plt.bar([1, 0, 4, 3, 7, 6], df[1], bottom=df[0], label='Importance level 1', color='#87A330')
plt.bar([1, 0, 4, 3, 7, 6], df[0], label='Importance level 0', color='#243010')
#plt.legend(loc=(.2475, .7))
plt.legend(loc='upper left')
plt.title('Distribution of importance levels\nfor Articles 6, 13, 14 concerned vs. not concerned')
plt.ylabel('Share of cases per importance level')
#plt.xlabel('')
plt.xticks(ticks=[0, 1, 3, 4, 6, 7],
labels=['Art. 6 concerned', '...not concerned',
'Art. 13 concerned', '...not concerned',
'Art. 14 concerned', '...not concerned'])
# Overview of the number of violations found per case,
# tabulated against the importance level
pd.crosstab(echr['num_violations'], echr['importance_int'])
## Prepare the data for the violin plots (the required packages are already imported above)
echrNum = echr.copy()
# Recalculate the derived variables; note these duplicate columns use spaces in their names
echrNum['num respondents'] = (echr['respondent'].str.len() + 1) // 4
echrNum['one respondent'] = (echr['num_respondents'] == 1)
echrNum['one respondent_int'] = (echr['num_respondents'] == 1).astype(int)
echrNum['key case'] = (echr['importance'] == 'Key case')
echrNum['importance_int'] = echr['importance'].replace('Key case', 0).astype(int)
echrNum['num violations'] = echr['violation'].str.split(';').str.len()
print(Counter(echrNum['importance_int']))
print(Counter(echrNum['num violations']))
echrNum = echrNum[echrNum['num violations'].notna()]
echrNum['num violations'] = echrNum['num violations'].map(int)
print(Counter(echrNum['num violations']))
print(Counter(echrNum['num respondents']))
## Violin showing the KDE of Importance/Multiple Respondents/Article Violations Distributions
# Note: we suggest bw=0.7, since with 0.6 a dot appears between 1 and 2,
# which would not make sense for article violations.
# N.B.: Our exploratory analysis so far makes clear that the data are heavily discretised.
fig, ax = plt.subplots()
sns.violinplot(x='importance_int', y='num violations', data=echrNum, hue='one respondent',
               bw=0.7, gridsize=200, scale='width', inner='quartile', split=True, cut=0, ax=ax)
ax.set_ylim(1,10)
ax.set_yticks(range(1,11))
fig.set_size_inches(8.5, 6)
plt.xlabel('Importance Level')
plt.ylabel('Article Violations')
plt.title('KDE of Importance/Multiple Respondents/Article Violations Distributions')
## Violin illustrating KDE of Importance/Multiple Respondents/Separate Opinion Distributions
fig, ax = plt.subplots()
sns.violinplot(x='separateopinion', y='importance_int', data=echrNum, hue='one respondent',
               bw=0.5, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(0,3)
ax.set_yticks(range(0,4))
fig.set_size_inches(8.5, 6)
plt.xlabel('Presence of a Separate Opinion')
plt.ylabel('Importance Level')
plt.title('KDE of Importance/Multiple Respondents/Separate Opinion Distributions')
# Absolute and relative frequencies of each value in 'separateopinion'
absfrequency = echr['separateopinion'].value_counts(sort=True)
relfrequency = echr['separateopinion'].value_counts(normalize=True).round(2)
print(absfrequency)
print(relfrequency)
## Violin illustrating KDE of Article Violations/Multiple Respondents/Separate Opinion Distributions
fig, ax = plt.subplots()
sns.violinplot(x='separateopinion', y='num violations', data=echrNum, hue='one respondent',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(1,10)
ax.set_yticks(range(1,11))
fig.set_size_inches(8.5, 6)
plt.xlabel('Presence of a Separate Opinion')
plt.ylabel('Article Violations')
plt.title('KDE of Art. Violations/Multiple Respondents/Separate Opinion Distributions')
## Violin illustrating KDE of Importance/Number of Respondents/Separate Opinion Distributions
fig, ax = plt.subplots()
sns.violinplot(x='num respondents', y='importance_int', data=echrNum, hue='separateopinion',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(0,3)
ax.set_yticks(range(0,4))
fig.set_size_inches(8.5, 6)
plt.xlabel('Number of Respondents')
plt.ylabel('Importance Level')
plt.title('KDE of Importance/Number of Respondents/Separate Opinion Distributions')
## Violin illustrating KDE of Importance/Chamber Category/Separate Opinion Distributions
# First remove the 'Committee' category, as we only want to visualise
# judgments of the Grand Chamber and the Chamber.
categoryRemoved = echrNum[echrNum['doctypebranch'] != 'COMMITTEE']
fig, ax = plt.subplots()
sns.violinplot(x='doctypebranch', y='importance_int', data=categoryRemoved, hue='separateopinion',
               bw=0.5, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(0,3)
ax.set_yticks(range(0,4))
fig.set_size_inches(8.5, 6)
plt.xlabel('Category of Chamber')
plt.ylabel('Importance Level')
plt.title('KDE of Importance/Chamber Category/Separate Opinion Distributions')
## Violin illustrating KDE of Article Violations/Chamber Category/Separate Opinion Distributions
fig, ax = plt.subplots()
sns.violinplot(x='doctypebranch', y='num violations', data=echrNum, hue='separateopinion',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
fig.set_size_inches(8.5, 6)
plt.xlabel('Category of Chamber')
plt.ylabel('Article Violations')
plt.title('KDE of Article Violations/Chamber Category/Separate Opinion Distributions')
# >>> Moved to scrap
O = pd.crosstab(index=echr['key_case'], columns=echr['one_respondent'], margins=True)
O
# >>> Moved to scrap
E = pd.concat(
3 * [O.loc[:, 'All']],
keys=[True, False, 'All'], axis=1
).mul(O.loc['All', :], axis=1) / O.loc['All', 'All']
E
# >>> Moved to scrap
O.plot(kind='bar')
# >>> Moved to scrap
# Calculate chi squared statistic
((O - E)**2 / E).sum().sum()
# >>> Moved to scrap
stats.chi2_contingency(O.iloc[0:2, 0:2], correction=False)
# >>> Moved to scrap
stats.fisher_exact(O.iloc[0:2, 0:2])
# >>> Moved to scrap
stats.barnard_exact(O.iloc[0:2, 0:2])
# >>> Moved to scrap
# Find unique articles violated or not violated
articles = pd.concat(
    tuple(zip(*echr['violation'].dropna().str.split(r';|\+', expand=True).items()))[1] \
    + tuple(zip(*echr['nonviolation'].dropna().str.split(r';|\+', expand=True).items()))[1]
).dropna().unique()
articles
# >>> Moved to scrap
plt.figure(figsize=(15, 10))
plt.scatter(article_importance['num_respondents'], article_importance['importance_int'], s=article_importance['num_cases'], alpha=.3)
plt.xlabel('Average number of respondents')
plt.ylabel('Average importance level')
plt.title('Article importance vs. number of respondents\n(marker size: number of cases)')
# >>> Moved to scrap
# Share of judgments per year concerning article 14
plt.figure(figsize=(15, 10))
plt.plot(echr.groupby('year').mean()["concerned_14"], label="concerned_14")
plt.grid()
plt.legend()
plt.xlabel('Year')
plt.ylabel('Share of Judgments')
plt.title('Timeline: Share of Judgments per Year concerning Art. 14 ECHR')
# >>> Moved to scrap
article_importance['num_respondents'].plot.kde()
plt.xlabel('Average number of respondents per article')
plt.title('KDE of the average number of respondents across articles')
plt.grid()
# >>> Moved to scrap
plt.scatter(echr['num_violations'], echr['importance_int'])
# >>> Moved to scrap
df = echr[['num_violations', 'importance_int']].value_counts().sort_index().reset_index()
plt.scatter(df['num_violations'], df['importance_int'], s=df[0], alpha=.8)
plt.xlabel('Article Violations')
plt.ylabel('Importance Level')
plt.title('Scattered Distribution of Importance/Article Violations')
# >>> Moved to scrap
x = np.array([0, 1, 2, 3])
plt.bar(x - 0.22,
echr['importance_int'][echr['concerned_14']].value_counts().loc[x] / echr['concerned_14'].sum(),
0.4, label='Concerned')
plt.bar(x + 0.22,
echr['importance_int'][~echr['concerned_14']].value_counts().loc[x] / (~echr['concerned_14']).sum(),
0.4, label='Not concerned')
plt.title('Relative importance of Article 14 cases vs. other cases')
plt.legend()
plt.show()
# >>> Moved to scrap
stats.chi2_contingency(pd.crosstab(echr['article'].isna(), echr['importance_int']))
# >>> Moved to scrap
# Average importance of cases with and without violation information
print(echr.loc[echr['violation_information'], 'importance_int'].mean())
print(echr.loc[~echr['violation_information'], 'importance_int'].mean())
pd.crosstab(echr['article'].isna(), echr['importance_int'])
# >>> Moved to scrap
# Find unique individual articles at stake
articles = pd.concat(
    [echr['violation'].dropna().str.split(r';|\+', expand=True)[i] for i in range(22)]
    + [echr['nonviolation'].dropna().str.split(r';|\+', expand=True)[i] for i in range(22)]
).dropna().unique()
article_violation = pd.concat(
    [echr['violation'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', na=False)
for article in articles],
keys=articles, axis=1)
article_nonviolation = pd.concat(
    [echr['nonviolation'].str.contains(r'(?<!\d|-)' + article + r'(?!\d)', na=False)
for article in articles],
keys=articles, axis=1)
#Reindex
article_concerned = article_nonviolation | article_violation
article_concerned = article_concerned.reindex(sorted(article_concerned.columns), axis=1)
key_articles = articles[article_concerned.sum() >= 500]
print(key_articles)
# Cleanup & scrap after changing the naming conventions for `article_concerned`,
# `article_violation`, and `article_nonviolation`. With `x` the article number:
# `article_concerned[x]`    => `echr['concerned_x']`
# `article_violation[x]`    => `echr['violation_x']`
# `article_nonviolation[x]` => `echr['nonviolation_x']`
# >>> Violin moved to scrap
# Scrapped: here we were simply testing how Python would react.
fig, ax = plt.subplots()
sns.violinplot(x='importance_int', y='one respondent_int', data=echrNum, hue='separateopinion',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
ax.set_ylim(0,1)
ax.set_yticks(range(0,2))
fig.set_size_inches(8.5, 6)
# >>> Violin moved to scrap
# Scrapped: here we were simply testing how Python would react.
fig, ax = plt.subplots()
sns.violinplot(x='importance_int', y='num respondents', data=echrNum, hue='separateopinion',
               bw=0.3, gridsize=200, scale='width', inner='box', split=True, cut=0, ax=ax)
fig.set_size_inches(8.5, 6)