Capstone Project: SportsStats (Week3)
Importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
pd.set_option('display.float_format', lambda x: '%.2f' % x)
Data
df_atletas = pd.read_csv('/work/data/athlete_events.csv')
df_paises = pd.read_csv('/work/data/noc_regions.csv')
Merge DataFrames
SELECT A.ID,
A.Name,
A.Sex,
A.Age,
A.Height,
A.Weight,
A.Medal,
A.Team,
P.Region AS 'Region',
A.Year,
A.Season,
A.City,
A.Games,
A.Event,
A.Sport
FROM df_atletas A
LEFT JOIN df_paises P
ON A.NOC = P.NOC
Null values
valores_nulos = df_juegos.isna()
columnas_nulas = valores_nulos.any()
columnas_nulas = columnas_nulas.reset_index().values.tolist()
print([i for i in columnas_nulas if i[1]==True])
Region Input
Countries with most medals won by medal type
SELECT Region,
SUM(case Medal when 'Gold' then 1 else 0 end) AS Count_Gold,
SUM(case Medal when 'Silver' then 1 else 0 end) AS Count_Silver,
SUM(case Medal when 'Bronze' then 1 else 0 end) AS Count_Bronze,
Count(Medal) AS Total
FROM df_juegos
WHERE Medal IS NOT NULL --AND Region like '%Per%'
GROUP BY Region
ORDER BY Total DESC
df_usa = df_juegos.query('Region=="USA"')
df_usa.Medal.value_counts()
df_rusia = df_juegos.query('Region=="Russia"')
df_rusia.Medal.value_counts()
Sports with the highest number of winners
top_10_sport_usa = df_usa.Sport.value_counts().sort_values(ascending=False).head(10)
top_10_sport_usa
Top 10 Athletics USA - Athletics
top_10_Athletics_usa = df_usa.query('Sport=="Athletics"').Name.value_counts().sort_values(ascending=False).head(10)
top_10_Athletics_usa
Top 10 Athletics USA - Swimming
top_10_Swimming_usa = df_usa.query('Sport=="Swimming"').Name.value_counts().sort_values(ascending=False).head(10)
top_10_Swimming_usa
Top10 Sport of USA
plt.figure(figsize=(12,6))
sn.barplot(x=top_10_sport_usa.index, y=top_10_sport_usa)
plt.title('Top 10 Sport of USA')
Comparison between countries
input_region_1
Peru
input_region_2
Chile
SELECT Region,
Year,
Sum(CASE medal
WHEN 'Gold' THEN 1
ELSE 0
END) AS Count_Gold,
Sum(CASE medal
WHEN 'Silver' THEN 1
ELSE 0
END) AS Count_Silver,
Sum(CASE medal
WHEN 'Bronze' THEN 1
ELSE 0
END) AS Count_Bronze,
Count(medal) AS Total
FROM df_juegos
WHERE Region in ({{input_region_1}}, {{input_region_2}}) AND Medal IS NOT NULL
GROUP BY Region, Year
ORDER BY Region, Year
SELECT Region, SUM(Total)
FROM df_region_medal
GROUP BY Region
Athletes Top
SELECT T.*
FROM (SELECT id,
NAME,
year,
sport,
city,
Sum(CASE medal
WHEN 'Gold' THEN 1
ELSE 0
END) AS Count_Gold,
Sum(CASE medal
WHEN 'Silver' THEN 1
ELSE 0
END) AS Count_Silver,
Sum(CASE medal
WHEN 'Bronze' THEN 1
ELSE 0
END) AS Count_Bronze,
Count(medal) AS Total
FROM df_juegos
WHERE medal IS NOT NULL
GROUP BY id,
NAME,
year,
sport,
city
ORDER BY total DESC,
count_gold DESC) T
INNER JOIN
(SELECT sport,
Max(count_gold) AS 'Max_Count_Gold',
Max(total) AS 'Max_Total'
FROM df_top
WHERE total > 1
GROUP BY sport) M
ON T.Sport = M.Sport
AND T.Count_Gold = M.Max_Count_Gold
AND T.Total = M.Max_Total
Participation by year
SELECT Year, Season, Count(*) as Total
FROM df_atletas
GROUP BY Year, Season
ORDER BY Year
SELECT region,
season,
Round(Avg(count_athletes), 0) AS 'avg_participation'
FROM (SELECT region,
season,
year,
Count(id) AS 'Count_Athletes'
FROM df_juegos
WHERE region IS NOT NULL
GROUP BY region,
season,
year)
GROUP BY region,
season
ORDER BY 'avg_participation' DESC
Region Bottom
SELECT Region,
year,
sport,
Count(ID) AS Total
FROM df_juegos
WHERE medal IS NULL AND Region is not null
GROUP BY Region,
year,
sport
ORDER BY Total desc
LIMIT 1000
SELECT Region, Year, Sum(total) as 'Total'
FROM df_bottom
GROUP BY Region, YEAR
order by 3 desc
LIMIT 1000