import pandas as pd
# Load the full survey export. Column 0 is declared as `object` up front so
# pandas does not have to infer its type; the original author added this to
# avoid a load-time error (presumably the mixed-type DtypeWarning -- TODO confirm).
df_survey = pd.read_csv(
    'kaggle_survey_2020_responses.csv',
    dtype={'Time from Start to Finish (seconds)': object},
)

# Remove the column and row display limits so the complete structure of the
# dataframe is printed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Quick look at the raw frame: first three rows, dimensions, the first ten
# dtypes and the first five rows of the transposed describe() summary.
df_survey.iloc[:3]
df_survey.shape
df_survey.dtypes.iloc[:10]
df_survey.describe().T.iloc[:5]

# Read the same file again, this time keeping only columns Q1, Q2 and Q3.
df_survey0 = pd.read_csv(
    'kaggle_survey_2020_responses.csv',
    usecols=['Q1', 'Q2', 'Q3'],
)
df_survey0.iloc[:5]
# Separate the first row from the rest: it is kept, transposed to one row per
# column name, as `questions`; every following row becomes `data`.
questions = df_survey.head(1).T
data = df_survey.iloc[1:]
questions.iloc[:5]
# Percentage of each distinct Q1 value, rounded to two decimals and ordered by
# the value itself; shown first as a table and then as a bar chart.
q1_pct = data['Q1'].value_counts(normalize=True).mul(100).round(2).sort_index()
q1_pct
q1_pct.plot.bar(color="mediumturquoise")

# Keep the row display limit disabled so the grouped table below is not truncated.
pd.options.display.max_rows = None

# Count the Q4 answers for every (Q3, Q4) pair, ordered by ascending Q3 and,
# within each Q3, by descending count.
(
    data[['Q3', 'Q4']]
    .groupby(['Q3', 'Q4'])
    .agg(count=('Q4', 'count'))
    .sort_values(['Q3', 'count'], ascending=[True, False])
)
# Columns analysed from here on; a single list drives both selections so the
# question rows and the answer columns cannot drift apart.
selected_columns = ['Q3', 'Q4', 'Q5', 'Q7_Part_1', 'Q24']
questions2 = questions.loc[selected_columns]
# Explicit copy: data2 is modified further down, and an independent frame makes
# that assignment unambiguous instead of depending on SettingWithCopyWarning
# being silenced.
data2 = data[selected_columns].copy()
# Remove the column-width limit so the questions are displayed in full.
pd.set_option('display.max_colwidth', None)
questions2
data2.head()
# Replace a long answer with a shortened label.
# NOTE(review): the global warning switch is kept as in the original because
# code outside this section may rely on it -- confirm before removing.
pd.options.mode.chained_assignment = None
data2['Q4'] = data2['Q4'].replace(
    ["Some college/university study without earning a bachelor’s degree"],
    "University study without degree",
)

# Percentage of rows per Q4 answer, missing answers included.
# The axis and value names are set explicitly: pandas < 2.0 produced the
# columns 'index' and 'Q4' after reset_index(), while pandas >= 2.0 produces
# 'Q4' and 'proportion', which made the x='index' lookup below raise KeyError.
q4 = (
    data2['Q4']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
    .rename_axis('index')
    .reset_index(name='Q4')
)
q4.plot.bar(
    x='index',
    y='Q4',
    xlabel='Level of formal education',
    title='Highest level of formal education (or plan to attain within the next 2 years)',
    color="lightpink",
    legend=False,
)