1. Introduction to Data Analysis with Pandas

Run to view results

# Scrape a population table from Wikipedia, clean it, and save it as CSV.
import pandas as pd

# Replace the URL with the Wikipedia page URL containing the population data
wikipedia_url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"

# Read the tables from the Wikipedia page (one DataFrame per HTML table)
tables = pd.read_html(wikipedia_url)

# Identify the table that contains the population data based on your page
# structure. You may need to inspect the 'tables' list and pick another index;
# here the population table is assumed to be the first one.
population_table = tables[0]

# Clean the data. These are common cleaning steps; customize them for your data.

# Drop rows with missing values. The explicit copy makes the column assignment
# below operate on an independent frame instead of a possible view.
population_table = population_table.dropna().copy()

# Rename columns if needed, e.g.:
# population_table = population_table.rename(columns={"OldColumn1": "NewColumn1", "OldColumn2": "NewColumn2"})

# Fail with a readable message when the selected table has no such column
# (wrong table index, or the page layout changed).
if 'Population' not in population_table.columns:
    raise KeyError(
        "'Population' column not found in the selected table; "
        f"available columns: {list(population_table.columns)}"
    )

# Convert population data to numeric type; unparseable values become NaN
population_table['Population'] = pd.to_numeric(population_table['Population'], errors='coerce')

# Drop rows with missing or invalid population values
population_table = population_table.dropna(subset=['Population'])

# Display the cleaned DataFrame
print(population_table)

# Optionally, save the cleaned DataFrame to a new CSV file
population_table.to_csv('population_data.csv', index=False)

Run to view results

# Merge a table of ISO codes with a population table on their shared key.
import pandas as pd

# Example: creating a DataFrame named 'isocodes' (ISO codes data)
isocodes_data = {'ISO_Code': ['US', 'CA', 'GB', 'IN']}
isocodes = pd.DataFrame(isocodes_data)

# Example: creating a DataFrame named 'pop' (population data).
# Replace with your actual population data.
pop_data = {'ISO_Code': ['US', 'CA', 'GB', 'IN'], 'Population': [331, 38, 67, 1380]}
pop = pd.DataFrame(pop_data)

# Inner merge: keep only ISO codes present in both frames
merged_df = pd.merge(left=isocodes, right=pop, on='ISO_Code', how='inner')

# Display the merged DataFrame
print(merged_df)

# Optionally, save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_data.csv', index=False)

Run to view results

# Chain two merges to combine ISO codes, GDP per capita and population.
import pandas as pd

# Example of creating a DataFrame named 'isocodes'
isocodes_data = {
    'ISO_Code': ['US', 'CA', 'GB', 'IN'],
    'Country': ['United States', 'Canada', 'United Kingdom', 'India'],
}
isocodes = pd.DataFrame(isocodes_data)

# Example of creating a DataFrame named 'gdp_per_capita'.
# Replace with your actual GDP per capita data.
gdp_per_capita_data = {
    'ISO_Code': ['US', 'CA', 'GB', 'IN'],
    'GDP_Per_Capita': [65000, 48000, 42000, 2100],
}
gdp_per_capita = pd.DataFrame(gdp_per_capita_data)

# Example of creating a DataFrame named 'pop' (population data).
# Replace with your actual population data.
pop_data = {'ISO_Code': ['US', 'CA', 'GB', 'IN'], 'Population': [331, 38, 67, 1380]}
pop = pd.DataFrame(pop_data)

# Merge step by step on the shared 'ISO_Code' key; inner joins keep only
# the codes present in every frame.
merged_df = pd.merge(isocodes, gdp_per_capita, on='ISO_Code', how='inner')
merged_df = pd.merge(merged_df, pop, on='ISO_Code', how='inner')

# Display the merged DataFrame
print(merged_df)

# Optionally, save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_data.csv', index=False)

Run to view results

# Create the project folder layout and save the merged data and a figure.
# Expects 'merged_df' to have been created by an earlier cell.
import os

import matplotlib.pyplot as plt

# Define folder paths
path = './data/'
pathout = './output/'
pathgraphs = './graphs/'

# Create folders if they don't exist
os.makedirs(path, exist_ok=True)
os.makedirs(pathout, exist_ok=True)
os.makedirs(pathgraphs, exist_ok=True)

# Save merged DataFrame to a CSV file in the 'data' folder
merged_df.to_csv(os.path.join(path, 'final_merged_data.csv'), index=False)

# Save a figure to the 'graphs' folder. savefig writes the *current* figure,
# so draw the plot you want before this line; nothing is plotted in this cell.
plt.savefig(os.path.join(pathgraphs, 'example_plot.png'))

Run to view results

# Export the merged DataFrame as CSV, Excel and Stata files.
# Expects 'merged_df' to have been created by an earlier cell.
import os

# Define folder paths
pathout = './output/'

# Make sure the output folder exists so this cell also works when run alone
os.makedirs(pathout, exist_ok=True)

# Define the filename (extension is added per format below)
filename = 'Wiki_Data'

# Save as CSV
csv_file_path = os.path.join(pathout, f'{filename}.csv')
merged_df.to_csv(csv_file_path, index=False)
print(f'DataFrame saved as CSV: {csv_file_path}')

# Save as Excel (XLSX)
# NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) installed.
xlsx_file_path = os.path.join(pathout, f'{filename}.xlsx')
merged_df.to_excel(xlsx_file_path, index=False)
print(f'DataFrame saved as Excel (XLSX): {xlsx_file_path}')

# Save as Stata
stata_file_path = os.path.join(pathout, f'{filename}.dta')
merged_df.to_stata(stata_file_path, write_index=False)
print(f'DataFrame saved as Stata: {stata_file_path}')

Run to view results

# Draw GDP-per-capita vs. population regression plots with four different
# fits and save each one as PNG, PDF and JPG.
# Expects 'merged_df' (with 'GDP_Per_Capita' and 'Population' columns) to have
# been created by an earlier cell.
import os

import matplotlib.pyplot as plt
import seaborn as sns

# Define folder paths
pathgraphs = './graphs/'

# Make sure the target folder exists so this cell also works when run alone
os.makedirs(pathgraphs, exist_ok=True)

# Create regression plots
sns.set(style="whitegrid")

# One entry per figure: (plot title, output file stem, extra regplot options).
# NOTE(review): the lowess and robust fits presumably require statsmodels.
regression_fits = [
    ('Scatter Plot with Linear Fit', 'regplot_linear_fit', {}),
    ('Scatter Plot with Polynomial Fit', 'regplot_polynomial_fit', {'order': 2}),
    ('Scatter Plot with Lowess Fit', 'regplot_lowess_fit', {'lowess': True}),
    ('Scatter Plot with Robust Fit', 'regplot_robust_fit', {'robust': True}),
]

for title, filename, fit_options in regression_fits:
    # Each fit gets its own figure so the saved files do not overlap
    plt.figure(figsize=(10, 6))
    sns.regplot(
        x='GDP_Per_Capita',
        y='Population',
        data=merged_df,
        scatter_kws={'s': 20},
        **fit_options,
    )
    plt.title(title)
    plt.xlabel('GDP Per Capita')
    plt.ylabel('Population')
    plt.tight_layout()

    # Save the plot in different formats
    for ext in ['png', 'pdf', 'jpg']:
        file_path = os.path.join(pathgraphs, f'{filename}.{ext}')
        plt.savefig(file_path)
        print(f'Plot saved as {ext}: {file_path}')

# Display the plots
plt.show()

Run to view results

# Self-contained example: build two small DataFrames, merge them, and save a
# GDP-per-capita vs. population-growth regression plot in several formats.
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Define folder paths
path = './data/'
pathout = './output/'
pathgraphs = './graphs/'

# Create folders if they don't exist
os.makedirs(path, exist_ok=True)
os.makedirs(pathout, exist_ok=True)
os.makedirs(pathgraphs, exist_ok=True)

# Example: creating two DataFrames
data1 = {'ISO_Code': ['US', 'CA', 'GB', 'IN'], 'GDP_Per_Capita': [65000, 48000, 42000, 2100]}
data2 = {'ISO_Code': ['US', 'CA', 'GB', 'IN'], 'Population_Growth': [2.1, 1.2, 0.8, 1.7]}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Merge DataFrames on 'ISO_Code'
merged_df = pd.merge(df1, df2, on='ISO_Code', how='inner')

# Check the columns in merged_df
print("Columns of merged_df:", merged_df.columns)

# Create regression plots
sns.set(style="whitegrid")

# Scatter plot with a linear fit
plt.figure(figsize=(10, 6))
sns.regplot(x='GDP_Per_Capita', y='Population_Growth', data=merged_df, scatter_kws={'s': 20})
plt.title('Scatter Plot with Linear Fit')
plt.xlabel('GDP Per Capita')
plt.ylabel('Population Growth')
plt.tight_layout()

# Save the plot in different formats
filename = 'regplot_linear_fit_population_growth'
for ext in ['png', 'pdf', 'jpg']:
    file_path = os.path.join(pathgraphs, f'{filename}.{ext}')
    plt.savefig(file_path)
    print(f'Plot saved as {ext}: {file_path}')

# Continue with additional regression plots or other analyses...

# Display the plots
plt.show()

Run to view results