# Run to view results
import pandas as pd
# Scrape a population table from Wikipedia, clean it, print it, and save it as CSV.

# Replace the URL with the Wikipedia page URL containing the population data
wikipedia_url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"

# pd.read_html returns a list with one DataFrame per HTML table it parses from the page
tables = pd.read_html(wikipedia_url)

# The population table is taken to be the first parsed table.
# NOTE(review): this index depends on the page layout -- inspect `tables` and
# adjust the index if the page structure differs.
population_table = tables[0]

# Rename columns here if needed, e.g.:
#   population_table = population_table.rename(columns={"OldColumn1": "NewColumn1"})

# Clean the data as a single chain. Each step returns a new DataFrame, so the
# 'Population' column is never assigned in place on the result of dropna()
# (in-place assignment there can raise SettingWithCopyWarning on some pandas versions).
#   1. drop rows that have a missing value in any column
#   2. convert 'Population' to numeric; unparseable entries become NaN
#   3. drop the rows whose 'Population' could not be converted
# NOTE(review): step 1 looks at every column, so a sparsely filled column would
# remove most rows -- confirm that this is intended for the table being read.
population_table = (
    population_table
    .dropna()
    .assign(Population=lambda df: pd.to_numeric(df['Population'], errors='coerce'))
    .dropna(subset=['Population'])
)

# Display the cleaned DataFrame
print(population_table)

# Save the cleaned DataFrame to a CSV file (without the index column)
population_table.to_csv('population_data.csv', index=False)
# Run to view results
import pandas as pd
# Minimal merge example: join an ISO-code table with a population table.

# ISO codes only
isocodes_data = dict(ISO_Code=['US', 'CA', 'GB', 'IN'])
isocodes = pd.DataFrame(isocodes_data)

# Population per ISO code (sample values -- replace with your actual population data)
pop_data = dict(
    ISO_Code=['US', 'CA', 'GB', 'IN'],
    Population=[331, 38, 67, 1380],
)
pop = pd.DataFrame(pop_data)

# Inner join on the shared 'ISO_Code' column: keeps only codes present in both frames
merged_df = isocodes.merge(pop, on='ISO_Code', how='inner')

# Show the result and write it to disk without the index column
print(merged_df)
merged_df.to_csv('merged_data.csv', index=False)
# Run to view results
import pandas as pd
# Three-way merge example: ISO codes + GDP per capita + population, all keyed on 'ISO_Code'.

# Country names per ISO code
isocodes_data = dict(
    ISO_Code=['US', 'CA', 'GB', 'IN'],
    Country=['United States', 'Canada', 'United Kingdom', 'India'],
)
isocodes = pd.DataFrame(isocodes_data)

# GDP per capita per ISO code (sample values -- replace with your actual GDP per capita data)
gdp_per_capita_data = dict(
    ISO_Code=['US', 'CA', 'GB', 'IN'],
    GDP_Per_Capita=[65000, 48000, 42000, 2100],
)
gdp_per_capita = pd.DataFrame(gdp_per_capita_data)

# Population per ISO code (sample values -- replace with your actual population data)
pop_data = dict(
    ISO_Code=['US', 'CA', 'GB', 'IN'],
    Population=[331, 38, 67, 1380],
)
pop = pd.DataFrame(pop_data)

# Chain the two inner joins: first attach GDP per capita, then population
merged_df = (
    isocodes
    .merge(gdp_per_capita, on='ISO_Code', how='inner')
    .merge(pop, on='ISO_Code', how='inner')
)

# Show the result and write it to disk without the index column
print(merged_df)
merged_df.to_csv('merged_data.csv', index=False)
# Run to view results
import os
import matplotlib.pyplot as plt # Add this line to import matplotlib.pyplot
# Set up the project folders, then store the merged data and a figure in them.

# Folder layout: input data, exported output, and saved graphs
path = './data/'
pathout = './output/'
pathgraphs = './graphs/'

# Create each folder; exist_ok=True makes this a no-op when it is already there
for folder in (path, pathout, pathgraphs):
    os.makedirs(folder, exist_ok=True)

# Write the merged DataFrame (built in an earlier cell) into the 'data' folder
final_csv = os.path.join(path, 'final_merged_data.csv')
merged_df.to_csv(final_csv, index=False)

# Save a figure into the 'graphs' folder.
# NOTE(review): nothing is plotted in this cell before savefig is called --
# confirm that a figure is active at this point, otherwise the image may be empty.
plot_file = os.path.join(pathgraphs, 'example_plot.png')
plt.savefig(plot_file)
# Run to view results
import os
# Export the merged DataFrame (built in an earlier cell) as CSV, Excel and Stata files.

# Define folder paths
pathout = './output/'
# Ensure the output folder exists so this cell also works when the folder-setup
# cell has not been run; exist_ok=True makes it a no-op otherwise.
os.makedirs(pathout, exist_ok=True)

# Base name shared by all exported files (the extension is appended per format)
filename = 'Wiki_Data'

# Save as CSV
csv_file_path = os.path.join(pathout, f'{filename}.csv')
merged_df.to_csv(csv_file_path, index=False)
print(f'DataFrame saved as CSV: {csv_file_path}')

# Save as Excel (XLSX)
# NOTE(review): pandas needs an Excel writer engine installed for this call -- confirm it is available.
xlsx_file_path = os.path.join(pathout, f'{filename}.xlsx')
merged_df.to_excel(xlsx_file_path, index=False)
print(f'DataFrame saved as Excel (XLSX): {xlsx_file_path}')

# Save as Stata (write_index=False leaves the DataFrame index out of the file)
stata_file_path = os.path.join(pathout, f'{filename}.dta')
merged_df.to_stata(stata_file_path, write_index=False)
print(f'DataFrame saved as Stata: {stata_file_path}')
# Run to view results
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Draw four GDP-per-capita vs. population regression plots (linear, polynomial,
# lowess, robust) from merged_df and save each one as PNG, PDF and JPG.

# Define folder paths
pathgraphs = './graphs/'
# Ensure the graphs folder exists so savefig does not fail when the
# folder-setup cell has not been run.
os.makedirs(pathgraphs, exist_ok=True)

# Create regression plots
sns.set(style="whitegrid")


def _save_regplot(title, filename, **fit_options):
    """Plot Population against GDP_Per_Capita with a fitted curve and save it.

    Reads the module-level ``merged_df`` and writes into ``pathgraphs``.

    Args:
        title: Title shown above the plot.
        filename: Base file name (no extension) used for the saved images.
        **fit_options: Extra keyword arguments forwarded to ``sns.regplot``
            to select the kind of fit (e.g. ``order=2``, ``lowess=True``,
            ``robust=True``). With none given, regplot's default fit is used.
    """
    plt.figure(figsize=(10, 6))
    sns.regplot(x='GDP_Per_Capita', y='Population', data=merged_df,
                scatter_kws={'s': 20}, **fit_options)
    plt.title(title)
    plt.xlabel('GDP Per Capita')
    plt.ylabel('Population')
    plt.tight_layout()
    # Save the plot in different formats
    for ext in ['png', 'pdf', 'jpg']:
        file_path = os.path.join(pathgraphs, f'{filename}.{ext}')
        plt.savefig(file_path)
        print(f'Plot saved as {ext}: {file_path}')


# Scatter plot with a linear fit
_save_regplot('Scatter Plot with Linear Fit', 'regplot_linear_fit')
# Scatter plot with a polynomial fit
_save_regplot('Scatter Plot with Polynomial Fit', 'regplot_polynomial_fit', order=2)
# Scatter plot with a lowess fit
# NOTE(review): the lowess and robust fits presumably need statsmodels installed -- confirm.
_save_regplot('Scatter Plot with Lowess Fit', 'regplot_lowess_fit', lowess=True)
# Scatter plot with a robust fit
_save_regplot('Scatter Plot with Robust Fit', 'regplot_robust_fit', robust=True)

# Display the plots
plt.show()
# Run to view results
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Self-contained example: build two small DataFrames, merge them on 'ISO_Code',
# and save a GDP-per-capita vs. population-growth regression plot.

# Define folder paths
path = './data/'
pathout = './output/'
pathgraphs = './graphs/'

# Create folders if they don't exist
for folder in (path, pathout, pathgraphs):
    os.makedirs(folder, exist_ok=True)

# Example: Creating two DataFrames
data1 = {'ISO_Code': ['US', 'CA', 'GB', 'IN'],
         'GDP_Per_Capita': [65000, 48000, 42000, 2100]}
data2 = {'ISO_Code': ['US', 'CA', 'GB', 'IN'],
         'Population_Growth': [2.1, 1.2, 0.8, 1.7]}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Merge DataFrames on 'ISO_Code'.
# Note: this rebinds merged_df, replacing any frame of that name from earlier cells.
merged_df = pd.merge(df1, df2, on='ISO_Code', how='inner')

# Check the columns in merged_df
print("Columns of merged_df:", merged_df.columns)

# Create regression plots
sns.set(style="whitegrid")

# Scatter plot with a linear fit
plt.figure(figsize=(10, 6))
sns.regplot(x='GDP_Per_Capita', y='Population_Growth', data=merged_df, scatter_kws={'s': 20})
plt.title('Scatter Plot with Linear Fit')
plt.xlabel('GDP Per Capita')
plt.ylabel('Population Growth')
plt.tight_layout()

# Save the plot in different formats
filename = 'regplot_linear_fit_population_growth'
for ext in ['png', 'pdf', 'jpg']:
    file_path = os.path.join(pathgraphs, f'{filename}.{ext}')
    plt.savefig(file_path)
    print(f'Plot saved as {ext}: {file_path}')

# Continue with additional regression plots or other analyses...

# Display the plots
plt.show()
# Run to view results