import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Load the raw "engineers employed with firms" export (Jul-Sep 2019).
df = pd.read_csv("/work/Data/engineers employed with firms-Jul-Aug-Sep-2019.csv")

# Any cell whose value equals one of the raw header labels is blanked to NaN.
# Presumably the export repeats its header row inside the data -- confirm
# against the raw CSV.
header_labels = [
    "Sr.",
    "PEC Reg No",
    "Engineer's Name",
    "Father's Name",
    "Firm's Name",
    "Firm's Lic No",
    "Firm's City",
    "Engineer's Employment Period",
    "Unnamed: 8",
]
df = df.replace(header_labels, np.nan)
df.head()
df.columns

# Raw header -> identifier-friendly column name.
column_names = {
    "Sr.": "ID",
    "PEC Reg No": "PEC_Reg_No",
    "Engineer's Name": "Engineer_Name",
    "Father's Name": "Father_Name",
    "Firm's Name": "Company_Name",
    "Firm's Lic No": "Firm_License",
    "Firm's City": "Firms_City",
    "Engineer's Employment Period": "Employment_Start_Date",
    "Unnamed: 8": "Employment_End_Date",
}
df = df.rename(columns=column_names)

# Ten most frequent firm cities, followed by a dtype / non-null summary.
df["Firms_City"].value_counts().head(10)
df.info()
# Histogram of rows per firm city, with the x categories ordered by
# descending total so the busiest cities come first.
fig = px.histogram(
    df,
    x="Firms_City",
    labels={"count": "Number of Employed Engineers", "Firms_City": "Cities"},
    title="Firm's Cities Distribution",
)
fig.update_xaxes(categoryorder="total descending")
fig.show()
# Parse both employment date columns and keep only the calendar date part.
for date_column in ("Employment_Start_Date", "Employment_End_Date"):
    df[date_column] = pd.to_datetime(df[date_column]).dt.date
df.head()

# Snapshot of the renamed / date-normalised table (row index included).
df.to_csv("/work/Clean Data/simple_PEC.csv", index=True)

# Employment duration as a timedelta: end date minus start date.
period_end = pd.to_datetime(df["Employment_End_Date"])
period_start = pd.to_datetime(df["Employment_Start_Date"])
df["Employment_period"] = period_end - period_start
df.head()
# Distribution of the raw timedelta employment period.
fig = px.histogram(
    df,
    x="Employment_period",
    title="Engineer's Employment Period Per Second",
    labels={"Employment_period": "Duration of Employment"},
)
fig.show()

# Same distribution expressed in years (whole days / 365). The resulting
# Series keeps the name "Employment_period", which is what `x` refers to.
Date = df["Employment_period"].dt.days / 365
fig = px.histogram(
    Date,
    x="Employment_period",
    title="Engineer's Employment Period Per Year",
    labels={"Employment_period": "Duration of Employment"},
)
fig.show()
# Split the "<prefix>/<number>" identifiers into two columns each.
# n=1 splits at the first "/" only, so a value containing additional slashes
# still yields exactly two parts; an unbounded split would produce extra
# columns and make the two-column assignment raise.
df[["Engineering_Degree", "PEC_No"]] = df["PEC_Reg_No"].str.split(
    "/", n=1, expand=True
)
df[["Firm_Category", "License_no"]] = df["Firm_License"].str.split(
    "/", n=1, expand=True
)
df.head()
# Row counts per engineering-degree prefix.
fig = px.histogram(
    df,
    x="Engineering_Degree",
    title="Type of Engineers Distribution",
    labels={"Engineering_Degree": "Type of Engineers"},
)
fig.show()

# Distinct degree prefixes present in the data.
df["Engineering_Degree"].unique()

# Row counts per firm-category prefix.
fig = px.histogram(df, x="Firm_Category")
fig.show()
def _top_pair_counts(frame, first, second, limit=100):
    """Count rows per (first, second) value pair.

    Returns a flat frame with columns ``first``, ``second`` and "counts",
    truncated to the first ``limit`` rows in ``value_counts`` order.
    """
    pair = [first, second]
    tallies = frame[pair].value_counts().rename_axis(pair).reset_index(name="counts")
    return tallies[:limit]


# Firm category mix per city.
Degree_City = _top_pair_counts(df, "Firms_City", "Firm_Category")
fig = px.bar(
    Degree_City,
    x="Firms_City",
    y="counts",
    color="Firm_Category",
    title="Cities and Firm Category",
)
fig.show()

# Firm category mix per engineering degree (reuses the Degree_City name).
Degree_City = _top_pair_counts(df, "Engineering_Degree", "Firm_Category")
fig = px.bar(
    Degree_City,
    x="Engineering_Degree",
    y="counts",
    color="Firm_Category",
    title="Engineering Degree and Firm Category",
)
fig.show()
# Persist the cleaned table (no index column) before city names are altered.
df.to_csv("/work/Clean Data/cleaned_engineering_data.csv", index=False)

# City reference table; both join keys are lower-cased so the match below is
# case-insensitive.
pak = pd.read_csv("/work/Data/pk.csv")
pak = pak.assign(city=pak["city"].str.lower())
df["Firms_City"] = df["Firms_City"].str.lower()

# Left join keeps every engineer row; cities absent from `pak` get NaN in
# the columns that come from it.
Spatial_data = df.merge(pak, how="left", left_on="Firms_City", right_on="city")
Spatial_data.head()

fig = px.histogram(Spatial_data, x="admin_name")
fig.show()
# Number of merged rows per matched city, as a two-column frame.
city_tally = Spatial_data["city"].value_counts()
geo = city_tally.rename_axis("city").reset_index(name="counts")

# Attach the reference columns from `pak` to every counted city.
clean_geo = pd.merge(geo, pak, how="inner", on="city")
clean_geo.head()

# Keep only the columns used for mapping and give them readable names.
clean_geo = clean_geo[
    ["city", "lat", "lng", "admin_name", "population_proper", "counts"]
].rename(
    columns={
        "lat": "Latitude",
        "lng": "Longitude",
        "population_proper": "Population",
        "admin_name": "Province",
        "counts": "Engineers",
    }
)
clean_geo.head()
Spatial_data.columns
# Per-engineer export: a column subset of the merged table with the
# geographic columns renamed.
eng_columns = [
    "ID",
    "Engineer_Name",
    "Father_Name",
    "Company_Name",
    "Employment_Start_Date",
    "Employment_End_Date",
    "Employment_period",
    "Engineering_Degree",
    "PEC_No",
    "Firm_Category",
    "Firms_City",
    "License_no",
    "lat",
    "lng",
    "admin_name",
]
# rename() without inplace=True returns a new frame, so nothing is written
# into a subset of Spatial_data (the inplace form raised
# SettingWithCopyWarning).
# NOTE(review): "population_proper" is not among the selected columns, so its
# entry in the map below has no effect; add it to eng_columns if population
# is meant to be exported.
Eng_data = Spatial_data[eng_columns].rename(
    columns={
        "lat": "Latitude",
        "lng": "Longitude",
        "population_proper": "Population",
        "admin_name": "Province",
    }
)
Eng_data.to_csv("/work/Clean Data/PITC_Engineering.csv", index=True)
# Hover text: city name plus its engineer count.
engineer_counts_text = clean_geo["Engineers"].astype(str)
clean_geo["text"] = clean_geo["city"] + "<br> Employed_Engineers " + engineer_counts_text

# Each (start, stop) pair is a positional row slice of clean_geo, i.e. a rank
# band rather than a range of engineer counts. Presumably the rows are still
# in descending-count order from value_counts -- confirm.
# An earlier banding was [(0,99),(100,299),(300,499),(500,999),(1000,3000)].
limits = [(0, 3), (3, 9), (9, 19), (19, 49), (49, 3000)]
colors = ["royalblue", "crimson", "lightseagreen", "orange", "lightgrey"]
scale = 5000  # not used by the traces below

fig = go.Figure()
for (band_start, band_stop), band_color in zip(limits, colors):
    band = clean_geo[band_start:band_stop]
    bubble_style = {
        "size": band["Engineers"],
        "color": band_color,
        "line_color": "rgb(40,40,40)",
        "line_width": 0.5,
        "sizemode": "area",
    }
    fig.add_trace(
        go.Scattergeo(
            lon=band["Longitude"],
            lat=band["Latitude"],
            text=band["text"],
            marker=bubble_style,
            name=f"Top {band_start + 1} - {band_stop}",
        )
    )

# Restrict the Asia-scoped map to a fixed longitude / latitude window.
map_window = {
    "scope": "asia",
    "landcolor": "rgb(217, 217, 217)",
    "lonaxis": {"range": [60.578993, 82.65129]},
    "lataxis": {"range": [24.407138, 36.885931]},
}
fig.update_layout(
    title_text="Engineers Employment based on Cities ",
    showlegend=True,
    legend_title="Engineers",
    legend_title_font_size=14,
    geo=map_window,
)
fig.show()
# Hard-coded headline figures for the summary chart.
# NOTE(review): the source of these numbers is not shown in this notebook.
estimation = {
    "Employed": 12560,
    "Brain_Drain": 10000,
    "Unemployed/Unknown status": 7440,
}
import matplotlib.pyplot as plt

# One bar per entry, positioned at 0..n-1 and labelled with the dict keys.
bar_positions = range(len(estimation))
bar_heights = list(estimation.values())
bar_labels = list(estimation.keys())
plt.bar(bar_positions, bar_heights, align="center")
plt.xticks(bar_positions, bar_labels);
# Working subset for the parallel-categories plot. Taken as an explicit copy
# because a Time_cat column is assigned to it afterwards; writing into a
# plain column subset of df raises SettingWithCopyWarning.
time = df[
    ["Firms_City", "Employment_period", "Engineering_Degree", "Firm_Category"]
].copy()
def time_cat(df):
    """Bucket an employment duration, given in days, into a coarse label.

    Args:
        df: A single duration in days (a scalar; the parameter name is kept
            for compatibility with existing callers).

    Returns:
        "One Year" for durations up to 365 days, "Two Year" for 366-730 days,
        "Three Year" for anything longer, and "Unknown" when the duration is
        missing (NaN/None).
    """
    days = df
    # A missing duration fails every numeric comparison, so without this
    # guard it would fall through to the final branch and be reported as
    # "Three Year".
    if pd.isna(days):
        return "Unknown"
    if days <= 365:
        return "One Year"
    if days <= 2 * 365:
        return "Two Year"
    return "Three Year"
# Label every row with its duration bucket, computed from whole days.
employment_days = time["Employment_period"].dt.days
time.loc[:, "Time_cat"] = employment_days.apply(time_cat)

# Parallel-categories view across the columns of `time`.
fig = px.parallel_categories(time)
fig.show()
# Large-scale industries table: preview it, then count rows per product.
large_scale_path = "/work/Data/large-scale-industries-csv.csv"
large_scale = pd.read_csv(large_scale_path)
large_scale.head()
large_scale.Product.value_counts()
# Engineers listed with invalid data: load, name the four useful columns,
# and blank out cells that only repeat a header label.
Invalid_Data = pd.read_csv("/work/Data/List of Engineers with Invalid Data.csv")
Invalid_Data.head()
Invalid_Data = Invalid_Data.rename(
    columns={
        "Unnamed: 1": "ID",
        "Unnamed: 3": "Discipline",
        "Unnamed: 5": "PEC Number",
        "Unnamed: 7": "Engineers Name",
    }
)
# Explicit copy: the frame is modified below, and modifying a plain column
# subset raises SettingWithCopyWarning.
Invalid_Data = Invalid_Data[["ID", "Discipline", "PEC Number", "Engineers Name"]].copy()
Invalid_Data = Invalid_Data.replace(
    ["Sr. No.", "Discipline", "PEC Number", "Engineers Name"], np.nan
)
# NOTE(review): preview only -- the dropna() result is not assigned, so rows
# containing NaN remain in Invalid_Data for everything that follows.
Invalid_Data.dropna().head()

# Tag both datasets with an employment status.
Invalid_Data["Employment_status"] = "Invalid/Unemployed"
Spatial_data["Employment_status"] = "Employed"

fig = px.histogram(Invalid_Data, x="Discipline", title="Invalid/Unemployed Engineers")
fig.show()
Invalid_Data.info()
# Notebook quality checks. The leading "!" is an IPython shell escape, so
# these lines only run inside Jupyter/IPython and are not valid plain Python.
# Each one invokes nbqa to run a tool against notebook.ipynb:
# black (formatting), pyupgrade (with --py36-plus), isort (import ordering),
# pylint (with check C0114 disabled) and mdformat (with the
# --nbqa-md / --nbqa-diff options).
!nbqa black notebook.ipynb
!nbqa pyupgrade notebook.ipynb --py36-plus
!nbqa isort notebook.ipynb
!nbqa pylint notebook.ipynb --disable=C0114
!nbqa mdformat notebook.ipynb --nbqa-md --nbqa-diff