# Install and import the pandasql library (runs SQLite SQL queries against
# pandas DataFrames found in the local namespace).
# NOTE: the "!" line is Jupyter shell magic — this file is a notebook export,
# not plain Python.
!pip install pandasql
import pandas as pd
import pandasql as ps
# Import our csv files as pandas DataFrames which emulate our database "tables".
# Data validation has been performed prior and all column data types are correct.
# Each DataFrame name mirrors its source table per the ERD.
df_application = pd.read_csv("application.csv")
df_department = pd.read_csv("department.csv")
df_interview = pd.read_csv("interview.csv")
df_job = pd.read_csv("job.csv")
df_job_department = pd.read_csv("job_department.csv")
df_job_post = pd.read_csv("job_post.csv")
df_job_stage = pd.read_csv("job_stage.csv")
df_scheduled_interview = pd.read_csv("scheduled_interview.csv")
df_scheduled_interviewer = pd.read_csv("scheduled_interviewer.csv")
### Interviews Dataset
#Here we create our Interview dataset by joining the relevant tables to produce our flat structured summary table.
#This allows for all relevant data to be in one table that can be queried by analysts in a simple manner without having to do the joins
#The joins are all according to the provided ERD diagram
#Left most table is `df_job` as one `job_id` may have many `interview_id`s
#We then have to link `df_job` to `df_job_stage` as this table contains the foreign key for `df_interview` viz. `job_stage_id`
#Once those are joined all the other joins can now be completed in a fairly straightforward manner using `job_id` and `interview_id`
# Build the flat interview summary. Join order is significant for LEFT JOINs
# and is kept exactly as designed: job -> job_stage -> job_department ->
# department -> interview -> scheduled_interview -> scheduled_interviewer.
# Output column names and order are unchanged; only readable table aliases
# replace the single-letter ones.
q = """
SELECT
    j.id            AS job_id,
    j.name          AS job_name,
    j.status        AS job_status,
    j.created_at    AS job_created_at,
    j.closed_at     AS job_closed_at,
    st.id           AS job_stage_id,
    st.name         AS job_stage_name,
    st.created_at   AS job_stage_created_at,
    st.updated_at   AS job_stage_updated_at,
    iv.id           AS interview_id,
    iv.name         AS interview_name,
    si.id           AS scheduled_interview_id,
    si.application_id,
    si.location     AS scheduled_interview_location,
    si.status       AS scheduled_interview_status,
    si.created_at   AS scheduled_interview_created_at,
    si.updated_at   AS scheduled_interview_updated_at,
    si.end          AS scheduled_interview_end,
    si.start        AS scheduled_interview_start,
    si.organizer_id AS scheduled_interview_organizer_id,
    sir.interviewer_id,
    sir.scorecard_id,
    jd.department_id,
    d.name          AS department_name,
    d.parent_id     AS department_parent_id,
    d.external_id   AS department_external_id
FROM df_job j
LEFT JOIN df_job_stage st ON j.id = st.job_id
LEFT JOIN df_job_department jd ON jd.job_id = j.id
LEFT JOIN df_department d ON d.id = jd.department_id
LEFT JOIN df_interview iv ON st.id = iv.job_stage_id
LEFT JOIN df_scheduled_interview si ON si.interview_id = iv.id
LEFT JOIN df_scheduled_interviewer sir ON sir.scheduled_interview_id = si.id
"""
# Materialise the flat structure as `interview_summary` for the queries below.
interview_summary = ps.sqldf(q)
#Now we can use the summary table to answer the questions using straightforward queries
# - Which jobs have the highest number of interviews?
# Fix: group by job_id as well as job_name so two distinct jobs that happen to
# share a name are not merged into a single row (matches aq1's convention).
q1= """
SELECT job_id, job_name, count(DISTINCT interview_id) as interview_count
FROM interview_summary
GROUP BY job_id, job_name
ORDER BY interview_count DESC
"""
ps.sqldf(q1)
# - Which stages in the application have the highest number of interviews?
# Stage names repeat across jobs, so this deliberately aggregates by name.
q2 = """
SELECT
    job_stage_name,
    COUNT(DISTINCT interview_id) AS interview_count
FROM interview_summary
GROUP BY job_stage_name
ORDER BY interview_count DESC
"""
ps.sqldf(q2)
# - Which interviewers perform the most interviews?
# NOTE(review): rows with no scheduled interviewer fall into a NULL
# interviewer_id group in the output.
q3 = """
SELECT
    interviewer_id,
    COUNT(DISTINCT interview_id) AS interview_count
FROM interview_summary
GROUP BY interviewer_id
ORDER BY interview_count DESC
"""
ps.sqldf(q3)
# - Which department has performed the most interviews?
# NOTE(review): jobs with no department fall into a NULL department_name group.
q4 = """
SELECT
    department_name,
    COUNT(DISTINCT interview_id) AS interview_count
FROM interview_summary
GROUP BY department_name
ORDER BY interview_count DESC
"""
ps.sqldf(q4)
### Applications Dataset
#Here we create our Applications dataset by joining the relevant tables to produce our flat structured summary table.
#The `job_summary` table has the `job` table as the left most. This is so that jobs without applications are still included in our dataset.
#The applications_summary table (built below) has the `applications` table as the left most table. This will exclude jobs that do not have any applications.
#Both tables can be used depending on the query use-case
#Fixes in this version:
# - df_job_post is joined on the job itself (c.id) rather than b.job_id: with the
#   old condition, a job that has no stages got NULL b.job_id and therefore lost
#   its job posts, contradicting the "jobs always included" design above.
# - the prospect_* columns are explicitly qualified with `a.` (they come from
#   the application table — TODO confirm against application.csv's columns).
q= """
SELECT
a.id as application_id,
a.candidate_id,
a.applied_at,
a.rejected_at,
a.last_activity_at,
a.prospect,
a.location_address,
a.status,
a.is_deleted,
a.source_id,
a.rejected_reason_id,
a.credited_to_user_id,
a.current_stage_id,
a.prospect_pool_id,
a.prospect_stage_id,
a.prospect_owner_id,
b.id as job_stage_id,
b.name as job_stage_name,
b.created_at as job_stage_created_at,
b.updated_at as job_stage_updated_at,
b.job_id as job_id,
c.name as job_name,
c.status as job_status,
c.created_at as job_created_at,
c.closed_at as job_closed_at,
d.department_id,
e.parent_id as dept_parent_id,
e.name as dept_name,
e.external_id as dept_external_id,
f.id as job_post_id,
f.title as job_post_title,
f.internal as job_post_internal,
f.external as job_post_external,
f.live as job_post_live,
f.content as job_post_content,
f.internal_content as job_post_internal_content,
f.created_at as job_post_created_at,
f.updated_at as job_post_updated_at,
f.location_name as job_post_location_name
FROM df_job c
LEFT JOIN df_job_stage b ON c.id = b.job_id
LEFT JOIN df_application a ON a.current_stage_id = b.id
LEFT JOIN df_job_department d ON c.id = d.job_id
LEFT JOIN df_department e ON d.department_id = e.id
LEFT JOIN df_job_post f ON f.job_id = c.id
"""
job_summary = ps.sqldf(q)
# Application-rooted variant of the summary: df_application is the leftmost
# table, so every application is kept (jobs without applications are excluded).
# Fix: the prospect_* columns are explicitly qualified with `a.` for
# consistency with the rest of the select list (they come from the application
# table — TODO confirm against application.csv's columns).
q= """
SELECT
a.id as application_id,
a.candidate_id,
a.applied_at,
a.rejected_at,
a.last_activity_at,
a.prospect,
a.location_address,
a.status,
a.is_deleted,
a.source_id,
a.rejected_reason_id,
a.credited_to_user_id,
a.current_stage_id,
a.prospect_pool_id,
a.prospect_stage_id,
a.prospect_owner_id,
b.id as job_stage_id,
b.name as job_stage_name,
b.created_at as job_stage_created_at,
b.updated_at as job_stage_updated_at,
b.job_id as job_id,
c.name as job_name,
c.status as job_status,
c.created_at as job_created_at,
c.closed_at as job_closed_at,
d.department_id,
e.parent_id as dept_parent_id,
e.name as dept_name,
e.external_id as dept_external_id,
f.id as job_post_id,
f.title as job_post_title,
f.internal as job_post_internal,
f.external as job_post_external,
f.live as job_post_live,
f.content as job_post_content,
f.internal_content as job_post_internal_content,
f.created_at as job_post_created_at,
f.updated_at as job_post_updated_at,
f.location_name as job_post_location_name
FROM df_application a
LEFT JOIN df_job_stage b ON a.current_stage_id = b.id
LEFT JOIN df_job c ON b.job_id = c.id
LEFT JOIN df_job_department d ON c.id = d.job_id
LEFT JOIN df_department e ON d.department_id = e.id
LEFT JOIN df_job_post f ON f.job_id = b.job_id
"""
applications_summary = ps.sqldf(q)
### Applications Dataset Queries
#Now we can use the summary table to answer the questions using straightforward queries
#- Which jobs have the highest number of applications?
aq1 = """
SELECT
    job_id,
    job_name,
    COUNT(DISTINCT application_id) AS job_count
FROM applications_summary
GROUP BY job_id, job_name
ORDER BY job_count DESC
"""
ps.sqldf(aq1)
#- What is the conversion rate between the stages of jobs?
#For this query we assume the data analyst knows which job stages come after one another and therefore can check the rate of
#two stages that follow each other using a query similar to below
#In this case we assume that "Case Study" is the next stage after "Application Review" for example sake
#Fixes in this version:
# - Count DISTINCT application_id, not DISTINCT job_stage_id: counting stage ids
#   measured "how many stages carry this name across jobs", not how many
#   candidates are at each stage, so the ratio was not a conversion rate.
# - Use single-quoted SQL string literals: in SQLite, double quotes denote
#   identifiers and only match strings via a compatibility fallback.
#Caveat: job_summary links each application to its CURRENT stage only, so this
#is a snapshot ratio (applications now at stage 2 vs now at stage 1), not a
#full funnel conversion — stage-history data would be needed for that.
#Note:
#There may be a way to determine stage ordering programmatically if the
#`job_stage_id`s follow each other in increments of 1. They seem to follow this
#pattern in the current file but this may not always be the case.
aq2= """
SELECT CAST(stage_2_apps AS FLOAT) / CAST(stage_1_apps AS FLOAT) * 100 AS conv_rate
FROM (
    SELECT
        ( SELECT COUNT(DISTINCT application_id)
          FROM job_summary
          WHERE job_stage_name LIKE 'Application Review' ) AS stage_1_apps,
        ( SELECT COUNT(DISTINCT application_id)
          FROM job_summary
          WHERE job_stage_name LIKE 'Case Study' ) AS stage_2_apps
)
"""
ps.sqldf(aq2)
#- Which department has created the most job posts?
# Fix: the question asks about job POSTS, so count DISTINCT job_post_id rather
# than job_id — a job may have several posts (or none), so counting jobs gave
# the wrong answer. job_summary is used (job as leftmost table) so departments
# whose jobs have no applications are still represented.
aq3 = """
SELECT department_id, dept_name, COUNT(DISTINCT job_post_id) as job_post_count
FROM job_summary
GROUP BY department_id, dept_name
ORDER BY job_post_count desc
"""
ps.sqldf(aq3)