# import libraries
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# read in the dete survey
dete_df = pd.read_csv("dete_survey.csv")
# print tabular information
dete_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822 entries, 0 to 821
Data columns (total 56 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 822 non-null int64
1 SeparationType 822 non-null object
2 Cease Date 822 non-null object
3 DETE Start Date 822 non-null object
4 Role Start Date 822 non-null object
5 Position 817 non-null object
6 Classification 455 non-null object
7 Region 822 non-null object
8 Business Unit 126 non-null object
9 Employment Status 817 non-null object
10 Career move to public sector 822 non-null bool
11 Career move to private sector 822 non-null bool
12 Interpersonal conflicts 822 non-null bool
13 Job dissatisfaction 822 non-null bool
14 Dissatisfaction with the department 822 non-null bool
15 Physical work environment 822 non-null bool
16 Lack of recognition 822 non-null bool
17 Lack of job security 822 non-null bool
18 Work location 822 non-null bool
19 Employment conditions 822 non-null bool
20 Maternity/family 822 non-null bool
21 Relocation 822 non-null bool
22 Study/Travel 822 non-null bool
23 Ill Health 822 non-null bool
24 Traumatic incident 822 non-null bool
25 Work life balance 822 non-null bool
26 Workload 822 non-null bool
27 None of the above 822 non-null bool
28 Professional Development 808 non-null object
29 Opportunities for promotion 735 non-null object
30 Staff morale 816 non-null object
31 Workplace issue 788 non-null object
32 Physical environment 817 non-null object
33 Worklife balance 815 non-null object
34 Stress and pressure support 810 non-null object
35 Performance of supervisor 813 non-null object
36 Peer support 812 non-null object
37 Initiative 813 non-null object
38 Skills 811 non-null object
39 Coach 767 non-null object
40 Career Aspirations 746 non-null object
41 Feedback 792 non-null object
42 Further PD 768 non-null object
43 Communication 814 non-null object
44 My say 812 non-null object
45 Information 816 non-null object
46 Kept informed 813 non-null object
47 Wellness programs 766 non-null object
48 Health & Safety 793 non-null object
49 Gender 798 non-null object
50 Age 811 non-null object
51 Aboriginal 16 non-null object
52 Torres Strait 3 non-null object
53 South Sea 7 non-null object
54 Disability 23 non-null object
55 NESB 32 non-null object
dtypes: bool(18), int64(1), object(37)
memory usage: 258.6+ KB
# print summary statistics
dete_df.describe(include="all")
# print the first 5 rows
dete_df.head()
# read in the tafe survey
tafe_df = pd.read_csv("tafe_survey.csv", encoding="cp1252")
# print tabular information
tafe_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702 entries, 0 to 701
Data columns (total 72 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Record ID 702 non-null int64
1 Institute 702 non-null object
2 WorkArea 702 non-null object
3 CESSATION YEAR 695 non-null float64
4 Reason for ceasing employment 701 non-null object
5 Contributing Factors. Career Move - Public Sector 437 non-null object
6 Contributing Factors. Career Move - Private Sector 437 non-null object
7 Contributing Factors. Career Move - Self-employment 437 non-null object
8 Contributing Factors. Ill Health 437 non-null object
9 Contributing Factors. Maternity/Family 437 non-null object
10 Contributing Factors. Dissatisfaction 437 non-null object
11 Contributing Factors. Job Dissatisfaction 437 non-null object
12 Contributing Factors. Interpersonal Conflict 437 non-null object
13 Contributing Factors. Study 437 non-null object
14 Contributing Factors. Travel 437 non-null object
15 Contributing Factors. Other 437 non-null object
16 Contributing Factors. NONE 437 non-null object
17 Main Factor. Which of these was the main factor for leaving? 113 non-null object
18 InstituteViews. Topic:1. I feel the senior leadership had a clear vision and direction 608 non-null object
19 InstituteViews. Topic:2. I was given access to skills training to help me do my job better 613 non-null object
20 InstituteViews. Topic:3. I was given adequate opportunities for personal development 610 non-null object
21 InstituteViews. Topic:4. I was given adequate opportunities for promotion within %Institute]Q25LBL% 608 non-null object
22 InstituteViews. Topic:5. I felt the salary for the job was right for the responsibilities I had 615 non-null object
23 InstituteViews. Topic:6. The organisation recognised when staff did good work 607 non-null object
24 InstituteViews. Topic:7. Management was generally supportive of me 614 non-null object
25 InstituteViews. Topic:8. Management was generally supportive of my team 608 non-null object
26 InstituteViews. Topic:9. I was kept informed of the changes in the organisation which would affect me 610 non-null object
27 InstituteViews. Topic:10. Staff morale was positive within the Institute 602 non-null object
28 InstituteViews. Topic:11. If I had a workplace issue it was dealt with quickly 601 non-null object
29 InstituteViews. Topic:12. If I had a workplace issue it was dealt with efficiently 597 non-null object
30 InstituteViews. Topic:13. If I had a workplace issue it was dealt with discreetly 601 non-null object
31 WorkUnitViews. Topic:14. I was satisfied with the quality of the management and supervision within my work unit 609 non-null object
32 WorkUnitViews. Topic:15. I worked well with my colleagues 605 non-null object
33 WorkUnitViews. Topic:16. My job was challenging and interesting 607 non-null object
34 WorkUnitViews. Topic:17. I was encouraged to use my initiative in the course of my work 610 non-null object
35 WorkUnitViews. Topic:18. I had sufficient contact with other people in my job 613 non-null object
36 WorkUnitViews. Topic:19. I was given adequate support and co-operation by my peers to enable me to do my job 609 non-null object
37 WorkUnitViews. Topic:20. I was able to use the full range of my skills in my job 609 non-null object
38 WorkUnitViews. Topic:21. I was able to use the full range of my abilities in my job. ; Category:Level of Agreement; Question:YOUR VIEWS ABOUT YOUR WORK UNIT] 608 non-null object
39 WorkUnitViews. Topic:22. I was able to use the full range of my knowledge in my job 608 non-null object
40 WorkUnitViews. Topic:23. My job provided sufficient variety 611 non-null object
41 WorkUnitViews. Topic:24. I was able to cope with the level of stress and pressure in my job 610 non-null object
42 WorkUnitViews. Topic:25. My job allowed me to balance the demands of work and family to my satisfaction 611 non-null object
43 WorkUnitViews. Topic:26. My supervisor gave me adequate personal recognition and feedback on my performance 606 non-null object
44 WorkUnitViews. Topic:27. My working environment was satisfactory e.g. sufficient space, good lighting, suitable seating and working area 610 non-null object
45 WorkUnitViews. Topic:28. I was given the opportunity to mentor and coach others in order for me to pass on my skills and knowledge prior to my cessation date 609 non-null object
46 WorkUnitViews. Topic:29. There was adequate communication between staff in my unit 603 non-null object
47 WorkUnitViews. Topic:30. Staff morale was positive within my work unit 606 non-null object
48 Induction. Did you undertake Workplace Induction? 619 non-null object
49 InductionInfo. Topic:Did you undertake a Corporate Induction? 432 non-null object
50 InductionInfo. Topic:Did you undertake a Institute Induction? 483 non-null object
51 InductionInfo. Topic: Did you undertake Team Induction? 440 non-null object
52 InductionInfo. Face to Face Topic:Did you undertake a Corporate Induction; Category:How it was conducted? 555 non-null object
53 InductionInfo. On-line Topic:Did you undertake a Corporate Induction; Category:How it was conducted? 555 non-null object
54 InductionInfo. Induction Manual Topic:Did you undertake a Corporate Induction? 555 non-null object
55 InductionInfo. Face to Face Topic:Did you undertake a Institute Induction? 530 non-null object
56 InductionInfo. On-line Topic:Did you undertake a Institute Induction? 555 non-null object
57 InductionInfo. Induction Manual Topic:Did you undertake a Institute Induction? 553 non-null object
58 InductionInfo. Face to Face Topic: Did you undertake Team Induction; Category? 555 non-null object
59 InductionInfo. On-line Topic: Did you undertake Team Induction?process you undertook and how it was conducted.] 555 non-null object
60 InductionInfo. Induction Manual Topic: Did you undertake Team Induction? 555 non-null object
61 Workplace. Topic:Did you and your Manager develop a Performance and Professional Development Plan (PPDP)? 608 non-null object
62 Workplace. Topic:Does your workplace promote a work culture free from all forms of unlawful discrimination? 594 non-null object
63 Workplace. Topic:Does your workplace promote and practice the principles of employment equity? 587 non-null object
64 Workplace. Topic:Does your workplace value the diversity of its employees? 586 non-null object
65 Workplace. Topic:Would you recommend the Institute as an employer to others? 581 non-null object
66 Gender. What is your Gender? 596 non-null object
67 CurrentAge. Current Age 596 non-null object
68 Employment Type. Employment Type 596 non-null object
69 Classification. Classification 596 non-null object
70 LengthofServiceOverall. Overall Length of Service at Institute (in years) 596 non-null object
71 LengthofServiceCurrent. Length of Service at current workplace (in years) 596 non-null object
dtypes: float64(1), int64(1), object(70)
memory usage: 395.0+ KB
# print summary statistics
tafe_df.describe(include="all")
# print the first 5 rows from this dataset
tafe_df.head()
# read in the dete dataset, converting the "Not Started" values to null
dete_df = pd.read_csv("dete_survey.csv", na_values="Not Stated")
dete_df = dete_df.drop(dete_df.columns[28:49], axis = 1)
tafe_df = tafe_df.drop(tafe_df.columns[17:66], axis = 1)
# Update column names for DETE
dete_df.columns = dete_df.columns.str.lower().str.strip().str.replace(" ", "_")
dete_df.columns
# update column names for TAFE to match those of DETE
tafe_df.columns = tafe_df.columns.str.replace('\s+', ' ', regex=True)
tafe_df = tafe_df.rename({
'Record ID': 'id',
'CESSATION YEAR': 'cease_date',
'Reason for ceasing employment': 'separationtype',
'Gender. What is your Gender?': 'gender',
'CurrentAge. Current Age': 'age',
'Employment Type. Employment Type': 'employment_status',
'Classification. Classification': 'position',
'LengthofServiceOverall. Overall Length of Service at Institute (in years)': 'institute_service',
'LengthofServiceCurrent. Length of Service at current workplace (in years)': 'role_service'
}, axis=1)
tafe_df.columns
# list the unique values in DETE separationtype column
dete_df["separationtype"].value_counts()
# Retain rows with resignation listed as the separation type
dete_df = dete_df[dete_df["separationtype"].str.contains(r"Resignation.+")]
# list the unique values in TAFE separationtype column
tafe_df["separationtype"].value_counts()
# Retain rows with resignation listed as the separation type
tafe_df = tafe_df[tafe_df["separationtype"].str.contains(r"Resignation", na=False)]
# regex pattern to extract years from the DETE dataset
pattern = r"([0-9]{4})"
# first, extract the year values from these columns
dete_df["cease_date"] = dete_df["cease_date"].str.extract(pattern)
dete_df["dete_start_date"] = dete_df["dete_start_date"].astype(str).str.extract(pattern)
dete_df["role_start_date"] = dete_df["role_start_date"].astype(str).str.extract(pattern)
# convert these values to date types
dete_df["cease_date"] = pd.to_datetime(dete_df["cease_date"])
dete_df["dete_start_date"] = pd.to_datetime(dete_df["dete_start_date"])
dete_df["role_start_date"] = pd.to_datetime(dete_df["role_start_date"])
# show the info about these columns
dete_df[["cease_date", "dete_start_date", "role_start_date"]].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 311 entries, 3 to 821
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cease_date 300 non-null datetime64[ns]
1 dete_start_date 283 non-null datetime64[ns]
2 role_start_date 270 non-null datetime64[ns]
dtypes: datetime64[ns](3)
memory usage: 9.7 KB
# regex pattern to extract years from the TAFE dataset
pattern = r"([0-9]{4})"
# extract the year values
tafe_df["cease_date"] = tafe_df["cease_date"].astype(str).str.extract(pattern)
# convert these values to dates
tafe_df["cease_date"] = pd.to_datetime(tafe_df["cease_date"])
# show the info about these new column
tafe_df[["cease_date"]].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 340 entries, 3 to 701
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cease_date 335 non-null datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 5.3 KB
# function for converting values to True, False or NaN
def update_values(data):
if pd.isnull(data):
return np.nan
elif data == "-": # recall that we're interpreting this as false
return False
return True
# update the following columns to either be True, False or NaN
tafe_df[['Contributing Factors. Dissatisfaction', 'Contributing Factors. Job Dissatisfaction']] = tafe_df[['Contributing Factors. Dissatisfaction', 'Contributing Factors. Job Dissatisfaction']].applymap(update_values)
# add dissatsified column for DETE
dete_df["dissatisfied"] = dete_df[["job_dissatisfaction", "dissatisfaction_with_the_department", "employment_conditions", "work_life_balance", "workload"]].any(axis=1)
# add dissatsified column for TAFE
tafe_df["dissatisfied"] = tafe_df[['Contributing Factors. Dissatisfaction', 'Contributing Factors. Job Dissatisfaction']].any(axis=1)
# create a new institute_service column
dete_df["institute_service"] = pd.DatetimeIndex(dete_df["cease_date"]).year - pd.DatetimeIndex(dete_df["dete_start_date"]).year
# create identifier columns for the datasets
dete_df["institute"] = "DETE"
tafe_df["institute"] = "TAFE"
combined_df = pd.concat([dete_df, tafe_df])
combined_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 651 entries, 3 to 701
Data columns (total 53 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 651 non-null int64
1 separationtype 651 non-null object
2 cease_date 635 non-null datetime64[ns]
3 dete_start_date 283 non-null datetime64[ns]
4 role_start_date 270 non-null datetime64[ns]
5 position 598 non-null object
6 classification 161 non-null object
7 region 265 non-null object
8 business_unit 32 non-null object
9 employment_status 597 non-null object
10 career_move_to_public_sector 311 non-null object
11 career_move_to_private_sector 311 non-null object
12 interpersonal_conflicts 311 non-null object
13 job_dissatisfaction 311 non-null object
14 dissatisfaction_with_the_department 311 non-null object
15 physical_work_environment 311 non-null object
16 lack_of_recognition 311 non-null object
17 lack_of_job_security 311 non-null object
18 work_location 311 non-null object
19 employment_conditions 311 non-null object
20 maternity/family 311 non-null object
21 relocation 311 non-null object
22 study/travel 311 non-null object
23 ill_health 311 non-null object
24 traumatic_incident 311 non-null object
25 work_life_balance 311 non-null object
26 workload 311 non-null object
27 none_of_the_above 311 non-null object
28 gender 592 non-null object
29 age 596 non-null object
30 aboriginal 7 non-null object
31 torres_strait 0 non-null object
32 south_sea 3 non-null object
33 disability 8 non-null object
34 nesb 9 non-null object
35 dissatisfied 651 non-null bool
36 institute_service 563 non-null object
37 institute 651 non-null object
38 Institute 340 non-null object
39 WorkArea 340 non-null object
40 Contributing Factors. Career Move - Public Sector 332 non-null object
41 Contributing Factors. Career Move - Private Sector 332 non-null object
42 Contributing Factors. Career Move - Self-employment 332 non-null object
43 Contributing Factors. Ill Health 332 non-null object
44 Contributing Factors. Maternity/Family 332 non-null object
45 Contributing Factors. Dissatisfaction 332 non-null object
46 Contributing Factors. Job Dissatisfaction 332 non-null object
47 Contributing Factors. Interpersonal Conflict 332 non-null object
48 Contributing Factors. Study 332 non-null object
49 Contributing Factors. Travel 332 non-null object
50 Contributing Factors. Other 332 non-null object
51 Contributing Factors. NONE 332 non-null object
52 role_service 290 non-null object
dtypes: bool(1), datetime64[ns](3), int64(1), object(48)
memory usage: 270.2+ KB
combined_df = combined_df.dropna(thresh=500, axis=1)
combined_df
# show what the different year values look like
combined_df["institute_service"].value_counts(dropna=False).head(10)
# match numbers after the "-", of before the ".". This will grab "5" of "5.0", but "6" of "5-6"
combined_df["institute_service"] = combined_df["institute_service"].astype(str).str.extract(r"(?<!\.)\b(\d+)\b(?!-)").astype(float)
# function that maps the number of years to a service category
def map_institute_service(data):
if pd.isnull(data):
return np.nan
elif data < 3:
return "New"
elif data < 7:
return "Experienced"
elif data < 11:
return "Established"
return "Veteran"
# apply the mapped function to the institute_service column, putting the results in a new column service_category
combined_df["service_category"] = combined_df["institute_service"].apply(map_institute_service)
# create a bar chart showing proportion of dissatisfied employees in each service category
ax = combined_df.pivot_table(index="service_category", values="dissatisfied").loc[["New", "Experienced", "Established", "Veteran"]].plot(kind="bar", figsize=(12,6), rot=0)
# create x label, y label and title
ax.set_xlabel("Service Category")
ax.set_ylabel("Proportion of Employees")
plt.title("Proportion of Dissatisfied Employees in Each Service Category")
# show plot
plt.show()
# create a bar chart showing proportion of dissatisfied employees by employment status
ax = combined_df.pivot_table(index="employment_status", values="dissatisfied").loc[["Casual", "Contract/casual", "Temporary Part-time", "Permanent Part-time", "Temporary Full-time", "Permanent Full-time"]].plot(kind="bar", figsize=(12,6))
# create x label, y label and title
ax.set_xlabel("Employment Status")
ax.set_ylabel("Proportion of Employees")
plt.title("Proportion of Dissatisfied Employees for Each Employment Status")
# show plot
plt.show()
# create a bar chart showing proportion of dissatisfied employees by employment status
ax = combined_df.pivot_table(index="gender", values="dissatisfied").loc[["Male", "Female"]].plot(kind="bar", figsize=(12,6))
# create x label, y label and title
ax.set_xlabel("Gender")
ax.set_ylabel("Proportion of Employees")
plt.title("Proportion of Dissatisfied Employees for Each Gender")
# show plot
plt.show()