import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats # statistical libraries
from sklearn.model_selection import train_test_split # train test split package
from sklearn.model_selection import train_test_split # train test split package
from sklearn.linear_model import LinearRegression # Linear Regression model
from sklearn.ensemble import RandomForestRegressor # RF Regression
from sklearn.tree import DecisionTreeRegressor # DT Regression
from sklearn.metrics import r2_score, mean_squared_error as mse # r2_score, how much of our independent variable describes the dependent var?
Final Presentation : Airline Customer Satisfaction
Group 6 : Aldo Winaldy (13131810018), Bryan Leandro (13131810053), Hananto Luthfi (13131810090)
Preprocessing Data
# Load the raw airline survey data from CSV.
df_orig = pd.read_csv('/work/Invistico_Airline.csv')
# Work on a copy so that preprocessing never mutates the frame that was read from disk.
df = df_orig.copy()
# Print column names, non-null counts and dtypes of the raw data.
df_orig.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 23 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 satisfaction 129880 non-null object
1 Gender 129880 non-null object
2 Customer Type 129880 non-null object
3 Age 129880 non-null int64
4 Type of Travel 129880 non-null object
5 Class 129880 non-null object
6 Flight Distance 129880 non-null int64
7 Seat comfort 129880 non-null int64
8 Departure/Arrival time convenient 129880 non-null int64
9 Food and drink 129880 non-null int64
10 Gate location 129880 non-null int64
11 Inflight wifi service 129880 non-null int64
12 Inflight entertainment 129880 non-null int64
13 Online support 129880 non-null int64
14 Ease of Online booking 129880 non-null int64
15 On-board service 129880 non-null int64
16 Leg room service 129880 non-null int64
17 Baggage handling 129880 non-null int64
18 Checkin service 129880 non-null int64
19 Cleanliness 129880 non-null int64
20 Online boarding 129880 non-null int64
21 Departure Delay in Minutes 129880 non-null int64
22 Arrival Delay in Minutes 129487 non-null float64
dtypes: float64(1), int64(17), object(5)
memory usage: 22.8+ MB
df  # notebook display of the working copy

# Keep a handle on the raw text labels before the column is overwritten below.
satisfaction = df['satisfaction']
satisfaction.value_counts()

# Encode the target as a single 0/1 indicator column: drop_first=True drops the
# indicator of the first category and keeps the remaining one.
# NOTE(review): presumably 1 == "satisfied" - confirm against value_counts().
df['satisfaction'] = pd.get_dummies(satisfaction, drop_first=True)
df

# Keep the service-rating columns, the two delay columns and the encoded
# target (last, so that it can be split off by position afterwards).
df = df[[
    'Seat comfort',
    'Departure/Arrival time convenient',
    'Food and drink',
    'Gate location',
    'Inflight wifi service',
    'Inflight entertainment',
    'Online support',
    'Ease of Online booking',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Cleanliness',
    'Online boarding',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
    'satisfaction',
]]
df

# Replace missing values with the mean of their column.
column_means = df.mean()
df = df.fillna(column_means)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Seat comfort 129880 non-null int64
1 Departure/Arrival time convenient 129880 non-null int64
2 Food and drink 129880 non-null int64
3 Gate location 129880 non-null int64
4 Inflight wifi service 129880 non-null int64
5 Inflight entertainment 129880 non-null int64
6 Online support 129880 non-null int64
7 Ease of Online booking 129880 non-null int64
8 On-board service 129880 non-null int64
9 Leg room service 129880 non-null int64
10 Baggage handling 129880 non-null int64
11 Checkin service 129880 non-null int64
12 Cleanliness 129880 non-null int64
13 Online boarding 129880 non-null int64
14 Departure Delay in Minutes 129880 non-null int64
15 Arrival Delay in Minutes 129880 non-null float64
16 satisfaction 129880 non-null uint8
dtypes: float64(1), int64(15), uint8(1)
memory usage: 16.0 MB
# Features: every column except the encoded target, which is the last column.
x = df.drop(columns='satisfaction')
# Target: kept as a single-column DataFrame (column vector), not a Series.
y = df[['satisfaction']]
x  # Independent Variables
y  # Dependent Variable

# Seeded split so the same rows land in train/test on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
)
Regression
Linear Regression
# Fit an ordinary least-squares model on the training rows.
lrr = LinearRegression().fit(x_train, y_train)
y_pred = lrr.predict(x_test)

# Show the true 0/1 labels and the continuous predictions, separated by a rule.
print(y_test.values, "-----", y_pred, sep="\n")
[[0]
[1]
[1]
...
[1]
[0]
[0]]
-----
[[ 0.51206341]
[ 0.61601768]
[ 0.08836135]
...
[-0.0079119 ]
[ 0.24268663]
[-0.24490088]]
# Evaluate the linear model; these bare expressions only display in a notebook.
# Coefficient of determination (R^2) on the held-out rows.
r2_score(y_test, y_pred)
# Root mean squared error: square root of the mean squared error.
mse(y_test, y_pred)**0.5
# Fitted coefficients of the linear model.
lrr.coef_
# Fitted intercept of the linear model.
lrr.intercept_
Random Forest Regressor
# Small random forest: 5 trees, each limited to depth 16.
# random_state pins the bootstrap/feature sampling so the scores printed below
# are reproducible from run to run (the train/test split is seeded as well).
rfr = RandomForestRegressor(max_depth=16, n_estimators=5, random_state=1)

# The forest expects a 1-D target of shape (n_samples,). y_train is a
# single-column frame, so flatten it; passing the column vector triggers
# scikit-learn's DataConversionWarning.
rfr.fit(x_train, np.ravel(y_train))

y_pred = rfr.predict(x_test)  # NOTE: overwrites y_pred from the previous model
print(r2_score(y_test, y_pred))       # R^2 on the held-out rows
print(mse(y_test, y_pred) ** 0.5)     # RMSE on the held-out rows
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
0.8190236159616479
0.2118281014470138
Decision Tree Regressor
# Single decision tree with default hyper-parameters.
# random_state makes the fitted tree, and therefore the scores printed below,
# reproducible, in line with the seeded train/test split.
dtr = DecisionTreeRegressor(random_state=1)
dtr.fit(x_train, y_train)

y_pred = dtr.predict(x_test)  # NOTE: overwrites y_pred from the previous model
print(r2_score(y_test, y_pred))       # R^2 on the held-out rows
print(mse(y_test, y_pred) ** 0.5)     # RMSE on the held-out rows
0.7052061043279028
0.2703534173812515
Feature Importances
from sklearn.inspection import permutation_importance

# Permutation importance of each feature for the random forest, measured on
# the held-out rows (seeded so the shuffles are reproducible).
print("Feature Importances: ")
pimp = permutation_importance(rfr, x_test, y_test, random_state=1)

# argsort() is ascending, so features are listed from least to most important.
# Iterating over the full ordering covers every feature without hard-coding
# how many there are.
for i in pimp.importances_mean.argsort():
    print(x.columns[i], pimp.importances_mean[i])
Feature Importances:
Departure Delay in Minutes 0.004929566071657976
Arrival Delay in Minutes 0.010513522999515513
Online support 0.0844439616778756
Departure/Arrival time convenient 0.09033910771315787
Food and drink 0.09270193220411967
Checkin service 0.09432515227788081
Leg room service 0.09478783113583841
Baggage handling 0.1269858099483751
Online boarding 0.13542571729360234
Inflight wifi service 0.13928177123494107
Cleanliness 0.185600048185699
Gate location 0.21735067634948296
On-board service 0.2216866932642211
Ease of Online booking 0.24075030185472474
Inflight entertainment 0.40927635681251645
Seat comfort 0.8449954984090748
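Random Forest Regressor is the regression model with the highest R² score (≈0.82, i.e. it explains about 82% of the variance in satisfaction; note that R² is not a classification accuracy) and the lowest RMSE (≈0.21).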
Seat comfort, Inflight entertainment, and Ease of Online booking are the most important factors for airline customer satisfaction.
FACTOR ANALYSIS
# Restart from the raw data and keep only columns 7 onward (the slice end is
# clipped to the frame's width): the rating columns plus the two delay columns.
df = df_orig.iloc[:, 7:24]

# Fill the missing data with the mean of the respective column.
column_means = df.mean()
df = df.fillna(column_means)
df
pip install factor_analyzer
Requirement already satisfied: factor_analyzer in /usr/local/lib/python3.7/site-packages (0.3.2)
Requirement already satisfied: numpy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from factor_analyzer) (1.19.5)
Requirement already satisfied: scipy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from factor_analyzer) (1.6.3)
Requirement already satisfied: pandas in /shared-libs/python3.7/py/lib/python3.7/site-packages (from factor_analyzer) (1.2.4)
Requirement already satisfied: scikit-learn in /shared-libs/python3.7/py/lib/python3.7/site-packages (from factor_analyzer) (0.24.2)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas->factor_analyzer) (2.8.1)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas->factor_analyzer) (2021.1)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas->factor_analyzer) (1.16.0)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->factor_analyzer) (1.0.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->factor_analyzer) (2.1.0)
WARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv
WARNING: You are using pip version 21.1.2; however, version 21.1.3 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
from factor_analyzer import FactorAnalyzer
Testing Assumptions
from factor_analyzer.factor_analyzer import (
    calculate_bartlett_sphericity,
    calculate_kmo,
)

# Bartlett's test of sphericity: tests whether the correlation matrix is an
# identity matrix (i.e. the variables are unrelated).
calculate_bartlett_sphericity(df)

# Kaiser-Meyer-Olkin measure of how suitable the data is for factor analysis;
# kmo_all holds the per-variable values, kmo_model the overall value.
kmo_all, kmo_model = calculate_kmo(df)
print(kmo_model)
0.7486093921059667
Identify the Number of Factors
# First pass: fit with default settings only to inspect the eigenvalues.
fa = FactorAnalyzer()
fa.fit(df)
ev, v = fa.get_eigenvalues()  # ev: original eigenvalues, v: common-factor eigenvalues
ev  # display the eigenvalues

# Second pass: extract four factors with a varimax rotation.
# NOTE(review): the choice of 4 presumably follows from the eigenvalues above - confirm.
fa = FactorAnalyzer(n_factors=4, rotation='varimax')
fa.fit(df)
print(fa.loadings_)
[[ 1.79518949e-01 1.20488641e-01 7.55201511e-01 -1.59275102e-02]
[-3.37452603e-02 4.99953654e-02 6.30689961e-01 7.29147237e-03]
[ 4.47664410e-02 1.68582893e-02 9.03919852e-01 -1.02764307e-02]
[-3.04687307e-02 -3.92697892e-02 6.08192518e-01 6.60108864e-03]
[ 7.33383730e-01 8.21334306e-04 -1.34725524e-03 -1.75035255e-02]
[ 4.30395676e-01 1.53955146e-01 2.91909560e-01 -1.99576990e-02]
[ 7.89033732e-01 1.06360508e-01 1.07024289e-02 -1.50826594e-02]
[ 7.38392629e-01 4.77409098e-01 1.04904942e-02 -9.84459916e-03]
[ 1.04830361e-01 7.10737040e-01 2.81060552e-02 -2.08693397e-02]
[ 9.27081134e-02 5.43779814e-01 5.38249747e-02 1.72097225e-02]
[ 3.63456997e-02 7.68173037e-01 2.98533429e-02 6.62963569e-03]
[ 1.56948699e-01 2.89291689e-01 1.80382605e-02 -1.30429325e-02]
[ 3.05768744e-02 7.89521566e-01 2.24876084e-02 -4.60529706e-02]
[ 8.45439208e-01 9.40510796e-02 -1.08614591e-02 5.62178709e-04]
[-2.15945575e-02 -2.05878384e-02 -3.07489109e-03 9.72341669e-01]
[-2.33186696e-02 -2.58066715e-02 -4.48357538e-03 9.86521955e-01]]
# Loading matrix as a labelled frame: one row per input variable, one column per factor.
lmatrix = pd.DataFrame(
    fa.loadings_,
    index=df.columns.tolist(),
    columns=[f'Factor {k}' for k in range(1, 5)],
)
lmatrix  # loading matrix
Sort the data frame based on each factor
# Rank the variables by their loading on each factor, highest first, to see
# which variables load on which factor. sort_values returns a new frame;
# lmatrix itself is left unchanged, and each bare expression only displays
# in a notebook.
lmatrix.sort_values('Factor 1', ascending=False)
lmatrix.sort_values('Factor 2', ascending=False)
lmatrix.sort_values('Factor 3', ascending=False)
lmatrix.sort_values('Factor 4', ascending=False)
Factor 1 = Online boarding, Online support, Ease of online booking, Inflight wifi service, and Inflight entertainment
Factor 2 = Cleanliness, Baggage handling, On-board Service, and Leg room service
Factor 3 = Food and Drink, Seat comfort, Departure/Arrival time convenient, and Gate location
Factor 4 = Arrival Delay in Minutes and Departure Delay in Minutes