!pip install apyori  # install the apriori implementation used below
import numpy as np
import pandas as pd
from apyori import apriori
from re import sub
student_record_10k = pd.read_csv('student_record_10k.csv')
# Parse the 'elective' column: strip brackets, quotes, and newlines, then split each
# record into a list of course codes (one transaction per student).
elective_courses = []
for i in range(len(student_record_10k['elective'])):
    elective_courses.append(sub(r"[\[\]\'\n]", "", student_record_10k['elective'][i]).split())
# Mine frequent itemsets and association rules over the elective-course transactions.
association_rules = apriori(elective_courses, min_support=0.2, min_confidence=0.70, min_lift=1.3)
association_results = list(association_rules)
for r in association_results:
    print("=====================================")
    print('Frequent itemset: {} with support {}'.format(list(r.items), r.support))
    print('--Association Rules')
    for a in r.ordered_statistics:
        print('----Rule: {} -> {}'.format(list(a.items_base), list(a.items_add)))
        print('------Confidence: {}'.format(a.confidence))
        print('------Lift: {}'.format(a.lift))
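To make the rules easier to sort and filter, the same apyori results can be flattened into a pandas DataFrame. This is an optional convenience step; the column names below are arbitrary choices, not part of the apyori API.
rules_rows = []
for r in association_results:
    for stat in r.ordered_statistics:
        rules_rows.append({'antecedent': ', '.join(stat.items_base),   # may be empty for itemset-level rules
                           'consequent': ', '.join(stat.items_add),
                           'support': r.support,
                           'confidence': stat.confidence,
                           'lift': stat.lift})
rules_df = pd.DataFrame(rules_rows).sort_values('lift', ascending=False)
print(rules_df.head())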
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Decision tree: predictors are all columns from 'c01' through 'internship'; the target is the AtRisk flag.
relevant_student_info = student_record_10k.loc[:, 'c01':'internship']
student_outcomes = student_record_10k.loc[:, 'AtRisk']
X_train, X_test, y_train, y_test = train_test_split(relevant_student_info, student_outcomes, test_size=0.50)
classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
text_representation = tree.export_text(classifier)
print(text_representation)
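If a graphical view of the fitted tree is preferred, scikit-learn's tree.plot_tree can draw it directly; the figure size and the depth cap below are arbitrary choices.
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 8))
tree.plot_tree(classifier, max_depth=3, feature_names=list(relevant_student_info.columns), filled=True)
plt.show()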
from sklearn.linear_model import LogisticRegression
# Logistic regression: round graduate_program to the nearest integer to form discrete class labels.
graduate_outcomes = student_record_10k['graduate_program']
graduate_outcomes_rounded = graduate_outcomes.round(0).astype('int32')
X_train, X_test, y_train, y_test = train_test_split(relevant_student_info, graduate_outcomes_rounded, test_size=0.25)
logisticClassifier = LogisticRegression(max_iter=1000)
logisticClassifier.fit(X_train, y_train)
y_pred = logisticClassifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
from sklearn.preprocessing import Binarizer, MinMaxScaler
from sklearn.naive_bayes import ComplementNB
# Binarize the placement score into a 0/1 target; scale the features to [0, 1] with MinMaxScaler,
# since ComplementNB requires non-negative inputs.
binarizer = Binarizer(threshold=0.5)
scaler = MinMaxScaler()
placement = np.asarray(student_record_10k.loc[:, 'placement']).reshape(-1, 1)
binary_placement_outcomes = binarizer.fit_transform(placement).astype('int32')
X_train, X_test, y_train, y_test = train_test_split(relevant_student_info, np.ravel(binary_placement_outcomes), test_size=0.25)
# Fit the scaler on the training data only, then apply the same transformation to the test data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
nbClassifier = ComplementNB().fit(X_train,y_train)
y_pred = nbClassifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Simple linear regression: a single predictor ('campus') and a single outcome ('annual').
linear_regression_predictor = np.asarray(student_record_10k['campus']).reshape(-1, 1)
linear_regression_outcome = np.asarray(student_record_10k['annual']).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(linear_regression_predictor, linear_regression_outcome, test_size=0.25)
linear_regression_model = LinearRegression().fit(X_train, y_train)
print(linear_regression_model.score(X_test, y_test))  # R^2 on the held-out test set
y_pred = linear_regression_model.predict(X_test)
plt.scatter(X_test, y_test, color ='b')
plt.plot(X_test, y_pred, color ='k')
plt.show()
from sklearn.preprocessing import PolynomialFeatures
# Polynomial regression: expand 'campus' into degree-1 through degree-3 terms, then fit a linear model.
poly = PolynomialFeatures(degree=3, include_bias=False)
poly_regression_predictor = poly.fit_transform(np.asarray(student_record_10k['campus']).reshape(-1, 1))
poly_regression_outcome = np.asarray(student_record_10k['annual']).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(poly_regression_predictor, poly_regression_outcome, test_size = 0.25)
poly_regression_model = LinearRegression().fit(X_train,y_train)
print(poly_regression_model.score(X_test,y_test))
y_pred = poly_regression_model.predict(X_test)
plt.scatter(student_record_10k['campus'], student_record_10k['annual'], color='b')
plt.scatter(X_test[:, 0], y_pred, color='k')  # column 0 of the expanded features is the raw 'campus' value
plt.show()
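For a smoother picture of the fitted polynomial (a presentation choice, not part of the workflow above), the model can be evaluated on an evenly spaced grid of 'campus' values.
grid = np.linspace(student_record_10k['campus'].min(), student_record_10k['campus'].max(), 200).reshape(-1, 1)
grid_pred = poly_regression_model.predict(poly.transform(grid))  # reuse the fitted PolynomialFeatures transformer
plt.scatter(student_record_10k['campus'], student_record_10k['annual'], color='b')
plt.plot(grid, grid_pred, color='k')
plt.show()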
# Multiple linear regression: predict 'annual' from the 'campus', 'internship', and 'academic' columns.
multi_regression_predictor = np.asarray(student_record_10k.loc[:, ['campus', 'internship', 'academic']])
multi_regression_outcome = np.asarray(student_record_10k['annual']).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(multi_regression_predictor, multi_regression_outcome, test_size = 0.25)
multi_regression_model = LinearRegression().fit(X_train,y_train)
print(multi_regression_model.score(X_test,y_test))
y_pred = multi_regression_model.predict(X_test)
plt.scatter(y_test, y_pred)  # actual (x) vs. predicted (y) 'annual' values
plt.show()
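Since R^2 alone can be hard to interpret, absolute and squared errors can be reported as well; this is an optional sanity check, not part of the original workflow.
from sklearn.metrics import mean_absolute_error, mean_squared_error
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))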
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
# K-means: cluster the standardized 'c01' through 'c10' columns.
features = np.asarray(student_record_10k.loc[:, 'c01':'c10'].values)
features = StandardScaler().fit_transform(features)
# Run k-means for k = 1 through 9 and record the inertia (within-cluster sum of squares) for each k.
inertia = np.empty((0, 2))
for i in range(1, 10):
    k_means_model = KMeans(n_clusters=i, random_state=0).fit(features)
    inertia = np.append(inertia, np.array([[i, k_means_model.inertia_]]), axis=0)
print(inertia)
x = inertia[:, 0]
y = inertia[:, 1]
sns.scatterplot(x=x, y=y, hue=y)
Using the elbow method on the inertia curve, 3 clusters seems to be an appropriate number.
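As a rough cross-check, the elbow can also be located programmatically as the k where the inertia curve bends most sharply; the second-difference heuristic below is an assumption for illustration, not a formal criterion.
second_diff = np.diff(inertia[:, 1], n=2)              # discrete curvature of the inertia curve
elbow_k = int(inertia[np.argmax(second_diff) + 1, 0])  # k at the sharpest bend
print('Estimated elbow at k =', elbow_k)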
# Fit the final model with 3 clusters and report its inertia and silhouette score.
k_means_model = KMeans(n_clusters=3, random_state=0).fit(features)
labels = k_means_model.labels_
print('Inertia:', k_means_model.inertia_)
print('Silhouette Score:', silhouette_score(features, labels))
from sklearn.cluster import DBSCAN
# Sweep epsilon from 0.50 to roughly 0.64 and record, for each setting, the silhouette score and
# the percentage of noisy points (points DBSCAN labels -1).
scores = np.empty((0, 3))
for i in np.arange(0.5, 0.65, 0.01):
    dbScan = DBSCAN(eps=i, min_samples=3).fit(features)
    labels = dbScan.labels_
    scores = np.append(scores, [[silhouette_score(features, labels), 100 * len(labels[labels < 0]) / len(labels), i]], axis=0)
print(scores)
scores = pd.DataFrame(scores, columns=["Silhouette Score", "Percent Noisy Points", "Epsilon"])
sns.scatterplot(data=scores, x="Silhouette Score", y='Percent Noisy Points', hue="Epsilon")
A good clustering maximizes the Silhouette Score while keeping the percentage of noisy points low. By that criterion, a good value for epsilon is 0.57.
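That choice can also be made programmatically, for example by keeping only runs with a low noise percentage and taking the one with the highest Silhouette Score; the 5% cutoff below is an arbitrary choice for illustration.
candidates = scores[scores['Percent Noisy Points'] < 5]          # 5% noise cutoff is an assumption
best = candidates.loc[candidates['Silhouette Score'].idxmax()]   # assumes at least one run passes the cutoff
print('Chosen epsilon:', best['Epsilon'])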
from sklearn.decomposition import PCA
# PCA on the full standardized feature block ('c01' through 'internship').
features = np.asarray(student_record_10k.loc[:, 'c01':'internship'].values)
features = StandardScaler().fit_transform(features)
pca = PCA(n_components=2).fit(features)
print('Explained Variance % by Component:\n')
total = 0
for i in pca.explained_variance_ratio_:
    print(str(i * 100) + '\n')
    total += 100 * i
print('Total Explained Variance: \n\n' + str(total))
# Refit PCA with 2 through 9 components and track the total percentage of variance explained.
variances = np.empty((0, 2))
for i in range(2, 10):
    pca = PCA(n_components=i).fit(features)
    percent_explained_variance = np.sum(pca.explained_variance_ratio_) * 100
    variances = np.append(variances, [[i, percent_explained_variance]], axis=0)
    if percent_explained_variance >= 80.0:
        print(i)  # print each component count that reaches the 80% threshold
variances = pd.DataFrame(variances, columns=["Number of Components", "Percent Explained Variance"])
sns.scatterplot(data=variances, x="Number of Components", y='Percent Explained Variance', hue="Percent Explained Variance")
The minimum number of principal components needed to explain 80% of the variance appears to be 3.
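The same answer can be read directly off the cumulative explained-variance ratio of a full PCA fit.
pca_full = PCA().fit(features)                                  # fit all components
cumulative = np.cumsum(pca_full.explained_variance_ratio_)
print('Components needed for 80% explained variance:', int(np.argmax(cumulative >= 0.80) + 1))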