import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
df = pd.read_csv('/work/student_record_10k.csv')
df.head()
df.info()
df.boxplot(column = 'placement')
sns.relplot(data = df, x = 'academic', y = 'annual', hue = 'gender')
sns.heatmap(df.corr(numeric_only=True))  # numeric_only avoids errors from the string columns (gender, elective)
df.describe()
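# A quick data-quality sketch one could add here: missing-value counts and the
# balance of the 'gender' column used as hue in the relplot above.
print(df.isnull().sum())
print(df['gender'].value_counts())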
!pip install apyori==1.1.2
import numpy as np
import pandas as pd
from apyori import apriori
!pip install openpyxl
df.shape
df.elective.unique()
# Parse the stringified elective lists into plain Python lists of items
Electives = [i.strip('"] [').replace("'", '').replace('\n', '').split(' ')
             for i in df.elective.iloc[0:10000]]
Electives
# Note: apyori only recognises max_length; it silently ignores min_length,
# so the min_length=2 argument below has no effect on the output.
association_rules = apriori(Electives, min_support=0.4, min_confidence=0.7,
                            min_lift=1.2, min_length=2)
association_results = list(association_rules)
print(len(association_results))
print(association_results)
for r in association_results:
    print("=====================================")
    print('Frequent itemset:{} with support'.format(list(r[0])), r[1])
    print('--Association Rules')
    for a in r[2]:
        print('----Rule: {} -> {}'.format(list(a[0]), list(a[1])))
        print('------Confidence: {}'.format(a[2]))
        print('------Lift: {}'.format(a[3]))
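# A minimal sketch for tabulating the rules: apyori's RelationRecord exposes
# items, support, and ordered_statistics (items_base, items_add, confidence,
# lift), which flatten naturally into a DataFrame sortable by lift.
rules_df = pd.DataFrame(
    [(list(r.items), r.support, list(a.items_base), list(a.items_add), a.confidence, a.lift)
     for r in association_results
     for a in r.ordered_statistics],
    columns=['itemset', 'support', 'antecedent', 'consequent', 'confidence', 'lift'])
rules_df.sort_values('lift', ascending=False).head(10)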
association_rules_2 = apriori(Electives, min_support=0.20, min_confidence=0.80, min_lift=1.2, max_length=3)
association_results_2 = list(association_rules_2)
print(len(association_results_2))
for r in association_results_2:
    print("=====================================")
    print('Frequent itemset:{} with support'.format(list(r[0])), r[1])
    print('--Association Rules')
    for a in r[2]:
        print('----Rule: {} -> {}'.format(list(a[0]), list(a[1])))
        print('------Confidence: {}'.format(a[2]))
        print('------Lift: {}'.format(a[3]))
association_rules_2 = apriori(Electives, min_support=0.30, min_confidence=0.60, min_lift=1.2, max_length=3)
association_results_2 = list(association_rules_2)
print(len(association_results_2))
for r in association_results_2:
    print("=====================================")
    print('Frequent itemset:{} with support'.format(list(r[0])), r[1])
    print('--Association Rules')
    for a in r[2]:
        print('----Rule: {} -> {}'.format(list(a[0]), list(a[1])))
        print('------Confidence: {}'.format(a[2]))
        print('------Lift: {}'.format(a[3]))
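# The three runs above vary min_support/min_confidence by hand. A small sweep
# (threshold grid is an illustrative assumption) makes the trade-off explicit:
# lower thresholds admit more itemsets with qualifying rules.
for sup, conf in [(0.2, 0.6), (0.2, 0.8), (0.3, 0.6), (0.4, 0.7)]:
    n = len(list(apriori(Electives, min_support=sup, min_confidence=conf,
                         min_lift=1.2, max_length=3)))
    print(f"support>={sup}, confidence>={conf}: {n} records")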
records = pd.read_csv("/work/student_record_10k.csv",index_col=0)
records
df1 = records.iloc[:, 3:16]   # the 13 predictor columns
df2 = records.iloc[:, [19]]   # the target column
df1
df2
from sklearn.model_selection import train_test_split
df1_train,df1_test, df2_train, df2_test = train_test_split(df1,df2,test_size=0.20)
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(df1_train, df2_train)
y_pred = classifier.predict(df1_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(df2_test, y_pred))
print(classification_report(df2_test, y_pred))
from sklearn import tree
text_representation = tree.export_text(classifier)
print(text_representation)
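# A graphical view of the top of the tree is often easier to read than the
# text dump; sklearn's tree.plot_tree can cap depth, with feature names taken
# from the training frame.
plt.figure(figsize=(14, 6))
tree.plot_tree(classifier, max_depth=2, feature_names=list(df1.columns),
               filled=True, fontsize=8)
plt.show()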
from sklearn.metrics import accuracy_score

def evaluate_model(dt_classifier):
    print("Train Accuracy :", accuracy_score(df2_train, dt_classifier.predict(df1_train)))
    print("Train Confusion Matrix:")
    print(confusion_matrix(df2_train, dt_classifier.predict(df1_train)))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(df2_test, dt_classifier.predict(df1_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(df2_test, dt_classifier.predict(df1_test)))
dt_min_leaf=DecisionTreeClassifier(min_samples_leaf=20,random_state=42)
dt_min_leaf.fit(df1_train,df2_train)
evaluate_model(dt_min_leaf)
dt_min_split=DecisionTreeClassifier(min_samples_split=20)
dt_min_split.fit(df1_train,df2_train)
evaluate_model(dt_min_split)
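# Rather than trying min_samples_leaf and min_samples_split one value at a
# time, a small GridSearchCV covers both at once (the parameter grid here is
# an illustrative assumption).
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(DecisionTreeClassifier(random_state=42),
                    param_grid={'min_samples_leaf': [5, 20, 50],
                                'min_samples_split': [2, 20, 50]},
                    cv=5)
grid.fit(df1_train, df2_train.values.ravel())
print(grid.best_params_, grid.best_score_)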
# Binarise the graduate_program target before fitting the classifier
df['graduate_program_conv'] = 1
df.loc[df['graduate_program'] < 0.5, 'graduate_program_conv'] = 0
x = df1
y = df['graduate_program_conv']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.3)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
y_pred
f1_score(y_test, y_pred)
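# F1 alone hides the error structure; accuracy, the confusion matrix, and ROC
# AUC (from predicted probabilities) round out the evaluation.
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('ROC AUC :', roc_auc_score(y_test, lr.predict_proba(x_test)[:, 1]))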
df1 = records.iloc[:, 3:16]   # same 13 predictor columns as before
df4 = records.iloc[:, [23]]   # a different target column
df1
df4
df4 = np.ravel(df4)
df4 = (df4 > 0.5).astype(int)  # binarise the target: values above 0.5 -> 1, otherwise 0
from sklearn.model_selection import train_test_split
df1_train, df1_test, df4_train, df4_test = train_test_split(df1, df4, test_size=0.20)
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10)
classifier.fit(df1_train, df4_train)
y_pred = classifier.predict(df1_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(df4_test, y_pred))
print(classification_report(df4_test, y_pred))
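# A look at which of the 13 predictors drive the forest;
# feature_importances_ is a fitted attribute of RandomForestClassifier.
importances = pd.Series(classifier.feature_importances_, index=df1.columns)
importances.sort_values().plot(kind='barh')
plt.title('Random forest feature importances')
plt.show()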
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
import sklearn.metrics as metrics
from sklearn import linear_model
df['academic'].corr(df['annual'])
df['campus'].corr(df['annual'])
df['internship'].corr(df['annual'])
x= df['academic'].values.reshape(-1,1)
y=df['annual'].values.reshape(-1,1)
x_train,x_test,y_train,y_test= train_test_split(x,y,random_state=42,test_size=0.3)
lr= LinearRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)
r2_score(y_test,y_pred)
sns.regplot(x = df['academic'], y = df['annual'],x_jitter=1,y_jitter=1)
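# The fitted line itself, via the standard coef_/intercept_ attributes
# (both are 2-D here because y was reshaped to a column vector).
print('slope    :', lr.coef_[0][0])
print('intercept:', lr.intercept_[0])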
x= df['academic'].values
y=df['annual'].values
poly = PolynomialFeatures(degree=2)
x_poly = poly.fit_transform(x.reshape(-1, 1))
x_train, x_test, y_train, y_test = train_test_split(x_poly, y, random_state=42, test_size=0.3)
ridge = Ridge()
ridge.fit(x_train, y_train)
y_pred = ridge.predict(x_test)
r2_score(y_test, y_pred)
y_pred
y_test
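# degree=2 above is a guess; a quick sweep (degrees chosen for illustration)
# shows whether higher-order terms actually help on the held-out split.
for deg in [1, 2, 3, 4]:
    xp = PolynomialFeatures(degree=deg).fit_transform(x.reshape(-1, 1))
    xtr, xte, ytr, yte = train_test_split(xp, y, random_state=42, test_size=0.3)
    model = Ridge().fit(xtr, ytr)
    print(f'degree {deg}: R^2 = {r2_score(yte, model.predict(xte)):.4f}')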
x= df[['academic','campus','internship']].values
y=df['annual'].values
x_train,x_test,y_train,y_test= train_test_split(x,y,random_state=42,test_size=0.3)
lr= LinearRegression()
lr.fit(x_train,y_train)
y_pred= lr.predict(x_test)
r2_score(y_test, y_pred)
#Model Evaluation
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test,y_pred)
meanSqErr = metrics.mean_squared_error(y_test,y_pred)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
r2_score(y_test, y_pred)
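# Per-feature coefficients make the multiple regression interpretable;
# the order follows the column list used to build x above.
for name, coef in zip(['academic', 'campus', 'internship'], lr.coef_):
    print(f'{name:10s} {coef:.4f}')
print('intercept ', lr.intercept_)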
from sklearn.cluster import KMeans
Sum_of_squared_distances = []
co=['c01','c02','c03','c04','c05','c06','c07','c08','c09','c10']
K = range(1,10)
for num_clusters in K:
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(df[co])
    Sum_of_squared_distances.append(kmeans.inertia_)
plt.plot(K,Sum_of_squared_distances)
plt.xlabel("Values of K")
plt.ylabel("Sum of squared distances/Inertia")
plt.title("Elbow Method For Optimal k")
plt.show()
kmeans = KMeans(n_clusters=4, random_state=0).fit(df[co])
label=kmeans.predict(df[co])
label
sns.scatterplot(x=df['c01'],y= df['c04'], hue = label )
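# The elbow plot is read by eye; silhouette_score (sklearn.metrics) gives a
# numeric check on the k=4 choice.
from sklearn.metrics import silhouette_score
print('silhouette (k=4):', silhouette_score(df[co], label))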
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
neighbours = NearestNeighbors(n_neighbors=2)
nbrs = neighbours.fit(df[co])
distances,indices = nbrs.kneighbors(df[co])
plt.figure(figsize=(7,5))
distances = np.sort(distances, axis = 0)
distances = distances[:, 1]
plt.rcParams['figure.figsize'] = (5,3)
plt.plot(distances)
plt.title("Plotting distances")
plt.show()
clustering = DBSCAN(eps=4, min_samples=28).fit(df[co])
clustering.labels_
pd.Series(clustering.labels_).value_counts()
sns.scatterplot(x=df['c01'],y= df['c04'], hue = clustering.labels_ )
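# DBSCAN labels noise as -1; scoring only the clustered points keeps the noise
# from dominating the silhouette (guards are for the degenerate cases).
from sklearn.metrics import silhouette_score
mask = clustering.labels_ != -1
if mask.sum() > 0 and len(set(clustering.labels_[mask])) > 1:
    print('silhouette (noise excluded):',
          silhouette_score(df[co][mask], clustering.labels_[mask]))
print('noise points:', (clustering.labels_ == -1).sum())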
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
pca_columns=['c01','c02','c03','c04','c05','c06','c07','c08','c09','c10','academic', 'campus', 'internship']
x = df.loc[:, pca_columns].values
x = StandardScaler().fit_transform(x)
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = ['pca1', 'pca2','pca3'])
pca.explained_variance_ratio_
print(pca.explained_variance_ratio_)
pca = PCA(0.80)
X_pca = pca.fit_transform(x)
X_pca.shape
pca.explained_variance_ratio_
pca.n_components_
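# A cumulative explained-variance curve shows how the 0.80 threshold above
# maps to a component count; fit on all components of the standardised data.
pca_full = PCA().fit(x)
plt.plot(np.cumsum(pca_full.explained_variance_ratio_), marker='o')
plt.axhline(0.80, linestyle='--')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()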