import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
csvUrl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
colNames = ['Sepal Length','Sepal Width','Petal Length','Petal Width','Class']
irisData = pd.read_csv(csvUrl, names = colNames)
print(irisData.std(numeric_only=True))  # numeric_only skips the string 'Class' column
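# Quick sanity check (a sketch, using a column name defined above): pandas'
# std() is the sample standard deviation (ddof=1), i.e.
# sqrt(sum((x - mean)^2) / (n - 1)), so a manual computation should match it.
col = irisData['Sepal Length']
manualStd = np.sqrt(((col - col.mean())**2).sum() / (len(col) - 1))
print(np.isclose(manualStd, col.std()))  # expected: True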
def minmaxnorm(trData, teData, minV=0, maxV=1):
    temp = pd.DataFrame(data=trData)
    # creating a copy so as not to globally update the input dataframe
    trDataLocal = temp.copy()
    check = False  # this will be useful to check whether or not we have test data later on
    if isinstance(teData, pd.DataFrame):
        check = True
        teDataLocal = teData.copy()
    elif teData is None:
        teDataLocal = None
    else:
        check = True
        teData = np.array(teData)
        if np.ndim(teData) == 1:
            teData = teData.reshape(1, -1)  # treat a 1-D input as a single row
        teDataLocal = pd.DataFrame(data=teData)
    trData_numeric = trDataLocal.select_dtypes(include=np.number).astype(float)  # only use numeric data (we can't normalize names)
    trDataLocal[trData_numeric.columns] = trData_numeric  # work in floats so integer columns rescale cleanly
    if check:
        teData_numeric = teDataLocal.select_dtypes(include=np.number).astype(float)
        teDataLocal[teData_numeric.columns] = teData_numeric
    for i in range(len(trData_numeric.columns)):
        # looping through the numeric columns (assumed to come first, as in all uses below):
        # first shift the data to start at 0, then scale it to the right width, then shift it into place
        colMin = trData_numeric.iloc[:, i].min()
        colRange = trData_numeric.iloc[:, i].max() - colMin
        trDataLocal.iloc[:, i] = (trData_numeric.iloc[:, i] - colMin).div(colRange).mul(maxV - minV) + minV
        if check:
            # the test column is shifted and scaled with the *training* min and range, not its own
            teDataLocal.iloc[:, i] = (teData_numeric.iloc[:, i] - colMin).div(colRange).mul(maxV - minV) + minV
    return (trDataLocal, teDataLocal)
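# A minimal usage sketch of minmaxnorm (toy values, not assignment data):
# the test row is rescaled with the *training* min and range, so a test
# value beyond the training max lands outside [minV, maxV] by design.
toyTrain = pd.DataFrame({'a': [0.0, 5.0, 10.0]})
toyTest = [12.0]  # larger than the training max of 10
normTrain, normTest = minmaxnorm(toyTrain, toyTest)
print(normTrain)  # 0.0, 0.5, 1.0
print(normTest)   # 1.2, i.e. outside [0, 1] as expected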
def zscorenorm(trData, teData, madFlag=False):
    temp = pd.DataFrame(data=trData)
    # creating a copy so as not to globally update the input dataframe
    trDataLocal = temp.copy()
    check = False  # this will be useful to check whether or not we have test data later on
    if isinstance(teData, pd.DataFrame):
        check = True
        teDataLocal = teData.copy()
    elif teData is None:
        teDataLocal = None
    else:
        check = True
        teDataLocal = pd.DataFrame(data=teData)
    trData_numeric = trDataLocal.select_dtypes(include=np.number)  # center and scale only the numeric columns
    mu = trData_numeric.mean()
    if madFlag:
        # mean absolute deviation, computed directly since DataFrame.mad() was removed in pandas 2.0
        sig = (trData_numeric - mu).abs().mean()
    else:
        sig = trData_numeric.std(ddof=0)
    NormTrData = (trData_numeric - mu).div(sig)
    if check:
        # again, the test data is standardized with the *training* mean and spread
        NormTeData = (teDataLocal.select_dtypes(include=np.number) - mu).div(sig)
    else:
        NormTeData = None
    return (NormTrData, NormTeData)
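# A minimal sketch of the madFlag option (toy column, not assignment data):
# the outlier inflates the standard deviation more than the mean absolute
# deviation, so the MAD-based z-scores are larger in magnitude.
toy = pd.DataFrame({'a': [1.0, 2.0, 3.0, 100.0]})
print(zscorenorm(toy, None)[0])
print(zscorenorm(toy, None, madFlag=True)[0])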
TestData = pd.DataFrame([20,37,40,60,85,120])
print(minmaxnorm(TestData,None))
print(minmaxnorm(TestData,None,-1,1))
print(zscorenorm(TestData,None))
from sklearn.preprocessing import MinMaxScaler
Scaler = MinMaxScaler()
Scaler.fit(TestData)
print(Scaler.transform(TestData))
Scaler = MinMaxScaler(feature_range=(-1, 1))
Scaler.fit(TestData)
print(Scaler.transform(TestData))
from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler()
Scaler.fit(TestData)
print(Scaler.transform(TestData))
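# Cross-check (a sketch): the hand-rolled normalizers above should agree
# with sklearn's scalers on the same data (StandardScaler also divides by
# the population std, matching ddof=0 in zscorenorm).
print(np.allclose(minmaxnorm(TestData, None)[0], MinMaxScaler().fit_transform(TestData)))    # expected: True
print(np.allclose(zscorenorm(TestData, None)[0], StandardScaler().fit_transform(TestData)))  # expected: True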
# writing the distance functions
def euclidean(x, y):
    x = np.array(x)
    y = np.array(y)
    return np.sqrt(np.dot(x - y, x - y))

def manhattan(x, y):
    x = np.array(x)
    y = np.array(y)
    return np.sum(np.abs(x - y))

def minkowski(x, y, l):
    x = np.array(x)
    y = np.array(y)
    return np.sum(np.abs(x - y)**l)**(1/l)

def supremum(x, y):
    x = np.array(x)
    y = np.array(y)
    return np.max(np.abs(x - y))

def cosinesim(x, y):
    x = np.array(x)
    y = np.array(y)
    return np.dot(x, y)/(np.sqrt(np.dot(x, x))*np.sqrt(np.dot(y, y)))
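# Sanity checks (a sketch): Minkowski with l=1 reduces to Manhattan and with
# l=2 to Euclidean, and the supremum distance is the l -> infinity limit of
# the Minkowski distance.
u = [1.0, 2.0, 3.0]
v = [4.0, 0.0, 3.5]
print(np.isclose(minkowski(u, v, 1), manhattan(u, v)))             # expected: True
print(np.isclose(minkowski(u, v, 2), euclidean(u, v)))             # expected: True
print(np.isclose(minkowski(u, v, 50), supremum(u, v), atol=1e-2))  # close for large l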
# Putting in the data
q3_data = np.array([[1.4,1.3,2.9],[1.8,1.1,3.2],[1.3,1.2,2.9],[0.9,3.3,3.1],[1.5,2.1,3.3]])
q3_point = np.array([1.25,1.74,3.01])
dist_table = np.zeros((5,5))
for i in range(5):
    # one row per data point, one column per proximity measure
    dist_table[i, 0] = manhattan(q3_point, q3_data[i, :])
    dist_table[i, 1] = euclidean(q3_point, q3_data[i, :])
    dist_table[i, 2] = minkowski(q3_point, q3_data[i, :], 3)
    dist_table[i, 3] = supremum(q3_point, q3_data[i, :])
    dist_table[i, 4] = cosinesim(q3_point, q3_data[i, :])
names = ['manhattan','euclidean','minkowski','supremum','cosine']
dist_table = pd.DataFrame(data=dist_table)
dist_table.columns = names
print(dist_table)
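# Which point is nearest to q3_point under each measure? The four distances
# are minimized, while cosine is a similarity and is maximized.
print(dist_table[['manhattan', 'euclidean', 'minkowski', 'supremum']].idxmin())
print(dist_table['cosine'].idxmax())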
q3_point = np.array([[1.25, 1.74, 3.01]])  # making sure the data input is a row, not a column
normalized = minmaxnorm(q3_data, q3_point)
q3_data = np.array(normalized[0])
q3_point = np.array(normalized[1])
dist_table = np.zeros((5,5))
for i in range(5):
    dist_table[i, 0] = manhattan(q3_point[0], q3_data[i, :])
    dist_table[i, 1] = euclidean(q3_point[0], q3_data[i, :])
    dist_table[i, 2] = minkowski(q3_point[0], q3_data[i, :], 3)
    dist_table[i, 3] = supremum(q3_point[0], q3_data[i, :])
    dist_table[i, 4] = cosinesim(q3_point[0], q3_data[i, :])
names = ['manhattan','euclidean','minkowski','supremum','cosine']
dist_table = pd.DataFrame(data=dist_table)
dist_table.columns = names
print(dist_table)
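# Same question after min-max normalization: rescaling reweights the three
# attributes, so the nearest point can change from the unnormalized case.
print(dist_table[['manhattan', 'euclidean', 'minkowski', 'supremum']].idxmin())
print(dist_table['cosine'].idxmax())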
MovieData = pd.read_csv('movies.csv')
NumNa = MovieData.isna().sum()
PercentNa = 100*NumNa[NumNa != 0]/len(MovieData)
print(PercentNa)
MovieDataCopy = MovieData.copy()
print("number of samples to be removed is",MovieData['director'].isna().sum())
MovieDataCopy.dropna(subset=['director'],inplace=True)
print("number of remaining samples is",len(MovieDataCopy))
for i in ['genre', 'runtime', 'mpaa_rating', 'studio', 'thtr_rel_month',
          'imdb_rating', 'audience_score', 'best_pic_win']:
    print('The type of', i, 'is', type(MovieData[i].iloc[0]), end=' ')
    if isinstance(MovieData[i].iloc[0], np.number):
        # grab max and min of numeric (np.number) attributes
        print('The Maximum Value =', MovieData[i].max(), 'The Minimum Value =', MovieData[i].min(), end='')
    else:
        # one-hot encoding of the nominal/ordinal attribute
        dummies = pd.get_dummies(MovieData[i])
        # the number of unique elements is the number of columns of the one-hot encoded dataframe
        print('The size of the domain is', len(dummies.columns), end=' ')
    # newline
    print()
for i in ['critics_score', 'runtime']:
    FiveNumSum = np.percentile(MovieData[i].dropna(), [0, 25, 50, 75, 100])
    print(i)
    print('Minimum =', FiveNumSum[0], end=' ')
    print('25th percentile =', FiveNumSum[1], end=' ')
    print('Median =', FiveNumSum[2], end=' ')
    print('75th percentile =', FiveNumSum[3], end=' ')
    print('Maximum =', FiveNumSum[4])
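# Cross-check (a sketch): pandas' describe() reports the same five-number
# summary (min, 25%, 50%, 75%, max) along with the count, mean, and std.
print(MovieData[['critics_score', 'runtime']].describe())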
for i in ['audience_score', 'imdb_rating']:
    ColData = MovieData[i].dropna()
    print(i)
    print('Mean =', ColData.mean(), end=' ')
    print('Median =', ColData.median(), end=' ')
    print('Mode =', ColData.mode().values[0])
for i in ['audience_score', 'imdb_rating']:
    Percentiles = np.percentile(MovieData[i].dropna(), [25, 31, 75, 90])
    print(i)
    print('25th percentile =', Percentiles[0], end=' ')
    print('31st percentile =', np.round(Percentiles[1], 2), end=' ')
    print('75th percentile =', Percentiles[2], end=' ')
    print('90th percentile =', Percentiles[3])
import seaborn as sns
import matplotlib.pyplot as plt
dummies = pd.get_dummies(MovieData['genre']) #one hot encoding to find number of each genre
name_list = []
num_genre = np.zeros(len(dummies.columns))
for i, column in enumerate(dummies.columns):
    name_list.append(column)  # getting the list of names to order them by
    num_genre[i] = dummies[column].sum()
name_list = np.array(name_list)
num_genre = np.array(num_genre)
sorted_index = num_genre.argsort() #arg sort so I can sort the name list the same way I sort the list of counts
ordered_names = np.flip(name_list[sorted_index])
dummies = dummies.reindex(ordered_names, axis=1)
plt.figure(figsize=(10,5))
ax = sns.countplot(x = 'genre',data=MovieData, order=ordered_names)
ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right") #rotating to prevent overlap
plt.title('Number of Movies by Genre')
plt.show()
dummies = pd.get_dummies(MovieData['mpaa_rating'])
name_list = []
num_genre = np.zeros(len(dummies.columns))
for i, column in enumerate(dummies.columns):
    name_list.append(column)
    num_genre[i] = dummies[column].sum()
name_list = np.array(name_list)
num_genre = np.array(num_genre)
sorted_index = num_genre.argsort() #doing the same thing as before
ordered_names = np.flip(name_list[sorted_index])
dummies = dummies.reindex(ordered_names, axis=1)
plt.figure(figsize=(10,5))
sns.countplot(y = 'mpaa_rating',data=MovieData, order=ordered_names)
plt.title('Number of Movies by MPAA Rating')
plt.show()
sturges = int(1 + 3.322*np.log10(len(MovieData['audience_score'])))  # Sturges' rule, 1 + 3.322*log10(n), to select the bin number (log10, not the natural log)
plt.figure(figsize=(10,5))
sns.histplot(x="audience_score",data=MovieData,bins=sturges)
plt.title('Histogram of Audience Score')
plt.show()
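# Cross-check (a sketch): numpy can choose Sturges bins itself; its count,
# len(edges) - 1, should be close to the manual sturges value above (numpy
# uses ceil(log2(n)) + 1, so the two can differ by one).
edges = np.histogram_bin_edges(MovieData['audience_score'].dropna(), bins='sturges')
print(sturges, len(edges) - 1)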
sns.displot(x="runtime", data=MovieData, kind='kde', height=5, aspect=2)  # kernel density estimate; displot is figure-level, so size it here rather than with plt.figure
plt.title('Estimated Density of Run Time')
plt.show()
sns.displot(data=MovieData, x="critics_rating", hue="genre", multiple='stack', stat='proportion', common_norm=False)
plt.show()
plt.figure(figsize=(10,5))
sns.countplot(x = 'critics_rating',data=MovieData)
plt.title('Number of Movies by Critic Rating')
plt.show()
Top5 = MovieData['genre'].value_counts().head(5).index
MovieData['Bin'] = pd.cut(MovieData['audience_score'],10)
plt.figure(figsize=(10,5))
ax = sns.countplot(x='genre',hue='Bin',data = MovieData,order=Top5)
ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
plt.show()
sns.boxplot(x='critics_rating', y='audience_score', data=MovieData)
plt.title('Box Plot of Audience Score by Critics Rating')
plt.show()  # render the box plot on its own figure so the violin plot does not draw over it
ax = sns.violinplot(x="genre", y="audience_score", data=MovieData, order=Top5)
ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
plt.title('Violin Plot of Audience Score by Genre')
plt.show()
sns.displot(x="critics_score", data=MovieData, hue='audience_rating', kind='kde', height=5, aspect=2)  # figure-level plot, so size it here
plt.title('Estimated Density of Critics Score')
plt.show()
plt.scatter(MovieData['imdb_rating'],MovieData['audience_score'])
plt.xlabel('imdb rating')
plt.ylabel('audience score')
plt.show()
graphs = sns.FacetGrid(MovieData, col='mpaa_rating')
graphs.map(sns.scatterplot,"imdb_rating","audience_score")
Top3 = MovieData['genre'].value_counts().head(3).index
graphs = sns.FacetGrid(MovieData, col='mpaa_rating',hue='genre',hue_order=Top3)
graphs.map(sns.scatterplot,"imdb_rating","audience_score")
graphs.add_legend()
plt.show()