import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
csvUrl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
colNames = ['Sepal Length','Sepal Width','Petal Length','Petal Width','Class']
irisData = pd.read_csv(csvUrl, names=colNames)
print(irisData.std(numeric_only=True)) # std of each numeric column; 'Class' is a string
def minmaxnorm(trData,teData,minV = 0,maxV = 1):
    # creating a copy so as not to globally update the input dataframe
    trDataLocal = pd.DataFrame(data = trData).copy()
    check = False   # this will be useful to check whether or not we have test data later on
    if isinstance(teData, pd.DataFrame):
        check = True
        teDataLocal = teData.copy()
    elif teData is None:
        teDataLocal = None
    else:
        check = True
        teData = np.asarray(teData)
        if teData.ndim == 1:
            # promote a single sample to a one-row 2-D array
            teData = teData.reshape(1, -1)
        teDataLocal = pd.DataFrame(data=teData)
    trData_numeric = trDataLocal.select_dtypes(include = np.number) # only use numeric data (we can't normalize names)
    if check:
        teData_numeric = teDataLocal.select_dtypes(include = np.number)
    for i in range(len(trData_numeric.columns)):
        # looping through the numeric columns
        # First shift the data to start at zero, then scale it to the right width, then shift into place
        colMin = trData_numeric.iloc[:,i].min()
        colRange = trData_numeric.iloc[:,i].max() - colMin
        trDataLocal.iloc[:,i] = (trData_numeric.iloc[:,i] - colMin).div(colRange).mul(maxV - minV) + minV
        if check:
            # the test data is shifted and scaled with the *training* min and range
            teDataLocal.iloc[:,i] = (teData_numeric.iloc[:,i] - colMin).div(colRange).mul(maxV - minV) + minV
    return (trDataLocal,teDataLocal)
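# A quick sanity check of minmaxnorm on illustrative toy data (not part of the
# assignment): test values are scaled with the *training* min and range, so a
# test point above the training max can land outside [minV, maxV].
toy_train = pd.DataFrame({'a': [0.0, 5.0, 10.0]})
toy_test = pd.DataFrame({'a': [2.5, 12.0]})
print(minmaxnorm(toy_train, toy_test))  # train -> 0, 0.5, 1; test -> 0.25, 1.2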
def zscorenorm(trData,teData,madFlag = False):
    # same copy-and-check pattern as in minmaxnorm
    trDataLocal = pd.DataFrame(data = trData).copy()
    check = False
    if isinstance(teData, pd.DataFrame):
        check = True
        teDataLocal = teData.copy()
    elif teData is None:
        teDataLocal = None
    else:
        check = True
        teDataLocal = pd.DataFrame(data=teData)
    
    trNumeric = trDataLocal.select_dtypes(include=np.number)
    if madFlag:
        # mean absolute deviation about the mean; DataFrame.mad() was removed
        # in pandas 2.0, so compute it directly
        sig = (trNumeric - trNumeric.mean()).abs().mean()
    else:
        sig = trNumeric.std(ddof=0) # population standard deviation
    mu = trNumeric.mean()
    NormTrData = (trNumeric - mu)/sig
    if check:
        NormTeData = (teDataLocal.select_dtypes(include=np.number) - mu)/sig
    else:
        NormTeData = None
    return (NormTrData,NormTeData)
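# Quick check of zscorenorm on illustrative toy data (not part of the assignment):
# the standardized column should have mean ~0 and population std ~1.
toy = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0]})
toy_norm, _ = zscorenorm(toy, None)
print(toy_norm.mean(), toy_norm.std(ddof=0))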
TestData = pd.DataFrame([20,37,40,60,85,120])
print(minmaxnorm(TestData,None))
print(minmaxnorm(TestData,None,-1,1))
print(zscorenorm(TestData,None))
from sklearn.preprocessing import MinMaxScaler
Scaler = MinMaxScaler()
Scaler.fit(TestData)
print(Scaler.transform(TestData))
Scaler = MinMaxScaler(feature_range=(-1, 1))
Scaler.fit(TestData)
print(Scaler.transform(TestData))
from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler()
Scaler.fit(TestData)
print(Scaler.transform(TestData))
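# Optional sanity check: sklearn's StandardScaler also standardizes with the
# population std (ddof=0), so it should agree with zscorenorm above up to
# floating point error.
print(np.allclose(Scaler.transform(TestData), zscorenorm(TestData, None)[0]))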
# writing the distance functions
def euclidean(x,y):
    x = np.array(x)
    y = np.array(y)
    return np.sqrt(np.dot(x - y, x - y))
def manhattan(x,y):
    x = np.array(x)
    y = np.array(y)
    return np.sum(np.abs(x - y))
def minkowski(x,y,l):
    x = np.array(x)
    y = np.array(y)
    return np.sum(np.abs(x - y)**l)**(1/l)
def supremum(x,y):
    x = np.array(x)
    y = np.array(y)
    return np.max(np.abs(x - y))
    
def cosinesim(x,y):
    x = np.array(x)
    y = np.array(y)
    return np.dot(x,y)/(np.sqrt(np.dot(x,x))*np.sqrt(np.dot(y,y)))
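# Optional cross-check of the hand-rolled distances against scipy (assuming
# scipy is available); note scipy's cosine() returns a distance, 1 - similarity.
from scipy.spatial import distance
u = [1.0, 2.0, 3.0]; v = [4.0, 6.0, 8.0]
print(np.isclose(manhattan(u, v), distance.cityblock(u, v)),
      np.isclose(euclidean(u, v), distance.euclidean(u, v)),
      np.isclose(minkowski(u, v, 3), distance.minkowski(u, v, 3)),
      np.isclose(supremum(u, v), distance.chebyshev(u, v)),
      np.isclose(cosinesim(u, v), 1 - distance.cosine(u, v)))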
# Putting in the data
q3_data = np.array([[1.4,1.3,2.9],[1.8,1.1,3.2],[1.3,1.2,2.9],[0.9,3.3,3.1],[1.5,2.1,3.3]])
q3_point = np.array([1.25,1.74,3.01])
dist_table = np.zeros((5,5))
for i in range(5):
    dist_table[i,0] = manhattan(q3_point,q3_data[i,:])
    dist_table[i,1] = euclidean(q3_point,q3_data[i,:])
    dist_table[i,2] = minkowski(q3_point,q3_data[i,:],3)
    dist_table[i,3] = supremum(q3_point,q3_data[i,:])
    dist_table[i,4] = cosinesim(q3_point,q3_data[i,:])
names = ['manhattan','euclidean','minkowski','supremum','cosine']
dist_table = pd.DataFrame(data=dist_table)
dist_table.columns = names
print(dist_table)
q3_point = np.array([[1.25, 1.74, 3.01]]) #making sure the data input is a row (one sample), not a column
normalized = minmaxnorm(q3_data,q3_point)
q3_data = np.array(normalized[0]); q3_point = np.array(normalized[1])
dist_table = np.zeros((5,5))
for i in range(5):
    dist_table[i,0] = manhattan(q3_point[0],q3_data[i,:])
    dist_table[i,1] = euclidean(q3_point[0],q3_data[i,:])
    dist_table[i,2] = minkowski(q3_point[0],q3_data[i,:],3)
    dist_table[i,3] = supremum(q3_point[0],q3_data[i,:])
    dist_table[i,4] = cosinesim(q3_point[0],q3_data[i,:])
names = ['manhattan','euclidean','minkowski','supremum','cosine']
dist_table = pd.DataFrame(data=dist_table)
dist_table.columns = names
print(dist_table)
MovieData = pd.read_csv('movies.csv')
NumNa = MovieData.isna().sum()
PercentNa = 100*NumNa[NumNa != 0]/len(MovieData)
print(PercentNa)
MovieDataCopy = MovieData.copy()
print("number of samples to be removed is",MovieData['director'].isna().sum())
MovieDataCopy.dropna(subset=['director'],inplace=True)
print("number of remaining samples is",len(MovieDataCopy))
for i in ['genre','runtime','mpaa_rating','studio','thtr_rel_month',
          'imdb_rating','audience_score','best_pic_win']:
    print('The type of',i,'is',type(MovieData[i].iloc[0]),end=' ')
    if isinstance(MovieData[i].iloc[0], np.number):
        #grab max and min of numeric (is a np.number) attributes
        print('The Maximum Value =',MovieData[i].max(),'The Minimum Value =',MovieData[i].min(),end='')
    else:
        #one hot encoding of nominal/ordinal attribute
        dummies = pd.get_dummies(MovieData[i]) 
        #the number of unique elements is the number of columns of the one hot encoded dataframe
        print('The size of the domain is',len(dummies.columns),end=' ') 
    #newline
    print()
for i in ['critics_score','runtime']:
    FiveNumSum = np.percentile(MovieData[i].dropna(),[0,25,50,75,100])
    print(i)
    print('Minimum =',FiveNumSum[0],end=' ')
    print('25th percentile =',FiveNumSum[1],end=' ')
    print('Median =',FiveNumSum[2],end=' ')
    print('75th percentile =',FiveNumSum[3],end=' ')
    print('Maximum =',FiveNumSum[4])
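# Equivalently, pandas' describe() reports the same quartiles (np.percentile and
# Series.quantile both default to linear interpolation), e.g.:
print(MovieData['runtime'].describe())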
for i in ['audience_score','imdb_rating']:
    ColData = MovieData[i].dropna()
    print(i)
    print('Mean =',ColData.mean(),end=' ')
    print('Median =',ColData.median(),end=' ')
    print('Mode =',ColData.mode().values[0])
for i in ['audience_score','imdb_rating']:
    Percentiles = np.percentile(MovieData[i].dropna(),[25,31,75,90])
    print(i)
    print('25th percentile =',Percentiles[0],end=' ')
    print('31st percentile =',np.round(Percentiles[1],2),end=' ')
    print('75th percentile =',Percentiles[2],end=' ')
    print('90th percentile =',Percentiles[3])
import seaborn as sns
# order genres by count, descending, so the bars are sorted
ordered_names = MovieData['genre'].value_counts().index
plt.figure(figsize=(10,5))
ax = sns.countplot(x = 'genre',data=MovieData, order=ordered_names)
ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right") #rotating to prevent overlap
plt.title('Number of Movies by Genre')
plt.show()
# order ratings by count, descending, same as before
ordered_names = MovieData['mpaa_rating'].value_counts().index
plt.figure(figsize=(10,5))
sns.countplot(y = 'mpaa_rating',data=MovieData, order=ordered_names)
plt.title('Number of Movies by MPAA Rating')
plt.show()
sturges = int(1 + 3.322*np.log10(len(MovieData['audience_score']))) #Sturges' rule, k = 1 + 3.322*log10(n), to select bin number
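# Equivalent base-2 form of Sturges' rule, k = 1 + log2(n), since 3.322 is
# approximately 1/log10(2) -- a quick consistency check:
print(sturges, int(1 + np.log2(len(MovieData['audience_score']))))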
plt.figure(figsize=(10,5))
sns.histplot(x="audience_score",data=MovieData,bins=sturges)
plt.title('Histogram of Audience Score')
plt.show()
sns.displot(x="runtime",data=MovieData,kind = 'kde',height=5,aspect=2) # kernel density estimate; displot creates its own figure
plt.title('Estimated Density of Run Time')
plt.show()
sns.displot(data=MovieData, x="critics_rating", hue="genre", multiple = 'stack',stat='proportion',common_norm=False)
plt.show()
plt.figure(figsize=(10,5))
sns.countplot(x = 'critics_rating',data=MovieData)
plt.title('Number of Movies by Critic Rating')
plt.show()
Top5 = MovieData['genre'].value_counts().head(5).index
MovieData['Bin'] = pd.cut(MovieData['audience_score'],10)
plt.figure(figsize=(10,5))
ax = sns.countplot(x='genre',hue='Bin',data = MovieData,order=Top5)
ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
plt.show()
plt.figure(figsize=(10,5))
sns.boxplot(x='critics_rating', y='audience_score', data=MovieData)
plt.title('Box Plot of Audience Score by Critics Rating')
plt.show()
plt.figure(figsize=(10,5))
ax = sns.violinplot(x="genre", y="audience_score", data=MovieData, order = Top5)
ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
plt.title('Violin Plot of Audience Score by Genre')
plt.show()
sns.displot(x="critics_score",data=MovieData,hue = 'audience_rating',kind = 'kde',height=5,aspect=2) # displot creates its own figure
plt.title('Estimated Density of Critics Score')
plt.show()
plt.scatter(MovieData['imdb_rating'],MovieData['audience_score'])
plt.xlabel('imdb rating')
plt.ylabel('audience score')
plt.show()
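# The scatter suggests a strong positive linear relationship; as an optional
# check, Pearson's r can be computed directly (pandas ignores missing pairs):
print(MovieData[['imdb_rating','audience_score']].corr())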
graphs = sns.FacetGrid(MovieData, col='mpaa_rating')
graphs.map(sns.scatterplot,"imdb_rating","audience_score")
plt.show()
Top3 = MovieData['genre'].value_counts().head(3).index
graphs = sns.FacetGrid(MovieData, col='mpaa_rating',hue='genre',hue_order=Top3)
graphs.map(sns.scatterplot,"imdb_rating","audience_score")
graphs.add_legend()
plt.show()