import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from time import process_time
(A) Algorithm Implementation
def insertionSort(arr, l, r):
    c=0
    for i in range(l+1, r+1):
        for j in range(i, l,-1):
            c+=1 #update key comparison
            k=arr[j]
            if arr[j]<arr[j-1]:
                arr[j]=arr[j-1]
                arr[j-1]=k
            else:
                break
    return c
def mergeSertionSort(arr, l, r, S):
    if(r-l<=0):
        return 0
    
    # Switch to insertion sort for small sub-arrays < size S
    if(r-l+1<=S):
        return insertionSort(arr, l, r)
    
    mid = (l+r)//2
    c1=mergeSertionSort(arr, l, mid, S)
    c2=mergeSertionSort(arr, mid+1, r, S)
    c3=merge(arr, l, r, mid)
    
    #returns total no. key comparisons
    return c1+c2+c3
#Merging 2 sorted arrays using auxiliary arrays
def merge(arr, l, r, m):
    #Creating auxiliary arrays for left and right subarrays
    llist = arr[l:m+1]
    rlist = arr[m+1:r+1]
    #Initialising pointers pointing to the head of each copied subarray and the original array
    lptr = 0
    rptr = 0
    aptr = l
    c=0
    while(lptr<len(llist) and rptr<len(rlist)):
        #Updating key comparison 
        c+=1
        #If element of left subarray smaller than element of right subarray, replace element of original array with it
        if(llist[lptr]<rlist[rptr]):
            arr[aptr]=llist[lptr]
            aptr+=1
            lptr+=1
        #Replace element of original array with right subarray element if it is smaller than left subarray element
        elif (llist[lptr]>rlist[rptr]):
            arr[aptr]=rlist[rptr]
            aptr+=1
            rptr+=1
        #If elements of both subarrays are equal, add both into original array
        else:
            arr[aptr]=llist[lptr]
            aptr+=1
            arr[aptr]=rlist[rptr]
            aptr+=1
            rptr+=1
            lptr+=1
    #Unvisited elements are larger than visited elements. Add to back of original array.
    while(lptr<len(llist)):
        arr[aptr]=llist[lptr]
        aptr+=1
        lptr+=1
    while(rptr<len(rlist)):
        arr[aptr]=rlist[rptr]
        aptr+=1
        rptr+=1
    return c
arrTest=[20, 48, 36, 10, 31, 13, 29, 5, 2, 34, 15, 14, 42, 40, 43, 11, 21, 49, 8, 3]
print("Original Array:", arrTest, sep=", ")
print("No. of Key Comparisons:",mergeSertionSort(arrTest, 0, 19, 3))
print("Sorted Array:",arrTest, sep=", ")
(B) Generate Input Data
#list of exponents 
num = [3, 4, 5, 6, 7]
#array for datasets
data = []
for n in num:
    data.append(np.random.randint(1000000, size = 10**n).tolist())
print(data[0])
(C) Analyse Time Complexity
i. Fixed S, Vary n
S=10
count = []
for i in range(5):
    temp=list(data[i])
    counter=mergeSertionSort(temp,0,len(temp)-1,S)
    count.append(counter)
print(count)
xaxis = []
for i in num:
    x = 10**i
    xaxis.append(x)
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(xaxis,count,linewidth=3)
plt.xlabel("Size of array")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against n', fontsize=20)
plt.xscale('log') 
plt.xticks([1000,10000,100000,1000000,10000000])
plt.show()
from IPython.display import Image
Image(filename='/work/Screenshot 2022-02-14 at 11.11.22 PM.png')
x=np.array(range(1,10000000))
y=x*11+x*(np.log2(x/10))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,y)
plt.xlabel('Size of array, n')
plt.ylabel('Key comparisons, W(n)')
plt.xscale('log')
plt.title('Theoretical W(n) against n', fontsize=20)
plt.show()
ii. Fixed n, Vary S
count2 = []
for S in range(2, 101):
    tempcopy = list(data[2])
    # test_copy = list(testdata)
    counter = mergeSertionSort(tempcopy,0,99999,S)
    count2.append(counter)
print(count2)
xaxis = np.linspace(2,100,99)
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(xaxis,count2,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S', fontsize=20)
plt.show()
print("The optimal S value for n = 100,000 is : ", end="")
print(count2.index(min(count2))+2)
x=np.array(range(1,101))
y=100000*x+100000*(np.log2(100000/x))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,y)
plt.xlabel('Size of S, S')
plt.ylabel('Key comparisons, W(n)')
plt.title('Theoretical W(n) against S', fontsize=20)
plt.show()
iii. Finding Optimal S
count3 = []
x_S = []
y_arrSize = []
num = [3, 4, 5]
for S in range(1, 30):
    for i in num:
        temparr = list(data[i-3])
        counter = mergeSertionSort(temparr,0,len(temparr)-1,S)
        count3.append(counter)
        y_arrSize.append(10**i)
        x_S.append(S)
print(count3)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
fig.set_size_inches(10, 10)
ax.plot(x_S, y_arrSize, count3)
ax.set_title("Finding Optimal S")
ax.set_xlabel("S value")
ax.set_ylabel("Size of array")
ax.set_zlabel("Key comparisons")
plt.show()
Given the rather linear trend of size of array VS key comparisons, we can thus narrow down our analysis to looking at S value VS key comparisons.
Hence, zooming in onto the individual n values, we plot the S value against the number of key comparisons to identify a common trend as S varies from 2 to 10.
(1) n = 1000
count3_1 = []
for S in range(2, 10):
    temp1 = list(data[0])
    counter = mergeSertionSort(temp1,0,999,S)
    count3_1.append(counter)
print(count3_1)
x=np.array(range(2,10))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,count3_1,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=1000', fontsize=20)
plt.show()
optS_1000=count3_1.index(min(count3_1))+2
print("The optimal S value for n = 1000 is : ", end="")
print(optS_1000)
(2) n = 10,000
count3_2 = []
for S in range(2, 10):
    temp2=list(data[1])
    counter = mergeSertionSort(temp2,0,9999,S)
    count3_2.append(counter)
print(count3_2)
x=np.array(range(2,10))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,count3_2,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=10000', fontsize=20)
plt.show()
optS_10000=count3_2.index(min(count3_2))+2
print("The optimal S value for n = 10,000 is : ", end="")
print(optS_10000)
(3) n = 100,000
count3_3 = []
for S in range(2, 10):
    temp3=list(data[2])
    counter = mergeSertionSort(temp3,0,99999,S)
    count3_3.append(counter)
print(count3_3)
x=np.array(range(2,10))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,count3_3,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=100000', fontsize=20)
plt.show()
optS_100000=count3_3.index(min(count3_3))+2
print("The optimal S value for n = 100,000 is : ", end="")
print(optS_100000)
(4) n = 1,000,000
count3_4 = []
for S in range(2,10):
    temp4=list(data[3])
    counter = mergeSertionSort(temp4,0,999999,S)
    count3_4.append(counter)
print(count3_4)
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,count3_4,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=1000000', fontsize=20)
plt.show()
optS_1000000=count3_4.index(min(count3_4))+2
print("The optimal S value for n = 1,000,000 is : ", end="")
print(optS_1000000)
(5) n=10,000,000
count3_5 = []
for S in range(2, 10):
    temp5=list(data[4])
    counter = mergeSertionSort(temp5,0,9999999,S)
    count3_5.append(counter)
print(count3_5)
f,axes = plt.subplots(1,1,figsize=(16,10))
x=np.array(range(2,10))
plt.plot(x,count3_5,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=1000000', fontsize=20)
plt.show()
optS_10000000=count3_5.index(min(count3_5))+2
print("The optimal S value for n = 10,000,000 is : ", end="")
print(optS_10000000)
new_x=np.linspace(2,10,50)
poly1=np.poly1d(np.polyfit(x, count3_1, 2))
poly2=np.poly1d(np.polyfit(x, count3_2, 2))
poly3=np.poly1d(np.polyfit(x, count3_3, 2))
poly4=np.poly1d(np.polyfit(x, count3_4, 2))
poly5=np.poly1d(np.polyfit(x, count3_5, 2))
new_y1=poly1(new_x)
new_y2=poly2(new_x)
new_y3=poly3(new_x)
new_y4=poly4(new_x)
new_y5=poly5(new_x)
f,axes = plt.subplots(3,2,figsize=(10,10))
axes[0,0].plot(x,count3_1,linewidth=3)
axes[0,0].plot(new_x, new_y1)
axes[0,1].plot(x,count3_2,linewidth=3)
axes[0,1].plot(new_x, new_y2)
axes[1,0].plot(x,count3_3,linewidth=3)
axes[1,0].plot(new_x, new_y3)
axes[1,1].plot(x,count3_4,linewidth=3)
axes[1,1].plot(new_x, new_y4)
axes[2,0].plot(x,count3_5,linewidth=3)
axes[2,0].plot(new_x, new_y5)
plt.show()
print("Average Optimal S Value is: ", (optS_1000+optS_10000+optS_100000+optS_1000000+optS_10000000)//5+1)
(D) MergeSertionSort vs MergeSort
def mergeSort(arr, l, r):
    if(r-l<=0):
        return 0
    
    mid = (l+r)//2
    c1=mergeSort(arr, l, mid)
    c2=mergeSort(arr, mid+1, r)
    c3=merge(arr, l, r, mid)
    
    #returns total no. key comparisons
    return c1+c2+c3
optS=6 
msdata=list(data[4])
mssdata=list(data[4])
print("Regular MergeSort Key Comparisons: ",mergeSort(msdata, 0, 9999999))
print("MergeSertionSort Key Comparisons: ", mergeSertionSort(mssdata, 0, 9999999, optS))
df = pd.DataFrame(columns=('MergeSort','MergeSertionSort'))
for i in range(5):
    arr1 = list(data[4])
    arr2 = list(data[4])
    tms = process_time()
    mergeSort(arr1, 0, 9999999)
    tms2 = process_time()
    mergeSertionSort(arr2, 0, 9999999, optS)
    mss=process_time()
    df.loc[i]=[tms2-tms, mss-tms2]
display(df)
print("Average Results:\n",df.mean(axis=0))