import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from time import process_time
(A) Algorithm Implementation
def insertionSort(arr, l, r):
c=0
for i in range(l+1, r+1):
for j in range(i, l,-1):
c+=1 #update key comparison
k=arr[j]
if arr[j]<arr[j-1]:
arr[j]=arr[j-1]
arr[j-1]=k
else:
break
return c
def mergeSertionSort(arr, l, r, S):
if(r-l<=0):
return 0
# Switch to insertion sort for small sub-arrays < size S
if(r-l+1<=S):
return insertionSort(arr, l, r)
mid = (l+r)//2
c1=mergeSertionSort(arr, l, mid, S)
c2=mergeSertionSort(arr, mid+1, r, S)
c3=merge(arr, l, r, mid)
#returns total no. key comparisons
return c1+c2+c3
#Merging 2 sorted arrays using auxiliary arrays
def merge(arr, l, r, m):
#Creating auxiliary arrays for left and right subarrays
llist = arr[l:m+1]
rlist = arr[m+1:r+1]
#Initialising pointers pointing to the head of each copied subarray and the original array
lptr = 0
rptr = 0
aptr = l
c=0
while(lptr<len(llist) and rptr<len(rlist)):
#Updating key comparison
c+=1
#If element of left subarray smaller than element of right subarray, replace element of original array with it
if(llist[lptr]<rlist[rptr]):
arr[aptr]=llist[lptr]
aptr+=1
lptr+=1
#Replace element of original array with right subarray element if it is smaller than left subarray element
elif (llist[lptr]>rlist[rptr]):
arr[aptr]=rlist[rptr]
aptr+=1
rptr+=1
#If elements of both subarrays are equal, add both into original array
else:
arr[aptr]=llist[lptr]
aptr+=1
arr[aptr]=rlist[rptr]
aptr+=1
rptr+=1
lptr+=1
#Unvisited elements are larger than visited elements. Add to back of original array.
while(lptr<len(llist)):
arr[aptr]=llist[lptr]
aptr+=1
lptr+=1
while(rptr<len(rlist)):
arr[aptr]=rlist[rptr]
aptr+=1
rptr+=1
return c
arrTest=[20, 48, 36, 10, 31, 13, 29, 5, 2, 34, 15, 14, 42, 40, 43, 11, 21, 49, 8, 3]
print("Original Array:", arrTest, sep=", ")
print("No. of Key Comparisons:",mergeSertionSort(arrTest, 0, 19, 3))
print("Sorted Array:",arrTest, sep=", ")
(B) Generate Input Data
#list of exponents
num = [3, 4, 5, 6, 7]
#array for datasets
data = []
for n in num:
data.append(np.random.randint(1000000, size = 10**n).tolist())
print(data[0])
(C) Analyse Time Complexity
i. Fixed S, Vary n
S=10
count = []
for i in range(5):
temp=list(data[i])
counter=mergeSertionSort(temp,0,len(temp)-1,S)
count.append(counter)
print(count)
xaxis = []
for i in num:
x = 10**i
xaxis.append(x)
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(xaxis,count,linewidth=3)
plt.xlabel("Size of array")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against n', fontsize=20)
plt.xscale('log')
plt.xticks([1000,10000,100000,1000000,10000000])
plt.show()
from IPython.display import Image
Image(filename='/work/Screenshot 2022-02-14 at 11.11.22 PM.png')
x=np.array(range(1,10000000))
y=x*11+x*(np.log2(x/10))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,y)
plt.xlabel('Size of array, n')
plt.ylabel('Key comparisons, W(n)')
plt.xscale('log')
plt.title('Theoretical W(n) against n', fontsize=20)
plt.show()
ii. Fixed n, Vary S
count2 = []
for S in range(2, 101):
tempcopy = list(data[2])
# test_copy = list(testdata)
counter = mergeSertionSort(tempcopy,0,99999,S)
count2.append(counter)
print(count2)
xaxis = np.linspace(2,100,99)
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(xaxis,count2,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S', fontsize=20)
plt.show()
print("The optimal S value for n = 100,000 is : ", end="")
print(count2.index(min(count2))+2)
x=np.array(range(1,101))
y=100000*x+100000*(np.log2(100000/x))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,y)
plt.xlabel('Size of S, S')
plt.ylabel('Key comparisons, W(n)')
plt.title('Theoretical W(n) against S', fontsize=20)
plt.show()
iii. Finding Optimal S
count3 = []
x_S = []
y_arrSize = []
num = [3, 4, 5]
for S in range(1, 30):
for i in num:
temparr = list(data[i-3])
counter = mergeSertionSort(temparr,0,len(temparr)-1,S)
count3.append(counter)
y_arrSize.append(10**i)
x_S.append(S)
print(count3)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
fig.set_size_inches(10, 10)
ax.plot(x_S, y_arrSize, count3)
ax.set_title("Finding Optimal S")
ax.set_xlabel("S value")
ax.set_ylabel("Size of array")
ax.set_zlabel("Key comparisons")
plt.show()
Given the rather linear trend of size of array VS key comparisons, we can thus narrow down our analysis to looking at S value VS key comparisons.
Hence, zooming in onto the individual n values, we plot the S value against the number of key comparisons to identify a common trend as S varies from 2 to 10.
(1) n = 1000
count3_1 = []
for S in range(2, 10):
temp1 = list(data[0])
counter = mergeSertionSort(temp1,0,999,S)
count3_1.append(counter)
print(count3_1)
x=np.array(range(2,10))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,count3_1,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=1000', fontsize=20)
plt.show()
optS_1000=count3_1.index(min(count3_1))+2
print("The optimal S value for n = 1000 is : ", end="")
print(optS_1000)
(2) n = 10,000
count3_2 = []
for S in range(2, 10):
temp2=list(data[1])
counter = mergeSertionSort(temp2,0,9999,S)
count3_2.append(counter)
print(count3_2)
x=np.array(range(2,10))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,count3_2,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=10000', fontsize=20)
plt.show()
optS_10000=count3_2.index(min(count3_2))+2
print("The optimal S value for n = 10,000 is : ", end="")
print(optS_10000)
(3) n = 100,000
count3_3 = []
for S in range(2, 10):
temp3=list(data[2])
counter = mergeSertionSort(temp3,0,99999,S)
count3_3.append(counter)
print(count3_3)
x=np.array(range(2,10))
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,count3_3,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=100000', fontsize=20)
plt.show()
optS_100000=count3_3.index(min(count3_3))+2
print("The optimal S value for n = 100,000 is : ", end="")
print(optS_100000)
(4) n = 1,000,000
count3_4 = []
for S in range(2,10):
temp4=list(data[3])
counter = mergeSertionSort(temp4,0,999999,S)
count3_4.append(counter)
print(count3_4)
f,axes = plt.subplots(1,1,figsize=(16,10))
plt.plot(x,count3_4,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=1000000', fontsize=20)
plt.show()
optS_1000000=count3_4.index(min(count3_4))+2
print("The optimal S value for n = 1,000,000 is : ", end="")
print(optS_1000000)
(5) n=10,000,000
count3_5 = []
for S in range(2, 10):
temp5=list(data[4])
counter = mergeSertionSort(temp5,0,9999999,S)
count3_5.append(counter)
print(count3_5)
f,axes = plt.subplots(1,1,figsize=(16,10))
x=np.array(range(2,10))
plt.plot(x,count3_5,linewidth=3)
plt.xlabel("S value")
plt.ylabel("Key comparisons")
plt.title('Empirical W(n) against S for n=1000000', fontsize=20)
plt.show()
optS_10000000=count3_5.index(min(count3_5))+2
print("The optimal S value for n = 10,000,000 is : ", end="")
print(optS_10000000)
new_x=np.linspace(2,10,50)
poly1=np.poly1d(np.polyfit(x, count3_1, 2))
poly2=np.poly1d(np.polyfit(x, count3_2, 2))
poly3=np.poly1d(np.polyfit(x, count3_3, 2))
poly4=np.poly1d(np.polyfit(x, count3_4, 2))
poly5=np.poly1d(np.polyfit(x, count3_5, 2))
new_y1=poly1(new_x)
new_y2=poly2(new_x)
new_y3=poly3(new_x)
new_y4=poly4(new_x)
new_y5=poly5(new_x)
f,axes = plt.subplots(3,2,figsize=(10,10))
axes[0,0].plot(x,count3_1,linewidth=3)
axes[0,0].plot(new_x, new_y1)
axes[0,1].plot(x,count3_2,linewidth=3)
axes[0,1].plot(new_x, new_y2)
axes[1,0].plot(x,count3_3,linewidth=3)
axes[1,0].plot(new_x, new_y3)
axes[1,1].plot(x,count3_4,linewidth=3)
axes[1,1].plot(new_x, new_y4)
axes[2,0].plot(x,count3_5,linewidth=3)
axes[2,0].plot(new_x, new_y5)
plt.show()
print("Average Optimal S Value is: ", (optS_1000+optS_10000+optS_100000+optS_1000000+optS_10000000)//5+1)
(D) MergeSertionSort vs MergeSort
def mergeSort(arr, l, r):
if(r-l<=0):
return 0
mid = (l+r)//2
c1=mergeSort(arr, l, mid)
c2=mergeSort(arr, mid+1, r)
c3=merge(arr, l, r, mid)
#returns total no. key comparisons
return c1+c2+c3
optS=6
msdata=list(data[4])
mssdata=list(data[4])
print("Regular MergeSort Key Comparisons: ",mergeSort(msdata, 0, 9999999))
print("MergeSertionSort Key Comparisons: ", mergeSertionSort(mssdata, 0, 9999999, optS))
df = pd.DataFrame(columns=('MergeSort','MergeSertionSort'))
for i in range(5):
arr1 = list(data[4])
arr2 = list(data[4])
tms = process_time()
mergeSort(arr1, 0, 9999999)
tms2 = process_time()
mergeSertionSort(arr2, 0, 9999999, optS)
mss=process_time()
df.loc[i]=[tms2-tms, mss-tms2]
display(df)
print("Average Results:\n",df.mean(axis=0))