## Gene Compression Project

#### BENG 202

Kasl, Hsu, Wang

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time

## Implementation

# shortest = 999999999
# This is the code for very slow iterative procedure
# (My first attempt at overlap)
# for first_patt in all_forward_first:
# for second_patt in all_forward_second:
# for i in range(1,len(first_patt)+1):
# first_overlap_check = first_patt[0:i]
# second_overlap_check = second_patt[-i:]
# if first_overlap_check == second_overlap_check:
# if 2*(len(first_patt)-i) + i < shortest:
# shortest = 2*(len(first_patt)-i) + i
# two_return = (first_patt,second_patt)
# print(shortest)
# print(two_return)

file = open('/work/codon_table.txt', 'r')
count = 0
stuff = file.read()
stuff_split = stuff.split('\n')
lines = len(stuff_split)
file = open('/work/codon_table.txt', 'r')
codon_table = dict()
for i in range(lines-2):
codon_vals = list(map(str,file.readline().replace('\n', '').split('\t')))
if codon_vals[2] not in codon_table:
codon_table[codon_vals[2]] = {codon_vals[0]}
else:
temp = codon_table[codon_vals[2]]
temp.add(codon_vals[0])
codon_table[codon_vals[2]] = temp
file = open('/work/alike_aas.txt', 'r')
count = 0
stuff = file.read()
stuff_split = stuff.split('\n')
lines = len(stuff_split)
file = open('/work/alike_aas.txt', 'r')
alike_aas = []
for n in range(lines):
alike_aas.append(set(map(str,file.readline().replace('\n', '').split(', '))))
print(codon_table)
print(alike_aas)
basePairingTable = dict()
basePairingTable['A'] = 'T'
basePairingTable['T'] = 'A'
basePairingTable['G'] = 'C'
basePairingTable['C'] = 'G'
# D neighbors for the sequences
def d_neighbors(pattern, d):
if d==0:
return {pattern}
if len(pattern)==1:
return {"A","C","G","T"}
neighborhood=set()
suffixNeighbors = d_neighbors(pattern[1:],d)
for text in suffixNeighbors:
if hamDistance(pattern[1:],text)<d:
nucs=["A","C","G","T"]
for nuc in nucs:
neighborhood.add(nuc+text)
else:
neighborhood.add(pattern[0]+text)
return neighborhood
# This will be most efficient in the amino alphabet
# This makes nucleotide sequences from amino acid sequences
def codon_neighbors(pattern):
global codon_table
if len(pattern)==1:
return codon_table[pattern]
neighborhood = set()
suffixNeighbors = codon_neighbors(pattern[1:])
for text in suffixNeighbors:
codons = codon_table[pattern[0]]
for codon in codons:
neighborhood.add(codon+text)
return neighborhood
# This will be most efficient in the amino alphabet
# This makes all the neighbors in the amino alphabet
def amino_neighbors(pattern):
global alike_aas
if len(pattern)==1:
for similar in alike_aas:
if pattern in similar:
temp = similar
return temp
neighborhood = set()
suffixNeighbors = amino_neighbors(pattern[1:])
for text in suffixNeighbors:
for similar in alike_aas:
if pattern[0] in similar:
aas = similar
for aa in aas:
neighborhood.add(aa+text)
return neighborhood
# :D
def hamDistance(str1,str2):
count = 0
for i in range(len(str1)):
if str1[i]!=str2[i]:
count+=1
return count
# :D
def reverseComplement(Text):
# Make an array the same size as the input text
rev=[0]*len(Text)
# Iterate through the text backwards and add the complement
for i in range(len(Text)-1,-1,-1):
if Text[i]=='A':
rev[len(Text)-(i+1)]='T'
elif Text[i]=='T':
rev[len(Text)-(i+1)]='A'
elif Text[i]=='C':
rev[len(Text)-(i+1)]='G'
elif Text[i]=='G':
rev[len(Text)-(i+1)]='C'
s=""
# Create a string from the array
rev=s.join(rev)
return rev

# We only want reverseComplement for one strand
def generate_forward_firsts(aa, aa_neigh_func=amino_neighbors, d=0):
neigh_out_first = aa_neigh_func(aa)
all_forward_first = set()
for neigh in neigh_out_first:
neigh_two = codon_neighbors(neigh)
for neigh_2 in neigh_two:
all_forward = d_neighbors(neigh_2,d)
for patt in all_forward:
# This adds the reverse of every sequence
# (Since the trie will be constructed from this
# we need to forward and reverse)
all_forward_first.add(patt[::-1])
all_forward_first.add(patt)
all_forward_first.add(reverseComplement(patt))
return all_forward_first
def generate_forward_seconds(aa, aa_neigh_func=amino_neighbors, d=0):
neigh_out_second = aa_neigh_func(aa)
all_forward_second = set()
for neigh in neigh_out_second:
neigh_two = codon_neighbors(neigh)
for neigh_2 in neigh_two:
all_forward = d_neighbors(neigh_2,d)
for patt in all_forward:
# Forward and reverse (no reverse complement)
all_forward_second.add(patt[::-1])
all_forward_second.add(patt)
return all_forward_second
def trieConstruction_anyOrder(patterns):
trie = dict()
nodeCount = 0
for pattern in patterns:
pattern = pattern+'$'
for n in range(len(pattern)):
pattern_in = pattern[n:]
# pattern = pattern[0]
currentNode = 0
currentSymbol = ''
for i in range(len(pattern_in)):
currentSymbol = pattern_in[i]
if (currentNode,currentSymbol) in trie:
currentNode = trie[(currentNode,currentSymbol)]
else:
nodeCount += 1
nextNode = nodeCount
trie[(currentNode,currentSymbol)] = int(nextNode)
currentNode = nextNode
# print(nodeCount)
return trie
# print(all_forward_first)
# print(all_forward_first_trie)

def maximalOverlapLength(all_forward_first_trie, all_forward_second):
overlap = 0
for pattern in all_forward_second:
# Keep track of length
pattern_track = pattern
length_counter = 1
# Get next_node (number) in trie
next_node = all_forward_first_trie[(0,pattern[0])]
symbol = pattern[1]
pattern = pattern[2:]
represent = True
reached_dollar = False
while represent == True:
if len(pattern) > 0:
if (next_node,symbol) in all_forward_first_trie:
next_node = all_forward_first_trie[(next_node,symbol)]
length_counter += 1
symbol = pattern[0]
pattern = pattern[1:]
elif (next_node,'$') in all_forward_first_trie:
length_counter += 1
represent = False
reached_dollar = True
else:
represent = False
else:
if (next_node,symbol) in all_forward_first_trie:
length_counter += 1
represent = False
elif (next_node,'$') in all_forward_first_trie:
length_counter += 1
represent = False
reached_dollar = True
else:
represent = False
if (length_counter > overlap) and reached_dollar == True:
overlap = length_counter
return_patt = pattern_track
else:
return_patt = ''
return overlap, return_patt
start = time.time()
all_forward_first = generate_forward_firsts('ATGTG')
all_forward_second = generate_forward_seconds('WHHT')
all_forward_first_trie = trieConstruction_anyOrder(all_forward_first)
print(maximalOverlapLength(all_forward_first_trie,all_forward_second))
end = time.time()
print(end-start)

## Benchmarking

amino_acid_alphabet = ['G','P','A','V','L','I','M','C','F','Y','W','H','K','R','Q','N','E','D','S','T']

overlap_dist_df = pd.DataFrame()
SIZE = 10
for SIZE in range(2,5):
for i in range(10):
# print(SIZE, i)
p1 = ''.join(np.random.choice(amino_acid_alphabet, SIZE))
p2 = ''.join(np.random.choice(amino_acid_alphabet, SIZE))
start = time.time()
all_forward_first = generate_forward_firsts(p1)
all_forward_second = generate_forward_seconds(p2)
all_forward_first_trie = trieConstruction_anyOrder(all_forward_first)
end = time.time()
time_elapsed = end-start
overlap_length = maximalOverlapLength(all_forward_first_trie,all_forward_second)[0]
overlap_dist_df = overlap_dist_df.append({'size (AAs)':SIZE, 'overlap (bp)': overlap_length, 'time (s)': time_elapsed}, ignore_index=True)

sns.boxplot(data=overlap_dist_df, x='size (AAs)', y='time (s)')
plt.title('input sequence length vs time taken')

overlap_dist_df['percent_overlap (%)'] = overlap_dist_df['overlap (bp)']/(overlap_dist_df['size (AAs)'] * 3)

sns.pointplot(data=overlap_dist_df, x='size (AAs)', y='percent_overlap (%)')
plt.title('input sequence length vs maximal overlap %')
plt.ylim([.5, 1])