import re
# Code source: Patrick Kunzmann
# License: BSD 3 clause
import pandas as pd
import tempfile
import itertools
import numpy as np
import biotite.sequence as seq
import biotite.sequence.io.genbank as gb
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
#===========================
# GET ANNOTATED SEQ GENOME
#===========================
# Get the E. coli K-12 genome as annotated sequence
gb_file = gb.GenBankFile.read(
entrez.fetch("U00096", tempfile.gettempdir(), "gb", "nuccore", "gb")
)
# We are only interested in CDS features
k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])
# This dictionary will count how often each codon occurs in the genome
# For increased performance the dictionary uses symbol codes ([0 3 2])
# instead of symbols (['A' 'T' 'G']) as keys
codon_counter = {
codon: 0 for codon
in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3) )
}
# For demonstration purposes print the 64 codons in symbol code form
print(list(codon_counter.keys()))
# Iterate over all CDS features
for cds in k12_genome.annotation:
# Obtain the sequence of the CDS
cds_seq = k12_genome[cds]
if len(cds_seq) % 3 != 0:
# A CDS' length should be a multiple of 3,
# otherwise the CDS is malformed
continue
# Iterate over the sequence in non-overlapping frames of 3
# and count the occurence of each codon
for i in range(0, len(cds_seq), 3):
codon_code = tuple(cds_seq.code[i:i+3])
codon_counter[codon_code] += 1
# Convert the total frequencies into relative frequencies
# for each amino acid
# The NCBI codon table with ID 11 is the bacterial codon table
table = seq.CodonTable.load(11)
table
dicttable = {'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S', 'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I', 'CAA': 'Q', 'CAC': 'H', 'CAG': 'Q', 'CAT': 'H', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 'TAA': '*', 'TAC': 'Y', 'TAG': '*', 'TAT': 'Y', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 'TGA': '*', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C', 'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F'}
codonDF = pd.Series(dicttable)
codonDF = pd.DataFrame(codonDF)
codonDF['codon'] = codonDF.index
codonDF.index = range(len(codonDF))
codonDF.columns = ['residue','codon']
codonDF = codonDF.groupby("residue").agg({"codon":lambda x: x.tolist()})
codonDF['codon_count'] = [len(x) for x in codonDF.codon]
codonDF.sort_values('codon_count')
codonDF
ddict = {}
# As the script uses symbol codes, each amino acid is represented by a
# number between 0 and 19, instead of the single letter code
for amino_acid_code in range(20):
# Find all codons coding for the amino acid
# The codons are also in symbol code format, e.g. ATG -> (0, 3, 2)
codon_codes_for_aa = table[amino_acid_code]
# Get the total amount of codon occurrences for the amino acid
total = 0
for codon_code in codon_codes_for_aa:
total += codon_counter[codon_code]
# Replace the total frequencies with relative frequencies
# and print it
for codon_code in codon_codes_for_aa:
# Convert total frequencies into relative frequencies
codon_counter[codon_code] /= total
# The rest of this code block prints the codon usage table
# in human readable format
amino_acid = seq.ProteinSequence.alphabet.decode(amino_acid_code)
# Convert the symbol codes for each codon into symbols...
# ([0,3,2] -> ['A' 'T' 'G'])
codon = k12_genome.sequence.alphabet.decode_multiple(codon_code)
# ...and represent as string
# (['A' 'T' 'G'] -> "ATG")
codon = "".join(codon)
freq = codon_counter[codon_code]
ddict.update({codon:[amino_acid,freq]})
#print(f"{amino_acid} {codon} {freq:.2f}")
#print()
ddict
df = pd.DataFrame(ddict).T
df = df.rename(columns={0:'aa', 1:'freq'})
df['codon'] = list(df.index.values)
df
grp = df.groupby(by=['aa']).max()
grp
grp.codon.values
class OptCodons :
AMINOACIDS = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
def __init__(self, codonlist):
self.codontable = {}
for x in range(len(codonlist)) :
self.codontable.update({self.AMINOACIDS[x] : codonlist[x]})
def __repr__(self):
out = ""
col = 0
for x, y in self.codontable.items():
out += f'{x}:{y}'
if col <3:
out += f'\t'
col += 1
if col == 3:
out += "\n"
col = 0
return out
class Genome:
# Get the E. coli K-12 genome as annotated sequence
def __init__(self, gb_id:str):
self.name = "UNTITLED"
#ex "U00096"
self.gb_file = gb.GenBankFile.read(
entrez.fetch(gb_id, tempfile.gettempdir(), "gb", "nuccore", "gb"))
# We are only interested in CDS features
self.k12_genome = gb.get_sequence(self.gb_file)
# This dictionary will count how often each codon occurs in the genome
# For increased performance the dictionary uses symbol codes ([0 3 2])
# instead of symbols (['A' 'T' 'G']) as keys
self.codon_counter = {
codon: 0 for codon
in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3) )
}
# Iterate over all CDS features
for cds in self.k12_genome:
# Obtain the sequence of the CDS
self.cds_seq = cds
if len(self.cds_seq) % 3 != 0:
# A CDS' length should be a multiple of 3,
# otherwise the CDS is malformed
continue
# Iterate over the sequence in non-overlapping frames of 3
# and count the occurence of each codon
for i in range(0, len(cds_seq), 3):
codon_code = tuple(cds_seq.code[i:i+3])
codon_counter[codon_code] += 1
# Convert the total frequencies into relative frequencies
# for each amino acid
# The NCBI codon table with ID 11 is the bacterial codon table
self.table = seq.CodonTable.load(1)
self.ddict = {}
# As the script uses symbol codes, each amino acid is represented by a
# number between 0 and 19, instead of the single letter code
for amino_acid_code in range(20):
# Find all codons coding for the amino acid
# The codons are also in symbol code format, e.g. ATG -> (0, 3, 2)
codon_codes_for_aa = table[amino_acid_code]
# Get the total amount of codon occurrences for the amino acid
total = 0
for codon_code in codon_codes_for_aa:
total += codon_counter[codon_code]
# Replace the total frequencies with relative frequencies
# and print it
for codon_code in codon_codes_for_aa:
# Convert total frequencies into relative frequencies
self.codon_counter[codon_code] /= total
# The rest of this code block prints the codon usage table
# in human readable format
amino_acid = seq.ProteinSequence.alphabet.decode(amino_acid_code)
# Convert the symbol codes for each codon into symbols...
# ([0,3,2] -> ['A' 'T' 'G'])
codon = k12_genome.sequence.alphabet.decode_multiple(codon_code)
# ...and represent as string
# (['A' 'T' 'G'] -> "ATG")
codon = "".join(codon)
freq = codon_counter[codon_code]
self.ddict.update({codon:[amino_acid,freq]})
#print(f"{amino_acid} {codon} {freq:.2f}")
self.df_optCodons = pd.DataFrame(ddict).T
self.df_optCodons = self.df_optCodons.rename(columns={0:'aa', 1:'freq'})
self.grp = df.groupby(by=['aa']).max()
self.OptimalCodons = list(self.grp.codon.values)
self.optimalcodons = OptCodons(self.OptimalCodons)
self.translategenome = self.k12_genome.translate()
@property
def _optimalcodonview(self):
out = ""
head = self.name.upper() + " CODON TABLE\n- - - - - - - - - - - -\n"
tb = str(self.optimalcodons)
out += head
out += tb
print(out)
@property
def concat_cds(self):
ids = self.translategenome[1]
out = ""
for start,stop in ids:
bases = str(self.k12_genome[start:stop])
out += bases
return bases
g1= Genome("U00096")
g1.k12_genome.translate()
mycodonTable = OptCodons (grp.codon.values)
mycodonTable
g2 = Genome("NC_045512.2")
g2._optimalcodonview==g1._optimalcodonview
Protein to Optimized DNA
aastr = 'macvreqwwmhytk'.upper()
def aa_to_optimalDNA (aa_str, optCodons):
dnastr = ""
for x in aa_str:
index = optCodons[0].index(x)
dnastr += optCodons[1][index]
return dnastr