Untitled project

import re

# Code source: Patrick Kunzmann # License: BSD 3 clause import pandas as pd import tempfile import itertools import numpy as np import biotite.sequence as seq import biotite.sequence.io.genbank as gb import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez #=========================== # GET ANNOTATED SEQ GENOME #=========================== # Get the E. coli K-12 genome as annotated sequence gb_file = gb.GenBankFile.read( entrez.fetch("U00096", tempfile.gettempdir(), "gb", "nuccore", "gb") ) # We are only interested in CDS features k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"]) # This dictionary will count how often each codon occurs in the genome # For increased performance the dictionary uses symbol codes ([0 3 2]) # instead of symbols (['A' 'T' 'G']) as keys codon_counter = { codon: 0 for codon in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3) ) } # For demonstration purposes print the 64 codons in symbol code form print(list(codon_counter.keys()))

# Iterate over all CDS features for cds in k12_genome.annotation: # Obtain the sequence of the CDS cds_seq = k12_genome[cds] if len(cds_seq) % 3 != 0: # A CDS' length should be a multiple of 3, # otherwise the CDS is malformed continue # Iterate over the sequence in non-overlapping frames of 3 # and count the occurence of each codon for i in range(0, len(cds_seq), 3): codon_code = tuple(cds_seq.code[i:i+3]) codon_counter[codon_code] += 1 # Convert the total frequencies into relative frequencies # for each amino acid # The NCBI codon table with ID 11 is the bacterial codon table table = seq.CodonTable.load(11) table

dicttable = {'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S', 'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I', 'CAA': 'Q', 'CAC': 'H', 'CAG': 'Q', 'CAT': 'H', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 'TAA': '*', 'TAC': 'Y', 'TAG': '*', 'TAT': 'Y', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 'TGA': '*', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C', 'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F'} codonDF = pd.Series(dicttable) codonDF = pd.DataFrame(codonDF) codonDF['codon'] = codonDF.index codonDF.index = range(len(codonDF)) codonDF.columns = ['residue','codon'] codonDF = codonDF.groupby("residue").agg({"codon":lambda x: x.tolist()}) codonDF['codon_count'] = [len(x) for x in codonDF.codon] codonDF.sort_values('codon_count') codonDF

ddict = {} # As the script uses symbol codes, each amino acid is represented by a # number between 0 and 19, instead of the single letter code for amino_acid_code in range(20): # Find all codons coding for the amino acid # The codons are also in symbol code format, e.g. ATG -> (0, 3, 2) codon_codes_for_aa = table[amino_acid_code] # Get the total amount of codon occurrences for the amino acid total = 0 for codon_code in codon_codes_for_aa: total += codon_counter[codon_code] # Replace the total frequencies with relative frequencies # and print it for codon_code in codon_codes_for_aa: # Convert total frequencies into relative frequencies codon_counter[codon_code] /= total # The rest of this code block prints the codon usage table # in human readable format amino_acid = seq.ProteinSequence.alphabet.decode(amino_acid_code) # Convert the symbol codes for each codon into symbols... # ([0,3,2] -> ['A' 'T' 'G']) codon = k12_genome.sequence.alphabet.decode_multiple(codon_code) # ...and represent as string # (['A' 'T' 'G'] -> "ATG") codon = "".join(codon) freq = codon_counter[codon_code] ddict.update({codon:[amino_acid,freq]}) #print(f"{amino_acid} {codon} {freq:.2f}") #print() ddict

df = pd.DataFrame(ddict).T df = df.rename(columns={0:'aa', 1:'freq'})

df['codon'] = list(df.index.values) df

grp = df.groupby(by=['aa']).max() grp

grp.codon.values

class OptCodons : AMINOACIDS = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] def __init__(self, codonlist): self.codontable = {} for x in range(len(codonlist)) : self.codontable.update({self.AMINOACIDS[x] : codonlist[x]}) def __repr__(self): out = "" col = 0 for x, y in self.codontable.items(): out += f'{x}:{y}' if col <3: out += f'\t' col += 1 if col == 3: out += "\n" col = 0 return out

class Genome: # Get the E. coli K-12 genome as annotated sequence def __init__(self, gb_id:str): self.name = "UNTITLED" #ex "U00096" self.gb_file = gb.GenBankFile.read( entrez.fetch(gb_id, tempfile.gettempdir(), "gb", "nuccore", "gb")) # We are only interested in CDS features self.k12_genome = gb.get_sequence(self.gb_file) # This dictionary will count how often each codon occurs in the genome # For increased performance the dictionary uses symbol codes ([0 3 2]) # instead of symbols (['A' 'T' 'G']) as keys self.codon_counter = { codon: 0 for codon in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3) ) } # Iterate over all CDS features for cds in self.k12_genome: # Obtain the sequence of the CDS self.cds_seq = cds if len(self.cds_seq) % 3 != 0: # A CDS' length should be a multiple of 3, # otherwise the CDS is malformed continue # Iterate over the sequence in non-overlapping frames of 3 # and count the occurence of each codon for i in range(0, len(cds_seq), 3): codon_code = tuple(cds_seq.code[i:i+3]) codon_counter[codon_code] += 1 # Convert the total frequencies into relative frequencies # for each amino acid # The NCBI codon table with ID 11 is the bacterial codon table self.table = seq.CodonTable.load(1) self.ddict = {} # As the script uses symbol codes, each amino acid is represented by a # number between 0 and 19, instead of the single letter code for amino_acid_code in range(20): # Find all codons coding for the amino acid # The codons are also in symbol code format, e.g. ATG -> (0, 3, 2) codon_codes_for_aa = table[amino_acid_code] # Get the total amount of codon occurrences for the amino acid total = 0 for codon_code in codon_codes_for_aa: total += codon_counter[codon_code] # Replace the total frequencies with relative frequencies # and print it for codon_code in codon_codes_for_aa: # Convert total frequencies into relative frequencies self.codon_counter[codon_code] /= total # The rest of this code block prints the codon usage table # in human readable format amino_acid = seq.ProteinSequence.alphabet.decode(amino_acid_code) # Convert the symbol codes for each codon into symbols... # ([0,3,2] -> ['A' 'T' 'G']) codon = k12_genome.sequence.alphabet.decode_multiple(codon_code) # ...and represent as string # (['A' 'T' 'G'] -> "ATG") codon = "".join(codon) freq = codon_counter[codon_code] self.ddict.update({codon:[amino_acid,freq]}) #print(f"{amino_acid} {codon} {freq:.2f}") self.df_optCodons = pd.DataFrame(ddict).T self.df_optCodons = self.df_optCodons.rename(columns={0:'aa', 1:'freq'}) self.grp = df.groupby(by=['aa']).max() self.OptimalCodons = list(self.grp.codon.values) self.optimalcodons = OptCodons(self.OptimalCodons) self.translategenome = self.k12_genome.translate() @property def _optimalcodonview(self): out = "" head = self.name.upper() + " CODON TABLE\n- - - - - - - - - - - -\n" tb = str(self.optimalcodons) out += head out += tb print(out) @property def concat_cds(self): ids = self.translategenome[1] out = "" for start,stop in ids: bases = str(self.k12_genome[start:stop]) out += bases return bases

g1= Genome("U00096")

g1.k12_genome.translate()

mycodonTable = OptCodons (grp.codon.values) mycodonTable

g2 = Genome("NC_045512.2")

g2._optimalcodonview==g1._optimalcodonview

Protein to Optimized DNA

aastr = 'macvreqwwmhytk'.upper()

def aa_to_optimalDNA (aa_str, optCodons): dnastr = "" for x in aa_str: index = optCodons[0].index(x) dnastr += optCodons[1][index] return dnastr

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Protein to Optimized DNA

Protein to Optimized DNA