import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

bc_set = pd.read_csv('Regex_1_full_counts.csv') # Barcodes extracted from Levy et al. 2015 data
bc_set.sort_values(by='counts', ascending=False).head(3)

BCobject | countsint64 | ||
---|---|---|---|

2297 | TTAATAAACAAGAAACGGGTTGTGGA | 175959 | |

1653 | TAGACAAGTCTCAAGGTTGTTAATAC | 82082 | |

1825 | CTGTTAATGGGCAACAGGCTTGGAGG | 53276 | |

3 rows × 2 columns |

bc_count_dict = {i[0]:i[1] for i in np.array(bc_set[['BC', 'counts']])}

bc = 'TTAATAAACAAGAAACGGGTTGTGGA'
perfect_reads = bc_count_dict[bc]
print(perfect_reads, 'reads')
# calculating the error rate of every possible SNP
snps_freqs = []
for i in range(len(bc)):
for letter in ['A', 'T', 'C', 'G']:
if letter != bc[i]:
error = bc[:i]+letter+bc[i+1:]
if error in bc_count_dict:
snps_freqs.append(bc_count_dict[error]/perfect_reads)
plt.hist(snps_freqs)
plt.xlabel('Error rate')
sns.despine()
print('Total error rate from single snps:', np.sum(snps_freqs))

```
175959 reads
Total error rate from single snps: 0.011587926732932104
```

bc = 'TTCTCAAAAAAAAAATCCGTTTTGTT'
bc_missing_1 = 'TTCTCAAAAAAAAATCCGTTTTGTT'
bc_plus_1 = 'TTCTCAAAAAAAAAAATCCGTTTTGTT'
perfect_reads = bc_count_dict[bc]
print(perfect_reads, 'reads')
print('Error rate missing one A:', bc_count_dict[bc_missing_1]/perfect_reads)
print('Error rate plus one A:', bc_count_dict[bc_plus_1]/perfect_reads)

```
1055 reads
Error rate missing one A: 0.07298578199052133
Error rate plus one A: 0.03033175355450237
```

bc = 'TGAAAAAAAAACAAGATATTTAGTTT'
bc_missing_1 = 'TGAAAAAAAACAAGATATTTAGTTT'
bc_plus_1 = 'TGAAAAAAAAAACAAGATATTTAGTTT'
print('Error rate missing one A:', bc_count_dict[bc_missing_1]/bc_count_dict[bc])
print('Error rate plus one A:', bc_count_dict[bc_plus_1]/bc_count_dict[bc])

```
Error rate missing one A: 0.014934660858743
Error rate plus one A: 0.03235843186060983
```

bc_set = bc_set.sort_values(by='counts', ascending=False)
error_rate_dict = dict()
for letter in ['A', 'T']:
for i in range(5,12):
run = letter*i
td = bc_set[(bc_set['BC'].str.contains(run)) & (~bc_set['BC'].str.contains(run+letter))]
if len(td) > 0:
plus_rate, minus_rate = np.nan, np.nan
for bc, count in np.array(td[['BC', 'counts']])[:1]:
plus_error = bc[:bc.index(run)]+letter+bc[bc.index(run):]
minus_error = bc[:bc.index(run)]+bc[bc.index(run)+1:]
if plus_error in bc_count_dict:
plus_rate = bc_count_dict[plus_error]/count
if minus_error in bc_count_dict:
minus_rate = bc_count_dict[minus_error]/count
error_rate_dict[run] = [plus_rate, minus_rate]

mat = []
for run in error_rate_dict:
mat.append([run] + error_rate_dict[run])
errors = pd.DataFrame(mat, columns=['Run', 'Plus_base_error_rate', 'Minus_base_error_rate'])
errors['Base'] = errors['Run'].str[0]
errors['Run Length'] = errors['Run'].apply(lambda r: len(r))
f, subs = plt.subplots(1, 2, figsize=(9, 4))
sns.lineplot(data=errors, x='Run Length', y='Plus_base_error_rate', hue='Base', ax=subs[0])
sns.lineplot(data=errors, x='Run Length', y='Minus_base_error_rate', hue='Base', ax=subs[1])
sns.despine()