! pip install biopython
! pip install python-Levenshtein
Successfully installed biopython-1.78
Successfully installed python-Levenshtein-0.12.2
from Bio import Entrez, SeqIO
from difflib import SequenceMatcher
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Levenshtein import distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
# NCBI requires a contact email with every E-utilities request
# (replace with your own address).
Entrez.email = "A.N.Other@example.com"

# Search for up to 100,000 human nucleotide records, then fetch them as FASTA.
handle = Entrez.esearch(db="nucleotide",
                        term='"Homo sapiens"[ORGN]',
                        retmax=100000)
record = Entrez.read(handle, validate=False)
idlist = record['IdList']

handle2 = Entrez.efetch(db="nucleotide",
                        id=idlist,
                        rettype="fasta",
                        retmode="text")

# Columns: 0 = accession id, 1 = raw sequence,
# 2 = description truncated at the first comma.
l = []
for seq_record in SeqIO.parse(handle2, 'fasta'):
    l.append([seq_record.id,
              str(seq_record.seq),
              seq_record.description.split(',')[0]])
a = pd.DataFrame(l)
a
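Fetching 100,000 records from Entrez is slow and rate-limited by NCBI, so it can help to cache the DataFrame locally and reload it in later sessions. A minimal sketch (the file name is an arbitrary choice); pickle preserves the integer column labels that the cells below rely on:
# Cache the fetched records so the slow Entrez query is not repeated.
a.to_pickle("homo_sapiens_records.pkl")
# a = pd.read_pickle("homo_sapiens_records.pkl")  # reload on later runs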
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

# Bag-of-words counts over the record descriptions (column 2).
tf_vectorizer = CountVectorizer(max_df=0.95,
                                min_df=2,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(a[2])
# (newer scikit-learn versions rename this to get_feature_names_out)
tf_feature_names = tf_vectorizer.get_feature_names()
from sklearn.decomposition import LatentDirichletAllocation

# With no arguments, LatentDirichletAllocation fits n_components=10 topics.
lda = LatentDirichletAllocation()

# Optional hyperparameter search (commented out):
# parameters = {'n_components': range(5, 100, 50),
#               'max_iter': range(1, 100, 50),
#               'learning_offset': range(10, 500, 100),
#               'learning_decay': [0.5, 0.6, 0.7, 0.8, 0.9, 1]}
# clf = GridSearchCV(lda, parameters, verbose=3, cv=2)
lda.fit(tf)
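Since the grid search stays commented out, one quick sanity check on the fit is the model's perplexity on the training counts; lower is better, though it is only a rough guide for topic models:
print(lda.perplexity(tf))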
def display_topics(model, feature_names, no_top_words):
    # Print the top-weighted words for each fitted topic.
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        print(" ".join(feature_names[i]
                       for i in topic.argsort()[:-no_top_words - 1:-1]))

no_top_words = 10
display_topics(lda, tf_feature_names, no_top_words)
Topic 0:
homo sapiens methyltransferase family member factor subunit alpha 21a domain
Topic 1:
sapiens homo protein containing domain 63 coiled coil zinc centrosomal
Topic 2:
sequence patent wo2021048544 ep3793521 21 18 wo2021084021 41 39 47
Topic 3:
patent sequence ep3802856 wo2021055350 ep3801574 ep3814502 wo2021069654 ep3819387 10 15
Topic 4:
sequence patent ep3793521 ep3816301 wo2021083572 wo2021078910 ep3801574 ep3802856 wo2021078645 ep3813881
Topic 5:
region sapiens homo mrna chain variable anti immunoglobulin isolate peanut
Topic 6:
receptor 16 sapiens homo gene member isolate tas2r16 taste 35
Topic 7:
homo sapiens 13 chain transacylase dihydrolipoamide 86 pdb 6d semaphorin
Topic 8:
sequence patent ep3801574 ep3816283 wo2021083572 ep3816301 ep3819386 wo2021058543 19 29
Topic 9:
chain branched mrna sapiens homo dehydrogenase isolate acid e1 keto
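To see how these topics spread over the corpus, lda.transform returns each description's topic mixture, and the argmax gives its dominant topic. A minimal sketch:
doc_topics = lda.transform(tf)              # (n_docs, n_topics) mixture weights
dominant_topic = doc_topics.argmax(axis=1)  # one dominant topic per record
print(pd.Series(dominant_topic).value_counts())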
# Vectorising the text descriptions (column 2)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(a[2])

# Dimensionality reduction: truncated SVD to 20 components,
# then t-SNE down to 2-D for clustering and plotting.
svd = TruncatedSVD(n_components=20)
X = svd.fit_transform(X)
tsne = TSNE(n_components=2)
df = tsne.fit_transform(X)  # 2-D embedding (a NumPy array, not a DataFrame)

# Density-based clustering on the embedding; label -1 marks noise points.
clf = DBSCAN(eps=1,
             min_samples=25)
label = clf.fit_predict(df)
set(label)
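The eps=1 above is a hand-picked value. A common heuristic is to plot each point's sorted distance to its k-th nearest neighbour (with k = min_samples) and read eps off the elbow of that curve. A minimal sketch:
from sklearn.neighbors import NearestNeighbors

k = 25  # match min_samples
nn = NearestNeighbors(n_neighbors=k).fit(df)
dists, _ = nn.kneighbors(df)
plt.plot(np.sort(dists[:, -1]))  # distance to the k-th neighbour, sorted
plt.ylabel("distance to %d-th neighbour" % k)
plt.show()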
# Repeat the same pipeline on the raw nucleotide sequences (column 1);
# these labels overwrite the description-based ones and are used below.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(a[1])

# Dimensionality reduction as before: truncated SVD, then 2-D t-SNE.
svd = TruncatedSVD(n_components=20)
X = svd.fit_transform(X)
tsne = TSNE(n_components=2)
df = tsne.fit_transform(X)

# Density-based clustering on the embedding.
clf = DBSCAN(eps=1,
             min_samples=25)
label = clf.fit_predict(df)
set(label)
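One caveat: with the default word tokenizer, an unbroken DNA string is at most a single token, so the TF-IDF above mostly separates exact or near-exact duplicate records. A character n-gram ("k-mer") representation is the more conventional choice for nucleotide sequences; a minimal sketch, with k=4 as an arbitrary choice:
kmer_vec = TfidfVectorizer(analyzer='char', ngram_range=(4, 4))
X_kmer = kmer_vec.fit_transform(a[1])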
from mpl_toolkits import mplot3d  # needed only for the commented-out 3-D variant

# Plot the 2-D t-SNE embedding, coloured by DBSCAN cluster
# (noise points, label -1, are skipped).
u_labels = np.unique(label)
fig = plt.figure(figsize=(10, 10))
ax = plt.axes()  # pass projection='3d' for the 3-D scatter variant

for i in u_labels:
    if i != -1:
        print(len(df[label == i, 0]))  # cluster size
        # ax.scatter3D(df[label == i, 0],
        #              df[label == i, 1],
        #              df[label == i, 2],
        #              label=i)
        plt.scatter(df[label == i, 0],
                    df[label == i, 1],
                    label=i)
plt.legend()
plt.show()
30
153
# For each cluster, find the longest common substring that occurs most often
# across all pairs of member sequences (quadratic in cluster size).
for i in u_labels:
    if i != -1:
        print(i)
        subset = a[label == i]
        names = list(subset[1])
        substring_counts = {}
        for k in range(len(names)):
            for j in range(k + 1, len(names)):
                string1 = names[k]
                string2 = names[j]
                match = SequenceMatcher(None, string1, string2).find_longest_match(
                    0, len(string1), 0, len(string2))
                matching_substring = string1[match.a:match.a + match.size]
                if matching_substring and len(matching_substring) > 3:
                    if matching_substring not in substring_counts:
                        substring_counts[matching_substring] = 1
                    else:
                        substring_counts[matching_substring] += 1
        max_occurring_substring = max(substring_counts.items(),
                                      key=operator.itemgetter(1))
        print(max_occurring_substring)
0
('CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACATTCAGTAGCTATGCTATACACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCACTTATATCATCTGATGGAAGTTATAAAACCTACGCGGACTCCGTAAAGGGCCGATTTACCATCTCCAGAGACAATTCCAAGCACACGCTGTATCTGCAAATGAGCAGCCTTAGAACTGAGGACACGGCTGTCTATTACTGTGCGAGAGACTCATCGGCGCTCGAGATTTACAACAGGTTCGACCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG', 435)
1
('ATGATACCCATCCAACTCACTGTCTTCTTCATGATCATCTATGTGCTTGAGTCCTTGACAATTATTGTGCAGAGCAGCCTAATTGTTGCAGTGCTGGGCAGAGAATGGCTGCAAGTCAGAAGGCTGATGCCTGTGGACATGATTCTCATCAGCCTGGGCATCTCTCGCTTCTGTCTACAGTGGGCATCAATGCTGAACAATTTTTGCTCCTATTTTAATTTGAATTATGTACTTTGCAACTTAACAATCACCTGGGAATTTTTTAATATCCTTACATTCTGGTTAAACAGCTTGCTTACCGTGTTCTACTGCATCAAGGTCTCTTCTTTCACCCATCACATCTTTCTCTGGCTGAGGTGGAGAATTTTGAGGTTGTTTCCCTGGATATTACTGGGTTCTCTGATGATTACTTGTGTAACAATCATCCCTTCAGCTATTGGGAATTACATTCAAATTCAGTTACTCACCATGGAGCATCTACCAAGAAACAGCACTGTAACTGACAAACTTGAAAATTTTCATCAGTATCAGTTCCAGGCTCATACAGTTGCATTGGTTATTCCTTTCATCCTGTTCCTGGCCTCCACCATCTTTCTCATGGCATCACTGACCAAGCAGATACAACATCATAGCACTGGTCACTGCAATCCAAGCATGAAAGCGCACTTCACTGCCCTGAGGTCCCTTGCCGTCTTATTTATTGTGTTTACCTCTTACTTTCTAACCATACTCATCACCATTATAGGTACTCTATTTGATAAGAGATGTTGGTTATGGGTCTGGGAAGCTTTTGTCTATGCTTTCATCTTAATGCATTCCACTTCACTGATGCTGAGCAGCCCTACGTTGAAAAGGATTCTAAAGGGAAAGTGCTAG', 11628)
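The Levenshtein import at the top is otherwise unused; as a rough cross-check that each cluster really groups near-identical sequences, the mean pairwise edit distance within a small sample of each cluster can be computed. A sketch (the sample size of 20 is arbitrary, since full pairwise comparison is quadratic):
for i in u_labels:
    if i != -1:
        seqs = list(a[label == i][1])[:20]  # small sample keeps this cheap
        dists = [distance(seqs[k], seqs[j])
                 for k in range(len(seqs))
                 for j in range(k + 1, len(seqs))]
        print(i, np.mean(dists))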