Multiple sequence alignment and dendogram

Coronaviruses
In [8]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import SeqIO
from Bio.SeqUtils import GC
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio import Phylo
import os
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models.ranges import FactorRange
output_notebook()
Loading BokehJS …
In [9]:
seq_ids=['AY274119','AY278488.2','MG772933','MN908947','MN988668', 'MN988669']
fasta_sequences=[]
for id in seq_ids:
    fasta_sequences.append(SeqIO.read("./Data/Coronavirus_genomes/"+id+".fna", "fasta"))
fasta_sequences
Out[9]:
[SeqRecord(seq=Seq('ATATTAGGTTTTTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGA...AAA', SingleLetterAlphabet()), id='AY274119.3', name='AY274119.3', description='AY274119.3 Severe acute respiratory syndrome-related coronavirus isolate Tor2, complete genome', dbxrefs=[]),
 SeqRecord(seq=Seq('CCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGATCTGTTCTCTAAACGAACT...AAA', SingleLetterAlphabet()), id='AY278488.2', name='AY278488.2', description='AY278488.2 SARS coronavirus BJ01, complete genome', dbxrefs=[]),
 SeqRecord(seq=Seq('ATATTAGGTTTTTACCTTCCCAGGTAACAAACCAACTAACTCTCGATCTCTTGT...AAA', SingleLetterAlphabet()), id='MG772933.1', name='MG772933.1', description='MG772933.1 Bat SARS-like coronavirus isolate bat-SL-CoVZC45, complete genome', dbxrefs=[]),
 SeqRecord(seq=Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA', SingleLetterAlphabet()), id='MN908947.3', name='MN908947.3', description='MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome', dbxrefs=[]),
 SeqRecord(seq=Seq('TTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTA...AAA', SingleLetterAlphabet()), id='MN988668.1', name='MN988668.1', description='MN988668.1 Severe acute respiratory syndrome coronavirus 2 isolate 2019-nCoV WHU01, complete genome', dbxrefs=[]),
 SeqRecord(seq=Seq('TTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTA...AAA', SingleLetterAlphabet()), id='MN988669.1', name='MN988669.1', description='MN988669.1 Severe acute respiratory syndrome coronavirus 2 isolate 2019-nCoV WHU02, complete genome', dbxrefs=[])]
In [10]:
#GC content comparison
gcContents =[]
for sequence in fasta_sequences:
    gcContents.append(GC(sequence.seq))
In [11]:
gcPlot = figure(x_range=FactorRange(factors=seq_ids), height=400, width=700, title='GC content %')
gcPlot.vbar(x=seq_ids, top=gcContents, width=0.75)
gcPlot.title.text_font_size = '16pt'
gcPlot.title.align='center'
show(gcPlot)
In [12]:
SeqIO.write(fasta_sequences, 'coronaviruses.fasta', 'fasta')
Out[12]:
6
In [13]:
#cmd =  ClustalwCommandline("clustalw2", infile="coronaviruses.fasta")
In [14]:
#import os
#stdout, stderr = cmd()
In [15]:
from Bio import AlignIO
alignment = AlignIO.read("coronaviruses.aln","clustal")
print(alignment[:, :100])
SingleLetterAlphabet() alignment with 6 rows and 100 columns
-TTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTC...CTC MN988668.1
-TTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTC...CTC MN988669.1
ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTC...CTC MN908947.3
ATATTAGGTTTTTACCTTCCCAGGTAACAAACCAACTAACTCTC...CTT MG772933.1
ATATTAGGTTTTTACCTACCCAGGAAA--AGCCAACCAAC-CTC...CTC AY274119.3
-------------------CCAGGAAA--AGCCAACCAAC-CTC...CTC AY278488.2
In [16]:
from Bio import Phylo
tree = Phylo.read("coronaviruses.dnd", "newick")
tree.name = 'Simple tree of three related coronaviruses'
tree.name
Out[16]:
'Simple tree of three related coronaviruses'
In [17]:
Phylo.draw_ascii(tree)
                                                                  , AY274119.3
                       ___________________________________________|
  ____________________|                                           | AY278488.2
 |                    |
 |                    |_________________ MG772933.1
 |
 | MN908947.3
_|
 | MN988668.1
 |
 | MN988669.1

In [18]:
fig = Phylo.draw(tree)