Related
I need to create a program that takes a file containing DNA and converts the open reading frame into protein data. I need to run the function once "ATG" occurs and until the stop codons "TAG" "TAA" or "TGA" occur.
I'm new to programming and this is what I have,
# Standard genetic code: DNA codon -> one-letter amino acid ('_' marks a stop).
# Named codon_table rather than `map` so the builtin map() is not shadowed.
codon_table = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
DNA = 'AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG'
STOP_CODONS = ('TAG', 'TAA', 'TGA')

# Codons of the three forward reading frames (offsets 0, 1 and 2).  The range
# ends at len(DNA) - 2 so a trailing fragment shorter than a codon is never
# produced (it would have no entry in codon_table).
DNAlist, DNAlist1, DNAlist2 = (
    [DNA[i:i + 3] for i in range(offset, len(DNA) - 2, 3)]
    for offset in range(3)
)

# Every 'ATG' opens a reading frame: translate from it up to, but not
# including, the first in-frame stop codon.  An ATG that never reaches a stop
# codon is skipped because its reading frame is not closed.
protein = []
for codons in (DNAlist, DNAlist1, DNAlist2):
    for start, first in enumerate(codons):
        if first != 'ATG':
            continue
        residues = []
        for codon in codons[start:]:
            if codon in STOP_CODONS:
                protein.append(''.join(residues))
                break
            residues.append(codon_table[codon])

for sequence in protein:
    print(sequence)
A sample output would be
MLLGSFRLIPKETLIQVAGSSPCNLS
M
MGMTPRLGLESLLE
MTPRLGLESLLE
Well, I tried this without using Biopython, went only for the forward strand (no reverse) and worked on the translated sequence.
I found two ways. I am sure both are non-optimal approaches, and I am waiting for somebody to post something better here; the answers in "How to find an open reading frame in Python" are probably the fastest ways.
The first one gives you ORFs even if there is no stop codon (the sequence does not terminate, so there is no '_' marking a stop codon):
mappy = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
# for i in mappy:
# print(mappy[i])
DNA = 'AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG'
DNAlist1 = []
DNAlist2 = []
DNAlist3 = []
# protein = []
def revProt(dna_list):
    """Translate the codons in dna_list and return the protein back to front.

    Only three-character entries are looked up in ``mappy``; shorter
    fragments are skipped.  The residues are joined in reverse order, so the
    last translated codon comes first in the returned string.
    """
    residues = [mappy[codon] for codon in dna_list if len(codon) == 3]
    return ''.join(residues[::-1])
# Cut DNA into three-character slices for each forward reading frame
# (offsets 0, 1 and 2).  The last slice of a frame may be shorter than three.
for i in range(0, len(DNA), 3):
DNAlist1.append(DNA[i:i+3])
for i in range(1, len(DNA), 3):
DNAlist2.append(DNA[i:i+3])
for i in range(2, len(DNA), 3):
DNAlist3.append(DNA[i:i+3])
# for i in [DNAlist1] : #, DNAlist2, DNAlist3]:
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the branches below cannot be confirmed from the text alone.
for i in [DNAlist1, DNAlist2, DNAlist3]:
# revProt returns the translated frame as a reversed string.
protein = revProt(i)
print(''.join(protein), type(''.join(protein)))
# seqs collects the finished ORF strings, orf the residues of the ORF being
# built, and j is the index into the reversed protein.
seqs = []
j = 0
orf = []
while True:
# Keep scanning while j is a valid index; otherwise leave the loop.
if j <= len(protein)-1:
if protein[j] == '_' :
# NOTE(review): orf[0] raises IndexError if orf is empty here - confirm that
# this cannot happen with the intended nesting.
if orf[0] == 'M':
orf.append('_')
# orf was filled while walking the reversed protein, so reverse it again to
# store the ORF in reading order.
seqs.append(''.join([i for i in reversed(orf)]))
orf = []
else :
orf = []
orf.append('_')
# Residues other than start/stop are collected as they come.
if protein[j] not in [ '_' , 'M'] :
orf.append(protein[j])
# Reaching an 'M' records the residues collected so far as an ORF.
if protein[j] == 'M':
orf.append(protein[j])
seqs.append(''.join([i for i in reversed(orf)]))
else :
break
j += 1
print(seqs, '\n')
output:
QSAVRIM_A_ELLSELGLRPTMGMYGSNAVHS <class 'str'>
['MIRVASQ', 'MTPRLGLESLLE_', 'MGMTPRLGLESLLE_'] -----> here sequences 1st is at the end of DNA so no stop
LH_ES_EPKNWFLS_DLDRP_GWTVQTL_MA <class 'str'>
['M_']
SISSPDNLSIGFSVRIWTAPDDGHLRL_SCP <class 'str'>
[]
second way even more cumbersome :
import itertools
mappy = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
DNA = 'AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG'
DNAlist1 = []
DNAlist2 = []
DNAlist3 = []
def Prot(dna_list):
    """Translate the codons in dna_list into a list of one-letter residues.

    Entries that are not exactly three characters long are skipped; every
    other entry is looked up in ``mappy``.
    """
    return [mappy[codon] for codon in dna_list if len(codon) == 3]
def Met(protein):
    """Return the indices, in ascending order, of every "M" in protein."""
    positions = []
    for index, residue in enumerate(protein):
        if residue == "M":
            positions.append(index)
    return positions
def Stop(protein):
    """Return the indices, in ascending order, of every "_" in protein."""
    positions = []
    for index, residue in enumerate(protein):
        if residue == "_":
            positions.append(index)
    return positions
# Fill the three frame lists with three-character slices of DNA, one list per
# forward reading-frame offset (0, 1, 2).
for frame_codons, offset in ((DNAlist1, 0), (DNAlist2, 1), (DNAlist3, 2)):
    frame_codons.extend(DNA[k:k + 3] for k in range(offset, len(DNA), 3))

for i in (DNAlist1, DNAlist2, DNAlist3):
    protein = Prot(i)
    protein_text = ''.join(protein)
    print(protein_text, type(protein_text))
    met = Met(protein)
    stop = Stop(protein)
    # Every (start, stop) index pair with the start ahead of the stop, in the
    # order itertools.product(met, stop) yields them.
    orf = [(m, s) for m in met for s in stop if m < s]
    print(orf)
    # Residues from each start up to, but not including, the paired stop.
    orf_p = [''.join(protein[begin:end]) for begin, end in orf]
    # A candidate that still contains '_' ran past an earlier stop; drop it.
    orf_pp = [candidate for candidate in orf_p if '_' not in candidate]
    print('orf_pp : ',orf_pp)
    print('______________')
output:
SHVANSGYMGMTPRLGLESLLE_A_MIRVASQ <class 'str'>
[(8, 22), (8, 24), (10, 22), (10, 24)]
orf_pp : ['MGMTPRLGLESLLE', 'MTPRLGLESLLE'] ----->here the sequences
______________
AM_LTQVTWG_PRDLD_SLFWNKPE_SE_HL <class 'str'>
[(1, 2), (1, 10), (1, 16), (1, 25), (1, 28)]
orf_pp : ['M']
______________
PCS_LRLHGDDPATWIRVSFGISLNDPSSIS <class 'str'>
[]
orf_pp : []
______________
A shorter (and probably faster) version, copied from "How to find an open reading frame in Python":
import re
mappy = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
# for i in mappy:
# print(mappy[i])
DNA = 'AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG'
def Prot(dna_list):
    """Translate the codons in dna_list into a list of one-letter residues.

    Entries that are not exactly three characters long are skipped; every
    other entry is looked up in ``mappy``.
    """
    return [mappy[codon] for codon in dna_list if len(codon) == 3]
# The outer zero-width lookahead lets overlapping and nested ORFs all match:
# capture from an ATG, lazily consuming whole codons, up to (not including)
# the first in-frame stop codon.
pattern = re.compile(r'(?=(ATG(?:...)*?)(?=TAG|TGA|TAA))')

# Built once instead of on every revcomp() call.  Lower-case bases are
# complemented too; any other character is passed through unchanged.
_COMPLEMENT = str.maketrans("ATGCatgc", "TACGtacg")


def revcomp(dna_seq):
    """Return the reverse complement of dna_seq, preserving letter case."""
    return dna_seq[::-1].translate(_COMPLEMENT)


def orfs(dna):
    """Return the set of ORF nucleotide strings found on both strands.

    The input is upper-cased first, because the pattern only recognises
    upper-case codons; lower-case input would otherwise silently yield an
    empty set.  Each ORF starts at an ATG and ends just before the first
    in-frame stop codon.  ATGs with no in-frame stop are not reported.
    """
    dna = dna.upper()
    return set(pattern.findall(dna) + pattern.findall(revcomp(dna)))
for j in orfs(DNA):
    # Slice the ORF into consecutive codons, translate them and print the
    # resulting protein on its own line.
    DNAlistz = [j[i:i + 3] for i in range(0, len(j), 3)]
    print(''.join(Prot(DNAlistz)))
print('+++++++++++++')
output this time with reverse strand translation too:
MGMTPRLGLESLLE
MTPRLGLESLLE
M
MLLGSFRLIPKETLIQVAGSSPCNLS
+++++++++++++
I keep getting the following error:
File "./translatetest.py", line 32, in <module>
residue = cdna[codon]
TypeError: string indices must be integers
I am trying to code a script that will output a single amino acid letter for each set of 3 letters when given a series of codons (sets of 3 letters). I also need it to break as soon as a STOP codon is received. Below is what I have right now. Help would be appreciated.
import sys
STOP = '*'
genetic_code = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':STOP, 'TAG':STOP,
'TGC':'C', 'TGT':'C', 'TGA':STOP, 'TGG':'W',
}
# Upper-case the sequence passed as the first command-line argument.
cdna = sys.argv[1].upper()
cdnLen = 3
# Consecutive cdnLen-character slices of cdna; the last slice may be shorter.
cdns = [ cdna[idx:idx+cdnLen] for idx in range(0, len(cdna), cdnLen) ]
prot = ''
for codon in cdns:
# NOTE: cdna is the input string and codon is a string too, so this index
# raises "TypeError: string indices must be integers".  The codon -> residue
# mapping is held in genetic_code, not in cdna.
residue = cdna[codon]
# Stop translating at the first stop marker.
if residue == STOP:
break
prot += residue
print(prot)
Your problem is the following line:
residue = cdna[codon]
You should access the genetic_code to map a triplet to amino acid, so changing it to:
residue = genetic_code[codon]
You can condense your protein string to a single line with the following:
# Split on the STOP marker itself ('*'), not on the literal text 'STOP', so
# everything from the first stop codon onward is discarded.
prot = ''.join(genetic_code[codon] for codon in cdns).split(STOP)[0]
But this may not be as efficient as checking for a stop and then breaking.
I have the following list in python.
list = ['10♠', '10♣', '2♡', '4♠', '4♣', '5♡', '5♣', '6♡', '6♣', '7♠', '7♡', '7♢', '7♣', '8♡', '8♢', '8♣', '9♡', '9♢', '9♣', 'A♠', 'A♢', 'A♣', 'J♢', 'K♠', 'K♢', 'Q♡']
how can I remove pairs from this? As an example, if a number appears an odd number of times, the last appearance of it should be kept. All others should be removed. Including all the ones that appear an even number of times.
ex: From '9♡', '9♢', '9♣', only the '9♣' should be kept.
Can someone help me with this?
I tried to use the below code to identify indices first. But still no luck.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements below cannot be confirmed from the text alone.
# NOTE(review): the list shown in the question is bound to the name `list`;
# `deck` is not defined in the visible code.
i = 0
while i < len(deck):
# count tallies the matches found for deck[i]; pivot remembers the index of
# the most recent match.
count = 0
k = len(deck[i])
pivot = i
j = i
while j < len(deck):
# Both slices are cut with k, the length of deck[i].  If deck[j] has a
# different length, its slice does not simply drop the last character.
if deck[i][:k-1] == deck[j][:k-1]:
print(deck[i]+','+deck[j])
count+= 1
pivot = j
j+=1
# An odd count means one card of this group is left unpaired.
if (count %2 != 0):
print('pivot:'+str(pivot))
i = pivot +1
i +=1
No need to consider symbols. Just want to remove pairs from the list.
Please provide your suggestions.
Is this what you are looking for?
from collections import defaultdict
deck = ['10♠', '10♣', '2♡', '4♠', '4♣', '5♡', '5♣', '6♡', '6♣', '7♠', '7♡', '7♢', '7♣', '8♡', '8♢', '8♣', '9♡', '9♢', '9♣', 'A♠', 'A♢', 'A♣', 'J♢', 'K♠', 'K♢', 'Q♡']

# Bucket the cards by rank, i.e. everything except the trailing suit symbol.
groups = defaultdict(list)
for card in deck:
    groups[card[:-1]].append(card)

# A rank seen an odd number of times contributes its last card; a rank seen
# an even number of times contributes nothing.
new_deck = [
    subgroup[-1] for subgroup in groups.values() if len(subgroup) % 2 == 1
]

for card in new_deck:
    print(card)
Output
2♡ 8♣ 9♣ A♣ J♢ Q♡
Edit: A minor simplification to the second iteration with groups.values, thanks to RoadRunner.
Group the card pairs into a collections.defaultdict, then only return the last card from uneven pairs in a new list using a list comprehension:
from collections import defaultdict
lst = ['10♠', '10♣', '2♡', '4♠', '4♣', '5♡', '5♣', '6♡', '6♣', '7♠', '7♡', '7♢', '7♣', '8♡', '8♢', '8♣', '9♡', '9♢', '9♣', 'A♠', 'A♢', 'A♣', 'J♢', 'K♠', 'K♢', 'Q♡']

# Group the cards by rank (the card text without its final suit character).
cards = defaultdict(list)
for card in lst:
    rank = card[:-1]
    cards[rank].append(card)

# Keep the last card of every rank that occurs an odd number of times.
result = []
for pairs in cards.values():
    if len(pairs) % 2:
        result.append(pairs[-1])
print(result)
Output:
['2♡', '8♣', '9♣', 'A♣', 'J♢', 'Q♡']
Keeping the same order, you can use:
import re
l = ['10♠', '10♣', '2♡', '4♠', '4♣', '5♡', '5♣', '6♡', '6♣', '7♠', '7♡', '7♢', '7♣', '8♡', '8♢', '8♣', '9♡', '9♢', '9♣', 'A♠', 'A♢', 'A♣', 'J♢', 'K♠', 'K♢', 'Q♡']

# nc is a stack of surviving cards and nl the parallel stack of their ranks.
# The leading 0 in nl is a sentinel so that nl[-1] always exists; it never
# equals a rank string, so it is never popped.
nc, nl = [], [0]
for x in l:
    # Strip everything except capital letters and digits, leaving the rank.
    clean = re.sub(r"[^A-Z\d]", "", x)
    if clean == nl[-1]:
        # Same rank as the card on top of the stack: the two cancel out.
        nl.pop()
        nc.pop()
        continue
    nl.append(clean)
    nc.append(x)
print(nc)
# ['2♡', '8♣', '9♣', 'A♣', 'J♢', 'Q♡']
Demo
First of all, list is the name of a built-in type (not a reserved keyword). You should still never name your variables after built-ins, because doing so shadows them; use lst instead of list.
Now, Here is the minimal solution:
lst = ['10♠', '10♣', '2♡', '4♠', '4♣', '5♡', '5♣', '6♡', '6♣', '7♠', '7♡', '7♢', '7♣', '8♡', '8♢', '8♣', '9♡', '9♢', '9♣', 'A♠', 'A♢', 'A♣', 'J♢', 'K♠', 'K♢', 'Q♡']

# One slot per rank, initially None, meaning "no unpaired card of this rank".
dictionary = dict.fromkeys(list('A23456789JQK') + ['10'])
for item in lst:
    rank = item[:-1]
    # Toggle the slot: an empty slot takes the card, a filled slot is cleared
    # because the new card pairs off with the stored one.  After the whole
    # list, a slot holds the last card of a rank seen an odd number of times.
    dictionary[rank] = item if dictionary[rank] is None else None

# Explicit None test.  filter(None.__ne__, ...) only worked because
# None.__ne__(card) returns NotImplemented, whose use in a boolean context is
# deprecated since Python 3.9.
result = [card for card in dictionary.values() if card is not None]
print(result)
output:
['A♣', '2♡', '8♣', '9♣', 'J♢', 'Q♡']
I have DNA data in Phylip format that I would like to translate to amino acid. I've tried searching for libraries (or modules) that can do this but all of which seem to translate/produce files in FastA format.
This is how the input data looks:
3 1500
seq1 TTTGCTA...
seq2 TTCGCAA...
seq3 TTTGCCA...
where 1500 is the length of the sequences
This is the code I have but the output file I'm getting is empty:
#!/usr/bin/python
import sys
filename = '/path/to/phylip/data/'
finalrst = open('/path/to/translated/phylip/data/','w')
# Intended to translate a DNA string into a protein string, three bases at a
# time, using the codon table below.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements after the table cannot be confirmed from the text alone.
def translate_dna(sequence):
# NOTE: this table has 60 entries; ACA, ATA, ATC and ATT are missing.
codontable = {
'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W',
'ATG':'M'
}
proteinsequence = ''
# Step through the sequence three bases at a time; slices that are not keys
# of codontable are skipped.
for n in range (0,len(sequence),3):
if sequence[n:n+3] in codontable:
# NOTE: the membership test above uses `sequence`, but this lookup uses
# `cds`, a name that is not defined anywhere in the visible code.
proteinsequence += codontable[cds[n:n+3]]
# NOTE(review): clears the input; if this line sits inside the loop, every
# later slice is empty - confirm the intended placement.
sequence = ''
# NOTE: Python 2 print statement.  No return statement is visible, so the
# caller would receive None rather than the translated string.
print proteinsequence
# Copy the header and blank lines through, and append a translation to each
# sequence line.
# NOTE: the file object returned by open(filename) is never closed explicitly.
for line in open(filename):
# NOTE: line[0] is a single character, so it can never equal "3 1500".
if line[0] == "3 1500":
finalrst.write(line)
# NOTE: lines read from a file keep their newline, so a blank line is '\n',
# not ''.
elif line == '':
finalrst.write(line)
# NOTE: the sample input lines start with 'seq1', 'seq2', ... and none starts
# with 'sequence', so this branch does not match them.
elif line.startswith('sequence'):
finalrst.write(line + translate_dna(line.replace('sequence', '')))
finalrst.close()
Any suggestions on what the issue is? or perhaps a better way of getting at this task?
Thanks!
Are those filepaths to folders or a file? If data is a file, take the / off the end. If it's a folder path then you need to specify the file you want to open.
Then, just for the sake of sanity change this:
for line in open(filename):
if line[0] == "3 1500":
finalrst.write(line)
elif line == '':
finalrst.write(line)
elif line.startswith('sequence'):
finalrst.write(line + translate_dna(line.replace('sequence', '')))
To something like this:
with open(filename, 'r') as readfile:
    for line in readfile:
        line = line.strip()
        # Check the full line, stripped, instead of the first character.
        # The header and blank lines are copied through unchanged.
        if line == "3 1500" or line == '':
            finalrst.write(line + '\n')
        # The sample records look like "seq1 TTTGCTA...": a name starting
        # with 'seq', whitespace, then the bases.  Translate only the bases
        # (the last whitespace-separated field), not the name.
        elif line.startswith('seq'):
            finalrst.write(line + '\t' + translate_dna(line.split()[-1]) + '\n')
This way the readfile file handle will always get closed.
But it's probably because of the filepaths pointing to a folder. If it's not that it may be because of the lingering file handle that was opened, but not closed.
Your translate_dna doesn't appear to work. Here is a working approach albeit not the most optimal
def translate_dna(sequence):
    """Translate a DNA sequence into a one-letter protein string.

    The sequence is upper-cased and read codon by codon from its first base
    (reading frame 0).  Stop codons are emitted as '*'.  Codons that are not
    in the standard table (for example ones containing N or a gap character)
    are emitted as 'X' so that residue positions stay aligned with the DNA.
    A trailing fragment shorter than three bases is ignored.

    Args:
        sequence: DNA string made of the letters A, C, G and T in any case.

    Returns:
        The translated protein as a string, one character per codon.
    """
    sequence = sequence.upper()
    codontable = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
        'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W',
    }
    # Always advance by a whole codon.  Stepping one base after a hit, as the
    # previous version did, translates overlapping triplets from all three
    # reading frames.
    return ''.join(
        codontable.get(sequence[n:n + 3], 'X')
        for n in range(0, len(sequence) - 2, 3)
    )
Also, there are other problems. For example:
elif line.startswith('sequence'):
finalrst.write(line + translate_dna(line.replace('sequence', '')))
There is no string 'sequence' in your input. Make it:
elif line.startswith('seq'):
finalrst.write(line + '\t' + translate_dna(line.split()[-1]))
I'm in the middle of an exercise to translate DNA into protein. I've got a dictionary "codons" that I want to iterate through and match keys to elements in a list "plist." And then, of course, print the value only to a new element ("protein") that I'll concatenate later, once I get this working :P.
My problem is that the code after the dictionary doesn't print anything, though it doesn't throw an error either. I've tried adding a return statement to see if that would fix it but nah. Hints/guidance/halp? Am I going about this pythonically?
# Codon width.
n=3
protein = []
# Consecutive n-character slices of DNA_input; the last slice may be shorter.
# NOTE(review): DNA_input is not defined in the visible code.
plist = [DNA_input[i:i+n] for i in range(0, len(DNA_input), n)]
# The last slice has length 3 only when the input length is a multiple of 3;
# otherwise it is an incomplete codon and is dropped.
# NOTE: plist[-1] raises IndexError when DNA_input is empty.
if len(plist[-1]) % 3 == 0:
print("Sequence length OK")
else:
print("Taking a bit off the end...")
plist.pop()
print(plist)
codons = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements below cannot be confirmed from the text alone.
for key, value in codons.items():
# NOTE: this is a chained comparison, evaluated as
# (key in codons) and (codons in plist).  plist holds strings, so the
# dictionary `codons` is never an element of it and the test is always False.
if key in codons in plist:
# NOTE(review): `return` is only valid inside a function; the enclosing def
# is not shown here.  When reached, it ends the call immediately.
return value
protein.append(value)
print(protein)
You're bailing out on the line:
return value
the program never reaches the line print(protein)
In addition to that, most chances are that in the following line:
if key in codons in plist:
you're not doing what you think you're doing.
Since you posted only part of the original code, it's not easy to debug it. You should look at the full error (stacktrace), which line generates it and go from there.