Counting "Digram" aka pairs of nucleotides in a for loop - python

I need to write a function that takes a fasta file and counts the digrams (AT, CG, TT, CC, etc.) in the file.
My for loop currently reads the file line by line and produces the counts for that line, then resets them on the next line. (The counts are kept in a dictionary.)
I want to carry the counts across lines, so I get a count for the whole file, not just for individual lines.
This is my code that I'm trying to fix:
dinucleotides = ['AA','AT','AG','AC',
                 'TA','TT','TG','TC',
                 'GA','GT','GG','GC',
                 'CA','CT','CG','CC']

all_counts = {}
with open('short.fasta', 'r') as dna_file:
    dna_file.readline()  # skip the FASTA header line
    for line in dna_file:
        my_line = line.strip()
        for pairs in dinucleotides:
            count = my_line.count(pairs)
            all_counts[pairs] = count  # overwrites the running total on every line
Thank you!

Add each line's count to the running total instead of overwriting it:
all_counts[pairs] = all_counts.get(pairs, 0) + count
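Applied to the original loop, a minimal sketch (keeping the same short.fasta file name and the header-skipping readline from the question) looks like this:

all_counts = {}
with open('short.fasta', 'r') as dna_file:
    dna_file.readline()  # skip the FASTA header line
    for line in dna_file:
        my_line = line.strip()
        for pairs in dinucleotides:
            # get() returns the total so far, or 0 the first time a pair is seen
            all_counts[pairs] = all_counts.get(pairs, 0) + my_line.count(pairs)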

You can use collections.defaultdict with int as its default_factory,
and change all_counts[pairs] = count to all_counts[pairs] += count:
from collections import defaultdict

dinucleotides = ['AA','AT','AG','AC',
                 'TA','TT','TG','TC',
                 'GA','GT','GG','GC',
                 'CA','CT','CG','CC']

all_counts = defaultdict(int)
with open('short.fasta', 'r') as dna_file:
    dna_file.readline()  # skip the FASTA header line
    for line in dna_file:
        my_line = line.strip()
        for pairs in dinucleotides:
            count = my_line.count(pairs)
            all_counts[pairs] += count  # missing keys start at 0 automatically
Or, use the dict.setdefault method. Note that setdefault returns a value, so it cannot be the target of += directly; use it to supply the default and then assign:
...
all_counts = {}
...
all_counts[pairs] = all_counts.setdefault(pairs, 0) + count

One idea is to initialize a Python dict mapping each 2-gram to zero and increment it as each line is read. Here, I will assume that the FASTA file only contains the bases "ATGC". Also note that iterating over each possible pair for each line requires 16 passes over the line, and str.count only counts non-overlapping occurrences (for example, 'AAA'.count('AA') is 1, even though there are two overlapping AA pairs). Both issues can be avoided by passing over each line once with a one-character look-ahead and counting each pair as it appears. Perhaps as follows:
import random
from collections import defaultdict

def random_dnukes(lines=1000, cols=40):
    return [''.join(random.choices('ATGC', k=cols)) for _ in range(lines)]

# e.g.
# ['TGACTCGTCAAAGGTACGTTAATCCTTGGGCAGTTACGGG',
#  'ATTGTTCAATCGAACGTTCGCTACTCGACTCGCGCCCCCT',
#  'TCCCGTGGGACAGGTTCCCAATTGACCGGACGCCGGACTC',
#  'TCGTCGTGCCCCGACATTGCTTCACGGCGGTGCGCGCTGG',
#  'GGTCCGGTCTAGGCGATCCCTAATAGTCAAGCACCGATTA',
#  'CCGAGCCTTGTGTATACTCTGTAAACACTTCTTCCCATAC',
#  'CGGATAGCAGCTAGTGGTTCCCGCAGTACAGGATGACCAA',
#  'CTCGGACGAGAAATCAGGCCAACCTCCACTGGCGACAGAA',
#  'TCTGACCTGCAGTGCAGTCCAGTTATAGTGGAACACCAGC',
#  'GTCAGCCCTTATCCGTTAGCCCAGGTGCCTCAATAGGAGG']
fake_file_iterator = iter(random_dnukes(1000, 40))

total_counts = defaultdict(int)
for line in fake_file_iterator:
    line = line.strip()
    for i in range(len(line) - 1):
        total_counts[line[i:i+2]] += 1  # look-ahead: count the pair starting at i

for k, v in total_counts.items():
    print(k, v)
Resulting in
GC 2497
CC 2382
CG 2444
GT 2422
TT 2508
TA 2373
AC 2466
GG 2408
TG 2473
CA 2462
AA 2412
CT 2448
AG 2454
GA 2470
TC 2400
AT 2381
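As a sanity check: 1000 lines of 40 characters give 39 pairs per line, or 39,000 pairs in total, so each of the 16 dinucleotides should appear roughly 39000 / 16 ≈ 2438 times, which matches the near-uniform counts above.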

Related

Counting the occurrence of two words at the beginning of a text file

I am writing Python code to read all lines in a text file (b.txt) and count those with special words (ATOM and HETATM) at the beginning of each line. The b.txt file is as follows:
REMARK 480 ATOM ZERO OCCUPANCY
ATOM 3332 CA GLY A 8 9.207 4.845 44.955 1.00 42.92 C
HETATM 2954 O HOH A 489 -17.507 4.101 8.012 1.00 53.13 O
and the code is:
pdb_text = open("b.txt","r")
data = pdb_text.read()
n_atoms = data.count("ATOM")
n_het_atom = data.count("HETATM")
total_atoms = n_atoms + n_het_atom
print('Number of atoms:', total_atoms)
I expect “2” as the output, but I get “3” instead.
You get 3 because data.count("ATOM") counts the substring "ATOM" anywhere in the file, and the REMARK line also contains the word ATOM. For counting only the lines that start with ATOM or HETATM you can use the string's startswith method. For example you can do:
data = ""
with open("b.txt","r") as file:
data = file.readlines()
counter = 0
for line in data.split('\n'):
if line.startswith("ATOM") or line.startswith("HETATM"):
counter = counter + 1
print('Number of atoms:', counter)
Using the following code, you can read a PDB file and count the number of atom records in it.
# open the pdb file
pdb_file = input("Enter the name of your PDB file: ")
pdb_text = open(pdb_file, "r")

# read contents of the pdb file into a string
data = pdb_text.read()

# count the lines starting with ATOM or HETATM
n_atoms = 0
for line in data.split('\n'):
    if line.startswith("ATOM") or line.startswith("HETATM"):
        n_atoms = n_atoms + 1
print('Number of atoms:', n_atoms)
Let's generalize it:
We want a function which takes a file_path and a list of words.
It should return a dictionary of the counts of lines starting with each word.
For the sum, we take the values of the dictionary and sum them; we can make that a function as well.
A function which returns a dictionary mapping each word to its count is usually called a frequency function. Since we count not overall occurrences but only those at the start of lines, we call it start_frequencies:
def start_frequencies(path, words):
    dct = {}
    with open(path) as f:
        for line in f:
            for word in words:
                if line.startswith(word):
                    dct[word] = dct.get(word, 0) + 1
    return dct
The magic is dct.get(word, 0), because it says: "If the word already exists as a key in the dictionary dct, take its value; else take 0 as the default count."
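A quick demonstration of that behaviour (the dictionary here is just a made-up example):

>>> d = {'ATOM': 2}
>>> d.get('ATOM', 0)
2
>>> d.get('HETATM', 0)
0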
Then, we can write the function which returns the sum of all start counts:
def sum_of_start_frequencies(path, words):
    dct = start_frequencies(path, words)
    return sum(dct.values())
So in your case, you use it like:
pdb_file = input("Enter the name of your PDB file: ")
sum_of_start_frequencies(pdb_file, ["ATOM", "HETATM"])
It should return 2.

Counting items in txt file with Python dictionaries

I have the following txt file (only a fragment is given):
## DISTANCE : Shortest distance from variant to transcript
## a lot of comments here
## STRAND : Strand of the feature (1/-1)
## FLAGS : Transcript quality flags
#Uploaded_variation Location Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation Extra
chr1_69270_A/G chr1:69270 G ENSG00000186092 ENST00000335137 Transcript upstream_gene_variant 216 180 60 S tcA/tcG - IMPACT=LOW;STRAND=1
chr1_69270_A/G chr1:69270 G ENSG00000186092 ENST00000641515 Transcript intron_variant 303 243 81 S tcA/tcG - IMPACT=LOW;STRAND=1
chr1_69511_A/G chr1:69511 G ENSG00000186092 ENST00000335137 Transcript upstream_gene_variant 457 421 141 T/A Aca/Gca - IMPACT=MODERATE;STRAND=1
with many various unknown ENSG numbers, such as ENSG00000187583, etc. Each ENSG identifier contains 11 digits.
I have to count how many intron_variant and upstream_gene_variant entries each gene (ENSGxxx) contains,
and output the result to a csv file.
I am using a dictionary for this purpose. The logic should be: if an 11-digit number is not in the dictionary, it should be added with value 1; if it is already in the dictionary, its value should be changed to x + 1. I currently have this code, but I am not really a Python programmer and am not sure about the correct syntax.
with open(file, 'rt') as f:
    data = f.readlines()

Count = 0
d = {}
for line in data:
    if line[0] == "#":
        output.write(line)
    if line.__contains__('ENSG'):
        d[line.split('ENSG')[1][0:11]] = 1
        if 1 in d:
            d = 1
        else:
            Count += 1
Any suggestions?
Thank you!
Can you try this:
from collections import Counter

with open('data.txt') as fp:
    ensg = []
    for line in fp:
        idx = line.find('ENSG')
        if not line.startswith('#') and idx != -1:
            ensg.append(line[idx+4:idx+15])  # the 11 digits after 'ENSG'
    count = Counter(ensg)
>>> count
Counter({'00000187961': 2, '00000187583': 2})
Update
I need to know how many ENSGs contain "intron_variant" and "upstream_gene_variant"
Use regex to extract desired patterns:
from collections import Counter
import re

PAT_ENSG = r'ENSG(?P<ensg>\d{11})'
PAT_VARIANT = r'(?P<variant>intron_variant|upstream_gene_variant)'
PATTERN = re.compile(fr'{PAT_ENSG}.*\b{PAT_VARIANT}\b')

with open('data.txt') as fp:
    ensg = []
    for line in fp:
        sre = PATTERN.search(line)
        if not line.startswith('#') and sre:
            ensg.append(sre.groups())  # (ensg digits, variant) tuple
    count = Counter(ensg)
Output:
>>> count
Counter({('00000186092', 'upstream_gene_variant'): 2,
('00000186092', 'intron_variant'): 1})
Here's another interpretation of your requirement.
I have modified your sample data so that the first ENSG value is ENSG00000187971, to highlight how this works.
D = {}
with open('eng.txt') as eng:
    for line in eng:
        if not line.startswith('#'):
            t = line.split()
            V = t[6]  # Consequence column (the variant type)
            E = t[3]  # Gene column (the ENSG value)
            if V not in D:
                D[V] = {}
            if E not in D[V]:
                D[V][E] = 1
            else:
                D[V][E] += 1
print(D)
The output of this is:
{'intron_variant': {'ENSG00000187971': 1, 'ENSG00000187961': 1}, 'upstream_gene_variant': {'ENSG00000187583': 2}}
So what you now have is a dictionary keyed by variant; each variant has its own dictionary keyed by ENSG value, holding the count of occurrences of that ENSG value.
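The question also asks for csv output, which none of the answers shows. A minimal sketch, assuming the nested dictionary D from the answer above and a hypothetical output path counts.csv:

import csv

# assumes D = {variant: {ensg: count, ...}, ...} as built above
with open('counts.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['gene', 'variant', 'count'])  # header row
    for variant, genes in D.items():
        for ensg, count in genes.items():
            writer.writerow([ensg, variant, count])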

Counting the number of occurrences of a string in a text file

I have a text file containing:
Rabbit:Grass
Eagle:Rabbit
Grasshopper:Grass
Rabbit:Grasshopper
Snake:Rabbit
Eagle:Snake
I want to count the number of occurrences of each string, say, the number of times each animal occurs in the text file, and print the count. Here's my code:
fileName = input("Enter the name of file:")
foodChain = open(fileName)

table = []
for line in foodChain:
    contents = line.strip().split(':')
    table.append(contents)

def countOccurence(l):
    count = 0
    for i in l:
        #I'm stuck here#
        count += 1
    return count
I'm unsure about how Python will count the occurrences in a text file. The output I want is:
Rabbit: 4
Eagle: 2
Grasshopper: 2
Snake: 2
Grass: 2
I just need some help with the counting part and I will be able to manage the rest of it. Regards.
What you need is a dictionary.
dictionary = {}
for line in table:
    for animal in line:
        if animal in dictionary:
            dictionary[animal] += 1
        else:
            dictionary[animal] = 1

for animal, occurrences in dictionary.items():
    print(animal, ':', occurrences)
A solution using the re.sub() and str.split() functions and the collections.Counter subclass:
import re, collections

with open(filename, 'r') as fh:
    # replace ':' and newlines with a common delimiter (space), then split
    contents = re.sub(r':|\n', ' ', fh.read()).split()
    counts = collections.Counter(contents)

# iterating through `animal` counts
for a in counts:
    print(a, ':', counts[a])
The output:
Snake : 2
Rabbit : 4
Grass : 2
Eagle : 2
Grasshopper : 2
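If you need the output ordered by count, Counter also provides a most_common() method (shown here on the counts object from the answer above; the order among equal counts may vary):

>>> counts.most_common()
[('Rabbit', 4), ('Grass', 2), ('Eagle', 2), ('Grasshopper', 2), ('Snake', 2)]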
Use in to test whether an element is contained in a list (in Python, a string can be tested for membership the same way). The stub can be completed like this:
def countOccurence(l):
    count = 0
    for pair in table:
        # 'in' checks membership: is the animal one of the two in this pair?
        if l in pair:
            count += 1
    return count
from collections import defaultdict

dd = defaultdict(int)
with open(fpath) as f:
    for line in f:
        # strip the trailing newline, or 'Grass' and 'Grass\n' count separately
        words = line.strip().split(':')
        for word in words:
            dd[word] += 1

for k, v in dd.items():
    print(k + ': ' + str(v))

How to write itertools.izip to a txt file one item per line?

My code
import itertools
import os

with open("base.txt") as fv, open("new2.txt", 'r') as fi, open('sortedf.txt', 'w') as fs:
    vel = (line.strip() for line in fv)
    ind = (int(line.strip()) for line in fi)
    z = itertools.izip(ind, vel)  # sort according to ind
    # itertools.izip(vel, ind)    # sort according to vel
    for i, v in sorted(z):
        fs.write(str(v))
I got everything on one line
2.900000e+032.900000e+032.900000e+032.900000e+032.
When I change to
fs.write('\n'.join(str(v)))
Then I got
2
.
9
0
0
0
0
0
e
+
0
32
.
9
0
0
0
0
0
e
+
0
32
.
How to get proper one by line value output?
Just change
for i, v in sorted(z):
    fs.write(str(v))
to
for i, v in sorted(z):
    print(v, file=fs)
A \n is added automatically thanks to print's default end="\n" parameter, and print works for any datatype, so there is no need for str(v). (Since the question uses izip and therefore Python 2, add from __future__ import print_function at the top of the file to enable this print syntax.)
Please try the following:
fs.writelines(map(lambda x: x[1] + '\n', sorted(z)))
(writelines does not add newlines itself, hence the + '\n'.)
Why the statement below failed:
fs.write('\n'.join(str(v)))
Here str(v) is a string, and join treats it as a sequence of characters, inserting '\n' between each of them. Look at the example below for clarity:
>>> sam = 'hello'
>>> '-'.join(sam)
'h-e-l-l-o'
So how to use fs.write?
Write the value: fs.write(v)
Then add the line break: fs.write('\n')
or do both in a single call: fs.write(v + '\n')
A few suggestions:
import os                   # load important modules first
from itertools import izip  # import just the name you need

with open("base.txt") as fv, open("new2.txt", 'r') as fi, open('sortedf.txt', 'w') as fs:
    vel = [line.strip() for line in fv]       # use square brackets (list comprehension)
    ind = [int(line.strip()) for line in fi]  # use square brackets
    z = izip(ind, vel)  # sort according to ind
    for i, v in sorted(z):
        fs.write(v)
        fs.write('\n')  # adding a line break
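A side note: itertools.izip exists only in Python 2; it was removed in Python 3, where the built-in zip is already lazy. A minimal Python 3 version of the same program (same file names as above):

# Python 3: built-in zip is lazy; itertools.izip no longer exists
with open("base.txt") as fv, open("new2.txt") as fi, open("sortedf.txt", "w") as fs:
    vel = (line.strip() for line in fv)
    ind = (int(line.strip()) for line in fi)
    for i, v in sorted(zip(ind, vel)):  # sort according to ind
        print(v, file=fs)               # print appends the newline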

generating a single outfile after analyzing multiple files in python

I have multiple files, each containing 8 or 9 columns.
For a single file: I have to read the last column, count the number of occurrences of each value in it, and then generate an outfile.
I have done it like this:
inp = open(filename, 'r').read().strip().split('\n')
out = open(filename, 'w')

from collections import Counter
C = Counter()
for line in inp:
    k = line.split()[-1]  # so as to read the last column
    C[k] += 1

for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
Now the problem is that this works fine when generating one output for one input. But I need to scan a directory using the glob.iglob function to collect all the files to use as input, run the above program on each file, and then write all the per-file results into a single OUTPUT file.
NOTE: while generating the single OUTPUT file, if any value repeats then, instead of writing the same entry twice, the counts should be summed. E.g. analysis of the 1st file generates:
123 6
111 5
0 6
45 5
and 2nd file generate:
121 9
111 7
0 1
22 2
in this case OUTPUT file must be written such a way that it contain:
123 6
111 12 #sum up count no. in case of similar value entry
0 7
45 5
22 2
I have written the program for single-file analysis BUT I'm stuck on the mass-analysis section.
Please help.
from collections import Counter
import glob

out = open('OUTPUT', 'w')  # open the combined output file under a fixed name,
                           # before filename gets bound by the loop below

C = Counter()
g_iter = glob.iglob('path_to_dir/*')
for filename in g_iter:
    f = open(filename, 'r')
    inp = f.read().strip().split('\n')
    f.close()
    for line in inp:
        k = line.split()[-1]  # so as to read the last column
        C[k] += 1

for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
After de-uglification:
from collections import Counter
import glob

def main():
    # create Counter
    cnt = Counter()
    # collect data
    for fname in glob.iglob('path_to_dir/*.dat'):
        with open(fname) as inf:
            cnt.update(line.split()[-1] for line in inf)
    # dump results (items() works in both Python 2 and 3)
    with open("summary.dat", "w") as outf:
        outf.writelines("{:5s} {:>5d}\n".format(val, num) for val, num in cnt.items())

if __name__ == "__main__":
    main()
Initialise an empty dictionary at the top of the program,
let's say dic = dict(),
and for each Counter update dic so that the values of shared keys are summed and new keys are added to dic.
To update dic use this:
dic = dict((n, dic.get(n, 0) + C.get(n, 0)) for n in set(dic) | set(C))
where C is the current Counter; after all the files are finished, write dic to the output file.
import glob
from collections import Counter

dic = dict()
g_iter = glob.iglob(r'c:\python32\fol\*')  # in a raw string, single backslashes suffice
for x in g_iter:
    lis = []
    with open(x) as f:
        inp = f.readlines()
    for line in inp:
        num = line.split()[-1]
        lis.append(num)
    C = Counter(lis)
    dic = dict((n, dic.get(n, 0) + C.get(n, 0)) for n in set(dic) | set(C))

for x in dic:
    print(x, '\t', dic[x])
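As an aside to the answer above: because C is a collections.Counter, the merge expression is not strictly needed; Counters support addition directly. A small sketch of the same accumulation (the two literal Counters are stand-ins for the per-file counts, using values from the question's example):

from collections import Counter

total = Counter()
for C in (Counter({'111': 5, '0': 6}), Counter({'111': 7, '0': 1})):
    total += C  # sums counts for shared keys, keeps new keys

print(total)  # Counter({'111': 12, '0': 7})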
I did it like this:
import glob
from collections import Counter

out = open("write.txt", 'a')
C = Counter()
for file in glob.iglob('temp*.txt'):
    for line in open(file, 'r').read().strip().split('\n'):
        k = line.split()[-1]  # so as to read the last column
        C[k] += 1

for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
