How to write itertools.izip output to a txt file, one value per line? - python

My code
import itertools
import os

with open("base.txt") as fv, open("new2.txt", 'r') as fi, open('sortedf.txt', 'w') as fs:
    vel = (line.strip() for line in fv)
    ind = (int(line.strip()) for line in fi)
    z = itertools.izip(ind, vel)   # sort according to ind
    # itertools.izip(vel, ind)     # sort according to vel
    for i, v in sorted(z):
        fs.write(str(v))
I got everything on one line
2.900000e+032.900000e+032.900000e+032.900000e+032.
When I change to
fs.write('\n'.join(str(v)))
Then I got
2
.
9
0
0
0
0
0
e
+
0
32
.
9
0
0
0
0
0
e
+
0
32
.
How do I get proper one-value-per-line output?

Just change
for i, v in sorted(z):
    fs.write(str(v))
to
for i, v in sorted(z):
    print(v, file=fs)
The '\n' is added automatically thanks to the end="\n" default parameter of print.
This works for any data type, so there is no need for str(v).
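Putting it together, here is a minimal sketch of the whole script using print (assuming Python 3, where the built-in zip replaces itertools.izip; under Python 2 you would keep izip and add from __future__ import print_function):
with open("base.txt") as fv, open("new2.txt") as fi, open("sortedf.txt", "w") as fs:
    vel = (line.strip() for line in fv)
    ind = (int(line.strip()) for line in fi)
    for i, v in sorted(zip(ind, vel)):   # sort according to ind
        print(v, file=fs)                # print appends the newline for us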

Please try the following (adding the newline to each value so the lines are separated):
fs.writelines(map(lambda x: x[1] + '\n', sorted(z)))
Why did the statement below fail?
fs.write('\n'.join(str(v)))
Here str(v) is a string, and join treats it as a sequence of characters, inserting the separator between every character. Look at the example below for clarity:
>>> sam = 'hello'
>>> '-'.join(sam)
'h-e-l-l-o'
So how do you use fs.write?
Write the value as is: fs.write(v)
Then add a line break: fs.write('\n')
A few suggestions:
import os  # load important modules first
from itertools import izip  # makes processing faster, if you only need izip

with open("base.txt") as fv, open("new2.txt", 'r') as fi, open('sortedf.txt', 'w') as fs:
    vel = [line.strip() for line in fv]        # use square brackets
    ind = [int(line.strip()) for line in fi]   # use square brackets
    z = izip(ind, vel)  # sort according to ind
    for i, v in sorted(z):
        fs.write(v)
        fs.write('\n')  # adding a line break

Related

Counting "Digram" aka pairs of nucleotides in a for loop

I need to write a function that takes a fasta file and counts the digrams (AT, CG, TT, CC, etc) in the file.
My for loop currently reads the file line by line, and produces the count for that line. Then it restarts the count in the next line. (This is all organized into a dictionary)
I want to maintain the counts across lines, so I get a count for the whole file, not just for individual lines.
This is my code that I'm trying to fix:
dinucleotides = ['AA','AT','AG','AC',
                 'TA','TT','TG','TC',
                 'GA','GT','GG','GC',
                 'CA','CT','CG','CC']
all_counts = {}
with open('short.fasta', 'r') as dna_file:
    dna_file.readline()
    for line in dna_file:
        my_line = line.strip()
        for pairs in dinucleotides:
            count = my_line.count(pairs)
            all_counts[pairs] = count
Thank you!
Add it to the count you already have:
all_counts[pairs] = all_counts.get(pairs, 0) + count
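In context, that line replaces the assignment inside the inner loop; a minimal sketch based on the question's code:
all_counts = {}
with open('short.fasta', 'r') as dna_file:
    dna_file.readline()  # skip the header line
    for line in dna_file:
        my_line = line.strip()
        for pairs in dinucleotides:
            count = my_line.count(pairs)
            # accumulate across lines instead of overwriting
            all_counts[pairs] = all_counts.get(pairs, 0) + count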
You can use collections.defaultdict with int as a default_factory.
And change all_counts[pairs] = count to all_counts[pairs] += count.
from collections import defaultdict

dinucleotides = ['AA','AT','AG','AC',
                 'TA','TT','TG','TC',
                 'GA','GT','GG','GC',
                 'CA','CT','CG','CC']
all_counts = defaultdict(int)
with open('short.fasta', 'r') as dna_file:
    dna_file.readline()
    for line in dna_file:
        my_line = line.strip()
        for pairs in dinucleotides:
            count = my_line.count(pairs)
            all_counts[pairs] += count
Or, use the dict.setdefault method.
...
all_counts = {}
...
all_counts[pairs] = all_counts.setdefault(pairs, 0) + count
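For clarity, setdefault only inserts the default when the key is missing and always returns the stored value, so the pattern above works even on the first occurrence of a pair:
>>> counts = {}
>>> counts.setdefault('AT', 0)
0
>>> counts
{'AT': 0}
>>> counts['AT'] = counts.setdefault('AT', 0) + 3
>>> counts
{'AT': 3}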
One idea is to initialize a Python dict mapping each 2-gram to zero and increment it according to each line. Here I will assume that the FASTA file only contains bases in "ATGC". In addition, iterating over every possible pair for each line requires 16 passes over the line. This can be avoided by passing over each line once with a one-character look-ahead and counting each pair as it appears. Perhaps as follows:
import random

def random_dnukes(lines=1000, cols=40):
    return [''.join(random.choices('ATGC', k=cols)) for _ in range(lines)]

# e.g.
# ['TGACTCGTCAAAGGTACGTTAATCCTTGGGCAGTTACGGG',
#  'ATTGTTCAATCGAACGTTCGCTACTCGACTCGCGCCCCCT',
#  'TCCCGTGGGACAGGTTCCCAATTGACCGGACGCCGGACTC',
#  'TCGTCGTGCCCCGACATTGCTTCACGGCGGTGCGCGCTGG',
#  'GGTCCGGTCTAGGCGATCCCTAATAGTCAAGCACCGATTA',
#  'CCGAGCCTTGTGTATACTCTGTAAACACTTCTTCCCATAC',
#  'CGGATAGCAGCTAGTGGTTCCCGCAGTACAGGATGACCAA',
#  'CTCGGACGAGAAATCAGGCCAACCTCCACTGGCGACAGAA',
#  'TCTGACCTGCAGTGCAGTCCAGTTATAGTGGAACACCAGC',
#  'GTCAGCCCTTATCCGTTAGCCCAGGTGCCTCAATAGGAGG']

fake_file_iterator = iter(random_dnukes(1000, 40))

from collections import defaultdict

total_counts = defaultdict(int)
for line in fake_file_iterator:
    line = line.strip()
    for i in range(len(line) - 1):
        total_counts[line[i:i+2]] += 1

for k, v in total_counts.items():
    print(k, v)
Resulting in
GC 2497
CC 2382
CG 2444
GT 2422
TT 2508
TA 2373
AC 2466
GG 2408
TG 2473
CA 2462
AA 2412
CT 2448
AG 2454
GA 2470
TC 2400
AT 2381
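The same single-pass sliding-window count can be applied directly to the question's file; a minimal sketch, assuming short.fasta has one header line followed by sequence lines, as in the original code:
from collections import defaultdict

total_counts = defaultdict(int)
with open('short.fasta') as dna_file:
    dna_file.readline()  # skip the header line
    for line in dna_file:
        line = line.strip()
        for i in range(len(line) - 1):
            total_counts[line[i:i+2]] += 1  # count each overlapping pair

for pair, count in sorted(total_counts.items()):
    print(pair, count)
Note that, like the answer above, this counts overlapping pairs, whereas str.count only counts non-overlapping occurrences.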

How to add digits in output

I have a file which I'm reading using Python. In this file I'm selecting certain numbers, which are displayed as a list in the output, and I want to add these numbers up. Here is the code I'm using:
with open ("C:/xampp/htdocs/Final/uploads/file.dist", 'r') as rf:
g = [rf.replace(' ', '') for rf in rf]
k=[]
for e in g[1::47]:
r=(e[:12])
s=(r[:2])
i.append(s)
m= Counter(i)
for letter in m:
t= m[letter]
print(t)
This gives me output as follows:
80
80
80
80
I want to add these numbers so that the final output will be 320 (80+80+80+80). I've tried list methods and importing the math library, but none of them gives me the required output. Any help will be highly appreciated.
Use += instead of = to add the values of m[letter] to t:
from collections import Counter

with open("C:/path/file.dist", 'r') as rf:
    g = [rf.replace(' ', '') for rf in rf]

i = []
for e in g[1::47]:
    r = e[:12]
    s = r[:2]
    i.append(s)

m = Counter(i)
t = 0
for letter in m:
    t += m[letter]
print(t)
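As a side note, since the Counter already holds every per-letter count, the same total can be obtained in one step (an equivalent alternative, not part of the original answer):
t = sum(m.values())  # 320 for the output shown above
print(t)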

python: how to count number in one file?

I need to write a Python program to read the values in a file, one per line, such as file: test.txt
1
2
3
4
5
6
7
8
9
10
Denoting these as j1, j2, j3, ... jn,
I need to sum the differences of consecutive values:
a=(j2-j1)+(j3-j2)+...+(jn-j[n-1])
I have example source code
a = 0
for (j = 2; j <= n; j++) {
    a = a + (j - (j-1))
}
print a
and the output is
9
If I understand correctly, you want the following:
a = (j2 - j1) + (j3 - j2) + ... + (jn - j(n-1))
As you iterate over the file, the code below subtracts the value on the previous line from the value on the current line and adds up all those differences.
a = 0
with open("test.txt", "r") as f:
    previous = next(f).strip()
    for line in f:
        line = line.strip()
        if not line:
            continue
        a = a + (int(line) - int(previous))
        previous = line
print(a)
Solution (Python 3)
res = 0
with open("test.txt", "r") as fp:
    lines = list(map(int, fp.readlines()))
for i in range(1, len(lines)):
    res += lines[i] - lines[i-1]
print(res)
Output: 9
test.txt contains:
1
2
3
4
5
6
7
8
9
10
I'm not even sure if I understand the question, but here's my best attempt at solving what I think is your problem:
To read values from a file, use "with open()" in read mode ('r'):
with open('test.txt', 'r') as f:
    # your code here
"as f" means that "f" will now represent your file if you use it anywhere in that block
So, to read all the lines and store them into a list, do this:
all_lines = f.readlines()
You can now do whatever you want with the data.
If you look at the expression you're trying to compute, a = (j2-j1) + (j3-j2) + ... + (jn-j(n-1)), you'll notice that most of the terms cancel out, e.g. (j2-j1) + (j3-j2) = j3-j1. Thus, the entire sum telescopes down to jn-j1, so all you need is the first and last number (see the sketch below).
Edit: That being said, please try and search this forum first before asking any questions. As someone who's been in your shoes before, I decided to help you out, but you should learn to reference other people's questions that are identical to your own.
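To illustrate the telescoping observation, here is a minimal sketch that uses only the first and last values (assuming one integer per line, as in test.txt):
with open('test.txt', 'r') as f:
    numbers = [int(line) for line in f if line.strip()]

# (j2-j1) + (j3-j2) + ... + (jn-j(n-1)) collapses to jn - j1
print(numbers[-1] - numbers[0])  # 9 for the sample file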
The correct answer is 9:
with open("data.txt") as f:
    # set prev to the first number in the file
    prev = int(next(f))
    sm = 0
    # iterate over the remaining numbers
    for j in f:
        j = int(j)
        sm += j - prev
        # update prev
        prev = j
print(sm)
Or using itertools.tee and zip:
from itertools import tee

with open("data.txt") as f:
    a, b = tee(f)
    next(b)
    print(sum(int(j) - int(i) for i, j in zip(a, b)))
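For clarity, here is the same tee/zip pairing on a small in-memory list (an illustrative example, not part of the original answer); next(b) advances the duplicate iterator by one element so that zip yields consecutive pairs:
from itertools import tee

nums = [1, 2, 3, 4]
a, b = tee(nums)  # two independent iterators over the same data
next(b)           # advance the second so pairs are (j[i], j[i+1])
print(sum(j - i for i, j in zip(a, b)))  # 3, i.e. 4 - 1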

Create Matrix from a csv file - Python

I am trying to read some numbers from a .csv file and store them into a matrix using Python. The input file looks like this
Input File
B,1
A,1
A,1
B,1
A,3
A,2
B,1
B,2
B,2
The input is to be manipulated into a matrix like this:
Output File
1 2 3
A 2 1 1
B 3 2 0
Here, the first column of the input file becomes the row label, the second column becomes the column label, and each cell is the count of occurrences. How should I implement this? My input file is huge (1,000,000 rows), so there can be a large number of rows (anywhere between 50 and 10,000) and columns (from 1 to 50).
With pandas, it becomes easy, almost in just 3 lines
import pandas as pd
df = pd.read_csv('example.csv', names=['label', 'value'])
# >>> df
# label value
# 0 B 1
# 1 A 1
# 2 A 1
# 3 B 1
# 4 A 3
# 5 A 2
# 6 B 1
# 7 B 2
# 8 B 2
s = df.groupby(['label', 'value']).size()
# >>> s
# label value
# A 1 2
# 2 1
# 3 1
# B 1 3
# 2 2
# dtype: int64
# ref1: http://stackoverflow.com/questions/15751283/converting-a-pandas-multiindex-dataframe-from-rows-wise-to-column-wise
# ref2: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.unstack.html
m = s.unstack()
# >>> m
# value 1 2 3
# label
# A 2 1 1
# B 3 2 NaN
# Below are optional: just to make it look more like what you want
m.columns.name = None
m.index.name = None
m = m.fillna(0)
print m
# 1 2 3
# A 2 1 1
# B 3 2 0
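As a side note, pandas can also build the same table directly with pd.crosstab, an alternative to the groupby/unstack route above (assuming the same df):
m = pd.crosstab(df['label'], df['value'])
# value  1  2  3
# label
# A      2  1  1
# B      3  2  0
crosstab fills missing combinations with 0, so the fillna step is not needed.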
My solution does not seem very efficient for a huge amount of input data, since I am doing a lot of things manually that could probably be done with pandas DataFrame methods.
However, it does the job:
#!/usr/bin/env python3
# coding: utf-8

import pandas as pd
from collections import Counter

with open('foo.txt') as f:
    l = f.read().splitlines()

numbers_list = []
letters_list = []
for element in l:
    letter = element.split(',')[0]
    number = element.split(',')[1]
    if number not in numbers_list:
        numbers_list.append(number)
    if letter not in letters_list:
        letters_list.append(letter)

c = Counter(l)
d = dict(c)

output = pd.DataFrame(columns=sorted(numbers_list), index=sorted(letters_list))
for col in numbers_list:
    for row in letters_list:
        key = '{},{}'.format(row, col)
        if key in d:
            output[col][row] = d[key]
        else:
            output[col][row] = 0
The output is as desired:
1 2 3
A 2 1 1
B 3 2 0
The following solution uses just standard Python modules:
import csv, collections, itertools

with open('my.csv', 'r') as f_input:
    counts = collections.Counter()
    for cols in csv.reader(f_input):
        counts[(cols[0], cols[1])] += 1

keys = set(key[0] for key in counts.keys())
values = set(int(key[1]) for key in counts.keys())   # distinct values from the second column

d = {}
for k in itertools.product(keys, values):
    d[(k[0], str(k[1]))] = 0
d.update(dict(counts))

with open('output.csv', 'wb') as f_output:
    csv_output = csv.writer(f_output)
    # Write the header, 'X' is whatever you want the first column called
    csv_output.writerow(['X'] + sorted(values))
    # Write the rows
    for k, g in itertools.groupby(sorted(d.items()), key=lambda x: x[0][0]):
        csv_output.writerow([k] + [col[1] for col in g])
This gives you an output CSV file looking like:
X,1,2,3
A,2,1,1
B,3,2,0
Here is another variation using standard modules:
import csv
import re
from collections import defaultdict
from itertools import chain

d = defaultdict(list)
with open('data.csv', 'rb') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        d[row[0]].append(row[1])

k = sorted(d.keys())
v = sorted(map(int, set(chain.from_iterable(d.values()))))

e = []
for i in k:   # iterate in sorted key order so rows line up with k
    e.append([0]*len(v))
    for j in d[i]:
        e[-1][int(j)-1] += 1

print ' ', re.sub(r'[\[\],]', '', str(v))
for i, j in enumerate(k):
    print j, re.sub(r'[\[\],]', '', str(e[i]))
Given data.csv has the contents of the input file shown in the question, this script prints the following as output:
1 2 3
A 2 1 1
B 3 2 0
Thanks to @zyxue for a pure pandas solution. It takes a lot less code up front; the main work is choosing the right pandas methods. However, extra coding is not necessarily in vain when it comes to run-time performance. Using timeit in IPython to measure the run-time difference between my code and @zyxue's pure pandas code, I found that my method ran 36 times faster excluding imports and input IO, and 121 times faster when also excluding output IO (print statements). These tests were done with functions to encapsulate code blocks. Here are the functions that were tested, using Python 2.7.10 and Pandas 0.16.2:
def p():  # 1st pandas function
    s = df.groupby(['label', 'value']).size()
    m = s.unstack()
    m.columns.name = None
    m.index.name = None
    m = m.fillna(0)
    print m

def p1():  # 2nd pandas function - omitting print statement
    s = df.groupby(['label', 'value']).size()
    m = s.unstack()
    m.columns.name = None
    m.index.name = None
    m = m.fillna(0)

def q():  # 1st standard modules function
    k = sorted(d.keys())
    v = sorted(map(int, set(chain.from_iterable(d.values()))))
    e = []
    for i in k:
        e.append([0]*len(v))
        for j in d[i]:
            e[-1][int(j)-1] += 1
    print ' ', re.sub(r'[\[\],]', '', str(v))
    for i, j in enumerate(k):
        print j, re.sub(r'[\[\],]', '', str(e[i]))

def q1():  # 2nd standard modules function - omitting print statements
    k = sorted(d.keys())
    v = sorted(map(int, set(chain.from_iterable(d.values()))))
    e = []
    for i in k:
        e.append([0]*len(v))
        for j in d[i]:
            e[-1][int(j)-1] += 1
Prior to testing, the following code was run to import modules, perform input IO, and initialize variables for all functions:
import pandas as pd
df = pd.read_csv('data.csv', names=['label', 'value'])

import csv
from collections import defaultdict
from itertools import chain
import re

d = defaultdict(list)
with open('data.csv', 'rb') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        d[row[0]].append(row[1])
The contents of the data.csv input file was:
B,1
A,1
A,1
B,1
A,3
A,2
B,1
B,2
B,2
The test command line for each function was of the form:
%timeit fun()
Here are the test results:
p(): 100 loops, best of 3: 4.47 ms per loop
p1(): 1000 loops, best of 3: 1.88 ms per loop
q(): 10000 loops, best of 3: 123 µs per loop
q1(): 100000 loops, best of 3: 15.5 µs per loop
These results are only suggestive and for one small dataset. In particular I would expect pandas to perform comparatively better for larger datasets up to a point.
Here is a way to do it with MapReduce using Hadoop streaming, where the mapper and reducer scripts both read stdin.
The mapper script is mostly an input mechanism; it filters the input to remove improper data. Its advantages are that the input can be split over multiple mapper processes with the total output automatically sorted and forwarded to a reducer, and that combiners can be run locally on mapper nodes. Combiners are essentially intermediate reducers, useful for speeding up reduction through parallelism over a cluster.
# mapper script
import sys
import re

# mapper
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    word = line.split()[0]
    if word and re.match(r'\A[a-zA-Z]+,[0-9]+', word):
        print word
The reducer script gets the sorted output from all mappers, builds an intermediate dict for each input key (such as A or B), which is called 'prefix' in the code, and prints the results in CSV format.
# reducer script
from collections import defaultdict
import sys

def output(s, d):
    """
    Takes a string s and a dictionary d with int keys and values,
    sorts the keys, then builds a string of comma-separated values
    ordered by key, inserting comma-separated zeros equal in number
    to the difference between successive keys minus one.
    """
    v = sorted(d.keys())
    o = str(s) + ','
    lastk = 0
    for k in v:
        o += '0,' * (k - lastk - 1) + str(d[k]) + ','
        lastk = k
    return o

prefix = ''
current_prefix = ''
d = defaultdict(int)
maxkey = 0

for line in sys.stdin:
    line = line.strip()
    prefix, value = line.split(',')
    try:
        value = int(value)
    except ValueError:
        continue
    if current_prefix == prefix:
        d[value] += 1
    else:
        if current_prefix:
            if len(d) > 0:
                print output(current_prefix, d)
                t = max(d.keys())
                if t > maxkey:
                    maxkey = t
        d = defaultdict(int)
        current_prefix = prefix
        d[value] += 1

# output info for last prefix if needed
if current_prefix == prefix:
    print output(prefix, d)
    t = max(d.keys())
    if t > maxkey:
        maxkey = t

# output csv list of keys from 1 through maxkey
h = ' ,'
for i in range(1, maxkey + 1):
    h += str(i) + ','
print h
To walk through the data streaming process, suppose the mapper gets:
B,1
A,1
A,1
B,1
A,3
A,2
B,1
B,2
B,2
It directly outputs the same content, which then all gets sorted (shuffled) and sent to a reducer. In this example, the reducer gets:
A,1
A,1
A,2
A,3
B,1
B,1
B,1
B,2
B,2
Finally the output of the reducer is:
A,2,1,1,
B,3,2,
,1,2,3,
For larger data sets, the input file would be split, with the portions containing all the data for a given set of keys going to separate mappers. Using a combiner on each mapper node would save overall sorting time. There would still be a need for a single reducer so that the output is totally sorted by key. If that's not a requirement, multiple reducers could be used.
For practical reasons I made a couple of choices. First, each line of output only goes up to the highest integer seen for a key, and trailing zeros are not printed, because there is no way to know how many to write until all the input has been processed; for large input that would mean storing a large amount of intermediate data in memory, or slowing down processing by writing it out to disk and reading it back in to complete the job. Second, and for the same reason, the header line cannot be written until just before the end of the reduce job, so that's when it's written. It may be possible to prepend it to the output file (or to the first one, if the output has been split), and that can be investigated in due course. However, given the great speedup from parallel processing on massive input, these are minor issues.
This method will work with relatively minor but crucial modifications on a Spark cluster and can be converted to Java or Scala to improve performance if necessary.
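For reference, here is a rough PySpark sketch of the same aggregation using the DataFrame API (untested; the application name and column names are illustrative assumptions, not part of the original answer):
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csv-matrix").getOrCreate()
df = spark.read.csv("data.csv").toDF("label", "value")

# one row per label, one column per distinct value, cells hold counts
matrix = df.groupBy("label").pivot("value").count().na.fill(0)
matrix.show()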

generating a single outfile after analyzing multiple files in python

I have multiple files, each containing 8 or 9 columns.
For a single file: I have to read the last column, which contains some value, count the number of occurrences of each value, and then generate an outfile.
I have done it like this:
inp = open(filename, 'r').read().strip().split('\n')
out = open(filename, 'w')
from collections import Counter
C = Counter()
for line in inp:
    k = line.split()[-1]  # read the last column
    C[k] += 1
for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
Now the problem is that this works fine if I have to generate one output for one input. But I need to scan a directory using the glob.iglob function for all files to be used as input, then perform the above program on each file to gather results for each file, and of course write all of the analyzed results into a single OUTPUT file.
NOTE: While generating the single OUTPUT file, if any value is found to be repeated, then instead of writing the same entry twice it is preferred to sum up the counts. E.g. analysis of the 1st file generates:
123 6
111 5
0 6
45 5
and 2nd file generate:
121 9
111 7
0 1
22 2
in this case OUTPUT file must be written such a way that it contain:
123 6
111 12 #sum up count no. in case of similar value entry
0 7
45 5
22 2
I have written the program for single-file analysis, but I'm stuck on the mass-analysis part.
Please help.
from collections import Counter
import glob

out = open('OUTPUT.txt', 'w')   # single combined output file (the name is just an example)
g_iter = glob.iglob('path_to_dir/*')
C = Counter()
for filename in g_iter:
    f = open(filename, 'r')
    inp = f.read().strip().split('\n')
    f.close()
    for line in inp:
        k = line.split()[-1]  # read the last column
        C[k] += 1
for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
After de-uglification:
from collections import Counter
import glob

def main():
    # create Counter
    cnt = Counter()
    # collect data
    for fname in glob.iglob('path_to_dir/*.dat'):
        with open(fname) as inf:
            cnt.update(line.split()[-1] for line in inf)
    # dump results
    with open("summary.dat", "w") as outf:
        outf.writelines("{:5s} {:>5d}\n".format(val, num) for val, num in cnt.iteritems())

if __name__=="__main__":
    main()
Initialise an empty dictionary at the top of the program,
say dic = dict(),
and for each Counter update dic so that the values of matching keys are summed and any new keys are added to dic.
To update dic use this:
dic = dict((n, dic.get(n, 0) + C.get(n, 0)) for n in set(dic) | set(C))
where C is the current Counter. After all files are finished, write dic to the output file.
import glob
from collections import Counter

dic = dict()
g_iter = glob.iglob(r'c:\\python32\fol\*')
for x in g_iter:
    lis = []
    with open(x) as f:
        inp = f.readlines()
    for line in inp:
        num = line.split()[-1]
        lis.append(num)
    C = Counter(lis)
    dic = dict((n, dic.get(n, 0) + C.get(n, 0)) for n in set(dic) | set(C))

for x in dic:
    print(x, '\t', dic[x])
I did it like this:
import glob
from collections import Counter

out = open("write.txt", 'a')
C = Counter()
for file in glob.iglob('temp*.txt'):
    for line in open(file, 'r').read().strip().split('\n'):
        k = line.split()[-1]  # read the last column
        C[k] += 1
for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
