generating a single outfile after analyzing multiple files in python - python

I have multiple files, each containing 8 or 9 columns.
For a single file, I have to read the last column, count the number of occurrences of each value in it, and then generate an output file.
I have done it like:
inp = open(filename,'r').read().strip().split('\n')
out = open(filename,'w')
from collections import Counter
C = Counter()
for line in inp:
k = line.split()[-1] #as to read last column
C[k] += 1
for value,count in C.items():
x = "%s %d" % (value,count)
out.write(x)
out.write('\n')
out.close()
Now the problem is that this works fine when I have to generate one output for one input. But I need to scan a directory using the glob.iglob function to find all the files to be used as input, run the above program on each file to gather its result, and then write the analyzed results of all files into a single OUTPUT file.
NOTE: During generating single OUTPUT file if any value is found to be getting repeated then instead of writing same entry twice it is preferred to sum up the 'count' only. e.g. analysis of 1st file generate:
123 6
111 5
0 6
45 5
and 2nd file generate:
121 9
111 7
0 1
22 2
in this case OUTPUT file must be written such a way that it contain:
123 6
111 12 #sum up count no. in case of similar value entry
0 7
45 5
22 2
I have written the program for single-file analysis, but I'm stuck on the multi-file analysis.
Please help.

from collections import Counter
import glob
# Aggregate last-column value counts over every file in a directory and
# write the combined totals to a single output file.
# NOTE: keep the output file outside 'path_to_dir'; otherwise a later run
# would pick it up as one of the input files.
output_path = 'OUTPUT.txt'
C = Counter()
for filename in glob.iglob('path_to_dir/*'):
    with open(filename, 'r') as f:
        for line in f:
            fields = line.split()
            if fields:  # blank lines have no last column
                C[fields[-1]] += 1
# One shared Counter means a value seen in several files is summed, not
# written twice.
with open(output_path, 'w') as out:
    for value, count in C.items():
        out.write("%s %d\n" % (value, count))

After de-uglification:
from collections import Counter
import glob
def main():
    """Count last-column values across all ``.dat`` files and write a summary.

    Every file matching ``path_to_dir/*.dat`` is read line by line.  The last
    whitespace-separated field of each non-blank line is tallied in a single
    ``Counter``, so a value that occurs in several files has its counts
    summed.  The totals are written to ``summary.dat``, one ``value count``
    pair per line.
    """
    cnt = Counter()
    for fname in glob.iglob('path_to_dir/*.dat'):
        with open(fname) as inf:
            rows = (line.split() for line in inf)
            # A blank line splits to an empty list and has no last column.
            cnt.update(fields[-1] for fields in rows if fields)
    with open("summary.dat", "w") as outf:
        # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
        outf.writelines(
            "{:5s} {:>5d}\n".format(val, num) for val, num in cnt.items()
        )
if __name__=="__main__":
main()

Initialise an empty dictionary at the top of the program,
let's say dic=dict(),
and after building the Counter for each file, update dic so that the values of keys already present are summed and new keys are added to dic.
to update dic use this:
dic=dict( (n, dic.get(n, 0)+C.get(n, 0)) for n in set(dic)|set(C) )
where C is the current Counter, and after all files are finished write the dic to the output file.
import glob
from collections import Counter
# Running totals across all files: last-column value -> number of occurrences.
dic = dict()
for x in glob.iglob(r'c:\\python32\fol\*'):
    with open(x) as f:
        # Last column of every non-blank line in this file (a blank line has
        # no columns, so indexing it would raise IndexError).
        lis = [fields[-1] for fields in (line.split() for line in f) if fields]
    C = Counter(lis)
    # Merge this file's counts into the totals: keys present in both are
    # summed, keys present in only one side are carried over unchanged.
    dic = dict((n, dic.get(n, 0) + C.get(n, 0)) for n in set(dic) | set(C))
for x in dic:
    print(x, '\t', dic[x])

I did like this.
import glob
from collections import Counter

# Tally the last column of every line in every temp*.txt file, then append
# the combined totals to write.txt.
C = Counter()
for path in glob.iglob('temp*.txt'):
    # 'with' closes each input file; the original left them all open.
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if fields:  # blank lines have no last column
                C[fields[-1]] += 1
# Append mode is kept from the original: earlier contents of write.txt survive.
with open("write.txt", 'a') as out:
    for value, count in C.items():
        out.write("%s %d\n" % (value, count))

Related

Python; print filename and header

I have files (fasta files with a sequence) that look like this:
File1.fasta
>1
GTCTTCCGGCGAGCGGGCTTTTCACCCGCTTTATCGTTACTTATGTCAGCATTCGCACTT
CTGATACCTCCAGCAACCCTCACAGGCCACCTTCGCAGGCTTACAGAACGCTCCCCTACC
CAACAACGCATAAACGTCGCTGCCGCAGCTTCGGTGCATGGTTTAGCCCCGTTACATCTT
CCGCGCAGGCCGACTCGACCAGTGAGCTATTACGCTTTCTTTAAATGATGGCTGCTTCTA
AGCCAACATCCTGGCTGTCTGG
>2
AAAGAAAGCGTAATAGCTCACTGGTCGAGTCGGCCTGCGCGGAAGATGTAACGGGGCTAA
ACCATGCACCGAAGCTGCGGCAGCGACACTCAGGTGTTGTTGGGTAGGGGAGCGTTCTGT
AAGCCTGTGAAGGTGGCCTGTGAGGGTTGCTGGAGGTATCAGAAGTGCGAATGCTGACAT
AAGTAACGATAAAGCGGGTGAAAAGCCCGCTCGCCGGAAGACCAAGGGTTCCTGTCCAAC
GTTAATCGGGGCAGG
File2.fasta
>1
CAACAACGCATAAACGTCGCTGCCGCAGCTTCGGTGCATGGTTTAGCCCCGTTACATCTT
>2
CCGCGCAGGCCGACTCGACCAGTGAGCTATTACGCTTTCTTTAAATGATGGCTGCTTCTA
With my script, I count all the 5-mers in these files. My code is as follows:
import operator
import glob
def printSeq(name, seq):
kmers = {}
k = 5
for i in range(len(seq) - k + 1):
kmer = seq[i:i+k]
if kmer in kmers:
kmers[kmer] += 1
else:
kmers[kmer] = 1
for kmer, count in kmers.items():
print (kmer + "\t" + str(count))
sortedKmer = sorted(kmers.items(), reverse=True)
for item in sortedKmer:
print (item[0] + "\t" + str(item[1]))
for name in glob.glob('*.fasta'):
with open(name, 'r') as f:
seq = ""
key = ""
for line in f.readlines():
if line.startswith(">"):
if key and seq:
printSeq(key, seq)
key = line[1:].strip()
seq = ""
else:
seq += line.strip()
printSeq(key, seq)
The output is now the 5-mer followed with the count.
I want to adjust my output so that each output line contains the filename, followed by the header, then the 5-mer and its count, like this:
File1 1 GTCTT 1
File1 1 TCTTC 1
File1 1 CTTCC 1
....
File2 2 TTCTA 1
How can I achieve that?
Additional question
I want to add the reverse complement sequence of the data and count that together with the previous data. My code to get the reverse complement is as follows
from Bio import SeqIO
for fasta_file in glob.glob('*.fasta'):
for record in SeqIO.parse(fasta_file, "fasta"):
reverse_complement = ">" + record.id + "\n" + record.seq.reverse_complement()
So the "reverse_complement" of file one, header >1 has to be counted together with the previous one etc. How can I include this data to my previous files and count together?
My reverse_complement data is
File1.fasta (reverse_complement)
>1
CCAGACAGCCAGGATGTTGGCTTAGAAGCAGCCATCATTTAAAGAAAGCGTAATAGCTCACTGGTCGAGTCGGCCTGCGCGGAAGATGTAACGGGGCTAAACCATGCACCGAAGCTGCGGCAGCGACGTTTATGCGTTGTTGGGTAGGGGAGCGTTCTGTAAGCCTGCGAAGGTGGCCTGTGAGGGTTGCTGGAGGTATCAGAAGTGCGAATGCTGACATAAGTAACGATAAAGCGGGTGAAAAGCCCGCTCGCCGGAAGAC
>2
CCTGCCCCGATTAACGTTGGACAGGAACCCTTGGTCTTCCGGCGAGCGGGCTTTTCACCCGCTTTATCGTTACTTATGTCAGCATTCGCACTTCTGATACCTCCAGCAACCCTCACAGGCCACCTTCACAGGCTTACAGAACGCTCCCCTACCCAACAACACCTGAGTGTCGCTGCCGCAGCTTCGGTGCATGGTTTAGCCCCGTTACATCTTCCGCGCAGGCCGACTCGACCAGTGAGCTATTACGCTTTCTTT
This could also be done using a Counter() as follows:
from collections import Counter
from itertools import groupby
import glob
import os  # required for os.path below; missing from the original snippet

for fasta_file in glob.glob('*.fasta'):
    # File name without directory or extension, e.g. 'File1' for 'File1.fasta'.
    basename = os.path.splitext(os.path.basename(fasta_file))[0]
    # Header of the current record; stays empty if sequence lines appear
    # before any '>' header instead of raising NameError.
    sequence = ''
    with open(fasta_file) as f_fasta:
        # groupby yields alternating runs: header lines (k is True) and the
        # sequence lines that follow them (k is False).
        for k, g in groupby(f_fasta, lambda x: x.startswith('>')):
            if k:
                sequence = next(g).strip('>\n')
            else:
                # Join the wrapped sequence lines into one list of characters.
                d = list(''.join(line.strip() for line in g))
                counts = Counter()
                # Slide a 5-character window: count the first five
                # characters, drop the first one, repeat.
                while len(d) >= 5:
                    five_mer = ''.join(d[:5])
                    counts[five_mer] += 1
                    del d[0]
                # Largest counts first, ties broken alphabetically.
                for five_mer, count in sorted(counts.items(), key=lambda x: (-x[1], x[0])):
                    print("{} {} {} {}".format(basename, sequence, five_mer, count))
This would give you output with the largest counts first and then alphabetically:
File1 1 CAGGC 3
File1 1 CGCAG 3
File1 1 GCTTT 3
File1 1 AACGC 2
File1 1 ACATC 2
File1 1 ACGCT 2
File1 1 AGGCC 2
It uses Python's groupby() function to read groups of lines together. Each group is either a header line (one starting with '>') or the run of sequence lines that follows it. k is the result of the startswith() call. So when k is False, take all the lines returned, remove the newline from each and then join them together to make a single line of characters.
It then reads the first 5 characters from the list, joins them back together and adds them as a key to a Counter(). It then removes the first character from the list and repeats until there are fewer than 5 characters remaining.
For just alphabetical ordering:
for five_mer, count in sorted(counts.items()):
A Counter() works the same way as a dictionary, so .items() would give a list of key value pairs. These are sorted before being displayed.
You change the signature of
def printSeq(name, seq)
to
def printSeq(file, header, name, seq):
incorporate the new variables in the print statements.
e.g.
print (item[0] + "\t" + str(item[1]))
v
print (file + "\t" + header + "\t" + item[0] + "\t" + str(item[1]))
Then, in your loop you pass the information to this function.
You have the file name available in the loop, stored in the variable name
You parse the header in the lines where you detect it, and store it in a variable for later use. The later use is when you call the printSeq-function

Create Matrix from a csv file - Python

I am trying to read some numbers from a .csv file and store them into a matrix using Python. The input file looks like this
Input File
B,1
A,1
A,1
B,1
A,3
A,2
B,1
B,2
B,2
The input is to be manipulated to a matrix like -
Output File
1 2 3
A 2 1 1
B 3 2 0
Here, the first column of the input file becomes the row, second column becomes the column and the value is the count of the occurrence. How should I implement this? The size of my input file is huge (1000000 rows) and hence there can be large number of rows (anywhere between 50 to 10,000) and columns (from 1 to 50)
With pandas, it becomes easy, almost in just 3 lines
import pandas as pd
df = pd.read_csv('example.csv', names=['label', 'value'])
# >>> df
# label value
# 0 B 1
# 1 A 1
# 2 A 1
# 3 B 1
# 4 A 3
# 5 A 2
# 6 B 1
# 7 B 2
# 8 B 2
s = df.groupby(['label', 'value']).size()
# >>> s
# label value
# A 1 2
# 2 1
# 3 1
# B 1 3
# 2 2
# dtype: int64
# ref1: http://stackoverflow.com/questions/15751283/converting-a-pandas-multiindex-dataframe-from-rows-wise-to-column-wise
# ref2: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.unstack.html
m = s.unstack()
# >>> m
# value 1 2 3
# label
# A 2 1 1
# B 3 2 NaN
# Below are optional: just to make it look more like what you want
m.columns.name = None
m.index.name = None
m = m.fillna(0)
print m
# 1 2 3
# A 2 1 1
# B 3 2 0
My solution is probably not very efficient for a huge amount of input data, since I am doing a lot of work manually that could likely be done by pandas DataFrame methods.
However, this does the job:
#!/usr/bin/env python3
# coding: utf-8
import pandas as pd
from collections import Counter
with open('foo.txt') as f:
l = f.read().splitlines()
numbers_list = []
letters_list = []
for element in l:
letter = element.split(',')[0]
number = element.split(',')[1]
if number not in numbers_list:
numbers_list.append(number)
if letter not in letters_list:
letters_list.append(letter)
c = Counter(l)
d = dict(c)
output = pd.DataFrame(columns=sorted(numbers_list), index=sorted(letters_list))
for col in numbers_list:
for row in letters_list:
key = '{},{}'.format(row, col)
if key in d:
output[col][row] = d[key]
else:
output[col][row] = 0
The output is as desired:
1 2 3
A 2 1 1
B 3 2 0
The following solution uses just standard Python modules:
import csv, collections, itertools
# Tally how often each (row label, column label) pair occurs in the input.
counts = collections.Counter()
with open('my.csv', 'r') as f_input:
    for cols in csv.reader(f_input):
        if len(cols) < 2:
            continue  # blank or malformed line
        counts[(cols[0], cols[1])] += 1

# Row labels come from the first CSV column, column labels from the second.
# The column labels must be taken from the keys: counts.values() are the
# tallies themselves and only coincide with the labels on the sample data.
keys = sorted(set(key[0] for key in counts))
# Sorted numerically so that 10 comes after 9; this assumes the second
# column holds integers, as in the question's data.
values = sorted(set(key[1] for key in counts), key=int)

# 'wb' is the Python 2 way to open a csv output file; on Python 3 use
# open('output.csv', 'w', newline='') instead.
with open('output.csv', 'wb') as f_output:
    csv_output = csv.writer(f_output)
    # Write the header, 'X' is whatever you want the first column called
    csv_output.writerow(['X'] + values)
    # Write the rows; a Counter returns 0 for pairs that never occurred.
    for row in keys:
        csv_output.writerow([row] + [counts[(row, col)] for col in values])
This gives you an output CSV file looking like:
X,1,2,3
A,2,1,1
B,3,2,0
Here is another variation using standard modules:
import csv
import re
from collections import defaultdict
from itertools import chain
# Group the second-column values by their first-column label.
d = defaultdict(list)
# 'rb' is the Python 2 convention for csv input; on Python 3 open the file
# in text mode with newline='' instead.
with open('data.csv', 'rb') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        d[row[0]].append(row[1])
k = sorted(d.keys())
v = sorted(map(int, set(chain.from_iterable(d.values()))))
# Position of each column value in the header, so the values need not be
# exactly 1..n (e.g. 1, 2, 5 works too).
col = dict((value, pos) for pos, value in enumerate(v))
e = []
# Build the rows in the same sorted order they are printed in below;
# iterating the dict directly could pair counts with the wrong label.
for i in k:
    e.append([0]*len(v))
    for j in d[i]:
        e[-1][col[int(j)]] += 1
print('  ' + ' '.join(map(str, v)))
for i, j in enumerate(k):
    print(j + ' ' + ' '.join(map(str, e[i])))
Given data.csv has the contents of the input file shown in the question, this script prints the following as output:
1 2 3
A 2 1 1
B 3 2 0
Thanks to @zyxue for a pure pandas solution. It takes a lot less code, the main difficulty being knowing which pandas methods to choose. However, extra coding is not necessarily in vain when it comes to run-time performance. Using timeit in IPython to measure the run-time difference between my code and @zyxue's pure pandas code, I found that my method ran 36 times faster excluding imports and input IO, and 121 times faster when also excluding output IO (print statements). These tests were done with functions that encapsulate the code blocks. Here are the functions that were tested using Python 2.7.10 and Pandas 0.16.2:
def p(): # 1st pandas function
s = df.groupby(['label', 'value']).size()
m = s.unstack()
m.columns.name = None
m.index.name = None
m = m.fillna(0)
print m
def p1(): # 2nd pandas function - omitting print statement
s = df.groupby(['label', 'value']).size()
m = s.unstack()
m.columns.name = None
m.index.name = None
m = m.fillna(0)
def q(): # first std mods function
k = sorted(d.keys())
v = sorted(map(int,set(chain.from_iterable(d.values()))))
e = []
for i in d:
e.append([0]*len(v))
for j in d[i]:
e[-1][int(j)-1] += 1
print ' ', re.sub(r'[\[\],]','',str(v))
for i, j in enumerate(k):
print j, re.sub(r'[\[\],]','',str(e[i]))
def q1(): # 2nd std mods function - omitting print statements
k = sorted(d.keys())
v = sorted(map(int,set(chain.from_iterable(d.values()))))
e = []
for i in d:
e.append([0]*len(v))
for j in d[i]:
e[-1][int(j)-1] += 1
Prior to testing the following code was run to import modules, input IO and initialize variables for all functions:
import pandas as pd
df = pd.read_csv('data.csv', names=['label', 'value'])
import csv
from collections import defaultdict
from itertools import chain
import re
d = defaultdict(list)
with open('data.csv', 'rb') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
d[row[0]].append(row[1])
The contents of the data.csv input file was:
B,1
A,1
A,1
B,1
A,3
A,2
B,1
B,2
B,2
The test command line for each function was of the form:
%timeit fun()
Here are the test results:
p(): 100 loops, best of 3: 4.47 ms per loop
p1(): 1000 loops, best of 3: 1.88 ms per loop
q(): 10000 loops, best of 3: 123 µs per loop
q1(): 100000 loops, best of 3: 15.5 µs per loop
These results are only suggestive and for one small dataset. In particular I would expect pandas to perform comparatively better for larger datasets up to a point.
Here is a way to do it with MapReduce using Hadoop streaming where the mapper and reducer scripts both read stdin.
The mapper script is mostly an input mechanism and filters input to remove improper data with advantages that the input can be split over multiple mapper processes with the total output automatically sorted and forwarded to a reducer plus the possibility of running combiners locally on mapper nodes. Combiners are essentially intermediate reducers useful for speeding up reduction through parallelism over a cluster.
# mapper script
import sys
import re
# mapper
# Pass through every input line whose first whitespace-separated token looks
# like 'LETTERS,DIGITS' (e.g. 'A,1'); everything else is filtered out.
for line in sys.stdin:
    fields = line.split()
    if not fields:
        continue  # blank line: nothing to emit
    word = fields[0]
    if re.match(r'\A[a-zA-Z]+,[0-9]+', word):
        # Emit the record unchanged; the reducer splits it on ','.
        print(word)
The reducer script gets sorted output over all mappers, builds an intermediate dict for each input key such as A or B, which is called 'prefix' in the code and outputs results to a file in csv format.
# reducer script
from collections import defaultdict
import sys
def output(s, d):
    """
    Build one csv row for label s from the counts in d.

    d maps int keys to values.  The keys are visited in sorted order and each
    value is placed in the column given by its key; every gap between
    successive keys (counting from 0) is filled with one '0' per missing key.
    The returned string starts with str(s) and ends with a trailing comma.
    """
    fields = [str(s)]
    previous_key = 0
    for key in sorted(d):
        # One '0' for every key skipped since the previous one.
        fields.extend(['0'] * (key - previous_key - 1))
        fields.append(str(d[key]))
        previous_key = key
    return ','.join(fields) + ','
# Reducer main loop.  Input lines arrive sorted, so all records for one
# prefix (e.g. 'A') are adjacent; counts are accumulated per prefix and one
# csv row is emitted each time the prefix changes.
current_prefix = ''
d = defaultdict(int)   # value -> occurrences for the current prefix
maxkey = 0             # largest value seen over all prefixes (for the header)
for line in sys.stdin:
    line = line.strip()
    try:
        # Both the unpacking (wrong number of commas) and the int conversion
        # raise ValueError on malformed input; such lines are skipped before
        # they can disturb the prefix tracking.
        prefix, value = line.split(',')
        value = int(value)
    except ValueError:
        continue
    if prefix != current_prefix:
        # Prefix changed: flush the finished group, then start a new one.
        if d:
            print(output(current_prefix, d))
            maxkey = max(maxkey, max(d))
        d = defaultdict(int)
        current_prefix = prefix
    d[value] += 1
# output info for last prefix if needed (nothing to do on empty input)
if d:
    print(output(current_prefix, d))
    maxkey = max(maxkey, max(d))
# output csv list of keys from 1 through maxkey
h = ' ,'
for i in range(1, maxkey+1):
    h += str(i) + ','
print(h)
To run through data streaming process, given that the mapper gets:
B,1
A,1
A,1
B,1
A,3
A,2
B,1
B,2
B,2
It directly outputs the same content which then all gets sorted (shuffled) and sent to a reducer. In this example, what the reducer gets is:
A,1
A,1
A,2
A,3
B,1
B,1
B,1
B,2
B,2
Finally the output of the reducer is:
A,2,1,1,
B,3,2,
,1,2,3,
For larger data sets, the input file would be split with portions containing all data for some sets of keys going to separate mappers. Using a combiner on each mapper node would save overall sorting time. There would still be a need for a single reducer so that the output is totally sorted by key. If that's not a requirement, multiple reducers could be used.
For practical reasons I made a couple of choices. First, each line of output only goes up to the highest integer for a key, and trailing zeros are not printed, because there is no way to know how many to write until all the input has been processed; for large input that would mean storing a large amount of intermediate data in memory, or slowing down processing by writing it out to disk and reading it back in to complete the job. Second, and for the same reason, the header line cannot be written until just before the end of the reduce job, so that's when it's written. It may be possible to prepend it to the output file, or to the first one if the output has been split, and that can be investigated in due course. However, given the large speedup from parallel processing on massive input, these are minor issues.
This method will work with relatively minor but crucial modifications on a Spark cluster and can be converted to Java or Scala to improve performance if necessary.

Code doesn't print the last sequence in a file

I have a file that looks like this:
<s0> 3
line1
line2
line3
<s1> 5
line1
line2
<s2> 4
etc. up to more than a thousand
Each sequence has a header like <s0> 3, which in this case states that three lines follow. In the example above, the number of lines below <s1> is two, so I have to correct the header to <s1> 2.
The code I have below picks out the sequence headers and the correct number of lines below them. But for some reason, it never gets the details of the last sequence. I know something is wrong but I don't know what. Can someone point me to what I am doing wrong?
import re
def call():
with open('trial_perl.txt') as fp:
docHeader = open("C:\path\header.txt","w")
c = 0
c1 = 0
header = []
k = -1
for line in fp:
if line.startswith("<s"):
#header = line.split(" ")
#print header[1]
c = 0
else:
c1 = c + 1
c += 1
if c == 0 and c1>0:
k +=1
printing = c1
if printing >= 0:
s = "<s%s>" % (k)
#print "%s %d" % (s, printing)
docHeader.write(s+" "+str(printing)+"\n")
call()
You have no sentinel at the end of the last sequence in your data, so your code needs to deal with the last sequence AFTER the loop is done.
If I may suggest some python tricks to get to your results; you don't need those c/c1/k counter variables, as they make the code more difficult to read and maintain. Instead, populate a map of sequence header to sequence items and then use the map to do all your work:
(this code works only if all sequence headers are unique - if you have duplicates, it won't work)
with open('trial_perl.txt') as fp:
docHeader = open("C:\path\header.txt","w")
data = {}
for line in fp:
if line.startswith("<s"):
current_sequence = line
# create a list with the header as the key
data[current_sequence] = []
else:
# add each sequence to the list we defined above
data[current_sequence].append(line)
Your map is ready! It looks like this:
{"<s0> 3": ["line1", "line2", "line3"],
"<s1> 5": ["line1", "line2"]}
You can iterate it like this:
for header, lines in data.items():
# header is the key, or "<s0> 3"
# lines is the list of lines under that header ["line1", "line2", etc]
num_of_lines = len(lines)
The main problem is that you neglect to check the value of c after you have read the last line. You probably had difficulty spotting this problem because of all the superfluous code. You don't have to increment k, since you can extract the value from the <s...> tag. And you don't have to have all three variables c, c1, and printing. A single count variable will do.
import re, sys
def call():
    """Print a corrected ``<sN> count`` header for every sequence.

    Reads ``trial_perl.txt``.  A line starting with ``<s`` opens a new
    sequence whose number is taken from between ``<s`` and ``>``; every other
    line counts towards the sequence currently open.  One ``<sN> count`` line
    per sequence is written to standard output, including the last sequence,
    which has no following header to trigger its output.
    """
    # Swap in an open file object here to write the headers to disk instead.
    docHeader = sys.stdout
    seq_id = None  # number of the sequence being counted; None before the first header
    count = 0
    with open('trial_perl.txt') as fp:
        for line in fp:
            if not line.startswith("<s"):
                count += 1
                continue
            # A new header closes the previous sequence, if there was one.
            if seq_id is not None:
                docHeader.write('<s%d> %d\n' % (seq_id, count))
            seq_id = int(line[2:line.find('>')])
            count = 0
    # The last sequence is only complete once the input is exhausted.
    if seq_id is not None:
        docHeader.write('<s%d> %d\n' % (seq_id, count))
call()
Another approach uses groupby from itertools: you take the maximum line count in each group, where a group corresponds to one header plus the lines that follow it in your file:
from itertools import groupby
def call():
    """Write a corrected ``<sN> count`` header per sequence to ``result``.

    Each line of ``stack.txt`` is tagged with a pair (sequence index, number
    of lines seen so far in that sequence).  Header lines, which start with
    ``<s``, advance the index and reset the running count.  Grouping the
    pairs by sequence index and taking the maximum pair of each group gives
    the final line count of that sequence.
    """
    pairs = []
    seq_index = -1  # becomes 0 at the first header line
    seen = 0
    with open('stack.txt') as fp:
        for line in fp:
            if line.startswith("<s"):
                seq_index += 1
                seen = 0
            else:
                seen += 1
            pairs.append((seq_index, seen))
    with open('result', 'w') as f:
        for _, group in groupby(pairs, key=lambda pair: pair[0]):
            # max() of (index, count) tuples picks the largest count per index.
            f.write("<s%d> %d\n" % max(group))
call()
#<s0> 3
#<s1> 2
stack.txt is the file containing your data:
<s0> 3
line1
line2
line3
<s1> 5
line1
line2

Dictionaries overwriting in Python

This program is to take the grammar rules found in Binary.text and store them into a dictionary, where the rules are:
N = N D
N = D
D = 0
D = 1
but the current code returns D: D = 1, N:N = D, whereas I want N: N D, N: D, D:0, D:1
import sys
import string
#default length of 3
stringLength = 3
#get last argument of command line(file)
filename1 = sys.argv[-1]
#get a length from user
try:
stringLength = int(input('Length? '))
filename = input('Filename: ')
except ValueError:
print("Not a number")
#checks
print(stringLength)
print(filename)
def str2dict(filename="Binary.txt"):
result = {}
with open(filename, "r") as grammar:
#read file
lines = grammar.readlines()
count = 0
#loop through
for line in lines:
print(line)
result[line[0]] = line
print (result)
return result
print (str2dict("Binary.txt"))
Firstly, your data structure of choice is wrong. Dictionary in python is a simple key-to-value mapping. What you'd like is a map from a key to multiple values. For that you'll need:
from collections import defaultdict
result = defaultdict(list)
Next, where are you splitting on '='? You need to do that in order to get the key and value you are looking for. You'll need
key, value = line.split('=', 1) #Returns an array, and gets unpacked into 2 variables
Putting the above two together, you'd go about in the following way:
result = defaultdict(list)
with open(filename, "r") as grammar:
#read file
lines = grammar.readlines()
count = 0
#loop through
for line in lines:
print(line)
key, value = line.split('=', 1)
result[key.strip()].append(value.strip())
return result
Dictionaries, by definition, cannot have duplicate keys. Therefore there can only ever be a single 'D' key. You could, however, store a list of values at that key if you'd like. For example:
from collections import defaultdict
# rest of your code...
result = defaultdict(list) # Use defaultdict so that an insert to an empty key creates a new list automatically
with open(filename, "r") as grammar:
#read file
lines = grammar.readlines()
count = 0
#loop through
for line in lines:
print(line)
result[line[0]].append(line)
print (result)
return result
This will result in something like:
{"N" : ["N = N D", "N = D"], "D" : ["D = 0", "D = 1"]}

Parsing specific fields and counting the occurrence with python

I have a file separated by delimiter '|' like this:
age=None|sex=M|DEPT=ID1|YEAR=1995|
age=10|sex=M|DEPT=None|YEAR=1992|
age=None|sex=None|DEPT=ID1|YEAR=1991|
age=20|sex=F|DEPT=ID2|YEAR=1990|
age=20|sex=M|DEPT=ID3|YEAR=1991|
In python, how do I get the output of how many times each field is repeated.
Do we have any built-in functions? I looked into collections.update() but my environment
uses python-2.6. Unfortunately I can't use that option(and won't be able to copy
new module files into that environment manually too).
Thanks for any help or pointers.
example output:
1 times Sex=F
3 times Sex=M
1 times age=10
2 times age=None
2 times age=20
2 times YEAR=1991
...
2 times DEPT=ID1
etc
from collections import defaultdict
import csv
# Count how many times each 'name=value' field occurs in the file.
with open('path/to/file') as infile:
    answer = defaultdict(int)
    for row in csv.reader(infile, delimiter="|"):
        for field in row:
            # Every line ends with '|', which produces an empty last field;
            # skip it so it is not reported as a field of its own.
            if field:
                answer[field] += 1
# Least frequent fields first.
for k in sorted(answer, key=lambda k: answer[k]):
    print("%d times %s" % (answer[k], k))
Or:
from collections import Counter
import csv
import itertools
# Count how many times each 'name=value' field occurs in the file.
with open('path/to/file') as infile:
    fields = itertools.chain.from_iterable(csv.reader(infile, delimiter="|"))
    # Every line ends with '|', which produces an empty last field; drop it
    # so it is not reported as a field of its own.
    answer = Counter(field for field in fields if field)
# Least frequent fields first.
for k in sorted(answer, key=lambda k: answer[k]):
    print("%d times %s" % (answer[k], k))
Use get in dictionary may help:
# Count how many times each 'name=value' field occurs, using only dict.get
# so that it also runs on Python versions without collections.Counter.
counts = {}  # named 'counts' so the builtin 'dict' is not shadowed
with open('file.txt') as f:
    for line in f:
        for item in line.strip().split('|'):
            # The trailing '|' on each line yields an empty item; skip it.
            if item:
                counts[item] = counts.get(item, 0) + 1
for k in counts:
    print("%d times %s" % (counts[k], k))

Categories