Count lines matching different patterns in one pass - python

I have a Python script that, given a pattern, goes over a file and counts how many times each line matching the pattern shows up in the file.
The script is the following:
#!/usr/bin/env python
import time

fnamein = 'Log.txt'

def filter_and_count_matches(fnamein, fnameout, match):
    fin = open(fnamein, 'r')
    curr_matches = {}
    order_in_file = []  # need this because dict has no particular order
    for line in (l for l in fin if l.find(match) >= 0):
        line = line.strip()
        if line in curr_matches:
            curr_matches[line] += 1
        else:
            curr_matches[line] = 1
            order_in_file.append(line)
    #
    fout = open(fnameout, 'w')
    #for line in order_in_file:
    for line, _dummy in sorted(curr_matches.iteritems(),
                               key=lambda (k, v): (v, k), reverse=True):
        fout.write(line + '\n')
        fout.write(' = {}\n'.format(curr_matches[line]))
    fout.close()

def main():
    for idx, match in enumerate(open('staffs.txt', 'r').readlines()):
        curr_time = time.time()
        match = match.strip()
        fnameout = 'm{}.txt'.format(idx + 1)
        filter_and_count_matches(fnamein, fnameout, match)
        print 'Processed {}. Time = {}'.format(match, time.time() - curr_time)

main()
So right now I am going over the file once for each pattern I want to check. Is it possible to do this by going over the file just once? (The file is quite big, so it takes a while to process.) It would be nice to be able to do this in an elegant, "easy" way. Thanks!

Looks like a Counter would do what you need:
from collections import Counter
lines = Counter([line for line in myfile if match_string in line])
For example, if myfile contains
123abc
abc456
789
123abc
abc456
and match_string is "abc", then the above code gives you
>>> lines
Counter({'123abc': 2, 'abc456': 2})
For multiple patterns, how about this:
patterns = ["abc", "123"]
# initialize one Counter for each pattern
results = {pattern:Counter() for pattern in patterns}
for line in myfile:
for pattern in patterns:
if pattern in line:
results[pattern][line] += 1
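To wire that single pass into the question's setup (Log.txt as input, one pattern per line in staffs.txt, one output file per pattern), here is a minimal sketch. Note it sorts with Counter.most_common(), which orders by count alone rather than the (count, line) key used in the original script:
from collections import Counter

with open('staffs.txt') as f:
    patterns = [line.strip() for line in f]
results = {pattern: Counter() for pattern in patterns}

# single pass over the big input file
with open('Log.txt') as fin:
    for line in fin:
        line = line.strip()
        for pattern in patterns:
            if pattern in line:
                results[pattern][line] += 1

# one output file per pattern, mirroring the original m1.txt, m2.txt, ... names
for idx, pattern in enumerate(patterns, 1):
    with open('m{}.txt'.format(idx), 'w') as fout:
        for line, count in results[pattern].most_common():
            fout.write(line + '\n')
            fout.write(' = {}\n'.format(count))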

Related

Print output in Excel file in Python

I compare two txt files, find a match, and print the matching line plus the three lines after it. I have read How to search a text file for a specific word in Python to accomplish that.
However, I want everything that is printed to be exported to an Excel file. I think I am getting the callout words wrong for List.Word and Match.
An example of what I want my code to do:
import os
import xlwt

def createlist():
    items = []
    with open('Trialrun.txt') as input:
        for line in input:
            items.extend(line.strip().split(','))
    return items

print(createlist())
word_list = createlist()
my_xls = xlwt.Workbook(encoding="utf-8")
my_sheet = my_xls.add_sheet("Results")
row_num = 0
my_sheet.write(row_num, 0, "List.Word()")
my_sheet.write(row_num, 1, "Match")
row_num += 1

with open('January 19.txt', 'r') as f:
    for line in f:
        for word in line.strip().split():
            if word in word_list:
                print'\t', List.Word(), '\t,', Match(),
                print(word, end='')
                my_sheet.write(row_num, 0, List.Word())
                my_sheet.write(row_num, 1, Match())
                row_num += 1
                print(next(f))
                print(next(f))
                print(next(f))
            else:
                StopIteration
my_xls.save("results.xls")
I don't completely get what you want to achieve, and I don't understand the second Match and List.Word occurrence, or the print(next(f)) calls at the end.
But maybe something like this helps; at least the script below iterates over the file and outputs results based on a match in the second file.
import os
import xlwt

def createlist():
    items = []
    with open('Trialrun.txt') as input:
        for line in input:
            items.extend(line.strip().split(','))
    return items

word_list = createlist()
my_xls = xlwt.Workbook(encoding="utf-8")
my_sheet = my_xls.add_sheet("Results")
row_num = 0
my_sheet.write(row_num, 0, "List.Word()")
my_sheet.write(row_num, 1, "Match")
row_num += 1
i = 1
with open('January 19.txt', 'r') as f:
    for line in f:
        for word in line.strip().split():
            my_sheet.write(row_num, 0, word)
            for line in word_list:
                if word in line:
                    i += 1
                    my_sheet.write(row_num, i, line)
                else:
                    StopIteration
            row_num += 1
my_xls.save("results.xls")

Find frequency of words line by line in txt file Python (how to format properly)

I'm trying to make a simple program that can find the frequency of occurrences in a text file line by line. I have it outputting everything correctly except for when more than one word is on a line in the text file. (More information below)
The text file looks like this:
Hello
Hi
Hello
Good Day
Hi
Good Day
Good Night
I want the output to be: (Doesn't have to be in the same order)
Hello: 2
Hi: 2
Good Day: 2
Good Night: 2
What it's currently outputting:
Day: 2
Good: 3
Hello: 2
Hi: 2
Night: 1
My code:
file = open("test.txt", "r")
text = file.read() #reads file (I've tried .realine() & .readlines()
word_list = text.split(None)
word_freq = {} # Declares empty dictionary
for word in word_list:
word_freq[word] = word_freq.get(word, 0) + 1
keys = sorted(word_freq.keys())
for word in keys:
final=word.capitalize()
print(final + ': ' + str(word_freq[word])) # Line that prints the output
You want to preserve the lines. Don't split. Don't capitalize. Don't sort.
Use a Counter:
from collections import Counter

c = Counter()
with open('test.txt') as f:
    for line in f:
        c[line.rstrip()] += 1

for k, v in c.items():
    print('{}: {}'.format(k, v))
Instead of splitting the text by None, split it by each line break so that you get a list of the lines.
file = open("test.txt", "r")
text = file.read() #reads file (I've tried .realine() & .readlines()
word_list = text.split('\n')
word_freq = {} # Declares empty dictionary
for word in word_list:
word_freq[word] = word_freq.get(word, 0) + 1
keys = sorted(word_freq.keys())
for word in keys:
final=word.capitalize()
print(final + ': ' + str(word_freq[word])) # Line that prints the output
You can make this very easy for yourself by using a Counter object. If you want to count the occurrences of full lines you can simply do:
from collections import Counter

with open('file.txt') as f:
    c = Counter(f)
print(c)
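One detail worth noting (my addition, not part of the original answer): iterating the file hands Counter the raw lines, so the keys keep their trailing newlines, e.g. 'Hello\n'. Stripping first avoids that:
from collections import Counter

with open('file.txt') as f:
    # strip the newline before counting so the keys are clean line text
    c = Counter(line.rstrip('\n') for line in f)
print(c)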
Edit
Since you asked for a way without modules:
counter_dict = {}
with open('file.txt') as f:
    l = f.readlines()
    for line in l:
        if line not in counter_dict:
            counter_dict[line] = 0
        counter_dict[line] += 1
print(counter_dict)
Thank you all for the answers; most of the code produces the desired output, just in different ways. The code I ended up using with no modules was this:
file = open("test.txt", "r")
text = file.read() #reads file (I've tried .realine() & .readlines()
word_list = text.split('\n')
word_freq = {} # Declares empty dictionary
for word in word_list:
word_freq[word] = word_freq.get(word, 0) + 1
keys = sorted(word_freq.keys())
for word in keys:
final=word.capitalize()
print(final + ': ' + str(word_freq[word])) # Line that prints the output
The code I ended up using with modules was this:
from collections import Counter

c = Counter()
with open('live.txt') as f:
    for line in f:
        c[line.rstrip()] += 1

for k, v in c.items():
    print('{}: {}'.format(k, v))

Indexing lines in a Python file

I want to open a file, and simply return the contents of said file with each line beginning with the line number.
So hypothetically, if the contents of the file are
a
b
c
I would like the result to be
1: a
2: b
3: c
I'm kind of stuck; I tried enumerating but it doesn't give me the desired format.
This is for uni, but only a practice test.
A couple of bits of trial code, to prove I have no idea what I'm doing / where to start:
def print_numbered_lines(filename):
    """returns the infile data with a line number in front of the contents"""
    in_file = open(filename, 'r').readlines()
    list_1 = []
    for line in in_file:
        for item in line:
            item.index(item)
            list_1.append(item)
    return list_1

def print_numbered_lines(filename):
    """returns the infile data with a line number in front of the contents"""
    in_file = open(filename, 'r').readlines()
    result = []
    for i in in_file:
        result.append(enumerate(i))
    return result
A file handle can be treated as an iterable.
with open('tree_game2.txt') as f:
    for i, line in enumerate(f):
        print("{0}: {1}".format(i + 1, line))
There seems to be no need to write a Python script; awk would solve your problem:
awk '{print NR": "$1}' your_file > new_file
What about using an OrderedDict:
from collections import OrderedDict

c = OrderedDict()
n = 1
with open('file.txt', 'r') as f:
    for line in f:
        c.update({n: line})
        # if you just want to print it, skip the dict part and just do:
        print n, line
        n += 1
Then you can print it out with:
for n, line in c.iteritems():  # .items() if Python3
    print n, line
The simple way to do it: first, open the file; second, use a count mechanism. For example:
with open('file.txt') as f:
    data = f.read()
lines = data.split("\n")
count = 0
for line in lines:
    count += 1
    print("line " + str(count) + "> " + line)

How do I count the occurrences of characters of a partition in Python?

I have a large file containing sequences; I want to analyze only the last set of characters, which happen to be of variable length. In each line I would like to take the first character and last character of each set in a text file and count the total instances of those characters.
Here is an example of the data in the file:
-1iqd_BA_0_CDRH3.pdb kabat H3 PDPDAFDV
-1iqw_HL_0_CDRH3.pdb kabat H3 NRDYSNNWYFDV
I want to take the first character after the "H3" and the last character of each line (P and V in the first example line, N and V in the second).
The output for these two lines should be:
first Counter({'N': 1, 'P': 1})
last Counter({'V': 2})
This is what I have done so far:
f = open("C:/CDRH3.txt", "r")
from collections import Counter
grab = 1
for line in f:
line=line.rstrip()
left,sep,right=line.partition(" H3 ")
if sep:
AminoAcidsFirst = right[:grab]
AminoAcidsLast = right[-grab:]
print ("first ",Counter(line[:] for line in AminoAcidsFirst))
print ("last ",Counter(line[:] for line in AminoAcidsLast))
f.close()
This prints the counts for only the last line of data, which looks like:
first Counter({'N': 1})
last Counter({'V': 1})
How do I count all these characters in all lines in the file?
Notes:
Printing AminoAcidsFirst or AminoAcidsLast inside the loop gives the desired vertical list of characters, one per input line, but I can't count them or output them to a file. Writing to a new file will only write the characters of the last line of the original file.
Thanks!
No need for Counter: simply grab the last token after splitting and count the first and last characters:
first_counter = {}
last_counter = {}
for line in f:
    line = line.split()[-1]  # grab the last token
    first_counter[line[0]] = first_counter.get(line[0], 0) + 1
    last_counter[line[-1]] = last_counter.get(line[-1], 0) + 1

print("first ", first_counter)
print("last ", last_counter)
OUTPUT
first {'P': 1, 'N': 1}
last {'V': 2}
Create two empty lists and append to them on each loop iteration, like so:
f = open("C:/CDRH3.txt", "r")
from collections import Counter
grab = 1
AminoAcidsFirst = []
AminoAcidsLast = []
for line in f:
line=line.rstrip()
left,sep,right=line.partition(" H3 ")
if sep:
AminoAcidsFirst.append(right[:grab])
AminoAcidsLast.append(right[-grab:])
print ("first ",Counter(line[:] for line in AminoAcidsFirst))
print ("last ",Counter(line[:] for line in AminoAcidsLast))
f.close()
Here:
Creation of the empty lists:
AminoAcidsFirst = []
AminoAcidsLast = []
Appending on each iteration:
AminoAcidsFirst.append(right[:grab])
AminoAcidsLast.append(right[-grab:])
Two important things I would like to point out:
never reveal the path of a file on your computer; this is especially applicable if you are from the scientific community
your code can be more pythonic using the with...as approach
And now the program:
from collections import Counter

filePath = "C:/CDRH3.txt"
AminoAcidsFirst, AminoAcidsLast = [], []  # important! these should be lists

with open(filePath, 'rt') as f:  # rt not r. Explicit is better than implicit
    for line in f:
        line = line.rstrip()
        left, sep, right = line.partition(" H3 ")
        if sep:
            AminoAcidsFirst.append(right[0])  # really no need of an extra grab=1 variable
            AminoAcidsLast.append(right[-1])  # better than right[-grab:]

print("first ", Counter(AminoAcidsFirst))
print("last ", Counter(AminoAcidsLast))
Don't do line.strip()[-1], because the sep verification is important.
OUTPUT
first {'P': 1, 'N': 1}
last {'V': 2}
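A quick illustration of why that check matters (a hypothetical line, not from the original data): a line without the " H3 " marker still has a last character, so counting without the guard would pollute the tallies:
line = "-1xyz_AB_0_CDRH3.pdb kabat L1 QSLLNS"  # hypothetical non-H3 line
left, sep, right = line.partition(" H3 ")
# sep == "" here, so the `if sep:` guard correctly skips this line;
# line.strip()[-1] would have blindly counted the trailing 'S'.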
Note: data files can get really large, and you might run into memory issues or your computer hanging. So, might I suggest a lazy read? Following is a more robust program:
from collections import Counter

filePath = "C:/CDRH3.txt"
AminoAcidsFirst, AminoAcidsLast = [], []  # important! these should be lists

def chunk_read(fileObj, byteHint=100):
    # keep yielding chunks of lines until the file is exhausted;
    # the hint passed to readlines() is an approximate byte count
    while True:
        lines = fileObj.readlines(byteHint)
        if not lines:
            break
        yield lines

with open(filePath, 'rt') as f:  # rt not r. Explicit is better than implicit
    for aChunk in chunk_read(f):
        for line in aChunk:
            line = line.rstrip()
            left, sep, right = line.partition(" H3 ")
            if sep:
                AminoAcidsFirst.append(right[0])
                AminoAcidsLast.append(right[-1])

print("first ", Counter(AminoAcidsFirst))
print("last ", Counter(AminoAcidsLast))
If you put statements at the bottom of or after your for loop to print AminoAcidsFirst and AminoAcidsLast, you will see that on each iteration you are just assigning a new value. Your intent should be to collect, contain, or accumulate these values before feeding them to collections.Counter.
s = ['-1iqd_BA_0_CDRH3.pdb kabat H3 PDPDAFDV', '-1iqw_HL_0_CDRH3.pdb kabat H3 NRDYSNNWYFDV']
An immediate fix for your code would be to accumulate the characters:
import collections

grab = 1
AminoAcidsFirst = ''
AminoAcidsLast = ''
for line in s:
    line = line.rstrip()
    left, sep, right = line.partition(" H3 ")
    if sep:
        AminoAcidsFirst += right[:grab]
        AminoAcidsLast += right[-grab:]

print("first ", collections.Counter(AminoAcidsFirst))
print("last ", collections.Counter(AminoAcidsLast))
Another approach would be to produce the characters on demand. Define a generator function that will yield the things you want to count:
def f(iterable):
    for thing in iterable:
        left, sep, right = thing.partition(' H3 ')
        if sep:
            yield right[0]
            yield right[-1]
Then feed that to collections.Counter
z = collections.Counter(f(s))
Or using a file as the data source:
with open('myfile.txt') as f1:
# lines is a generator expression
# that produces stripped lines
lines = (line.strip() for line in f1)
z = collections.Counter(f(lines))

Python - Unable to split lines from a txt file into words

My goal is to open a file and split it into unique words and display that list (along with a number count). I think I have to split the file into lines and then split those lines into words and add it all into a list.
The problem is that my program either runs in an infinite loop without displaying any results, or it only reads a single line and then stops. The file being read is The Gettysburg Address.
def uniquify(splitz, uniqueWords, lineNum):
    for word in splitz:
        word = word.lower()
        if word not in uniqueWords:
            uniqueWords.append(word)

def conjunctionFunction():
    uniqueWords = []
    with open(r'C:\Users\Alex\Desktop\Address.txt') as f:
        getty = [line.rstrip('\n') for line in f]
    lineNum = 0
    lines = getty[lineNum]
    getty.append("\n")
    while lineNum < 20:
        splitz = lines.split()
        lineNum += 1
        uniquify(splitz, uniqueWords, lineNum)
    print(uniqueWords)

conjunctionFunction()
Using your current code, the line:
lines = getty[lineNum]
should be moved within the while loop.
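A minimal sketch of that fix, keeping the rest of conjunctionFunction from the question unchanged (the hard-coded 20-line cap comes from the original code):
while lineNum < 20:
    lines = getty[lineNum]  # fetch the current line on every iteration
    splitz = lines.split()
    lineNum += 1
    uniquify(splitz, uniqueWords, lineNum)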
You figured out what's wrong with your code, but nonetheless, I would do this slightly differently. Since you need to keep track of the number of unique words and their counts, you should use a dictionary for this task:
wordHash = {}
with open(r'C:\Users\Alex\Desktop\Address.txt', 'r') as f:
    for line in f:
        line = line.rstrip().lower()
        for word in line.split():  # split into words; iterating the string would yield characters
            if word not in wordHash:
                wordHash[word] = 1
            else:
                wordHash[word] += 1
print wordHash
def splitData(filename):
    return [words for words in open(filename).read().split()]
Easiest way to split a file into words :)
Assume inp is retrieved from a file:
inp = """Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense."""

data = inp.splitlines()
print data

_d = {}
for line in data:
    word_lst = line.split()
    for word in word_lst:
        if word in _d:
            _d[word] += 1
        else:
            _d[word] = 1
print _d.keys()
Output
['Beautiful', 'Flat', 'Simple', 'is', 'dense.', 'Explicit', 'better', 'nested.', 'Complex', 'ugly.', 'Sparse', 'implicit.', 'complex.', 'than', 'complicated.']
I recommend:
#!/usr/local/cpython-3.3/bin/python

import pprint
import collections

def genwords(file_):
    for line in file_:
        for word in line.split():
            yield word

def main():
    with open('gettysburg.txt', 'r') as file_:
        result = collections.Counter(genwords(file_))
        pprint.pprint(result)

main()
...but you could use re.findall to deal with punctuation better, instead of string.split.
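A sketch of that variant (my adaptation of the genwords generator above; not from the original answer):
import re

def genwords(file_):
    # \w+ matches runs of word characters, so trailing punctuation
    # such as "nation," contributes just "nation"
    for line in file_:
        for word in re.findall(r"\w+", line):
            yield word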
