Speed up the write-to-different-files process - python

I am reading from a huge file (232MB) line by line.
First, i recognize each line according to a Regular Expression.
Then for each line, I am writing to different city.txt files under the 'report' directory according to a cityname in each line. However, this process takes a while. I am wondering if there is anyway of speeding up the process?
Example of input file: (each column split by a \t)
2015-02-03 19:20 Sane Diebgo Music 692.08 Cash
Actually i have tested the code with writing to different files and not writing to different file(simply process the large file and come up with 2 dicts) the time difference is huge. 80% of the time is spent writing to different files
def processFile(file):
pattern = re.compile(r"(\d{4}-\d{2}-\d{2})\t(\d{2}:\d{2})\t(.+)\t(.+)\t(\d+\.\d+|\d+)\t(\w+)\n")
f = open(file)
total_sale = 0
city_dict = dict()
categories_dict = dict()
os.makedirs("report", exist_ok = True)
for line in f:
valid_entry = pattern.search(line)
if valid_entry == None:
print("Invalid entry: '{}'".format(line.strip()))
continue
else:
entry_sale = float(valid_entry.group(5))
total_sale += entry_sale
city_dict.update({valid_entry.group(3) : city_dict.get(valid_entry.group(3), 0) + entry_sale})
categories_dict.update({valid_entry.group(4) : categories_dict.get(valid_entry.group(4), 0) + entry_sale})
filename = "report/" + valid_entry.group(3) + ".txt"
if os.path.exists(filename):
city_file = open(filename, "a")
city_file.write(valid_entry.group(0))
city_file.close()
else:
city_file = open(filename, "w")
city_file.write(valid_entry.group(0))
city_file.close()
f.close()
return (city_dict, categories_dict, total_sale)

The dictionary lookups and updates could be improved by using defaultdict:
from collections import defaultdict
city_dict = defaultdict(float)
categories_dict = defaultdict(float)
...
city = valid_entry.group(3)
category = valid_entry.group(4)
...
city_dict[city] += entry_sale
category_dict[category] += entry_sale

Related

Syntax error calculating the average of student marks while reading from a text file

f = open('studMarks.txt', 'r')
marks = 0
# Sort out names, split the words then sort which order
for line in f:
words = line.split()
fname = words[0]
lname = words[1]
print(f"{lname},{fname}")
f.close()
f = open('studMarks.txt', 'r')
sum = 0
count = 0
for line in f:
count += 1
sum += float(line.split()[2])
n = []
average = sum/count
print(f"{average}")
When using the for loop it seems to display a value of 64.3, which I believe is for the total of the whole student list and average for all marks.
I need to produce the an output which displays the student names and average on the same line. I can do for the names but I cannot do it for the average as I keep getting errors. I don't know what to input in.
Below is the full solution. The with open line is a context manager and ensures that the file will get closed as soon as you exit the block. You should get used to using this style as it's the safe way to do I/O. The rest is just bog standard Python.
marks=dict()
with open('studMarks.txt', 'r') as f:
for line in f:
words = line.split()
fname = words[0]
lname = words[1]
score = int(words[2])
key = f'{fname} {lname}'
count_key = f'{fname} {lname}_count'
latest_score = score + (marks.get(key)[0] if marks.get(key) else 0)
latest_count = 1 + (marks.get(key)[1] if marks.get(key) else 0)
marks[key] = (latest_score, latest_count )
for name, value in marks.items():
print(f'{name} : {value[0]/value[1]}')
This is an interesting problem.
From what I understand you have a text file that looks like this:
Johnny Ly 90 100 Adam Best 80 30 Tim Smith 10 20 in a file called studentMarks2.txt
and want output like this:
Johnny_Ly 95.0 Adam_Best 55.0 Tim_Smith 15.0
if that is true then it can be done using code like this without pandas or csv
though those would make this a lot easier.
fileContents = []
with open('studMarks2.txt','r') as f:
fileContents = f.read().split()
students = dict()
names = []
for content in fileContents:
if content.isnumeric():
studentKey = '_'.join(names)
currentScore = students.get(studentKey,[])
newScore = currentScore + [float(content)]
students.update({studentKey:newScore})
else:
if len(names) == 2:
names.clear()
names.append(content)
else:
names.append(content)
for student,scores in students.items():
avg = sum(scores)/len(scores)
print(student,avg,end=' ')
Broken down
This part reads the contents and splits on white space
fileContents = []
with open('studMarks2.txt','r') as f:
fileContents = f.read().split()
this part then iterates through the contents
storing the names as keys in a dictionary and putting the scores in a list
students = dict()
names = []
for content in fileContents:
if content.isnumeric():
studentKey = '_'.join(names)
currentScore = students.get(studentKey,[])
newScore = currentScore + [float(content)]
students.update({studentKey:newScore})
else:
if len(names) == 2:
names.clear()
names.append(content)
else:
names.append(content)
Lastly it iterates over the dictionary and output the avg on one line
for student,scores in students.items():
avg = sum(scores)/len(scores)
print(student,avg,end=' ')

Efficient method to replace text in file

I have a bunch of large text files (more that 5 million lines each). I have to add a prefix to the line if it contains any of the keywords from a list (6000 keywords).
This is the code I have written:
import os
mypath = "D:\\Temp"
files = os.listdir(mypath)
fcounter = 0
for file in files:
fl = open(mypath + file, 'r')
lines = fl.readlines()
fl.close
fl2 = open("D:\\Temp\\Keys.txt", 'r')
keys = fl2.readlines()
for i, line in enumerate(lines):
if i % 100000 == 0:
print(i)
for key in keys:
if line.find(key.strip()) > 3:
lines[i] = '$' + line
print(line)
fl = open(mypath + file, 'w')
for line in lines:
fl.write("".join(line))
fl.close()
fl2.close()
fcounter += 1
print(f'Processed {fcounter}')
This is extremely slow. It takes several hours to process a single text file on my system.
Is there a better way of doing this?

split() issues with pdf extractText()

I'm working on a minor content analysis program that I was hoping that I could have running through several pdf-files and return the sum of frequencies that some specific words are mentioned in the text. The words that are searched for are specified in a separate text file (list.txt) and can be altered. The program runs just fine through files with .txt format, but the result is completely different when running the program on a .pdf file. To illustrate, the test text that I have the program running trhough is the following:
"Hello
This is a product development notice
We’re working with innovative measures
A nice Innovation
The world that we live in is innovative
We are currently working on a new process
And in the fall, you will experience our new product development introduction"
The list of words grouped in categories are the following (marked in .txt file with ">>"):
innovation: innovat
product: Product, development, introduction
organization: Process
The output from running the code with a .txt file is the following:
Whereas the ouput from running it with a .pdf is the following:
As you can see, my issue is pertaining to the splitting of the words, where in the .pdf output i can have a string like "world" be split into 'w','o','rld'. I have tried to search for why this happens tirelessly, without success. As I am rather new to Python programming, I would appreciate any answe or direction to where I can fin and answer to why this happens, should you know any source.
Thanks
The code for the .txt is as follows:
import string, re, os
import PyPDF2
dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}
i = 2011
while i < 2012:
f = 'annual_report_' + str(i) +'.txt'
textfile = open(f)
text = textfile.read().split() # lowercase the text
print (text)
textfile.close()
i = i + 1
# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0
# import the dictionary
for line in lines:
if line[0:2] == '>>':
current_category = line[2:].strip()
scores[current_category] = 0
else:
line = line.strip()
if len(line) > 0:
pattern = re.compile(line, re.IGNORECASE)
dic[pattern] = current_category
# examine the text
for token in text:
for pattern in dic.keys():
if pattern.match( token ):
categ = dic[pattern]
scores[categ] = scores[categ] + 1
print (os.path.basename(f))
for key in scores.keys():
print (key, ":", scores[key])
While the code for the .pdf is as follows:
import string, re, os
import PyPDF2
dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}
i = 2011
while i < 2012:
f = 'annual_report_' + str(i) +'.pdf'
textfile = open(f, 'rb')
text = PyPDF2.PdfFileReader(textfile)# lowercase the text
for pageNum in range(0, text.numPages):
texts = text.getPage(pageNum)
textfile = texts.extractText().split()
print (textfile)
i = i + 1
# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0
# import the dictionary
for line in lines:
if line[0:2] == '>>':
current_category = line[2:].strip()
scores[current_category] = 0
else:
line = line.strip()
if len(line) > 0:
pattern = re.compile(line, re.IGNORECASE)
dic[pattern] = current_category
# examine the text
for token in textfile:
for pattern in dic.keys():
if pattern.match( token ):
categ = dic[pattern]
scores[categ] = scores[categ] + 1
print (os.path.basename(f))
for key in scores.keys():
print (key, ":", scores[key])

Pick parts from a txt file and copy to another file with python

I'm in trouble here. I need to read a file. Txt file that contains a sequence of records, check the records that I want to copy them to a new file.
The file content is like this (this is just an example, the original file has more than 30 000 lines):
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|316 #begin register
03000|SP|467
99999|33|130 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
The records that begin with 03000 and have the characters 'TO' must be written to a new file. Based on the example, the file should look like this:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Code:
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.read()
file.close()
# here I need to check if the record exists 03000 characters 'TO', if it exists, copy the recordset 00000-99999 for the new file.
I did multiple searches and found nothing to help me.
Thank you!
with open("file.txt",'r') as inFile, open("newFile.txt","w") as outFile:
outFile.writelines(line for line in inFile
if line.startswith("03000") and "TO" in line)
If you need the previous and the next line, then you have to iterate inFile in triads. First define:
def gen_triad(lines, prev=None):
after = current = next(lines)
for after in lines:
yield prev, current, after
prev, current = current, after
And then do like before:
outFile.writelines(''.join(triad) for triad in gen_triad(inFile)
if triad[1].startswith("03000") and "TO" in triad[1])
import re
pat = ('^00000\|\d+\|\d+.*\n'
'^03000\|TO\|\d+.*\n'
'^99999\|\d+\|\d+.*\n'
'|'
'^AAAAA\|\d+\|\d+.*\n'
'|'
'^ZZZZZ\|\d+\|\d+.*')
rag = re.compile(pat,re.MULTILINE)
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
g.write(''.join(rag.findall(f.read())))
For files with additional lines between lines beginning with 00000, 03000 and 99999, I didn't find simpler code than this one:
import re
pat = ('(^00000\|\d+\|\d+.*\n'
'(?:.*\n)+?'
'^99999\|\d+\|\d+.*\n)'
'|'
'(^AAAAA\|\d+\|\d+.*\n'
'|'
'^ZZZZZ\|\d+\|\d+.*)')
rag = re.compile(pat,re.MULTILINE)
pit = ('^00000\|.+?^03000\|TO\|\d+.+?^99999\|')
rig = re.compile(pit,re.DOTALL|re.MULTILINE)
def yi(text):
for g1,g2 in rag.findall(text):
if g2:
yield g2
elif rig.match(g1):
yield g1
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
g.write(''.join(yi(f.read())))
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.readlines()
file.close()
newFile.writelines(filter(lambda x:x.startswith("03000") and "TO" in x,content))
This seems to work. The other answers seem to only be writing out records that contain '03000|TO|' but you have to write out the record before and after that as well.
import sys
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# import file
file_name = sys.argv[1]
file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name
file = open(file_path,"r")
# ---------------------------------------------------------------
# create output files
output_file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name + '.out'
output_file = open(output_file_path,"w")
# create output files
# ---------------------------------------------------------------
# process file
temp = ''
temp_out = ''
good_write = False
bad_write = False
for line in file:
if line[:5] == 'AAAAA':
temp_out += line
elif line[:5] == 'ZZZZZ':
temp_out += line
elif good_write:
temp += line
temp_out += temp
temp = ''
good_write = False
elif bad_write:
bad_write = False
temp = ''
elif line[:5] == '03000':
if line[6:8] != 'TO':
temp = ''
bad_write = True
else:
good_write = True
temp += line
temp_out += temp
temp = ''
else:
temp += line
output_file.write(temp_out)
output_file.close()
file.close()
Output:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Does it have to be python? These shell commands would do the same thing in a pinch.
head -1 inputfile.txt > outputfile.txt
grep -C 1 "03000|TO" inputfile.txt >> outputfile.txt
tail -1 inputfile.txt >> outputfile.txt
# Whenever I have to parse text files I prefer to use regular expressions
# You can also customize the matching criteria if you want to
import re
what_is_being_searched = re.compile("^03000.*TO")
# don't use "file" as a variable name since it is (was?) a builtin
# function
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
for this_line in source_file:
if what_is_being_searched.match(this_line):
destination_file.write(this_line)
and for those who prefer a more compact representation:
import re
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
destination_file.writelines(this_line for this_line in source_file
if re.match("^03000.*TO", this_line))
code:
fileName = '1'
fil = open(fileName,'r')
import string
##step 1: parse the file.
parsedFile = []
for i in fil:
##tuple1 = (1,2,3)
firstPipe = i.find('|')
secondPipe = i.find('|',firstPipe+1)
tuple1 = (i[:firstPipe],\
i[firstPipe+1:secondPipe],\
i[secondPipe+1:i.find('\n')])
parsedFile.append(tuple1)
fil.close()
##search criterias:
searchFirst = '03000'
searchString = 'TO' ##can be changed if and when required
##step 2: used the parsed contents to write the new file
filout = open('newFile','w')
stringToWrite = parsedFile[0][0] + '|' + parsedFile[0][1] + '|' + parsedFile[0][2] + '\n'
filout.write(stringToWrite) ##to write the first entry
for i in range(1,len(parsedFile)):
if parsedFile[i][1] == searchString and parsedFile[i][0] == searchFirst:
for j in range(-1,2,1):
stringToWrite = parsedFile[i+j][0] + '|' + parsedFile[i+j][1] + '|' + parsedFile[i+j][2] + '\n'
filout.write(stringToWrite)
stringToWrite = parsedFile[-1][0] + '|' + parsedFile[-1][1] + '|' + parsedFile[-1][2] + '\n'
filout.write(stringToWrite) ##to write the first entry
filout.close()
I know that this solution may be a bit long. But it is quite easy to understand. And it seems an intuitive way to do it. And I have already checked this with the Data that you have provided and it works perfectly.
Please tell me if you need some more explanation on the code. I will definitely add the same.
I tip (Beasley and Joran elyase) very interesting, but it only allows to get the contents of the line 03000. I would like to get the contents of the lines 00000 to line 99999.
I even managed to do here, but I am not satisfied, I wanted to make a more cleaner.
See how I did:
file = open(url,'r')
newFile = open("newFile.txt",'w')
lines = file.readlines()
file.close()
i = 0
lineTemp = []
for line in lines:
lineTemp.append(line)
if line[0:5] == '03000':
state = line[21:23]
if line[0:5] == '99999':
if state == 'TO':
newFile.writelines(lineTemp)
else:
linhaTemp = []
i = i+1
newFile.close()
Suggestions...
Thanks to all!

Group and Check-mark using Python

I have several files, each of which has data like this (filename:data inside separated by newline):
Mike: Plane\nCar
Paula: Plane\nTrain\nBoat\nCar
Bill: Boat\nTrain
Scott: Car
How can I create a csv file using python that groups all the different vehicles and then puts a X on the applicable person, like:
Assuming those line numbers aren't in there (easy enough to fix if they are), and with an input file like following:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
Solution can be found here : https://gist.github.com/999481
import sys
from collections import defaultdict
import csv
# see http://stackoverflow.com/questions/6180609/group-and-check-mark-using-python
def main():
# files = ["group.txt"]
files = sys.argv[1:]
if len(files) < 1:
print "usage: ./python_checkmark.py file1 [file2 ... filen]"
name_map = defaultdict(set)
for f in files:
file_handle = open(f, "r")
process_file(file_handle, name_map)
file_handle.close()
print_csv(sys.stdout, name_map)
def process_file(input_file, name_map):
cur_name = ""
for line in input_file:
if ":" in line:
cur_name, item = [x.strip() for x in line.split(":")]
else:
item = line.strip()
name_map[cur_name].add(item)
def print_csv(output_file, name_map):
names = name_map.keys()
items = set([])
for item_set in name_map.values():
items = items.union(item_set)
writer = csv.writer(output_file, quoting=csv.QUOTE_MINIMAL)
writer.writerow( [""] + names )
for item in sorted(items):
row_contents = map(lambda name:"X" if item in name_map[name] else "", names)
row = [item] + row_contents
writer.writerow( row )
if __name__ == '__main__':
main()
Output:
,Mike,Bill,Scott,Paula
Boat,,X,,X
Car,X,,X,X
Plane,X,,,X
Train,,X,,X
Only thing this script doesn't do is keep the columns in order that the names are in. Could keep a separate list maintaining the order, since maps/dicts are inherently unordered.
Here is an example of how-to parse these kind of files.
Note that the dictionary is unordered here. You can use ordered dict (in case of Python 3.2 / 2.7) from standard library, find any available implmentation / backport in case if you have older Python versions or just save an order in additional list :)
data = {}
name = None
with open(file_path) as f:
for line in f:
if ':' in line: # we have a name here
name, first_vehicle = line.split(':')
data[name] = set([first_vehicle, ]) # a set of vehicles per name
else:
if name:
data[name].add(line)
# now a dictionary with names/vehicles is available
# let's convert it to simple csv-formatted string..
# a set of all available vehicles
vehicles = set(v for vlist in data.values()
for v in vlist)
for name in data:
name_vehicles = data[name]
csv_vehicles = ''
for v in vehicles:
if v in name_vehicles:
csv_vehicles += v
csv_vehicles += ','
csv_line = name + ',' + csv_vehicles
Assuming that the input looks like this:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
This python script, places the vehicles in a dictionary, indexed by the person:
#!/usr/bin/python
persons={}
vehicles=set()
with open('input') as fd:
for line in fd:
line = line.strip()
if ':' in line:
tmp = line.split(':')
p = tmp[0].strip()
v = tmp[1].strip()
persons[p]=[v]
vehicles.add(v)
else:
persons[p].append(line)
vehicles.add(line)
for k,v in persons.iteritems():
print k,v
print 'vehicles', vehicles
Result:
Mike ['Plane', 'Car']
Bill ['Boat', 'Train']
Scott ['Car']
Paula ['Plane', 'Train', 'Boat', 'Car']
vehicles set(['Train', 'Car', 'Plane', 'Boat'])
Now, all the data needed are placed in data-structures. The csv-part is left as an exercise for the reader :-)
The most elegant and simple way would be like so:
vehiclesToPeople = {}
people = []
for root,dirs,files in os.walk('/path/to/folder/with/files'):
for file in files:
person = file
people += [person]
path = os.path.join(root, file)
with open(path) as f:
for vehicle in f:
vehiclesToPeople.setdefault(vehicle,set()).add(person)
people.sort()
table = [ ['']+people ]
for vehicle,owners in peopleToVehicles.items():
table.append([('X' if p in vehiclesToPeople[vehicle] else '') for p in people])
csv = '\n'.join(','.join(row) for row in table)
You can do pprint.pprint(table) as well to look at it.

Categories