I've scraped a bunch of words from the dictionary, and created a massive CSV file with all of them, one word per row.
I have another function, which reads from that massive CSV file, and then creates smaller CSV files.
The function is supposed to create CSV files with only 500 words/rows, but something is amiss. The first file has 501 words/rows. The rest of the files have 502 words/rows.
Man, maybe I'm tired, but I can't seem to spot what exactly is causing this in my code. Or is there nothing wrong with my code at all?
Below is the part of the function that I'm assuming is causing the problem. The full function can be seen below that.
Suspect Part of Function
def create_csv_files():
# Question listing (indentation was lost in extraction): splits dictionary.csv
# into numbered terms###.csv files that are meant to hold `limit` rows each.
limit = 500
count = 0
filecount = 1
zfill = 3
filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format('1'.zfill(zfill))
with open('C:\\Users\\Anthony\\Desktop\\Scrape\\Results\\dictionary.csv') as readfile:
csvReader = csv.reader(readfile)
for row in csvReader:
term = row[0]
# Strip every space from the term before writing it out.
if ' ' in term:
term = term.replace(' ', '')
# NOTE(review): `<=` lets count reach limit + 1 before the rollover, and the
# rollover branch resets count to 0 while the current row is still written.
# That produces the 501 / 502 row files described in the question.
if count <= limit:
count += 1
else:
count = 0
filecount += 1
filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format(str(filecount).zfill(zfill))
# Append when the target file already exists, otherwise create it.
aw = 'a' if os.path.exists(filename) else 'w'
with open(filename, aw, newline='') as writefile:
fieldnames = [ 'term' ]
writer = csv.DictWriter(writefile, fieldnames=fieldnames)
writer.writerow({
'term': term
})
The Whole Function
def create_csv_files():
# Question listing (indentation was lost in extraction): routes idiom rows to
# idioms.csv and splits all other terms into numbered terms###.csv files that
# are meant to hold `limit` rows each.
limit = 500
count = 0
filecount = 1
zfill = 3
idiomsfilename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\idioms.csv'
filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format('1'.zfill(zfill))
with open('C:\\Users\\Anthony\\Desktop\\Scrape\\Results\\dictionary.csv') as readfile:
csvReader = csv.reader(readfile)
for row in csvReader:
term = row[0]
# Rows containing 'idiom' (other than the bare ' idiom' marker) go to the
# idioms file with their last five characters removed.
if 'idiom' in row[0] and row[0] != ' idiom':
term = row[0][:-5]
aw = 'a' if os.path.exists(idiomsfilename) else 'w'
with open(idiomsfilename, aw, newline='') as idiomsfile:
idiomsfieldnames = ['idiom']
idiomswriter = csv.DictWriter(idiomsfile, fieldnames=idiomsfieldnames)
idiomswriter.writerow({
'idiom':term
})
continue
else:
if ' ' in term:
term = term.replace(' ', '')
# NOTE(review): `<=` lets count reach limit + 1 before the rollover, and the
# rollover branch resets count to 0 while the current row is still written.
# That produces the 501 / 502 row files described in the question.
if count <= limit:
count += 1
else:
count = 0
filecount += 1
filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format(str(filecount).zfill(zfill))
# Append when the target file already exists, otherwise create it.
aw = 'a' if os.path.exists(filename) else 'w'
with open(filename, aw, newline='') as writefile:
fieldnames = [ 'term' ]
writer = csv.DictWriter(writefile, fieldnames=fieldnames)
writer.writerow({
'term': term
})
print(term)
So the reason why the files have weird number of rows is because of your if-else conditions.
You increment count when count is less than or equal to limit. For your very first iteration, you increment to 1 then write your first term, then increment and so on. Because you use <= instead of the strict inequality, you will still increment at count = 500 and write the 501st word.
From the second loop onwards, your first word is written at count = 0. The loop terminates again at count = 501 so you write 502 words this time.
To fix this, check for count >= limit, and create a new file if so. Increment count after you write to the CSV file and not before. That should help.
def create_csv_files(
    source_path='C:\\Users\\Anthony\\Desktop\\Scrape\\Results\\dictionary.csv',
    output_pattern='C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv',
    limit=500,
    zfill=3,
):
    """Split a one-term-per-row CSV into numbered files of ``limit`` rows.

    Args:
        source_path: CSV file to read; only the first column of each row is used.
        output_pattern: Path template with one ``{}`` placeholder that receives
            the zero-padded file number (``001``, ``002``, ...).
        limit: Maximum number of rows written to each output file.
        zfill: Width the file number is zero-padded to.

    Raises:
        ValueError: If ``limit`` is smaller than 1.

    Spaces are removed from every term. Output files are opened in append
    mode, so rows are added to files left over from an earlier run.
    """
    if limit < 1:
        raise ValueError('limit must be at least 1')

    writefile = None
    writer = None
    count = 0
    filecount = 0
    try:
        # newline='' lets the csv module handle line endings itself.
        with open(source_path, newline='') as readfile:
            for row in csv.reader(readfile):
                if not row:
                    continue  # blank line in the source file
                term = row[0].replace(' ', '')
                # Roll over *before* writing, so each file gets exactly
                # `limit` rows (the last one may be shorter).
                if writefile is None or count >= limit:
                    if writefile is not None:
                        writefile.close()
                    filecount += 1
                    count = 0
                    filename = output_pattern.format(str(filecount).zfill(zfill))
                    writefile = open(filename, 'a', newline='')
                    writer = csv.writer(writefile)
                writer.writerow([term])
                count += 1
    finally:
        if writefile is not None:
            writefile.close()
Related
# Question listing (indentation was lost in extraction).
# First pass: print every student as "lname,fname".
f = open('studMarks.txt', 'r')
marks = 0
# Sort out names, split the words then sort which order
for line in f:
words = line.split()
fname = words[0]
lname = words[1]
print(f"{lname},{fname}")
f.close()
# Second pass: add up the third column of every line and count the lines, so
# `average` is one figure for the whole file, not one per student.
# NOTE(review): the second file handle is never closed, `sum` shadows the
# builtin, and `marks` / `n` are assigned but never used.
f = open('studMarks.txt', 'r')
sum = 0
count = 0
for line in f:
count += 1
sum += float(line.split()[2])
n = []
average = sum/count
print(f"{average}")
When using the for loop it seems to display a value of 64.3, which I believe is for the total of the whole student list and average for all marks.
I need to produce output that displays each student's name and average on the same line. I can do it for the names, but I cannot do it for the average, as I keep getting errors. I don't know what to put in.
Below is the full solution. The with open line is a context manager and ensures that the file will get closed as soon as you exit the block. You should get used to using this style as it's the safe way to do I/O. The rest is just bog standard Python.
# Map "first last" -> (running total of marks, number of marks seen).
marks = {}
with open('studMarks.txt', 'r') as f:
    for line in f:
        words = line.split()
        if len(words) < 3:
            continue  # blank or incomplete line: nothing to record
        fname, lname = words[0], words[1]
        # float() rather than int() so marks such as "64.5" are accepted.
        score = float(words[2])
        key = f'{fname} {lname}'
        total, count = marks.get(key, (0.0, 0))
        marks[key] = (total + score, count + 1)

# One line per student: name and the average of that student's marks.
for name, (total, count) in marks.items():
    print(f'{name} : {total / count}')
This is an interesting problem.
From what I understand you have a text file that looks like this:
Johnny Ly 90 100 Adam Best 80 30 Tim Smith 10 20 in a file called studentMarks2.txt
and want output like this:
Johnny_Ly 95.0 Adam_Best 55.0 Tim_Smith 15.0
if that is true then it can be done using code like this without pandas or csv
though those would make this a lot easier.
# Read the whole file as a flat list of whitespace-separated tokens.
with open('studMarks2.txt', 'r') as f:
    fileContents = f.read().split()

# Map "First_Last" -> list of that student's scores.
students = {}
names = []
for content in fileContents:
    # A token is a score when it is made of decimal digits with at most one
    # '.', so "90" and "90.5" both count (str.isnumeric() rejects "90.5").
    if content.replace('.', '', 1).isdecimal():
        students.setdefault('_'.join(names), []).append(float(content))
    else:
        # A name token after a complete first/last pair starts a new student.
        if len(names) == 2:
            names.clear()
        names.append(content)

# Print every student and their average on a single line.
for student, scores in students.items():
    avg = sum(scores) / len(scores)
    print(student, avg, end=' ')
Broken down
This part reads the contents and splits on white space
fileContents = []
with open('studMarks2.txt','r') as f:
fileContents = f.read().split()
this part then iterates through the contents
storing the names as keys in a dictionary and putting the scores in a list
# Map "First_Last" -> list of that student's scores.
students = {}
names = []
for content in fileContents:
    # A token is a score when it is made of decimal digits with at most one
    # '.', so "90" and "90.5" both count (str.isnumeric() rejects "90.5").
    if content.replace('.', '', 1).isdecimal():
        students.setdefault('_'.join(names), []).append(float(content))
    else:
        # A name token after a complete first/last pair starts a new student.
        if len(names) == 2:
            names.clear()
        names.append(content)
Lastly it iterates over the dictionary and output the avg on one line
for student,scores in students.items():
avg = sum(scores)/len(scores)
print(student,avg,end=' ')
To change the values from 10 to 18, 19 or 20, I am splitting the string, accessing the substrings and then trying to change them. The script runs, but it does not change the values. Here is the solution I am trying to implement:
# Question listing (indentation was lost in extraction): copies tryout.hmo to
# tryout_NEW.hmo and tries to replace element values using Density.topo.
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'
readme = open( oldFileName, "r" )
oldLines = readme.readlines()
# NOTE(review): `readme` is rebound here, so the first handle is never closed.
readme = open(topoFileName, "r")
Lines = readme.readlines()
readme.close()
newFile = open(newFileName,"w")
for row in oldLines:
for line in Lines:
tmp = line.split()
# NOTE(review): `list` shadows the builtin type.
list = row.rstrip()
tmp1 = list.split()
newFile.write(row)
if row.find("BEG_ELEM_DATA") > -1:
if tmp[0] == tmp1[0]:
# NOTE(review): tmp holds strings from split(), so comparing them with the
# ints 1 and 0 is never true; tmp1 is a list and has no replace() method.
if tmp[2] == 1 and tmp[3] == 0:
# it is magnet, value 18
newFile.write(tmp1.replace(tmp1[1], "18"))
elif tmp[2] == 1 and tmp[3] == 1:
# it is iron, value 19
newFile.write(tmp1.replace(tmp1[1], "19"))
else:
# it is air, value 20
newFile.write(tmp1.replace(tmp1[1], "20"))
newFile.close()
I would really appreciate it if you could solve this problem in the script above; with that fixed, I think it should work.
I'm also still a beginner in Python, but I tried to solve your problem and here is my solution:
I guess there are way better ways to do it because here you have to import all data to a dataframe before comparing it.
Also I don't know if you can read your data with pd.read_csv to a dataframe because I don't know *.hmo and *.topo
import pandas as pd

df = pd.read_csv('tryout.csv', delimiter=';')
df2 = pd.read_csv('density.csv', delimiter=';')

# Build an ID -> material code lookup from the density table.
# Columns are addressed by position: 0 = ID, 2 and 3 = the two density flags.
# A later row with the same ID overrides an earlier one.
codes = {}
for rec in df2.itertuples(index=False):
    if rec[2] == 1 and rec[3] == 0:
        codes[rec[0]] = 18  # it is magnet
    elif rec[2] == 1 and rec[3] == 1:
        codes[rec[0]] = 19  # it is iron
    else:
        codes[rec[0]] = 20  # it is air

# Write the code into the second column of every matching row.
# Assigning to the row object yielded by iterrows() is not guaranteed to
# change the DataFrame, so the cell is set on df itself.
for pos, element_id in enumerate(df.iloc[:, 0]):
    if element_id in codes:
        df.iat[pos, 1] = codes[element_id]

df.to_csv('new_tryout.csv')
What my code is doing here, it loads both files to dataframes. Then iterate over every line to compare where the ID in both files is the same (e.g 3749).
If true there are the 3 if statements whether it is magnet/iron/air and change the value in df to the right number.
At the end save the new df to a new file 'new_tryout.csv'
I created 2 testfiles for it and it worked the way it should.
Finally, here is the solution you were searching for.
# Answer listing (indentation was lost in extraction): rewrites the two-character
# field at columns 11-12 of tryout.hmo from '10' to '18' / '19' / '20' based on
# the rounded C and D columns of Density.topo.
import pandas as pd
df2 = pd.read_csv('Density.topo', header = 0, names = list('ABCD'), delimiter=r'\s+', skiprows=1)
df2[['C', 'D']]= df2[['C', 'D']].round()
new_file_content=''
with open('tryout.hmo', 'r') as f:
for line in f:
if line[11:13] == '10':
# The ID sits in fixed columns; a blank at index 3 means a 4-digit ID.
if line[3].isspace():
ID_to_search_for = line[4:8] # number with 4 digits
else:
ID_to_search_for = line[3:8] # number with 5 digits
# NOTE(review): ID_to_search_for is a str; this only matches if column A was
# read as strings -- TODO confirm. index[0] raises IndexError when no row matches.
search_idx = df2[df2['A'] == ID_to_search_for].index[0]
if df2['C'][search_idx] == 1 and df2['D'][search_idx] == 0:
change = '18' #magnet
new_line = line[:11] + change + line[13:]
elif df2['C'][search_idx] == 1 and df2['D'][search_idx] == 1:
change = '19' #iron
new_line = line[:11] + change + line[13:]
else:
change = '20' #air
new_line = line[:11] + change + line[13:]
new_file_content += new_line
else:
new_file_content += line
with open('tryout_changed.hmo', 'w') as f:
f.write(new_file_content)
if you don't want to use dataframes, you can do it like this:
with open('density.topo') as f:
    lists_of_list = [line.rstrip().split() for line in f]

# ID -> (C, D) flags rounded to whole numbers, e.g. '1.0' -> 1, '1e-05' -> 0.
# The fields are strings after split(), so they must be converted before
# they can be compared with 1 and 0. The first row for an ID wins.
flags_by_id = {}
for sublist in lists_of_list:
    try:
        flags = (round(float(sublist[2])), round(float(sublist[3])))
    except (IndexError, ValueError, OverflowError):
        continue  # header, footer or blank line: not an element row
    flags_by_id.setdefault(sublist[0], flags)

new_lines = []
with open('tryout_test.hmo', 'r') as f:
    for line in f:
        flags = None
        if line[11:13] == '10':
            # The ID is right-aligned in columns 3-7 (4 or 5 digits).
            flags = flags_by_id.get(line[3:8].strip())
        if flags is None:
            # Not a '10' element line, or its ID is unknown: keep it unchanged.
            new_lines.append(line)
            continue
        if flags == (1, 0):
            change = '18'  # magnet
        elif flags == (1, 1):
            change = '19'  # iron
        else:
            change = '20'  # air
        new_lines.append(line[:11] + change + line[13:])

with open('tryout_changed.hmo', 'w') as f:
    f.write(''.join(new_lines))
ok, here is my final answer. It does (again) all things you were searching for. Please debug your code in your IDE if there is a problem. You should start using context manager instead of open and closing files step by step.
I wrote the new code around your code in the question and added some comments to it.
# Answer listing (indentation was lost in extraction): inserts three component
# lines before END_COMP_DATA, bumps the count on the fourth line by 3, and
# rewrites ' 10 ' element values to 18 / 19 / 20 based on density.topo.
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'
# NOTE(review): `readme` is never closed, and topoFileName is not used below
# (the file is opened later as 'density.topo').
readme = open( oldFileName, "r" )
oldLines = readme.readlines()
# The fourth line of the file is parsed as an integer; it is raised by 3
# because three lines are inserted further down.
m = int(oldLines[3])
print(m)
new_m = m+3
m1 = str(m)
new_m1 = str(new_m)
Phrase = "END_COMP_DATA"
#n = "Phrase not found" #not used --> not needed
# Remember the index of the last line containing the phrase.
# NOTE(review): `n` stays undefined if the phrase never occurs.
with open(oldFileName,"r") as oldFile:
for number, lin in enumerate(oldFile):
if Phrase in lin:
n = number
#insert 3 lines to tryout_new at the right position (--> row n)
magnet = f" {m+1} "'" topo_magnet"'"\n"
iron = f" {m+2} "'" topo_iron"'"\n"
air = f" {m+3} "'" topo_air"'"\n"
oldLines[n:n] = [magnet, iron, air]
newFile = open(newFileName,"w")
# flag becomes 1 once the BEG_ELEM_DATA line has been seen.
flag = 0
with open('density.topo') as f:
data_density = [line.rstrip().split() for line in f]
for idx, row in enumerate(oldLines):
lst = row.rstrip() #I think you shouldn't name a variable like a class in python (list). use 'lst' or something like that
tmp_tryout = lst.split()
if row.find("BEG_ELEM_DATA") > -1:
flag = 1
if flag == 1 and len(tmp_tryout)>1:
# if the column has more than 2 columns (after split), check for the "10"
if tmp_tryout[1] == '10':
# density_idx_line searchs in density.topo for a match with tmp_tryout[0] (e.g. 3749) and stores the whole line
density_idx_line = list(filter(lambda x: x[0] == tmp_tryout[0], data_density))
if len(density_idx_line) >0:
# The density fields are compared as the exact strings '1.0' and '1e-05'.
if density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1e-05':
# the ' 10 ' is the 10 with a whitespace before and after it. Only like this only the 10 gets replaced (and not e.g. 3104 to 3184)
newFile.write(row.replace(' 10 ', ' 18 '))
elif density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1.0':
newFile.write(row.replace(' 10 ', ' 19 '))
else:
newFile.write(row.replace(' 10 ', ' 20 '))
else:
newFile.write(row)
else:
# Index 3 is the count line read into m above; write the raised value.
if idx == 3:
newFile.write(row.replace(m1, new_m1))
else:
newFile.write(row)
newFile.close()
print ("script terminated successfully!")
OK, here is another solution. For anybody else who reads this: it is still only a temporary solution, but neither Sagar nor I know how to do it better.
import numpy as np
import pandas as pd

# Element table: skip the first 52362 lines of tryout.hmo, then name the columns A..M.
df = pd.read_csv('tryout.hmo', header=0, names=list('ABCDEFGHIJKLM'),
                 delimiter=r'\s+', skiprows=52362)
# Density table: column A is the shared ID, O and P are the density flags.
df2 = pd.read_csv('Density.topo', header=0, names=list('ANOP'),
                  delimiter=r'\s+', skiprows=1)
df2 = df2.iloc[:-3, :]  # drop the last three rows of the density table

df3 = df.merge(df2, how='outer', on='A')
# Truncate O and P to whole numbers while keeping missing values as NaN.
df3[['O', 'P']] = df3[['O', 'P']].fillna(-1).astype(int).replace(-1, np.nan)


def _recode(x):
    """Return the new value for column B of one merged row."""
    if x['B'] != 10:
        return x['B']
    if x['O'] == 1 and x['P'] == 0:
        return 18
    if x['O'] == 1 and x['P'] == 1:
        return 19
    if x['O'] == 0 and x['P'] == 0:
        return 20
    return x['B']


df3['B'] = df3.apply(_recode, axis=1)
df3.to_csv('new_tryout.csv')
It finished the code in less than a second, so it is far better than iterrows or itertuples.
The new csv file includes both the tryout file and the density file. They are merged together by the first column of tryout file (ID i guess)
I didn't check all of this very big file but from the few random points I checked, it seems as this way works.
I have a fragment of code which loads data from a .csv file.
It's written for Python 2.7 but in Python 3.6 does not work.
def load_new_data(self):
# Question listing (Python 2; indentation was lost in extraction): loads the
# ';'-delimited file at self.filename into a DataFrame.
full = list()
# NOTE(review): 'rb' is what the Python 2 csv module expects; on Python 3
# csv.reader needs a text-mode file, which is the error quoted below.
with open(self.filename, 'rb') as csv_in:
myreader2 = csv.reader(csv_in, delimiter=';')
count = 0
for row in myreader2:
# Row 0 supplies the headers (first field dropped), row 1 is skipped, and
# every later row is kept without its first and last field.
if count == 0:
headers = row[1:]
count += 1
elif count == 1:
count += 1
else:
current_row = row[1:-1]
full.append(current_row)
count += 1
new_df = pd.DataFrame.from_records(full, columns=headers)
# Drop the first record and keep at most the first 80 columns.
new_df = new_df.iloc[1:, :80]
self.fill_in_blanks(new_df)
new_df = dp.remove_inc_variables(new_df, .1)
print '\t Removing incomplete variables.'
# Convert each column to float where possible.
# NOTE(review): the bare except silently leaves any failing column unchanged.
for i in new_df.columns:
try:
new_df.loc[:, i] = new_df.loc[:, i].astype(float)
except:
pass
return new_df
the error I get is:
212
213 count = 0
--> 214 for row in myreader2:
215 if count == 0:
216 headers = row[1:]
Error: iterator should return strings, not bytes (did you open the file in
text mode?)
I did try changing the 'rb' to 'r' and 'rt' and even deleting it, as other posts here suggest, but with no success...
try this
def load_new_data(self):
    """Load the ';'-delimited CSV at ``self.filename`` into a DataFrame.

    The first row supplies the column headers (its first field is dropped),
    the second row is skipped, and every later row is kept without its first
    and last field. The first record is then discarded, at most 80 columns
    are kept, and each column is converted to float where possible.

    Returns:
        The DataFrame returned by ``dp.remove_inc_variables`` after the
        float conversion.

    Raises:
        ValueError: If the file has no header row.
    """
    # csv.reader needs a text-mode file on Python 3; newline='' leaves the
    # newline handling to the csv module.
    with open(self.filename, 'r', newline='') as csv_in:
        myreader2 = csv.reader(csv_in, delimiter=';')
        header_row = next(myreader2, None)
        if header_row is None:
            raise ValueError('{} is empty: no header row found'.format(self.filename))
        headers = header_row[1:]
        next(myreader2, None)  # the second row is skipped
        full = [row[1:-1] for row in myreader2]
    new_df = pd.DataFrame.from_records(full, columns=headers)
    new_df = new_df.iloc[1:, :80]
    # NOTE(review): presumably fills missing cells in place -- the helper is
    # defined outside this listing.
    self.fill_in_blanks(new_df)
    new_df = dp.remove_inc_variables(new_df, .1)
    print('\t Removing incomplete variables.')
    for i in new_df.columns:
        try:
            new_df.loc[:, i] = new_df.loc[:, i].astype(float)
        except (ValueError, TypeError):
            # The column holds values that are not numbers; leave it as it is.
            pass
    return new_df
You should try the codecs module for opening the file. Be careful to use the file's actual encoding.
Sample:
def load_new_data(self):
    """Load the ';'-delimited CSV at ``self.filename`` into a DataFrame.

    The file is decoded with ``codecs.open``. The first row supplies the
    column headers (its first field is dropped), the second row is skipped,
    and every later row is kept without its first field. The first record is
    then discarded, at most 80 columns are kept, and each column is converted
    to float where possible.

    Returns:
        The DataFrame returned by ``dp.remove_inc_variables`` after the
        float conversion.
    """
    with codecs.open(self.filename, 'rb', encoding="cp1251") as csv_in:  # cp1251 replace for your encoding!
        myreader2 = csv.reader(csv_in, delimiter=';')
        headers = next(myreader2)[1:]
        next(myreader2)  # the second row is skipped
        full = [row[1:] for row in myreader2]
    new_df = pd.DataFrame.from_records(full, columns=headers)
    new_df = new_df.iloc[1:, :80]
    # NOTE(review): presumably fills missing cells in place -- the helper is
    # defined outside this listing.
    self.fill_in_blanks(new_df)
    new_df = dp.remove_inc_variables(new_df, .1)
    print('\t Removing incomplete variables.')
    for i in new_df.columns:
        try:
            new_df.loc[:, i] = new_df.loc[:, i].astype(float)
        except (ValueError, TypeError):
            # The column holds values that are not numbers; leave it as it is.
            pass
    return new_df
The client includes 3 rows at the bottom that contain totals for me to reconcile against in my program. Only problem is that my program is exhausting the input file with readlines() before it can do anything else. Is there a way to keep the file from being exhausted during my get_recon_total function call?
#!/usr/bin/env python
# pre_process.py
import csv
import sys
def main():
# Question listing (Python 2; indentation was lost in extraction): reads the
# input CSV named in argv[1], regroups its rows and writes them to argv[2].
infile = sys.argv[1]
outfile = sys.argv[2]
with open(infile, 'rbU') as in_obj:
# Create reader object, get fieldnames for later on
reader, fieldnames = open_reader(in_obj)
# NOTE(review): get_recon_totals() calls readlines() on the same file object
# that `reader` wraps, so the file is consumed before sorted(reader, ...) runs.
nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
# This switches the dictionary to a sorted list... necessary??
reader_list = sorted(reader, key=lambda key: (key['PEOPLE_ID'],
key['DON_DATE']))
# Create a list to contain section header information
header_list = create_header_list(reader_list)
# Create dictionary that contains header list as the key,
# then all rows that match as a list of dictionaries.
master_dict = map_data(header_list, reader_list)
# Write data to processed file, create recon counts to compare
# to footer record
tot_cnt, rec_cnt, erec_cnt = write_data(master_dict, outfile, fieldnames)
print tot_cnt, rec_cnt, erec_cnt
def open_reader(file_obj):
'''
Uses DictReader from the csv module to take the first header line
as the fieldnames, then applies them to each element in the file.
Returns the DictReader object and the fieldnames being used (used
later when data is printed out with DictWriter.)
'''
# Comma-delimited input; no explicit fieldnames are passed to DictReader.
reader = csv.DictReader(file_obj, delimiter=',')
return reader, reader.fieldnames
def create_header_list(in_obj):
# Returns the distinct (PEOPLE_ID, DON_DATE) pairs of the rows in in_obj, in
# the order they are first seen.
p_id_list = []
for row in in_obj:
# Membership is tested against a list, so each check scans p_id_list.
if (row['PEOPLE_ID'], row['DON_DATE']) not in p_id_list:
p_id_list.append((row['PEOPLE_ID'], row['DON_DATE']))
return p_id_list
def map_data(header_list, data_obj):
# Builds a dict that maps an extended header tuple to the list of rows that
# share that header's (PEOPLE_ID, DON_DATE) pair.
master_dict = {}
client_section_list = []
for element in header_list:
# Collect every row belonging to this header.
for row in data_obj:
if (row['PEOPLE_ID'], row['DON_DATE']) == element:
client_section_list.append(row)
# Extend the header with amount fields taken from the first matching row.
element = list(element)
element_list = [client_section_list[0]['DEDUCT_AMT'],
client_section_list[0]['ND_AMT'],
client_section_list[0]['DEDUCT_YTD'],
client_section_list[0]['NONDEDUCT_YTD']
]
# Append the sum of the two YTD fields when both parse as floats; otherwise
# the header is left without that extra value.
try:
element_list.append((float(client_section_list[0]['DEDUCT_YTD']) +
float(client_section_list[0]['NONDEDUCT_YTD'])
))
except ValueError:
pass
element.extend(element_list)
element = tuple(element)
master_dict[element] = client_section_list
client_section_list = []
return master_dict
def write_data(in_obj, outfile, in_fieldnames):
# Writes each header tuple (prefixed ' -01- ') followed by its rows (each
# prefixed ' -02- ') to outfile and returns (tot_cnt, rec_cnt, email_cnt).
with open(outfile, 'wb') as writer_outfile:
writer = csv.writer(writer_outfile, delimiter=',')
# Row dict keys that are not in in_fieldnames are ignored when writing.
dict_writer = csv.DictWriter(writer_outfile,
fieldnames=in_fieldnames,
extrasaction='ignore')
tot_cnt = 0
rec_cnt = 0
email_cnt = 0
# iteritems() is the Python 2 dict API.
for k, v in in_obj.iteritems():
writer_outfile.write(' -01- ')
writer.writerow(k)
rec_cnt += 1
for i, e in enumerate(v):
# Count rows where either INT_CODE field is non-empty.
if v[i]['INT_CODE_EX0006'] != '' or v[i]['INT_CODE_EX0028'] != '':
email_cnt += 1
writer_outfile.write(' -02- ')
dict_writer.writerow(e)
tot_cnt += 1
return tot_cnt, rec_cnt, email_cnt
def get_recon_totals(in_obj):
# Scans the lines of in_obj for the 'T' footer rows and returns the third
# field of the Total Amount, Receipt Count and Email Receipt Count rows.
# Each value stays 0 when its row is not found.
print in_obj
client_tot_cnt = 0
client_rec_cnt = 0
client_erec_cnt = 0
# NOTE(review): readlines() reads the file object to its end, so nothing is
# left for any other reader that shares it.
for line in in_obj.readlines():
line = line.split(',')
if line[0] == 'T' and line[1] == 'Total Amount':
print 'Total Amount found.'
client_tot_cnt = line[2]
elif line[0] == 'T' and line[1] == 'Receipt Count':
print 'Receipt Count found.'
client_rec_cnt = line[2]
elif line[0] == 'T' and line[1] == 'Email Receipt Count':
print 'E-Receipt Count Found.'
client_erec_cnt = line[2]
return client_tot_cnt, client_rec_cnt, client_erec_cnt
if __name__ == '__main__':
main()
If your file is not very large, you can convert the reader generator to a list of dictionaries by calling list() on the reader, and then use that list in your code instead of trying to read from the file directly.
Example -
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
with open(infile, 'rbU') as in_obj:
# Create reader object, get fieldnames for later on
reader, fieldnames = open_reader(in_obj)
reader_list = list(reader)
nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(reader_list)
print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
# This switches the dictionary to a sorted list... necessary??
reader_list = sorted(reader_list, key=lambda key: (key['PEOPLE_ID'],
key['DON_DATE']))
.
.
def get_recon_totals(reader_list):
print in_obj
client_tot_cnt = 0
client_rec_cnt = 0
client_erec_cnt = 0
for line in reader_list: #line here is a dict
if line[<fieldname for first column>] == 'T' and line[<fieldname for secondcolumn>] == 'Total Amount':
print 'Total Amount found.'
client_tot_cnt = line[<fieldname for third column>]
.
. #continued like above
.
return client_tot_cnt, client_rec_cnt, client_erec_cnt
I am doing text processing and using 'readline()' function as follows:
ifd = open(...)
for line in ifd:
while (condition)
do something...
line = ifd.readline()
condition = ....
#Here when the condition becomes false I need to rewind the pointer so that the 'for' loop read the same line again.
Calling ifd.seek() followed by readline() gives me a '\n' character. How can I rewind the pointer so that the whole line is read again?
>>> ifd.seek(-1,1)
>>> line = ifd.readline()
>>> line
'\n'
Here is my code
# Question listing (Python 2; indentation was lost in extraction): tries to
# turn one-lab-test-per-line input into one output row per
# (items[0], items[1], items[3]) group, with one column per lab test name.
labtestnames = sorted(tmp)
#Now read each line in the inFile and write into outFile
ifd = open(inFile, "r")
ofd = open(outFile, "w")
#read the header
header = ifd.readline() #Do nothing with this line. Skip
#Write header into the output file
nl = "mrn\tspecimen_id\tlab_number\tlogin_dt\tfluid"
# offset = number of fixed columns that precede the lab-test columns.
offset = len(nl.split("\t"))
nl = nl + "\t" + "\t".join(labtestnames)
ofd.write(nl+"\n")
lenFields = len(nl.split("\t"))
print "Reading the input file and converting into modified file for further processing (correlation analysis etc..)"
prevTup = (0,0,0)
rowComplete = 0
k=0
# NOTE(review): this loop iterates the file while also calling tell(),
# readline() and seek() on it -- the question text reports that this
# combination does not rewind as intended.
for line in ifd:
k=k+1
# Stop once k reaches 200.
if (k==200): break
items = line.rstrip("\n").split("\t")
if((items[0] =='')):
continue
# One empty cell per output column; the fixed columns are filled first.
newline= list('' for i in range(lenFields))
newline[0],newline[1],newline[3],newline[2],newline[4] = items[0], items[1], items[3], items[2], items[4]
ltests = []
ltvals = []
while(cmp(prevTup, (items[0], items[1], items[3])) == 0): # If the same mrn, lab_number and specimen_id then fill the same row. else create a new row.
ltests.append(items[6])
ltvals.append(items[7])
pos = ifd.tell()
line = ifd.readline()
prevTup = (items[0], items[1], items[3])
items = line.rstrip("\n").split("\t")
rowComplete = 1
if (rowComplete == 1): #If the row is completed, prepare newline and write into outfile
indices = [labtestnames.index(x) for x in ltests]
j=0
ifd.seek(pos)
for i in indices:
newline[i+offset] = ltvals[j]
j=j+1
if (rowComplete == 0): #
currTup = (items[0], items[1], items[3])
ltests = items[6]
ltvals = items[7]
pos = ifd.tell()
line = ifd.readline()
items = line.rstrip("\n").split("\t")
newTup = (items[0], items[1], items[3])
if(cmp(currTup, newTup) == 0):
prevTup = currTup
ifd.seek(pos)
continue
else:
indices = labtestnames.index(ltests)
newline[indices+offset] = ltvals
# NOTE(review): newline is a list here, so newline+"\n" raises TypeError.
ofd.write(newline+"\n")
The problem can be handled more simply using itertools.groupby. groupby can cluster all the contiguous lines that deal with the same mrn, specimen_id, and lab_num.
The code that does this is
for key, group in IT.groupby(reader, key = mykey):
where reader iterates over the lines of the input file, and mykey is defined by
def mykey(row):
return (row['mrn'], row['specimen_id'], row['lab_num'])
Each row from reader is passed to mykey, and all rows with the same key are clustered together in the same group.
While we're at it, we might as well use the csv module to read each line into a dict (which I call row). This frees us from having to deal with low-level string manipulation like line.rstrip("\n").split("\t") and instead of referring to columns by index numbers (e.g. row[3]) we can write code that speaks in higher-level terms such as row['lab_num'].
import itertools as IT
import csv
inFile = 'curious.dat'
outFile = 'curious.out'

ID_FIELDS = ('mrn', 'specimen_id', 'date', 'lab_num')


def mykey(row):
    """Return the (mrn, specimen_id, lab_num) triple that identifies a group."""
    return (row['mrn'], row['specimen_id'], row['lab_num'])


fieldnames = 'mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate'.split()

with open(inFile, 'rb') as ifd:
    reader = csv.DictReader(ifd, delimiter = '\t')
    with open(outFile, 'wb') as ofd:
        writer = csv.DictWriter(
            ofd, fieldnames, delimiter = '\t', lineterminator = '\n', )
        writer.writeheader()
        # Consecutive rows with the same key collapse into one output row.
        for _, group in IT.groupby(reader, key = mykey):
            new = {}
            for row in group:
                if not new:
                    # The identifying columns come from the group's first row.
                    for field in ID_FIELDS:
                        new[field] = row[field]
                # Every row contributes one labtest -> result_val column.
                new[row['labtest']] = row['result_val']
            writer.writerow(new)
yields
mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate
4419529 1614487 26.2675 5802791G 0.1
3319529 1614487 26.2675 5802791G 0.3 153 8.1 2.1 4
5713871 682571 56.0779 9732266E 4.1
This seems to be a perfect use case for yield expressions. Consider the following example that prints lines from a file, repeating some of them at random:
def buflines(fp):
    """Yield items from ``fp``, allowing the consumer to push one back.

    Iterate the generator as usual. Calling ``send(item)`` on it makes that
    item the next value produced; the ``send`` call itself returns ``None``.
    A falsy value passed to ``send`` is ignored.

    Args:
        fp: Any iterable, typically an open text file.

    Yields:
        The next item from ``fp``, or the item most recently sent back.
    """
    lines = iter(fp)
    r = None
    while True:
        if not r:
            try:
                r = next(lines)
            except StopIteration:
                # End the generator cleanly: a StopIteration escaping a
                # generator body becomes RuntimeError under PEP 479.
                return
        r = yield r
        if r:
            # Answer the send() call; the pushed-back item is produced by
            # the next regular iteration step.
            yield None
# Usage example (Python 2 print statements; indentation was lost in extraction):
# prints every line of the file and pushes some lines back at random so they
# are printed a second time.
from random import randint
with open('filename') as fp:
buf = buflines(fp)
for line in buf:
print line
# randint(1, 100) > 80 holds for the values 81..100.
if randint(1, 100) > 80:
print 'ONCE AGAIN::'
# Send the line back so the next loop iteration receives it again.
buf.send(line)
Basically, if you want to process an item once again, you send it back to the generator. On the next iteration you will be reading the same item once again.