Problems porting code from Python 2.7 to 3.6 - python

I have a fragment of code which loads data from a .csv file.
It was written for Python 2.7, but it does not work in Python 3.6.
def load_new_data(self):
    full = list()
    with open(self.filename, 'rb') as csv_in:
        myreader2 = csv.reader(csv_in, delimiter=';')
        count = 0
        for row in myreader2:
            if count == 0:
                headers = row[1:]
                count += 1
            elif count == 1:
                count += 1
            else:
                current_row = row[1:-1]
                full.append(current_row)
                count += 1
    new_df = pd.DataFrame.from_records(full, columns=headers)
    new_df = new_df.iloc[1:, :80]
    self.fill_in_blanks(new_df)
    new_df = dp.remove_inc_variables(new_df, .1)
    print '\t Removing incomplete variables.'
    for i in new_df.columns:
        try:
            new_df.loc[:, i] = new_df.loc[:, i].astype(float)
        except:
            pass
    return new_df
The error I get is:

    212
    213     count = 0
--> 214     for row in myreader2:
    215         if count == 0:
    216             headers = row[1:]

Error: iterator should return strings, not bytes (did you open the file in text mode?)
I did try changing the 'rb' to 'r' and 'rt' and even deleting it, as other posts here suggest, but with no success...

Try this:
def load_new_data(self):
    full = list()
    with open(self.filename, 'r') as csv_in:
        myreader2 = csv.reader(csv_in, delimiter=';')
        count = 0
        for row in myreader2:
            if count == 0:
                headers = row[1:]
                count += 1
            elif count == 1:
                count += 1
            else:
                current_row = row[1:-1]
                full.append(current_row)
                count += 1
    new_df = pd.DataFrame.from_records(full, columns=headers)
    new_df = new_df.iloc[1:, :80]
    self.fill_in_blanks(new_df)
    new_df = dp.remove_inc_variables(new_df, .1)
    print('\t Removing incomplete variables.')
    for i in new_df.columns:
        try:
            new_df.loc[:, i] = new_df.loc[:, i].astype(float)
        except:
            pass
    return new_df
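Why this works: in Python 3 the csv module operates on text (str), not bytes, so the file must be opened in text mode; opening with 'rb' is exactly what triggers the "iterator should return strings, not bytes" error. The csv docs additionally recommend opening with newline='' so newlines embedded inside quoted fields are handled correctly. A minimal sketch of the recommended pattern (the filename is illustrative):

import csv

with open('data.csv', 'r', newline='') as csv_in:  # text mode; newline='' per the csv module docs
    for row in csv.reader(csv_in, delimiter=';'):
        print(row)  # each row is a list of str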

You could try codecs to open the file. Be careful about the file's encoding.
Sample:
import codecs

def load_new_data(self):
    with codecs.open(self.filename, 'rb', encoding="cp1251") as csv_in:  # replace cp1251 with your encoding!
        myreader2 = csv.reader(csv_in, delimiter=';')
        headers = next(myreader2)[1:]
        next(myreader2)
        full = [row[1:] for row in myreader2]
    new_df = pd.DataFrame.from_records(full, columns=headers)
    new_df = new_df.iloc[1:, :80]
    self.fill_in_blanks(new_df)
    new_df = dp.remove_inc_variables(new_df, .1)
    print('\t Removing incomplete variables.')
    for i in new_df.columns:
        try:
            new_df.loc[:, i] = new_df.loc[:, i].astype(float)
        except:
            pass
    return new_df
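On Python 3 you also don't strictly need codecs: the built-in open accepts an encoding argument directly, and combining it with newline='' keeps the csv module happy. A hedged sketch, assuming the file really is cp1251-encoded:

with open(self.filename, 'r', encoding='cp1251', newline='') as csv_in:  # encoding is an assumption; match your file
    myreader2 = csv.reader(csv_in, delimiter=';')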

Related

How to parse info from the log file by certain strings

I am a beginner. I have a log file like the one below for ~1000 cycle loops:
"CycleSTART#1
Temp=26C
Fan=3000
CycleSTART#2
Temp=27C
Fan=3200
.
.
.
."
My objective is to read the Temp & Fan values corresponding to the cycle count; basically, I want to put everything in a table. I tried with this simple program:
string1 = 'CycleSTART#'
string2 = 'Temp'
string3 = 'Fan'

import pandas as pd

filepath = "XXX<location of txt file>"
with open(filepath) as fp:
    line = fp.readline()
    cnt = 1
    while line:
        cnt += 1
        flag = 0
        index = 0
        for line in fp:
            if string1 in line:
                flag = 1
                break
        if flag == 1:
            lword = len(line)
            extracted_string1 = line[11:11+lword]
            for line in fp:
                if string2 in line:
                    flag = 2
                    break
        if flag == 2:
            lword = len(line)
            extracted_string2 = line[6:6+lword]
            for line in fp:
                if string3 in line:
                    flag = 3
                    break
        if flag == 3:
            lword = len(line)
            extracted_string3 = line[5:5+lword]
            data = {'cycle': [extracted_string1],
                    'temp': [extracted_string2],
                    'Fan': [extracted_string3]}
            df = pd.DataFrame(data, columns=['cycle', 'temp', 'Fan'])
            print(df)
fp.close()
I tried this, but every time I get only the first cycle's values; it's not looping on to the next cycles.
I would rewrite it a little and use splits to stay away from regex.
import pandas as pd

def add_to_dict(value, key, dic):
    # Append value to the list stored under key, creating the list on first use
    if dic.get(key, False):
        dic[key].append(value)
    else:
        dic[key] = [value]

filepath = "XXX<location of txt file>"
data_container = {}
with open(filepath, "r") as f:
    for indx, line in enumerate(f):
        if indx % 3 == 0:
            value = int(line.split("#")[-1])  # Split on '#' and convert the cycle number to int
            key = "cycle"
        elif indx % 3 == 1:
            value = float(line.strip().split("=")[-1][:-1])  # Split on '=', strip the newline, drop the trailing 'C'
            key = "temp"
        elif indx % 3 == 2:
            value = int(line.split("=")[-1])  # Same idea: split on '=' to get the fan speed and convert to int
            key = "Fan"
        add_to_dict(value, key, data_container)
dataframe = pd.DataFrame.from_dict(data_container)
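For the sample log above, this sketch assumes the file repeats in strict groups of three lines (CycleSTART, Temp, Fan) with nothing in between; under that assumption data_container comes out as

{'cycle': [1, 2], 'temp': [26.0, 27.0], 'Fan': [3000, 3200]}

and pd.DataFrame.from_dict turns it into one row per cycle. If the real log has blank or extra lines, key off the line contents (e.g. line.startswith('Temp')) rather than the line index.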

How to iterate over the rows from 2 files, compare the values and update the value in a file when the condition is met?

For changing the values from 10 to 18, 19 or 20, I am splitting the string, accessing the substrings and then trying to change them. It runs, but it just doesn't change the values. Here is the solution I am trying to implement:
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'

readme = open(oldFileName, "r")
oldLines = readme.readlines()
readme = open(topoFileName, "r")
Lines = readme.readlines()
readme.close()

newFile = open(newFileName, "w")
for row in oldLines:
    for line in Lines:
        tmp = line.split()
        list = row.rstrip()
        tmp1 = list.split()
        newFile.write(row)
        if row.find("BEG_ELEM_DATA") > -1:
            if tmp[0] == tmp1[0]:
                if tmp[2] == 1 and tmp[3] == 0:
                    # it is magnet, value 18
                    newFile.write(tmp1.replace(tmp1[1], "18"))
                elif tmp[2] == 1 and tmp[3] == 1:
                    # it is iron, value 19
                    newFile.write(tmp1.replace(tmp1[1], "19"))
                else:
                    # it is air, value 20
                    newFile.write(tmp1.replace(tmp1[1], "20"))
newFile.close()
I would really appreciate it if you could solve this problem in the above script; then I guess it should work.
I'm also still a beginner in Python, but I tried to solve your problem, and here is my solution:
I guess there are better ways to do it, because here you have to import all the data into a dataframe before comparing it.
Also, I don't know whether you can read your data into a dataframe with pd.read_csv, because I don't know the *.hmo and *.topo formats.
import pandas as pd

df = pd.read_csv('tryout.csv', delimiter=';')
df2 = pd.read_csv('density.csv', delimiter=';')
for idx, row in df.iterrows():
    for idx2, row2 in df2.iterrows():
        if row[0] == row2[0]:
            if row2[2] == 1 and row2[3] == 0:
                # it is magnet, value 18
                row[1] = 18
            elif row2[2] == 1 and row2[3] == 1:
                # it is iron, value 19
                row[1] = 19
            else:
                # it is air, value 20
                row[1] = 20
df.to_csv('new_tryout.csv')
What my code does here: it loads both files into dataframes, then iterates over every line to find rows where the ID in both files is the same (e.g. 3749).
If they match, the 3 if statements decide whether it is magnet/iron/air and change the value in df to the right number.
At the end the new df is saved to a new file, 'new_tryout.csv'.
I created 2 test files for it, and it worked the way it should.
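One caveat, hedged because it depends on your pandas version: the row yielded by iterrows is generally a copy, so assignments like row[1] = 18 may not write back into df. If the values don't stick, write through the dataframe itself; a sketch of the magnet branch done that way:

for idx, row in df.iterrows():
    for idx2, row2 in df2.iterrows():
        if row.iloc[0] == row2.iloc[0]:
            if row2.iloc[2] == 1 and row2.iloc[3] == 0:
                df.at[idx, df.columns[1]] = 18  # assign into df via the row label, not into the copy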
Finally, here is the solution you were searching for.
import pandas as pd

df2 = pd.read_csv('Density.topo', header=0, names=list('ABCD'), delimiter=r'\s+', skiprows=1)
df2[['C', 'D']] = df2[['C', 'D']].round()

new_file_content = ''
with open('tryout.hmo', 'r') as f:
    for line in f:
        if line[11:13] == '10':
            if line[3].isspace():
                ID_to_search_for = line[4:8]  # number with 4 digits
            else:
                ID_to_search_for = line[3:8]  # number with 5 digits
            search_idx = df2[df2['A'] == ID_to_search_for].index[0]
            if df2['C'][search_idx] == 1 and df2['D'][search_idx] == 0:
                change = '18'  # magnet
                new_line = line[:11] + change + line[13:]
            elif df2['C'][search_idx] == 1 and df2['D'][search_idx] == 1:
                change = '19'  # iron
                new_line = line[:11] + change + line[13:]
            else:
                change = '20'  # air
                new_line = line[:11] + change + line[13:]
            new_file_content += new_line
        else:
            new_file_content += line
with open('tryout_changed.hmo', 'w') as f:
    f.write(new_file_content)
If you don't want to use dataframes, you can do it like this:
with open('density.topo') as f:
    lists_of_list = [line.rstrip().split() for line in f]

new_file_content = ''
with open('tryout_test.hmo', 'r') as f:
    for line in f:
        if line[11:13] == '10':
            if line[3].isspace():
                ID_to_search_for = line[4:8]  # number with 4 digits
            else:
                ID_to_search_for = line[3:8]  # number with 5 digits
            for idx, sublist in enumerate(lists_of_list):
                if sublist[0] == ID_to_search_for:
                    if lists_of_list[idx][2] == 1 and lists_of_list[idx][3] == 0:
                        change = '18'  # magnet
                        new_line = line[:11] + change + line[13:]
                    elif lists_of_list[idx][2] == 1 and lists_of_list[idx][3] == 1:
                        change = '19'  # iron
                        new_line = line[:11] + change + line[13:]
                    else:
                        change = '20'  # air
                        new_line = line[:11] + change + line[13:]
                    new_file_content += new_line
        else:
            new_file_content += line
with open('tryout_changed.hmo', 'w') as f:
    f.write(new_file_content)
OK, here is my final answer. It does (again) all the things you were searching for. Please debug the code in your IDE if there is a problem. You should start using context managers instead of opening and closing files step by step.
I wrote the new code around the code from your question and added some comments to it.
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'

readme = open(oldFileName, "r")
oldLines = readme.readlines()
m = int(oldLines[3])
print(m)
new_m = m + 3
m1 = str(m)
new_m1 = str(new_m)

Phrase = "END_COMP_DATA"
#n = "Phrase not found" #not used --> not needed
with open(oldFileName, "r") as oldFile:
    for number, lin in enumerate(oldFile):
        if Phrase in lin:
            n = number

# insert 3 lines into tryout_new at the right position (--> row n)
magnet = f' {m+1} " topo_magnet"\n'
iron = f' {m+2} " topo_iron"\n'
air = f' {m+3} " topo_air"\n'
oldLines[n:n] = [magnet, iron, air]

newFile = open(newFileName, "w")
flag = 0
with open('density.topo') as f:
    data_density = [line.rstrip().split() for line in f]

for idx, row in enumerate(oldLines):
    lst = row.rstrip()  # you shouldn't name a variable after a builtin like 'list'; use 'lst' or something like that
    tmp_tryout = lst.split()
    if row.find("BEG_ELEM_DATA") > -1:
        flag = 1
    if flag == 1 and len(tmp_tryout) > 1:
        # if the row has more than one column (after split), check for the "10"
        if tmp_tryout[1] == '10':
            # density_idx_line searches density.topo for a match with tmp_tryout[0] (e.g. 3749) and stores the whole line
            density_idx_line = list(filter(lambda x: x[0] == tmp_tryout[0], data_density))
            if len(density_idx_line) > 0:
                if density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1e-05':
                    # ' 10 ' is the 10 with whitespace before and after it, so only the standalone 10 gets replaced (and not e.g. 3104 to 3184)
                    newFile.write(row.replace(' 10 ', ' 18 '))
                elif density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1.0':
                    newFile.write(row.replace(' 10 ', ' 19 '))
                else:
                    newFile.write(row.replace(' 10 ', ' 20 '))
        else:
            newFile.write(row)
    else:
        if idx == 3:
            newFile.write(row.replace(m1, new_m1))
        else:
            newFile.write(row)
newFile.close()
print("script terminated successfully!")
OK, here is another solution. For anybody else who reads this: it is still only a temporary solution, but @Sagar and I both don't know how to do it better.
import numpy as np
import pandas as pd

df = pd.read_csv('tryout.hmo', header=0, names=list('ABCDEFGHIJKLM'), delimiter=r'\s+', skiprows=[i for i in range(52362)])
df2 = pd.read_csv('Density.topo', header=0, names=list('ANOP'), delimiter=r'\s+', skiprows=1)
df2 = df2.iloc[:-3, :]
df3 = df.merge(df2, how='outer', on='A')
df3[['O', 'P']] = df3[['O', 'P']].fillna(-1).astype(int).replace(-1, np.nan)
df3['B'] = df3.apply(lambda x: 18 if x['B'] == 10 and x['O'] == 1 and x['P'] == 0 else (
                     19 if x['B'] == 10 and x['O'] == 1 and x['P'] == 1 else (
                     20 if x['B'] == 10 and x['O'] == 0 and x['P'] == 0 else x['B'])), axis=1)
df3.to_csv('new_tryout.csv')
It finished in less than a second, so it is far better than iterrows or itertuples.
The new csv file includes both the tryout file and the density file; they are merged on the first column of the tryout file (the ID, I guess).
I didn't check all of this very big file, but from the few random points I checked, it seems this way works.
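If the apply still turns out slow on even bigger files, the same three-way mapping can be expressed with numpy.select, which keeps the row logic vectorized; a hedged sketch using the same column names as above:

import numpy as np

conditions = [
    (df3['B'] == 10) & (df3['O'] == 1) & (df3['P'] == 0),  # magnet
    (df3['B'] == 10) & (df3['O'] == 1) & (df3['P'] == 1),  # iron
    (df3['B'] == 10) & (df3['O'] == 0) & (df3['P'] == 0),  # air
]
df3['B'] = np.select(conditions, [18, 19, 20], default=df3['B'])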

Count Not Counting Properly

I've scraped a bunch of words from the dictionary, and created a massive CSV file with all of them, one word per row.
I have another function, which reads from that massive CSV file, and then creates smaller CSV files.
The function is supposed to create CSV files with only 500 words/rows, but something is amiss. The first file has 501 words/rows. The rest of the files have 502 words/rows.
Man, maybe I'm tired, but I can't seem to spot what exactly is causing this in my code. Or is there nothing wrong with my code at all?
Below is the part of the function that I'm assuming is causing the problem. The full function can be seen below that.
Suspect Part of Function
def create_csv_files():
    limit = 500
    count = 0
    filecount = 1
    zfill = 3
    filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format('1'.zfill(zfill))
    with open('C:\\Users\\Anthony\\Desktop\\Scrape\\Results\\dictionary.csv') as readfile:
        csvReader = csv.reader(readfile)
        for row in csvReader:
            term = row[0]
            if ' ' in term:
                term = term.replace(' ', '')
            if count <= limit:
                count += 1
            else:
                count = 0
                filecount += 1
                filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format(str(filecount).zfill(zfill))
            aw = 'a' if os.path.exists(filename) else 'w'
            with open(filename, aw, newline='') as writefile:
                fieldnames = ['term']
                writer = csv.DictWriter(writefile, fieldnames=fieldnames)
                writer.writerow({
                    'term': term
                })
The Whole Function
def create_csv_files():
    limit = 500
    count = 0
    filecount = 1
    zfill = 3
    idiomsfilename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\idioms.csv'
    filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format('1'.zfill(zfill))
    with open('C:\\Users\\Anthony\\Desktop\\Scrape\\Results\\dictionary.csv') as readfile:
        csvReader = csv.reader(readfile)
        for row in csvReader:
            term = row[0]
            if 'idiom' in row[0] and row[0] != ' idiom':
                term = row[0][:-5]
                aw = 'a' if os.path.exists(idiomsfilename) else 'w'
                with open(idiomsfilename, aw, newline='') as idiomsfile:
                    idiomsfieldnames = ['idiom']
                    idiomswriter = csv.DictWriter(idiomsfile, fieldnames=idiomsfieldnames)
                    idiomswriter.writerow({
                        'idiom': term
                    })
                continue
            else:
                if ' ' in term:
                    term = term.replace(' ', '')
                if count <= limit:
                    count += 1
                else:
                    count = 0
                    filecount += 1
                    filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format(str(filecount).zfill(zfill))
                aw = 'a' if os.path.exists(filename) else 'w'
                with open(filename, aw, newline='') as writefile:
                    fieldnames = ['term']
                    writer = csv.DictWriter(writefile, fieldnames=fieldnames)
                    writer.writerow({
                        'term': term
                    })
                print(term)
So the reason the files have a weird number of rows is your if-else conditions.
You increment count when count is less than or equal to limit. On the very first iteration you increment to 1, then write your first term, and so on. Because you use <= instead of the strict inequality, you still increment at count = 500 and write the 501st word.
From the second file onwards, the first word is written at count = 0, and the rollover happens again at count = 501, so you write 502 words this time.
To fix this, check for count >= limit and create a new file if so. Increment count after you write to the CSV file, not before. That should help.
def create_csv_files():
    limit = 500
    count = 0
    filecount = 1
    zfill = 3
    filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format('1'.zfill(zfill))
    with open('C:\\Users\\Anthony\\Desktop\\Scrape\\Results\\dictionary.csv') as readfile:
        csvReader = csv.reader(readfile)
        for row in csvReader:
            term = row[0]
            if ' ' in term:
                term = term.replace(' ', '')
            # Remove the if and keep the else
            if count >= limit:
                count = 0
                filecount += 1
                filename = 'C:\\Users\\Anthony\\Desktop\\Scrape\\Dictionary\\terms{}.csv'.format(str(filecount).zfill(zfill))
            aw = 'a' if os.path.exists(filename) else 'w'
            with open(filename, aw, newline='') as writefile:
                fieldnames = ['term']
                writer = csv.DictWriter(writefile, fieldnames=fieldnames)
                writer.writerow({
                    'term': term
                })
            count += 1  # Increment here
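A hedged alternative sketch that sidesteps the counter bookkeeping entirely: derive the file number from the row index, so each chunk of 500 rows is computed rather than tracked. The paths are shortened here for illustration:

import csv
import os

def create_csv_files(infile='dictionary.csv', outdir='.', limit=500):
    with open(infile, newline='') as readfile:
        for index, row in enumerate(csv.reader(readfile)):
            term = row[0].replace(' ', '')
            filecount = index // limit + 1  # rows 0-499 -> terms001, 500-999 -> terms002, ...
            filename = os.path.join(outdir, 'terms{}.csv'.format(str(filecount).zfill(3)))
            aw = 'a' if os.path.exists(filename) else 'w'
            with open(filename, aw, newline='') as writefile:
                csv.writer(writefile).writerow([term])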

Python: Average Price per Year

Would anyone be able to help me with the below? I'm trying to create a program that can open the "notepad.txt" file and calculate the average price for the month of October.
notepad.txt
10-15-2012:3.886
10-22-2012:3.756
10-29-2012:3.638
infile = open('notepad.txt', 'r')

def clean_data():
    line1 = infile.readline()
    split1 = line1.rstrip('\n')
    items = split1[0].split('-')
    del items[0]
    del items[0]
    master = []
    master = master + split1 + items
    master = list(map(float, master))
    print(master)
    print(total)
    line1 = infile.readline()

clean_data()
This prints and returns the average:
def clean_data(infile):
    lines = infile.readlines()
    total = 0.0
    num = 0
    for line in lines:
        spl = line.strip().split(":")
        total += float(spl[len(spl)-1])
        num += 1
    average = total/num
    print(average)
    return average
def sum_data():
    n, c = 0, 0
    with open('notepad.txt', 'r') as infile:
        for x in infile:
            # for October 2012
            if x[:3] == '10-' and x[6:10] == '2012':
                n += float(x[11:])
                c += 1
    print(n/c)
If you want to use Pandas:
from io import StringIO
import pandas as pd

notepadtxt = StringIO("""10-15-2012:3.886
10-22-2012:3.756
10-29-2012:3.638""")

df = pd.read_csv(notepadtxt, sep='\:', header=None, engine='python')
df[0] = pd.to_datetime(df[0])
df = df.set_index(0)
df.resample('M').mean().values[0][0]
Output:
3.7600000000000002
The following vanilla Python code should suffice:
infile = open('notepad.txt', 'r')

def clean_data():
    data = []
    for line in infile:
        data.append(line.strip().split(':'))
    values = []
    for value in data:
        values.append(float(value[1]))
    avg_price = sum(values)/len(values)
    print(avg_price)

clean_data()
infile.close()
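Note the vanilla version above averages every line in the file; that only answers the question because the sample data happens to be all October. A hedged sketch that filters on the month before averaging, assuming the MM-DD-YYYY:price layout shown above:

def average_for_month(path, month='10'):
    values = []
    with open(path) as infile:
        for line in infile:
            date, price = line.strip().split(':')
            if date.split('-')[0] == month:  # keep only the requested month
                values.append(float(price))
    return sum(values) / len(values)

print(average_for_month('notepad.txt'))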

My function to extract totals is exhausting my input file for future reading

The client includes 3 rows at the bottom that contain totals for me to reconcile against in my program. The only problem is that my program exhausts the input file with readlines() before it can do anything else. Is there a way to keep the file from being exhausted during my get_recon_totals function call?
#!/usr/bin/env python
# pre_process.py
import csv
import sys

def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    with open(infile, 'rbU') as in_obj:
        # Create reader object, get fieldnames for later on
        reader, fieldnames = open_reader(in_obj)
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
        print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
        # This switches the dictionary to a sorted list... necessary??
        reader_list = sorted(reader, key=lambda key: (key['PEOPLE_ID'],
                                                      key['DON_DATE']))
        # Create a list to contain section header information
        header_list = create_header_list(reader_list)
        # Create dictionary that contains header list as the key,
        # then all rows that match as a list of dictionaries.
        master_dict = map_data(header_list, reader_list)
        # Write data to processed file, create recon counts to compare
        # to footer record
        tot_cnt, rec_cnt, erec_cnt = write_data(master_dict, outfile, fieldnames)
        print tot_cnt, rec_cnt, erec_cnt

def open_reader(file_obj):
    '''
    Uses DictReader from the csv module to take the first header line
    as the fieldnames, then applies them to each element in the file.
    Returns the DictReader object and the fieldnames being used (used
    later when data is printed out with DictWriter.)
    '''
    reader = csv.DictReader(file_obj, delimiter=',')
    return reader, reader.fieldnames

def create_header_list(in_obj):
    p_id_list = []
    for row in in_obj:
        if (row['PEOPLE_ID'], row['DON_DATE']) not in p_id_list:
            p_id_list.append((row['PEOPLE_ID'], row['DON_DATE']))
    return p_id_list

def map_data(header_list, data_obj):
    master_dict = {}
    client_section_list = []
    for element in header_list:
        for row in data_obj:
            if (row['PEOPLE_ID'], row['DON_DATE']) == element:
                client_section_list.append(row)
        element = list(element)
        element_list = [client_section_list[0]['DEDUCT_AMT'],
                        client_section_list[0]['ND_AMT'],
                        client_section_list[0]['DEDUCT_YTD'],
                        client_section_list[0]['NONDEDUCT_YTD']
                        ]
        try:
            element_list.append((float(client_section_list[0]['DEDUCT_YTD']) +
                                 float(client_section_list[0]['NONDEDUCT_YTD'])
                                 ))
        except ValueError:
            pass
        element.extend(element_list)
        element = tuple(element)
        master_dict[element] = client_section_list
        client_section_list = []
    return master_dict

def write_data(in_obj, outfile, in_fieldnames):
    with open(outfile, 'wb') as writer_outfile:
        writer = csv.writer(writer_outfile, delimiter=',')
        dict_writer = csv.DictWriter(writer_outfile,
                                     fieldnames=in_fieldnames,
                                     extrasaction='ignore')
        tot_cnt = 0
        rec_cnt = 0
        email_cnt = 0
        for k, v in in_obj.iteritems():
            writer_outfile.write(' -01- ')
            writer.writerow(k)
            rec_cnt += 1
            for i, e in enumerate(v):
                if v[i]['INT_CODE_EX0006'] != '' or v[i]['INT_CODE_EX0028'] != '':
                    email_cnt += 1
                writer_outfile.write(' -02- ')
                dict_writer.writerow(e)
                tot_cnt += 1
    return tot_cnt, rec_cnt, email_cnt

def get_recon_totals(in_obj):
    print in_obj
    client_tot_cnt = 0
    client_rec_cnt = 0
    client_erec_cnt = 0
    for line in in_obj.readlines():
        line = line.split(',')
        if line[0] == 'T' and line[1] == 'Total Amount':
            print 'Total Amount found.'
            client_tot_cnt = line[2]
        elif line[0] == 'T' and line[1] == 'Receipt Count':
            print 'Receipt Count found.'
            client_rec_cnt = line[2]
        elif line[0] == 'T' and line[1] == 'Email Receipt Count':
            print 'E-Receipt Count Found.'
            client_erec_cnt = line[2]
    return client_tot_cnt, client_rec_cnt, client_erec_cnt

if __name__ == '__main__':
    main()
If your file is not very large, you can convert the reader generator to a list of dictionaries by calling list() on the reader, and then use that in your code instead of trying to read from the file directly.
Example:
def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    with open(infile, 'rbU') as in_obj:
        # Create reader object, get fieldnames for later on
        reader, fieldnames = open_reader(in_obj)
        reader_list = list(reader)
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(reader_list)
        print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
        # This switches the dictionary to a sorted list... necessary??
        reader_list = sorted(reader_list, key=lambda key: (key['PEOPLE_ID'],
                                                           key['DON_DATE']))
        .
        .

def get_recon_totals(reader_list):
    client_tot_cnt = 0
    client_rec_cnt = 0
    client_erec_cnt = 0
    for line in reader_list:  # line here is a dict
        if line[<fieldname for first column>] == 'T' and line[<fieldname for second column>] == 'Total Amount':
            print 'Total Amount found.'
            client_tot_cnt = line[<fieldname for third column>]
        .
        .  # continued like above
        .
    return client_tot_cnt, client_rec_cnt, client_erec_cnt
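If the file is too large to hold every row in memory, another hedged option is to rewind the underlying file object between passes instead of materializing the reader: read the totals first, then call in_obj.seek(0) before building the DictReader, so the second pass starts from the top. A minimal sketch reusing the functions from the question:

def main():
    infile = sys.argv[1]
    with open(infile, 'rbU') as in_obj:
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
        in_obj.seek(0)  # rewind so the DictReader sees the file from the start
        reader, fieldnames = open_reader(in_obj)
        .
        .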
