Having trouble reading CSV files with rows of different column counts - Python

I am having trouble reading through a file where the rows have different lengths. Specifically, I know that the file is 13 rows long and that rows 1 and 13 have 2 values in them where the rest (2-12) have 4. I want to get one value from row 1 and one value from row 13, and one value from each of rows 2-12 depending on whether or not their preceding value is equal to "credit" or "debit". Since the rows have different lengths I get 'index out of range' errors. Any help would be greatly appreciated. Thanks!
import csv

class Checkbook:
    """Checkbook class for a list of check transactions"""
    def __init__(self, filename):
        """initializer for Checkbook class"""
        self.name = filename
        self.debitList = []
        self.creditList = []
        self.startAmt = 0
        self.endAmt = 0
        self.shouldBeBal = 0
        with open(filename) as csvFile:
            readCSV = csv.reader(csvFile, delimiter=',')
            #rowCount = sum(1 for row in readCSV) - 1
            #print(rowCount)
            next(csvFile)
            #in range(1, rowCount, 1):
            for row in readCSV:
                if row[2] == " debit":
                    debitAmt = row[3]
                    self.debitList.append(debitAmt)
                elif row[2] == " credit":
                    creditAmt = row[3]
                    self.creditList.append(creditAmt)

Well, you have to either avoid the IndexError
for row in readCSV:
    if len(row) > 2:  # make sure the row is long enough
        if row[2] == " debit":  # now this can't fail
            ...  # handle debit
        elif row[2] == " credit":
            ...  # handle credit
or handle it:
for row in readCSV:
    try:
        if row[2] == " debit":
            ...  # handle debit
        elif row[2] == " credit":
            ...  # handle credit
    except IndexError:
        pass  # row was too short - do nothing
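Putting the guard to work on the layout you describe (13 rows; rows 1 and 13 hold 2 values, rows 2-12 hold 4), a minimal sketch of the initializer could look like the following. Which fields hold the start and end amounts is an assumption here - adjust the indexes to your file:

import csv

class Checkbook:
    """Checkbook class for a list of check transactions"""
    def __init__(self, filename):
        self.name = filename
        self.debitList = []
        self.creditList = []
        with open(filename) as csvFile:
            rows = list(csv.reader(csvFile, delimiter=','))
        # rows 1 and 13 have only 2 values; assume the amount is the 2nd field
        self.startAmt = rows[0][1]
        self.endAmt = rows[-1][1]
        # rows 2-12 have 4 values: keyword in column 3, amount in column 4
        for row in rows[1:-1]:
            if len(row) > 3:  # guard: skip any row that is too short
                if row[2].strip() == "debit":
                    self.debitList.append(row[3])
                elif row[2].strip() == "credit":
                    self.creditList.append(row[3])

Reading everything into a list first makes the first and last rows easy to address, and .strip() keeps the comparison robust against the leading space in " debit".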

Related

Getting a total for a value in a csv file row that is text

I am trying to get the total number of Mondays in my csv file. My current code will return all the Mondays, but I need it to return 1972. I am at a loss. I was trying it with SearchCursor, but that was a nightmare. I am new to programming in Python, so I am looking for your individual wisdom. Thank you for your time; the code is below.
Csv_file_data: I am trying to just get the total number of Mondays out of this csv.
import csv

with open(r"C:\users\david\OneDrive\Documents\ArcGIS\Projects\MyProject1\Burglaries_TableToExcel.csv", 'r') as monday:
    reader = csv.reader(monday, delimiter=",")
    title = next(reader)[16]
    found_section = False
    header = None
    DayOfWeek_index = None
    DayOfWeek_sum = 'Monday'
    for row in reader:
        if not found_section:
            if len(row) > 0:
                if row[16] == "DayOfWeek":
                    header = next(reader)
                    DayOfWeek_index = header_index("Monday")
                    found_section = True
        else:
            if len(row) > 0:
                DayOfWeek_sum += float(row[DayOfWeek_index])
            else:
                break
    print(DayOfWeek_sum)
An example, not tested as I was not going to hand transcribe the data from the image.
import csv

with open(r"C:\users\david\OneDrive\Documents\ArcGIS\Projects\MyProject1\Burglaries_TableToExcel.csv", 'r') as monday:
    mon_ct = 0
    csv_dict = csv.DictReader(monday)  # was cvs_dict; the loop below expects csv_dict
    for row in csv_dict:
        if row["DayOfWeek"] == "Monday":
            mon_ct += 1
print(mon_ct)  # the total number of Mondays
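If you want the counts for every weekday at once, not just Monday, a collections.Counter over the same column is a small extension (same file path and "DayOfWeek" column assumed):

import csv
from collections import Counter

path = r"C:\users\david\OneDrive\Documents\ArcGIS\Projects\MyProject1\Burglaries_TableToExcel.csv"
with open(path, 'r') as f:
    day_counts = Counter(row["DayOfWeek"] for row in csv.DictReader(f))

print(day_counts["Monday"])  # the total number of Mondays, e.g. 1972
print(day_counts)            # totals for all days of the week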

How to iterate over the rows from 2 files, compare the values and update the value in a file when the condition is met?

For changing the values from 10 to 18, 19 or 20, I am splitting the string, accessing the substrings and then trying to change them. It runs, but it just doesn't change the values. Here is the solution I am trying to implement:
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'

readme = open(oldFileName, "r")
oldLines = readme.readlines()
readme = open(topoFileName, "r")
Lines = readme.readlines()
readme.close()

newFile = open(newFileName, "w")
for row in oldLines:
    for line in Lines:
        tmp = line.split()
        list = row.rstrip()
        tmp1 = list.split()
        newFile.write(row)
        if row.find("BEG_ELEM_DATA") > -1:
            if tmp[0] == tmp1[0]:
                if tmp[2] == 1 and tmp[3] == 0:
                    # it is magnet, value 18
                    newFile.write(tmp1.replace(tmp1[1], "18"))
                elif tmp[2] == 1 and tmp[3] == 1:
                    # it is iron, value 19
                    newFile.write(tmp1.replace(tmp1[1], "19"))
                else:
                    # it is air, value 20
                    newFile.write(tmp1.replace(tmp1[1], "20"))
newFile.close()
I would really appreciate it if you were able to solve this problem in the above script; then I guess it should work.
I'm also still a beginner in Python, but I tried to solve your problem and here is my solution:
I guess there are way better ways to do it, because here you have to import all the data into a dataframe before comparing it.
Also, I don't know if you can read your data with pd.read_csv into a dataframe, because I don't know the *.hmo and *.topo formats.
import pandas as pd

df = pd.read_csv('tryout.csv', delimiter=';')
df2 = pd.read_csv('density.csv', delimiter=';')

for idx, row in df.iterrows():
    for idx2, row2 in df2.iterrows():
        if row[0] == row2[0]:
            # write back through df itself: mutating `row` only changes a copy
            # (assumes the default RangeIndex, so idx is also the position)
            if row2[2] == 1 and row2[3] == 0:
                df.iat[idx, 1] = 18  # it is magnet, value 18
            elif row2[2] == 1 and row2[3] == 1:
                df.iat[idx, 1] = 19  # it is iron, value 19
            else:
                df.iat[idx, 1] = 20  # it is air, value 20

df.to_csv('new_tryout.csv')
What my code does here: it loads both files into dataframes, then iterates over every line to find rows where the ID in both files is the same (e.g. 3749).
If one matches, the 3 if statements decide whether it is magnet/iron/air and set the value in df to the right number.
At the end the new df is saved to a new file, 'new_tryout.csv'.
I created 2 test files for it and it worked the way it should.
Finally, here is the solution you were searching for.
import pandas as pd

df2 = pd.read_csv('Density.topo', header=0, names=list('ABCD'), delimiter=r'\s+', skiprows=1)
df2[['C', 'D']] = df2[['C', 'D']].round()

new_file_content = ''
with open('tryout.hmo', 'r') as f:
    for line in f:
        if line[11:13] == '10':
            if line[3].isspace():
                ID_to_search_for = line[4:8]  # number with 4 digits
            else:
                ID_to_search_for = line[3:8]  # number with 5 digits
            search_idx = df2[df2['A'] == ID_to_search_for].index[0]
            if df2['C'][search_idx] == 1 and df2['D'][search_idx] == 0:
                change = '18'  # magnet
            elif df2['C'][search_idx] == 1 and df2['D'][search_idx] == 1:
                change = '19'  # iron
            else:
                change = '20'  # air
            new_line = line[:11] + change + line[13:]
            new_file_content += new_line
        else:
            new_file_content += line

with open('tryout_changed.hmo', 'w') as f:
    f.write(new_file_content)
If you don't want to use dataframes, you can do it like this:
with open('density.topo') as f:
    lists_of_list = [line.rstrip().split() for line in f]

new_file_content = ''
with open('tryout_test.hmo', 'r') as f:
    for line in f:
        if line[11:13] == '10':
            if line[3].isspace():
                ID_to_search_for = line[4:8]  # number with 4 digits
            else:
                ID_to_search_for = line[3:8]  # number with 5 digits
            for idx, sublist in enumerate(lists_of_list):
                if sublist[0] == ID_to_search_for:
                    # split() yields strings ('1.0', '1e-05', ...), so convert
                    # and round them, like the dataframe version does
                    c = round(float(sublist[2]))
                    d = round(float(sublist[3]))
                    if c == 1 and d == 0:
                        change = '18'  # magnet
                    elif c == 1 and d == 1:
                        change = '19'  # iron
                    else:
                        change = '20'  # air
                    new_line = line[:11] + change + line[13:]
                    new_file_content += new_line
        else:
            new_file_content += line

with open('tryout_changed.hmo', 'w') as f:
    f.write(new_file_content)
OK, here is my final answer. It does (again) all the things you were searching for. Please debug the code in your IDE if there is a problem. You should start using context managers instead of opening and closing files step by step.
I wrote the new code around the code from your question and added some comments to it.
oldFileName = 'tryout.hmo'
newFileName = 'tryout_NEW.hmo'
topoFileName = 'Density.topo'

with open(oldFileName, "r") as readme:
    oldLines = readme.readlines()

m = int(oldLines[3])
print(m)
new_m = m + 3
m1 = str(m)
new_m1 = str(new_m)

Phrase = "END_COMP_DATA"
with open(oldFileName, "r") as oldFile:
    for number, lin in enumerate(oldFile):
        if Phrase in lin:
            n = number

# insert 3 lines into tryout_NEW at the right position (--> row n)
magnet = f' {m+1} " topo_magnet"\n'
iron = f' {m+2} " topo_iron"\n'
air = f' {m+3} " topo_air"\n'
oldLines[n:n] = [magnet, iron, air]

with open('density.topo') as f:
    data_density = [line.rstrip().split() for line in f]

newFile = open(newFileName, "w")
flag = 0
for idx, row in enumerate(oldLines):
    lst = row.rstrip()  # don't shadow the built-in name 'list' in Python; use 'lst' or something like that
    tmp_tryout = lst.split()
    if row.find("BEG_ELEM_DATA") > -1:
        flag = 1
    if flag == 1 and len(tmp_tryout) > 1 and tmp_tryout[1] == '10':
        # search density.topo for a match with tmp_tryout[0] (e.g. 3749)
        # and store the whole line
        density_idx_line = list(filter(lambda x: x[0] == tmp_tryout[0], data_density))
        if len(density_idx_line) > 0:
            # ' 10 ' is the 10 with a whitespace before and after it; only this
            # way does just the 10 get replaced (and not e.g. 3104 to 3184)
            if density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1e-05':
                newFile.write(row.replace(' 10 ', ' 18 '))
            elif density_idx_line[0][2] == '1.0' and density_idx_line[0][3] == '1.0':
                newFile.write(row.replace(' 10 ', ' 19 '))
            else:
                newFile.write(row.replace(' 10 ', ' 20 '))
        else:
            newFile.write(row)
    elif idx == 3:
        # line 4 holds the count m: bump it by 3 for the inserted lines
        newFile.write(row.replace(m1, new_m1))
    else:
        newFile.write(row)
newFile.close()
print("script terminated successfully!")
OK, here is another solution. For anybody else who reads this: this is still only a temporary solution, but #Sagar and I both don't know how to do it better.
import numpy as np
import pandas as pd

df = pd.read_csv('tryout.hmo', header=0, names=list('ABCDEFGHIJKLM'), delimiter=r'\s+', skiprows=[i for i in range(52362)])
df2 = pd.read_csv('Density.topo', header=0, names=list('ANOP'), delimiter=r'\s+', skiprows=1)
df2 = df2.iloc[:-3, :]
df3 = df.merge(df2, how='outer', on='A')
df3[['O', 'P']] = df3[['O', 'P']].fillna(-1).astype(int).replace(-1, np.nan)
df3['B'] = df3.apply(lambda x: 18 if x['B'] == 10 and x['O'] == 1 and x['P'] == 0 else (
                               19 if x['B'] == 10 and x['O'] == 1 and x['P'] == 1 else (
                               20 if x['B'] == 10 and x['O'] == 0 and x['P'] == 0 else x['B'])), axis=1)
df3.to_csv('new_tryout.csv')
It finishes in less than a second, so it is far better than iterrows or itertuples.
The new csv file includes both the tryout file and the density file; they are merged together on the first column of the tryout file (the ID, I guess).
I didn't check all of this very big file, but from the few random points I checked, it seems to work.
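If the apply ever becomes the bottleneck on an even bigger file, the same three conditions can also be written with numpy.select, which evaluates them as vectorized boolean masks instead of calling a Python lambda once per row. A sketch against the merged df3 from above:

import numpy as np

conditions = [
    (df3['B'] == 10) & (df3['O'] == 1) & (df3['P'] == 0),  # magnet
    (df3['B'] == 10) & (df3['O'] == 1) & (df3['P'] == 1),  # iron
    (df3['B'] == 10) & (df3['O'] == 0) & (df3['P'] == 0),  # air
]
df3['B'] = np.select(conditions, [18, 19, 20], default=df3['B'])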

Python: read XML file (near 50 MB)

I'm parsing an XML string into a CSV string, but it's going very slowly:
import copy
import xml.etree.ElementTree as ET

INDEX_COLUMN = "{urn:schemas-microsoft-com:office:spreadsheet}Index"
CELL_ELEMENT = "Cell"
DATA_ELEMENT = "Data"

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    csv = []
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows[0])
    rows.pop(0)
    csv.append(",".join(header))
    for row in rows:
        values = get_cells_text(row)
        csv.append(",".join(values))
    return "\n".join(csv)

def serialize_xml(xml):
    return ET.fromstring(xml)

def get_cells_text(row):
    keys = []
    cells = normalize_row_cells(row)
    for elm in cells:
        keys.append(elm[0].text or "")
    while len(keys) < 92:
        keys.append("")
    return keys

def normalize_row_cells(row):
    cells = list(row)
    updated_cells = copy.deepcopy(cells)
    pos = 1
    for elm in cells:
        strIndexAttr = elm.get(INDEX_COLUMN)
        index = int(strIndexAttr) if strIndexAttr else pos
        while index > pos:
            empty_elm = ET.Element(CELL_ELEMENT)
            child = ET.SubElement(empty_elm, DATA_ELEMENT)
            child.text = ""
            updated_cells.insert(pos - 1, empty_elm)
            pos += 1
        pos += 1
    return updated_cells
The XML string sometimes misses a few columns and I need to iterate over it to fill in the missing columns - every row must have 92 columns. That's why I have some helper functions to manipulate the XML.
Right now I'm running my function as a 4 GB Lambda and still getting timeouts :(
Any idea how to improve performance?
The normalize_row_cells constructs ElementTree Element instances but get_cells_text is only interested in each instance's child's text attribute, so I would consider changing normalize_row_cells to just return the text. Also, it's performing copies and calling list.insert: inserting elements into the middle of lists can be expensive, because each element after the insertion point must be moved.
Something like this (untested code) avoids making copies and insertions and returns only the required text, making get_cells_text redundant.
def normalize_row_cells(row):
    cells = list(row)
    updated_cells = []
    cell_idx = 0  # next unconsumed cell of the source row
    for pos in range(1, 93):  # output columns 1..92
        if cell_idx < len(cells):
            elm = cells[cell_idx]
            strIndexAttr = elm.get(INDEX_COLUMN)
            index = int(strIndexAttr) if strIndexAttr else pos
            if index == pos:
                updated_cells.append(elm[0].text or "")
                cell_idx += 1
                continue
        updated_cells.append("")  # pad the missing column
    return updated_cells
If you can match your cells to their header names then using csv.DictWriter from the standard library might be even better (you need to profile to be sure).
import csv
import io

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows.pop(0))  # a dict; its keys become the fieldnames
    with io.StringIO() as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        for row in rows:
            row_dict = get_cells_text(row)
            writer.writerow(row_dict)
        f.seek(0)
        data = f.read()
    return data

def get_cells_text(row):
    row_dict = {}
    for cell in row:
        column_name = get_column_name(cell)  # <- can this be done?
        row_dict[column_name] = cell[0].text or ""
    return row_dict
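If building the whole tree turns out to be the real cost, one more option to profile: ET.fromstring materializes all ~50 MB as Element objects before the first row is read, while xml.etree.ElementTree.iterparse can stream the rows and release each one after processing. A rough, untested sketch - it assumes the Row elements carry the same spreadsheet namespace as INDEX_COLUMN; adjust the tag if your document names them differently:

import io
import xml.etree.ElementTree as ET

ROW_ELEMENT = "{urn:schemas-microsoft-com:office:spreadsheet}Row"  # assumed tag

def iter_rows(xml_string):
    # iterparse wants a file-like source, so feed the string as a byte stream
    source = io.BytesIO(xml_string.encode("utf-8"))
    for event, elem in ET.iterparse(source):  # default: 'end' events only
        if elem.tag == ROW_ELEMENT:
            yield elem    # run normalize_row_cells / get_cells_text on it here
            elem.clear()  # then drop the row's subtree to keep memory flat

Each yielded row can go through the same normalization as before, so peak memory stays at roughly one row instead of the whole workbook.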

Error uploading a date object along with a string to a SQL db

I have a date object that needs to be uploaded into a database from a CSV file. When I make a query to upload the row into the DB, I get this error:
Incorrect syntax near the keyword 'of'. (156) (SQLExecDirectW)
Code to upload data:
with open(UploadFile, "r") as uploadData:
    i = 0
    flag = 0
    formatter_string = "%d/%m/%y"
    for row in reader:
        if(flag == 0):
            flag = flag + 1
        else:
            datetime_object = datetime.strptime(row[0], formatter_string)
            row[0] = datetime_object.date()
            cursor.execute("insert into "+UploadTable+" values ("+row[0]+","+nullcheckstr(row[1])+","+nullcheckint(row[2])+","+nullcheckint(row[3])+","+nullcheckint(row[4])+","+nullcheckint(row[5])+","+nullcheckint(row[6])+","+nullcheckint(row[7])+","+nullcheckint(row[8])+")")
            print "insert into "+UploadTable+" values ("+str(row[0])+","+nullcheckstr(row[1])+","+nullcheckint(row[2])+","+nullcheckint(row[3])+","+nullcheckint(row[4])+","+nullcheckint(row[5])+","+nullcheckint(row[6])+","+nullcheckint(row[7])+","+nullcheckint(row[8])+")"
            i = i + 1
    print 'inserted ' + str(i) + ' rows'
    cnxn.commit()
row[0] is a date.
nullcheckint/nullcheckstr: check that the value is not null.
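For reference, a plausible shape for those helpers - the real ones aren't shown in the question, so treat these as hypothetical stand-ins that quote strings and emit NULL for empty fields:

def nullcheckstr(val):
    # quote a string value for the SQL text, or emit NULL when empty
    return "'" + val + "'" if val else "NULL"

def nullcheckint(val):
    # pass a numeric value through as text, or emit NULL when empty
    return val if val else "NULL"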
with open(UploadFile, "r") as uploadData:
    i = 0
    flag = 0
    formatter_string = "%d/%m/%y"
    d = []
    for row in reader:
        if(flag == 0):
            flag = flag + 1
        else:
            datetime_object = datetime.strptime(row[0], formatter_string)
            row[0] = datetime_object.date()
            temp = []
            for idx, val in enumerate(row):  # enumerate yields (index, value)
                if idx == 0:
                    temp.append(str(row[0]))
                elif idx == 1:
                    temp.append(nullcheckstr(val))
                else:
                    temp.append(nullcheckint(val))
            d.append(temp)
for row in d:
    # one ? placeholder per column (9 columns in the question)
    cursor.execute("insert into "+UploadTable+" values(?,?,?,?,?,?,?,?,?)", row)
print 'inserted rows'
cnxn.commit()
Give this code a try. It's a LOT cleaner and easier to debug. Without knowing what your input file looks like, I have to assume the bug was in your jumbled code.
A few bugs/inconsistencies:
- You never use uploadData
- You never declare reader
with open(UploadFile, "r") as uploadData:
    i = 0
    # Since this is just a flag, use True/False
    flag = False
    formatter_string = "%d/%m/%y"
    for row in reader:
        if not flag:
            flag = True
        else:
            datetime_object = datetime.strptime(row[0], formatter_string)
            # Format each section of the row
            row[0] = str(datetime_object.date())  # str, so ','.join below works
            row[1] = nullcheckstr(row[1])
            row[2:] = list(map(nullcheckint, row[2:]))
            # Use `str.format` to make this statement MUCH cleaner
            sql = "insert into {} values ({})".format(UploadTable, ','.join(row))
            cursor.execute(sql)
            i = i + 1
    print 'inserted {} rows'.format(str(i))
    cnxn.commit()
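Worth noting either way: the original "Incorrect syntax near the keyword 'of'" error comes from splicing raw values straight into the SQL text, so any value containing spaces or keywords breaks the statement. Parameter placeholders (which the first snippet above already uses) let the driver do the quoting; a minimal sketch of that variant:

# build one ? per column instead of concatenating values into the SQL
placeholders = ','.join('?' * len(row))
sql = "insert into {} values ({})".format(UploadTable, placeholders)
cursor.execute(sql, row)  # the driver quotes the date and strings itself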

Python : populate a new column with an if/else statement

I have a csv file composed of three columns.
My goal is to add a fourth column and populate it with a statement based on columns 2 and 3.
Here is the beginning of my code :
import csv, sys, locale, operator

abord = "/home/julien/csv/ABORD.csv"
file1 = open(abord, 'rb')
reader1 = csv.reader(file1, delimiter=';', quotechar=' ')
next(reader1)
for row1 in reader1:
    ID = row1[0]
    LARG_1 = row1[1]
    LARG_2 = row1[2]
And I want to do something like this:

if LARG_1 > 10 and LARG_2 < 20:
    print "result OK" in a fourth column "CONTROL"
else:
    print "result fail" in the fourth column "CONTROL"

then save the csv, now composed of 4 columns.
Do you know how I could do it? Thank you!
You have to write to another file (using a csv.writer):
import csv
import sys

sourcepath = "/home/julien/csv/ABORD.csv"
destpath = "/home/julien/csv/ABORD-fixed.csv"

with open(sourcepath, "rb") as source, open(destpath, "wb") as dest:
    # XXX are you sure you want this as quotechar ???
    reader = csv.reader(source, delimiter=';', quotechar=' ')
    writer = csv.writer(dest, delimiter=';', quotechar=' ')
    # first copy the (augmented) headers
    headers = reader.next()
    headers.append("CONTROL")
    writer.writerow(headers)
    # then let's loop on the content
    for rownum, row in enumerate(reader):
        # we need to convert the data to int
        # adding proper error handling here might help...
        # status = "result OK" if (int(row[1]) > 10 and int(row[2]) < 20) else "result fail"
        try:
            l1 = int(row[1])
            l2 = int(row[2])
        except (TypeError, ValueError), e:
            err = "non integer value for l1 and/or l2 in row %s line %s - got: %s" % (
                rownum, reader.line_num, row
            )
            print >> sys.stderr, err
            result = "invalid values"
        else:
            if l1 > 10 and l2 < 20:
                result = "result OK"
            elif rownum == 42:  # arbitrary value, just for the if/elif/else example
                result = "what's the question ?"
            else:
                result = "result fail"
        row.append(result)
        writer.writerow(row)
If needed you can then delete the source file and rename the new one.
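For that final swap, something like this would do (a sketch; os.rename fails on Windows if the target exists, hence the remove first):

import os

os.remove(sourcepath)            # drop the original file
os.rename(destpath, sourcepath)  # move the fixed file into its place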
