I have a CSV file composed of three columns.
My goal is to add a fourth column and populate it with a statement based on the values in columns 2 and 3.
Here is the beginning of my code:
import csv, sys, locale, operator

abord = "/home/julien/csv/ABORD.csv"
file1 = open(abord, 'rb')
reader1 = csv.reader(file1, delimiter=';', quotechar=' ')
next(reader1)  # skip the header row
for row1 in reader1:
    ID = row1[0]
    LARG_1 = row1[1]
    LARG_2 = row1[2]
And I want to do something like this:
if LARG_1 > 10 and LARG_2 < 20:
    print "result OK" in a fourth column "CONTROL"
else:
    print "result fail" in the fourth column "CONTROL"
Then save the CSV, now composed of 4 columns.
Do you know how I could do it? Thank you!
You have to write to another file (using a csv.writer):
sourcepath = "/home/julien/csv/ABORD.csv"
destpath = "/home/julien/csv/ABORD-fixed.csv"

with open(sourcepath, "rb") as source, open(destpath, "wb") as dest:
    # XXX are you sure you want this as quotechar ???
    reader = csv.reader(source, delimiter=';', quotechar=' ')
    writer = csv.writer(dest, delimiter=';', quotechar=' ')
    # first copy the (augmented) headers
    headers = reader.next()
    headers.append("CONTROL")
    writer.writerow(headers)
    # then let's loop on the content
    for rownum, row in enumerate(reader):
        # we need to convert the data to int -
        # adding proper error handling here might help...
        # one-liner alternative:
        # result = "result OK" if (int(row[1]) > 10 and int(row[2]) < 20) else "result fail"
        try:
            l1 = int(row[1])
            l2 = int(row[2])
        except (TypeError, ValueError), e:
            err = "non integer value for l1 and/or l2 in row %s line %s - got: %s" % (
                rownum, reader.line_num, row
            )
            print >> sys.stderr, err
            result = "invalid values"
        else:
            if l1 > 10 and l2 < 20:
                result = "result OK"
            elif rownum == 42:  # arbitrary value, just for the if/elif/else example
                result = "what's the question ?"
            else:
                result = "result fail"
        row.append(result)
        writer.writerow(row)
If needed, you can then delete the source file and rename the new one.
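For instance, a minimal sketch of that last step, assuming the sourcepath and destpath variables from above:

import os

os.remove(sourcepath)            # drop the original file
os.rename(destpath, sourcepath)  # move the fixed file into its place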
Related
I am having trouble reading through a file where the rows have different lengths. Specifically, I know the file is 13 rows long, and that rows 1 and 13 have 2 values each while the rest (2-12) have 4. I want to get one value from row 1, one value from row 13, and one value from each of rows 2-12 depending on whether or not the preceding value is equal to "credit" or "debit". Since the rows have different lengths, I get 'index out of range' errors. Any help would be greatly appreciated. Thanks!
import csv

class Checkbook:
    """Checkbook class for a list of check transactions"""

    def __init__(self, filename):
        """initializer for Checkbook class"""
        self.name = filename
        self.debitList = []
        self.creditList = []
        self.startAmt = 0
        self.endAmt = 0
        self.shouldBeBal = 0
        with open(filename) as csvFile:
            readCSV = csv.reader(csvFile, delimiter=',')
            next(csvFile)  # skip the first line
            for row in readCSV:
                if row[2] == " debit":
                    debitAmt = row[3]
                    self.debitList.append(debitAmt)
                elif row[2] == " credit":
                    creditAmt = row[3]
                    self.creditList.append(creditAmt)
Well, you have to either avoid the IndexError:
for row in readCSV:
    if len(row) > 2:  # make sure the row is long enough
        if row[2] == " debit":  # now this can't fail
            # ...
        elif row[2] == " credit":
            # ...
or handle it:
for row in readCSV:
    try:
        if row[2] == " debit":
            # ...
        elif row[2] == " credit":
            # ...
    except IndexError:
        pass  # do nothing
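Applied to the Checkbook case, a minimal standalone sketch of the first approach might look like this (assuming the same " debit"/" credit" markers and column layout as in the question):

import csv

def load_transactions(filename):
    """Collect debit and credit amounts from the 4-value rows,
    skipping the 2-value first and last rows."""
    debits, credits = [], []
    with open(filename) as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            if len(row) <= 2:  # rows 1 and 13 only have 2 values
                continue
            if row[2] == " debit":
                debits.append(row[3])
            elif row[2] == " credit":
                credits.append(row[3])
    return debits, credits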
The client includes 3 rows at the bottom that contain totals for me to reconcile against in my program. The only problem is that my program exhausts the input file with readlines() before it can do anything else. Is there a way to keep the file from being exhausted during my get_recon_totals function call?
#!/usr/bin/env python
# pre_process.py

import csv
import sys

def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    with open(infile, 'rbU') as in_obj:
        # Create reader object, get fieldnames for later on
        reader, fieldnames = open_reader(in_obj)
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
        print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
        # This switches the dictionary to a sorted list... necessary??
        reader_list = sorted(reader, key=lambda key: (key['PEOPLE_ID'],
                                                      key['DON_DATE']))
        # Create a list to contain section header information
        header_list = create_header_list(reader_list)
        # Create dictionary that contains header list as the key,
        # then all rows that match as a list of dictionaries.
        master_dict = map_data(header_list, reader_list)
        # Write data to processed file, create recon counts to compare
        # to footer record
        tot_cnt, rec_cnt, erec_cnt = write_data(master_dict, outfile, fieldnames)
        print tot_cnt, rec_cnt, erec_cnt

def open_reader(file_obj):
    '''
    Uses DictReader from the csv module to take the first header line
    as the fieldnames, then applies them to each element in the file.
    Returns the DictReader object and the fieldnames being used (used
    later when data is printed out with DictWriter.)
    '''
    reader = csv.DictReader(file_obj, delimiter=',')
    return reader, reader.fieldnames

def create_header_list(in_obj):
    p_id_list = []
    for row in in_obj:
        if (row['PEOPLE_ID'], row['DON_DATE']) not in p_id_list:
            p_id_list.append((row['PEOPLE_ID'], row['DON_DATE']))
    return p_id_list

def map_data(header_list, data_obj):
    master_dict = {}
    client_section_list = []
    for element in header_list:
        for row in data_obj:
            if (row['PEOPLE_ID'], row['DON_DATE']) == element:
                client_section_list.append(row)
        element = list(element)
        element_list = [client_section_list[0]['DEDUCT_AMT'],
                        client_section_list[0]['ND_AMT'],
                        client_section_list[0]['DEDUCT_YTD'],
                        client_section_list[0]['NONDEDUCT_YTD']
                        ]
        try:
            element_list.append((float(client_section_list[0]['DEDUCT_YTD']) +
                                 float(client_section_list[0]['NONDEDUCT_YTD'])
                                 ))
        except ValueError:
            pass
        element.extend(element_list)
        element = tuple(element)
        master_dict[element] = client_section_list
        client_section_list = []
    return master_dict

def write_data(in_obj, outfile, in_fieldnames):
    with open(outfile, 'wb') as writer_outfile:
        writer = csv.writer(writer_outfile, delimiter=',')
        dict_writer = csv.DictWriter(writer_outfile,
                                     fieldnames=in_fieldnames,
                                     extrasaction='ignore')
        tot_cnt = 0
        rec_cnt = 0
        email_cnt = 0
        for k, v in in_obj.iteritems():
            writer_outfile.write(' -01- ')
            writer.writerow(k)
            rec_cnt += 1
            for i, e in enumerate(v):
                if v[i]['INT_CODE_EX0006'] != '' or v[i]['INT_CODE_EX0028'] != '':
                    email_cnt += 1
                writer_outfile.write(' -02- ')
                dict_writer.writerow(e)
                tot_cnt += 1
        return tot_cnt, rec_cnt, email_cnt

def get_recon_totals(in_obj):
    print in_obj
    client_tot_cnt = 0
    client_rec_cnt = 0
    client_erec_cnt = 0
    for line in in_obj.readlines():
        line = line.split(',')
        if line[0] == 'T' and line[1] == 'Total Amount':
            print 'Total Amount found.'
            client_tot_cnt = line[2]
        elif line[0] == 'T' and line[1] == 'Receipt Count':
            print 'Receipt Count found.'
            client_rec_cnt = line[2]
        elif line[0] == 'T' and line[1] == 'Email Receipt Count':
            print 'E-Receipt Count Found.'
            client_erec_cnt = line[2]
    return client_tot_cnt, client_rec_cnt, client_erec_cnt

if __name__ == '__main__':
    main()
If your file is not very large, you can convert the reader generator to a list of dictionaries by calling list() on the reader, and then use that list in your code instead of trying to read from the file directly.
Example:
def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    with open(infile, 'rbU') as in_obj:
        # Create reader object, get fieldnames for later on
        reader, fieldnames = open_reader(in_obj)
        reader_list = list(reader)
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(reader_list)
        print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
        # This switches the dictionary to a sorted list... necessary??
        reader_list = sorted(reader_list, key=lambda key: (key['PEOPLE_ID'],
                                                           key['DON_DATE']))
.
.
def get_recon_totals(reader_list):
    client_tot_cnt = 0
    client_rec_cnt = 0
    client_erec_cnt = 0
    for line in reader_list:  # line here is a dict
        if line[<fieldname for first column>] == 'T' and line[<fieldname for second column>] == 'Total Amount':
            print 'Total Amount found.'
            client_tot_cnt = line[<fieldname for third column>]
        .
        .  # continued like above
        .
    return client_tot_cnt, client_rec_cnt, client_erec_cnt
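Concretely, with hypothetical fieldnames REC_TYPE, LABEL and VALUE standing in for whatever your header row actually calls those columns, the loop might look like this (alternatively, calling in_obj.seek(0) before re-reading would rewind the file and avoid the copy):

def get_recon_totals(reader_list):
    client_tot_cnt = 0
    client_rec_cnt = 0
    client_erec_cnt = 0
    for line in reader_list:  # each line is a dict keyed by the header row
        if line['REC_TYPE'] != 'T':
            continue
        if line['LABEL'] == 'Total Amount':
            client_tot_cnt = line['VALUE']
        elif line['LABEL'] == 'Receipt Count':
            client_rec_cnt = line['VALUE']
        elif line['LABEL'] == 'Email Receipt Count':
            client_erec_cnt = line['VALUE']
    return client_tot_cnt, client_rec_cnt, client_erec_cnt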
I have a directory with 5+ invalid CSV files. I have no problems reading the files and then writing them as "good" CSV files one at a time. But when I try to process a second file I get "IndexError: array index out of range".
import xlrd
import csv, sys, os
import datetime, time
import logging
import Gmail_email

program = "CleanCSV"
date = datetime.datetime(1899, 12, 30)
argv0 = ""
argv1 = 'c:/tmp/checkEmail/'  # input directory
argv2 = "f:/foo/in/bar-"      # output directory
sys.argv = [argv0, argv1, argv2]
inDir = sys.argv[1]   # input directory
outDir = sys.argv[2]  # output directory
lList = []  # holder list for names of files to be processed

def processFiles():
    try:  # makes list of local files in inDir, populates lList
        if os.listdir(inDir) == []:  # checks for files in inDir
            logging.info('No Files to upload')
            exit()
        else:
            for file_name in os.listdir(inDir):
                #print file_name
                if os.path.isfile(inDir + file_name):
                    lList.append(file_name)  # populate local dir list
            if 'Thumbs.db' in lList:  # remove windows thumbs file
                lList.remove('Thumbs.db')
            logging.info('Files to be checked')
            logging.info('%s', lList)
            #print lList, 'lList'
    except Exception, e:
        Gmail_email.email(e, program)
        logging.warning('Error with local files')
        logging.warning('%s', e)
        exit()
    for each in lList:  # calls the cleanup method for each file in lList
        filePath = inDir + each
        print filePath, "filepath"
        testFile(filePath)

def testFile(filePath):
    try:
        with open(filePath, "rb") as csvfile:
            spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
            for row in spamreader:
                #print "good file, most likely"
                pass
    except Exception, e:
        logging.warning('Error with local files')
        logging.warning('%s', e)
        #print "cleaning bad file", filePath
        cleanBadFile(filePath)

def cleanBadFile(filePath):
    timestr = time.strftime("%Y%m%d-%H%M%S")
    #print "bad file trying to clean"
    f = open(outDir + timestr + ".csv", 'ab')
    try:  # can i read the file
        workbook = xlrd.open_workbook(filePath)
        # will error here if xlrd cannot open it
        print workbook.sheet_names()
        #print workbook
    except Exception, e:
        #print e, " error"
        pass
    worksheet = workbook.sheet_by_name('Sheet')
    num_rows = worksheet.nrows - 1
    num_cells = worksheet.ncols - 1
    #print worksheet.ncols, 'num cells'
    curr_row = -1
    while curr_row < num_rows:  # goes over every row
        num_cells = worksheet.ncols - 1
        curr_row += 1
        row = worksheet.row(curr_row)
        print row, "row"
        curr_cell = -1
        print worksheet.row_len(curr_row), "row len"
        print curr_row, curr_cell, "curr row, curr cell"
        cell_type = worksheet.cell_type(curr_row, curr_cell)
        cell_value = worksheet.cell_value(curr_row, curr_cell)
        print ' ', cell_type, ':', cell_value
        values = []
        if cell_type == 0:  # tests if first value in row is data
            # assuming that good rows will have a value in the first cell of each row
            # if no data, the row is not copied to the new file
            print "bad line"
        else:
            while curr_cell < num_cells:
                curr_cell += 1
                # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                print curr_row, "; ", curr_cell, " row and cell"
                cell_type = worksheet.cell_type(curr_row, curr_cell)
                cell_value = worksheet.cell_value(curr_row, curr_cell)
                #print cell_type, ":", cell_value
                if cell_type == xlrd.XL_CELL_DATE:
                    cell_value = datetime.timedelta(int(cell_value))
                    cell_value = str(date + cell_value)[:10]
                    #print cell_value, "cell value, cell date"
                values.append(cell_value)
            #print values, "values"
            csv.writer(f, delimiter=',',
                       quotechar=',', quoting=csv.QUOTE_MINIMAL).writerow(values)
    f.close()
    print f.closed
    print "ah"
    curr_cell = 0
    curr_row = 0

processFiles()
exit
The error message:
Traceback (most recent call last):
  File "F:\cleanCSV.py", line 132, in <module>
    processFiles()
  File "F:\cleanCSV.py", line 51, in processFiles
    testFile(filePath)
  File "F:\cleanCSV.py", line 64, in testFile
    cleanBadFile(filePath)
  File "F:\cleanCSV.py", line 106, in cleanBadFile
    cell_type = worksheet.cell_type(curr_row, curr_cell)
  File "C:\Python27\lib\site-packages\xlrd\sheet.py", line 413, in cell_type
    return self._cell_types[rowx][colx]
IndexError: array index out of range
I feel like I need to "reset" a counting variable, but I think I have them all. I don't know what to do.
Two lines before the line causing the exception, curr_cell is set to -1, which can't be a valid cell index. A comment some lines further down suggests you expect that to be the first cell in the row, so the index should be 0 instead of -1.
I moved my +1 (curr_cell += 1) down 3 lines:
while curr_cell < num_cells:
    # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
    #print curr_row, "; ", curr_cell, " row and cell"
    cell_type = worksheet.cell_type(curr_row, curr_cell)
    cell_value = worksheet.cell_value(curr_row, curr_cell)
    print cell_type, ":", cell_value
    curr_cell += 1
    if cell_type == xlrd.XL_CELL_DATE:
        cell_value = datetime.timedelta(int(cell_value))
        cell_value = str(date + cell_value)[:10]
        #print cell_value, "cell value, cell date"
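For what it's worth, a simpler sketch that sidesteps the manual counters entirely, iterating rows and columns with range (assuming the same worksheet and date variables as above):

for row_idx in range(worksheet.nrows):
    values = []
    for col_idx in range(worksheet.ncols):
        cell_type = worksheet.cell_type(row_idx, col_idx)
        cell_value = worksheet.cell_value(row_idx, col_idx)
        if cell_type == xlrd.XL_CELL_DATE:
            # xlrd stores dates as day counts from the 1899-12-30 epoch
            cell_value = str(date + datetime.timedelta(int(cell_value)))[:10]
        values.append(cell_value)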
I am trying to write some output to CSV from my code below. The first column should have all of the valid IDs, with a header that says "Valid (count in parentheses)". The second column should contain a list of all of the non-valid IDs and have a header that says "Non-valid (count in parentheses)". Any idea how I do this?
import csv

csv_path = r'C:\temp\data\fileA'
reader = csv.reader(open(csv_path, 'r'), dialect='excel-tab')
reader.next()  # ignore heading
min_id = 1503332138
max_id = 1503632138
valid_ids = []
invalid = []
for line in reader:
    id = line[1]  # note: shadows the builtin id()
    if id.isdigit() and min_id <= int(id) <= max_id:
        if id not in valid_ids:
            valid_ids.append(id)
    else:
        if id not in invalid:
            invalid.append(id)
print 'Valid IDs (', len(valid_ids), ')'
for valid in valid_ids:
    print valid
print 'Invalid IDs (', len(invalid), ')'
for inv in invalid:  # renamed so the list itself isn't shadowed
    print inv
# ...
# Continuing from the point where you have the valid_ids and invalid lists populated
data = [('Valid IDs', valid_ids), ('Invalid IDs', invalid)]

# Create header
header = []
for (label, id_list) in data:
    label_with_count = '%s (%d)' % (label, len(id_list))
    header.append(label_with_count)

# Write to CSV file ('wb' mode is needed for csv writing on Python 2)
with open('path_to_output_file.csv', 'wb') as out_csv_file:
    csv_writer = csv.writer(out_csv_file)
    csv_writer.writerow(header)
    for (idx, dataset) in enumerate(data):
        (label, id_list) = dataset
        for id in id_list:
            # pad with empty cells so each list lands in its own column
            row = (idx * ['']) + [id] + ((len(data) - idx - 1) * [''])
            csv_writer.writerow(row)
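Note that this writes the two blocks one after the other, each in its own column. If you would rather pair the nth valid ID with the nth invalid ID on the same row, a sketch using izip_longest (zip_longest on Python 3) would do it:

from itertools import izip_longest

with open('path_to_output_file.csv', 'wb') as out_csv_file:
    csv_writer = csv.writer(out_csv_file)
    csv_writer.writerow(header)
    # pad the shorter list with empty cells so the rows stay aligned
    for pair in izip_longest(valid_ids, invalid, fillvalue=''):
        csv_writer.writerow(pair)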
I have a date object that needs to be uploaded into a database from a CSV file. When I make a query to upload the row into the DB, I get this error:
Incorrect syntax near the keyword 'of'. (156) (SQLExecDirectW)
Code to upload the data:
with open(UploadFile, "r") as uploadData:
    i = 0
    flag = 0
    formatter_string = "%d/%m/%y"
    for row in reader:
        if flag == 0:
            flag = flag + 1
        else:
            datetime_object = datetime.strptime(row[0], formatter_string)
            row[0] = datetime_object.date()
            cursor.execute("insert into "+UploadTable+" values ("+row[0]+","+nullcheckstr(row[1])+","+nullcheckint(row[2])+","+nullcheckint(row[3])+","+nullcheckint(row[4])+","+nullcheckint(row[5])+","+nullcheckint(row[6])+","+nullcheckint(row[7])+","+nullcheckint(row[8])+")")
            print "insert into "+UploadTable+" values ("+str(row[0])+","+nullcheckstr(row[1])+","+nullcheckint(row[2])+","+nullcheckint(row[3])+","+nullcheckint(row[4])+","+nullcheckint(row[5])+","+nullcheckint(row[6])+","+nullcheckint(row[7])+","+nullcheckint(row[8])+")"
            i = i + 1
    print 'inserted ' + str(i) + ' rows'
    cnxn.commit()
row[0] is a date.
nullcheckint/nullcheckstr: check that the value is not null.
with open(UploadFile, "r") as uploadData:
    i = 0
    flag = 0
    formatter_string = "%d/%m/%y"
    d = []
    for row in reader:
        if flag == 0:
            flag = flag + 1
        else:
            datetime_object = datetime.strptime(row[0], formatter_string)
            row[0] = datetime_object.date()
            temp = []
            # enumerate yields (index, value) pairs
            for val, item in enumerate(row):
                if val == 0:
                    temp.append(str(row[0]))
                elif val == 1:
                    temp.append(nullcheckstr(item))
                else:
                    temp.append(nullcheckint(item))
            d.append(temp)
    for row in d:
        # one placeholder per column (9 values per row)
        cursor.execute("insert into "+UploadTable+" values (?,?,?,?,?,?,?,?,?)", row)
    print 'inserted rows'
    cnxn.commit()
Give this code a try. It's a lot cleaner and easier to debug. Without knowing what your input file looks like, I have to assume the bug was in your jumbled code.
A few bugs/inconsistencies:
- You never use uploadData
- You never declare reader
with open(UploadFile, "r") as uploadData:
    i = 0
    # Since this is just a flag, use True/False
    flag = False
    formatter_string = "%d/%m/%y"
    for row in reader:
        if not flag:
            flag = True
        else:
            datetime_object = datetime.strptime(row[0], formatter_string)
            # Format each section of the row
            row[0] = datetime_object.date()
            row[1] = nullcheckstr(row[1])
            row[2:] = list(map(nullcheckint, row[2:]))
            # Use `str.format` to make this statement MUCH cleaner
            # (map(str, ...) because row[0] is a date object, not a string)
            sql = "insert into {} values ({})".format(UploadTable, ','.join(map(str, row)))
            cursor.execute(sql)
            i = i + 1
    print 'inserted {} rows'.format(i)
    cnxn.commit()
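That said, building the SQL by string concatenation is fragile: any unquoted text value (for example one containing the word 'of') becomes part of the statement itself, which is most likely where the original error came from. A safer sketch using parameter placeholders, assuming a pyodbc-style cursor and the reader, UploadTable, nullcheckstr and nullcheckint names from the question:

# one '?' placeholder per column; the driver handles quoting and types
placeholders = ','.join(['?'] * 9)
sql = "insert into {} values ({})".format(UploadTable, placeholders)
header_skipped = False
for row in reader:
    if not header_skipped:  # skip the header line, as the flag did above
        header_skipped = True
        continue
    row[0] = datetime.strptime(row[0], "%d/%m/%y").date()
    row[1] = nullcheckstr(row[1])
    row[2:] = [nullcheckint(v) for v in row[2:]]
    cursor.execute(sql, row)
cnxn.commit()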