I am trying to write some output to CSV from my code below. The first column should have all of the valid IDs, with a header that says "Valid (count in parentheses)". The second column should contain a list of all of the non-valid IDs, with a header that says "Non-valid (count in parentheses)". Any idea how I do this?
import csv

csv_path = r'C:\temp\data\fileA'
reader = csv.reader(open(csv_path, 'r'), dialect='excel-tab')
reader.next()  # skip the heading row
min_id = 1503332138
max_id = 1503632138
valid_ids = []
invalid = []
for line in reader:
    row_id = line[1]
    if row_id.isdigit() and min_id <= int(row_id) <= max_id:
        if row_id not in valid_ids:
            valid_ids.append(row_id)
    else:
        if row_id not in invalid:
            invalid.append(row_id)
print 'Valid IDs (', len(valid_ids), ')'
for valid in valid_ids:
    print valid
print 'Invalid IDs (', len(invalid), ')'
for inv in invalid:
    print inv
# ...
# Continuing from the point where you have the valid_ids and invalid lists populated
data = [('Valid IDs', valid_ids), ('Invalid IDs', invalid)]

# Create the header, with each list's count in parentheses
header = []
for (label, id_list) in data:
    label_with_count = '%s (%d)' % (label, len(id_list))
    header.append(label_with_count)

# Write to CSV file ('wb' is needed for the csv module on Python 2)
with open('path_to_output_file.csv', 'wb') as out_csv_file:
    csv_writer = csv.writer(out_csv_file)
    csv_writer.writerow(header)
    for (idx, dataset) in enumerate(data):
        (label, id_list) = dataset
        for id_value in id_list:
            # pad with empty cells so each list lands in its own column
            row = (idx * ['']) + [id_value] + ((len(data) - idx - 1) * [''])
            csv_writer.writerow(row)
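The loop above writes the two columns one after the other, so the invalid IDs start on the row below the last valid ID. If you would rather have the lists side by side from row one, you could pad the shorter list instead. A minimal sketch, assuming Python 2 and the same output path as above:

import csv
from itertools import izip_longest  # zip_longest on Python 3

with open('path_to_output_file.csv', 'wb') as out_csv_file:
    csv_writer = csv.writer(out_csv_file)
    csv_writer.writerow(['Valid (%d)' % len(valid_ids),
                         'Non-valid (%d)' % len(invalid)])
    # izip_longest pads the shorter column with '' so the rows stay aligned
    for pair in izip_longest(valid_ids, invalid, fillvalue=''):
        csv_writer.writerow(pair)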
I'm currently attempting to create a program that can read a CSV, determine if a substring is included in one of the columns of each row, and, if it isn't present, rewrite certain columns to a new CSV. I have the code down for this much, but the CSV I need to run the program on has well over 3 million rows. I use PyCharm and currently I'm not able to process this much data; it can only view the CSV in a read-only format, which doesn't let me work with it. I know pandas has a chunksize feature but I don't know how to implement it with the rest of my code.
import csv
import pandas as pd

def reading(csv_input):
    originalLength = 0
    rowCount = 0
    with open(f'Web Report {csv_input}', 'w') as file:
        writer = csv.writer(file)
        writer.writerow(['Index', 'URL Category', 'User IP', 'URL'])
        dropCount = 0
        data = pd.read_csv(csv_input, chunksize=100000)
        df = pd.DataFrame(data,
                          columns=['Line', 'Date', 'Hour', 'User Name', 'User IP', 'Site Name',
                                   'URL Category', 'Action', 'Action Description'])
        originalLength = len(df.index)
        for line in range(originalLength):
            dataLine = df.loc[line]
            x = dataLine.get(key='Action')
            if x == 0:
                siteName = dataLine.get(key='Site Name')
                if 'dbk' in siteName:
                    dropCount = dropCount + 1
                elif 'ptc' in siteName:
                    dropCount = dropCount + 1
                elif 'wcf' in siteName:
                    dropCount = dropCount + 1
                elif 'google' in siteName:
                    dropCount = dropCount + 1
                else:
                    writer.writerow([line,                                  # Original Index
                                     df.loc[line].get(key='URL Category'),  # Original URL Category
                                     df.loc[line].get(key='User IP'),       # Original User IP
                                     df.loc[line].get(key='Site Name')])    # Original Site Name
                    rowCount = rowCount + 1
            else:
                dropCount = dropCount + 1
    print("Input: " + str(csv_input))
    print("Output: " + str(file.name))
    print("Original Length: " + str(originalLength))
    print("Current Length: " + str(rowCount))
    print("Drop Count: " + str(dropCount) + "\n")
    return df
If you use the csv module to write the file, then you can also use it to read the input row by row.
import csv

# newline='' avoids blank lines between rows on Windows (Python 3)
with open('input.csv') as infile, open('output.csv', 'w', newline='') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)

    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)

    # process rows
    for row in csv_reader:  # read row by row
        # keep only rows with an even index
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
If you want to use pandas with chunks, then you need a for-loop to iterate over them. And when you write with pandas, you need append mode without headers for every chunk after the first.
import pandas as pd

first = True
for df in pd.read_csv('input.csv', chunksize=1):  # read row by row
    # keep only rows with an even index
    if df.index[0] % 2 == 0:
        print('--- row ---')
        print(df)
        if first:
            # create a new file with headers
            df.to_csv('output.csv', mode='w')
            first = False
        else:
            # append to the existing file without headers
            df.to_csv('output.csv', mode='a', header=False)
Minimal working code
import pandas as pd
import csv

# --- create some data ---
data = {
    'A': range(0, 10),
    'B': range(10, 20),
    'C': range(20, 30),
}  # columns
df = pd.DataFrame(data)
df.to_csv('input.csv', index=False)

# --- read and write with `pandas` ---
first = True
for df in pd.read_csv('input.csv', chunksize=1):  # read row by row
    # keep only rows with an even index
    if df.index[0] % 2 == 0:
        print('--- row ---')
        print(df)
        if first:
            # create a new file with headers
            df.to_csv('output_pandas.csv', mode='w')
            first = False
        else:
            # append to the existing file without headers
            df.to_csv('output_pandas.csv', mode='a', header=False)

# --- read and write with `csv` ---
with open('input.csv') as infile, open('output.csv', 'w', newline='') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)

    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)

    # process rows
    for row in csv_reader:
        # keep only rows with an even index
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
Doc: read_csv(), to_csv()
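Applied to the original reading function, the same pattern would look roughly like this. It is a sketch under a few assumptions (the column names match the real file, Action == 0 marks rows to keep, and the output filename follows the original); the point is that each chunk is filtered with a boolean mask instead of being indexed row by row:

import pandas as pd

def reading(csv_input):
    originalLength = 0
    rowCount = 0
    skip = ('dbk', 'ptc', 'wcf', 'google')  # substrings that drop a row

    first = True
    for chunk in pd.read_csv(csv_input, chunksize=100000):
        originalLength += len(chunk.index)
        # keep rows where Action == 0 and Site Name contains none of the substrings
        mask = ((chunk['Action'] == 0) &
                ~chunk['Site Name'].str.contains('|'.join(skip), na=False))
        kept = chunk.loc[mask, ['URL Category', 'User IP', 'Site Name']]
        rowCount += len(kept.index)
        # write headers once, then append without them
        kept.to_csv(f'Web Report {csv_input}',
                    mode='w' if first else 'a', header=first)
        first = False

    print("Original Length: " + str(originalLength))
    print("Current Length: " + str(rowCount))
    print("Drop Count: " + str(originalLength - rowCount) + "\n")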
I am using Python to parse data from the following csv file -
{::[name]str1_str2_str3[0]},1,U0.00 - Sensor1 Not Ready\nTry Again,1,0,12
{::[name]str1_str2_str3[0]},2,U0.01 - Sensor2 Not Ready\nTry Again,1,0,12
{::[name]str1_str2_str3[0]},3,U0.02 - \n,1,0,12
{::[name]str1_str2_str3[0]},4,U0.03 - Sensor4 Not Ready\nTry Again,1,0,12
{::[name]str1_str2_str3[0]},5,U0.04 - \n,1,0,12
From column 1, I am parsing the value 0 within the [ ]. Then I take the value in column 2, and from column 3 I parse the substring "Sensor1 Not Ready". These are then printed to another file as follows -
SENSOR1_NOT_READY 0,1
SENSOR2_NOT_READY 0,2
and so on...
Now when I print the parsed values I get the following -
SENSOR1_NOT_READY 0,1
SENSOR2_NOT_READY 0,2
SENSOR2_NOT_READY 0,3
SENSOR4_NOT_READY 0,4
SENSOR4_NOT_READY 0,5
I want to skip printing the lines with no data in column3 (for example - lines 3 and 5 in the csv file). How should I do that?
Expected output -
SENSOR1_NOT_READY 0,1
SENSOR2_NOT_READY 0,2
SENSOR4_NOT_READY 0,4
Following is my Python script -
with open('filename.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        tag_name = row[0]
        bit_num = row[1]
        error_name = row[2]
        # Regular expressions
        term0 = r'\[(\d)\].*'
        term1 = r'(\d+)'
        term2 = r'.*-\s([\w\s]+)\\n'
        capture0 = list(re.search(term0, tag_name).groups())
        capture1 = list(re.search(term1, bit_num).groups())
        temp = re.search(term2, error_name)
        if temp:
            result = list(temp.groups())
        else:
            None
        result[-1] = '_'.join(result[-1].split()).upper()
        capture2 = ','.join(result)
        tp = (capture0[0], capture1[0], capture2)  # Tuple
        f.write('{2} {0},{1},\n'.format(tp[0], tp[1], tp[2]))
Build a regex that matches the 'normal' lines. Maybe something like "^U0.0[1-5] - \n$"? Then use something like if not re.search(term3, error_name): before you print the error.
with open('filename.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        tag_name = row[0]
        bit_num = row[1]
        error_name = row[2]
        # Regular expressions
        term0 = r'\[(\d)\].*'
        term1 = r'(\d+)'
        term2 = r'.*-\s([\w\s]+)\\n'
        term3 = r'^U0.0[1-5] - \\n$'  # literal backslash-n, matching term2
        capture0 = list(re.search(term0, tag_name).groups())
        capture1 = list(re.search(term1, bit_num).groups())
        temp = re.search(term2, error_name)
        if temp:  # only build the result when column 3 actually matched
            result = list(temp.groups())
            result[-1] = '_'.join(result[-1].split()).upper()
            capture2 = ','.join(result)
            tp = (capture0[0], capture1[0], capture2)  # Tuple
            if not re.search(term3, error_name):
                f.write('{2} {0},{1},\n'.format(tp[0], tp[1], tp[2]))  # I assume this is the print line?
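Note that term2 alone can already do the skipping: lines with nothing after the dash (like "U0.02 - \n") never match it, so bailing out when that search fails makes the extra term3 check unnecessary. A sketch of that variant, reusing the names from the loop above:

temp = re.search(term2, error_name)
if temp is None:
    continue  # column 3 carries no sensor text, skip this line
result = list(temp.groups())
result[-1] = '_'.join(result[-1].split()).upper()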
The client includes 3 rows at the bottom that contain totals for me to reconcile against in my program. The only problem is that my program exhausts the input file with readlines() before it can do anything else. Is there a way to keep the file from being exhausted during my get_recon_totals function call?
#!/usr/bin/env python
# pre_process.py
import csv
import sys

def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    with open(infile, 'rbU') as in_obj:
        # Create reader object, get fieldnames for later on
        reader, fieldnames = open_reader(in_obj)
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
        print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
        # This switches the dictionary to a sorted list... necessary??
        reader_list = sorted(reader, key=lambda key: (key['PEOPLE_ID'],
                                                      key['DON_DATE']))
        # Create a list to contain section header information
        header_list = create_header_list(reader_list)
        # Create dictionary that contains header list as the key,
        # then all rows that match as a list of dictionaries.
        master_dict = map_data(header_list, reader_list)
        # Write data to processed file, create recon counts to compare
        # to footer record
        tot_cnt, rec_cnt, erec_cnt = write_data(master_dict, outfile, fieldnames)
        print tot_cnt, rec_cnt, erec_cnt

def open_reader(file_obj):
    '''
    Uses DictReader from the csv module to take the first header line
    as the fieldnames, then applies them to each element in the file.
    Returns the DictReader object and the fieldnames being used (used
    later when data is printed out with DictWriter.)
    '''
    reader = csv.DictReader(file_obj, delimiter=',')
    return reader, reader.fieldnames

def create_header_list(in_obj):
    p_id_list = []
    for row in in_obj:
        if (row['PEOPLE_ID'], row['DON_DATE']) not in p_id_list:
            p_id_list.append((row['PEOPLE_ID'], row['DON_DATE']))
    return p_id_list

def map_data(header_list, data_obj):
    master_dict = {}
    client_section_list = []
    for element in header_list:
        for row in data_obj:
            if (row['PEOPLE_ID'], row['DON_DATE']) == element:
                client_section_list.append(row)
        element = list(element)
        element_list = [client_section_list[0]['DEDUCT_AMT'],
                        client_section_list[0]['ND_AMT'],
                        client_section_list[0]['DEDUCT_YTD'],
                        client_section_list[0]['NONDEDUCT_YTD']
                        ]
        try:
            element_list.append((float(client_section_list[0]['DEDUCT_YTD']) +
                                 float(client_section_list[0]['NONDEDUCT_YTD'])
                                 ))
        except ValueError:
            pass
        element.extend(element_list)
        element = tuple(element)
        master_dict[element] = client_section_list
        client_section_list = []
    return master_dict

def write_data(in_obj, outfile, in_fieldnames):
    with open(outfile, 'wb') as writer_outfile:
        writer = csv.writer(writer_outfile, delimiter=',')
        dict_writer = csv.DictWriter(writer_outfile,
                                     fieldnames=in_fieldnames,
                                     extrasaction='ignore')
        tot_cnt = 0
        rec_cnt = 0
        email_cnt = 0
        for k, v in in_obj.iteritems():
            writer_outfile.write(' -01- ')
            writer.writerow(k)
            rec_cnt += 1
            for i, e in enumerate(v):
                if v[i]['INT_CODE_EX0006'] != '' or v[i]['INT_CODE_EX0028'] != '':
                    email_cnt += 1
                writer_outfile.write(' -02- ')
                dict_writer.writerow(e)
                tot_cnt += 1
        return tot_cnt, rec_cnt, email_cnt

def get_recon_totals(in_obj):
    print in_obj
    client_tot_cnt = 0
    client_rec_cnt = 0
    client_erec_cnt = 0
    for line in in_obj.readlines():
        line = line.split(',')
        if line[0] == 'T' and line[1] == 'Total Amount':
            print 'Total Amount found.'
            client_tot_cnt = line[2]
        elif line[0] == 'T' and line[1] == 'Receipt Count':
            print 'Receipt Count found.'
            client_rec_cnt = line[2]
        elif line[0] == 'T' and line[1] == 'Email Receipt Count':
            print 'E-Receipt Count Found.'
            client_erec_cnt = line[2]
    return client_tot_cnt, client_rec_cnt, client_erec_cnt

if __name__ == '__main__':
    main()
If your file is not very large, you can convert the reader generator to a list of dictionaries by calling list() on the reader, and then use that list in your code instead of trying to read from the file directly.
Example -
def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    with open(infile, 'rbU') as in_obj:
        # Create reader object, get fieldnames for later on
        reader, fieldnames = open_reader(in_obj)
        reader_list = list(reader)
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(reader_list)
        print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
        # This switches the dictionary to a sorted list... necessary??
        reader_list = sorted(reader_list, key=lambda key: (key['PEOPLE_ID'],
                                                           key['DON_DATE']))
        .
        .

def get_recon_totals(reader_list):
    print reader_list
    client_tot_cnt = 0
    client_rec_cnt = 0
    client_erec_cnt = 0
    for line in reader_list:  # line here is a dict
        if line[<fieldname for first column>] == 'T' and line[<fieldname for second column>] == 'Total Amount':
            print 'Total Amount found.'
            client_tot_cnt = line[<fieldname for third column>]
        .
        .  # continued like above
        .
    return client_tot_cnt, client_rec_cnt, client_erec_cnt
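If you'd rather not hold the whole file in memory, another option is to let get_recon_totals read the raw file first and then rewind it before building the DictReader. A sketch, assuming in_obj is a plain seekable file object as in the original main():

with open(infile, 'rbU') as in_obj:
    # scan the raw lines for the three footer totals first
    nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
    in_obj.seek(0)  # rewind, so the file is no longer "exhausted"
    reader, fieldnames = open_reader(in_obj)
    # ... continue exactly as before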
I have a date object that needs to be uploaded into a database from a CSV file. When I run the query to upload the row into the DB, I get this error:
Incorrect syntax near the keyword 'of'. (156) (SQLExecDirectW)")
Code to upload data:
with open(UploadFile, "r") as uploadData:
    i = 0
    flag = 0
    formatter_string = "%d/%m/%y"
    for row in reader:
        if(flag == 0):
            flag = flag + 1
        else:
            datetime_object = datetime.strptime(row[0], formatter_string)
            row[0] = datetime_object.date()
            cursor.execute("insert into "+UploadTable+" values ("+row[0]+","+nullcheckstr(row[1])+","+nullcheckint(row[2])+","+nullcheckint(row[3])+","+nullcheckint(row[4])+","+nullcheckint(row[5])+","+nullcheckint(row[6])+","+nullcheckint(row[7])+","+nullcheckint(row[8])+")")
            print "insert into "+UploadTable+" values ("+str(row[0])+","+nullcheckstr(row[1])+","+nullcheckint(row[2])+","+nullcheckint(row[3])+","+nullcheckint(row[4])+","+nullcheckint(row[5])+","+nullcheckint(row[6])+","+nullcheckint(row[7])+","+nullcheckint(row[8])+")"
            i = i + 1
print 'inserted ' + str(i) + ' rows'
cnxn.commit()
row[0] is a date.
nullcheckint/nullcheckstr: helpers that check the value is not null.
with open(UploadFile, "r") as uploadData:
    i = 0
    flag = 0
    formatter_string = "%d/%m/%y"
    d = []
    for row in reader:
        if(flag == 0):
            flag = flag + 1
        else:
            datetime_object = datetime.strptime(row[0], formatter_string)
            row[0] = datetime_object.date()
            temp = []
            # enumerate yields (index, value)
            for idx, val in enumerate(row):
                if idx == 0:
                    temp.append(str(row[0]))
                elif idx == 1:
                    temp.append(nullcheckstr(row[1]))
                else:
                    temp.append(nullcheckint(row[idx]))
            d.append(temp)
    for row in d:
        # one placeholder per column (nine, matching the original insert)
        cursor.execute("insert into "+UploadTable+" values(?,?,?,?,?,?,?,?,?)", row)
    print 'inserted rows'
    cnxn.commit()
Give this code a try. It's a lot cleaner and easier to debug. Without knowing what your input file looks like, I have to assume the bug was in your jumbled code.
A few bugs/inconsistencies:
You never use uploadData
You never declare reader
with open(UploadFile, "r") as uploadData:
    i = 0
    # Since this is just a flag, use True/False
    flag = False
    formatter_string = "%d/%m/%y"
    for row in reader:
        if not flag:
            flag = True
        else:
            datetime_object = datetime.strptime(row[0], formatter_string)
            # Format each section of the row (as strings, so join() works)
            row[0] = str(datetime_object.date())
            row[1] = nullcheckstr(row[1])
            row[2:] = list(map(nullcheckint, row[2:]))
            # Use `str.format` to make this statement MUCH cleaner
            sql = "insert into {} values ({})".format(UploadTable, ','.join(row))
            cursor.execute(sql)
            i = i + 1
print 'inserted {} rows'.format(i)
cnxn.commit()
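String-building the VALUES clause works, but letting the driver bind the values is safer (no quoting or injection issues). A sketch of that variant, assuming a pyodbc-style cursor and that UploadTable itself is trusted, since identifiers can't be parameterized:

# one "?" placeholder per column; the driver handles quoting and types
placeholders = ','.join(['?'] * len(row))
sql = "insert into {} values ({})".format(UploadTable, placeholders)
cursor.execute(sql, row)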
I have a csv file composed of three columns.
My goal is to add a fourth column and populate it with a statement based on columns 2 and 3.
Here is the beginning of my code :
import csv, sys, locale, operator

abord = "/home/julien/csv/ABORD.csv"
file1 = open(abord, 'rb')
reader1 = csv.reader(file1, delimiter=';', quotechar=' ')
next(reader1)
for row1 in reader1:
    ID = row1[0]
    LARG_1 = row1[1]
    LARG_2 = row1[2]
And I want to do something like this:
if LARG_1 > 10 and LARG_2 < 20:
    write "result OK" in a fourth column "CONTROL"
else:
    write "result fail" in the fourth column "CONTROL"
then save the csv, now composed of 4 columns.
Do you know how I could do it ? Thank you !
You have to write to another file (using a csv.writer):
import csv
import sys

sourcepath = "/home/julien/csv/ABORD.csv"
destpath = "/home/julien/csv/ABORD-fixed.csv"

with open(sourcepath, "rb") as source, open(destpath, "wb") as dest:
    # XXX are you sure you want this as quotechar ???
    reader = csv.reader(source, delimiter=';', quotechar=' ')
    writer = csv.writer(dest, delimiter=';', quotechar=' ')
    # first copy the (augmented) headers
    headers = reader.next()
    headers.append("CONTROL")
    writer.writerow(headers)
    # then let's loop on the content
    for rownum, row in enumerate(reader):
        # we need to convert data to int
        # adding proper error handling here might help...
        # status = "result OK" if (int(row[1]) > 10 and int(row[2]) < 20) else "result fail"
        try:
            l1 = int(row[1])
            l2 = int(row[2])
        except (TypeError, ValueError), e:
            err = "non integer value for l1 and/or l2 in row %s line %s - got: %s" % (
                rownum, reader.line_num, row
            )
            print >> sys.stderr, err
            result = "invalid values"
        else:
            if l1 > 10 and l2 < 20:
                result = "result OK"
            elif rownum == 42:  # stupid value for an if/elif/else example
                result = "what's the question ?"
            else:
                result = "result fail"
        row.append(result)
        writer.writerow(row)
If needed you can then delete the source file and rename the new one.
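For that last step, something along these lines should work (a sketch; the explicit remove is needed on Windows, where os.rename won't overwrite an existing file):

import os

os.remove(sourcepath)            # drop the original
os.rename(destpath, sourcepath)  # move the fixed file into its place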