The objective of this script is to take an incoming csv file, read it with a DictReader,
take the keys that were read, see if they match any of the pre-designated values in the fieldMap dictionary, and if they do match, append those keys to my hdrlist. Then, write the header list to an output file called ofp.
The issue that I am having is that when I don't have a key that matches one of the pre-designated values in fieldMap, I need to insert a blank (' ').
I've tried appending blank values to the hdrlist in an else statement and having a blank key value pair in my fieldMap dictionary:
if row.has_key(ft_test):
    hdrlist.append(ft_test)
else:
    hdrlist.append('')

'':['']  # blank key:value pair
,but then my:
if hdrlen != len(hdrlist)-1:
    print "Cannot find a key for %s in file %s" % (ft,fn)
error handling statement returns more print statements than I think it should, and I'm not sure as to why.
If anyone can shed some light on how to insert a blank into my ofp.write(fmtstring), it would be greatly appreciated.
Also, if anyone could shed some light on why I get more print statements than I think I should with the above else statement, that would be greatly appreciated as well.
My whole script is below, and if there is any other info needed to help me with this code, I will gladly provide it.
Here is a sample of an input file that would produce too many print statements.
input_file.csv = {'cust_no':1, 'streetaddr':'2103 Union Ave','address2':' ','city':'Chicago'}
#!/usr/bin/env python

import sys, csv, glob

fieldMap = {'zipcode':['Zip5', 'zip9','zipcode','ZIP','zip_code','zip','ZIPCODE'],
            'firstname':['firstname','FIRSTNAME'],
            'lastname':['lastname','LASTNAME'],
            'cust_no':['cust_no','CUST_NO'],
            'user_name':['user_name','USER_NAME'],
            'status':['status','STATUS'],
            'cancel_date':['cancel_date','CANCEL_DATE'],
            'reject_date':['REJECT_DATE','reject_date'],
            'streetaddr':['streetaddr','STREETADDR','ADDRESS','address'],
            'streetno':['streetno','STREETNO'],
            'streetnm':['streetnm','STREETNM'],
            'suffix':['suffix','SUFFIX'],  # suffix of street name: dr, ave, st
            'city':['city','CITY'],
            'state':['state','STATE'],
            'phone_home':['phone_home','PHONE_HOME'],
            'email':['email','EMAIL'],
            '':['']  # blank key:value pair
            }
def readFile(fn,ofp):
    count = 0
    CSVreader = csv.DictReader(open(fn,'rb'), dialect='excel', delimiter=',')
    for row in CSVreader:
        count += 1
        if count == 1:
            hdrlist = []
            for ft in fieldMap.keys():
                hdrlen = len(hdrlist)
                for ft_test in fieldMap[ft]:
                    if row.has_key(ft_test):
                        hdrlist.append(ft_test)
                if hdrlen != len(hdrlist)-1:
                    print "Cannot find a key for %s in file %s" % (ft,fn)
            if len(hdrlist) != 16:
                print "Another error. Not all headers have been assigned new values."
        if count < 5:
            x = len(hdrlist)
            fmtstring = "%s\t" * len(hdrlist) % tuple(row[x] for x in hdrlist)
            ofp.write(fmtstring)
        break
if __name__ == '__main__':
    filenames = glob.glob(sys.argv[1])
    ofp = sys.stdout
    ofp.write("zipcode\tfirstname\tlastname\tcust_no\tuser_name\tstatus\t"
              "cancel_date\treject_date\tstreetaddr\tstreetno\tstreetnm\t"
              "suffix\tcity\tstate\tphone_home\temail")
    for filename in filenames:
        readFile(filename,ofp)
Sample data:
cust_no,status,streetaddr,address2,city,state,zipcode,billaddr,servaddr,title,latitude,longitude,custsize,telemarket,dirmail,nocredhold,email,phone_home,phone_work,phone_fax,phone_page,phone_cell,phone_othr,taxrate1,taxrate2,taxrate3,taxtot,company,firstname,lastname,user_name,dpbc,container,seq,paytype_am,paytype_di,paytype_mc,paytype_vi
0,0,'123 fake st.',,'chicago','il',60185,'123 billaddr st.','123 servaddr st.','mr.',43.123,54.234 ,2000,'TRUE','TRUE','TRUE','email#email.com',(666)555-6666,,,,,,,,,,,'bob','smith','bob smith',,,,'TRUE','TRUE','TRUE','TRUE'
0,0,'123 fake st.','','chicago','il',60185,'123 billaddr st.','123 servaddr st.','mr.',43.123,54.234 ,2000,'TRUE','TRUE','TRUE','email#email.com',(666)555-6666,'','','','','','','','','','','bob','smith','bob smith','','','','TRUE','TRUE','TRUE','TRUE'
If all you want is a hdrlist of the recognized field names in the csv file being processed, you can create it by comparing the values in the DictReader.fieldnames attribute to the contents of fieldMap immediately after creating the DictReader, because constructing one without a fieldnames argument automatically reads in the header row of the file.
I also changed your fieldMap dictionary into an OrderedDict so it would preserve the order of the keys.
import glob
from collections import OrderedDict
import csv
import sys

fieldMap = OrderedDict([
    ('zipcode', ['zipcode', 'ZIPCODE', 'Zip5', 'zip9', 'ZIP', 'zip_code', 'zip']),
    ('firstname', ['firstname', 'FIRSTNAME']),
    ('lastname', ['lastname', 'LASTNAME']),
    ('cust_no', ['cust_no', 'CUST_NO']),
    ('user_name', ['user_name', 'USER_NAME']),
    ('status', ['status', 'STATUS']),
    ('cancel_date', ['cancel_date', 'CANCEL_DATE']),
    ('reject_date', ['reject_date', 'REJECT_DATE']),
    ('streetaddr', ['streetaddr', 'STREETADDR', 'ADDRESS', 'address']),
    ('streetno', ['streetno', 'STREETNO']),
    ('streetnm', ['streetnm', 'STREETNM']),
    ('suffix', ['suffix', 'SUFFIX']),  # suffix of street name: dr, ave, st
    ('city', ['city', 'CITY']),
    ('state', ['state', 'STATE']),
    ('phone_home', ['phone_home', 'PHONE_HOME']),
    ('email', ['email', 'EMAIL']),
])
def readFile(fn, ofp):
    with open(fn, 'rb') as csvfile:
        # the following reads the header line into csvReader.fieldnames
        csvReader = csv.DictReader(csvfile, dialect='excel', delimiter=',')
        # create a list of recognized fieldnames in the csv file
        hdrlist = []
        for ft in fieldMap:
            for ft_test in fieldMap[ft]:
                if ft_test in csvReader.fieldnames:
                    hdrlist.append(ft_test)
                    break
            else:
                hdrlist.append(None)  # placeholder (could also be '')
        hdrlen = len(hdrlist)
        ofp.write('hdrlist: {}\n'.format(hdrlist))
        if None in hdrlist:  # a placeholder means a field had no matching header
            print "Note that not all field names were present in file."
        ofp.write("\t".join(fieldMap) + '\n')
        for row in csvReader:
            fmtstring = "%s\t" * hdrlen % tuple(
                row[field] if field else 'NA' for field in hdrlist)
            ofp.write(fmtstring + '\n')
if __name__ == '__main__':
    # sys.argv = [sys.argv[0], 'ofp_input.csv']  # hardcode for testing
    if len(sys.argv) != 2:
        print "Error: Filename argument missing!"
        sys.exit(-1)
    filenames = glob.glob(sys.argv[1])
    ofp = sys.stdout
    for filename in filenames:
        readFile(filename, ofp)
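For example, run against the sample data above (the script name readcsv.py here is hypothetical):

python readcsv.py ofp_input.csv

The first output line shows the resolved hdrlist, with None placeholders for fields such as cancel_date that have no matching header in the sample file; those columns are then written as NA in every data row.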
Related
Create a function that takes a file name (and path if needed) as the argument. In the function, open and read in the file mountains.csv. Use a try/catch to be sure the file exists and is readable. If the file location is wrong or it can't be opened, print an error that begins with "Error:". (You can test it with a junk path or filename that doesn't exist.)
Split each line by the comma, and make a dictionary where the key is the mountain name (the first element) and the height is the value, the second element. Make sure to convert the height to a number. Then print the keys and values of the dictionary using .items(), in readable sentences that say, for instance, "The height of K2 is 8611 meters." Return the dictionary at the end of the function.
Reminder about print with {} in your string: use print(string.format(variable)) to fill in the {} with your variable. If there are 2 {}'s, use .format(var1, var2)
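For instance, with two placeholders:

msg = "The height of {} is {} meters."
print(msg.format("K2", 8611))  # The height of K2 is 8611 meters.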
This is what I got so far:
import csv

def mountain_height(filename):
    """ Read in a csv file of mountain names and heights.
        Parse the lines and print the names and heights.
        Return the data as a dictionary.
        The key is the mountain and the height is the value.
    """
    mountains = dict()
    msg = "The height of {} is {} meters."
    err_msg = "Error: File doesn't exist or is unreadable."
    # TYPE YOUR CODE HERE.
    with open('mountains.csv', 'r') as handle:
        reader = csv.reader(handle, delimiter=',')
        for row in reader:
            name = row[0]
            height = row[1]
            int(height)
            dictionary = {name: height}
            for k, v in dictionary.items():
                print(k, v)
            return dictionary
And there's the csv file:
You're nearly there. You simply need to add an entry to mountains for each iteration of the loop:
mountains = dict()

with open('mountains.csv', 'r') as handle:
    reader = csv.reader(handle, delimiter=',')
    for row in reader:
        name = row[0]
        height = row[1]
        mountains[name] = int(height)
Don't forget to check if the file exists! I added an extra check so the function works with or without ".csv" file extension specified.
You also want to print a nice string using msg.format(name, height)
Lastly don't return the dictionary inside of the for loop! This ends your function and you will only see one message printed out.
For bonus points you can use csv.DictReader to read CSV files more efficiently. If the CSV does not have a header row, you need to pass fieldnames (i.e. name, height) yourself.
from csv import DictReader

def mountain_height(filename):
    msg = "The height of {} is {} meters."
    err_msg = "Error: File doesn't exist or is unreadable."

    if filename.split('.')[-1] != 'csv':
        filename += '.csv'

    try:
        open(filename).close()
    except FileNotFoundError:
        print(err_msg)
        return  # bail out rather than trying to read a missing file

    with open(filename) as f:
        reader = DictReader(f, fieldnames=['name', 'height'], delimiter=',')
        mountain_heights = {
            row['name']: int(row['height']) for row in reader
        }
        for name, height in mountain_heights.items():
            print(msg.format(name, height))
        return mountain_heights
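Assuming mountains.csv contains a row like K2,8611 (the actual file contents weren't posted), a call would look like:

heights = mountain_height('mountains')  # '.csv' is appended automatically
# prints: The height of K2 is 8611 meters.
# heights == {'K2': 8611}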
I am trying to create a generic filter to split a file based on conditions from a YAML file.
My code works with Pandas, but since the environment doesn't have the Pandas module, I am trying to achieve the same thing with the csv library.
When I hard-code the value at q it works, but when I try to pass it from the config file it doesn't. I also want to apply multiple checks to the same column, like ('', 'Balance'), so that Asset rows go to one file and the ('', 'Balance') rows to another.
import sys
import yaml
import csv

def dynamicQuery(config_file, data_file, outputPath):
    """Loading Configuration file into dataframe"""
    try:
        with open(config_file) as file:
            doc = yaml.full_load(file)
    except Exception as err:
        print("Error Configuration data file: ", err)
    try:
        for k, v in doc.items():
            if k != 'column':
                filename = k
                k = doc[k]
                q = ' , '.join(f'{v} ' for q, v in k.items())
                q = '"' + str(strip(q)) + '"'
                print(q)  # -- "Asset"
                df = csv.reader(open(data_file), delimiter=',')
                df = filter(lambda x: (x[2] == q), df)  # Not working here
                # df = filter(lambda x: x[2] == "Asset", df) --> this is working
                csv.writer(open(filename + ".txt", 'w', newline=' '), delimiter=',').writerows(df)
                print("File is created for " + filename)
    except Exception as err:
        print("Error executing queries and saving output data file: ", err)

def main():
    if len(sys.argv) == 3:
        """File will be passed as parameter """
        config_file = sys.argv[1]
        data_file = sys.argv[2]
        dynamicQuery(config_file, data_file)
    else:
        usage()

def usage():
    print("Usage: python splitGenric.py config_file data_file ")

main()
Sample file
1233,ACV,Asset,sample
1235,ACV,Asset,sample
1232,ACV,Asset,sample
1234,ACV,Asset,sample
1237,ACV,,sample
1238,ACV,,sample
1234,ACV,Balance,sample
1254,ACV,Balance,sample
1244,ACV,Balance,sample
1264,ACV,Balance,sample
Config.yaml
Asset :
filter1: '"Asset"'
Balance:
filter1: '"Balance"'
filter2: '""'
The YAML configuration file format is not particularly convenient for this, and yaml is not a standard Python module; I would probably go for something like regular expressions instead of a YAML file. But just to sort out the immediate problem: you are mixing up Python syntax and literal quoting characters. You are assembling a string containing literal double quotes around Asset, for example, while your CSV file does not contain double quotes around this value; so you are effectively comparing 'Asset' == '"Asset"', which of course is False.
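A quick interactive session shows the mismatch, and the strip('"') remedy used below:

>>> 'Asset' == '"Asset"'
False
>>> '"Asset"'.strip('"')
'Asset'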
The following might not do exactly what you want, but should at least demonstrate a rough first cut of what I think you are trying to do here.
with open(config_file) as file:
    config = yaml.full_load(file)

filters = dict()
for k, v in config.items():
    handle = open(k + '.txt', 'w', newline='')
    writer = csv.writer(handle, delimiter=',')
    filt = {'handle': handle, 'writer': writer, 'conditions': []}
    for _, expr in v.items():
        filt['conditions'].append(expr.strip('"'))
    filters[k] = filt

with open(data_file) as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        for handle, conf in filters.items():
            for i in range(len(conf['conditions'])):
                if row[2] == conf['conditions'][i]:
                    conf['writer'].writerow(row)
                    break

for handle, conf in filters.items():
    conf['handle'].close()
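With the sample file and Config.yaml above, this should produce an Asset.txt containing the four Asset rows and a Balance.txt containing the four Balance rows plus the two rows whose third column is empty, since filter2 strips down to the empty string and matches them.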
I'm guessing you used pyyaml which seems to be the dominant YAML module for Python.
I tried to use the config.yaml, but I've got this error
File "C:\Users\XXXXXX\AppData\Local\Programs\Python\Python36-32\lib\site-packages\yaml\parser.py", line 439, in parse_block_mapping_key
"expected <block end>, but found %r" % token.id, token.start_mark)
yaml.parser.ParserError: while parsing a block mapping
in "config.yml", line 5, column 5
expected <block end>, but found ','
in "config.yml", line 5, column 17
But I will pretend it worked and the content was loaded in a dictionary, as it appears to be the intention.
The dictionary would be:
doc = {'Asset':'Asset','Balance':[' ','Balance']}
import csv
import pandas as pd

# load directly to dataframe
df = pd.read_csv('sample.txt', header=None)

handler = ''
for k, v in doc.items():
    kList = {k: []}  # making empty lists with k values
    if isinstance(v, str):  # Asset is string
        fil = v
    else:
        for i in range(len(v)):  # Balance is list of values
            if v[i]:
                fil = v[i]
            else:
                handler = k  # remember which file gets the null rows
    for types in df.values:
        if fil in types:
            kList[k].append(types)  # append types to corresponding list
    csv.writer(open(k + ".txt", 'a', newline='\n'), delimiter=',').writerows(kList[k])

if handler:  # there are null values
    nulls = df[df.isnull().any(axis=1)].values.tolist()
    csv.writer(open(handler + ".txt", 'a', newline='\n'), delimiter=',').writerows(nulls)
The result is two files, with the following contents:
Asset.txt:
1233,ACV,Asset,sample
1235,ACV,Asset,sample
1232,ACV,Asset,sample
1234,ACV,Asset,sample
Balance.txt:
1234,ACV,Balance,sample
1254,ACV,Balance,sample
1244,ACV,Balance,sample
1264,ACV,Balance,sample
1237,ACV,nan,sample
1238,ACV,nan,sample
I have a CSV file that looks something like this:
Date,Person,Time Out,Time Back,Restaurant,Calories,Delicious?
6/20/2016,August,11:58,12:45,Black Bear,850,Y
6/20/2016,Marcellus,12:00,12:30,Brought Lunch,,Y
6/20/2016,Jessica,11:30,12:30,Wendy's,815,N
6/21/2016,August,12:05,1:01,Brought Lunch,,Y
So far I have managed to print each row into a list of strings (ex. - ['Date', 'Person', 'Time Out', etc.] or ['6/20/2016', 'August', '11:58' etc.]).
Now I need to do 2 more things:
Add an ID header and sequential numeric string to each row (for ex. - ['ID', 'Date',
'Person', etc.] and ['1', '6/20/2016', 'August', etc.])
Separate each row so that they can be formatted into insert
statements rather than just having the program print out every single row one after another (for ex. - INSERT INTO Table ['ID', 'Date', 'Person', etc.] VALUES ['1', '6/20/2016', 'August', etc.])
Here is the code that has gotten me as far as I am now:
import csv
openFile = open('test.csv', 'r')
csvFile = csv.reader(openFile)
for row in csvFile:
print (row)
openFile.close()
Try this (I ignored the ID part since you can use the MySQL AUTO_INCREMENT):
import csv

openFile = open('test.csv', 'r')
csvFile = csv.reader(openFile)
header = next(csvFile)
headers = map((lambda x: '`'+x+'`'), header)
insert = 'INSERT INTO Table (' + ", ".join(headers) + ") VALUES "
for row in csvFile:
    values = map((lambda x: '"'+x+'"'), row)
    print(insert + "(" + ", ".join(values) + ");")
openFile.close()
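With the sample file above, the first printed statement looks like:

INSERT INTO Table (`Date`, `Person`, `Time Out`, `Time Back`, `Restaurant`, `Calories`, `Delicious?`) VALUES ("6/20/2016", "August", "11:58", "12:45", "Black Bear", "850", "Y");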
You can use these functions if you want to maintain type conversion; I have used them to put data into Google BigQuery with a SQL string statement.
PS: You can add other types to the function.
import csv

def convert(value):
    for type in [int, float]:
        try:
            return type(value)
        except ValueError:
            continue
    # All other types failed, so it is a string
    return value

def construct_string_sql(file_path, table_name, schema_name):
    string_SQL = ''
    try:
        with open(file_path, 'r') as file:
            reader = csv.reader(file)
            headers = ','.join(next(reader))
            for row in reader:
                row = [convert(x) for x in row].__str__()[1:-1]
                string_SQL += f'INSERT INTO {schema_name}.{table_name}({headers}) VALUES ({row});'
    except:
        return ''
    return string_SQL
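A call might look like this (the table and schema names are made up for illustration):

sql = construct_string_sql('test.csv', 'lunches', 'my_dataset')
print(sql)  # one INSERT statement per data row, concatenated into a single string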
You can use this open-source tool to generate batch INSERT statements: https://simranjitk.github.io/sql-converter/.
I need to get information from a list and add a year column derived from the file name. I'm still not sure how to add a 'year' field to each record. Can I use append?
And about the output file, I just need to use outputcsv.writerow(records), don't I?
This is the part of the code where I'm stuck:

filenames = ('babyQld2010.csv',
             'babyQld2011.csv',
             'babyQld2012.csv',
             'babyQld2012.csv',
             'babyQld2014.csv')

outFile = open('babyQldAll.csv','w')
csvFile_out = csv.writer(outFile, delimiter=',')
for filename in filenames:
    name, ext = filename.split('.')
    year = name[-4:]  # extract year from file names
    records = extract_names(filename)
    # Get (name, count, gender) from list "records",
    # and add value of "year" and write into output file (using "for" loop)
The output file should look like:
2010,Lola,69,Girl
For input, I have 5 files (babyQld2010.csv, babyQld2011.csv, babyQld2012.csv, babyQld2012.csv, babyQld2014.csv) which contain rows like:
Mia,425,William,493
I have to reformat each row, which I have already done and saved in the list 'records':
Lola,69,Girl
Now I need to add one field 'year' to the 'records' list and export the csv file.
This is my full code:
import csv

def extract_names(filename):
    ''' Extract babyname, count, gender from a csv file,
        and return the data in a list.
    '''
    inFile = open(filename, 'rU')
    csvFile = csv.reader(inFile, delimiter=',')

    # Initialization
    records = []
    rowNum = 0
    for row in csvFile:
        if rowNum != 0:
            # +++++ You code here ++++
            # Read each row of csv file and save information in list 'records'
            # as (name, count, gender)
            records.append([row[0], row[1], "Female"])
            records.append([row[2], row[3], "Male"])
            print('Process each row...')
        rowNum += 1
    inFile.close()
    return(records)

#### Start main program #####
filenames = ('babyQld2010.csv',
             'babyQld2011.csv',
             'babyQld2012.csv',
             'babyQld2012.csv',
             'babyQld2014.csv')

with open('babyQldAll.csv','w') as outFile:
    csvFile_out = csv.writer(outFile, delimiter=',')
    for filename in filenames:
        name, ext = filename.split('.')
        year = name.split('.')[0][-4:]  # extract year from file names
        records = extract_names(filename)
        for record in records:
            csvFile_out.write([year] + record)
        print("Write in csv file...")
    outFile.close()
To get the year from the csv file you can simply split the string at '.' and then take the last four characters from the first part of the split. Example -
>>> s = 'babyQld2010.csv'
>>> s.split('.')[0][-4:]
'2010'
Then simply iterate over your list of records (which you say is correct), and for each list within it use list concatenation to create a new list with the year at the start, and write that to the csv file.
I would also suggest that you use a with statement for opening the file to write to (and even in the function where you are reading from the other csv files). Example -
filenames = ('babyQld2010.csv',
             'babyQld2011.csv',
             'babyQld2012.csv',
             'babyQld2012.csv',
             'babyQld2014.csv')

with open('babyQldAll.csv','w') as outFile:
    csvFile_out = csv.writer(outFile, delimiter=',')
    for filename in filenames:
        name, ext = filename.split('.')
        year = name.split('.')[0][-4:]  # extract year from file names
        records = extract_names(filename)
        for record in records:
            csvFile_out.writerow([year] + record)
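For the sample input above, babyQldAll.csv then contains lines like:

2010,Lola,69,Girl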
Yes, you can just append the year column to each row as you read it in from your source files. You can read in & write out each row as a dictionary so that you can use your existing column headers to address the data if you need to massage it on the way through.
Using csv.DictWriter you specify your headers (fieldnames) when you set it up. You can then write them out with the writeheader() method.
import csv

file_list = ['babyQld2010.csv',
             'babyQld2011.csv',
             'babyQld2012.csv',
             'babyQld2012.csv',
             'babyQld2014.csv']

outFile = open('babyQldAll.csv', 'wb')
csv_writer = csv.DictWriter(outFile,
                            fieldnames=['name','count','gender','year'])
csv_writer.writeheader()

for a_file in file_list:
    name, ext = a_file.split('.')
    year = name[-4:]
    with open(a_file, 'rb') as inFile:
        # note: if the source files have no header row, pass the
        # fieldnames argument to DictReader as well
        csv_read_in = csv.DictReader(inFile)
        for row in csv_read_in:
            row['year'] = year
            csv_writer.writerow(row)

outFile.close()
Hope this helps.
I am trying to iterate through several CSV files in a directory and grab a particular cell (same cell location) from each CSV file (cell location found when opened in Excel) and then post all similar cells in a single CSV or xls file, one after the other.
I have written the code below (with some researched help), but I am just iterating over the first csv file in my list and printing the same value each time, dependent on the number of CSV files in my list. Could anybody point me in the right direction?
Here's my poor attempt!
import xlwt
import xlrd
import csv
import glob
import os

files = ['1_IQ_QTA.csv','2_IQ_QTA.csv','3_IQ_QTA.csv','4_IQ_QTA.csv']
n = 0
row = 0

filename = ('outputList.csv', 'a')
fname = files[n]

workbookr = xlrd.open_workbook(fname)
sheetr = workbookr.sheet_by_index(0)
workbookw = xlwt.Workbook()
sheetw = workbookw.add_sheet('test')

while n < len(files):
    fname = files[n]
    workbookr = xlrd.open_workbook(fname[n])
    data = [sheetr.cell_value(12, 1) for col in range(sheetr.ncols)]
    for index, value in enumerate(data):
        sheetw.write(row, index, value)
    workbookw.save('outputList.csv')
    row = row + 1
    n = n + 1

workbookw.save('outputList.csv')
My code is still a bit messy, I may have leftover code from my various attempts!
Thanks
MikG
Assuming you are just trying to make a CSV file of the same cell from each file: if you had 4 files, your output file will have 4 entries.
files = ['1_IQ_QTA.csv','2_IQ_QTA.csv','3_IQ_QTA.csv','4_IQ_QTA.csv']
n = 0
row = 0

outputfile = open('outputList.csv', 'w')

cellrow = 12     # collect the cell (12, 1) from each file and put it in the output list
cellcolumn = 1

while n < len(files):
    fname = files[n]
    currentfile = open(fname, 'r')
    for i in range(cellrow):
        currentrow = currentfile.readline()
    # print currentrow    # for testing
    columncnt = 0
    currentcell = ''
    openquote = False
    for char in currentrow:
        if char == '"' and not openquote:
            openquote = True
        elif char == '"' and openquote:
            openquote = False
        elif char == ',' and not openquote:
            columncnt += 1
            if columncnt == cellcolumn:
                cellvalue = currentcell
                # print cellvalue    # for testing
            currentcell = ''
        else:
            currentcell += char
    outputfile.write(cellvalue + ',')
    currentfile.close()
    n += 1

outputfile.close()
It seemed to me that since you already had a CSV it would be easier to deal with as a regular file and parse through to find the right information, plus nothing to import. Happy coding!
I think you have an error at this line in the while loop:
workbookr = xlrd.open_workbook(fname[n])
must be:
workbookr = xlrd.open_workbook(fname)
otherwise your workbookr remains as you set it before outside the loop:
fname = files[n]
workbookr = xlrd.open_workbook(fname)
which is the first file in your list.
Since they are just csv files, there is no need for the excel libraries.
#!/usr/bin/env python

import argparse, csv

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='merge csv files on field', version='%(prog)s 1.0')
    parser.add_argument('infile', nargs='+', type=str, help='list of input files')
    parser.add_argument('--col', type=int, default=0, help='Column to grab')
    parser.add_argument('--row', type=int, default=0, help='Row to grab')
    parser.add_argument('--out', type=str, default='temp.csv', help='name of output file')
    args = parser.parse_args()

    data = []
    for fname in args.infile:
        with open(fname, 'rb') as df:
            reader = csv.reader(df)
            for index, line in enumerate(reader):
                if index == args.row:
                    data.append(line[args.col])  # lists use append(); the option is stored as args.col
        del reader

    writer = csv.writer(open(args.out, "wb"), dialect='excel')
    writer.writerow(data)  # one row holding the grabbed cell from each input file
    del writer
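Saved as, say, merge.py (the name is arbitrary), it could be run as:

python merge.py 1_IQ_QTA.csv 2_IQ_QTA.csv 3_IQ_QTA.csv 4_IQ_QTA.csv --row 12 --col 1 --out outputList.csv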