I'm using the json_to_csv_converter code to convert the Yelp dataset to CSV files (available here: https://github.com/Yelp/dataset-examples/blob/master/json_to_csv_converter.py)
This code was originally written for Python 2, but I'm using Python 3. I've made some changes so it now works with Python 3, except that I'm getting b' preceding the strings (which indicates a byte sequence).
I've added encoding='utf-8' to convert it to a string, but my CSV file still shows the b''.
Example: business_id
b'7KPBkxAOEtb3QeIL9PEErg'
What do I need to change to make it write strings instead of bytes?
Thanks
# -*- coding: utf-8 -*-
"""Convert the Yelp Dataset Challenge dataset from json format to csv.

For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge
"""
import argparse
import collections
import csv
import json


def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Read in the json dataset file and write it out to a csv file, given the column names."""
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(list(column_names))
        with open(json_file_path, encoding='utf-8') as fin:
            for line in fin:
                line_contents = json.loads(line)
                csv_file.writerow(get_row(line_contents, column_names))


def get_superset_of_column_names_from_file(json_file_path):
    """Read in the json dataset file and return the superset of column names."""
    column_names = set()
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            line_contents = json.loads(line)
            column_names.update(
                set(get_column_names(line_contents).keys())
            )
    return column_names


def get_column_names(line_contents, parent_key=''):
    """Return a list of flattened key names given a dict.

    Example:
        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }
    will return: ['a.b', 'a.c']
    These will be the column names for the eventual csv file.
    """
    column_names = []
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        if isinstance(v, collections.MutableMapping):
            column_names.extend(
                get_column_names(v, column_name).items()
            )
        else:
            column_names.append((column_name, v))
    return dict(column_names)


def get_nested_value(d, key):
    """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.

    Example:
        d = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }
        key = 'a.b'
    will return: 2
    """
    if '.' not in key:
        if key not in d:
            return None
        return d[key]
    base_key, sub_key = key.split('.', 1)
    if base_key not in d:
        return None
    sub_dict = d[base_key]
    return get_nested_value(sub_dict, sub_key)


def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict."""
    row = []
    for column_name in column_names:
        line_value = get_nested_value(
            line_contents,
            column_name,
        )
        if isinstance(line_value, str):
            row.append('{0}'.format(line_value.encode('utf-8')))
        elif line_value is not None:
            row.append('{0}'.format(line_value))
        else:
            row.append('')
    return row


if __name__ == '__main__':
    """Convert a yelp dataset file from json to csv."""
    parser = argparse.ArgumentParser(
        description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
    )
    parser.add_argument(
        'json_file',
        type=str,
        help='The json file to convert.',
    )
    args = parser.parse_args()
    json_file = args.json_file
    csv_file = '{0}.csv'.format(json_file.split('.json')[0])
    column_names = get_superset_of_column_names_from_file(json_file)
    read_and_write_file(json_file, csv_file, column_names)
Just a guess:
if isinstance(line_value, str):
    row.append('{0}'.format(line_value.encode('utf-8')))
If the value is str you don't need to encode it in Python 3 - all strings in Python 3 are unicode. You probably should check if the value is an instance of bytes instead.
if isinstance(line_value, bytes):
    row.append('{0}'.format(line_value.decode('utf-8')))
[update]
No, that line is checking if it is string versus number... so str is correct – Luluperam
Are you sure? Let's say line_value is the string "foo":
line_value = 'foo'
Now try this:
>>> row = []
>>> if isinstance(line_value, str):
...     row.append('{0}'.format(line_value.encode('utf-8')))
...
>>> print(row)
["b'foo'"]
That is the source of your bytes literal in the CSV file. Now let's try the version I so kindly suggested before you dismissed it:
>>> line_value = b'foo'
>>> row = []
>>> if isinstance(line_value, bytes):
...     row.append('{0}'.format(line_value.decode('utf-8')))
...
>>> print(row)
['foo']
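Putting that together, a minimal sketch of what get_row could look like in Python 3 once the stray encode() is dropped (this is just the fix suggested above, not the official converter code):
def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict."""
    row = []
    for column_name in column_names:
        line_value = get_nested_value(line_contents, column_name)
        if line_value is not None:
            # str values need no encoding in Python 3; format() also
            # converts numbers to their string form.
            row.append('{0}'.format(line_value))
        else:
            row.append('')
    return row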
I am trying to extract tagged entities from a csv file using python. This file contains tagged entities in multiple columns of the csv file. I only want python to process one specific column. Can anybody show me how to do this?
This is my code:
from bs4 import BeautifulSoup
import csv
input_name = "file.csv" # File names for input and output
output_name = "entities.csv"
def incrementEntity(entity_string, dictionary):
    try:
        dictionary[entity_string] += 1
    except KeyError:
        dictionary[entity_string] = 1

def outputResults(dictionary, entity_type, f):
    for i in sorted(dictionary, key=dictionary.get, reverse=True):
        print i, '\t', entity_type, '\t', dictionary[i]
        f.writerow([i, entity_type, dictionary[i]])

try:
    f = open(input_name, 'r')
    soup = BeautifulSoup(f)
    f.close()
except IOError, message:
    print message
    raise ValueError("Input file could not be opened")
locations = {}
people = {}
orgs = {}
for i in soup.find_all():
    entity_name = i.get_text()
    entity_type = i.name
    if (entity_type == 'i-loc' or entity_type == 'b-loc'):
        incrementEntity(entity_name, locations)
    elif (entity_type == 'b-org' or entity_type == 'i-org'):
        incrementEntity(entity_name, orgs)
    elif (entity_type == 'b-per' or entity_type == 'i-per'):
        incrementEntity(entity_name, people)
    else:
        continue
output_file = open(output_name, 'w')
f = csv.writer(output_file)
print "Entity\t\tType\t\tCount"
print "------\t\t----\t\t-----"
f.writerow(["Entity", "Type", "Count"])
outputResults(locations, 'location', f)
outputResults(people, 'person', f)
outputResults(orgs, 'organization', f)
output_file.close()
By definition, a CSV is a file in which data is separated by commas. So all you have to do is use the .split() method of the string you are dealing with.
Example:
csvline = 'Joe,25,M'
age = csvline.split(',')[1]
I don't know exactly what kind of data you are trying to process, but since you are trying to use BeautifulSoup I will assume your CSV file contains plain HTML-like data in one of its columns AND that you want to join the data from that column across rows to process it with BeautifulSoup. That being the case, you could try something like:
f = open(input_name, 'r')
htmlstring = '\n'.join([line.split(',')[1] for line in f])
soup = BeautifulSoup(htmlstring)
f.close()
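Note that a bare split(',') will misparse any field that contains a comma inside quotes; if your data can do that, the stdlib csv module handles the quoting for you. A sketch of the same idea (the column index 1 is just an example):
import csv
from bs4 import BeautifulSoup

# Let the csv module parse each row so quoted commas inside fields
# don't break the column extraction.
with open(input_name, 'rb') as f:
    htmlstring = '\n'.join(row[1] for row in csv.reader(f))
soup = BeautifulSoup(htmlstring)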
The objective of this script is to take an incoming csv file, read it with a DictReader, take the keys that were read, see if they match any of the pre-designated values in the fieldMap dictionary, and if they do match, append those keys to my hdrlist. Then, write the header list to an output file called ofp.
The issue that I am having is that when I don't have a key that matches one of the pre-designated values in fieldMap, I need to insert a blank (' ').
I've tried appending blank values to the hdrlist in an else statement and having a blank key value pair in my fieldMap dictionary:
if row.has_key(ft_test):
    hdrlist.append(ft_test)
else:
    hdrlist.append('')
'':['']  # blank key:value pair
but then my:
if hdrlen != len(hdrlist)-1:
    print "Cannot find a key for %s in file %s" % (ft,fn)
error-handling statement prints more often than I think it should, and I'm not sure why.
If anyone can shed some light on how to insert a blank into my ofp.write(fmtstring), it would be greatly appreciated.
Also, if anyone could shed some light on why I get more print statements than I think I should with the above else statement, that would be greatly appreciated as well.
My whole script is below; if there is any other info needed to help me with this code, I will gladly provide it.
Here is a sample of an input file that would produce too many print statements.
input_file.csv = {'cust_no':1, 'streetaddr':'2103 Union Ave','address2':' ','city':'Chicago'}
#!/usr/bin/env python

import sys, csv, glob

fieldMap = {'zipcode':['Zip5', 'zip9','zipcode','ZIP','zip_code','zip','ZIPCODE'],
            'firstname':['firstname','FIRSTNAME'],
            'lastname':['lastname','LASTNAME'],
            'cust_no':['cust_no','CUST_NO'],
            'user_name':['user_name','USER_NAME'],
            'status':['status','STATUS'],
            'cancel_date':['cancel_date','CANCEL_DATE'],
            'reject_date':['REJECT_DATE','reject_date'],
            'streetaddr':['streetaddr','STREETADDR','ADDRESS','address'],
            'streetno':['streetno','STREETNO'],
            'streetnm':['streetnm','STREETNM'],
            'suffix':['suffix','SUFFIX'],  # suffix of street name: dr, ave, st
            'city':['city','CITY'],
            'state':['state','STATE'],
            'phone_home':['phone_home','PHONE_HOME'],
            'email':['email','EMAIL'],
            '':['']
            }

def readFile(fn, ofp):
    count = 0
    CSVreader = csv.DictReader(open(fn,'rb'), dialect='excel', delimiter=',')
    for row in CSVreader:
        count += 1
        if count == 1:
            hdrlist = []
            for ft in fieldMap.keys():
                hdrlen = len(hdrlist)
                for ft_test in fieldMap[ft]:
                    if row.has_key(ft_test):
                        hdrlist.append(ft_test)
                if hdrlen != len(hdrlist)-1:
                    print "Cannot find a key for %s in file %s" % (ft,fn)
            if len(hdrlist) != 16:
                print "Another error. Not all headers have been assigned new values."
        if count < 5:
            x = len(hdrlist)
            fmtstring = "%s\t" * len(hdrlist) % tuple(row[x] for x in hdrlist)
            ofp.write(fmtstring)
        break

if __name__ == '__main__':
    filenames = glob.glob(sys.argv[1])
    ofp = sys.stdout
    ofp.write("zipcode\tfirstname\tlastname\tcust_no\tuser_name\tstatus\t"
              "cancel_date\treject_date\tstreetaddr\tstreetno\tstreetnm\t"
              "suffix\tcity\tstate\tphone_home\temail")
    for filename in filenames:
        readFile(filename, ofp)
Sample data:
cust_no,status,streetaddr,address2,city,state,zipcode,billaddr,servaddr,title,latitude,longitude,custsize,telemarket,dirmail,nocredhold,email,phone_home,phone_work,phone_fax,phone_page,phone_cell,phone_othr,taxrate1,taxrate2,taxrate3,taxtot,company,firstname,lastname,user_name,dpbc,container,seq,paytype_am,paytype_di,paytype_mc,paytype_vi
0,0,'123 fake st.',,'chicago','il',60185,'123 billaddr st.','123 servaddr st.','mr.',43.123,54.234 ,2000,'TRUE','TRUE','TRUE','email#email.com',(666)555-6666,,,,,,,,,,,'bob','smith','bob smith',,,,'TRUE','TRUE','TRUE','TRUE'
0,0,'123 fake st.','','chicago','il',60185,'123 billaddr st.','123 servaddr st.','mr.',43.123,54.234 ,2000,'TRUE','TRUE','TRUE','email#email.com',(666)555-6666,'','','','','','','','','','','bob','smith','bob smith','','','','TRUE','TRUE','TRUE','TRUE'
If all you want is a hdrlist of the recognized field names in the csv file being processed, you can create it by comparing the values in the DictReader.fieldnames attribute to the contents of fieldMap immediately after creating the DictReader, because creating it automatically reads in the header row of the file.
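A quick illustration of that attribute (sample.csv is a hypothetical file):
import csv

# Touching .fieldnames right after construction consumes the header row.
with open('sample.csv', 'rb') as f:
    reader = csv.DictReader(f)
    print reader.fieldnames  # e.g. ['cust_no', 'status', 'streetaddr', ...]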
I also changed your fieldMap dictionary into an OrderedDict so it would preserve the order of the keys.
import glob
from collections import OrderedDict
import csv
import sys

fieldMap = OrderedDict([
    ('zipcode', ['zipcode', 'ZIPCODE', 'Zip5', 'zip9', 'ZIP', 'zip_code', 'zip']),
    ('firstname', ['firstname', 'FIRSTNAME']),
    ('lastname', ['lastname', 'LASTNAME']),
    ('cust_no', ['cust_no', 'CUST_NO']),
    ('user_name', ['user_name', 'USER_NAME']),
    ('status', ['status', 'STATUS']),
    ('cancel_date', ['cancel_date', 'CANCEL_DATE']),
    ('reject_date', ['reject_date', 'REJECT_DATE']),
    ('streetaddr', ['streetaddr', 'STREETADDR', 'ADDRESS', 'address']),
    ('streetno', ['streetno', 'STREETNO']),
    ('streetnm', ['streetnm', 'STREETNM']),
    ('suffix', ['suffix', 'SUFFIX']),  # suffix of street name: dr, ave, st
    ('city', ['city', 'CITY']),
    ('state', ['state', 'STATE']),
    ('phone_home', ['phone_home', 'PHONE_HOME']),
    ('email', ['email', 'EMAIL']),
])

def readFile(fn, ofp):
    with open(fn, 'rb') as csvfile:
        # the following reads the header line into csvReader.fieldnames
        csvReader = csv.DictReader(csvfile, dialect='excel', delimiter=',')
        # create a list of recognized fieldnames in the csv file
        hdrlist = []
        for ft in fieldMap:
            for ft_test in fieldMap[ft]:
                if ft_test in csvReader.fieldnames:
                    hdrlist.append(ft_test)
                    break
            else:
                hdrlist.append(None)  # placeholder (could also be '')
        hdrlen = len(hdrlist)
        ofp.write('hdrlist: {}\n'.format(hdrlist))
        if hdrlen != len(fieldMap):
            print "Note that not all field names were present in file."
        ofp.write("\t".join(fieldMap) + '\n')
        for row in csvReader:
            fmtstring = "%s\t" * hdrlen % tuple(
                row[field] if field else 'NA' for field in hdrlist)
            ofp.write(fmtstring + '\n')

if __name__ == '__main__':
    # sys.argv = [sys.argv[0], 'ofp_input.csv']  # hardcode for testing
    if len(sys.argv) != 2:
        print "Error: Filename argument missing!"
        sys.exit(-1)
    filenames = glob.glob(sys.argv[1])
    ofp = sys.stdout
    for filename in filenames:
        readFile(filename, ofp)
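The inner loop above leans on Python's for/else, which is easy to misread: the else branch runs only when the loop finishes without hitting break. A tiny standalone illustration (the names are made up):
header = ['cust_no', 'status', 'streetaddr']
for candidate in ['Zip5', 'zip9', 'zipcode']:
    if candidate in header:
        print candidate       # a match skips the else entirely
        break
else:
    print 'no match found'    # runs here, since nothing matched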
I have just completed a script that (sigh) finally works. It searches twitter for keywords. The results are written to a csv with 4 columns of keyword, Tweet, Lat, Lon (location). The code that I'm using is:
import tweepy
import csv

keywordList = ['McDonalds', 'Taco Bell', 'Burger King',]

for keyword in keywordList:
    result = tweepy.api.search(q=keyword, rpp=1000, page=2, geocode="34.085422,-117.900879,500mi")
    with open(r'C:\Temp\results.csv', 'a') as acsv:
        w = csv.writer(acsv)
        for tweet in result:
            lat, lon = tweet.geo if tweet.geo else ('', '')
            try:
                a = tweet.geo['coordinates']
                print a[0], a[1]
                print tweet.text
                w.writerow((keyword, tweet.text, a[0], a[1]))
            except:
                pass
I want to use the task manager or Python to run this search every 5 minutes, but it will rewrite duplicates. I was going to use the following code to remove duplicates, but two things happen: results2.csv is blank, and when I go to open the csv it is locked, so I have to view it read-only. I tried f1.close(), writer.close(), etc., but it says 'csv.reader' object has no attribute close.
My biggest concern is getting no duplicates, either by writing to the new csv or by somehow removing from and rewriting the same table on each search. Any suggestions are much appreciated!!
import csv

f1 = csv.reader(open(r'C:\Temp\results.csv', 'rb'))
writer = csv.writer(open(r'C:\Temp\results2.csv', 'wb'))
tweet = set()
for row in f1:
    if row[1] not in tweet:
        writer.writerow(row)
        tweet.add(row[1])
f1.close()
writer.close()
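For what it's worth, a minimal sketch of that dedup step with the files managed by with-blocks (csv.reader and csv.writer just wrap a file object and have no close() of their own, which is why the close() calls above fail and the file stays locked):
import csv

seen = set()
# The with-blocks close the underlying files for us, so results2.csv is
# flushed to disk and results.csv is not left open/locked.
with open(r'C:\Temp\results.csv', 'rb') as fin, \
        open(r'C:\Temp\results2.csv', 'wb') as fout:
    writer = csv.writer(fout)
    for row in csv.reader(fin):
        if row[1] not in seen:  # row[1] holds the tweet text
            writer.writerow(row)
            seen.add(row[1])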
Here's a refactored version:
Edit: unicode, what fun - I've added a .decode() call in read_csv() and an .encode() call in append_csv(); this should solve your problem (I think - you might need to decide on a string codec).
import tweepy
import csv
from collections import defaultdict
import time

FILE = 'c:/temp/results.csv'
KEYWORDS = ['McDonalds', 'Taco Bell', 'Burger King']
WHERE = "34.085422,-117.900879,500mi"
DELAY = 300  # seconds

def _float(s, err=None):
    try:
        return float(s)
    except ValueError:
        return err

def _str(f, err=""):
    return err if f is None else str(f)

def read_csv(fname=FILE):
    data = defaultdict(dict)
    with open(fname, 'rb') as inf:
        incsv = csv.reader(inf)
        for kw, tw, lat, lon in incsv:
            # added .decode() call to handle saved unicode chars
            data[kw][tw.decode()] = (_float(lat), _float(lon))
    return data

def append_csv(data, fname=FILE):
    with open(fname, "ab") as outf:
        outcsv = csv.writer(outf)
        # added .encode() call to handle saved unicode chars
        outcsv.writerows((kw, tw.encode(), _str(lat), _str(lon)) for kw, dat in data.iteritems() for tw, (lat, lon) in dat.iteritems())

def search_twitter(keywords=KEYWORDS, loc=WHERE):
    data = defaultdict(dict)
    for kw in keywords:
        for tweet in tweepy.api.search(q=kw, rpp=1000, page=2, geocode=loc):
            data[kw][tweet.text] = tweet.geo if tweet.geo else (None, None)
    return data

def calc_update(old_data, new_data):
    diff = defaultdict(dict)
    for kw, dat in new_data.iteritems():
        for tw, loc in dat.iteritems():
            if tw not in old_data[kw]:
                diff[kw][tw] = old_data[kw][tw] = loc
    return old_data, diff

def show_data(data):
    for kw, dat in data.iteritems():
        for tw, (lat, lon) in dat.iteritems():
            print("<{},{}> {} [{}]".format(_str(lat, "???"), _str(lon, "???"), tw, kw))

def main():
    data = read_csv()
    while True:
        new_data = search_twitter()
        data, diff = calc_update(data, new_data)
        append_csv(diff)
        show_data(diff)
        time.sleep(DELAY)

if __name__ == "__main__":
    main()
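For reference, the structure these functions pass around is a two-level mapping, keyword -> tweet text -> (lat, lon); a sketch of one entry (the values here are made up):
data = {
    'McDonalds': {
        u'Lunch at McDonalds today': (34.05, -118.24),  # hypothetical tweet
    },
}
calc_update() merges each fresh search into this structure and returns only the entries it has not seen before, which is what keeps the appended CSV free of duplicates.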