I'm using the json_to_csv_converter code to convert the Yelp dataset to CSV files (available here: https://github.com/Yelp/dataset-examples/blob/master/json_to_csv_converter.py)
This code was originally written for Python 2, but I'm using Python 3. I've made some changes so it now works with Python 3, except that I'm getting b' preceding the strings (which indicates a byte sequence).
I've added encoding='utf-8' to convert it to a string, but my CSV file still shows the b''.
Example: business_id
b'7KPBkxAOEtb3QeIL9PEErg'
What do I need to change to make it write strings instead of bytes?
Thanks
# -*- coding: utf-8 -*-
"""Convert the Yelp Dataset Challenge dataset from json format to csv.

For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge
"""
import argparse
import collections
import csv
import json


def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Read in the json dataset file and write it out to a csv file, given the column names."""
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(list(column_names))
        with open(json_file_path, encoding='utf-8') as fin:
            for line in fin:
                line_contents = json.loads(line)
                csv_file.writerow(get_row(line_contents, column_names))


def get_superset_of_column_names_from_file(json_file_path):
    """Read in the json dataset file and return the superset of column names."""
    column_names = set()
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            line_contents = json.loads(line)
            column_names.update(
                set(get_column_names(line_contents).keys())
            )
    return column_names


def get_column_names(line_contents, parent_key=''):
    """Return a list of flattened key names given a dict.

    Example:
        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }
    will return: ['a.b', 'a.c']
    These will be the column names for the eventual csv file.
    """
    column_names = []
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        if isinstance(v, collections.MutableMapping):
            column_names.extend(
                get_column_names(v, column_name).items()
            )
        else:
            column_names.append((column_name, v))
    return dict(column_names)


def get_nested_value(d, key):
    """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.

    Example:
        d = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }
        key = 'a.b'
    will return: 2
    """
    if '.' not in key:
        if key not in d:
            return None
        return d[key]
    base_key, sub_key = key.split('.', 1)
    if base_key not in d:
        return None
    sub_dict = d[base_key]
    return get_nested_value(sub_dict, sub_key)


def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict."""
    row = []
    for column_name in column_names:
        line_value = get_nested_value(
            line_contents,
            column_name,
        )
        if isinstance(line_value, str):
            row.append('{0}'.format(line_value.encode('utf-8')))
        elif line_value is not None:
            row.append('{0}'.format(line_value))
        else:
            row.append('')
    return row


if __name__ == '__main__':
    """Convert a yelp dataset file from json to csv."""
    parser = argparse.ArgumentParser(
        description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
    )
    parser.add_argument(
        'json_file',
        type=str,
        help='The json file to convert.',
    )
    args = parser.parse_args()
    json_file = args.json_file
    csv_file = '{0}.csv'.format(json_file.split('.json')[0])
    column_names = get_superset_of_column_names_from_file(json_file)
    read_and_write_file(json_file, csv_file, column_names)
Just a guess:
if isinstance(line_value, str):
    row.append('{0}'.format(line_value.encode('utf-8')))
If the value is str you don't need to encode it in Python 3 - all strings in Python 3 are unicode. You probably should check if the value is an instance of bytes instead.
if isinstance(line_value, bytes):
    row.append('{0}'.format(line_value.decode('utf-8')))
[update]
No, that line is checking if it is string versus number... so str is correct – Luluperam
Are you sure? Let's say line_value is the string "foo":
line_value = 'foo'
Now try this:
>>> row = []
>>> if isinstance(line_value, str):
...     row.append('{0}'.format(line_value.encode('utf-8')))
...
>>> print(row)
["b'foo'"]
That is the source of your bytes literal in the CSV file. Now let's try the version I so kindly suggested before you dismissed it:
>>> line_value = b'foo'
>>> row = []
>>> if isinstance(line_value, bytes):
...     row.append('{0}'.format(line_value.decode('utf-8')))
...
>>> print(row)
['foo']
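Putting that together, a minimal sketch of what get_row could look like in Python 3 once the stray encode() is dropped (this is just the fix suggested above, not the official converter code):
def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict."""
    row = []
    for column_name in column_names:
        line_value = get_nested_value(line_contents, column_name)
        if line_value is not None:
            # str values need no encoding in Python 3; format() also
            # converts numbers to their string form.
            row.append('{0}'.format(line_value))
        else:
            row.append('')
    return row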
I am trying to extract tagged entities from a csv file using python. This file contains tagged entities in multiple columns of the csv file. I only want python to process one specific column. Can anybody show me how to do this?
This is my code:
from bs4 import BeautifulSoup
import csv
input_name = "file.csv" # File names for input and output
output_name = "entities.csv"
def incrementEntity(entity_string, dictionary):
    try:
        dictionary[entity_string] += 1
    except KeyError:
        dictionary[entity_string] = 1

def outputResults(dictionary, entity_type, f):
    for i in sorted(dictionary, key=dictionary.get, reverse=True):
        print i, '\t', entity_type, '\t', dictionary[i]
        f.writerow([i, entity_type, dictionary[i]])

try:
    f = open(input_name, 'r')
    soup = BeautifulSoup(f)
    f.close()
except IOError, message:
    print message
    raise ValueError("Input file could not be opened")
locations = {}
people = {}
orgs = {}
for i in soup.find_all():
    entity_name = i.get_text()
    entity_type = i.name
    if (entity_type == 'i-loc' or entity_type == 'b-loc'):
        incrementEntity(entity_name, locations)
    elif (entity_type == 'b-org' or entity_type == 'i-org'):
        incrementEntity(entity_name, orgs)
    elif (entity_type == 'b-per' or entity_type == 'i-per'):
        incrementEntity(entity_name, people)
    else:
        continue
output_file = open(output_name, 'w')
f = csv.writer(output_file)
print "Entity\t\tType\t\tCount"
print "------\t\t----\t\t-----"
f.writerow(["Entity", "Type", "Count"])
outputResults(locations, 'location', f)
outputResults(people, 'person', f)
outputResults(orgs, 'organization', f)
output_file.close()
By definition, a CSV is a file in which data is separated by commas. So all you have to do is use the .split() method of the string you are dealing with.
Example:
csvline = 'Joe,25,M'
age = csvline.split(',')[1]
I don't know exactly what kind of data you are trying to process, but since you are trying to use BeautifulSoup I will assume your CSV file contains plain HTML-like data in one of its columns AND that you want to join the data from that column across rows to process it with BeautifulSoup. That being the case, you could try something like:
f = open(input_name, 'r')
htmlstring = '\n'.join([line.split(',')[1] for line in f])
soup = BeautifulSoup(htmlstring)
f.close()
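Note that a bare split(',') will misparse any field that contains a comma inside quotes; if your data can do that, the stdlib csv module handles the quoting for you. A sketch of the same idea (the column index 1 is just an example):
import csv
from bs4 import BeautifulSoup

# Let the csv module parse each row so quoted commas inside fields
# don't break the column extraction.
with open(input_name, 'rb') as f:
    htmlstring = '\n'.join(row[1] for row in csv.reader(f))
soup = BeautifulSoup(htmlstring)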
The objective of this script is to take an incoming csv file, read it with a DictReader, take the keys that were read, see if they match any of the pre-designated values in the fieldMap dictionary, and if they do match, append those keys to my hdrlist. Then, write the header list to an output file called ofp.
The issue that I am having is that when I don't have a key that matches one of the pre-designated values in fieldMap, I need to insert a blank (' ').
I've tried appending blank values to the hdrlist in an else statement and having a blank key value pair in my fieldMap dictionary:
if row.has_key(ft_test):
    hdrlist.append(ft_test)
else:
    hdrlist.append('')
'':['']  # blank key:value pair
but then my:
if hdrlen != len(hdrlist)-1:
    print "Cannot find a key for %s in file %s" % (ft,fn)
error-handling statement prints more often than I think it should, and I'm not sure why.
If anyone can shed some light on how to insert a blank into my ofp.write(fmtstring), it would be greatly appreciated.
Also, if anyone could shed some light on why I get more print statements than I think I should with the above else statement, that would be greatly appreciated as well.
My whole script is below; if there is any other info needed to help me with this code, I will gladly provide it.
Here is a sample of an input file that would produce too many print statements.
input_file.csv = {'cust_no':1, 'streetaddr':'2103 Union Ave','address2':' ','city':'Chicago'}
#!/usr/bin/env python

import sys, csv, glob

fieldMap = {'zipcode':['Zip5', 'zip9','zipcode','ZIP','zip_code','zip','ZIPCODE'],
            'firstname':['firstname','FIRSTNAME'],
            'lastname':['lastname','LASTNAME'],
            'cust_no':['cust_no','CUST_NO'],
            'user_name':['user_name','USER_NAME'],
            'status':['status','STATUS'],
            'cancel_date':['cancel_date','CANCEL_DATE'],
            'reject_date':['REJECT_DATE','reject_date'],
            'streetaddr':['streetaddr','STREETADDR','ADDRESS','address'],
            'streetno':['streetno','STREETNO'],
            'streetnm':['streetnm','STREETNM'],
            'suffix':['suffix','SUFFIX'],  # suffix of street name: dr, ave, st
            'city':['city','CITY'],
            'state':['state','STATE'],
            'phone_home':['phone_home','PHONE_HOME'],
            'email':['email','EMAIL'],
            '':['']
            }

def readFile(fn, ofp):
    count = 0
    CSVreader = csv.DictReader(open(fn,'rb'), dialect='excel', delimiter=',')
    for row in CSVreader:
        count += 1
        if count == 1:
            hdrlist = []
            for ft in fieldMap.keys():
                hdrlen = len(hdrlist)
                for ft_test in fieldMap[ft]:
                    if row.has_key(ft_test):
                        hdrlist.append(ft_test)
                if hdrlen != len(hdrlist)-1:
                    print "Cannot find a key for %s in file %s" % (ft,fn)
            if len(hdrlist) != 16:
                print "Another error. Not all headers have been assigned new values."
        if count < 5:
            x = len(hdrlist)
            fmtstring = "%s\t" * len(hdrlist) % tuple(row[x] for x in hdrlist)
            ofp.write(fmtstring)
        break

if __name__ == '__main__':
    filenames = glob.glob(sys.argv[1])
    ofp = sys.stdout
    ofp.write("zipcode\tfirstname\tlastname\tcust_no\tuser_name\tstatus\t"
              "cancel_date\treject_date\tstreetaddr\tstreetno\tstreetnm\t"
              "suffix\tcity\tstate\tphone_home\temail")
    for filename in filenames:
        readFile(filename, ofp)
Sample data:
cust_no,status,streetaddr,address2,city,state,zipcode,billaddr,servaddr,title,latitude,longitude,custsize,telemarket,dirmail,nocredhold,email,phone_home,phone_work,phone_fax,phone_page,phone_cell,phone_othr,taxrate1,taxrate2,taxrate3,taxtot,company,firstname,lastname,user_name,dpbc,container,seq,paytype_am,paytype_di,paytype_mc,paytype_vi
0,0,'123 fake st.',,'chicago','il',60185,'123 billaddr st.','123 servaddr st.','mr.',43.123,54.234 ,2000,'TRUE','TRUE','TRUE','email#email.com',(666)555-6666,,,,,,,,,,,'bob','smith','bob smith',,,,'TRUE','TRUE','TRUE','TRUE'
0,0,'123 fake st.','','chicago','il',60185,'123 billaddr st.','123 servaddr st.','mr.',43.123,54.234 ,2000,'TRUE','TRUE','TRUE','email#email.com',(666)555-6666,'','','','','','','','','','','bob','smith','bob smith','','','','TRUE','TRUE','TRUE','TRUE'
If all you want is a hdrlist of the recognized field names in the csv file being processed, you can create it by comparing the values in the DictReader.fieldnames attribute to the contents of fieldMap immediately after creating the DictReader, because creating it automatically reads in the header row of the file.
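A quick illustration of that attribute (sample.csv is a hypothetical file):
import csv

# Touching .fieldnames right after construction consumes the header row.
with open('sample.csv', 'rb') as f:
    reader = csv.DictReader(f)
    print reader.fieldnames  # e.g. ['cust_no', 'status', 'streetaddr', ...]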
I also changed your fieldMap dictionary into an OrderedDict so it would preserve the order of the keys.
import glob
from collections import OrderedDict
import csv
import sys

fieldMap = OrderedDict([
    ('zipcode', ['zipcode', 'ZIPCODE', 'Zip5', 'zip9', 'ZIP', 'zip_code', 'zip']),
    ('firstname', ['firstname', 'FIRSTNAME']),
    ('lastname', ['lastname', 'LASTNAME']),
    ('cust_no', ['cust_no', 'CUST_NO']),
    ('user_name', ['user_name', 'USER_NAME']),
    ('status', ['status', 'STATUS']),
    ('cancel_date', ['cancel_date', 'CANCEL_DATE']),
    ('reject_date', ['reject_date', 'REJECT_DATE']),
    ('streetaddr', ['streetaddr', 'STREETADDR', 'ADDRESS', 'address']),
    ('streetno', ['streetno', 'STREETNO']),
    ('streetnm', ['streetnm', 'STREETNM']),
    ('suffix', ['suffix', 'SUFFIX']),  # suffix of street name: dr, ave, st
    ('city', ['city', 'CITY']),
    ('state', ['state', 'STATE']),
    ('phone_home', ['phone_home', 'PHONE_HOME']),
    ('email', ['email', 'EMAIL']),
])

def readFile(fn, ofp):
    with open(fn, 'rb') as csvfile:
        # the following reads the header line into csvReader.fieldnames
        csvReader = csv.DictReader(csvfile, dialect='excel', delimiter=',')
        # create a list of recognized fieldnames in the csv file
        hdrlist = []
        for ft in fieldMap:
            for ft_test in fieldMap[ft]:
                if ft_test in csvReader.fieldnames:
                    hdrlist.append(ft_test)
                    break
            else:
                hdrlist.append(None)  # placeholder (could also be '')
        hdrlen = len(hdrlist)
        ofp.write('hdrlist: {}\n'.format(hdrlist))
        if hdrlen != len(fieldMap):
            print "Note that not all field names were present in file."
        ofp.write("\t".join(fieldMap) + '\n')
        for row in csvReader:
            fmtstring = "%s\t" * hdrlen % tuple(
                row[field] if field else 'NA' for field in hdrlist)
            ofp.write(fmtstring + '\n')

if __name__ == '__main__':
    # sys.argv = [sys.argv[0], 'ofp_input.csv']  # hardcode for testing
    if len(sys.argv) != 2:
        print "Error: Filename argument missing!"
        sys.exit(-1)
    filenames = glob.glob(sys.argv[1])
    ofp = sys.stdout
    for filename in filenames:
        readFile(filename, ofp)
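The inner loop above leans on Python's for/else, which is easy to misread: the else branch runs only when the loop finishes without hitting break. A tiny standalone illustration (the names are made up):
header = ['cust_no', 'status', 'streetaddr']
for candidate in ['Zip5', 'zip9', 'zipcode']:
    if candidate in header:
        print candidate       # a match skips the else entirely
        break
else:
    print 'no match found'    # runs here, since nothing matched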
I have just completed a script that (sigh) finally works. It searches twitter for keywords. The results are written to a csv with 4 columns of keyword, Tweet, Lat, Lon (location). The code that I'm using is:
import tweepy
import csv

keywordList = ['McDonalds', 'Taco Bell', 'Burger King',]

for keyword in keywordList:
    result = tweepy.api.search(q=keyword, rpp=1000, page=2, geocode="34.085422,-117.900879,500mi")
    with open(r'C:\Temp\results.csv', 'a') as acsv:
        w = csv.writer(acsv)
        for tweet in result:
            lat, lon = tweet.geo if tweet.geo else ('', '')
            try:
                a = tweet.geo['coordinates']
                print a[0], a[1]
                print tweet.text
                w.writerow((keyword, tweet.text, a[0], a[1]))
            except:
                pass
I want to use the task manager or Python to run this search every 5 minutes, but it will rewrite duplicates. I was going to use the following code to remove duplicates, but two things happen: results2.csv is blank, and when I go to open the csv it is locked, so I have to view it read-only. I tried f1.close(), writer.close(), etc., but it says 'csv.reader' object has no attribute close.
My biggest concern is getting no duplicates, either by writing to the new csv or by somehow removing from and rewriting the same table on each search. Any suggestions are much appreciated!!
import csv

f1 = csv.reader(open(r'C:\Temp\results.csv', 'rb'))
writer = csv.writer(open(r'C:\Temp\results2.csv', 'wb'))
tweet = set()
for row in f1:
    if row[1] not in tweet:
        writer.writerow(row)
        tweet.add(row[1])
f1.close()
writer.close()
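For what it's worth, a minimal sketch of that dedup step with the files managed by with-blocks (csv.reader and csv.writer just wrap a file object and have no close() of their own, which is why the close() calls above fail and the file stays locked):
import csv

seen = set()
# The with-blocks close the underlying files for us, so results2.csv is
# flushed to disk and results.csv is not left open/locked.
with open(r'C:\Temp\results.csv', 'rb') as fin, \
        open(r'C:\Temp\results2.csv', 'wb') as fout:
    writer = csv.writer(fout)
    for row in csv.reader(fin):
        if row[1] not in seen:  # row[1] holds the tweet text
            writer.writerow(row)
            seen.add(row[1])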
Here's a refactored version:
Edit: unicode, what fun - I've added a .decode() call in read_csv() and an .encode() call in append_csv(); this should solve your problem (I think - you might need to decide on a string codec).
import tweepy
import csv
from collections import defaultdict
import time

FILE = 'c:/temp/results.csv'
KEYWORDS = ['McDonalds', 'Taco Bell', 'Burger King']
WHERE = "34.085422,-117.900879,500mi"
DELAY = 300  # seconds

def _float(s, err=None):
    try:
        return float(s)
    except ValueError:
        return err

def _str(f, err=""):
    return err if f is None else str(f)

def read_csv(fname=FILE):
    data = defaultdict(dict)
    with open(fname, 'rb') as inf:
        incsv = csv.reader(inf)
        for kw, tw, lat, lon in incsv:
            # added .decode() call to handle saved unicode chars
            data[kw][tw.decode()] = (_float(lat), _float(lon))
    return data

def append_csv(data, fname=FILE):
    with open(fname, "ab") as outf:
        outcsv = csv.writer(outf)
        # added .encode() call to handle saved unicode chars
        outcsv.writerows((kw, tw.encode(), _str(lat), _str(lon)) for kw, dat in data.iteritems() for tw, (lat, lon) in dat.iteritems())

def search_twitter(keywords=KEYWORDS, loc=WHERE):
    data = defaultdict(dict)
    for kw in keywords:
        for tweet in tweepy.api.search(q=kw, rpp=1000, page=2, geocode=loc):
            data[kw][tweet.text] = tweet.geo if tweet.geo else (None, None)
    return data

def calc_update(old_data, new_data):
    diff = defaultdict(dict)
    for kw, dat in new_data.iteritems():
        for tw, loc in dat.iteritems():
            if tw not in old_data[kw]:
                diff[kw][tw] = old_data[kw][tw] = loc
    return old_data, diff

def show_data(data):
    for kw, dat in data.iteritems():
        for tw, (lat, lon) in dat.iteritems():
            print("<{},{}> {} [{}]".format(_str(lat, "???"), _str(lon, "???"), tw, kw))

def main():
    data = read_csv()
    while True:
        new_data = search_twitter()
        data, diff = calc_update(data, new_data)
        append_csv(diff)
        show_data(diff)
        time.sleep(DELAY)

if __name__ == "__main__":
    main()
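For reference, the structure these functions pass around is a two-level mapping, keyword -> tweet text -> (lat, lon); a sketch of one entry (the values here are made up):
data = {
    'McDonalds': {
        u'Lunch at McDonalds today': (34.05, -118.24),  # hypothetical tweet
    },
}
calc_update() merges each fresh search into this structure and returns only the entries it has not seen before, which is what keeps the appended CSV free of duplicates.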