Related
I need to strip the whitespace from a CSV file that I read
import csv
aList=[]
# NOTE(review): fragment from a method -- relies on self.filename and an
# enclosing function for the bare 'return' to be valid.
with open(self.filename, 'r') as f:
reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE)
for row in reader:
aList.append(row)
# I need to strip the extra white space from each string in the row
return(aList)
There's also the embedded formatting parameter: skipinitialspace (the default is false)
http://docs.python.org/2/library/csv.html#csv-fmt-params
aList=[]
with open(self.filename, 'r') as f:
reader = csv.reader(f, skipinitialspace=False,delimiter=',', quoting=csv.QUOTE_NONE)
for row in reader:
aList.append(row)
return(aList)
In my case, I only cared about stripping the whitespace from the field names (aka column headers, aka dictionary keys), when using csv.DictReader.
Create a class based on csv.DictReader, and override the fieldnames property to strip out the whitespace from each field name (aka column header, aka dictionary key).
Do this by getting the regular list of fieldnames, and then iterating over it while creating a new list with the whitespace stripped from each field name, and setting the underlying _fieldnames attribute to this new list.
import csv
class DictReaderStrip(csv.DictReader):
    """A csv.DictReader whose field names have surrounding whitespace removed."""

    @property  # was '#property': the decorator was mangled, so the override never applied
    def fieldnames(self):
        """Return the header fields with leading/trailing whitespace stripped."""
        if self._fieldnames is None:
            # Let the base class read the header row and populate _fieldnames.
            # (Python 2's DictReader was an old-style class, so super() was
            # unavailable; calling the property's fget directly works on both.)
            csv.DictReader.fieldnames.fget(self)
            if self._fieldnames is not None:
                self._fieldnames = [name.strip() for name in self._fieldnames]
        return self._fieldnames
with open(self.filename, 'r') as f:
reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE)
# Strip every cell while materialising all rows in a single pass.
return [[x.strip() for x in row] for row in reader]
You can do:
aList.append([element.strip() for element in row])
The most memory-efficient method to format the cells after parsing is through generators. Something like:
with open(self.filename, 'r') as f:
reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE)
for row in reader:
# Lazily yield a generator of stripped cells per row (memory-efficient).
# NOTE(review): callers must consume each row generator before the 'with'
# block closes the file.
yield (cell.strip() for cell in row)
But it may be worth moving it to a function that you can use to keep munging and to avoid forthcoming iterations. For instance:
# Cell values that should be converted to None.
nulls = {'NULL', 'null', 'None', ''}

def clean(reader):
    """Lazily yield rows from *reader* with each cell stripped and
    null-like cells (see ``nulls``) replaced by None."""
    def clean_row(row):
        # Renamed from 'clean' -- the original inner generator shadowed the
        # enclosing function's name.
        for cell in row:
            cell = cell.strip()
            yield None if cell in nulls else cell
    for row in reader:
        yield clean_row(row)
Or it can be used to factorize a class:
def factory(reader):
    """Consume the header row of *reader*, then yield each remaining row as a
    dict keyed by the header fields, with cells stripped and null-like values
    (module-level ``nulls``) replaced by None."""
    fields = next(reader)
    def clean_row(row):
        # Renamed from 'clean' to avoid shadowing/confusion with the sibling
        # generator of that name.
        for cell in row:
            cell = cell.strip()
            yield None if cell in nulls else cell
    for row in reader:
        yield dict(zip(fields, clean_row(row)))
You can create a wrapper object around your file that strips away the spaces before the CSV reader sees them. This way, you can even use the csv file with csv.DictReader.
import re
class CSVSpaceStripper:
# Python 2 file-like wrapper: strips whitespace around the ';' delimiter
# before csv.reader / csv.DictReader sees each line.
def __init__(self, filename):
self.fh = open(filename, "r")
# NOTE(review): the patterns should be raw strings (r"\s*;\s*") to avoid
# invalid-escape warnings on modern Pythons.
self.surroundingWhiteSpace = re.compile("\s*;\s*")
self.leadingOrTrailingWhiteSpace = re.compile("^\s*|\s*$")
def close(self):
self.fh.close()
self.fh = None
def __iter__(self):
return self
# Python 2 iterator protocol ('next'); Python 3 requires '__next__' -- a
# Python 3 adaptation appears later in this file.
def next(self):
line = self.fh.next()
line = self.surroundingWhiteSpace.sub(";", line)
line = self.leadingOrTrailingWhiteSpace.sub("", line)
return line
Then use it like this:
o = csv.reader(CSVSpaceStripper(filename), delimiter=";")
o = csv.DictReader(CSVSpaceStripper(filename), delimiter=";")
I hardcoded ";" to be the delimiter. Generalising the code to any delimiter is left as an exercise to the reader.
Read a CSV (or Excel file) using Pandas and trim it using this custom function.
#Definition for stripping whitespace
def trim(dataset):
    """Return a copy of *dataset* (a DataFrame) with .strip() applied to every
    str cell; non-string cells pass through unchanged."""
    # Renamed the lambda so it no longer shadows the function's own name,
    # and use isinstance rather than a 'type(x) is str' identity check.
    strip_cell = lambda x: x.strip() if isinstance(x, str) else x
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; prefer
    # the new name when available to avoid the deprecation warning.
    mapper = getattr(dataset, "map", None) or dataset.applymap
    return mapper(strip_cell)
You can now apply trim(CSV/Excel) to your code like so (as part of a loop, etc.)
dataset = trim(pd.read_csv(dataset))
dataset = trim(pd.read_excel(dataset))
and here is Daniel Kullmann's excellent solution, adapted to Python 3:
import re
class CSVSpaceStripper:
    """File-like iterator that strips whitespace around the delimiter in each line.

    NB: the delimiter ";" is hardcoded.
    Feed an instance to csv.reader / csv.DictReader in place of the file object.
    """
    def __init__(self, filename):
        self.fh = open(filename, "r")
        self.surroundingWhiteSpace = re.compile(r"\s*;\s*")
        self.leadingOrTrailingWhiteSpace = re.compile(r"^\s*|\s*$")
    def close(self):
        """Close the underlying file handle."""
        self.fh.close()
        self.fh = None
    def __iter__(self):
        return self
    def __next__(self):
        line = self.fh.readline()
        if not line:
            # readline() returns '' at EOF; the original kept returning ''
            # forever, so iteration never terminated. Signal exhaustion.
            raise StopIteration
        line = self.surroundingWhiteSpace.sub(";", line)
        line = self.leadingOrTrailingWhiteSpace.sub("", line)
        return line
I figured out a very simple solution:
import csv
with open('filename.csv') as f:
reader = csv.DictReader(f)
# Strip both keys and values of every row in one comprehension.
# NOTE(review): assumes no extra columns -- with a restkey/restval, v could
# be a list or None and v.strip() would raise AttributeError.
rows = [ { k.strip(): v.strip() for k,v in row.items() } for row in reader ]
The following code may help you:
import pandas as pd
# Use a raw string for the regex separator: '\s' in a plain literal is an
# invalid escape sequence on modern Pythons. A regex sep requires the
# (slower) python engine, hence engine='python'.
aList = pd.read_csv(r'filename.csv', sep=r'\s*,\s*', engine='python')
I have a csv file where each record is a LinkedIn contact. I have to create another csv file containing only the contacts reached after a specific date (e.g. all the contacts that connected to me after 1/04/2017).
So this is my implementation:
def import_from_csv(file):
    """Load LinkedIn contacts from *file* as a list of OrderedDicts whose keys
    follow a fixed column order, parsing the ConnectedOn column."""
    key_order = ("FirstName", "LastName", "EmailAddress", "Company", "ConnectedOn")
    linkedin_contacts = []
    with open(file, encoding="utf8") as csvfile:
        for row in csv.DictReader(csvfile, delimiter=','):
            # Pull the wanted columns, then overwrite ConnectedOn with its
            # parsed form before freezing the key order.
            record = {key: row[key] for key in key_order}
            record["ConnectedOn"] = parser.parse(row["ConnectedOn"])
            linkedin_contacts.append(OrderedDict((k, record[k]) for k in key_order))
    return linkedin_contacts
The first script gives me a list of ordered dicts. I don't know if the way I used to achieve the correct order is good; also, looking at some examples (like here), I'm not using the od.update method, but I don't think I need it — is that correct?
Now i wrote a second function to filter the list:
def filter_by_date(connections):
    """Return only the connections made strictly after 1 April 2017."""
    target_date = parser.parse("01/04/2017")
    # Comprehension instead of the append loop -- same rows, same order.
    return [entry for entry in connections if entry["ConnectedOn"] > target_date]
Am I doing this correctly?
Is there a way to optimize the code? Thanks
First point: you don't need the OrderedDict at all, just use a csv.DictWriter to write the filtered csv.
fieldnames = ("FirstName","LastName","EmailAddress","Company","ConnectedOn")
# NOTE(review): "wb" is Python 2 style; on Python 3 use open(..., "w", newline="").
with open("/apth/to/final.csv", "wb") as f:
writer = csv.DictWriter(f, fieldnames)
writer.writeheader()
writer.writerows(filtered_contacts)
Second point: you don't need to create a new dict from the one yielded by the csv reader, just update the ConnectedOn key in place :
def import_from_csv(file):
    """Load contacts from *file*, parsing the ConnectedOn column in place on
    each row dict yielded by DictReader."""
    linkedin_contacts = []
    with open(file, encoding="utf8") as csvfile:
        for row in csv.DictReader(csvfile, delimiter=','):
            # Mutate the reader's dict directly rather than copying it.
            row["ConnectedOn"] = parser.parse(row["ConnectedOn"])
            linkedin_contacts.append(row)
    return linkedin_contacts
And finally, if all you have to do is take the source csv, filter out records on ConnectedOn and write the result, you don't need to load the whole source in memory, create a filtered list (in memory again) and write the filtered list, you can stream the whole operation:
def filter_csv(source_path, dest_path, date):
    """Stream *source_path*, writing only the rows whose ConnectedOn is after
    *date* into *dest_path* -- no intermediate list in memory."""
    fieldnames = ("FirstName", "LastName", "EmailAddress", "Company", "ConnectedOn")
    target = parser.parse(date)
    # NOTE(review): "rb"/"wb" is Python 2 style; on Python 3 open the files in
    # text mode with newline="" for the csv module.
    with open(source_path, "rb") as source, open(dest_path, "wb") as dest:
        reader = csv.DictReader(source)
        writer = csv.DictWriter(dest, fieldnames)
        # if you want a header line with the fieldnames - else comment it out
        writer.writeheader()  # was writeheaders(), a method that does not exist
        for row in reader:
            row_date = parser.parse(row["ConnectedOn"])
            if row_date > target:
                writer.writerow(row)
And here you are, plain and simple.
NB : I don't know what "parser.parse()" is but as others answers mention, you'd probably be better using the datetime module instead.
For filtering you could use filter() function:
def filter_by_date(connections):
    """Return the connections made strictly after 1 April 2017."""
    # The format must be day/month/year to match "01/04/2017"; the original
    # '%Y/%m/%d' raised ValueError on this literal.
    target_date = datetime.strptime("01/04/2017", '%d/%m/%Y').date()
    return list(filter(lambda x: x["ConnectedOn"] > target_date, connections))
And instead of creating simple dict and then fill its values into OrderedDict you could write values directly to the OrderedDict:
for row in reader:
od = OrderedDict()
od["FirstName"] = row["FirstName"]
od["LastName"] = row["LastName"]
od["EmailAddress"] = row["EmailAddress"]
od["Company"] = row["Company"]
# NOTE(review): '%Y/%m/%d' expects year-first data; if ConnectedOn looks like
# "01/04/2017" (as elsewhere in this thread) this raises ValueError -- confirm
# the actual column format.
od["ConnectedOn"] = datetime.strptime(row["ConnectedOn"], '%Y/%m/%d').date()
linkedin_contacts.append(od)
If you know date format you don't need python_dateutil, you could use built-in datetime.datetime.strptime() with needed format.
Because you don't specify the format string.
Use :
from datetime import datetime
# Day/month/year pattern matching "01/04/2017".
# NOTE(review): the name 'format' shadows the builtin of the same name.
format = '%d/%m/%Y'
date_text = '01/04/2017'
# inverse by datetime.strftime(format)
datetime.strptime(date_text, format)
#....
# with format as global
for row in reader:
    od = OrderedDict()
    od["FirstName"] = row["FirstName"]
    od["LastName"] = row["LastName"]
    od["EmailAddress"] = row["EmailAddress"]
    od["Company"] = row["Company"]
    # was a bare strptime(...) -- a NameError, since only 'datetime' is
    # imported; the function lives at datetime.strptime.
    od["ConnectedOn"] = datetime.strptime(row["ConnectedOn"], format)
    linkedin_contacts.append(od)
Do:
def filter_by_date(connections, date_text):
    """Return connections made strictly after *date_text* (parsed with the
    module-level ``format`` pattern)."""
    target_date = datetime.strptime(date_text, format)
    # was 'target_dat' -- a NameError at runtime
    return [x for x in connections if x["ConnectedOn"] > target_date]
I'm trying to write a program that changes an open file, and I need to add a new column to the printed output.
The open txt file looks like this (I use "_" to represent blanks):
Name_____Height(m)_____Weight(kg)
Bill________1.58__________58
Mary_____1.65__________43
...
And now I want to add a new row like this:
Name_____Height(m)_____Weight(kg)_____Age(year)<---The new vertical line
Bill________1.58__________58_____________15
Mary_____1.65__________43_____________17
And my code it's:
# Use a context manager so the file is closed even if read() raises.
with open("file.txt", "r") as data_file:
    print(data_file.read())
So, how could I add another vertical line in the open file? Moreover, If I want to add more rows, how can I do this?
One more thing, I use the python 3.5
I wrote a little class to do everything you asked for and more. Implementation examples are done at the bottom. Let me know if this works for you.
class Feed(object):
# Dict-backed view of a delimited text file: loads rows keyed by a naming
# column, supports adding items/keys, and writes the feed back out.
# NOTE(review): this is Python 2 code (iteritems, str writes to a 'wb' file);
# it will not run unmodified on Python 3.
def __init__(self, file_name, sep, naming_convention=None):
self.file_name = file_name
self.feed_item_naming = naming_convention
self.sep = sep
self.feed = self.load_feed()
def get_head(self, file=None):#lmao...
'''
Get the header
'''
if not file:
head = open(self.file_name).readline().split(self.sep)
else:
head = file[0].split(self.sep)
return head
def __repr__(self):
return repr(self.feed)
def load_feed(self):
'''
load a feed object
set the key of each item to the naming convention
if we have multiple item names we increment the name bill becomes bill_2 and then bill_3 etc...
'''
#first we open the file and grab the headers
file = [x.rstrip() for x in open(self.file_name).readlines()]
self.header = self.get_head(file)
if not self.feed_item_naming and self.feed_item_naming not in self.header:
self.feed_item_naming = self.header[0]
data = {}
for line in file[1:]:
if line != '':
line = line.split(self.sep)
pos = line[self.header.index(self.feed_item_naming)]
while pos in data:
try:
# NOTE(review): 'ending' is an int, so str.replace() raises TypeError here,
# and the return value is discarded anyway (strings are immutable) -- in
# practice every duplicate falls through to the '_2' branch below.
ending = int(pos[-1])+1
pos.replace(pos[-1], ending)
except:
pos = pos+'_2'
data[pos] = {}
for item in self.header:
data[pos][item] = line[self.header.index(item)]
return data
def unload_feed(self, file_name=None, sep=None):
'''
write the modified feed back out to a data file
'''
if not file_name:
file_name = self.file_name
if not sep:
sep = self.sep
# NOTE(review): 'wb' with str writes is Python-2 only; open in text mode on
# Python 3.
with open(file_name, 'wb') as file:
for i in self.header:
if i != self.header[-1]:
file.write(i+sep)
else:
file.write(i)
file.write('\n')
for i in self.feed:
for x in self.header:
if x != self.header[-1]:
file.write(str(self.feed[i][x])+sep)
else:
file.write(str(self.feed[i][x]))
file.write('\n')
def add_key(self, key, default_value=None):
'''
Add a key to each of the items
'''
if key not in self.header:
for i in self.feed:
self.feed[i][key]=default_value
self.header.append(key)
def get_key_value(self, item, key):
'''
get the value for the items key
'''
return self.feed[item][key]
def get_item(self, item):
'''
get an individual item
'''
return self.feed[item]
def set_key_value(self, item, key, value):
'''
set the value of each items key
{item:{key:value, key:value}, item...etc}
'''
self.feed[item][key] = value
def set_key_values(self, item, key_value_dict):
'''
set multiple key values for an item
'''
# NOTE(review): iteritems() is Python 2; use items() on Python 3.
for k,v in key_value_dict.iteritems():
self.set_key_value(item, k, v)
def add_item(self, item):
'''
Add a new item
'''
while item in self.feed:
try:
end = str(int(item[-1])+1)
item = item.replace(item[-1], end)
except:
item = item+'_2'
self.feed[item] = {}
self.feed[item][self.feed_item_naming] = item
for i in self.header:
if i != self.feed_item_naming:
self.feed[item][i] = None
f = Feed('file.txt', '_____', 'Name') #initialize a new feed object, make sure that all seperators are the same for each item in your file
f.add_item('Angela') #add a new item
f.set_key_values('Angela', {'Height(m)':5, 'Weight(kg)':123}) #set the new items height and weight
f.add_key('Position')#create a new key for each item
f.unload_feed() #write the feed back to the file
print(f)
If by "add a new vertical line" you mean "add a new column" to your file, you can do this with the help of the csv module.
The code below works by reading the contents of your file as a list, making the changes, and then writing the updated list back to the file. You can add rows to your file this way, as well.
import csv
# Read the whole space-delimited file into memory, append the new "Age"
# column to the first three rows, and write everything back.
# (The space delimiter is just a guess based on the question's sample data.)
with open('file.txt', 'r') as f:
    rows = list(csv.reader(f, delimiter=' '))

# Row index -> value to append (header, Bill's age, Mary's age).
additions = {0: 'Age(year)', 1: '15', 2: '17'}
for index, row in enumerate(rows):
    if index in additions:
        row.append(additions[index])

with open('file.txt', 'w') as f:
    writer = csv.writer(f, delimiter=' ')
    for row in rows:
        writer.writerow(row)
# file.txt output:
# Name Height(m) Weight(kg) Age(year)
# Bill 1.58 58 15
# Mary 1.6 43 17
This code also uses with statements when working with your file. Using either with or close() (like you included in your question) is the correct practice when working with files. A with statement is easy to use because it closes your file automatically.
This is my first attempt about subclassing, so I need some hints from you experts..
I'm trying to subclass csv.DictReader / Writer to have a higher level class to do something like this :
a = CsvRdr('filename.csv')
for row in a.rows:
# do something with dict returned in row
a.close()
I've come up with a subclass like this :
class CsvRdr(csv.DictReader):
# NOTE(review): this is the question's broken attempt, kept as-is; the bugs
# are discussed in the answer below.
def __init__(self, filename):
self.__fo = open(filename, 'rb')
self.__delim = '\t'
# BUG: __delim is passed positionally, so it lands in DictReader's
# 'fieldnames' parameter instead of 'delimiter'.
self.rows = csv.DictReader(self.__fo, self.__delim)
# BUG: self.__del is undefined (typo for self.__delim) -> AttributeError;
# this line also merely repeats the constructor call.
self.rows.__init__(self.__fo, self.__del)
def close(self):
self.__fo.close()
but when I do :
for i in a.rows:
print i
it returns an unformatted dict containing the delimiter \t as key :
{'\t': 'seriesid\tseriesname\tstatus\tquality\tgroup\tpath'}
{'\t': '80337\tMad Men\tAiring\thdtv\tTV Shows\t/share/MD0_DATA/SORT/TV Shows/Mad Men'}
{'\t': '271910\tHalt and Catch Fire\tHiatus\thdtv\tTV Shows\t/share/MD0_DATA/SORT/TV
instead of a dict containing the proper fieldnames and the relative values splitted by delimiter
But when I'm going to instantiate DictReader from another function, all that i need to do is :
fo = open(filename, 'rb')
reader = csv.DictReader(fo, delimiter='\t')
and the reader object correctly gives you the desired output.
Any suggestion ?
The subclassing process isn't clear in my mind, and what I've found online so far hasn't helped me.
TIA
Enrico
Your posted code would barf with an AttributeError, you have self._del when you mean to have self._delim.
Beyond that, your other issue is that you invoke the constructor incorrectly:
self.rows = csv.DictReader(self.__fo, self.__delim)
should be
self.rows = csv.DictReader(self.__fo, delimiter = self.__delim)
Looking at the constructor signature for DictReader we see what actually happened:
csv.DictReader(self, f, fieldnames=None, restkey=None, restval=None, dialect='excel', *args, **kwds)
Your self.__delim argument was set to the fieldnames parameter. This is what Python (2.7 anyway) does by default when you give a non-keyword position argument to a function that has only keyword arguments remaining. It fills in the next keyword argument in the signature using the positional argument.
So you're telling DictReader "Hey this CSV has only one column, and it's name is '\t'". So DictReader does the logical thing which is to only have one value per row, that value being the entire line.
Finally this line:
self.rows.__init__(self.__fo, self.__del)
Isn't doing anything, you are just repeating the constructor call in a more explicit way.
Here's how I would re-write what you were trying to do:
class CsvRdr(object):
    """Delegation wrapper: owns the file handle and exposes a DictReader
    over it as the public ``rows`` attribute."""

    def __init__(self, filename):
        self.__handle = open(filename, 'rb')
        self.__sep = '\t'
        self.rows = csv.DictReader(self.__handle, delimiter=self.__sep)

    def close(self):
        """Close the underlying file."""
        self.__handle.close()
Notice I change csv.DictReader to object, this is because this pattern you are using is actually delegation and not subclassing or inheritance. You are just setting one of your objects attributes to an instance of the class you are interested in using, and your methods just call that instance's methods in more convenient ways.
In the end, I solved in this way :
class CsvRdr(object):
    """Iterable CSV reader: wraps a DictReader and mimics its iteration
    protocol, so callers can loop over the object directly."""

    def __init__(self, filename, delimiter=None):
        self.__handle = open(filename, 'rb')
        # Fall back to tab when no (truthy) delimiter is supplied.
        self.__sep = delimiter or '\t'
        self.__rows = csv.DictReader(self.__handle, delimiter=self.__sep)

    def __iter__(self):
        # Delegate iteration straight to the underlying DictReader.
        return self.__rows

    def close(self):
        """Close the underlying file."""
        self.__handle.close()
Class called by this function :
def CsvRead(filename):
    """Open *filename* as a CsvRdr, exiting the program on I/O errors."""
    try:
        reader = CsvRdr(filename)
        return reader
    except IOError as e:  # 'except IOError, e' is Python-2-only syntax
        # print() works on both Python 2 and 3 for a single argument.
        print("Error reading file : %s ERROR=%s" % (filename, e))
        sys.exit(2)
In this second attempt, I added the iter magic method to mimic the original behaviour of Csv.DictReader, so you can loop thru data in the usual way, instead of using object.rows method :
reader = CsvRead(catalog)
seriesnames = [ row['seriesname'].lower() for row in reader ]
reader.close()
I need to strip the whitespace from a CSV file that I read
import csv
aList=[]
# NOTE(review): fragment from a method -- relies on self.filename and an
# enclosing function for the bare 'return' to be valid.
with open(self.filename, 'r') as f:
reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE)
for row in reader:
aList.append(row)
# I need to strip the extra white space from each string in the row
return(aList)
There's also the embedded formatting parameter: skipinitialspace (the default is false)
http://docs.python.org/2/library/csv.html#csv-fmt-params
aList=[]
with open(self.filename, 'r') as f:
reader = csv.reader(f, skipinitialspace=False,delimiter=',', quoting=csv.QUOTE_NONE)
for row in reader:
aList.append(row)
return(aList)
In my case, I only cared about stripping the whitespace from the field names (aka column headers, aka dictionary keys), when using csv.DictReader.
Create a class based on csv.DictReader, and override the fieldnames property to strip out the whitespace from each field name (aka column header, aka dictionary key).
Do this by getting the regular list of fieldnames, and then iterating over it while creating a new list with the whitespace stripped from each field name, and setting the underlying _fieldnames attribute to this new list.
import csv
class DictReaderStrip(csv.DictReader):
    """A csv.DictReader whose field names have surrounding whitespace removed."""

    @property  # was '#property': the decorator was mangled, so the override never applied
    def fieldnames(self):
        """Return the header fields with leading/trailing whitespace stripped."""
        if self._fieldnames is None:
            # Let the base class read the header row and populate _fieldnames.
            # (Python 2's DictReader was an old-style class, so super() was
            # unavailable; calling the property's fget directly works on both.)
            csv.DictReader.fieldnames.fget(self)
            if self._fieldnames is not None:
                self._fieldnames = [name.strip() for name in self._fieldnames]
        return self._fieldnames
with open(self.filename, 'r') as f:
reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE)
# Strip every cell while materialising all rows in a single pass.
return [[x.strip() for x in row] for row in reader]
You can do:
aList.append([element.strip() for element in row])
The most memory-efficient method to format the cells after parsing is through generators. Something like:
with open(self.filename, 'r') as f:
reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE)
for row in reader:
# Lazily yield a generator of stripped cells per row (memory-efficient).
# NOTE(review): callers must consume each row generator before the 'with'
# block closes the file.
yield (cell.strip() for cell in row)
But it may be worth moving it to a function that you can use to keep munging and to avoid forthcoming iterations. For instance:
# Cell values that should be converted to None.
nulls = {'NULL', 'null', 'None', ''}

def clean(reader):
    """Lazily yield rows from *reader* with each cell stripped and
    null-like cells (see ``nulls``) replaced by None."""
    def clean_row(row):
        # Renamed from 'clean' -- the original inner generator shadowed the
        # enclosing function's name.
        for cell in row:
            cell = cell.strip()
            yield None if cell in nulls else cell
    for row in reader:
        yield clean_row(row)
Or it can be used to factorize a class:
def factory(reader):
    """Consume the header row of *reader*, then yield each remaining row as a
    dict keyed by the header fields, with cells stripped and null-like values
    (module-level ``nulls``) replaced by None."""
    fields = next(reader)
    def clean_row(row):
        # Renamed from 'clean' to avoid shadowing/confusion with the sibling
        # generator of that name.
        for cell in row:
            cell = cell.strip()
            yield None if cell in nulls else cell
    for row in reader:
        yield dict(zip(fields, clean_row(row)))
You can create a wrapper object around your file that strips away the spaces before the CSV reader sees them. This way, you can even use the csv file with csv.DictReader.
import re
class CSVSpaceStripper:
# Python 2 file-like wrapper: strips whitespace around the ';' delimiter
# before csv.reader / csv.DictReader sees each line.
def __init__(self, filename):
self.fh = open(filename, "r")
# NOTE(review): the patterns should be raw strings (r"\s*;\s*") to avoid
# invalid-escape warnings on modern Pythons.
self.surroundingWhiteSpace = re.compile("\s*;\s*")
self.leadingOrTrailingWhiteSpace = re.compile("^\s*|\s*$")
def close(self):
self.fh.close()
self.fh = None
def __iter__(self):
return self
# Python 2 iterator protocol ('next'); Python 3 requires '__next__' -- a
# Python 3 adaptation appears later in this file.
def next(self):
line = self.fh.next()
line = self.surroundingWhiteSpace.sub(";", line)
line = self.leadingOrTrailingWhiteSpace.sub("", line)
return line
Then use it like this:
o = csv.reader(CSVSpaceStripper(filename), delimiter=";")
o = csv.DictReader(CSVSpaceStripper(filename), delimiter=";")
I hardcoded ";" to be the delimiter. Generalising the code to any delimiter is left as an exercise to the reader.
Read a CSV (or Excel file) using Pandas and trim it using this custom function.
#Definition for stripping whitespace
def trim(dataset):
    """Return a copy of *dataset* (a DataFrame) with .strip() applied to every
    str cell; non-string cells pass through unchanged."""
    # Renamed the lambda so it no longer shadows the function's own name,
    # and use isinstance rather than a 'type(x) is str' identity check.
    strip_cell = lambda x: x.strip() if isinstance(x, str) else x
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; prefer
    # the new name when available to avoid the deprecation warning.
    mapper = getattr(dataset, "map", None) or dataset.applymap
    return mapper(strip_cell)
You can now apply trim(CSV/Excel) to your code like so (as part of a loop, etc.)
dataset = trim(pd.read_csv(dataset))
dataset = trim(pd.read_excel(dataset))
and here is Daniel Kullmann's excellent solution, adapted to Python 3:
import re
class CSVSpaceStripper:
    """File-like iterator that strips whitespace around the delimiter in each line.

    NB: the delimiter ";" is hardcoded.
    Feed an instance to csv.reader / csv.DictReader in place of the file object.
    """
    def __init__(self, filename):
        self.fh = open(filename, "r")
        self.surroundingWhiteSpace = re.compile(r"\s*;\s*")
        self.leadingOrTrailingWhiteSpace = re.compile(r"^\s*|\s*$")
    def close(self):
        """Close the underlying file handle."""
        self.fh.close()
        self.fh = None
    def __iter__(self):
        return self
    def __next__(self):
        line = self.fh.readline()
        if not line:
            # readline() returns '' at EOF; the original kept returning ''
            # forever, so iteration never terminated. Signal exhaustion.
            raise StopIteration
        line = self.surroundingWhiteSpace.sub(";", line)
        line = self.leadingOrTrailingWhiteSpace.sub("", line)
        return line
I figured out a very simple solution:
import csv
with open('filename.csv') as f:
reader = csv.DictReader(f)
# Strip both keys and values of every row in one comprehension.
# NOTE(review): assumes no extra columns -- with a restkey/restval, v could
# be a list or None and v.strip() would raise AttributeError.
rows = [ { k.strip(): v.strip() for k,v in row.items() } for row in reader ]
The following code may help you:
import pandas as pd
# Use a raw string for the regex separator: '\s' in a plain literal is an
# invalid escape sequence on modern Pythons. A regex sep requires the
# (slower) python engine, hence engine='python'.
aList = pd.read_csv(r'filename.csv', sep=r'\s*,\s*', engine='python')