automatically limit rows per file and create new files - python

I am working on a script that will write a massive amount of data to a .csv file. I would like to automatically limit rows per file and create new files.

A simple approach is to use a class that keeps track of the row count and opens a new file whenever the limit (e.g. self.max_row) is reached.
For example:
import csv

class MyCSV:
    def __init__(self):
        self.max_row = 10
        self.cur_row = 0
        self.file_number = 0
        self.file_handle = None

    def write_row(self, row):
        if self.cur_row >= self.max_row or self.file_handle is None:
            self.cur_row = 0
            self.file_number += 1
            if self.file_handle:
                self.file_handle.close()
            self.file_handle = open(f'output_{self.file_number:04}.csv', 'w', newline='')
            self.csv_handle = csv.writer(self.file_handle)
        self.csv_handle.writerow(row)
        self.cur_row += 1

my_csv = MyCSV()
for row in range(1000):  # create some simulated rows
    output_row = [row, "value1", "value2"]
    my_csv.write_row(output_row)
This would create output filenames of the form output_0001.csv containing 10 rows per file. Obviously you can adjust this as needed.
You could also use a csv.DictWriter() instead and pass a dictionary for each row.
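For instance, a minimal DictWriter sketch (the field names and output filename here are illustrative assumptions, not part of the original answer):
import csv

# Hypothetical field names; swap in whatever columns your rows actually have.
fieldnames = ["id", "value1", "value2"]
with open("output_dict.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow({"id": 1, "value1": "a", "value2": "b"})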

csv row rearrangement and also preserve value order, python

I have a csv file that looks like this:
[screenshot: current csv file]
I would like to make a new csv file that looks like this:
[screenshot: desired csv file]
My first thoughts are to:
import csv

r1 = []
r2 = []
with open('loadba1.csv', "r") as csv_file:
    data = csv.reader(csv_file, delimiter=',')
    for rows in data:
        r1.append(rows[0])
        r2.append(rows[1])
r1 will give - TRUE
r2 will give - 'L_602356450160818331', 'wan1'
Then loop through r2 again to pull out each value and 'somehow' combine them.
I also cannot lose the value relationship, e.g. TRUE - wan1 - L_602356450160818331.
I am not sure of the approach I should take. Please advise.
What you probably want to do is use a manual while loop rather than for:
with open('loadba1.csv', "r") as csv_file:
    data = csv.reader(csv_file, delimiter=',')
    while True:
        try:
            load_bal, interface = next(data)
        except StopIteration:
            break  # end of file
        try:
            _, the_id = next(data)
        except StopIteration:
            raise ValueError("No ID row for %s" % interface)
        ...  # write out (load_bal, interface, the_id)
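For completeness, one way the elided write-out step might be assembled (the output filename combined.csv and the column order are assumptions, not part of the original answer):
import csv

with open('loadba1.csv', "r") as csv_file, \
     open('combined.csv', 'w', newline='') as out_file:
    data = csv.reader(csv_file, delimiter=',')
    writer = csv.writer(out_file)
    while True:
        try:
            load_bal, interface = next(data)
        except StopIteration:
            break  # end of file
        try:
            _, the_id = next(data)
        except StopIteration:
            raise ValueError("No ID row for %s" % interface)
        # one combined row per pair of input rows
        writer.writerow([load_bal, interface, the_id])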
Another option is to use pandas:
import pandas as pd

df = pd.read_csv("csv1.csv", header=None)  # header=None keeps integer column labels for the col == 0 check
result = []
Id, LoadBal, Interface = "", "", ""
for index, row in df.iterrows():
    for col, val in row.items():
        if col == 0 and val:
            LoadBal = val
        elif LoadBal:
            Interface = val
            result.append({"LoadBal": LoadBal, "Interface": Interface, "Id": Id})
            Id, LoadBal, Interface = "", "", ""
        elif col != 0:
            Id = val
result = pd.DataFrame(result)
result.to_csv("csv2.csv", index=False)

Excel sniffing, regex, and substring matching

I have this code that searches for a "phrase" in a "column" within all of the spreadsheets within a directory, and then outputs the matching date, time, and position into an "output.csv" (the position is on the same line, but the date and time are in the same row, 0-7 rows up from the 'phrase' row position). I need for it to be able to find the "phrase" within a cell, but right now, it only works for exact matches. If a cell in column 20 contained "phrase one", the example below wouldn't write the row to the output file.
import os
import xlrd
from xlrd import open_workbook
import datetime
from datetime import time
import csv

# edit these params
outputfile = 'output.csv'
phrase = 'phrase'
column = 20
rootdir = '.'

def writeToCSV(datalist, outputfile):
    with open(outputfile, 'w') as f:
        for sublist in datalist:
            for item in sublist:
                f.write(item + ',')
            f.write('\n')

def getdata(filename, row):
    wb = open_workbook(filename)
    items = []
    for sheet in wb.sheets():
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols
        for row1 in range(row, row - 10, -1):
            if row1 >= 0 and row1 < number_of_rows:
                rowNo = sheet.cell(row1, 2).value
                try:
                    if rowNo != '' and int(rowNo):
                        datetime1 = datetime.datetime(*xlrd.xldate_as_tuple(sheet.cell_value(rowx=row1, colx=0), wb.datemode))
                        date_values = xlrd.xldate_as_tuple(sheet.cell_value(rowx=row1, colx=1), wb.datemode)
                        time_value = time(*date_values[3:])
                        items.append(str(rowNo))
                        items.append(str(datetime1))
                        items.append(str(time_value))
                        break
                except Exception as e:
                    pass
    return items

def extractData(filename, searchString, column):
    wb = open_workbook(filename)
    dataList = []
    for sheet in wb.sheets():
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols
        for row in range(1, number_of_rows):
            for col in range(number_of_columns):
                value = sheet.cell(row, col).value
                if value == searchString:
                    if col == column:
                        data = getdata(filename, row)
                        dataList.append(data)
    return dataList

def main():
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            fullname = os.path.join(subdir, file)
            list = subdir.split('\\')
            date = ''
            if len(list) > 2:
                date = list[1].split('-')[1] + '-' + list[2]
            if date != '':
                namelist = file.split('-')
                if len(namelist) > 2:
                    if (namelist[0] in date) and (namelist[1] in date):
                        data = extractData(fullname, phrase, column)
                        if len(data) > 0:
                            writeToCSV(data, outputfile)

if __name__ == '__main__':
    main()  # call main method
def extractData(filename,searchString,column):
wb = open_workbook(filename)
dataList = []
for sheet in wb.sheets():
number_of_rows = sheet.nrows
number_of_columns = sheet.ncols
items = []
rows = []
for row in range(1, number_of_rows):
rowdata = []
for col in range(number_of_columns):
value = (sheet.cell(row, col).value)
if value == searchString :
if col == column :
data = getdata(filename,row)
dataList.append(data)
# print(value)
# rowdata.append(value)
# print(len(rowdata))
return dataList
def main():
for subdir, dirs, files in os.walk(rootdir):
for file in files:
fullname =os.path.join(subdir, file)
list = subdir.split('\\')
date = ''
if len(list) > 2 :
date = list[1].split('-')[1] +'-'+ list[2]
# print(date)
# print(file)
if date != '' :
namelist = file.split('-')
if len(namelist)> 2:
if (namelist[0] in date) and (namelist[1] in date):
# print(file)
data = extractData(fullname,phrase,column)
if len(data) > 0 :
writeToCSV(data,outputfile)
if __name__ == '__main__':
main() # call main method
I understand that regex can easily find substrings within a string, but I don't get exactly where to make the modification within the code. In a different language or if the code was written differently, I would try to add an if statement that would write the data to the output file if the string contained "phrase", but I can't determine where the code tries to qualify that the phrase matches the cell value. Any insight on this is appreciated.
In the function extractData you make the comparison if value == searchString :. That is where you check whether the string value (from your Excel file) is the same as the searchString (your "phrase").
You can replace that with Python's searchString in value, so the line reads if searchString in value:. You do not need regex if you are only looking for substrings.
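For example, the check inside extractData would become something like this (wrapping the cell value in str() is an extra guard of my own, so numeric cells don't raise a TypeError on the in test):
value = sheet.cell(row, col).value
# substring test instead of exact equality
if col == column and searchString in str(value):
    data = getdata(filename, row)
    dataList.append(data)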
There are a few things that seem to be causing issues, but the main one might be that extractData has found your search string in a row of a specific sheet, while getdata then uses that same row index across all of the sheets in the workbook, without checking that every sheet even goes up to that row. It would be better to pass along which sheet you found the search string in, and have getdata search only that sheet.

How to add a new line in an open txt file in python?

I'm trying to write a program that changes an open file, and I need to add a new column to what it prints.
In the open .txt file, it looks like this (I use "_" to represent blanks):
Name_____Height(m)_____Weight(kg)
Bill________1.58__________58
Mary_____1.65__________43
...
And now I want to add a new row like this:
Name_____Height(m)_____Weight(kg)_____Age(year)<---The new vertical line
Bill________1.58__________58_____________15
Mary_____1.65__________43_____________17
And my code is:
data_file = open("file.txt", "r")
print(data_file.read())
data_file.close()
So, how could I add another vertical line in the open file? Moreover, if I want to add more rows, how can I do this?
One more thing: I am using Python 3.5.
I wrote a little class to do everything you asked for and more. Implementation examples are at the bottom. Let me know if this works for you.
class Feed(object):
    def __init__(self, file_name, sep, naming_convention=None):
        self.file_name = file_name
        self.feed_item_naming = naming_convention
        self.sep = sep
        self.feed = self.load_feed()

    def get_head(self, file=None):
        '''
        Get the header.
        '''
        if not file:
            head = open(self.file_name).readline().split(self.sep)
        else:
            head = file[0].split(self.sep)
        return head

    def __repr__(self):
        return repr(self.feed)

    def load_feed(self):
        '''
        Load a feed object and key each item by the naming convention.
        If we have duplicate item names we increment the name: bill becomes
        bill_2, then bill_3, etc.
        '''
        # first we open the file and grab the headers
        file = [x.rstrip() for x in open(self.file_name).readlines()]
        self.header = self.get_head(file)
        if not self.feed_item_naming or self.feed_item_naming not in self.header:
            self.feed_item_naming = self.header[0]
        data = {}
        for line in file[1:]:
            if line != '':
                line = line.split(self.sep)
                pos = line[self.header.index(self.feed_item_naming)]
                while pos in data:
                    try:
                        ending = int(pos[-1]) + 1
                        pos = pos[:-1] + str(ending)
                    except ValueError:
                        pos = pos + '_2'
                data[pos] = {}
                for item in self.header:
                    data[pos][item] = line[self.header.index(item)]
        return data

    def unload_feed(self, file_name=None, sep=None):
        '''
        Write the modified feed back out to a data file.
        '''
        if not file_name:
            file_name = self.file_name
        if not sep:
            sep = self.sep
        with open(file_name, 'w') as file:
            for i in self.header:
                if i != self.header[-1]:
                    file.write(i + sep)
                else:
                    file.write(i)
            file.write('\n')
            for i in self.feed:
                for x in self.header:
                    if x != self.header[-1]:
                        file.write(str(self.feed[i][x]) + sep)
                    else:
                        file.write(str(self.feed[i][x]))
                file.write('\n')

    def add_key(self, key, default_value=None):
        '''
        Add a key to each of the items.
        '''
        if key not in self.header:
            for i in self.feed:
                self.feed[i][key] = default_value
            self.header.append(key)

    def get_key_value(self, item, key):
        '''
        Get the value for an item's key.
        '''
        return self.feed[item][key]

    def get_item(self, item):
        '''
        Get an individual item.
        '''
        return self.feed[item]

    def set_key_value(self, item, key, value):
        '''
        Set the value of an item's key:
        {item: {key: value, key: value}, item... etc}
        '''
        self.feed[item][key] = value

    def set_key_values(self, item, key_value_dict):
        '''
        Set multiple key values for an item.
        '''
        for k, v in key_value_dict.items():
            self.set_key_value(item, k, v)

    def add_item(self, item):
        '''
        Add a new item.
        '''
        while item in self.feed:
            try:
                end = int(item[-1]) + 1
                item = item[:-1] + str(end)
            except ValueError:
                item = item + '_2'
        self.feed[item] = {}
        self.feed[item][self.feed_item_naming] = item
        for i in self.header:
            if i != self.feed_item_naming:
                self.feed[item][i] = None

f = Feed('file.txt', '_____', 'Name')  # initialize a new feed object; make sure the separator is the same for every item in your file
f.add_item('Angela')  # add a new item
f.set_key_values('Angela', {'Height(m)': 5, 'Weight(kg)': 123})  # set the new item's height and weight
f.add_key('Position')  # create a new key for each item
f.unload_feed()  # write the feed back to the file
print(f)
If by "add a new vertical line" you mean "add a new column" to your file, you can do this with the help of the csv module.
The code below works by reading the contents of your file as a list, making the changes, and then writing the updated list back to the file. You can add rows to your file this way, as well.
import csv

with open('file.txt', 'r') as f:
    # If your file is delimited by spaces, tabs, etc., include that value
    # here. It appears that your file is space-delimited, but that's just a
    # guess based on the info in your question.
    reader = list(csv.reader(f, delimiter=' '))

for i, row in enumerate(reader):
    if i == 0:
        row.append('Age(year)')
    if i == 1:
        row.append('15')
    if i == 2:
        row.append('17')

with open('file.txt', 'w', newline='') as f:
    wr = csv.writer(f, delimiter=' ')
    for row in reader:
        wr.writerow(row)

# file.txt output:
# Name Height(m) Weight(kg) Age(year)
# Bill 1.58 58 15
# Mary 1.65 43 17
This code also uses with statements when working with your file. Using either with or close() (like you included in your question) is the correct practice when working with files. A with statement is easy to use because it closes your file automatically.

proper use of class (csv reader example)

I've done the following CSV reader class:
import csv

class CSVread(object):
    filtered = []

    def __init__(self, file):
        self.file = file

    def get_file(self):
        try:
            with open(self.file, "r") as f:
                self.reader = [row for row in csv.reader(f, delimiter=";")]
            return self.reader
        except IOError as err:
            print("I/O error({0}): {1}".format(err.errno, err.strerror))
            return

    def get_num_rows(self):
        print(sum(1 for row in self.reader))
Which can be used with the following example:
datacsv = CSVread("data.csv")  # ;-separated file
for row in datacsv.get_file():  # prints all the rows
    print(row)
datacsv.get_num_rows()  # number of rows in data.csv
My goal is to filter out the content of the csv file (data.csv) by filtering column 12 by the keyword "00GG". I can get it to work outside the class like this:
with open("data.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=";")
    filtered = []
    filtered = filter((lambda row: row[12] in ("00GG")), list(reader))
Code below returns an empty list (filtered) when it's defined inside the class:
def filter_data(csv_file):
    filtered = filter((lambda row: row[12] in ("00GGL")), self.reader)
    return filtered
Feedback for the existing code is also appreciated.
Could it be that in the first filter example you are searching for 00GG whereas in the second one you are searching for 00GGL?
Regardless, if you want to define filter_data() within the class you should write it as a method of the class. That means it takes a self parameter, not a csv_file:
def filter_data(self):
    filtered = filter((lambda row: row[12] in ("00GGL")), self.reader)
    return filtered
Making it more general:
def filter_data(self, column, values):
    return filter((lambda row: row[column] in values), self.reader)
Now you can call it like this:
datacsv.filter_data(12, ('00GGL',))
which should work if the input data does indeed contain rows with 00GGL in column 12.
Note that filter_data() should only be called after get_file(), otherwise there is no self.reader. Unless you have a good reason not to read in the data when the CSVread object is created (e.g. you are aiming for lazy evaluation), you should read it in then, as sketched below. Otherwise, set self.reader = [] up front, which will prevent failures in other methods.
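A minimal sketch of that suggestion, reading the file in __init__ and adding a general filter method (the class name, delimiter, and column index follow the question; the rest is an assumption, not a definitive implementation):
import csv

class CSVread(object):
    def __init__(self, file):
        self.file = file
        self.reader = []  # safe default so other methods never see a missing attribute
        try:
            with open(self.file, "r") as f:
                self.reader = [row for row in csv.reader(f, delimiter=";")]
        except IOError as err:
            print("I/O error({0}): {1}".format(err.errno, err.strerror))

    def get_num_rows(self):
        return len(self.reader)

    def filter_data(self, column, values):
        # list comprehension so the result can be reused, unlike a one-shot filter object
        return [row for row in self.reader if row[column] in values]

# usage:
# datacsv = CSVread("data.csv")
# matches = datacsv.filter_data(12, ("00GGL",))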

writing a csv to file

I need to write the output of the code I have to a file so I can call it later. I need to call the output, not the original test1 file. The code that produces the output is below and works fine; I just can't get it into a file I can call later.
import csv

file1 = open('C:/Users/Gallatin/Documents/adamenergy.csv', "r")  # Open CSV file in read mode
reader = csv.reader(file1)  # Create reader object which iterates over lines

class Object:  # Object to store unique data
    def __init__(self, name, produce, amount):
        self.name = name
        self.produce = produce
        self.amount = amount

rownum = 0  # Row number currently iterating over
list = []   # List to store objects

def checkList(name, produce, amount):
    for object in list:  # Iterate through list
        if object.name == name and object.produce == produce:  # Check if name and produce combination exists
            object.amount += int(amount)  # If it does, add to amount variable and break out
            return
    newObject = Object(name, produce, int(amount))  # Create a new object with new name, produce, and amount
    list.append(newObject)  # Add to list

for row in reader:  # Iterate through all the rows
    if rownum == 0:  # Store header row separately to not get confused
        header = row
    else:
        name = row[0]     # Store name
        produce = row[1]  # Store produce
        amount = row[2]   # Store amount
        if len(list) == 0:  # Default case if list is empty
            newObject = Object(name, produce, int(amount))
            list.append(newObject)
        else:  # If not...
            checkList(name, produce, amount)
    rownum += 1

for each in list:
    print each.name, each.produce, each.amount
With the print it generates the output I want correctly, but I need to write this output to a file so I can call it later, using ndiff to compare it to another csv file that I will run through code similar to the above.
There are several approaches you can take:
You can either run the program differently; instead of running:
./generate
run
./generate > output_file.csv
This uses shell redirection to save the standard output to whatever file you specify. It is very easy to build into future scripts, and if you also accept input on standard input or via a named file, it makes your tool extremely flexible.
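As an aside, a small sketch of what "accept input on standard input or via a named file" can look like (the argument handling is my assumption, not part of the original answer):
import sys
import csv

def open_input(path=None):
    # fall back to standard input when no filename is given
    return open(path, "r") if path else sys.stdin

source = sys.argv[1] if len(sys.argv) > 1 else None
with open_input(source) as f:
    for row in csv.reader(f):
        print(",".join(row))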
You can modify your program to write to a specific file. Open the file with something like:
output = open("output.csv", "w")
and then write output using strings along the lines of
output.write(each.name + "," + str(each.produce) + "," + str(each.amount) + "\n")
You could use the same csv module to also write files. This might be the better approach for long-term use, because it'll automatically handle complicated cases of inputs that include , characters and other difficult cases.
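For instance, a minimal sketch of the csv.writer approach using the objects from the question (Python 3 syntax; the output filename is an assumption):
import csv

# Assumes `list` is the list of Object instances built by the question's code.
with open("output.csv", "w", newline="") as outf:
    writer = csv.writer(outf)
    for each in list:
        writer.writerow([each.name, each.produce, each.amount])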
Simply redirect output to the file.
For example,
C:> python myfile.py > output.txt
At the top of the file, open your output file:
outf = open("my_output_file.csv", "wb")
then later:
print >>outf, each.name,each.produce,each.amount
Just use the csv writer. Add the code below:
csvWriter = csv.writer(open('csvFileDestination.csv', 'wb'), delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
csvWriter.writerow([each.name, each.produce, each.amount])
Full code:
import csv

file1 = open('csvFileSource.csv', "r")  # Open CSV file in read mode
reader = csv.reader(file1)  # Create reader object which iterates over lines

class Object:  # Object to store unique data
    def __init__(self, name, produce, amount):
        self.name = name
        self.produce = produce
        self.amount = amount

rownum = 0  # Row number currently iterating over
list = []   # List to store objects

def checkList(name, produce, amount):
    for object in list:  # Iterate through list
        if object.name == name and object.produce == produce:  # Check if name and produce combination exists
            object.amount += int(amount)  # If it does, add to amount variable and break out
            return
    newObject = Object(name, produce, int(amount))  # Create a new object with new name, produce, and amount
    list.append(newObject)  # Add to list

for row in reader:  # Iterate through all the rows
    if rownum == 0:  # Store header row separately to not get confused
        header = row
    else:
        name = row[0]     # Store name
        produce = row[1]  # Store produce
        amount = row[2]   # Store amount
        if len(list) == 0:  # Default case if list is empty
            newObject = Object(name, produce, int(amount))
            list.append(newObject)
        else:  # If not...
            checkList(name, produce, amount)
    rownum += 1

csvWriter = csv.writer(open('csvFileDestination.csv', 'wb'), delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
for each in list:
    csvWriter.writerow([each.name, each.produce, each.amount])
