Python Error 24: Too many open files: per-process limit?

When using this Python code to split a large CSV into smaller CSVs (about 30,000 of them), I am receiving the error:
"OSError: [Errno 24] Too many open files"
After running it there should be 29,930 separate files; however, it stops after 2048. I have done some research and it looks like there is a per-process limit of 2048 open files. How can I get around this?
#!/usr/bin/env python3
import binascii
import csv
import os.path
import sys
from tkinter.filedialog import askopenfilename, askdirectory
from tkinter.simpledialog import askinteger

def split_csv_file(f, dst_dir, keyfunc):
    csv_reader = csv.reader(f)
    header = next(csv_reader)
    csv_writers = {}
    for row in csv_reader:
        k = keyfunc(row)
        if k not in csv_writers:
            writer = csv.writer(open(os.path.join(dst_dir, k),
                                     mode='w', newline=''))
            writer.writerow(header)
            csv_writers[k] = writer
        csv_writers[k].writerow(row[0:1])

def get_args_from_cli():
    input_filename = sys.argv[1]
    column = int(sys.argv[2])
    dst_dir = sys.argv[3]
    return (input_filename, column, dst_dir)

def get_args_from_gui():
    input_filename = askopenfilename(
        filetypes=(('CSV', '.csv'),),
        title='Select CSV Input File')
    column = askinteger('Choose Table Column', 'Table column')
    dst_dir = askdirectory(title='Select Destination Directory')
    return (input_filename, column, dst_dir)

if __name__ == '__main__':
    if len(sys.argv) == 1:
        input_filename, column, dst_dir = get_args_from_gui()
    elif len(sys.argv) == 4:
        input_filename, column, dst_dir = get_args_from_cli()
    else:
        raise Exception("Invalid number of arguments")
    with open(input_filename, mode='r', newline='') as f:
        split_csv_file(f, dst_dir, lambda r: r[column-1]+'.csv')
        # if the column has funky values resulting in invalid filenames,
        # replace the line above with:
        # split_csv_file(f, dst_dir, lambda r: binascii.b2a_hex(r[column-1].encode('utf-8')).decode('utf-8')+'.csv')

You don't need to keep the csv writers in a dictionary. You can re-open the file to append to it:
Replace:
if k not in csv_writers:
    csv_writers[k] = csv.writer(open(os.path.join(dst_dir, k),
                                     mode='w', newline=''))
csv_writers[k].writerow(row)
With:
filename = os.path.join(dst_dir, k)
with open(filename, mode='a', newline='') as output:
    csv.writer(output).writerow(row)
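For completeness, here is a sketch of split_csv_file rewritten along those lines. The header still has to land once at the top of each output file; the seen set below is my own addition for tracking that, not part of the original answer:

import csv
import os.path

def split_csv_file(f, dst_dir, keyfunc):
    csv_reader = csv.reader(f)
    header = next(csv_reader)
    seen = set()  # output files that already have their header row
    for row in csv_reader:
        filename = os.path.join(dst_dir, keyfunc(row))
        # Open, write, and close one file per row: at most one output
        # file handle exists at a time, so the fd limit is never hit.
        with open(filename, mode='a', newline='') as output:
            writer = csv.writer(output)
            if filename not in seen:
                writer.writerow(header)
                seen.add(filename)
            writer.writerow(row)

Alternatively, the per-process limit itself can usually be raised (e.g. with ulimit -n on Unix, or resource.setrlimit from Python), but keeping at most one output file open at a time sidesteps the problem entirely.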

Related

How to keep CSV headers on the chunk files after script split?

I need help modifying this script to include headers in the output chunk files. The script asks the user how many rows per file the split should produce; the output files do not contain the headers from the original file. I am seeking advice on how to implement this.
import csv
import os
import sys

os_path = os.path
csv_writer = csv.writer
sys_exit = sys.exit

if __name__ == '__main__':
    try:
        chunk_size = int(input('Input number of rows of one chunk file: '))
    except ValueError:
        print('Number of rows must be integer. Close.')
        sys_exit()
    file_path = input('Input path to .tsv file for splitting on chunks: ')
    if (
        not os_path.isfile(file_path) or
        not file_path.endswith('.tsv')
    ):
        print('You must input path to .tsv file for splitting.')
        sys_exit()
    file_name = os_path.splitext(file_path)[0]
    with open(file_path, 'r', newline='', encoding='utf-8') as tsv_file:
        chunk_file = None
        writer = None
        counter = 1
        reader = csv.reader(tsv_file, delimiter='\t', quotechar='\'')
        for index, chunk in enumerate(reader):
            if index % chunk_size == 0:
                if chunk_file is not None:
                    chunk_file.close()
                chunk_name = '{0}_{1}.tsv'.format(file_name, counter)
                chunk_file = open(chunk_name, 'w', newline='', encoding='utf-8')
                counter += 1
                writer = csv_writer(chunk_file, delimiter='\t', quotechar='\'')
                print('File "{}" complete.'.format(chunk_name))
            writer.writerow(chunk)
You can do it by reading the header row manually when opening the input file, and then writing it at the beginning of each output file — see the ADDED comments in the code below:
...
with open(file_path, 'r', newline='', encoding='utf-8') as tsv_file:
    chunk_file = None
    writer = None
    counter = 1
    reader = csv.reader(tsv_file, delimiter='\t', quotechar="'")
    header = next(reader)  # Read and save header row. (ADDED)
    for index, chunk in enumerate(reader):
        if index % chunk_size == 0:
            if chunk_file is not None:
                chunk_file.close()
            chunk_name = '{0}_{1}.tsv'.format(file_name, counter)
            chunk_file = open(chunk_name, 'w', newline='', encoding='utf-8')
            writer = csv_writer(chunk_file, delimiter='\t', quotechar="'")
            writer.writerow(header)  # ADDED.
            print('File "{}" complete.'.format(chunk_name))
            counter += 1
        writer.writerow(chunk)
Note that using single-quote characters for quoting means the output files do not adhere to the CSV standard, RFC 4180.
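If standard quoting matters, dropping the quotechar override falls back to the csv module's default double-quote behaviour; this one-line variant is a suggestion of mine, not part of the original answer:

writer = csv_writer(chunk_file, delimiter='\t')  # default quotechar='"' with QUOTE_MINIMAL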

Can someone please tell me what I am doing wrong? New to Python

My requirement is to look through a folder and combine only the CSV files in it into one. The CSV files are consistent but cannot simply be concatenated: I am only interested in rows whose first element is a date, and any other row must be discarded.
The code I have so far is below, and it is not working:
import os
import csv
from dateutil.parser import parse

def datecheck(string):
    try:
        parse(string)
        return True
    except ValueError:
        False

file_ext = "csv"
os.chdir("C:\\BANK_PROJECT\\FILES\\RAW_SOURCE_FILES")
file_list = os.listdir("C:\\BANK_PROJECT\\FILES\\RAW_SOURCE_FILES")

with open("outfile.csv", "w") as outfile:
    print(file_list)
    for file in file_list:
        if file.__contains__(".csv"):
            with open(file, 'r') as infile:
                data = csv.reader(infile)
                for row in data:
                    if len(row) > 0:
                        if datecheck(row[0]):
                            outfilewriter = csv.writer(outfile)
                            outfilewriter.writerows(row)
                        else:
                            continue
Here is a simplified version using glob:

import csv
from glob import glob
from dateutil.parser import parse

def datecheck(string):
    try:
        parse(string)
        return True
    except ValueError:
        return False

files = glob('*.csv')
with open('outfile.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for file in files:
        with open(file, newline='') as infile:
            reader = csv.reader(infile)
            data = [row for row in reader if row and datecheck(row[0])]
            writer.writerows(data)
Input file 1:
new,1,2,3
2012-01-19 17:21:0,1,2 ,3
2012-01-19,1,2,3
xx,2,3,4
Input File 2:
new,1,2,3
2012-03-19 17:21:0,1,2 ,3
yy,1,2,3
2012-04-19,1,2,3
xx,2,3,4
Output:
2012-01-19 17:21:0,1,2 ,3
2012-01-19,1,2,3
2012-03-19 17:21:0,1,2 ,3
2012-04-19,1,2,3
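As a quick sanity check of the datecheck helper (assuming dateutil is installed), parse accepts both date formats that appear in the sample rows and raises ValueError for the non-date strings, which is exactly what the filter relies on:

from dateutil.parser import parse

parse('2012-01-19 17:21:0')  # -> datetime(2012, 1, 19, 17, 21)
parse('2012-01-19')          # -> datetime(2012, 1, 19, 0, 0)
parse('xx')                  # raises ValueError, so datecheck returns False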

How to pass a file as an argument in python

I am having some issues passing an argument to a Python script so that it handles a specific file type such as CSV, TXT, or XML.
I am reviewing Python and would like some feedback on why I don't see any output after running the following command: ./my_script some3455.csv
#!/usr/bin/python
import sys
import csv
import xml.etree.ElementTree as ET

FILE = str(sys.argv[1])

def run_files():
    if FILE == '*.csv':
        run_csv()
    elif FILE == '*.txt':
        run_txt()
    else:
        run_xml()

def run_csv():
    csv_file = csv.register_dialect('dialect', delimiter='|')
    with open(FILE, 'r') as file:
        reader = csv.reader(file, dialect='dialect')
        for row in reader:
            print(row)

def run_txt():
    with open(FILE, 'r') as file:
        txt_contents = file.read()
        print(txt_contents)

def run_xml():
    tree = ET.parse(FILE)
    root = tree.getroot()
    for child in root.findall('Attributes'):
        car = child.find('Car').text
        color = child.find('Color').text
        print(car, color)
I have tried passing the filename without the FILE variable, but it only works for one file type and the other file types don't get identified.
You need to use fnmatch and not == to compare a string with a glob pattern:
import fnmatch

def run_files():
    if fnmatch.fnmatch(FILE, '*.csv'):
        run_csv()
    elif fnmatch.fnmatch(FILE, '*.txt'):
        run_txt()
    else:
        run_xml()
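If matching on the filename extension alone is enough, str.endswith is a simpler alternative (this variant is my addition, not part of the original answer):

def run_files():
    # Dispatch on the extension; lower() makes the check case-insensitive
    name = FILE.lower()
    if name.endswith('.csv'):
        run_csv()
    elif name.endswith('.txt'):
        run_txt()
    else:
        run_xml()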

Python csv - replace any columns with specified value

I have the following input file with a header row:
test_in.csv
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
-72.5708234,41.4155142,39,HICKORY LA,,,,,,,8a0df668e0d49b02
-72.5647745,41.4160301,1213,KILLINGWORTH RD,,,,,,,b3ecaab86e476f46
I need to replace any given column's data with a specified string; for example, the CITY column's values should be changed from "" to "MyCity".
My code only outputs the header and the first row.
python test_forcefld.py test_in.csv MyCity CITY out_test.csv
import csv
import sys

in_file_name = sys.argv[1]
force_data = sys.argv[2]
force_fld = sys.argv[3]
out_file_name = sys.argv[4]

# First read top row/header from input file
fieldnames = []
for filename in [in_file_name]:
    with open(filename, "rb") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            fieldnames.append(h)

# print headers to output file
with open(out_file_name, 'w') as fou:
    dw = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
    dw.writeheader()

f_in2 = open(in_file_name, "rb")
reader2 = csv.DictReader(f_in2)  # Uses the field names in this file
datarow = next(reader2)
datarow[force_fld] = force_data

with open(out_file_name, 'wa') as fou:
    dw2 = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
    dw2.writeheader()
    dw2.writerow(datarow)
Output shows
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
-72.5708234,41.4155142,39,HICKORY LA,,MyCity,,,,,8a0df668e0d49b02
Your code is a little difficult to read, but assuming datarow is a dictionary containing your records:
In your last line, change:

dw2.writerow(datarow)

into:

dw2.writerows(datarow)
While you're at it, you should also consider using datarow.keys() for your fieldnames, for conciseness.
This should do it; you just need pandas:
import pandas as pd
df = pd.read_csv(in_file_name, sep=',')
df['CITY'].fillna('MyCity', inplace=True)
And to save it:
df.to_csv(out_file_name)
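You will probably also want index=False here; otherwise pandas writes its row index as an extra first column (this flag is my addition):

df.to_csv(out_file_name, index=False)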
You can try something like this in order to get your desired file. I'm assuming your input file is called f_input.txt and your output file is called f_output.txt:

data = list(k.rstrip().split(',') for k in open("f_input.txt", 'r'))

with open("f_output.txt", 'a+') as f:
    f.write(",".join(data[0]) + '\n')
    for k in data[1:]:
        # CITY is column index 5: splice the new value between k[:5] and k[6:]
        # (adjust the indexes if you need to handle another position)
        f.write(",".join(k[:5] + ["MyCity"] + k[6:]) + "\n")
This worked in the end:

import csv
import sys

in_file_name = sys.argv[1]
force_data = sys.argv[2]
force_fld = sys.argv[3]
out_file_name = sys.argv[4]

# First read top row/header from input file
fieldnames = []
for filename in [in_file_name]:
    with open(filename, "rb") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            fieldnames.append(h)

f_in2 = open(in_file_name, "r")

# print headers to output file
fou = open(out_file_name, 'wa')
dw = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
dw.writeheader()

reader2 = csv.DictReader(f_in2)  # Uses the field names in this file
for row in reader2:
    row[force_fld] = force_data
    dw2 = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
    dw2.writerow(row)
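For reference, here is a minimal Python 3 sketch of the same idea using DictReader/DictWriter end to end; it assumes the same four command-line arguments but is otherwise my own rewrite, not the asker's code:

import csv
import sys

in_file_name, force_data, force_fld, out_file_name = sys.argv[1:5]

with open(in_file_name, newline='') as f_in, \
     open(out_file_name, 'w', newline='') as f_out:
    reader = csv.DictReader(f_in)  # field names come from the header row
    writer = csv.DictWriter(f_out, fieldnames=reader.fieldnames)
    writer.writeheader()
    for row in reader:
        row[force_fld] = force_data  # overwrite the chosen column in every row
        writer.writerow(row)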

How can I select only a particular row in a CSV file?

I have a little program that just needs to read one (and only one) row from a CSV file and write the column values to a series of files. The program takes three system arguments: the path to the data file, the job id (uuid), and the target row number, i.e. the row in the CSV that I want to parse. It's not working; how can I fix it?
import csv
import sys
import itertools

f = sys.argv[1]
uuid = sys.argv[2]
target_row = sys.argv[3]

tmpdir = "/tmp/pagekicker/"
folder = tmpdir + uuid

destination1 = folder + '/csv/row.editedby'
destination3 = folder + '/csv/row.booktitle'
destination4 = folder + '/csv/row.seeds'
destination5 = folder + '/csv/row.imprint'

f = open(f, 'rb')
f1 = open(destination1, 'w')
f3 = open(destination3, 'w')
f4 = open(destination4, 'w')
f5 = open(destination5, 'w')

target_row = int(target_row)
try:
    reader = csv.reader(f)  # creates the reader object
    for row in itertools.islice(reader, 1, 1):  # iterates the rows of the file in order
        editedby = row[0]  # we throw away column 2
        booktitle = row[2]
        print row[2]
        seeds = row[3]
        imprint = row[4]
        f1.write(editedby)
        f3.write(booktitle)
        f4.write(seeds)
        f5.write(imprint)
    f.close()
    f1.close()
    f3.close()
    f4.close()
    f5.close()
finally:
    print 'done'
UPDATE: thanks Graham Bell for his suggested code. There are two "f5"s in the first line of his 'with' statement. My code now looks like this:
import csv
import sys
import itertools

f = sys.argv[1]
uuid = sys.argv[2]
target_row = sys.argv[3]

tmpdir = "/tmp/pagekicker/"
folder = tmpdir + uuid
# os.mkdir(folder)

destination3 = folder + '/csv/row.booktitle'
destination1 = folder + '/csv/row.editedby'
destination4 = folder + '/csv/row.seeds'
destination5 = folder + '/csv/row.imprint'

with open(f, 'rb') as f, open(destination1, 'w') as f1, open(destination3, 'w') as f3, open(destination4, 'w') as f4, open(destination5, 'w') as f5:
    target_row = int(target_row)
    try:
        reader = csv.reader(f)  # creates the reader object
        for row in itertools.islice(reader, 1, 1):  # iterates the rows of the file in order
            editedby = row[0]  # we throw away column 2
            booktitle = row[2]
            print row[2]
            seeds = row[3]
            imprint = row[4]
            f1.write(editedby)
            f3.write(booktitle)
            f4.write(seeds)
            f5.write(imprint)
    except
        print 'done'
Without the except, it generates "unexpected unindent" when I run it. With the except, it says that the except line is invalid syntax.
The csv library's DictReader object keeps track of the current line number:

reader = csv.DictReader(csv_file)
reader.line_num

You could iterate through, doing nothing until you reach the line number you need, something like this:

for row in reader:
    if reader.line_num == row_you_want:
        # do something

The DictReader class also treats the first row of your CSV file as column titles, and each row then lets you access cells by title:

row["title_of_column1"]

which might save you some work as well. Also, you should use the Python with block when working with files, like so:
with open(f, 'rb') as f, open(destination1, 'w') as f1, open(destination3, 'w') as f3, open(destination4, 'w') as f5, open(destination5, 'w') as f5:
    target_row = int(target_row)
    try:
        reader = csv.reader(f)  # creates the reader object
        for row in itertools.islice(reader, 1, 1):  # iterates the rows of the file in order
            editedby = row[0]  # we throw away column 2
            booktitle = row[2]
            print row[2]
            seeds = row[3]
            imprint = row[4]
            f1.write(editedby)
            f3.write(booktitle)
            f4.write(seeds)
            f5.write(imprint)
This way you don't have to worry about closing them all
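Putting the line_num idea and the with block together, here is a minimal Python 3 sketch (the filename 'data.csv' and target row 5 are placeholders of mine):

import csv

row_you_want = 5
with open('data.csv', newline='') as csv_file:
    reader = csv.DictReader(csv_file)  # first row supplies the column titles
    for row in reader:
        if reader.line_num == row_you_want:
            print(row["title_of_column1"])  # look up a cell by column title
            break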
Assuming you count rows from 1 (rather than 0), here's a standalone function that will do it:
import csv
from contextlib import contextmanager
import itertools
import os
import sys

@contextmanager
def multi_file_manager(files, mode='r'):
    """ Context manager for multiple files. """
    files = [open(file, mode) for file in files]
    yield files
    for file in files:
        file.close()

# This is the standalone function
def csv_read_row(filename, n):
    """ Read and return nth row of a csv file, counting from 1. """
    with open(filename, 'rb') as f:
        reader = csv.reader(f)
        return next(itertools.islice(reader, n-1, n))

if len(sys.argv) != 4:
    print('usage: utility <csv filename> <uuid> <target row>')
    sys.exit(1)

tmpdir = "/tmp/pagekicker"
f = sys.argv[1]
uuid = sys.argv[2]
target_row = int(sys.argv[3])

folder = os.path.join(tmpdir, uuid)
destinations = [folder+dest for dest in ('/csv/row.editedby',
                                         '/csv/row.booktitle',
                                         '/csv/row.seeds',
                                         '/csv/row.imprint')]
with multi_file_manager(destinations, mode='w') as files:
    row = csv_read_row(f, target_row)
    # editedby, booktitle, seeds, imprint = row[0], row[2], row[3], row[4]
    for i, j in zip(range(4), (0, 2, 3, 4)):
        files[i].write(row[j]+'\n')
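One caveat of mine, not from the answer: under Python 3 the csv module wants text mode, so the open call inside csv_read_row would become:

with open(filename, 'r', newline='') as f:  # Python 3: text mode, not 'rb'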
