I'm trying to run code which writes its output to a large number of files (upwards of 250). I'm currently using:
with open(self.file, self.mode, newline='') as file_writer:
    writer = csv.writer(file_writer)
    writer.writerow(row)
This should automatically close my files, but I'm still getting the following:
52 of 231 | Analysing player: Evolved ANN 5 Noise 05 ...
Traceback (most recent call last):
File "fullAnalysis.py", line 200, in <module>
run_one.start()
File "fullAnalysis.py", line 179, in start
print_output=False)
File "/home/vi/td/axelrod-dojo/src/axelrod_dojo/algorithms/genetic_algorithm.py", line 28, in __init__
self.pool = Pool(processes=self.processes)
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/context.py", line 119, in Pool
context=self.get_context())
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/pool.py", line 174, in __init__
self._repopulate_pool()
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/pool.py", line 239, in _repopulate_pool
w.start()
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/process.py", line 105, in start
self._popen = self._Popen(self)
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/popen_fork.py", line 26, in __init__
self._launch(process_obj)
File "/home/vi/.conda/envs/project/lib/python3.6/multiprocessing/popen_fork.py", line 72, in _launch
parent_r, child_w = os.pipe()
OSError: [Errno 24] Too many open files
My question is: which methods open files in Python, so that I can find this leak and plug it?
Currently, the only file-related methods I'm using are a mixture of:
os.makedirs()
os.remove()
writer.writerow() (as shown above, inside the with block)
(This is the second time it has happened at #52, even after implementing the with block shown above.)
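If it helps to see which file descriptors the process actually holds at the moment of the crash, you can inspect the process directly instead of auditing every call site. This is a minimal sketch (not from the original post) using the third-party psutil package; psutil.Process.num_fds() and psutil.Process.open_files() are real psutil APIs, but where you call this in your loop is up to you:

import os
import psutil  # third-party: pip install psutil

def report_open_fds():
    """Print how many descriptors this process holds and which regular files they point to."""
    proc = psutil.Process(os.getpid())
    print("fd count:", proc.num_fds())   # Unix only; counts pipes and sockets too
    for f in proc.open_files():          # regular files only; pipes are excluded
        print(" ", f.path)

Calling report_open_fds() once per loop iteration should show whether the count grows by a fixed amount each iteration, which would point at something other than the CSV writing (for example, the pipes created for each multiprocessing Pool).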
EDIT
Below is the main section of code at the top of the trace for fullAnalysis.py:
for opponent in self.opponent_list:
    print(i, "of", len(self.opponent_list), "| Analysing player:", str(opponent), "...")
    global_processes = 20

    # Stochastic players need seeding
    if opponent.classifier['stochastic']:
        opponent = self._get_seeded_player_class(type(opponent))(self.global_seed)
        global_processes = 1

    population = axl_dojo.Population(params_class=axl_dojo.CyclerParams,
                                     params_kwargs=cycler_kwargs,
                                     size=POPULATION_SIZE,
                                     # processes=global_processes,
                                     population=getPreMadePop(POPULATION_SIZE),
                                     objective=cycler_objective,
                                     output_filename=self._get_file_name(opponent),
                                     opponents=[opponent],
                                     print_output=False)

    population.run(GENERATION_LENGTH)
    print("{:.2f}% Done.\tSaved to:".format((100 * i) / len(self.opponent_list)),
          self._get_file_name(opponent))
    TRACKER.print_diff()

    # self.output_files[str(opponent)] = self._get_file_name(opponent)
    i += 1
Below is the __init__ code from the axelrod_dojo genetic_algorithm.py:
def __init__(self, params_class, params_kwargs, size, objective, output_filename,
             bottleneck=None, mutation_probability=.1, opponents=None,
             processes=1, weights=None,
             sample_count=None, population=None, print_output=True):
    self.params_class = params_class
    self.bottleneck = bottleneck
    self.print_output = print_output

    if processes == 0:
        self.processes = cpu_count()
    else:
        self.processes = processes

    self.pool = Pool(processes=self.processes)
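Note that every loop iteration constructs a new Population, and the __init__ above creates a new multiprocessing Pool each time. Each Pool worker is connected through pipes from os.pipe() (two descriptors per worker), and those descriptors stay open until the pool is explicitly closed. A minimal sketch of the likely plug, assuming the Population object exposes the pool created above (population_kwargs here is just a placeholder for the keyword arguments in your loop):

for opponent in self.opponent_list:
    population = axl_dojo.Population(**population_kwargs)  # creates a Pool internally
    try:
        population.run(GENERATION_LENGTH)
    finally:
        # close()/join() releases the workers and their pipe descriptors.
        # Without this, every iteration leaks descriptors until the OS
        # limit is hit, which matches the crash in os.pipe() around #52.
        population.pool.close()
        population.pool.join()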
Related
I was running a script to get data from Excel for over a year, using the xlwings Range command like so...
list = Range('A1:D10').value
Suddenly, it stopped working. I had changed nothing in the code nor the system, other than maybe installing another network card.
This is the error when trying to use the Range assignment now.
Traceback (most recent call last):
File "G:\python32\fetcher.py", line 61, in <module>
listFull = getComData()
File "G:\python32\fetcher.py", line 38, in getComData
listFull=Range('A4:H184').value
File "G:\python32\lib\site-packages\xlwings\main.py", line 1490, in __init__
impl = apps.active.range(cell1).impl
File "G:\python32\lib\site-packages\xlwings\main.py", line 439, in range
return Range(impl=self.impl.range(cell1, cell2))
File "G:\python32\lib\site-packages\xlwings\_xlwindows.py", line 457, in range
xl1 = self.xl.Range(arg1)
File "G:\python32\lib\site-packages\xlwings\_xlwindows.py", line 341, in xl
self._xl = get_xl_app_from_hwnd(self._hwnd)
File "G:\python32\lib\site-packages\xlwings\_xlwindows.py", line 251, in get_xl_app_from_hwnd
disp = COMRetryObjectWrapper(Dispatch(p))
File "G:\python32\lib\site-packages\win32com\client\__init__.py", line 96, in Dispatch
return __WrapDispatch(dispatch, userName, resultCLSID, typeinfo, clsctx=clsctx)
File "G:\python32\lib\site-packages\win32com\client\__init__.py", line 37, in __WrapDispatch
klass = gencache.GetClassForCLSID(resultCLSID)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 180, in GetClassForCLSID
mod = GetModuleForCLSID(clsid)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 223, in GetModuleForCLSID
mod = GetModuleForTypelib(typelibCLSID, lcid, major, minor)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 259, in GetModuleForTypelib
mod = _GetModule(modName)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 622, in _GetModule
mod = __import__(mod_name)
ValueError: source code string cannot contain null bytes
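The last frame shows win32com importing a module it previously generated into its gen_py cache; a null byte in that cached source usually means the cache file itself is corrupt (for example, truncated or zero-filled by an unclean shutdown). A commonly suggested remedy, offered here as a hedged sketch rather than a confirmed fix, is to delete the cache and let win32com regenerate it on the next Dispatch() call:

import shutil
import win32com

# win32com.__gen_path__ points at the gen_py cache directory
# (typically under %TEMP% or site-packages\win32com\gen_py).
print("gen_py cache:", win32com.__gen_path__)
shutil.rmtree(win32com.__gen_path__)  # rebuilt automatically on next use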
I have code that uses multiprocessing over about 10,000 files on a 12-core vCPU on Ubuntu.
def process_file(name):
    inp = open(name)
    out = open(name.split('.')[0] + 'wikiout.txt', 'a')
    for row in inp:
        text = row.strip()
        sent_text = nltk.sent_tokenize(text)
        for sent in sent_text:
            pass  # process sentence
    inp.close()
    out.close()

if __name__ == '__main__':
    processes = []
    for i in 'ABCDEF':
        for j in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            for k in range(100):
                filename = os.path.join(os.path.dirname(__file__), (i + j + '/' + 'wiki_' + str(k) + '.txt'))
                p = multiprocessing.Process(target=process_file, args=(filename,))
                processes.append(p)
                p.start()
    for process in processes:
        process.join()
For some reason I get this error:
File "wikirules.py", line 37, in <module>
p.start()
File "/usr/lib/python3.8/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/usr/lib/python3.8/multiprocessing/context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 69, in _launch
child_r, parent_w = os.pipe()
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
File "wikirules.py", line 13, in process_file
File "/usr/local/lib/python3.8/dist-packages/nltk/tokenize/__init__.py", line 106, in sent_tokenize
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 752, in load
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 877, in _open
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 327, in open
OSError: [Errno 24] Too many open files: '/root/nltk_data/tokenizers/punkt/PY3/english.pickle'
Any clue why this might be happening? I'm still new to multiprocessing, so shouldn't this open no more than 12 files at once?
Your code is trying to run
len('ABCDEF') * len('ABCD...Z') * len(range(100)) = 6 * 26 * 100 = 15 600
operating system processes simultaneously.
The multiprocessing module contains relatively low-level primitives for working with processes. For basic tasks like this, the standard library offers a safer and more convenient option: the concurrent.futures module, which provides pool implementations for both threads and processes and is especially useful for "embarrassingly parallel" workloads. A ProcessPoolExecutor defaults to os.cpu_count() workers, so on your 12-core machine at most 12 files are processed (and open) at once.
Here is an example of how the code from your question could be transformed using concurrent.futures and some other Python features such as generators, context managers, and the pathlib module.
import concurrent.futures as futures
import itertools
import pathlib

import nltk

BASE_PATH = pathlib.Path(__file__).parent.absolute()

def filename_generator():
    """produce filenames sequence"""
    for i, j, k in itertools.product("ABCDEF", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", range(100)):
        yield BASE_PATH / f"{i}{j}/wiki_{k}.txt"

def worker(filename: pathlib.Path):
    """do all the job"""
    out_filename = filename.with_suffix('.wikiout.txt')
    with open(filename) as inp, open(out_filename, "a") as out:
        for row in inp:
            text = row.strip()
            sent_text = nltk.sent_tokenize(text)
            for sent in sent_text:
                pass  # process sentence

def main():
    with futures.ProcessPoolExecutor() as pool:
        # mapping future->filename, useful in case of error
        task_to_filename = {pool.submit(worker, f): f for f in filename_generator()}
        for f in futures.as_completed(task_to_filename):
            try:
                f.result()
            except Exception as e:
                filename = task_to_filename[f]
                print(f"{filename} processing failed: {e}")

if __name__ == "__main__":
    main()
I am trying to run the following strategy:
def max_n(array, n):
    return np.argpartition(array, -n)[-n:]

class CrossSectionalMR(bt.Strategy):
    params = (
        ('num_positions', 100),
    )

    def __init__(self, temp):
        self.inds = {}
        for d in self.datas:
            self.inds[d] = {}
            self.inds[d]["pct"] = bt.indicators.PercentChange(d.close, period=1)

    def prenext(self):
        self.next()

    def next(self):
        available = list(filter(lambda d: len(d), self.datas))  # only look at data that existed yesterday
        rets = np.zeros(len(available))
        for i, d in enumerate(available):
            rets[i] = self.inds[d]['pct'][0]

        market_ret = np.mean(rets)
        weights = -(rets - market_ret)
        max_weights_index = max_n(np.abs(weights), self.params.num_positions)
        max_weights = weights[max_weights_index]
        weights = weights / np.sum(np.abs(max_weights))

        for i, d in enumerate(available):
            if i in max_weights_index:
                self.order_target_percent(d, target=weights[i])
            else:
                self.order_target_percent(d, 0)
The full error is:
Traceback (most recent call last):
File "/home/poblivsig/Software/pycharm-2020.3.1/plugins/python/helpers/pydev/pydevd.py", line 1477, in _exec
pydev_imports.execfile(file, globals, locals) # execute the script
File "/home/poblivsig/Software/pycharm-2020.3.1/plugins/python/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/home/poblivsig/Dropbox/meanrev/main.py", line 190, in <module>
dd, cagr, sharpe = backtest(datas, CrossSectionalMR, plot=True, num_positions=100)
File "/home/poblivsig/Dropbox/meanrev/main.py", line 181, in backtest
results = cerebro.run()
File "/home/poblivsig/Dropbox/meanrev/venv/lib/python3.8/site-packages/backtrader/cerebro.py", line 1127, in run
runstrat = self.runstrategies(iterstrat)
File "/home/poblivsig/Dropbox/meanrev/venv/lib/python3.8/site-packages/backtrader/cerebro.py", line 1293, in runstrategies
self._runonce(runstrats)
File "/home/poblivsig/Dropbox/meanrev/venv/lib/python3.8/site-packages/backtrader/cerebro.py", line 1652, in _runonce
strat._once()
File "/home/poblivsig/Dropbox/meanrev/venv/lib/python3.8/site-packages/backtrader/lineiterator.py", line 297, in _once
indicator._once()
File "/home/poblivsig/Dropbox/meanrev/venv/lib/python3.8/site-packages/backtrader/lineiterator.py", line 297, in _once
indicator._once()
File "/home/poblivsig/Dropbox/meanrev/venv/lib/python3.8/site-packages/backtrader/linebuffer.py", line 630, in _once
self.oncestart(self._minperiod - 1, self._minperiod)
File "/home/poblivsig/Dropbox/meanrev/venv/lib/python3.8/site-packages/backtrader/lineroot.py", line 165, in oncestart
self.once(start, end)
File "/home/poblivsig/Dropbox/meanrev/venv/lib/python3.8/site-packages/backtrader/linebuffer.py", line 672, in once
dst[i] = src[i + ago]
IndexError: array assignment index out of range
python-BaseException
Any help would be greatly appreciated.
I grab the data from Yahoo and store it in CSV files, which are then loaded up and added to Cerebro. Sometimes the code cannot get the full list of SPY tickers, but I don't think that is the problem here.
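For what it's worth (a guess based on the traceback, not a confirmed diagnosis): this IndexError in linebuffer.once() is often reported when data feeds have unequal lengths, so an indicator's precomputed buffer is shorter than the master clock expects in vectorized batch mode. A quick hedged experiment is to disable batch mode; runonce is a real parameter of cerebro.run():

# Run bar-by-bar instead of in vectorized batch mode. If the IndexError
# disappears, the feeds are likely misaligned (e.g. an incomplete CSV from
# the Yahoo download) rather than the strategy logic being wrong.
results = cerebro.run(runonce=False)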
I am trying to use the other cores in my Python program. The following is the basic structure/logic of my code:
import multiprocessing as mp
import pandas as pd
import gc

def multiprocess_RUN(param):
    result = Analysis_Obj.run(param)
    return result

class Analysis_Obj():

    def __init__(self, filename):
        self.DF = pd.read_csv(filename)
        return

    def run_Analysis(self, param):
        # Multi-core option
        pool = mp.Pool(processes=1)
        run_result = pool.map(multiprocess_RUN, [self, param])
        # Normal option
        run_result = self.run(param)
        return run_result

    def run(self, param):
        # Let's say I have written a function to count the frequency of 'param' in the target file
        result = count(self.DF, param)
        return result

if __name__ == "__main__":
    files = ['file1.csv', 'file2.csv']
    params = [1, 2, 3, 4]
    results = []
    for i in range(0, len(files)):
        analysis = Analysis_Obj(files[i])
        for j in range(0, len(params)):
            result = analysis.run_Analysis(params[j])
            results.append(result)
            del result
        del analysis
        gc.collect()
If I comment out the 'Multi-core option' and run the 'Normal option', everything runs fine. But even if I run the 'Multi-core option' with processes=1, I get a memory error when the for loop starts on the 2nd file. I deliberately set it up so that an Analysis_Obj is created and deleted on each pass of the loop, so that each file is cleared from memory once it has been processed. Clearly this hasn't worked. Advice on how to get around this would be very much appreciated.
Cheers
EDIT:
Here is the error message I have in the terminal:
Exception in thread Thread-7:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 326, in _handle_workers
pool._maintain_pool()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 230, in _maintain_pool
self._repopulate_pool()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 223, in _repopulate_pool
w.start()
File "/usr/lib/python2.7/multiprocessing/process.py", line 130, in start
self._popen = Popen(self)
File "/usr/lib/python2.7/multiprocessing/forking.py", line 121, in __init__
self.pid = os.fork()
OSError: [Errno 12] Cannot allocate memory
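One thing worth checking (an observation about the posted code, not a guaranteed fix): run_Analysis creates a new mp.Pool on every call and never closes it, so each call forks a worker that inherits a copy of self.DF and stays alive; del analysis cannot reclaim memory a child process still holds, and Errno 12 during os.fork() is consistent with workers piling up. A minimal sketch that closes the pool each call, written for Python 2.7 to match your traceback; it also adjusts the map argument, since pool.map calls the function once per list element (the original [self, param] invokes multiprocess_RUN(self) and multiprocess_RUN(param) separately):

def multiprocess_RUN(args):
    obj, param = args  # unpack the (instance, param) tuple
    return obj.run(param)

def run_Analysis(self, param):
    pool = mp.Pool(processes=1)
    try:
        run_result = pool.map(multiprocess_RUN, [(self, param)])
    finally:
        pool.close()  # no more tasks will be submitted
        pool.join()   # wait for workers to exit, freeing their memory
    return run_result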
I got an error FileNotFoundError: [Errno 2] No such file or directory: 'test_user1_user_id'. This is what I wrote in tests.py:
from datetime import datetime

from django.test import TestCase
from app.models import Companytransaction
import xlrd

# Create your tests here.
class CompanytransactionModelTests(TestCase):

    def __init__(self, sheet_path):
        self.book = xlrd.open_workbook(sheet_path)
        self.sheet = self.book.sheet_by_index(1)

    def setUp(self):
        self.book = xlrd.open_workbook('./data/excel1.xlsx')
        self.sheet = self.book.sheet_by_index(1)
        num = 0
        for row_index in range(2, 4):
            row = self.sheet.row_values(row_index)
            user = Companytransaction(user_id=row[1], name=row[2], age=row[3])
            user.save()
            if num == 0:
                self.user1 = Companytransaction.objects.create(user_id=row[1], name=row[2], age=row[3])
                num += 1
            elif num == 1:
                self.user2 = Companytransaction.objects.create(user_id=row[1], name=row[2], age=row[3])
                num += 1
            else:
                self.user3 = Companytransaction.objects.create(user_id=row[1], name=row[2], age=row[3])

    def test_user1_company_id(self):
        self.assertEqual(self.user1.user_id, '100')

    def test_user1_corporation_id(self):
        self.assertEqual(self.user1.name, 'Tom')

    def test_user1_company_name(self):
        self.assertEqual(self.user1.age, '29')
I run python manage.py test and ./manage.py test app.tests, but both of them show the same error. I definitely made a user_id column in models.py, so I really cannot understand why this error happens. How can I fix this? What should I write instead?
Here's the traceback.
Traceback (most recent call last):
File "./manage.py", line 22, in <module>
execute_from_command_line(sys.argv)
File "/Users/xxx/myenv/lib/python3.5/site-packages/django/core/management/__init__.py", line 363, in execute_from_command_line
utility.execute()
File "/Users/xxx/myenv/lib/python3.5/site-packages/django/core/management/__init__.py", line 355, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/Users/xxx/myenv/lib/python3.5/site-packages/django/core/management/commands/test.py", line 29, in run_from_argv
super(Command, self).run_from_argv(argv)
File "/Users/xxx/myenv/lib/python3.5/site-packages/django/core/management/base.py", line 283, in run_from_argv
self.execute(*args, **cmd_options)
File "/Users/xxx/myenv/lib/python3.5/site-packages/django/core/management/base.py", line 330, in execute
output = self.handle(*args, **options)
File "/Users/xxx/myenv/lib/python3.5/site-packages/django/core/management/commands/test.py", line 62, in handle
failures = test_runner.run_tests(test_labels)
File "/Users/xxx/myenv/lib/python3.5/site-packages/django/test/runner.py", line 600, in run_tests
suite = self.build_suite(test_labels, extra_tests)
File "/Users/xxx/myenv/lib/python3.5/site-packages/django/test/runner.py", line 484, in build_suite
tests = self.test_loader.loadTestsFromName(label)
File "/Users/xxx/.pyenv/versions/3.5.0/lib/python3.5/unittest/loader.py", line 190, in loadTestsFromName
return self.loadTestsFromModule(obj)
File "/Users/xxx/.pyenv/versions/3.5.0/lib/python3.5/unittest/loader.py", line 123, in loadTestsFromModule
tests.append(self.loadTestsFromTestCase(obj))
File "/Users/xxx/.pyenv/versions/3.5.0/lib/python3.5/unittest/loader.py", line 92, in loadTestsFromTestCase
loaded_suite = self.suiteClass(map(testCaseClass, testCaseNames))
File "/Users/xxx/.pyenv/versions/3.5.0/lib/python3.5/unittest/suite.py", line 24, in __init__
self.addTests(tests)
File "/Users/xxx/.pyenv/versions/3.5.0/lib/python3.5/unittest/suite.py", line 57, in addTests
for test in tests:
File "/Users/xxx/app/app/tests.py", line 12, in __init__
self.book = xlrd.open_workbook(sheet_path)
File "/Users/xxx/myenv/lib/python3.5/site-packages/xlrd/__init__.py", line 116, in open_workbook
with open(filename, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'test_user1_user_id'
It looks like the issue is with your __init__ method:
def __init__(self, sheet_path):
    self.book = xlrd.open_workbook(sheet_path)
    self.sheet = self.book.sheet_by_index(1)
You're getting this error because you aren't providing sheet_path when CompanytransactionModelTests is initialized. We can see from the unittest.TestCase source that methodName is the first and only argument when a TestCase is initialized, so the test loader passes each test method's name where your override expects a file path; xlrd then tries to open a workbook literally named 'test_user1_user_id'. I'd bet a penny that you've got a method called test_user1_user_id on a class with similar __init__ code.
Setting up that test data in setUp is the right way to do what you're doing. It looks like things should work properly if you get rid of the __init__ override entirely.
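A minimal sketch of the corrected class, simply dropping the __init__ override so unittest's loader can instantiate it (the setUp body is otherwise unchanged from the question):

class CompanytransactionModelTests(TestCase):
    # No __init__ override: unittest instantiates this class once per test
    # method, passing the method name, so the inherited TestCase.__init__
    # must keep its (methodName) signature.

    def setUp(self):
        self.book = xlrd.open_workbook('./data/excel1.xlsx')
        self.sheet = self.book.sheet_by_index(1)
        # ... create self.user1 / self.user2 / self.user3 exactly as in the question

    def test_user1_company_id(self):
        self.assertEqual(self.user1.user_id, '100')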