I tried to run multiprocessing with a large dataset.
when i run below script with for loop, the total run time is 1.5 sec.
def get_vars(accessCode, user_profile, wt, meals, instance_method='get_wt_adherence'):
'''
Examples
--------
>> n_cpus = multiprocessing.cpu_count()
>> get_wt_adherence = partial(get_vars, user_profile, wt, meals,
instance_method='get_wt_adherence')
>> pool = multiprocessing.Pool(n_cpus-5)
>> result = pool.map(get_wt_adherence, accessCodes)
>> concated_result = pd.concat(result)
Version
-------
# 2020.03.26 Updated
: Class name edited. 'NOOM' -> 'DATA_GEN'
'''
#
COL_WEEK = ['{}week'.format(i) for i in range(1, 17)]
data_gen = DATA_GEN(accessCode, user_profile, wt, meals)
if instance_method == 'get_wt_adherence':
func = data_gen.get_wt_adherence
elif instance_method == 'get_meal_adherence':
func = data_gen.get_meal_adherence
elif instance_method == 'get_color_food':
func = data_gen.get_color_food
elif instance_method == 'get_daily_cal':
func = data_gen.get_daily_cal
row = pd.DataFrame([func(weeks) for weeks in range(1, 17)]).T
row.columns = COL_WEEK
row['accessCode'] = accessCode
return row
from noom.handler import DATA_GEN
from functools import partial
import multiprocessing
# start_time = time.time()
get_wt = partial(get_vars, user_profile=user_profile, wt=wt_logs, meals=meals, instance_method='get_wt_adherence')
for i in range(10):
get_wt(accessCodes[i])
however, when i tried to run this script usign multiprocessing, the script was not responded
Even, 'accessCodes' is list which has 100 elements.
I suspect the 'get_wt' function using partial module.
n_cpus = multiprocessing.cpu_count()
pool = multiprocessing.Pool(n_cpus-15)
result_wt = pool.map(get_wt, accessCodes) ; print('wt adherence finished')
pool.close()
How to solve this problem?
the error is below
---------------------------------------------------------------------------
error Traceback (most recent call last)
<ipython-input-22-73ddf2e21bbd> in <module>
2 n_cpus = multiprocessing.cpu_count()
3 pool = multiprocessing.Pool(n_cpus-15)
----> 4 result_wt = pool.map(get_wt_adherence, accessCodes[1:10]) ; print('wt adherence finished')
5 pool.close()
6 time.time() - start_time
/usr/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
/usr/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
/usr/lib/python3.6/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
/usr/lib/python3.6/multiprocessing/connection.py in send(self, obj)
204 self._check_closed()
205 self._check_writable()
--> 206 self._send_bytes(_ForkingPickler.dumps(obj))
207
208 def recv_bytes(self, maxlength=None):
/usr/lib/python3.6/multiprocessing/connection.py in _send_bytes(self, buf)
391 n = len(buf)
392 # For wire compatibility with 3.2 and lower
--> 393 header = struct.pack("!i", n)
394 if n > 16384:
395 # The payload is large so Nagle's algorithm won't be triggered
error: 'i' format requires -2147483648 <= number <= 2147483647
Related
I have a function batch_opt taking two arguments (integer i and pandas dataframe train) and return a python dictionary. When I was trying to parallelize the computation using DASK in Python, I got the type error of Delayed objects are immutable. I am new to DASK. Can anyone help me out here? Thanks.
results = []
for i in range(0, 2):
validation_res = delayed(batch_opt)(i, train)
results.append(validation_res)
start = time.time()
res = compute(*results)
print(time.time() - start)
Trace:
TypeError Traceback (most recent call last)
<ipython-input-19-8463f64dec56> in <module>
5
6 start = time.time()
----> 7 res = compute(*results)
8 print(time.time() - start)
~/.conda/envs/odop/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
568 postcomputes.append(x.__dask_postcompute__())
569
--> 570 results = schedule(dsk, keys, **kwargs)
571 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
572
~/.conda/envs/odop/lib/python3.8/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
77 pool = MultiprocessingPoolExecutor(pool)
78
---> 79 results = get_async(
80 pool.submit,
81 pool._max_workers,
~/.conda/envs/odop/lib/python3.8/site-packages/dask/local.py in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
505 _execute_task(task, data) # Re-execute locally
506 else:
--> 507 raise_exception(exc, tb)
508 res, worker_id = loads(res_info)
509 state["cache"][key] = res
~/.conda/envs/odop/lib/python3.8/site-packages/dask/local.py in reraise(exc, tb)
313 if exc.__traceback__ is not tb:
314 raise exc.with_traceback(tb)
--> 315 raise exc
316
317
~/.conda/envs/odop/lib/python3.8/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
218 try:
219 task, data = loads(task_info)
--> 220 result = _execute_task(task, data)
221 id = get_id()
222 result = dumps((result, id))
~/.conda/envs/odop/lib/python3.8/site-packages/dask/core.py in _execute_task(arg, cache, dsk)
117 # temporaries by their reference count and can execute certain
118 # operations in-place.
--> 119 return func(*(_execute_task(a, cache) for a in args))
120 elif not ishashable(arg):
121 return arg
<ipython-input-7-e3af5748e1cf> in batch_opt(i, train)
22 test.loc[:, 'seg'] = test.apply(lambda x: proc.assign_trxn(x), axis = 1)
23 test_policy_res, test_metrics_res = opt.analyze_result(fa_m, x, test, cum_to_day, cur_policy, policy)
---> 24 validation_res[(train_mon_yr_batch, test_mon_yr)] = {'train_policy': train_policy_res, 'train_result': train_metrics_res, 'test_policy': test_policy_res, 'test_result': test_metrics_res}
25 return validation_res
~/.conda/envs/odop/lib/python3.8/site-packages/dask/delayed.py in __setitem__(self, index, val)
564
565 def __setitem__(self, index, val):
--> 566 raise TypeError("Delayed objects are immutable")
567
568 def __iter__(self):
TypeError: Delayed objects are immutable
I'm running jupyter lab on windows and fastai.vision.utils.verify_images(fns) is giving me problems because it calls fastcore.parallel.parallel with default n_workers=8. There are many ways around it, but I was trying to figure out a code block that I could slap in any notebook and have it so all underlying calls to parallel will run with n_workers=1.
I tried the following cell:
import fastcore
import sys
_fastcore = fastcore
_parallel = lambda *args, **kwargs: fastcore.parallel.parallel(*args, **kwargs, n_workers=1)
_fastcore.parallel.parallel = _parallel
sys.modules['fastcore'] = _fastcore
fastcore.parallel.parallel
printing
<function __main__.<lambda>(*args, **kwargs)>
but when I try running verify_images it still fails as if the patch never happened
---------------------------------------------------------------------------
BrokenProcessPool Traceback (most recent call last)
<ipython-input-37-f1773f2c9e62> in <module>
3 # from mock import patch
4 # with patch('fastcore.parallel.parallel') as _parallel:
----> 5 failed = verify_images(fns)
6 # failed = L(fns[i] for i,o in enumerate(_parallel(verify_image, fns)) if not o)
7 failed
~\anaconda3\lib\site-packages\fastai\vision\utils.py in verify_images(fns)
59 def verify_images(fns):
60 "Find images in `fns` that can't be opened"
---> 61 return L(fns[i] for i,o in enumerate(parallel(verify_image, fns)) if not o)
62
63 # Cell
~\anaconda3\lib\site-packages\fastcore\parallel.py in parallel(f, items, n_workers, total, progress, pause, threadpool, timeout, chunksize, *args, **kwargs)
121 if total is None: total = len(items)
122 r = progress_bar(r, total=total, leave=False)
--> 123 return L(r)
124
125 # Cell
~\anaconda3\lib\site-packages\fastcore\foundation.py in __call__(cls, x, *args, **kwargs)
95 def __call__(cls, x=None, *args, **kwargs):
96 if not args and not kwargs and x is not None and isinstance(x,cls): return x
---> 97 return super().__call__(x, *args, **kwargs)
98
99 # Cell
~\anaconda3\lib\site-packages\fastcore\foundation.py in __init__(self, items, use_list, match, *rest)
103 def __init__(self, items=None, *rest, use_list=False, match=None):
104 if (use_list is not None) or not is_array(items):
--> 105 items = listify(items, *rest, use_list=use_list, match=match)
106 super().__init__(items)
107
~\anaconda3\lib\site-packages\fastcore\basics.py in listify(o, use_list, match, *rest)
54 elif isinstance(o, list): res = o
55 elif isinstance(o, str) or is_array(o): res = [o]
---> 56 elif is_iter(o): res = list(o)
57 else: res = [o]
58 if match is not None:
~\anaconda3\lib\concurrent\futures\process.py in _chain_from_iterable_of_lists(iterable)
482 careful not to keep references to yielded objects.
483 """
--> 484 for element in iterable:
485 element.reverse()
486 while element:
~\anaconda3\lib\concurrent\futures\_base.py in result_iterator()
609 # Careful not to keep a reference to the popped future
610 if timeout is None:
--> 611 yield fs.pop().result()
612 else:
613 yield fs.pop().result(end_time - time.monotonic())
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
437 raise CancelledError()
438 elif self._state == FINISHED:
--> 439 return self.__get_result()
440 else:
441 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
I suspect it has to do with fastai.vision.utils using * imports for fastcore. Is there a way to achieve what I want?
Since the parallel function has already been imported into the fastai.vision.utils module, the correct way is to monkeypatch that module rather than fastcore.parallel:
... # your code for custom `parallel` function goes here
import fastai.vision.utils
fastai.vision.utils.parallel = _parallel # assign your custom function here
My name is Boyu. I am a college student and newbie in python and Gurobi. Currently, one step of my model is solving 5 independent LPs. These LPs are independent and each has the same number of variables and constraints. The only difference between these LPs is the values of the coefficient and they are all known before running the model.
First, I start building 5 LPs sequentially:
from gurobipy import *
from gurobipy import GRB
a={1:2,2:2,3:8,4:7,5:3}
b={1:3,2:5,3:6,4:8,5:5}
c={1:4,2:2,3:3,4:5,5:7}
d={1:1,2:7,3:3,4:2,5:9}
object_val={}
x={}
y={}
z={}
m={}
for i in [1,2,3,4,5]:
# Create a new model
m[i]=Model()
# Create variables
x[i] = m[i].addVar(vtype=GRB.CONTINUOUS)
y[i] = m[i].addVar(vtype=GRB.CONTINUOUS)
z[i] = m[i].addVar(vtype=GRB.CONTINUOUS)
# Set objective
m[i].setObjective(x[i] + y[i] + 2 * z[i] , GRB.MAXIMIZE)
# Add constraint: x + a y + b z <= c
m[i].addConstr(x[i] + a[i] * y[i] + b[i] * z[i] <= c[i])
# Add constraint: x + y >= 1
m[i].addConstr(x[i] + y[i] >= d[i])
Second, I defined the function to solve a single LP model and save it as "test.py":
def test(i):
# Optimize model
m=i[1]
m.optimize()
return m.objVal
Third, I create the input data for the function will solved by parallel:
inputs=[]
for i in [1,2,3,4,5]:
inputs.append([i,m[i]])
Finally, I tried to use "multiprocessing" package to solve these 5 LPs in parallel:
import test
import multiprocessing
if __name__ == '__main__':
pool = multiprocessing.Pool(processes=4)
pool.map(test.test, inputs)
pool.close()
pool.join()
print('done')
However, an error occurs, it said "KeyError: 'getstate'"
KeyError Traceback (most recent call last)
<ipython-input-17-0b3639c06eb3> in <module>()
1 if __name__ == '__main__':
2 pool = multiprocessing.Pool(processes=4)
----> 3 pool.map(test.test, inputs)
4 pool.close()
5 pool.join()
C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py in send(self, obj)
204 self._check_closed()
205 self._check_writable()
--> 206 self._send_bytes(_ForkingPickler.dumps(obj))
207
208 def recv_bytes(self, maxlength=None):
C:\ProgramData\Anaconda3\lib\multiprocessing\reduction.py in dumps(cls, obj, protocol)
49 def dumps(cls, obj, protocol=None):
50 buf = io.BytesIO()
---> 51 cls(buf, protocol).dump(obj)
52 return buf.getbuffer()
53
model.pxi in gurobipy.Model.__getattr__()
KeyError: '__getstate__'
Could anybody give me some help for that? I am a newbie for gurobi and python and it will be really really appreciated if someone can give me some help.
Thanks.
Boyu
You need to create a separate environment for each model instance.
# Assuming: import gurobipy as gp
m[i] = gp.Model(env=gp.Env(""))
For further reference:
https://groups.google.com/forum/#!topic/gurobi/_LztwSqj-14
https://www.gurobi.com/documentation/9.0/refman/py_env2.html
I'm making a bag from a plain txt file - it's got a bunch of reviews, delimited by two newlines. But, sometimes - and I really can't predict when - it gives me FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Workspaces/Books/Dask/foods.txt' while processing it
Here's the actual code
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import numpy as np
import dask.bag as bag
import os
def get_next_part(file, start_index, span_index=0, blocksize=1000):
file.seek(start_index)
buffer = file.read(blocksize + span_index).decode('cp1252')
delimiter_position = buffer.find('\n\n')
if delimiter_position == -1:
return get_next_part(file, start_index, span_index + blocksize)
else:
file.seek(start_index)
return start_index, delimiter_position
def get_item(filename, start_index, delimiter_position, encoding='cp1252'):
with open(filename, 'rb') as file_handle:
file_handle.seek(start_index)
text = file_handle.read(delimiter_position).decode(encoding)
return dict((element.split(': ')[0], element.split(': ')[1])
if len(element.split(': ')) > 1
else ('unknown', element)
for element in text.strip().split('\n'))
with open(f"{os.getcwd()}/foods.txt", 'rb') as file_handle:
size = file_handle.seek(0,2) - 1
more_data = True
output = []
current_position = next_position = 0
while more_data:
if current_position >= size:
more_data = False
else:
current_position, next_position = get_next_part(file_handle, current_position, 0)
output.append((current_position, next_position))
current_position = current_position + next_position + 2
with ProgressBar():
reviews = (bag.from_sequence(output, npartitions=104)
.map(lambda x: get_item(f"{os.getcwd()}/foods.txt",
x[0],
x[1]))
.compute())
Sometimes it works fine, but other times it gives me something along these lines (different percentage every time):
[########## ] | 26% Completed | 54.3s
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-1-90a316620d10> in <module>()
42 with ProgressBar():
43 reviews = (bag.from_sequence(output, npartitions=104)
---> 44 .map(lambda x: get_item(f"{os.getcwd()}/foods.txt",
45 x[0],
46 x[1]))
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
396 keys = [x.__dask_keys__() for x in collections]
397 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398 results = schedule(dsk, keys, **kwargs)
399 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
400
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, **kwargs)
190 get_id=_process_get_id, dumps=dumps, loads=loads,
191 pack_exception=pack_exception,
--> 192 raise_exception=reraise, **kwargs)
193 finally:
194 if cleanup:
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
460 _execute_task(task, data) # Re-execute locally
461 else:
--> 462 raise_exception(exc, tb)
463 res, worker_id = loads(res_info)
464 state['cache'][key] = res
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/compatibility.py in reraise(exc, tb)
109 def reraise(exc, tb=None):
110 if exc.__traceback__ is not tb:
--> 111 raise exc.with_traceback(tb)
112 raise exc
113
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/local.py in execute_task()
228 try:
229 task, data = loads(task_info)
--> 230 result = _execute_task(task, data)
231 id = get_id()
232 result = dumps((result, id))
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/core.py in _execute_task()
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/bag/core.py in reify()
1589 def reify(seq):
1590 if isinstance(seq, Iterator):
-> 1591 seq = list(seq)
1592 if seq and isinstance(seq[0], Iterator):
1593 seq = list(map(list, seq))
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/bag/core.py in map_chunk()
1749 else:
1750 for a in zip(*args):
-> 1751 yield f(*a)
1752
1753 # Check that all iterators are fully exhausted
<ipython-input-1-90a316620d10> in <lambda>()
44 .map(lambda x: get_item(f"{os.getcwd()}/foods.txt",
45 x[0],
---> 46 x[1]))
47 .compute())
<ipython-input-1-90a316620d10> in get_item()
18
19 def get_item(filename, start_index, delimiter_position, encoding='cp1252'):
---> 20 with open(filename, 'rb') as file_handle:
21 file_handle.seek(start_index)
22 text = file_handle.read(delimiter_position).decode(encoding)
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Workspaces/Books/Dask/foods.txt'
I've tried messing with the partition numbers - leaving it as default (101), or making sure it's a multiple of 4. Doesn't seem to have an effect.
Anyone know what's going on here? It usually works if I run it a second time, but that's still tough to deal with.
I'm using the latest version of Dask. Using conda, it's all in Jupyterlab, and I'm running it from Windows Subsystem for Linux
Thanks!
Wasn't able to fix my initial read method, but was able to find another way of doing the parallel read (with native Dask objects too!)
Sections were delimited with \n\n and the linedelimiter argument to bag didn't mean what I thought it meant, but with this I was able to figure a way to get the sections I needed: Why `linedelimiter` does not work for bag.read_text?
bag.read_text(
f"{os.getcwd()}/foods.txt",
encoding="cp1252",
blocksize="10MB",
linedelimiter="\n\n",
)
.map_partitions(lambda x: "".join(x).split("\n\n"))
I'm having this error when trying to run this code in Python using CUDA. I'm following this tutorial but i'm trying it in Windows 7 x64 machine.
https://www.youtube.com/watch?v=jKV1m8APttU
In fact, I run check_cuda() and all tests passed. Can anyone help me what is the exact issue here.
My Code:
import numpy as np
from timeit import default_timer as timer
from numbapro import vectorize, cuda
#vectorize(['float64(float64, float64)'], target='gpu')
def VectorAdd(a, b):
return a + b
def main():
N = 32000000
A = np.ones(N, dtype=np.float64)
B = np.ones(N, dtype=np.float64)
C = np.zeros(N, dtype=np.float64)
start = timer()
C = VectorAdd(A, B)
vectoradd_time = timer() - start
print("C[:5] = " + str(C[:5]))
print("C[-5:] = " + str(C[-5:]))
print("VectorAdd took %f seconds" % vectoradd_time)
if __name__ == '__main__':
main()
Error Message:
---------------------------------------------------------------------------
CudaAPIError Traceback (most recent call last)
<ipython-input-18-2436fc2ab63a> in <module>()
1 if __name__ == '__main__':
----> 2 main()
<ipython-input-17-64de53fdbe77> in main()
7
8 start = timer()
----> 9 C = VectorAdd(A, B)
10 vectoradd_time = timer() - start
11
C:\Anaconda2\lib\site-packages\numba\cuda\dispatcher.pyc in __call__(self, *args, **kws)
93 the input arguments.
94 """
---> 95 return CUDAUFuncMechanism.call(self.functions, args, kws)
96
97 def reduce(self, arg, stream=0):
C:\Anaconda2\lib\site-packages\numba\npyufunc\deviceufunc.pyc in call(cls, typemap, args, kws)
297
298 devarys.extend([devout])
--> 299 cr.launch(func, shape[0], stream, devarys)
300
301 if any_device:
C:\Anaconda2\lib\site-packages\numba\cuda\dispatcher.pyc in launch(self, func, count, stream, args)
202
203 def launch(self, func, count, stream, args):
--> 204 func.forall(count, stream=stream)(*args)
205
206 def is_device_array(self, obj):
C:\Anaconda2\lib\site-packages\numba\cuda\compiler.pyc in __call__(self, *args)
193
194 return kernel.configure(blkct, tpb, stream=self.stream,
--> 195 sharedmem=self.sharedmem)(*args)
196
197 class CUDAKernelBase(object):
C:\Anaconda2\lib\site-packages\numba\cuda\compiler.pyc in __call__(self, *args, **kwargs)
357 blockdim=self.blockdim,
358 stream=self.stream,
--> 359 sharedmem=self.sharedmem)
360
361 def bind(self):
C:\Anaconda2\lib\site-packages\numba\cuda\compiler.pyc in _kernel_call(self, args, griddim, blockdim, stream, sharedmem)
431 sharedmem=sharedmem)
432 # Invoke kernel
--> 433 cu_func(*kernelargs)
434
435 if self.debug:
C:\Anaconda2\lib\site-packages\numba\cuda\cudadrv\driver.pyc in __call__(self, *args)
1114
1115 launch_kernel(self.handle, self.griddim, self.blockdim,
-> 1116 self.sharedmem, streamhandle, args)
1117
1118 #property
C:\Anaconda2\lib\site-packages\numba\cuda\cudadrv\driver.pyc in launch_kernel(cufunc_handle, griddim, blockdim, sharedmem, hstream, args)
1158 hstream,
1159 params,
-> 1160 None)
1161
1162
C:\Anaconda2\lib\site-packages\numba\cuda\cudadrv\driver.pyc in safe_cuda_api_call(*args)
220 def safe_cuda_api_call(*args):
221 retcode = libfn(*args)
--> 222 self._check_error(fname, retcode)
223
224 setattr(self, fname, safe_cuda_api_call)
C:\Anaconda2\lib\site-packages\numba\cuda\cudadrv\driver.pyc in _check_error(self, fname, retcode)
250 errname = ERROR_MAP.get(retcode, "UNKNOWN_CUDA_ERROR")
251 msg = "Call to %s results in %s" % (fname, errname)
--> 252 raise CudaAPIError(retcode, msg)
253
254 def get_device(self, devnum=0):
CudaAPIError: [1] Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE
I found a solution to my problem through NVIDIA Developer Forum. If you wanna know more info regarding the solution check out this link.
https://devtalk.nvidia.com/default/topic/962843/cuda-programming-and-performance/cudaapierror-1-call-to-culaunchkernel-results-in-cuda_error_invalid_value-in-python/?offset=3#4968130
In Short:
When I changed the N = 32000 or any other smaller amount, it did work nicely.
In fact, this means I am not compiling it in correct GPU type(check_cuda is the function call to verify it).
Hope my answer would help for someone.
This may mean, that you try to run more threads in one block as it is actually allowed. For me it was the case. So try to split your execution in blocks.