Edited my code - NOW it WORKS
I'm trying to obtain some data from my Postgres DB through an asyncpg connection pool, asynchronously.
Basically my DB contains about 100 different tables (one per city) and I'm trying to gather all the data into one frame as fast as possible.
import pandas as pd
import asyncpg
import asyncio
from time import time


def make_t():
    lst = []
    # iterator for sql tuple
    for i in ['a',
              'b',
              'c']:
        i1 = i
        sql = """
        SELECT
            '%s' as city,
            MAX(starttime) AS max_ts
        FROM
            "table_%s"
        """
        lst.append(sql % (i, i1))
    return tuple(lst)


async def get_data(pool, sql):
    start = time()
    async with pool.acquire() as conn:
        stmt = await conn.prepare(sql)
        columns = [a.name for a in stmt.get_attributes()]
        data = await stmt.fetch()
        print(f'Exec time: {time() - start}')
        return pd.DataFrame(data, columns=columns)


async def main():
    dsn = 'postgres://user:pass@127.0.0.1:5432/my_base'
    cT = ['city', 'max_ts']
    sqls = make_t()
    pool = await asyncpg.create_pool(dsn=dsn, max_size=50)
    start = time()
    tasks = []
    for sql in sqls:
        tasks.append(loop.create_task(get_data(pool, sql)))
    tasks = await asyncio.gather(*tasks)
    df = pd.DataFrame(columns=cT)
    for task in tasks:
        # form df from coroutine results
        df = df.append(task.result())
    print(f'total exec time: {time() - start} secs')
    print('exiting main')
    return df


loop = asyncio.get_event_loop()
df = loop.run_until_complete(main())
loop.close()
print('exiting program')
Python 3.6.5 :: Anaconda, Inc.
Gets me this error:
Traceback (most recent call last):
File "", line 319, in
File "/Users/fixx/anaconda3/lib/python3.6/asyncio/base_events.py", line
468, in run_until_complete
return future.result()
File "", line 308, in main
File "/Users/fixx/anaconda3/lib/python3.6/asyncio/tasks.py", line 594, in gather
for arg in set(coros_or_futures):
TypeError: unhashable type: 'list'
I can't figure out why. My SQL statements are in a tuple!
asyncio.gather accepts coroutines as individual arguments, and you are sending it a list of tasks. You have to use the * operator to call gather correctly:
tasks = await asyncio.gather(*tasks)
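For reference, here is a minimal self-contained sketch of the unpacking; fetch is just an illustrative stand-in for your get_data coroutine. Note that gather returns the result values directly, so there is no need to call .result() on them afterwards:

import asyncio

async def fetch(i):
    # stand-in coroutine; in the question this would be get_data(pool, sql)
    await asyncio.sleep(0.1)
    return i * 2

async def main():
    tasks = [asyncio.ensure_future(fetch(i)) for i in range(3)]
    # gather takes awaitables as separate positional arguments,
    # so a list of tasks must be unpacked with the * operator
    results = await asyncio.gather(*tasks)
    return results  # plain return values, not Task objects

loop = asyncio.get_event_loop()
print(loop.run_until_complete(main()))  # [0, 2, 4]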
Related
I am getting this error when using the "submit" functionality of ProcessPoolExecutor.
Exception has occurred: TypeError
'Future' object is not iterable
File "C:......\test3.py", line 28, in
for f in as_completed(res):
import time
import json
import os
import requests
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from concurrent.futures import as_completed

BAN_API_URL = 'https://api-adresse.data.gouv.fr/search/'


def get_french_addresses(request):
    print(f"Started task with pid: {os.getpid()} fetch addresses: {request['search_field']}")
    query_params = {'q': request['search_field'], 'type': 'housenumber', 'autocomplete': 1}
    response = requests.get(BAN_API_URL, params=query_params)
    print(f"Finished task with pid: {os.getpid()} to address: {request['search_field']}")
    return json.loads(response.text)


request_data = [
    {'search_field': '17 rue saint maur'},
    {'search_field': '35 boulevard voltaire'},
    {'search_field': '32 rue rivoli'},
    {'search_field': 'Route de la Croqueterie'},
]

if __name__ == '__main__':
    start_time = time.time()
    # Execute asynchronously with multi threads
    with ProcessPoolExecutor() as executor:
        res = executor.submit(get_french_addresses, request_data)
        print(res)
        for f in as_completed(res):
            print(f.result())
    end_time = time.time()
    print(f'Total time to run multithreads: {end_time - start_time:2f}s')
You are using submit, which passes all of the data to the function at once; what you want is map, which passes one item at a time, like so:
res = executor.map(get_french_addresses, request_data)
Or, if you need to keep using submit, you will have to split your data yourself:
res = []
with ProcessPoolExecutor() as executor:
    for item in request_data:
        res.append(executor.submit(get_french_addresses, item))
    print(res)
    for f in as_completed(res):
        print(f.result())
The simplest edit to avoid the error is to change
for f in as_completed(res):
to
for f in as_completed([res]):
However, this way it will be almost equivalent to a synchronous call (I say 'almost' because some code could still execute between submit and as_completed, but because of the GIL it would either have to be async itself or invoke some I/O).
If you want the get_french_addresses function to return data asynchronously (as it processes it), it must be rewritten to support that.
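If you go with map, a minimal end-to-end sketch (reusing the get_french_addresses and request_data defined in the question) looks like this; map submits one call per item and yields the return values in input order:

from concurrent.futures import ProcessPoolExecutor

if __name__ == '__main__':
    with ProcessPoolExecutor() as executor:
        # one task per address; results come back in the order of request_data
        for result in executor.map(get_french_addresses, request_data):
            print(result)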
The code...
import asyncio
import random
from time import perf_counter
from typing import Iterable
from pprint import pprint


async def coro(n, i, threshold=0.4):
    await asyncio.sleep(i)
    if i > threshold:
        # For illustration's sake - some coroutines may raise,
        # and we want to accommodate that and just test for exception
        # instances in the results of asyncio.gather(return_exceptions=True)
        raise Exception(f"{i} of Task-{n} is too high")
    return i


async def main(it: Iterable, timeout: float) -> tuple:
    tasks = [asyncio.create_task(coro(i + 1, d), name=f"Task-{i+1}") for i, d in enumerate(it)]
    await asyncio.wait(tasks, timeout=timeout)
    return tasks  # *not* (done, pending)


timeout = 0.5
random.seed(444)
n = 10
it = [random.random() for _ in range(n)]

start = perf_counter()
tasks = asyncio.run(main(it=it, timeout=timeout))
elapsed = perf_counter() - start
print(f"Done main({n}) in {elapsed:0.2f} seconds\n")
pprint(tasks)
print('----')

# does not work from here on....
res = []
for t in tasks:
    try:
        r = t.result()  # gives an error!!!
    except Exception as e:
        res.append(e)
    else:
        res.append(r)

pprint(res)
...does not work for collecting the task results. It fails with...
Traceback (most recent call last):
File "c:\Users\user\Documents\user\projects\learn\asyncio\wrap_gather_in_timeout.py", line 8, in coro
await asyncio.sleep(i)
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\asyncio\tasks.py", line 654, in sleep
return await future
asyncio.exceptions.CancelledError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:\Users\user\Documents\user\projects\learn\asyncio\wrap_gather_in_timeout.py", line 35, in <module>
r = t.result()
asyncio.exceptions.CancelledError
Task exception was never retrieved
future: <Task finished name='Task-7' coro=<coro() done, defined at c:\Users\user\Documents\user\projects\learn\asyncio\wrap_gather_in_timeout.py:7> exception=Exception('i too high')>
Traceback (most recent call last):
File "c:\Users\user\Documents\user\projects\learn\asyncio\wrap_gather_in_timeout.py", line 13, in coro
raise Exception("i too high")
Exception: i too high
The code was run in Python 3.9.
Any idea where I am going wrong and why?
Is it because the tasks need to be cancelled after it throws an exception? I could not successfully implement it.
Inspired by: Solution to wrapping asyncio.gather SO
Your code works; the reason you cannot build res successfully is that the failed tasks do not raise the plain Exception class. When a task is cancelled it raises asyncio.exceptions.CancelledError, which, per the documentation, inherits from BaseException rather than Exception. That change is new as of Python 3.8, and since you are using Python 3.9 it applies here. Changing your code slightly to the following yields:
res = []
for t in tasks:
    try:
        r = t.result()  # gives an error!!!
    except BaseException as e:
        res.append(e)
        continue
    res.append(r)

print(res)
[0.3088946587429545,
0.01323751590501987,
Exception('0.4844375347808497 of Task-3 is too high'),
asyncio.exceptions.CancelledError(),
asyncio.exceptions.CancelledError(),
asyncio.exceptions.CancelledError(),
Exception('0.4419557492849159 of Task-7 is too high'),
0.3113884366691503,
0.07422124156714727,
asyncio.exceptions.CancelledError()]
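As an aside, a sketch that avoids catching BaseException (my own suggestion, not part of the answer above) is to inspect each task explicitly; calling exception() also marks the exception as retrieved, which silences the "Task exception was never retrieved" warning:

res = []
for t in tasks:
    if t.cancelled():
        # cancelled by the timeout / event-loop shutdown
        res.append(asyncio.CancelledError(f"{t.get_name()} was cancelled"))
    elif t.exception() is not None:
        # the coroutine raised; exception() retrieves it without re-raising
        res.append(t.exception())
    else:
        res.append(t.result())

pprint(res)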
I am trying to fetch data from a specific table in Cassandra and insert it into another table in Cassandra after making some changes. Both tables are located in the keyspace "test". Getting the data from the first table works fine. However, in the future handler which handles the output of the first query, the insert into the other table under the same Cassandra instance fails. I am getting an error from the application stating "cassandra.cluster.NoHostAvailable: ("Unable to connect to any servers using keyspace 'test'", ['127.0.0.1'])". I am not sure where I am going wrong.
import threading
from threading import Event
from cassandra.query import SimpleStatement
from cassandra.cluster import Cluster

hosts = ['127.0.0.1']
keyspace = "test"
thread_local = threading.local()
cluster_ = Cluster(hosts)


def get_session():
    if hasattr(thread_local, "cassandra_session"):
        print("got session from threadlocal")
        return thread_local.cassandra_session
    print(" Connecting to Cassandra Host " + str(hosts))
    session_ = cluster_.connect(keyspace)
    print(" Connecting and creating session to Cassandra KeySpace " + keyspace)
    thread_local.cassandra_session = session_
    return session_


class PagedResultHandler(object):

    def __init__(self, future):
        self.error = None
        self.finished_event = Event()
        self.future = future
        self.future.add_callbacks(
            callback=self.handle_page,
            errback=self.handle_error)

    def handle_page(self, rows):
        for row in rows:
            process_row(row)
        if self.future.has_more_pages:
            self.future.start_fetching_next_page()
        else:
            self.finished_event.set()

    def handle_error(self, exc):
        self.error = exc
        self.finished_event.set()


def process_row(row):
    print(row)
    session_ = get_session()
    stmt = session_.prepare(
        "INSERT INTO test.data(customer,snr,rttt, event_time) VALUES (?,?,?,?)")
    results = session_.execute(stmt,
                               [row.customer, row.snr, row.rttt, row.created_time])
    print("Done")


session = get_session()
query = "select * from test.data_log"
statement = SimpleStatement(query, fetch_size=1000)
future = session.execute_async(statement)
handler = PagedResultHandler(future)
handler.finished_event.wait()
if handler.error:
    raise handler.error
cluster_.shutdown()
However, when I execute the Python file, the application throws the error "cassandra.cluster.NoHostAvailable: ("Unable to connect to any servers using keyspace 'test'", ['127.0.0.1'])" from the get_session() call inside the process_row method. Clearly, the first call to Cassandra succeeds without any issues. There is no connectivity issue and the Cassandra instance is running fine locally; I am able to query the data using cqlsh. If I call the process_row method outside the future handler everything works fine, so I am not sure what needs to be done to make it work from the future handler.
Connecting to Cassandra Host ['127.0.0.1']
Connecting and creating session to Cassandra KeySpace test
Row(customer='abcd', snr=100, rttt=121, created_time=datetime.datetime(2020, 8, 8, 2, 26, 51))
Connecting to Cassandra Host ['127.0.0.1']
Traceback (most recent call last):
  File "test/check.py", in <module>
    raise handler.error
  File "cassandra/cluster.py", line 4579, in cassandra.cluster.ResponseFuture._set_result
  File "cassandra/cluster.py", line 4777, in cassandra.cluster.ResponseFuture._set_final_result
  File "test/check.py", in handle_page
    process_row(row)
  File "test/check.py", in process_row
    session_ = get_session()
  File "/test/check.py", in get_session
    session_ = cluster_.connect(keyspace)
  File "cassandra/cluster.py", line 1715, in cassandra.cluster.Cluster.connect
  File "cassandra/cluster.py", line 1772, in cassandra.cluster.Cluster._new_session
  File "cassandra/cluster.py", line 2553, in cassandra.cluster.Session.__init__
cassandra.cluster.NoHostAvailable: ("Unable to connect to any servers using keyspace 'test'", ['127.0.0.1'])
Process finished with exit code 1
Ok so Cassandra recommends the following:
Use at most one Session per keyspace, or use a single Session and explicitly specify the keyspace in your queries
https://www.datastax.com/blog/4-simple-rules-when-using-datastax-drivers-cassandra
In your code you try to create a session every time the read query has retrieved some rows.
To force the code to use at most one session, we can create a queue: the child thread sends each row to the main thread, and the main thread handles it by executing the insert query. We do this in the main thread because I have run into issues executing queries from a child thread.
from queue import Queue, Empty
from cassandra.query import dict_factory

callback_queue = Queue()

session = cluster_.connect(keyspace)
session.row_factory = dict_factory  # because the queue doesn't accept a Row instance


class PagedResultHandler(object):
    ...

    def handle_page(self, rows):
        for row in rows:
            callback_queue.put(row)  # here we pass the row as a dict to the queue
    ...


def process_rows():
    while True:
        try:
            row = callback_queue.get()  # here we retrieve the row as a dict from the child thread
            stmt = session.prepare(
                "INSERT INTO test.data(customer,snr,rttt, event_time) VALUES (?,?,?,?)")
            results = session.execute(stmt,
                                      [row['customer'], row['snr'], row['rttt'], row['created_time']])
            print("Done")
        except Empty:
            pass


query = "select * from test.data_log"
statement = SimpleStatement(query, fetch_size=1000)
future = session.execute_async(statement)
handler = PagedResultHandler(future)
process_rows()  # for now the code will hang here because we have an infinite loop in this function
handler.finished_event.wait()
if handler.error:
    raise handler.error
cluster_.shutdown()
This will get it to work, but I would replace the while True, otherwise you will end up in an infinite loop.
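For example, here is a sketch of a loop that terminates (my own, assuming the handler, session and callback_queue defined above): poll the queue with a timeout and stop once the paged read has finished and the queue has drained.

def process_rows():
    stmt = session.prepare(
        "INSERT INTO test.data(customer,snr,rttt, event_time) VALUES (?,?,?,?)")
    # stop once the read side signalled completion and everything queued was consumed
    while not (handler.finished_event.is_set() and callback_queue.empty()):
        try:
            row = callback_queue.get(timeout=1)  # don't block forever
        except Empty:
            continue
        session.execute(stmt, [row['customer'], row['snr'], row['rttt'], row['created_time']])
        print("Done")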
Ok, so in that case we do two things: multithreading and batch inserting. If we batch insert, parallelism is probably not required, because batching alone speeds things up from the client side fast enough; multithreading wouldn't add much more speed, as this is not a CPU-intensive task.
# additional imports on top of the original script
from cassandra.query import BatchStatement, dict_factory
from cassandra import ConsistencyLevel

session = cluster_.connect(keyspace)
session.row_factory = dict_factory


class Fetcher:
    def __init__(self, session):
        self.session = session
        query = "select * from test.data_log"
        self.statement = SimpleStatement(query, fetch_size=1000)

    def run(self):
        rows = self.session.execute(self.statement)
        temp_rows = []
        total = 0
        for row in rows:
            temp_rows.append(row)
            if len(temp_rows) == 1000:
                handler = PagedResultHandler(self.session, temp_rows)
                handler.start()
                temp_rows = []
        handler = PagedResultHandler(self.session, temp_rows)
        handler.start()

    def handle_error(self, err=None):
        print(err)


class PagedResultHandler(threading.Thread):
    def __init__(self, session, rows):
        super().__init__()
        self.session = session
        self.error = None
        self.rows = rows
        self.finished_event = Event()

    def run(self):
        batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
        stmt = self.session.prepare("INSERT INTO test.data(id, customer,snr,rttt, event_time) VALUES (?,?,?,?,?)")
        for row in self.rows:
            batch.add(stmt, [1, row['customer'], row['snr'], row['rttt'], row['created_time']])
        results = self.session.execute(batch)
        print(results)


Fetcher(session).run()
This script does both batch inserting and multithreading, but again, multithreading seems unnecessary.
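Alternatively (my own suggestion, not part of the answer above), the DataStax driver ships a helper, cassandra.concurrent.execute_concurrent_with_args, that fans out many executions of a single prepared statement with bounded concurrency, so you don't have to manage threads or batches yourself. A minimal sketch assuming the same session and columns as above:

from cassandra.concurrent import execute_concurrent_with_args

insert_stmt = session.prepare(
    "INSERT INTO test.data(customer,snr,rttt, event_time) VALUES (?,?,?,?)")
rows = session.execute(SimpleStatement("select * from test.data_log", fetch_size=1000))
params = [(r['customer'], r['snr'], r['rttt'], r['created_time']) for r in rows]

# the driver keeps up to `concurrency` requests in flight at a time
for success, result_or_exc in execute_concurrent_with_args(
        session, insert_stmt, params, concurrency=50):
    if not success:
        print(result_or_exc)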
I can see similar questions have been asked before, but those use multiprocessing directly rather than executors, so I am unsure how to fix this.
The GitHub issue also says it is resolved in 4.1: https://github.com/celery/celery/issues/1709
I am using
celery==4.1.1
django-celery==3.2.1
django-celery-beat==1.0.1
django-celery-results==1.0.1
My script is as follows; I've tried to cut it down to show only the relevant code.
@asyncio.coroutine
def snmp_get(ip, oid, snmp_user, snmp_auth, snmp_priv):
    results = []
    snmpEngine = SnmpEngine()
    errorIndication, errorStatus, errorIndex, varBinds = yield from getCmd(
        ...
    )
    ...
    for varBind in varBinds:
        results.append(' = '.join([x.prettyPrint() for x in varBind]))
    snmpEngine.transportDispatcher.closeDispatcher()
    return results


def create_link_data_record(link_data):
    obj = LinkData.objects.create(
        ...
    )
    return 'data polled for {} record {} created'.format(link_data.hostname, obj.id)


async def retrieve_data(link, loop):
    from concurrent.futures import ProcessPoolExecutor
    executor = ProcessPoolExecutor(2)

    poll_interval = 60
    results = []

    # credentials:
    ...

    print('polling data for {} on {}'.format(hostname, link_mgmt_ip))

    # create link data obj
    link_data = LinkDataObj()
    ...

    # first poll for speeds
    download_speed_data_poll1 = await snmp_get(link_mgmt_ip, down_speed_oid % link_index, snmp_user, snmp_auth, snmp_priv)

    # check we were able to poll
    if 'timeout' in str(get_snmp_value(download_speed_data_poll1)).lower():
        return 'timeout trying to poll {} - {}'.format(hostname, link_mgmt_ip)

    upload_speed_data_poll1 = await snmp_get(link_mgmt_ip, up_speed_oid % link_index, snmp_user, snmp_auth, snmp_priv)

    # wait for poll interval
    await asyncio.sleep(poll_interval)

    # second poll for speeds
    download_speed_data_poll2 = await snmp_get(link_mgmt_ip, down_speed_oid % link_index, snmp_user, snmp_auth, snmp_priv)
    upload_speed_data_poll2 = await snmp_get(link_mgmt_ip, up_speed_oid % link_index, snmp_user, snmp_auth, snmp_priv)

    # create deltas for speed
    down_delta = int(get_snmp_value(download_speed_data_poll2)) - int(get_snmp_value(download_speed_data_poll1))
    up_delta = int(get_snmp_value(upload_speed_data_poll2)) - int(get_snmp_value(upload_speed_data_poll1))
    ...

    results.append(await loop.run_in_executor(executor, create_link_data_record, link_data))
    return results


def get_link_data():
    link_data = LinkTargets.objects.all()

    # create loop
    loop = asyncio.get_event_loop()
    if asyncio.get_event_loop().is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(asyncio.new_event_loop())

    # create tasks
    tasks = [asyncio.ensure_future(retrieve_data(link, loop)) for link in link_data]
    if tasks:
        start = time.time()
        done, pending = loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
The error below references the run_in_executor code:
[2018-05-24 14:13:00,840: ERROR/ForkPoolWorker-3] Task exception was never retrieved
future: <Task finished coro=<retrieve_data() done, defined at /itapp/itapp/monitoring/jobs/link_monitoring.py:130> exception=AssertionError('daemonic processes are not allowed to have children',)>
Traceback (most recent call last):
File "/itapp/itapp/monitoring/jobs/link_monitoring.py", line 209, in retrieve_data
link_data.last_change = await loop.run_in_executor(executor, timestamp, (link_data.link_target_id, link_data.service_status))
File "/usr/local/lib/python3.6/asyncio/base_events.py", line 639, in run_in_executor
return futures.wrap_future(executor.submit(func, *args), loop=self)
File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 466, in submit
self._start_queue_management_thread()
File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 427, in _start_queue_management_thread
self._adjust_process_count()
File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 446, in _adjust_process_count
p.start()
File "/usr/local/lib/python3.6/multiprocessing/process.py", line 103, in start
'daemonic processes are not allowed to have children'
AssertionError: daemonic processes are not allowed to have children
Try with Celery 5-devel
pip install git+https://github.com/celery/celery#5.0-devel
As per the issue below:
https://github.com/celery/celery/issues/3884
Celery 5.0 will support asyncio. We currently do not support it.
And there is also the SO thread below on the same topic:
How to combine Celery with asyncio?
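In the meantime, a common workaround (my own sketch, not an official Celery recipe) is to avoid spawning child processes inside the daemonic prefork worker altogether, e.g. by handing the blocking ORM write to a ThreadPoolExecutor instead of a ProcessPoolExecutor; create_link_data_record, LinkDataObj and the polling logic are the ones from the question:

from concurrent.futures import ThreadPoolExecutor

# threads are allowed inside a daemonic worker process, child processes are not
executor = ThreadPoolExecutor(max_workers=2)

async def retrieve_data(link, loop):
    results = []
    # ... poll SNMP exactly as in the question ...
    link_data = LinkDataObj()  # same object as in the question
    # same run_in_executor call as before, only the executor type changes
    results.append(await loop.run_in_executor(executor, create_link_data_record, link_data))
    return results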
I'm trying to use Python async with Cassandra to see if I can write records to Cassandra faster than the CQL COPY command.
My python code looks like this:
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement

cluster = Cluster(['1.2.1.4'])
session = cluster.connect('test')

with open('dataImport.txt') as f:
    for line in f:
        query = SimpleStatement(
            "INSERT INTO tstTable (id, accts, info) VALUES (%s)" % (line),
            consistency_level=ConsistencyLevel.ONE)
        session.execute_async(query)
but it's giving me the same performance as the COPY command, around 2,700 rows/sec. Should it be faster with async?
Do I need to use multithreading in Python? I've just been reading about it but am not sure how it fits in here.
EDIT:
So I found something online that I'm trying to modify but can't quite get to work. I have this so far; I also split the file into 3 files in the /Data/toImport/ dir:
import multiprocessing
import time
import os
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement

cluster = Cluster(['1.2.1.4'])
session = cluster.connect('test')

def mp_worker(inputArg):
    with open(inputArg[0]) as f:
        for line in f:
            query = SimpleStatement(
                "INSERT INTO CustInfo (cust_id, accts, offers) values (%s)" % (line),
                consistency_level=ConsistencyLevel.ONE)
            session.execute_async(query)

def mp_handler(inputData, nThreads=8):
    p = multiprocessing.Pool(nThreads)
    p.map(mp_worker, inputData, chunksize=1)
    p.close()
    p.join()

if __name__ == '__main__':
    temp_in_data = file_list
    start = time.time()
    in_dir = '/Data/toImport/'
    N_Proc = 8
    file_data = [(in_dir) for i in temp_in_data]
    print '----------------------------------Start Working!!!!-----------------------------'
    print 'Number of Processes using: %d' % N_Proc

    mp_handler(file_data, N_Proc)

    end = time.time()
    time_elapsed = end - start
    print '----------------------------------All Done!!!!-----------------------------'
    print "Time elapsed: {} seconds".format(time_elapsed)
but get this error:
Traceback (most recent call last):
File "multiCass.py", line 27, in <module>
temp_in_data = file_list
NameError: name 'file_list' is not defined
This post, A Multiprocessing Example for Improved Bulk Data Throughput, provides all the details needed to improve the performance of bulk data ingestion. Basically there are 3 mechanisms, and additional tuning can be done based on your use case and hardware:
single process (that's the case in your example)
multi-processing single queries
multi-processing concurrent queries
Size of batches and concurrency are the variables you'll have to play with yourself.
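As a concrete illustration of the concurrency knob (my own sketch, not from the linked post, and assuming comma-separated lines matching the question's columns), you can cap the number of in-flight execute_async requests per process with a semaphore released from the future's callbacks:

from threading import Semaphore

from cassandra.cluster import Cluster

cluster = Cluster(['1.2.1.4'])
session = cluster.connect('test')

MAX_IN_FLIGHT = 64                    # tune per process / cluster
in_flight = Semaphore(MAX_IN_FLIGHT)

def on_done(_rows):
    in_flight.release()

def on_error(exc):
    print(exc)
    in_flight.release()

prepared = session.prepare("INSERT INTO tstTable (id, accts, info) VALUES (?, ?, ?)")

with open('dataImport.txt') as f:
    for line in f:
        in_flight.acquire()           # blocks once MAX_IN_FLIGHT requests are pending
        future = session.execute_async(prepared, line.strip().split(','))
        future.add_callbacks(on_done, on_error)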
Got it working like this:
import multiprocessing
import time
import os
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement

def mp_worker(inputArg):
    cluster = Cluster(['1.2.1.4'])
    session = cluster.connect('poc')
    with open(inputArg[0]) as f:
        for line in f:
            query = SimpleStatement(
                "INSERT INTO testTable (cust_id, accts, offers) values (%s)" % (line),
                consistency_level=ConsistencyLevel.ONE)
            session.execute_async(query)

def mp_handler(inputData, nThreads=8):
    p = multiprocessing.Pool(nThreads)
    p.map(mp_worker, inputData, chunksize=1)
    p.close()
    p.join()

if __name__ == '__main__':
    temp_in_data = ['/toImport/part-00000', '/toImport/part-00001', '/toImport/part-00002']
    start = time.time()
    N_Proc = 3
    file_data = [(i,) for i in temp_in_data]
    print '----------------------------------Start Working!!!!-----------------------------'
    print 'Number of Processes using: %d' % N_Proc

    mp_handler(file_data, N_Proc)

    end = time.time()
    time_elapsed = end - start
    print '----------------------------------All Done!!!!-----------------------------'
    print "Time elapsed: {} seconds".format(time_elapsed)