I have around 500 CSV files in an S3 bucket.
Note: each file is around 1 GB, roughly 6 million lines.
What I'm trying to do is concatenate all of those CSV files into a single file.
I'm trying to speed up the process, but I'm not sure what else I can do here.
Below is my code:
import trio
import boto3
import pandas as pd
from functools import partial

AWS_ID = 'Hidden'
AWS_SECRET = 'Hidden'
Bucket_Name = 'Hidden'

limiter = trio.CapacityLimiter(10)

async def read_object(bucket, object_csv, sender):
    async with limiter, sender:
        print(f'Reading {object_csv}')
        test = bucket.Object(object_csv)
        test = test.get()['Body']
        data = await trio.to_thread.run_sync(partial(pd.read_csv, test, header=None))
        await sender.send(data)

async def main():
    async with trio.open_nursery() as nurse:
        s3 = boto3.resource(
            service_name='s3',
            aws_access_key_id=AWS_ID,
            aws_secret_access_key=AWS_SECRET,
        )
        bucket = s3.Bucket(Bucket_Name)
        allfiles = [i.key for i in bucket.objects.all()]
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        async with sender:
            for f in allfiles:
                nurse.start_soon(read_object, bucket, f, sender.clone())

async def rec(receiver):
    alldf = []
    async with receiver:
        async for df in receiver:
            alldf.append(df)
    final = pd.concat(alldf, ignore_index=True)
    print(final)

if __name__ == "__main__":
    try:
        trio.run(main)
    except KeyboardInterrupt:
        exit('Job Cancelled!')
The part which is taking time:

data = await trio.to_thread.run_sync(partial(pd.read_csv, test, header=None))

For a single file it takes about 2 minutes, and even though I'm already running the operation in threads, the whole job still takes very long.
Update: the new code is below:
limiter = trio.CapacityLimiter(10)

async def read_object(bucket, object_csv, sender):
    async with limiter, sender:
        print(f'Reading {object_csv}')
        test = bucket.Object(object_csv)
        test = test.get()['Body']
        data = await trio.to_thread.run_sync(test.read)
        await sender.send(data)
        print(f'Done Reading {object_csv}')

async def main():
    async with trio.open_nursery() as nurse:
        s3 = boto3.resource(
            service_name='s3',
            aws_access_key_id=AWS_ID,
            aws_secret_access_key=AWS_SECRET,
        )
        bucket = s3.Bucket(Bucket_Name)
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        async with sender:
            for csv in bucket.objects.all():
                nurse.start_soon(read_object, bucket, csv.key, sender.clone())

async def rec(receiver):
    async with receiver, await trio.open_file('output.csv', 'wb') as f:
        count = 0
        async for df in receiver:
            count += 1
            await f.write(df)
            await f.write(b"\n")
            print(f'Collected {count}', flush=True, end='\r')

if __name__ == "__main__":
    try:
        trio.run(main)
    except KeyboardInterrupt:
        exit('Job Cancelled!')
Based on that code, is it possible to speed up the process any further?
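One idea worth sketching (hedged; this is not from the code above, and concat_on_s3 / combined.csv are names I made up for illustration): since the files have no headers and the goal is pure byte concatenation, S3 can join the objects server-side with a multipart upload whose parts are copied from the source objects, so nothing is downloaded at all. This assumes every part except the last is at least 5 MiB and there are at most 10,000 parts (~1 GB x 500 files satisfies both), and that each source file already ends with a newline.

import boto3

s3c = boto3.client('s3', aws_access_key_id=AWS_ID, aws_secret_access_key=AWS_SECRET)

def concat_on_s3(keys, dest_key='combined.csv'):
    # start a multipart upload for the destination object
    mpu = s3c.create_multipart_upload(Bucket=Bucket_Name, Key=dest_key)
    parts = []
    for n, key in enumerate(keys, start=1):
        # server-side copy: each source object becomes one part, no download/upload
        resp = s3c.upload_part_copy(
            Bucket=Bucket_Name, Key=dest_key, UploadId=mpu['UploadId'],
            CopySource={'Bucket': Bucket_Name, 'Key': key}, PartNumber=n,
        )
        parts.append({'ETag': resp['CopyPartResult']['ETag'], 'PartNumber': n})
    # stitch the parts into the final object
    s3c.complete_multipart_upload(
        Bucket=Bucket_Name, Key=dest_key, UploadId=mpu['UploadId'],
        MultipartUpload={'Parts': parts},
    )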
How would I add/remove sockets dynamically in the with/as statement? Or would I need a completely different approach, like asyncio.create_task?
import asyncio
import config  # assumed: local module holding the API keys
from binance import AsyncClient, BinanceSocketManager

api_key = config.binance_key
api_secret = config.binance_secret

async def main():
    client = await AsyncClient.create()
    bm = BinanceSocketManager(client)
    # start any sockets here, e.g. a trade socket
    ts1 = bm.symbol_ticker_socket('BNBBTC')
    ts2 = bm.symbol_ticker_socket('ETHBUSD')
    # then start receiving messages
    async with ts1 as tscm1, ts2 as tscm2:
        while True:
            res1 = await tscm1.recv()
            res2 = await tscm2.recv()
            print(res1)
            print(res2)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
Use AsyncExitStack to combine and handle multiple asynchronous context managers.
An auxiliary coroutine read_ticker is used to read data from a ticker socket connection.
import asyncio
import config  # assumed: local module holding the API keys
from binance import AsyncClient, BinanceSocketManager
from contextlib import AsyncExitStack

api_key = config.binance_key
api_secret = config.binance_secret

async def read_ticker(ts_cm):
    """Read ticker data from ticker socket connection"""
    while True:
        res = await ts_cm.recv()
        print(res)

async def main():
    client = await AsyncClient.create()
    bm = BinanceSocketManager(client)
    # start any sockets here, e.g. a trade socket
    ts1 = bm.symbol_ticker_socket('BNBBTC')
    ts2 = bm.symbol_ticker_socket('ETHBUSD')
    async with AsyncExitStack() as stack:
        await asyncio.gather(*[read_ticker(await stack.enter_async_context(ts))
                               for ts in (ts1, ts2)])

if __name__ == "__main__":
    asyncio.run(main())
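To get the dynamic add/remove the question asks about, here is a hedged sketch (add_symbol is an illustrative helper of mine, not part of python-binance): enter each new socket on the same AsyncExitStack and wrap its reader in its own task; cancelling the task stops reading, and the socket context is closed when the stack exits.

async def add_symbol(stack, bm, symbol):
    # enter the new socket's context on the shared stack
    ts_cm = await stack.enter_async_context(bm.symbol_ticker_socket(symbol))
    # run its reader concurrently as an independent task
    return asyncio.create_task(read_ticker(ts_cm))

# inside main(), after the stack is opened:
#     task = await add_symbol(stack, bm, 'BTCUSDT')
#     ...
#     task.cancel()  # "remove" the socket's reader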
I am executing the code below on a Windows PC. I read that, by default, Windows can use only 64 sockets in an asyncio loop. I don't know if this is the reason for the error.
import aiohttp
import asyncio
import time

async def download_file(url):
    print(f'started downloading {url}')
    connector = aiohttp.TCPConnector(limit=60)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url) as resp:
            content = await resp.read()
            print(f'Finished download {url}')
            return content

async def write_file(n, content):
    filename = f'async_{n}.html'
    with open(filename, 'wb') as f:
        print(f'started writing {filename}')
        f.write(content)
        print(f'Finished writing {filename}')

async def scrape_task(n, url):
    content = await download_file(url)
    await write_file(n, content)

async def main():
    tasks = []
    for n, url in enumerate(open('urls.txt').readlines()):
        tasks.append(scrape_task(n, url))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    t = time.perf_counter()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    t2 = time.perf_counter() - t
    print(f'Total time taken: {t2:0.2f} seconds')
I made the changes below to limit the connections to 60:

connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.ClientSession(connector=connector) as session:

I can't figure out where I am going wrong.
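For comparison, here is a hedged sketch of one common restructuring (my suggestion, not the asker's code): build a single shared ClientSession so one TCPConnector limit actually caps all concurrent connections, instead of creating a fresh connector and session per download.

async def download_file(session, url):
    # reuse the shared session; the connector's limit now applies globally
    async with session.get(url) as resp:
        return await resp.read()

async def main():
    connector = aiohttp.TCPConnector(limit=60)
    async with aiohttp.ClientSession(connector=connector) as session:
        urls = [u.strip() for u in open('urls.txt')]
        contents = await asyncio.gather(*(download_file(session, u) for u in urls))
        for n, content in enumerate(contents):
            await write_file(n, content)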
I am struggling to get both my websocket script and my I/O serial script running together in one script.
Just some basic info before I continue:
I am using a Windows PC (I have no access to a Linux PC).
This is the reason why I am using the AIOSerial library instead of pyserial-asyncio.
I have no "super" experience with asyncio, so be kind please :)
Here is my "old" websocket script:
from aiohttp import web
import socketio
import aiohttp_cors
import asyncio
import random

# creates a new Async Socket IO Server
sio = socketio.AsyncServer()
# Creates a new aiohttp web application
app = web.Application()
sio.attach(app)

server_is_responding = "Message from the server:"
the_response = "Hello there!"

async def index(request):
    with open('index.html') as f:
        print("Somebody entered the server from the browser!")
        return web.Response(text=f.read(), content_type='text/html')

@sio.on("android-device")
async def message(sid, data):
    print("message: ", data)
    # return send_message_to_client()

@sio.on('sendTextToServer')
async def message(sid, data):
    print("message: ", data)
    if data == "hei":
        await sio.emit("ServerMessage", {"hehe"})
    if data == "lol":
        await sio.emit("ServerMessage", {"Message from server:": "hehe, funny right?.."})
    else:
        await sio.emit("ServerMessage", {"Message from server:": "Hello There!"})

# We bind our aiohttp endpoint to our app router
cors = aiohttp_cors.setup(app)
app.router.add_get('/', index)

# We kick off our server
if __name__ == '__main__':
    web.run_app(app)
And here is my I/O serial script (which works and reads the data) that I am trying to use with some of the websocket functions above:
import asyncio
import websockets
import socketio
import aiohttp_cors
import logging
from AIOExtensions.AIOSerial import (AIOSerial, AIOSerialClosedException,
                                     AIOSerialErrorException, AIOSerialNotOpenException)

logging.basicConfig(level=logging.DEBUG)

sio = socketio.AsyncServer()

async def hello(websocket, path):
    name = await websocket.recv()
    print(f"< {name}")
    greeting = f"Hello {name}!"
    await websocket.send(greeting)
    print(f"> {greeting}")

@sio.on("android-device")
async def message(sid, data):
    print("message: ", data)

async def read_IO_serial():
    try:
        async with AIOSerial('COM8', baudrate=115200, line_mode=True) as aios:
            await asyncio.sleep(100)
            try:
                while True:
                    # read with timeout
                    rcvd = await asyncio.wait_for(aios.read(), timeout=1.0)
                    # print the data received
                    print(f"data received: {rcvd}")
                    if rcvd == b'RF initialized\n':
                        print("CAUGHT THIS LINE!")
            except asyncio.TimeoutError:
                print("reception timed out ;-(")
    except AIOSerialNotOpenException:
        print("Unable to open the port!")
        print()
        print("Have you specified the right port number? COM7? COM8?")
    # port fatal error
    except AIOSerialErrorException:
        print("Port error!")
    # port already closed
    except AIOSerialClosedException:
        print("Serial port is closed!")

start_server = websockets.serve(hello, "http://192.168.1.6", 8080)
# sio.attach(start_server)  # HOW CAN I ATTACH THIS SO IT CAN BE USED WITH THE SIO FUNCTIONS BELOW?

if start_server:
    print("Server started!")

asyncio.run(read_IO_serial())
asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()
As you can see in my first simple websocket script, I could use sio.attach(app), which made it possible to listen to events from the client, so I need a way of replacing this "app" in my current script.
Can someone please help me with this?
I solved it using asyncio.gather(); this is how I did it:
from aiohttp import web
import socketio
import aiohttp_cors
import asyncio
import random
import asyncio as aio
import logging
import sys

# creates a new Async Socket IO Server
sio = socketio.AsyncServer()
# Creates a new aiohttp web application
app = web.Application()
sio.attach(app)

server_is_responding = "Message from the server:"
the_response = "Hello there!"

async def index(request):
    with open('index.html') as f:
        print("Somebody entered the server from the browser!")
        return web.Response(text=f.read(), content_type='text/html')

@sio.event
async def join(sid, message):
    sio.enter_room(sid, message['room'])
    await sio.emit('my_response', {'data': 'Entered room: ' + message['room']}, room=sid)

@sio.on("android-device")
async def message(sid, data):
    print("message: ", data)

@sio.on("receiveMessageFromServer")
async def message(sid, data):
    print("message: ", data)
    # await asyncio.sleep(1 * random.random())
    return "OKKKK", 123

from AIOExtensions.AIOSerial import (AIOSerial, AIOSerialClosedException,
                                     AIOSerialErrorException, AIOSerialNotOpenException)

logging.basicConfig(level=logging.DEBUG)

async def read_IO_serial():
    try:
        async with AIOSerial('COM8', baudrate=115200, line_mode=True) as aios:
            # aios.sp.baudrate = 230400
            # aios.sp.baudrate = 115200
            # await aios.write(b"AT\r\n")
            # await aios.read()
            # await aios.close()
            await aio.sleep(100)
            try:
                while True:
                    # read with timeout
                    rcvd = await aio.wait_for(aios.read(), timeout=1.0)
                    # print the data received
                    print(f"data received: {rcvd}")
                    if rcvd == b'RF initialized\n':
                        print("CAUGHT THIS LINE!")
            except aio.TimeoutError:
                print("reception timed out ;-(")
    except AIOSerialNotOpenException:
        print("Unable to open the port!")
        print()
        print("Have you specified the right port number? COM7? COM8?")
    # port fatal error
    except AIOSerialErrorException:
        print("Port error!")
    # port already closed
    except AIOSerialClosedException:
        print("Serial port is closed!")

async def on_startup(app):
    pass

cors = aiohttp_cors.setup(app)
app.router.add_get('/', index)

# We kick off our server
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    group2 = asyncio.gather(read_IO_serial())
    group1 = asyncio.gather(web.run_app(app))
    all_groups = asyncio.gather(group1, group2)
    results = loop.run_until_complete(all_groups)
    # loop.close()
    # print(results)
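An alternative worth noting (a hedged sketch assuming the same app and read_IO_serial as above, not what the poster ran): aiohttp's AppRunner starts the web app as a coroutine, so both pieces can run on one explicit event loop without relying on the blocking web.run_app call.

async def run_all():
    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, '0.0.0.0', 8080)
    await site.start()       # serves in the background on this loop
    await read_IO_serial()   # runs concurrently with the web app

if __name__ == '__main__':
    asyncio.run(run_all())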
I am writing a simple producer/consumer app to call multiple URLs asynchronously.
In the following code, if I set conn_count=1 and add 2 items to the queue, it works fine, as only one consumer is created. But if I make conn_count=2 and add 4 items to the queue, only 3 requests are made. The fourth request fails with ClientConnectorError.
Can you please help me debug the reason for the failure with multiple consumers? Thank you.
I am using an echo server I created.
Server:
import os
import logging.config
import yaml
from aiohttp import web
import json

def start():
    setup_logging()
    app = web.Application()
    app.router.add_get('/', do_get)
    app.router.add_post('/', do_post)
    web.run_app(app)

async def do_get(request):
    return web.Response(text='hello')

async def do_post(request):
    data = await request.json()
    return web.Response(text=json.dumps(data))

def setup_logging(
    default_path='logging.yaml',
    default_level=logging.INFO,
    env_key='LOG_CFG'
):
    path = default_path
    value = os.getenv(env_key, None)
    if value:
        path = value
    if os.path.exists(path):
        with open(path, 'rt') as f:
            config = yaml.safe_load(f.read())
        logging.config.dictConfig(config)
    else:
        logging.basicConfig(level=default_level)

if __name__ == '__main__':
    start()
Client:
import asyncio
import collections
import json
import sys
import async_timeout
from aiohttp import ClientSession, TCPConnector

MAX_CONNECTIONS = 100
URL = 'http://localhost:8080'
InventoryAccount = collections.namedtuple("InventoryAccount", "op_co customer_id")

async def produce(queue, num_consumers):
    for i in range(num_consumers * 2):
        await queue.put(InventoryAccount(op_co=i, customer_id=i * 100))
    for j in range(num_consumers):
        await queue.put(None)

async def consumer(n, queue, session, responses):
    print('consumer {}: starting'.format(n))
    while True:
        try:
            account = await queue.get()
            if account is None:
                queue.task_done()
                break
            else:
                print(f"Consumer {n}, Updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}")
                params = {'opCo': account.op_co, 'customerId': account.customer_id}
                headers = {'content-type': 'application/json'}
                with async_timeout.timeout(10):
                    print(f"Consumer {n}, session state " + str(session.closed))
                    async with session.post(URL,
                                            headers=headers,
                                            data=json.dumps(params)) as response:
                        assert response.status == 200
                        responses.append(await response.text())
                        queue.task_done()
        except:
            e = sys.exc_info()[0]
            print(f"Consumer {n}, Error updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}. {e}")
            queue.task_done()
    print('consumer {}: ending'.format(n))

async def start(loop, session, num_consumers):
    queue = asyncio.Queue(maxsize=num_consumers)
    responses = []
    consumers = [asyncio.ensure_future(loop=loop, coro_or_future=consumer(i, queue, session, responses)) for i in range(num_consumers)]
    await produce(queue, num_consumers)
    await queue.join()
    for consumer_future in consumers:
        consumer_future.cancel()
    return responses

async def run(loop, conn_count):
    async with ClientSession(loop=loop, connector=TCPConnector(verify_ssl=False, limit=conn_count)) as session:
        result = await start(loop, session, conn_count)
        print("Result: " + str(result))

if __name__ == '__main__':
    conn_count = 2
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(run(loop, conn_count))
    finally:
        loop.close()
Reference:
https://pymotw.com/3/asyncio/synchronization.html
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
https://hackernoon.com/asyncio-for-the-working-python-developer-5c468e6e2e8e
I have an asynchronous API which I'm using to connect and send mail to an SMTP server. It has some setup and teardown, so it fits nicely into a context manager from Python 3's contextlib.
Though, I don't know if it's possible to write one, because context managers and async functions both use generator syntax.
This might demonstrate the problem (it contains a mix of yield-based and async/await syntax, to demonstrate the difference between async calls and yields to the context manager):
@contextmanager
async def smtp_connection():
    client = SMTPAsync()
    ...

    try:
        await client.connect(smtp_url, smtp_port)
        await client.starttls()
        await client.login(smtp_username, smtp_password)
        yield client
    finally:
        await client.quit()
Is this kind of thing currently possible in Python, and how would I use it in a with/as statement if it is? If not, is there an alternative way I could achieve this, maybe using an old-style context manager?
Since Python 3.7, you can write:
from contextlib import asynccontextmanager

@asynccontextmanager
async def smtp_connection():
    client = SMTPAsync()
    ...

    try:
        await client.connect(smtp_url, smtp_port)
        await client.starttls()
        await client.login(smtp_username, smtp_password)
        yield client
    finally:
        await client.quit()
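Usage then follows the normal async with pattern (a brief sketch; the sendmail arguments depend on your SMTP client):

async def send_mail():
    async with smtp_connection() as client:
        await client.sendmail(...)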
Before 3.7, you can use the async_generator package for this. On 3.6, you can write:
# This import changed, everything else is the same
from async_generator import asynccontextmanager

@asynccontextmanager
async def smtp_connection():
    client = SMTPAsync()
    ...

    try:
        await client.connect(smtp_url, smtp_port)
        await client.starttls()
        await client.login(smtp_username, smtp_password)
        yield client
    finally:
        await client.quit()
And if you want to work all the way back to 3.5, you can write:
# This import changed again:
from async_generator import asynccontextmanager, async_generator, yield_

@asynccontextmanager
@async_generator  # <-- added this
async def smtp_connection():
    client = SMTPAsync()
    ...

    try:
        await client.connect(smtp_url, smtp_port)
        await client.starttls()
        await client.login(smtp_username, smtp_password)
        await yield_(client)  # <-- this line changed
    finally:
        await client.quit()
Thanks to @jonrsharpe I was able to make an async context manager.
Here's what mine ended up looking like, for anyone who wants some example code:
class SMTPConnection():
    def __init__(self, url, port, username, password):
        self.client = SMTPAsync()
        self.url = url
        self.port = port
        self.username = username
        self.password = password

    async def __aenter__(self):
        await self.client.connect(self.url, self.port)
        await self.client.starttls()
        await self.client.login(self.username, self.password)
        return self.client

    async def __aexit__(self, exc_type, exc, tb):
        await self.client.quit()
usage:
async with SMTPConnection(url, port, username, password) as client:
    await client.sendmail(...)
Feel free to point out if I've done anything stupid.
The asyncio_extras package has a nice solution for this:
import asyncio_extras

@asyncio_extras.async_contextmanager
async def smtp_connection():
    client = SMTPAsync()
    ...
For Python < 3.6, you'd also need the async_generator package and replace yield client with await yield_(client).
I find that you need to call obj.__aenter__(...) in the try block and obj.__aexit__(...) in the finally block. Perhaps you do too, if all you want is to abstract an overly complicated object that holds resources.
e.g.
import asyncio
from contextlib import asynccontextmanager

from pycoq.common import CoqContext, LocalKernelConfig
from pycoq.serapi import CoqSerapi

from pdb import set_trace as st

@asynccontextmanager
async def get_coq_serapi(coq_ctxt: CoqContext) -> CoqSerapi:
    """
    Returns a CoqSerapi instance that is closed with a with statement.
    The CoqContext for the file is also returned, since it can be used to manipulate the coq file, e.g. to return
    the coq statements as in `for stmt in pycoq.split.coq_stmts_of_context(coq_ctxt):`.

    example use:
    ```
    filenames = pycoq.opam.opam_strace_build(coq_package, coq_package_pin)
    filename: str
    for filename in filenames:
        with get_coq_serapi(filename) as coq, coq_ctxt:
            for stmt in pycoq.split.coq_stmts_of_context(coq_ctxt):
    ```

    ref:
    - https://stackoverflow.com/questions/37433157/asynchronous-context-manager
    - https://stackoverflow.com/questions/3693771/understanding-the-python-with-statement-and-context-managers

    Details:
    Meant to replace (see Brando's pycoq tutorial):
    ```
    async with aiofile.AIOFile(filename, 'rb') as fin:
        coq_ctxt = pycoq.common.load_context(filename)
        cfg = opam.opam_serapi_cfg(coq_ctxt)
        logfname = pycoq.common.serapi_log_fname(os.path.join(coq_ctxt.pwd, coq_ctxt.target))
        async with pycoq.serapi.CoqSerapi(cfg, logfname=logfname) as coq:
    ```
    usually then you loop through the coq stmts e.g.
    ```
    for stmt in pycoq.split.coq_stmts_of_context(coq_ctxt):
    ```
    """
    try:
        import pycoq
        from pycoq import opam
        from pycoq.common import LocalKernelConfig
        import os

        # - note you can't return the coq_ctxt here, so don't create it, due to how context managers work, even if it's needed later for e.g. `stmt in pycoq.split.coq_stmts_of_context(coq_ctxt)`
        # _coq_ctxt: CoqContext = pycoq.common.load_context(coq_filepath)
        # - not returned since it seems it's only needed to start the coq-serapi interface
        cfg: LocalKernelConfig = opam.opam_serapi_cfg(coq_ctxt)
        logfname = pycoq.common.serapi_log_fname(os.path.join(coq_ctxt.pwd, coq_ctxt.target))
        # - needed to be returned to talk to coq
        coq: CoqSerapi = pycoq.serapi.CoqSerapi(cfg, logfname=logfname)
        # - crucial, or coq._kernel is None and .execute won't work
        await coq.__aenter__()  # calls self.start(); must be called explicitly here since we yield instead of using a with statement
        yield coq
    except Exception as e:
        # fin.close()
        # coq.close()
        import traceback
        await coq.__aexit__(Exception, e, traceback.format_exc())
        # coq_ctxt is just a data class, so no need to close it, see: https://github.com/brando90/pycoq/blob/main/pycoq/common.py#L32
    finally:
        import traceback
        err_msg: str = 'Finally exception clause'
        exception_type, exception_value = Exception('Finally exception clause'), ValueError(err_msg)
        print(f'{traceback.format_exc()=}')
        await coq.__aexit__(exception_type, exception_value, traceback.format_exc())
        # coq_ctxt is just a data class, so no need to close it, see: https://github.com/brando90/pycoq/blob/main/pycoq/common.py#L32

# -

async def loop_through_files_original():
    ''' '''
    import os
    import aiofile
    import pycoq
    from pycoq import opam

    coq_package = 'lf'
    from pycoq.test.test_autoagent import with_prefix
    coq_package_pin = f"file://{with_prefix('lf')}"
    print(f'{coq_package=}')
    print(f'{coq_package_pin=}')
    print(f'{coq_package_pin=}')

    filenames: list[str] = pycoq.opam.opam_strace_build(coq_package, coq_package_pin)
    filename: str
    for filename in filenames:
        print(f'-> {filename=}')
        async with aiofile.AIOFile(filename, 'rb') as fin:
            coq_ctxt: CoqContext = pycoq.common.load_context(filename)
            cfg: LocalKernelConfig = opam.opam_serapi_cfg(coq_ctxt)
            logfname = pycoq.common.serapi_log_fname(os.path.join(coq_ctxt.pwd, coq_ctxt.target))
            async with pycoq.serapi.CoqSerapi(cfg, logfname=logfname) as coq:
                print(f'{coq._kernel=}')
                for stmt in pycoq.split.coq_stmts_of_context(coq_ctxt):
                    print(f'--> {stmt=}')
                    _, _, coq_exc, _ = await coq.execute(stmt)
                    if coq_exc:
                        raise Exception(coq_exc)

async def loop_through_files():
    """
    to test run in linux:
    ```
    python ~pycoq/pycoq/utils.py
    python -m pdb -c continue ~/pycoq/pycoq/utils.py
    ```
    """
    import pycoq

    coq_package = 'lf'
    from pycoq.test.test_autoagent import with_prefix
    coq_package_pin = f"file://{with_prefix('lf')}"
    print(f'{coq_package=}')
    print(f'{coq_package_pin=}')
    print(f'{coq_package_pin=}')

    filenames: list[str] = pycoq.opam.opam_strace_build(coq_package, coq_package_pin)
    filename: str
    for filename in filenames:
        print(f'-> {filename=}')
        coq_ctxt: CoqContext = pycoq.common.load_context(filename)
        async with get_coq_serapi(coq_ctxt) as coq:
            print(f'{coq=}')
            print(f'{coq._kernel=}')
            stmt: str
            for stmt in pycoq.split.coq_stmts_of_context(coq_ctxt):
                print(f'--> {stmt=}')
                _, _, coq_exc, _ = await coq.execute(stmt)
                if coq_exc:
                    raise Exception(coq_exc)

if __name__ == '__main__':
    asyncio.run(loop_through_files_original())
    asyncio.run(loop_through_files())
    print('Done!\a\n')
see code: https://github.com/brando90/pycoq/blob/main/pycoq/utils.py