I am trying to download the data for several hundred stocks using pandas_datareader's .get_data_yahoo function. To speed up the process I want to use multithreading with Python's concurrent.futures. A stripped-down version of my code, which tries to download the stocks contained in the German DAX, is shown below.
from pandas_datareader import data as pdr
from pytickersymbols import PyTickerSymbols
import concurrent.futures
import yfinance as yf
import datetime
import os
from time import sleep
yf.pdr_override()
def download_stockdata(ticker):
    print(f"Downloading {ticker} \n")
    df = pdr.get_data_yahoo(ticker, datetime.datetime.now() - datetime.timedelta(days=365), datetime.date.today())
    print(f"{ticker} downloaded \n")
    return df

if __name__ == '__main__':
    tickers = []
    index_to_scan = "DAX"
    for element in list(PyTickerSymbols().get_stocks_by_index(index_to_scan)):
        if element["symbols"]:
            tickers.append(element.get("symbols")[0].get("yahoo"))

    print(f"Symbols in {index_to_scan}: {tickers} \n")
    print("Starting multi thread download")
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        for ticker in tickers:
            future = executor.submit(download_stockdata, ticker)
            futures.append(future)
        futures, _ = concurrent.futures.wait(futures)
        for future in futures:
            print(future.result())
When running this code I get the following error:
KeyError Traceback (most recent call last)
<ipython-input-1-2e4c65895072> in <module>
36 futures, _ = concurrent.futures.wait(futures)
37 for future in futures:
---> 38 print(future.result())
~\anaconda3\envs\Trading\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433
434 self._condition.wait(timeout)
~\anaconda3\envs\Trading\lib\concurrent\futures\_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
~\anaconda3\envs\Trading\lib\concurrent\futures\thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
<ipython-input-1-2e4c65895072> in download_stockdata(ticker)
11 def download_stockdata(ticker):
12 print(f"Downloading {ticker} \n")
---> 13 df = pdr.get_data_yahoo(ticker, datetime.datetime.now() - datetime.timedelta(days=365), datetime.date.today())
14 print(f"{ticker} downloaded \n")
15 return df
~\anaconda3\envs\Trading\lib\site-packages\yfinance\multi.py in download(tickers, start, end, actions, threads, group_by, auto_adjust, back_adjust, progress, period, interval, prepost, proxy, rounding, **kwargs)
117
118 if len(tickers) == 1:
--> 119 return shared._DFS[tickers[0]]
120
121 try:
KeyError: 'BMW.F'
I tried different ways of multithreading, such as threading.Thread(), the ThreadPool from multiprocessing.pool and concurrent.futures. All methods result in a KeyError whose key is not always the same but varies from run to run. From this point on I have no more ideas how I could handle the error. Can someone help me solve the KeyError?
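For reference, a rough sketch of a possible workaround that avoids calling pdr.get_data_yahoo from my own threads at all: hand the whole ticker list to yfinance in one call and let it do the threading internally. This assumes a reasonably recent yfinance where yf.download accepts group_by and threads; download_all is just an illustrative helper name:
import datetime
import yfinance as yf

def download_all(tickers):
    start = datetime.datetime.now() - datetime.timedelta(days=365)
    # One combined request; threads=True lets yfinance parallelize internally,
    # group_by="ticker" keys the resulting columns by symbol.
    data = yf.download(tickers, start=start, end=datetime.date.today(),
                       group_by="ticker", threads=True)
    # Split the combined frame back into one DataFrame per ticker.
    return {ticker: data[ticker] for ticker in tickers}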
I'm new to the asyncio module. Until recently I used requests for the task I'm about to describe.
I'm trying to scale a script that works well with up to 120 calls using requests. However, with requests being synchronous, it would take forever to make 1000 API calls (which is what I'm trying to achieve). This is where I found asyncio, which makes asynchronous requests.
This is a script I put together for asyncio with the help of this article.
import asyncio
import aiohttp
from aiohttp import ClientTimeout

url = 'https://api.url.com/search?api_key=api_key&api_params=multiple_params'
queries = ['online slots', 'metaverse', 'sports betting', 'basketball odds', 'soccer odds', 'online poker', 'best casinos in germany', 'barbecue grills', 'outdoor pizza ovens']
results = []

def get_tasks(session):
    tasks = []
    for q in queries:
        tasks.append(asyncio.create_task(session.get(url.format(q), ssl=False)))
    return tasks

timeout = ClientTimeout(total=500)

async def get_queries():
    async with aiohttp.ClientSession(timeout=timeout) as session:
        tasks = get_tasks(session)
        responses = await asyncio.gather(*tasks)
        for response in responses:
            results.append(await response.json())

asyncio.run(get_queries())
It seems to work fine in most instances, but it times out on many occasions, particularly when I'm using the German queries and when it is making more than 500 API calls.
Below is what I keep getting back. As you can see in the script, I've added a client timeout.
---------------------------------------------------------------------------
TimeoutError Traceback (most recent call last)
<ipython-input-4-8c48df090394> in <module>
33 results.append(await response.json())
34
---> 35 asyncio.run(get_queries())
/opt/anaconda3/lib/python3.8/site-packages/nest_asyncio.py in run(future, debug)
30 loop = asyncio.get_event_loop()
31 loop.set_debug(debug)
---> 32 return loop.run_until_complete(future)
33
34 if sys.version_info >= (3, 6, 0):
/opt/anaconda3/lib/python3.8/site-packages/nest_asyncio.py in run_until_complete(self, future)
68 raise RuntimeError(
69 'Event loop stopped before Future completed.')
---> 70 return f.result()
71
72 def _run_once(self):
/opt/anaconda3/lib/python3.8/asyncio/futures.py in result(self)
176 self.__log_traceback = False
177 if self._exception is not None:
--> 178 raise self._exception
179 return self._result
180
/opt/anaconda3/lib/python3.8/asyncio/tasks.py in __step(***failed resolving arguments***)
278 # We use the `send` method directly, because coroutines
279 # don't have `__iter__` and `__next__` methods.
--> 280 result = coro.send(None)
281 else:
282 result = coro.throw(exc)
<ipython-input-4-8c48df090394> in get_queries()
29 async with aiohttp.ClientSession(timeout=timeout) as session:
30 tasks = get_tasks(session)
---> 31 responses = await asyncio.gather(*tasks)
32 for response in responses:
33 results.append(await response.json())
/opt/anaconda3/lib/python3.8/asyncio/tasks.py in __wakeup(self, future)
347 def __wakeup(self, future):
348 try:
--> 349 future.result()
350 except BaseException as exc:
351 # This may also be a cancellation.
/opt/anaconda3/lib/python3.8/asyncio/tasks.py in __step(***failed resolving arguments***)
280 result = coro.send(None)
281 else:
--> 282 result = coro.throw(exc)
283 except StopIteration as exc:
284 if self._must_cancel:
/opt/anaconda3/lib/python3.8/site-packages/aiohttp/client.py in throw(self, arg)
1123
1124 def throw(self, arg: BaseException) -> None: # type: ignore[arg-type,override]
-> 1125 self._coro.throw(arg)
1126
1127 def close(self) -> None:
/opt/anaconda3/lib/python3.8/site-packages/aiohttp/client.py in _request(self, method, str_or_url, params, data, json, cookies, headers, skip_auto_headers, auth, allow_redirects, max_redirects, compress, chunked, expect100, raise_for_status, read_until_eof, proxy, proxy_auth, timeout, verify_ssl, fingerprint, ssl_context, ssl, proxy_headers, trace_request_ctx, read_bufsize)
557 resp = await req.send(conn)
558 try:
--> 559 await resp.start(conn)
560 except BaseException:
561 resp.close()
/opt/anaconda3/lib/python3.8/site-packages/aiohttp/client_reqrep.py in start(self, connection)
911 if self._continue is not None:
912 set_result(self._continue, True)
--> 913 self._continue = None
914
915 # payload eof handler
/opt/anaconda3/lib/python3.8/site-packages/aiohttp/helpers.py in __exit__(self, exc_type, exc_val, exc_tb)
719
720 if exc_type is asyncio.CancelledError and self._cancelled:
--> 721 raise asyncio.TimeoutError from None
722 return None
723
TimeoutError:
Can anyone help me figure out what I'm doing wrong? And how can I avoid timeouts for large numbers of API calls with asyncio?
Much appreciated!
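For reference, a common pattern for large batches (only a sketch, not taken from the post above) is to cap the number of requests in flight with an asyncio.Semaphore and give each request its own timeout, so one slow call cannot exhaust the shared budget; the URL and query handling below are placeholders:
import asyncio
import aiohttp

URL = 'https://api.url.com/search?api_key=api_key&api_params=multiple_params'  # placeholder
queries = ['online slots', 'metaverse', 'sports betting']  # placeholder list

async def fetch(session, semaphore, query):
    # The semaphore caps how many requests run concurrently.
    async with semaphore:
        async with session.get(URL, params={'q': query}, ssl=False) as response:
            return await response.json()

async def get_queries(concurrency=20):
    semaphore = asyncio.Semaphore(concurrency)
    # Per-request timeout instead of one total budget for the whole batch.
    timeout = aiohttp.ClientTimeout(total=60)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        tasks = [fetch(session, semaphore, q) for q in queries]
        # return_exceptions=True keeps a single failure from cancelling the rest.
        return await asyncio.gather(*tasks, return_exceptions=True)

results = asyncio.run(get_queries())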
I'm trying to use cryptofeed to download data concurrently.
f = FeedHandler()
f.add_feed(Gateio(channels=[TRADES], symbols=list_tmp, callbacks={ TRADES: TradePostgresGateio(**postgres_cfg)}))
f.run()
The code above runs successfully. However, I am trying to run it in the background, so I am using concurrent.futures to help:
executor = concurrent.futures.ThreadPoolExecutor(16)
job2 = executor.submit(f.run)
However, I got an error:
job2.result()
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-54-f96e35ee3c66> in <module>
----> 1 job2.result()
~/anaconda3/lib/python3.8/concurrent/futures/_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433
434 self._condition.wait(timeout)
~/anaconda3/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
~/anaconda3/lib/python3.8/concurrent/futures/thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
~/anaconda3/lib/python3.8/site-packages/cryptofeed/feedhandler.py in run(self, start_loop, install_signal_handlers, exception_handler)
145 raise ValueError(txt)
146
--> 147 loop = asyncio.get_event_loop()
148 # Good to enable when debugging or without code change: export PYTHONASYNCIODEBUG=1)
149 # loop.set_debug(True)
~/anaconda3/lib/python3.8/asyncio/events.py in get_event_loop(self)
637
638 if self._local._loop is None:
--> 639 raise RuntimeError('There is no current event loop in thread %r.'
640 % threading.current_thread().name)
641
RuntimeError: There is no current event loop in thread 'ThreadPoolExecutor-0_0'.
Could anyone help me? Thanks so much!
Edit: Following the suggestion in the answer below, I tried:
def threadable():
    f = FeedHandler()
    f.add_feed(Gateio(channels=[TRADES], symbols=list_tmp, callbacks={ TRADES: TradePostgresGateio(**postgres_cfg)}))
    f.run()

executor = concurrent.futures.ThreadPoolExecutor(16)
job2 = executor.submit(threadable)
job2.done()
job2.result()
It seems I still get the same error about the event loop... is it solvable?
RuntimeError Traceback (most recent call last)
<ipython-input-47-05c023dd326f> in <module>
11 job2.done()
12
---> 13 job2.result()
~/anaconda3/lib/python3.8/concurrent/futures/_base.py in result(self, timeout)
437 raise CancelledError()
438 elif self._state == FINISHED:
--> 439 return self.__get_result()
440 else:
441 raise TimeoutError()
~/anaconda3/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
~/anaconda3/lib/python3.8/concurrent/futures/thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
<ipython-input-47-05c023dd326f> in threadable()
2 f = FeedHandler()
3 f.add_feed(Gateio(channels=[TRADES], symbols=list_tmp, callbacks={ TRADES: TradePostgresGateio(**postgres_cfg)}))
----> 4 f.run()
5
6
~/anaconda3/lib/python3.8/site-packages/cryptofeed/feedhandler.py in run(self, start_loop, install_signal_handlers, exception_handler)
145 raise ValueError(txt)
146
--> 147 loop = asyncio.get_event_loop()
148 # Good to enable when debugging or without code change: export PYTHONASYNCIODEBUG=1)
149 # loop.set_debug(True)
~/anaconda3/lib/python3.8/asyncio/events.py in get_event_loop(self)
637
638 if self._local._loop is None:
--> 639 raise RuntimeError('There is no current event loop in thread %r.'
640 % threading.current_thread().name)
641
RuntimeError: There is no current event loop in thread 'ThreadPoolExecutor-1_0'.
In the single-threaded version of your code, all three of these statements execute in the same thread in a simple sequential fashion:
f = FeedHandler()
f.add_feed(Gateio(channels=[TRADES], symbols=list_tmp, callbacks={ TRADES: TradePostgresGateio(**postgres_cfg)}))
f.run()
In the multithreaded version, you submit only the last line to the Executor, and therefore it will run in a secondary thread. But these statements, as far as I can tell from the code you provided, still execute in the main thread:
f = FeedHandler()
f.add_feed(Gateio(channels=[TRADES], symbols=list_tmp, callbacks={ TRADES: TradePostgresGateio(**postgres_cfg)}))
How do you know that will work? In general it would depend on the implementation details of Gateio and FeedHandler. You need to be very careful about chopping up a program into pieces to be run in different threads, especially when third-party library calls are involved. So, good luck with that.
You could try this:
def threadable():
f = FeedHandler()
f.add_feed(Gateio(channels=[TRADES], symbols=list_tmp, callbacks={ TRADES: TradePostgresGateio(**postgres_cfg)}))
f.run()
...
executor = concurrent.futures.ThreadPoolExecutor(16)
job2 = executor.submit(threadable)
Then, at least, your entire sequence of steps will execute in the SAME thread.
I would be worried about those callbacks, however. They will now run in the secondary thread, and you need to understand the consequences of that. Do they interact with a user interface program? Your UI may not support multithreading.
The use of the Executor protocol is a bit weird here, since your function doesn't return a value. The Executors are most useful when they are used to aggregate returned values. You may be better off just launching the threads you need using methods in the threading module.
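For example, a rough sketch of that last suggestion (untested): run the whole sequence in a plain thread, create an event loop for that thread so asyncio.get_event_loop() can find one, and skip signal-handler installation, which only works in the main thread (the install_signal_handlers parameter is visible in your traceback):
import asyncio
import threading

def threadable():
    # Worker threads have no event loop by default; create one so that
    # library code calling asyncio.get_event_loop() finds it.
    asyncio.set_event_loop(asyncio.new_event_loop())

    f = FeedHandler()
    f.add_feed(Gateio(channels=[TRADES], symbols=list_tmp,
                      callbacks={TRADES: TradePostgresGateio(**postgres_cfg)}))
    # Signal handlers can only be installed from the main thread.
    f.run(install_signal_handlers=False)

# A plain thread instead of an Executor, since nothing is returned.
worker = threading.Thread(target=threadable, daemon=True)
worker.start()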
The following code takes a username and scrapes their Twitter history from a given date:
import pandas as pd
import twint
import pywren
def scrape_user(username):
    c = twint.Config()
    c.Username = username
    c.Lang = 'es'
    c.Since = '2021-04-28'
    c.Hide_output = True
    c.Pandas = True
    twint.run.Search(c)
    return twint.storage.panda.Tweets_df
When I call the function directly, e.g. scrape_user("DeLaCalleHum"), I get the intended result, i.e. a pandas DataFrame. However, when I use pywren (even on a single username)
pwex = pywren.default_executor()
futures = pwex.map(scrape_user, "DeLaCalleHum")
tweet_list = pywren.get_all_results(futures)
I get this error.
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-31-15f9e00ead75> in <module>
----> 1 wc_list = pywren.get_all_results(futures)
~/macs30123/lib/python3.7/site-packages/pywren/wren.py in get_all_results(fs)
117 """
118 wait(fs, return_when=ALL_COMPLETED)
--> 119 return [f.result() for f in fs]
~/macs30123/lib/python3.7/site-packages/pywren/wren.py in <listcomp>(.0)
117 """
118 wait(fs, return_when=ALL_COMPLETED)
--> 119 return [f.result() for f in fs]
~/macs30123/lib/python3.7/site-packages/pywren/future.py in result(self, timeout, check_only, throw_except, storage_handler)
146 if self._state == JobState.error:
147 if throw_except:
--> 148 raise self._exception
149 else:
150 return None
OSError: [Errno 28] No space left on device
What am I doing wrong? I would appreciate any help.
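As an aside (possibly unrelated to the error above): pwex.map iterates over its second argument, and a bare string iterates character by character, so a single username would normally be wrapped in a list. A hedged sketch:
pwex = pywren.default_executor()
# One task per username; a one-element list keeps map() from
# splitting the string into individual characters.
futures = pwex.map(scrape_user, ["DeLaCalleHum"])
tweet_list = pywren.get_all_results(futures)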
After some time I found the answer. I can automatically parallelize such function calls in PyWren as long as I add the ComprehendFullAccess policy to my pywren_exec_role_1 role in IAM.
I've stumbled upon a 'NoBrokersAvailable: NoBrokersAvailable' error in our Jupyter notebook using this code:
import json
import time
import logging

from kafka import KafkaProducer
from kafka.errors import KafkaError

log = logging.getLogger(__name__)

def on_send_success(record_metadata):
    print(record_metadata.topic)
    print(record_metadata.partition)
    print(record_metadata.offset)

def on_send_error(excp):
    log.error('I am an errback', exc_info=excp)
    # handle exception

producer = KafkaProducer(bootstrap_servers=['localhost:9092'], value_serializer=lambda m: json.dumps(m).encode('utf-8'))

INTERVAL = 10
while True:
    data_points = get_realtime_stock('AAPL')
    data = {'updated_on': data_points['updated_on'], 'ticker': data_points['security']['ticker'], 'last_price': data_points['last_price']}
    message = data_points
    producer.send('data1', value=data).add_callback(on_send_success).add_errback(on_send_error)
    time.sleep(INTERVAL)
Here is the respective error:
---------------------------------------------------------------------------
NoBrokersAvailable Traceback (most recent call last)
<ipython-input-8-cab724428b84> in <module>
11 # handle exception
12
---> 13 producer = KafkaProducer(bootstrap_servers=['localhost:9092'], value_serializer=lambda m: json.dumps(m).encode('utf-8'))
14 INTERVAL =10
15 while True:
~/anaconda3/lib/python3.7/site-packages/kafka/producer/kafka.py in __init__(self, **configs)
379 client = KafkaClient(metrics=self._metrics, metric_group_prefix='producer',
380 wakeup_timeout_ms=self.config['max_block_ms'],
--> 381 **self.config)
382
383 # Get auto-discovered version from client if necessary
~/anaconda3/lib/python3.7/site-packages/kafka/client_async.py in __init__(self, **configs)
237 if self.config['api_version'] is None:
238 check_timeout = self.config['api_version_auto_timeout_ms'] / 1000
--> 239 self.config['api_version'] = self.check_version(timeout=check_timeout)
240
241 def _can_bootstrap(self):
~/anaconda3/lib/python3.7/site-packages/kafka/client_async.py in check_version(self, node_id, timeout, strict)
890 else:
891 self._lock.release()
--> 892 raise Errors.NoBrokersAvailable()
893
894 def wakeup(self):
NoBrokersAvailable: NoBrokersAvailable
The code worked just fine, but out of nowhere it stopped working for whatever reason.
Does anyone know what the problem might be?
I had the same error and solved it by specifying the API version in the KafkaProducer constructor. Here is a sample from my code.
Please specify the version of your kafka-python library if the error persists.
producer = KafkaProducer(
    bootstrap_servers=#####,
    client_id=######,
    value_serializer=JsonSerializer.serialize,
    api_version=(0, 10, 1)
)
For the API version, you should put your Kafka version.
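If you are unsure which client version you have installed, a quick check (the api_version tuple itself should reflect your broker's Kafka version, which has to come from your deployment):
import kafka

# Version of the kafka-python client library installed in this environment.
print(kafka.__version__)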
I'm having trouble with parallel processing on a news scraping script.
I have the following script that reads a Google News RSS page and processes each of the links returned. news_list is a BeautifulSoup element which contains information on the 10 most recent news items on some subject.
def main(news_list):
    news_list = soup_page.findAll("item")
    feed = []
    for article in news_list[:10]:
        new = {}
        new['title'] = article.title.text
        new['source'] = article.source.text
        new['link'] = article.link.text
        new['date'] = datetime.strptime(article.pubDate.text, '%a, %d %b %Y %H:%M:%S %Z')
        new['keywords'] = keywords(article.link.text)
        feed.append(new)
The function keywords processes the news content and returns salient keywords. This function takes about 1.5 seconds per news article, so the full script takes at least 15 seconds to run.
I want to reduce the duration of the script, so I've been trying multiprocessing instead of the for loop, like this:
def process_article(article):
    new = {}
    new['title'] = article.title.text
    new['source'] = article.source.text
    new['link'] = article.link.text
    new['date'] = datetime.strptime(article.pubDate.text, '%a, %d %b %Y %H:%M:%S %Z')
    new['keywords'] = keywords(article.link.text)
    return new

from joblib import Parallel, delayed
import multiprocessing

num_cores = multiprocessing.cpu_count()
feed = Parallel(n_jobs=num_cores)(delayed(process_news)(article) for article in news_list[:10])
However, I'm getting an error as if the function process_article were recursive:
RecursionError: maximum recursion depth exceeded while calling a Python object
What am I doing wrong? It still happens if I write the function as follows, so the keywords function is not the problem:
def process_article(article):
    new = {}
    return new
Any help is appreciated. Thanks!
This is the full traceback:
RecursionError Traceback (most recent call last)
<ipython-input-90-498afb9f1a25> in <module>
1 num_cores = multiprocessing.cpu_count()
2
----> 3 results = Parallel(n_jobs=num_cores)(delayed(process_news)(article) for article in list(news_list[:10]))
/usr/local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
787 # consumption.
788 self._iterating = False
--> 789 self.retrieve()
790 # Make sure that we get a last message telling us we are done
791 elapsed_time = time.time() - self._start_time
/usr/local/lib/python3.6/site-packages/joblib/parallel.py in retrieve(self)
697 try:
698 if getattr(self._backend, 'supports_timeout', False):
--> 699 self._output.extend(job.get(timeout=self.timeout))
700 else:
701 self._output.extend(job.get())
/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
/usr/local/lib/python3.6/site-packages/joblib/pool.py in send(obj)
369 def send(obj):
370 buffer = BytesIO()
--> 371 CustomizablePickler(buffer, self._reducers).dump(obj)
372 self._writer.send_bytes(buffer.getvalue())
373 self._send = send
RecursionError: maximum recursion depth exceeded while calling a Python object
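A plausible reading of that traceback, for reference: joblib has to pickle every argument it sends to a worker process, and deeply nested BeautifulSoup elements are prone to exceeding the recursion limit when pickled. A rough sketch of one way around that, assuming keywords() only needs the article link: extract plain, picklable fields in the parent process and parallelize only the slow keywords() call:
from datetime import datetime
import multiprocessing
from joblib import Parallel, delayed

def extract_fields(article):
    # Runs in the parent process on the BeautifulSoup element and returns
    # only plain strings / datetimes, which pickle cheaply.
    return {
        'title': article.title.text,
        'source': article.source.text,
        'link': article.link.text,
        'date': datetime.strptime(article.pubDate.text, '%a, %d %b %Y %H:%M:%S %Z'),
    }

articles = [extract_fields(a) for a in news_list[:10]]

# Only the slow keywords() call crosses the process boundary, and only
# plain strings are passed to the workers.
num_cores = multiprocessing.cpu_count()
keyword_lists = Parallel(n_jobs=num_cores)(
    delayed(keywords)(a['link']) for a in articles
)
feed = [dict(a, keywords=kw) for a, kw in zip(articles, keyword_lists)]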