The following code takes a username and scrapes their Twitter history from a given date:
import pandas as pd
import twint
import pywren

def scrape_user(username):
    # Configure a twint search for the user's Spanish-language tweets
    # since the given date, with results stored as a pandas DataFrame.
    c = twint.Config()
    c.Username = username
    c.Lang = 'es'
    c.Since = '2021-04-28'
    c.Hide_output = True
    c.Pandas = True
    twint.run.Search(c)
    # twint writes the results to this module-level DataFrame.
    return twint.storage.panda.Tweets_df
When I run the function directly, e.g. scrape_user("DeLaCalleHum"), I get the intended result: a pandas DataFrame. However, when I use pywren (on even a single username)
pwex = pywren.default_executor()
futures = pwex.map(scrape_user, "DeLaCalleHum")
tweet_list = pywren.get_all_results(futures)
I get this error.
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-31-15f9e00ead75> in <module>
----> 1 wc_list = pywren.get_all_results(futures)
~/macs30123/lib/python3.7/site-packages/pywren/wren.py in get_all_results(fs)
117 """
118 wait(fs, return_when=ALL_COMPLETED)
--> 119 return [f.result() for f in fs]
~/macs30123/lib/python3.7/site-packages/pywren/wren.py in <listcomp>(.0)
117 """
118 wait(fs, return_when=ALL_COMPLETED)
--> 119 return [f.result() for f in fs]
~/macs30123/lib/python3.7/site-packages/pywren/future.py in result(self, timeout, check_only, throw_except, storage_handler)
146 if self._state == JobState.error:
147 if throw_except:
--> 148 raise self._exception
149 else:
150 return None
OSError: [Errno 28] No space left on device
What am I doing wrong? I would appreciate any help.
After some time I found the answer: I can automatically parallelize such function calls in PyWren as long as I add the ComprehendFullAccess policy to my pywren_exec_role_1 role in IAM.
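For completeness, here is a minimal sketch of the working parallel call, assuming the policy is attached and scrape_user is defined as above. Note that pwex.map iterates over its second argument, so a bare string is split into one task per character; pass a list of usernames instead.

import pywren

# Sketch: run the scraper in parallel once ComprehendFullAccess is attached
# to pywren_exec_role_1 and scrape_user is defined as above.
pwex = pywren.default_executor()

# Wrap the username(s) in a list; map() iterates over its second argument,
# so a bare string would be split into single characters.
futures = pwex.map(scrape_user, ["DeLaCalleHum"])
tweet_list = pywren.get_all_results(futures)  # one DataFrame per username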
I am trying to download the data for several hundred stocks using pandas_datareader's .get_data_yahoo function. To speed up the process I want to use multithreading with Python's concurrent.futures. Below is a stripped-down version of my code that tries to download the stocks contained in the German DAX.
from pandas_datareader import data as pdr
from pytickersymbols import PyTickerSymbols
import concurrent.futures
import yfinance as yf
import datetime
import os
from time import sleep

yf.pdr_override()

def download_stockdata(ticker):
    # Fetch one year of daily data for a single ticker.
    print(f"Downloading {ticker} \n")
    df = pdr.get_data_yahoo(ticker, datetime.datetime.now() - datetime.timedelta(days=365), datetime.date.today())
    print(f"{ticker} downloaded \n")
    return df

if __name__ == '__main__':
    # Collect the Yahoo symbol of every stock in the index.
    tickers = []
    index_to_scan = "DAX"
    for element in list(PyTickerSymbols().get_stocks_by_index(index_to_scan)):
        if element["symbols"]:
            tickers.append(element.get("symbols")[0].get("yahoo"))

    print(f"Symbols in {index_to_scan}: {tickers} \n")
    print("Starting multi thread download")

    # Download each ticker in a worker thread and collect the futures.
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        for ticker in tickers:
            future = executor.submit(download_stockdata, ticker)
            futures.append(future)
        futures, _ = concurrent.futures.wait(futures)
        for future in futures:
            print(future.result())
When running this code I get the following error:
KeyError Traceback (most recent call last)
<ipython-input-1-2e4c65895072> in <module>
36 futures, _ = concurrent.futures.wait(futures)
37 for future in futures:
---> 38 print(future.result())
~\anaconda3\envs\Trading\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433
434 self._condition.wait(timeout)
~\anaconda3\envs\Trading\lib\concurrent\futures\_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
~\anaconda3\envs\Trading\lib\concurrent\futures\thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
<ipython-input-1-2e4c65895072> in download_stockdata(ticker)
11 def download_stockdata(ticker):
12 print(f"Downloading {ticker} \n")
---> 13 df = pdr.get_data_yahoo(ticker, datetime.datetime.now() - datetime.timedelta(days=365), datetime.date.today())
14 print(f"{ticker} downloaded \n")
15 return df
~\anaconda3\envs\Trading\lib\site-packages\yfinance\multi.py in download(tickers, start, end, actions, threads, group_by, auto_adjust, back_adjust, progress, period, interval, prepost, proxy, rounding, **kwargs)
117
118 if len(tickers) == 1:
--> 119 return shared._DFS[tickers[0]]
120
121 try:
KeyError: 'BMW.F'
I tried different ways of multithreading, like threading.Thread(), the ThreadPool from multiprocessing.pool, and concurrent.futures. All methods result in a KeyError whose key is not always the same but varies from run to run. At this point I have no more ideas how to handle the error. Can someone help me solve the KeyError?
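One thing worth trying (an assumption on my part, not a confirmed fix): the traceback shows yfinance looking results up in the module-level shared._DFS dict, which is shared across threads, so concurrent pdr.get_data_yahoo calls can overwrite each other's entries and raise a KeyError for a ticker that was in fact requested. yfinance's own yf.download accepts a whole list of tickers and does the threading internally:

import datetime
import yfinance as yf

# Sketch: let yfinance parallelize the download itself instead of calling
# pdr.get_data_yahoo from several threads. `tickers` is the list built in
# the code above; threads=True enables yfinance's built-in threading.
data = yf.download(
    tickers,
    start=datetime.datetime.now() - datetime.timedelta(days=365),
    end=datetime.date.today(),
    group_by="ticker",
    threads=True,
)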
I'm trying to authenticate to Atlas with the atlasapi. I'm using my Google account and get the error ErrAtlasUnauthorized: Authentication is required with the method below. Is Google auth supported, or am I doing something wrong?
from atlasapi.atlas import Atlas

auth = Atlas("foo@google.com", "<password>", "<groupId>")

clusters = auth.Clusters.get_all_clusters
print(clusters())
full trace:
ErrAtlasUnauthorized Traceback (most recent call last)
<ipython-input-61-d69a101fdf69> in <module>
1 clusters = auth.Clusters.get_all_clusters
----> 2 print (clusters())
C:\...\atlasapi\atlas.py in get_all_clusters(self, pageNum, itemsPerPage, iterable)
129
130 uri = Settings.api_resources["Clusters"]["Get All Clusters"] % (self.atlas.group, pageNum, itemsPerPage)
--> 131 return self.atlas.network.get(Settings.BASE_URL + uri)
132
133 def get_single_cluster(self, cluster: str) -> dict:
C:\...\atlasapi\network.py in get(self, uri)
144 logger.debug("Auth information = {} {}".format(self.user, self.password))
145
--> 146 return self.answer(r.status_code, r.json())
147
148 except Exception:
C:\...\atlasapi\network.py in answer(self, c, details)
68 raise ErrAtlasBadRequest(c, details)
69 elif c == Settings.UNAUTHORIZED:
---> 70 raise ErrAtlasUnauthorized(c, details)
71 elif c == Settings.FORBIDDEN:
72 raise ErrAtlasForbidden(c, details)
ErrAtlasUnauthorized: Authentication is required
The API access keys are your user/password.
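In other words, construct the client with an Atlas programmatic API key pair rather than Google credentials. A minimal sketch, with placeholder key values:

from atlasapi.atlas import Atlas

# Sketch: the public API key acts as the user and the private key as the
# password; Google SSO credentials won't work here. Values are placeholders.
auth = Atlas("<public_api_key>", "<private_api_key>", "<groupId>")

clusters = auth.Clusters.get_all_clusters  # same call as in the question
print(clusters())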
I was trying to import ecoinvent 3.5 cutoff into a project using Brightway, with the following:
if 'ecoinvent 3.5 cutoff' not in databases:
    ei35cutofflink = r"H:\Data\ecoinvent 3.5_cutoff_lci_ecoSpold02\datasets"
    ei35cutoff = SingleOutputEcospold2Importer(ei35cutofflink, 'ecoinvent 3.5 cutoff')
    ei35cutoff.apply_strategies()
    ei35cutoff.statistics()
    ei35cutoff.write_database()
But I got the following error. It looks like the issue is not related to Brightway itself, but rather to multiprocessing or pickle? I don't understand what the error message means.
---------------------------------------------------------------------------
MaybeEncodingError Traceback (most recent call last)
<ipython-input-4-f9acb2bc0c84> in <module>
1 if 'ecoinvent 3.5 cutoff' not in databases:
2 ei35cutofflink=r"H:\Data\ecoinvent 3.5_cutoff_lci_ecoSpold02\datasets"
----> 3 ei35cutoff=SingleOutputEcospold2Importer(ei35cutofflink, 'ecoinvent 3.5 cutoff')
4 ei35cutoff.apply_strategies()
5 ei35cutoff.statistics()
C:\miniconda3_py37\envs\ab\lib\site-packages\bw2io\importers\ecospold2.py in __init__(self, dirpath, db_name, extractor, use_mp, signal)
63 start = time()
64 try:
---> 65 self.data = extractor.extract(dirpath, db_name, use_mp=use_mp)
66 except RuntimeError as e:
67 raise MultiprocessingError('Multiprocessing error; re-run using `use_mp=False`'
C:\miniconda3_py37\envs\ab\lib\site-packages\bw2io\extractors\ecospold2.py in extract(cls, dirpath, db_name, use_mp)
91 ) for x in filelist
92 ]
---> 93 data = [p.get() for p in results]
94 else:
95 pbar = pyprind.ProgBar(len(filelist), title="Extracting ecospold2 files:", monitor=True)
C:\miniconda3_py37\envs\ab\lib\site-packages\bw2io\extractors\ecospold2.py in <listcomp>(.0)
91 ) for x in filelist
92 ]
---> 93 data = [p.get() for p in results]
94 else:
95 pbar = pyprind.ProgBar(len(filelist), title="Extracting ecospold2 files:", monitor=True)
C:\miniconda3_py37\envs\ab\lib\multiprocessing\pool.py in get(self, timeout)
655 return self._value
656 else:
--> 657 raise self._value
658
659 def _set(self, i, obj):
MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x000001D257C55358>'. Reason: 'TypeError("can't pickle lxml.etree._ListErrorLog objects")'
You can use use_mp=False to get a sense of what the actual error is (instead of the error not being pickle-able and raising a separate error). In this case I think you have a problem with the data folder, which you can solve by deleting it and downloading or extracting it again.
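A minimal sketch of that debugging step, reusing the paths from the question (use_mp is a keyword of SingleOutputEcospold2Importer, as the traceback above shows):

from bw2io import SingleOutputEcospold2Importer

# Re-run the extraction in a single process so the real exception surfaces
# instead of the unpicklable lxml error log.
ei35cutofflink = r"H:\Data\ecoinvent 3.5_cutoff_lci_ecoSpold02\datasets"
ei35cutoff = SingleOutputEcospold2Importer(
    ei35cutofflink, 'ecoinvent 3.5 cutoff', use_mp=False
)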
I am trying to use https://github.com/dowjones/dj-dna-streams-python/tree/master/dnaStreaming, a package for receiving news streams from Dow Jones. When I try to "listen" to the streams, I receive the following error:
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-36-372f4305a9e1> in <module>()
1 while True:
----> 2 listener.listen(callback, maximum_messages=4, subscription_id=subscription_id)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\dj_dna_streaming_python-1.0.10-py3.5.egg\dnaStreaming\listener.py in listen(self, on_message_callback, maximum_messages, subscription_id)
21 def listen(self, on_message_callback, maximum_messages=DEFAULT_UNLIMITED_MESSAGES, subscription_id=None):
22 limit_pull_calls = not (maximum_messages == self.DEFAULT_UNLIMITED_MESSAGES)
---> 23 pubsub_client = pubsub_service.get_client(self.config)
24
25 subscription_id = subscription_id if subscription_id is not None else self.config.subscription()
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\dj_dna_streaming_python-1.0.10-py3.5.egg\dnaStreaming\services\pubsub_service.py in get_client(config)
7
8
----> 9 def get_client(config):
10 streaming_credentials = credentials_service.fetch_credentials(config)
11 credentials = authentication_service.get_authenticated_oauth_credentials(streaming_credentials)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\dj_dna_streaming_python-1.0.10-py3.5.egg\dnaStreaming\services\credentials_service.py in fetch_credentials(config)
11 response = _get_requests().get(config.credentials_uri(), headers=headers)
12
---> 13 streaming_credentials_string = json.loads(response.text)['data']['attributes']['streaming_credentials']
14
15 return json.loads(streaming_credentials_string)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\models.py in text(self)
824
825 self._content_consumed = True
--> 826 # don't need to release the connection; that's been handled by urllib3
827 # since we exhausted the data.
828 return self._content
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\models.py in apparent_encoding(self)
694 is **not** a check to see if the response code is ``200 OK``.
695 """
--> 696 try:
697 self.raise_for_status()
698 except HTTPError:
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\chardet\__init__.py in detect(aBuf)
ImportError: cannot import name 'universaldetector'
I understand that the key part is that I can't import universaldetector. Any idea why that is? I have seen this answer but can't really relate it to my problem. I have upgraded chardet and requests.
I am on Python 3 and Windows, executing the code in a Jupyter Notebook.
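A diagnostic sketch, not a confirmed fix: the traceback resolves chardet through requests' vendored copy (requests.packages.chardet), so it is worth checking which chardet actually gets imported and whether universaldetector loads on its own:

# Check which chardet the environment really imports; a stale egg or
# requests' vendored copy can shadow the upgraded standalone package.
import chardet
print(chardet.__version__, chardet.__file__)

# If this import fails too, the installed chardet itself is broken or
# shadowed; reinstalling it and restarting the kernel is the next step.
from chardet.universaldetector import UniversalDetector
print(UniversalDetector)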
I have a 2000-row data frame and I'm trying to slice it into two pieces and combine them back together.
t1 = test[:10, :]
t2 = test[20:, :]
temp = t1.rbind(t2)
temp.show()
Then I got this error:
---------------------------------------------------------------------------
EnvironmentError Traceback (most recent call last)
<ipython-input-37-8daeb3375743> in <module>()
2 t2 = test[20:, :]
3 temp = t1.rbind(t2)
----> 4 temp.show()
5 print len(temp)
6 print len(test)
/usr/local/lib/python2.7/dist-packages/h2o/frame.pyc in show(self, use_pandas)
383 print("This H2OFrame has been removed.")
384 return
--> 385 if not self._ex._cache.is_valid(): self._frame()._ex._cache.fill()
386 if H2ODisplay._in_ipy():
387 import IPython.display
/usr/local/lib/python2.7/dist-packages/h2o/frame.pyc in _frame(self, fill_cache)
423
424 def _frame(self, fill_cache=False):
--> 425 self._ex._eager_frame()
426 if fill_cache:
427 self._ex._cache.fill()
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in _eager_frame(self)
67 if not self._cache.is_empty(): return self
68 if self._cache._id is not None: return self # Data already computed under ID, but not cached locally
---> 69 return self._eval_driver(True)
70
71 def _eager_scalar(self): # returns a scalar (or a list of scalars)
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in _eval_driver(self, top)
81 def _eval_driver(self, top):
82 exec_str = self._do_it(top)
---> 83 res = ExprNode.rapids(exec_str)
84 if 'scalar' in res:
85 if isinstance(res['scalar'], list): self._cache._data = [float(x) for x in res['scalar']]
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in rapids(expr)
163 The JSON response (as a python dictionary) of the Rapids execution
164 """
--> 165 return H2OConnection.post_json("Rapids", ast=expr,session_id=H2OConnection.session_id(), _rest_version=99)
166
167 class ASTId:
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in post_json(url_suffix, file_upload_info, **kwargs)
515 if __H2OCONN__ is None:
516 raise ValueError("No h2o connection. Did you run `h2o.init()` ?")
--> 517 return __H2OCONN__._rest_json(url_suffix, "POST", file_upload_info, **kwargs)
518
519 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in _rest_json(self, url_suffix, method, file_upload_info, **kwargs)
518
519 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
--> 520 raw_txt = self._do_raw_rest(url_suffix, method, file_upload_info, **kwargs)
521 return self._process_tables(raw_txt.json())
522
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in _do_raw_rest(self, url_suffix, method, file_upload_info, **kwargs)
592 raise EnvironmentError(("h2o-py got an unexpected HTTP status code:\n {} {} (method = {}; url = {}). \n"+ \
593 "detailed error messages: {}")
--> 594 .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))
595
596
EnvironmentError: h2o-py got an unexpected HTTP status code:
500 Server Error (method = POST; url = http://localhost:54321/99/Rapids).
detailed error messages: []
If I count rows (len(temp)), it works fine. Also, if I change the slicing index a little bit, it works fine too. For example, if I change to this, it shows the data frame:
t1 = test[:10, :]
t2 = test[:5, :]
Am I missing something here? Thanks.
It's unclear what happened without more information (the logs would probably say why the rbind did not take).
What version are you using? I tried your code with iris on the bleeding edge and it all worked as expected.
By the way, rbind is typically going to be expensive, especially since what you're semantically after is a subset:
test[range(10) + range(20,test.nrow),:]
should also give you the desired subset (with the caveat that you build the full list of row indices in Python and pass it over REST to H2O).
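Note that range(10) + range(20, test.nrow) relies on Python 2's list-returning range, which matches the python2.7 paths in your traceback; on Python 3 the equivalent would be:

# Python 3 equivalent: range() no longer returns a list, so build the
# row-index lists explicitly before concatenating them.
temp = test[list(range(10)) + list(range(20, test.nrow)), :]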