I'm having trouble with parallel processing in a news scraping script.
I have the following script that reads a Google News RSS page and processes each of the links returned. news_list is a BeautifulSoup element which contains the 10 most recent news items on some subject.
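For context, soup_page might be built roughly like this (a hypothetical sketch; the fetching code isn't shown in the question, and the URL is only an example):

import requests
from bs4 import BeautifulSoup

# Hypothetical setup (not part of the question): fetch a Google News RSS
# feed and parse it as XML so that findAll("item") returns <item> elements.
rss_url = "https://news.google.com/rss/search?q=example"
soup_page = BeautifulSoup(requests.get(rss_url).text, "xml")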
from datetime import datetime

def main(news_list):
    news_list = soup_page.findAll("item")
    feed = []
    for article in news_list[:10]:
        new = {}
        new['title'] = article.title.text
        new['source'] = article.source.text
        new['link'] = article.link.text
        new['date'] = datetime.strptime(article.pubDate.text, '%a, %d %b %Y %H:%M:%S %Z')
        new['keywords'] = keywords(article.link.text)
        feed.append(new)
The function keywords processes the news content and returns salient keywords. It takes about 1.5 seconds per news article, so the full script takes at least 15 seconds to run.
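keywords itself isn't included in the question; a purely hypothetical stand-in with the same shape would be:

def keywords(url):
    # Hypothetical stand-in: download the article at `url` and extract its
    # salient keywords; the real function takes about 1.5 s per article.
    return ['keyword1', 'keyword2']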
I want to reduce the duration of the script, so I've been trying multiprocessing instead of the for loop, like this:
def process_article(article):
    new = {}
    new['title'] = article.title.text
    new['source'] = article.source.text
    new['link'] = article.link.text
    new['date'] = datetime.strptime(article.pubDate.text, '%a, %d %b %Y %H:%M:%S %Z')
    new['keywords'] = keywords(article.link.text)
    return new
from joblib import Parallel, delayed
import multiprocessing

num_cores = multiprocessing.cpu_count()
feed = Parallel(n_jobs=num_cores)(delayed(process_article)(article) for article in news_list[:10])
However, I'm getting an error as if the function process_article were recursive:
RecursionError: maximum recursion depth exceeded while calling a Python object
What am I doing wrong? It still happens if I write the function as follows, so the keywords function is not the problem:
def process_article(article):
    new = {}
    return new
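One way to narrow this down further (an untested sketch) is to check whether the article objects can be pickled at all, since joblib pickles every argument it ships to a worker process:

import pickle

# If BeautifulSoup's deeply nested element tree is the culprit, pickling a
# single <item> should raise the same RecursionError outside of joblib.
pickle.dumps(news_list[0])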
Any help is appreciated. Thanks!
This is the full traceback:
RecursionError Traceback (most recent call last)
<ipython-input-90-498afb9f1a25> in <module>
1 num_cores = multiprocessing.cpu_count()
2
----> 3 results = Parallel(n_jobs=num_cores)(delayed(process_news)(article) for article in list(news_list[:10]))
/usr/local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
787 # consumption.
788 self._iterating = False
--> 789 self.retrieve()
790 # Make sure that we get a last message telling us we are done
791 elapsed_time = time.time() - self._start_time
/usr/local/lib/python3.6/site-packages/joblib/parallel.py in retrieve(self)
697 try:
698 if getattr(self._backend, 'supports_timeout', False):
--> 699 self._output.extend(job.get(timeout=self.timeout))
700 else:
701 self._output.extend(job.get())
/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
/usr/local/lib/python3.6/site-packages/joblib/pool.py in send(obj)
369 def send(obj):
370 buffer = BytesIO()
--> 371 CustomizablePickler(buffer, self._reducers).dump(obj)
372 self._writer.send_bytes(buffer.getvalue())
373 self._send = send
RecursionError: maximum recursion depth exceeded while calling a Python object
I'm pulling all the emails from some 8 different postboxes daily via exchangelib. It worked all week, but now the code seems to be throttled by the Exchange server: the error below gets thrown while it's trying to grab the first email. I want to learn how to handle this so I don't get throttled anymore. I already implemented one retry policy:
from exchangelib import Credentials, Configuration, FaultTolerance

credentials = Credentials(username='username', password='password')
config = Configuration(retry_policy=FaultTolerance(max_wait=600), credentials=credentials)
For that I'm using the following code:
import time
from tqdm import tqdm
from exchangelib import Account
from exchangelib.folders import Messages
from exchangelib.errors import RateLimitError

while True:
    try:
        for shared_postbox in tqdm(shared_postboxes):
            account = Account(shared_postbox, credentials=credentials, config=config, autodiscover=True)
            top_folder = account.root
            email_folders = [f for f in top_folder.walk() if isinstance(f, Messages)]
            for folder in tqdm(email_folders):
                # added item_class in filter and removed order_by
                # for m in folder.all().only('text_body', 'datetime_received', "sender").filter(datetime_received__range=(start_of_month, end_of_month), sender__exists=True):
                # when a "since" filter is needed
                for m in folder.all().only('text_body', 'datetime_received', "sender").filter(datetime_received__gt=midnight, sender__exists=True):
                    try:
                        senderdomain = ExtractingDomain(m.sender.email_address)
                        senderdomains.append(senderdomain)
                    except Exception:
                        print("could not extract domain")
                    else:
                        if senderdomain in domains_of_interest:
                            postboxname = account.identity.primary_smtp_address
                            body = m.text_body
                            emails.append(body)
                            sender.append(senderdomain)
                            postbox.append(postboxname)
                            received.append(m.datetime_received)
                        # else:
                        #     print("not in domains of interest")
            account.protocol.close()
    except RateLimitError as e:
        time.sleep(60)
This is the error I get:
RateLimitError Traceback (most recent call last)
Input In [4], in <cell line: 77>()
81 account = Account(shared_postbox, credentials=credentials, config = config, autodiscover=True)
---> 82 top_folder = account.root
83 email_folders = [f for f in top_folder.walk() if isinstance(f, Messages)]
File ~\.conda\envs\python383\lib\site-packages\cached_property.py:74, in threaded_cached_property.__get__(self, obj, cls)
72 except KeyError:
73 # if not, do the calculation and release the lock
---> 74 return obj_dict.setdefault(name, self.func(obj))
File ~\.conda\envs\python383\lib\site-packages\exchangelib\account.py:349, in Account.root(self)
347 @threaded_cached_property
348 def root(self):
--> 349 return Root.get_distinguished(account=self)
File ~\.conda\envs\python383\lib\site-packages\exchangelib\folders\roots.py:114, in RootOfHierarchy.get_distinguished(cls, account)
113 try:
--> 114 return cls.resolve(
115 account=account, folder=cls(account=account, name=cls.DISTINGUISHED_FOLDER_ID, is_distinguished=True)
116 )
117 except MISSING_FOLDER_ERRORS:
File ~\.conda\envs\python383\lib\site-packages\exchangelib\folders\base.py:512, in BaseFolder.resolve(cls, account, folder)
509 @classmethod
510 def resolve(cls, account, folder):
511 # Resolve a single folder
--> 512 folders = list(FolderCollection(account=account, folders=[folder]).resolve())
513 if not folders:
File ~\.conda\envs\python383\lib\site-packages\exchangelib\folders\collections.py:335, in FolderCollection.resolve(self)
334 additional_fields = self.get_folder_fields(target_cls=self._get_target_cls())
--> 335 yield from self.__class__(account=self.account, folders=resolveable_folders).get_folders(
336 additional_fields=additional_fields
337 )
File ~\.conda\envs\python383\lib\site-packages\exchangelib\folders\collections.py:403, in FolderCollection.get_folders(self, additional_fields)
399 additional_fields.update(
400 (FieldPath(field=BaseFolder.get_field_by_fieldname(f)) for f in self.REQUIRED_FOLDER_FIELDS)
401 )
--> 403 yield from GetFolder(account=self.account).call(
404 folders=self.folders,
405 additional_fields=additional_fields,
406 shape=ID_ONLY,
407 )
File ~\.conda\envs\python383\lib\site-packages\exchangelib\services\get_folder.py:43, in GetFolder._elems_to_objs(self, elems)
42 def _elems_to_objs(self, elems):
---> 43 for folder, elem in zip(self.folders, elems):
44 if isinstance(elem, Exception):
File ~\.conda\envs\python383\lib\site-packages\exchangelib\services\common.py:246, in EWSService._chunked_get_elements(self, payload_func, items, **kwargs)
245 log.debug("Processing chunk %s containing %s items", i, len(chunk))
--> 246 yield from self._get_elements(payload=payload_func(chunk, **kwargs))
File ~\.conda\envs\python383\lib\site-packages\exchangelib\services\common.py:266, in EWSService._get_elements(self, payload)
263 try:
264 # Create a generator over the response elements so exceptions in response elements are also raised
265 # here and can be handled.
--> 266 yield from self._response_generator(payload=payload)
267 return
File ~\.conda\envs\python383\lib\site-packages\exchangelib\services\common.py:228, in EWSService._response_generator(self, payload)
223 """Send the payload to the server, and return the response.
224
225 :param payload: payload as an XML object
226 :return: the response, as XML objects
227 """
--> 228 response = self._get_response_xml(payload=payload)
229 if self.supports_paging:
File ~\.conda\envs\python383\lib\site-packages\exchangelib\services\common.py:343, in EWSService._get_response_xml(self, payload, **parse_opts)
342 log.debug("Trying API version %s", api_version)
--> 343 r = self._get_response(payload=payload, api_version=api_version)
344 if self.streaming:
345 # Let 'requests' decode raw data automatically
File ~\.conda\envs\python383\lib\site-packages\exchangelib\services\common.py:298, in EWSService._get_response(self, payload, api_version)
297 session = self.protocol.get_session()
--> 298 r, session = post_ratelimited(
299 protocol=self.protocol,
300 session=session,
301 url=self.protocol.service_endpoint,
302 headers=self._extra_headers(session),
303 data=wrap(
304 content=payload,
305 api_version=api_version,
306 account_to_impersonate=self._account_to_impersonate,
307 timezone=self._timezone,
308 ),
309 stream=self.streaming,
310 timeout=self.timeout or self.protocol.TIMEOUT,
311 )
312 self._handle_response_cookies(session)
File ~\.conda\envs\python383\lib\site-packages\exchangelib\util.py:880, in post_ratelimited(protocol, session, url, headers, data, allow_redirects, stream, timeout)
879 total_wait = time.monotonic() - t_start
--> 880 if protocol.retry_policy.may_retry_on_error(response=r, wait=total_wait):
881 r.close() # Release memory
File ~\.conda\envs\python383\lib\site-packages\exchangelib\protocol.py:780, in FaultTolerance.may_retry_on_error(self, response, wait)
778 if wait > self.max_wait:
779 # We lost patience. Session is cleaned up in outer loop
--> 780 raise RateLimitError(
781 "Max timeout reached", url=response.url, status_code=response.status_code, total_wait=wait
782 )
783 if response.status_code == 401:
784 # EWS sometimes throws 401's when it wants us to throttle connections. OK to retry.
RateLimitError: Max timeout reached (gave up after 634.031 seconds. URL https://outlook.office365.com/EWS/Exchange.asmx returned status code 401)
When I looked into it, I saw that exchangelib has a function to handle the throttling policy, but I don't know how to use it. Could the function
def post_ratelimited(protocol, session, url, headers, data, stream=False, timeout=None)
help me in this case? I found it in their documentation.
You defined a policy that tells exchangelib to retry for up to 600 seconds. The code threw an exception after waiting for more than 600 seconds. That's how it's supposed to work.
If you want the code to retry for a longer period, increase the max_wait value.
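For example, to allow up to an hour of retries (the same FaultTolerance policy as in your code, only with a larger max_wait):

from exchangelib import Credentials, Configuration, FaultTolerance

credentials = Credentials(username='username', password='password')
# Retry throttled requests for up to 3600 seconds before raising RateLimitError.
config = Configuration(retry_policy=FaultTolerance(max_wait=3600), credentials=credentials)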
A guide to EWS throttling and how to handle it is here: https://learn.microsoft.com/en-us/exchange/client-developer/exchange-web-services/ews-throttling-in-exchange
I'm having issues when using polyglot... is it a bug in polyglot? Below is the code:
import polyglot
from polyglot.downloader import downloader
print(downloader.supported_languages_table("ner2", 3))
The issue persists with the following code as well:
import polyglot
from polyglot.text import Text, Word
blob = """The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the
entire world"."""
text = Text(blob)
text.entities
The output is an error:
IndexError: list index out of range
Below is the traceback I received for the first block of code; I got essentially the same traceback for the second block too. Any help is appreciated.
Thanks :)
IndexError Traceback (most recent call last)
<ipython-input-5-3795737b065a> in <module>
1 from polyglot.downloader import downloader
----> 2 print(downloader.supported_languages_table("ner2", 3))
~\anaconda3\lib\site-packages\polyglot\downloader.py in supported_languages_table(self, task, cols)
976
977 def supported_languages_table(self, task, cols=3):
--> 978 languages = self.supported_languages(task)
979 return pretty_list(languages)
980
~\anaconda3\lib\site-packages\polyglot\downloader.py in supported_languages(self, task)
968 """
969 if task:
--> 970 collection = self.get_collection(task=task)
971 return [isoLangs[x.id.split('.')[1]]["name"]
972 for x in collection.packages]
~\anaconda3\lib\site-packages\polyglot\downloader.py in get_collection(self, lang, task)
944 else: raise ValueError("You should pass either the task or the lang")
945 try:
--> 946 return self.info(id)
947 except ValueError as e:
948 if lang: raise LanguageNotSupported("Language {} is not supported".format(id))
~\anaconda3\lib\site-packages\polyglot\downloader.py in info(self, id)
927 if id in self._packages: return self._packages[id]
928 if id in self._collections: return self._collections[id]
--> 929 self._update_index() # If package is not found, most probably we did not
930 # warm up the cache
931 if id in self._packages: return self._packages[id]
~\anaconda3\lib\site-packages\polyglot\downloader.py in _update_index(self, url)
841 packages = []
842 for p in objs:
--> 843 P = Package.fromcsobj(p)
844 packages.append(P)
845 self._packages = dict((p.id, p) for p in packages)
~\anaconda3\lib\site-packages\polyglot\downloader.py in fromcsobj(csobj)
214 filename = attrs["name"]
215 task = subdir.split(path.sep)[0]
--> 216 language = subdir.split(path.sep)[1]
217 attrs = attrs
218 return Package(**locals())
IndexError: list index out of range
I found a solution: replace the four occurrences of path.sep with "/" (on lines 208, 210, 215 and 216) in the file "C:\Python36\Lib\site-packages\polyglot\downloader.py" (the "fromcsobj" function, starting at line 205).
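Concretely, the patched lines in fromcsobj look like this (the index entries use "/" as a separator regardless of OS, which is why splitting on the Windows path.sep yields a single-element list and [1] raises IndexError):

# polyglot/downloader.py, fromcsobj(): split on "/" instead of path.sep,
# which is "\" on Windows and never matches the index's subdir values.
task = subdir.split("/")[0]
language = subdir.split("/")[1]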
I am trying to download data for several hundred stocks using pandas_datareader's .get_data_yahoo function. To speed up the process I want to use multithreading with Python's concurrent.futures. Below is a stripped-down version of my code, which tries to download the stocks contained in the German DAX.
from pandas_datareader import data as pdr
from pytickersymbols import PyTickerSymbols
import concurrent.futures
import yfinance as yf
import datetime
import os
from time import sleep

yf.pdr_override()

def download_stockdata(ticker):
    print(f"Downloading {ticker} \n")
    df = pdr.get_data_yahoo(ticker, datetime.datetime.now() - datetime.timedelta(days=365), datetime.date.today())
    print(f"{ticker} downloaded \n")
    return df

if __name__ == '__main__':
    tickers = []
    index_to_scan = "DAX"
    for element in list(PyTickerSymbols().get_stocks_by_index(index_to_scan)):
        if element["symbols"]:
            tickers.append(element.get("symbols")[0].get("yahoo"))
    print(f"Symbols in {index_to_scan}: {tickers} \n")
    print("Starting multi thread download")
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        for ticker in tickers:
            future = executor.submit(download_stockdata, ticker)
            futures.append(future)
        futures, _ = concurrent.futures.wait(futures)
        for future in futures:
            print(future.result())
When running this code I get the following error:
KeyError Traceback (most recent call last)
<ipython-input-1-2e4c65895072> in <module>
36 futures, _ = concurrent.futures.wait(futures)
37 for future in futures:
---> 38 print(future.result())
~\anaconda3\envs\Trading\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433
434 self._condition.wait(timeout)
~\anaconda3\envs\Trading\lib\concurrent\futures\_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
~\anaconda3\envs\Trading\lib\concurrent\futures\thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
<ipython-input-1-2e4c65895072> in download_stockdata(ticker)
11 def download_stockdata(ticker):
12 print(f"Downloading {ticker} \n")
---> 13 df = pdr.get_data_yahoo(ticker, datetime.datetime.now() - datetime.timedelta(days=365), datetime.date.today())
14 print(f"{ticker} downloaded \n")
15 return df
~\anaconda3\envs\Trading\lib\site-packages\yfinance\multi.py in download(tickers, start, end, actions, threads, group_by, auto_adjust, back_adjust, progress, period, interval, prepost, proxy, rounding, **kwargs)
117
118 if len(tickers) == 1:
--> 119 return shared._DFS[tickers[0]]
120
121 try:
KeyError: 'BMW.F'
I tried different ways of multithreading, like threading.Thread(), the ThreadPool from multiprocessing.pool, and concurrent.futures. All methods result in a KeyError whose key varies from run to run. At this point I have no more ideas how to handle the error. Can someone help me solve this KeyError?
I tried to run multiprocessing with a large dataset.
When I run the script below with a plain for loop, the total run time is 1.5 seconds.
def get_vars(accessCode, user_profile, wt, meals, instance_method='get_wt_adherence'):
    '''
    Examples
    --------
    >> n_cpus = multiprocessing.cpu_count()
    >> get_wt_adherence = partial(get_vars, user_profile, wt, meals,
                                  instance_method='get_wt_adherence')
    >> pool = multiprocessing.Pool(n_cpus - 5)
    >> result = pool.map(get_wt_adherence, accessCodes)
    >> concated_result = pd.concat(result)

    Version
    -------
    # 2020.03.26 Updated
    : Class name edited. 'NOOM' -> 'DATA_GEN'
    '''
    COL_WEEK = ['{}week'.format(i) for i in range(1, 17)]
    data_gen = DATA_GEN(accessCode, user_profile, wt, meals)
    if instance_method == 'get_wt_adherence':
        func = data_gen.get_wt_adherence
    elif instance_method == 'get_meal_adherence':
        func = data_gen.get_meal_adherence
    elif instance_method == 'get_color_food':
        func = data_gen.get_color_food
    elif instance_method == 'get_daily_cal':
        func = data_gen.get_daily_cal
    row = pd.DataFrame([func(weeks) for weeks in range(1, 17)]).T
    row.columns = COL_WEEK
    row['accessCode'] = accessCode
    return row
import pandas as pd
from noom.handler import DATA_GEN
from functools import partial
import multiprocessing

# start_time = time.time()
get_wt = partial(get_vars, user_profile=user_profile, wt=wt_logs, meals=meals, instance_method='get_wt_adherence')

for i in range(10):
    get_wt(accessCodes[i])
However, when I try to run this script using multiprocessing, it does not respond, even though accessCodes is a list with only 100 elements. I suspect the get_wt function built with partial:
n_cpus = multiprocessing.cpu_count()
pool = multiprocessing.Pool(n_cpus-15)
result_wt = pool.map(get_wt, accessCodes) ; print('wt adherence finished')
pool.close()
How can I solve this problem? The error is below:
---------------------------------------------------------------------------
error Traceback (most recent call last)
<ipython-input-22-73ddf2e21bbd> in <module>
2 n_cpus = multiprocessing.cpu_count()
3 pool = multiprocessing.Pool(n_cpus-15)
----> 4 result_wt = pool.map(get_wt_adherence, accessCodes[1:10]) ; print('wt adherence finished')
5 pool.close()
6 time.time() - start_time
/usr/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
/usr/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
/usr/lib/python3.6/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
/usr/lib/python3.6/multiprocessing/connection.py in send(self, obj)
204 self._check_closed()
205 self._check_writable()
--> 206 self._send_bytes(_ForkingPickler.dumps(obj))
207
208 def recv_bytes(self, maxlength=None):
/usr/lib/python3.6/multiprocessing/connection.py in _send_bytes(self, buf)
391 n = len(buf)
392 # For wire compatibility with 3.2 and lower
--> 393 header = struct.pack("!i", n)
394 if n > 16384:
395 # The payload is large so Nagle's algorithm won't be triggered
error: 'i' format requires -2147483648 <= number <= 2147483647
I have a 2000-row data frame and I'm trying to slice it into two pieces and combine them back together.
t1 = test[:10, :]
t2 = test[20:, :]
temp = t1.rbind(t2)
temp.show()
Then I got this error:
---------------------------------------------------------------------------
EnvironmentError Traceback (most recent call last)
<ipython-input-37-8daeb3375743> in <module>()
2 t2 = test[20:, :]
3 temp = t1.rbind(t2)
----> 4 temp.show()
5 print len(temp)
6 print len(test)
/usr/local/lib/python2.7/dist-packages/h2o/frame.pyc in show(self, use_pandas)
383 print("This H2OFrame has been removed.")
384 return
--> 385 if not self._ex._cache.is_valid(): self._frame()._ex._cache.fill()
386 if H2ODisplay._in_ipy():
387 import IPython.display
/usr/local/lib/python2.7/dist-packages/h2o/frame.pyc in _frame(self, fill_cache)
423
424 def _frame(self, fill_cache=False):
--> 425 self._ex._eager_frame()
426 if fill_cache:
427 self._ex._cache.fill()
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in _eager_frame(self)
67 if not self._cache.is_empty(): return self
68 if self._cache._id is not None: return self # Data already computed under ID, but not cached locally
---> 69 return self._eval_driver(True)
70
71 def _eager_scalar(self): # returns a scalar (or a list of scalars)
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in _eval_driver(self, top)
81 def _eval_driver(self, top):
82 exec_str = self._do_it(top)
---> 83 res = ExprNode.rapids(exec_str)
84 if 'scalar' in res:
85 if isinstance(res['scalar'], list): self._cache._data = [float(x) for x in res['scalar']]
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in rapids(expr)
163 The JSON response (as a python dictionary) of the Rapids execution
164 """
--> 165 return H2OConnection.post_json("Rapids", ast=expr,session_id=H2OConnection.session_id(), _rest_version=99)
166
167 class ASTId:
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in post_json(url_suffix, file_upload_info, **kwargs)
515 if __H2OCONN__ is None:
516 raise ValueError("No h2o connection. Did you run `h2o.init()` ?")
--> 517 return __H2OCONN__._rest_json(url_suffix, "POST", file_upload_info, **kwargs)
518
519 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in _rest_json(self, url_suffix, method, file_upload_info, **kwargs)
518
519 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
--> 520 raw_txt = self._do_raw_rest(url_suffix, method, file_upload_info, **kwargs)
521 return self._process_tables(raw_txt.json())
522
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in _do_raw_rest(self, url_suffix, method, file_upload_info, **kwargs)
592 raise EnvironmentError(("h2o-py got an unexpected HTTP status code:\n {} {} (method = {}; url = {}). \n"+ \
593 "detailed error messages: {}")
--> 594 .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))
595
596
EnvironmentError: h2o-py got an unexpected HTTP status code:
500 Server Error (method = POST; url = http://localhost:54321/99/Rapids).
detailed error messages: []
If I count rows (len(temp)), it works fine. Also, if I change the slicing indices a little, it works fine too. For example, if I change it to this, it shows the data frame:
t1 = test[:10, :]
t2 = test[:5, :]
Am I missing something here? Thanks.
It's unclear what happened without more information (the logs would probably say why the rbind did not take).
What version are you using? I tried your code with iris on the bleeding edge and it all worked as expected.
By the way, rbind is typically going to be expensive, especially since what you're semantically after is a subset:
test[range(10) + range(20,test.nrow),:]
should also give you the desired subset (with the caveat that you build the full list of row indices in Python and pass it over REST to H2O).
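Note that range(10) + range(20, test.nrow) relies on Python 2, where range returns a list you can concatenate. On Python 3 the equivalent would be:

# Build the explicit list of row indices before handing it to h2o.
test[list(range(10)) + list(range(20, test.nrow)), :]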