When using pd.Timestamp() I get this:
Libraries imported
'''
%matplotlib inline
import zipline
from zipline import run_algorithm
from zipline.api import order_target_percent, symbol, set_commission, set_slippage, schedule_function, date_rules, time_rules
from datetime import date, datetime
import pytz
import matplotlib.pyplot as plt
import pyfolio as pf
import pandas as pd
import numpy as np
from scipy import stats
from zipline.finance.commission import PerDollar
from zipline.finance.slippage import VolumeShareSlippage, FixedSlippage
'''
Model Settings
'''
intial_portfolio = 100000
momentum_window = 125
minimum_momentum = 40
portfolio_size = 30
vola_window = 20
enable_commission = True
commission_pct = 0.001
enable_slippage = True
slippage_volume_limit = 0.025
slippage_impact = 0.05
'''
Function to calculate the annualized regression slope, multiplied by R-squared
'''
def momentum_score(ts):
    x = np.arange(len(ts))
    log_ts = np.log(ts)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, log_ts)
    annualized_slope = (np.power(np.exp(slope), 252) - 1) * 100
    score = annualized_slope * (r_value ** 2)
    return score
'''
Function using standard deviation as a measure of volatility
'''
def volatility(ts):
    return ts.pct_change().rolling(vola_window).std().iloc[-1]

def output_progress(context):
    today = zipline.api.get_datetime().date()
    perf_pct = (context.portfolio.portfolio_value / context.last_month) - 1
    print("{} - Last Month Result: {:.2%}".format(today, perf_pct))
    context.last_month = context.portfolio.portfolio_value
'''
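For intuition, here is a minimal, self-contained sketch of what momentum_score computes; the price series below is synthetic and made up purely for illustration:
'''
import numpy as np
import pandas as pd
from scipy import stats

# Hypothetical price series: 125 days of small positive drift plus noise.
rng = np.random.default_rng(0)
prices = pd.Series(100 * np.exp(np.cumsum(0.0008 + 0.01 * rng.standard_normal(125))))

x = np.arange(len(prices))
slope, intercept, r_value, p_value, std_err = stats.linregress(x, np.log(prices))

annualized_slope = (np.exp(slope) ** 252 - 1) * 100  # percent per year implied by the fit
score = annualized_slope * r_value ** 2              # penalize noisy (low R-squared) trends
print(round(score, 1))
'''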
Reading the index membership file from disk, scheduling trades, setting commission and slippage
'''
def initialize(context):
    if enable_commission:
        comm_model = PerDollar(cost=commission_pct)
    else:
        comm_model = PerDollar(cost=0.0)
    set_commission(comm_model)

    if enable_slippage:
        slippage_model = VolumeShareSlippage(volume_limit=slippage_volume_limit,
                                             price_impact=slippage_impact)
    else:
        slippage_model = FixedSlippage(spread=0.0)
    set_slippage(slippage_model)

    context.last_month = intial_portfolio
    context.index_members = pd.read_csv(
        '/content/drive/MyDrive/Colab Notebooks/My Project Folder/index members/sp500.csv',
        index_col=0, parse_dates=[0], engine='python', error_bad_lines=False)

    schedule_function(
        func=rebalance,
        date_rule=date_rules.month_start(),
        time_rule=time_rules.market_open()
    )
'''
The rebalancing function
'''
def rebalance(context, data):
    output_progress(context)
    today = zipline.api.get_datetime()

    # Get the index makeup for all days prior to today, take the latest entry,
    # and build today's universe from its comma-separated ticker list.
    all_prior = context.index_members.loc[context.index_members.index < today]
    latest_day = all_prior.iloc[-1, 0]
    list_of_tickers = latest_day.split(',')
    todays_universe = [symbol(ticker) for ticker in list_of_tickers]

    hist = data.history(todays_universe, "close", momentum_window, "1d")
    ranking_table = hist.apply(momentum_score).sort_values(ascending=False)

    # Sell anything that left the index or fell below the momentum threshold.
    kept_positions = list(context.portfolio.positions.keys())
    for security in context.portfolio.positions:
        if security not in todays_universe:
            order_target_percent(security, 0.0)
            kept_positions.remove(security)
        elif ranking_table[security] < minimum_momentum:
            order_target_percent(security, 0.0)
            kept_positions.remove(security)

    # Fill the remaining slots with the highest-ranked names not already held.
    replacement_stocks = portfolio_size - len(kept_positions)
    buy_list = ranking_table.loc[
        ~ranking_table.index.isin(kept_positions)][:replacement_stocks]
    new_portfolio = pd.concat(
        (buy_list,
         ranking_table.loc[ranking_table.index.isin(kept_positions)])
    )

    # Inverse-volatility position sizing.
    vola_table = hist[new_portfolio.index].apply(volatility)
    inv_vola_table = 1 / vola_table
    sum_inv_vola = np.sum(inv_vola_table)
    vola_target_weights = inv_vola_table / sum_inv_vola

    for security, rank in new_portfolio.iteritems():
        weight = vola_target_weights[security]
        if security in kept_positions:
            order_target_percent(security, weight)
        else:
            if ranking_table[security] > minimum_momentum:
                order_target_percent(security, weight)
'''
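As a quick illustration of the inverse-volatility sizing used above (the tickers and volatility numbers are hypothetical):
'''
import pandas as pd

# Hypothetical 20-day volatilities for three names.
vola_table = pd.Series({'AAA': 0.01, 'BBB': 0.02, 'CCC': 0.04})
inv_vola_table = 1 / vola_table
vola_target_weights = inv_vola_table / inv_vola_table.sum()
print(vola_target_weights.round(3))  # AAA 0.571, BBB 0.286, CCC 0.143
'''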
Function to calculate performance metrics of the model
'''
def analyze(context, perf):
    perf['max'] = perf.portfolio_value.cummax()
    perf['dd'] = (perf.portfolio_value / perf['max']) - 1
    maxdd = perf['dd'].min()
    ann_ret = (np.power((perf.portfolio_value.iloc[-1] / perf.portfolio_value.iloc[0]),
                        (252 / len(perf)))) - 1
    print("Annualized Return: {:.2%} Max Drawdown: {:.2%}".format(ann_ret, maxdd))
    return
'''
Setting start and end dates of the simulation
'''
start_date = pd.Timestamp('1996-1-2', tz='utc')
end_date = pd.Timestamp('2018-12-31', tz='utc')
'''
Running the simulation
'''
results = run_algorithm(
    start=start_date,
    end=end_date,
    initialize=initialize,
    analyze=analyze,
    capital_base=intial_portfolio,
    data_frequency='daily',
    bundle='eod_data')
'''
I get these error messages:
TypeError: Cannot compare tz-naive and tz-aware datetime-like objects.
'''
TypeError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/arrays/datetimelike.py in _validate_comparison_value(self, other)
539 try:
--> 540 self._check_compatible_with(other)
541 except (TypeError, IncompatibleFrequency) as err:
17 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/arrays/datetimes.py in _check_compatible_with(self, other, setitem)
501 return
--> 502 self._assert_tzawareness_compat(other)
503 if setitem:
/usr/local/lib/python3.7/dist-packages/pandas/core/arrays/datetimes.py in _assert_tzawareness_compat(self, other)
691 raise TypeError(
--> 692 "Cannot compare tz-naive and tz-aware datetime-like objects."
693 )
TypeError: Cannot compare tz-naive and tz-aware datetime-like objects.
The above exception was the direct cause of the following exception:
'''
InvalidComparison: 1997-01-02 21:00:00+00:00
'''
InvalidComparison Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/arrays/datetimelike.py in _cmp_method(self, other, op)
1007 try:
-> 1008 other = self._validate_comparison_value(other)
1009 except InvalidComparison:
/usr/local/lib/python3.7/dist-packages/pandas/core/arrays/datetimelike.py in _validate_comparison_value(self, other)
542 # e.g. tzawareness mismatch
--> 543 raise InvalidComparison(other) from err
544
InvalidComparison: 1996-01-02 21:00:00+00:00
During handling of the above exception, another exception occurred:
'''
TypeError: Invalid comparison between dtype=datetime64[ns] and Timestamp
'''
TypeError Traceback (most recent call last)
<ipython-input-2-efa4e42dcada> in <module>()
192 capital_base=intial_portfolio,
193 data_frequency = 'daily',
--> 194 bundle='eod_data')
/usr/local/lib/python3.7/dist-packages/zipline/utils/run_algo.py in run_algorithm(start, end, initialize, capital_base, handle_data, before_trading_start, analyze, data_frequency, bundle, bundle_timestamp, trading_calendar, metrics_set, benchmark_returns, default_extension, extensions, strict_extensions, environ, custom_loader, blotter)
417 blotter=blotter,
418 custom_loader=custom_loader,
--> 419 benchmark_spec=benchmark_spec,
420 )
421
/usr/local/lib/python3.7/dist-packages/zipline/utils/run_algo.py in _run(handle_data, initialize, before_trading_start, analyze, algofile, algotext, defines, data_frequency, capital_base, bundle, bundle_timestamp, start, end, output, trading_calendar, print_algo, metrics_set, local_namespace, environ, blotter, custom_loader, benchmark_spec)
223 else {
224 "algo_filename": getattr(algofile, "name", "<algorithm>"),
--> 225 "script": algotext,
226 },
227 ).run()
/usr/local/lib/python3.7/dist-packages/zipline/algorithm.py in run(self, data_portal)
621 try:
622 perfs = []
--> 623 for perf in self.get_generator():
624 perfs.append(perf)
625
/usr/local/lib/python3.7/dist-packages/zipline/gens/tradesimulation.py in transform(self)
226 for dt, action in self.clock:
227 if action == BAR:
--> 228 for capital_change_packet in every_bar(dt):
229 yield capital_change_packet
230 elif action == SESSION_START:
/usr/local/lib/python3.7/dist-packages/zipline/gens/tradesimulation.py in every_bar(dt_to_use, current_data, handle_data)
141 metrics_tracker.process_commission(commission)
142
--> 143 handle_data(algo, current_data, dt_to_use)
144
145 # grab any new orders from the blotter, then clear the list.
/usr/local/lib/python3.7/dist-packages/zipline/utils/events.py in handle_data(self, context, data, dt)
207 context,
208 data,
--> 209 dt,
210 )
211
/usr/local/lib/python3.7/dist-packages/zipline/utils/events.py in handle_data(self, context, data, dt)
227 """
228 if self.rule.should_trigger(dt):
--> 229 self.callback(context, data)
230
231
<ipython-input-2-efa4e42dcada> in rebalance(context, data)
107
108 # Second, get the index makeup for all days prior to today.
--> 109 all_prior = context.index_members.loc[context.index_members.index < today]
110
111 # Now let's snag the first column of the last, i.e. latest, entry.
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/extension.py in wrapper(self, other)
154
155 op = getattr(self._data, opname)
--> 156 return op(other)
157
158 wrapper.__name__ = opname
/usr/local/lib/python3.7/dist-packages/pandas/core/ops/common.py in new_method(self, other)
67 other = item_from_zerodim(other)
68
---> 69 return method(self, other)
70
71 return new_method
/usr/local/lib/python3.7/dist-packages/pandas/core/arraylike.py in __lt__(self, other)
38 @unpack_zerodim_and_defer("__lt__")
39 def __lt__(self, other):
---> 40 return self._cmp_method(other, operator.lt)
41
42 @unpack_zerodim_and_defer("__le__")
/usr/local/lib/python3.7/dist-packages/pandas/core/arrays/datetimelike.py in _cmp_method(self, other, op)
1008 other = self._validate_comparison_value(other)
1009 except InvalidComparison:
-> 1010 return invalid_comparison(self, other, op)
1011
1012 dtype = getattr(other, "dtype", None)
/usr/local/lib/python3.7/dist-packages/pandas/core/ops/invalid.py in invalid_comparison(left, right, op)
32 else:
33 typ = type(right).__name__
---> 34 raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}")
35 return res_values
36
TypeError: Invalid comparison between dtype=datetime64[ns] and Timestamp
'''
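For reference, the comparison that fails is context.index_members.index < today: the traceback shows the CSV index is tz-naive (dtype=datetime64[ns]) while zipline's get_datetime() returns a tz-aware UTC Timestamp. A minimal sketch of one possible workaround, assuming the dates in sp500.csv carry no timezone information (untested here), is to localize the index to UTC inside initialize:
'''
# Inside initialize(), after reading the CSV:
context.index_members = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/My Project Folder/index members/sp500.csv',
    index_col=0, parse_dates=[0])
# Make the index tz-aware so it can be compared with zipline's UTC datetimes.
context.index_members.index = context.index_members.index.tz_localize('UTC')
'''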
I am running a simple code for KMeans:
# Imports assumed by this snippet
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Scaling the data set before clustering
scaler = StandardScaler()
subset = df[num_col].copy()
subset_scaled = scaler.fit_transform(subset)
subset_scaled_df = pd.DataFrame(subset_scaled, columns=subset.columns)

clusters = range(1, 9)
meanDistortions = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(subset_scaled_df)
    prediction = model.predict(subset_scaled_df)
    distortion = (
        sum(
            np.min(cdist(subset_scaled_df, model.cluster_centers_, "euclidean"), axis=1)
        )
        / subset_scaled_df.shape[0]
    )
    meanDistortions.append(distortion)
    print("Number of Clusters:", k, "\tAverage Distortion:", distortion)

plt.plot(clusters, meanDistortions, "bx-")
plt.xlabel("k")
plt.ylabel("Average Distortion")
plt.title("Selecting k with the Elbow Method", fontsize=20)
Running into the following error:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-173-4b988580ff32> in <module>
11 for k in clusters:
12 model = KMeans(n_clusters=k)
---> 13 model.fit(subset_scaled_df)
14 prediction = model.predict(subset_scaled_df)
15 distortion = (
/usr/local/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py in fit(self, X, y, sample_weight)
1006 if self._algorithm == "full":
1007 kmeans_single = _kmeans_single_lloyd
-> 1008 self._check_mkl_vcomp(X, X.shape[0])
1009 else:
1010 kmeans_single = _kmeans_single_elkan
/usr/local/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py in _check_mkl_vcomp(self, X, n_samples)
872 active_threads = int(np.ceil(n_samples / CHUNK_SIZE))
873 if active_threads < self._n_threads:
--> 874 modules = threadpool_info()
875 has_vcomp = "vcomp" in [module["prefix"] for module in modules]
876 has_mkl = ("mkl", "intel") in [
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in threadpool_info()
122 In addition, each module may contain internal_api specific entries.
123 """
--> 124 return _ThreadpoolInfo(user_api=_ALL_USER_APIS).todicts()
125
126
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in __init__(self, user_api, prefixes, modules)
338
339 self.modules = []
--> 340 self._load_modules()
341 self._warn_if_incompatible_openmp()
342 else:
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in _load_modules(self)
369 """Loop through loaded libraries and store supported ones"""
370 if sys.platform == "darwin":
--> 371 self._find_modules_with_dyld()
372 elif sys.platform == "win32":
373 self._find_modules_with_enum_process_module_ex()
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in _find_modules_with_dyld(self)
426
427 # Store the module if it is supported and selected
--> 428 self._make_module_from_path(filepath)
429
430 def _find_modules_with_enum_process_module_ex(self):
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in _make_module_from_path(self, filepath)
513 if prefix in self.prefixes or user_api in self.user_api:
514 module_class = globals()[module_class]
--> 515 module = module_class(filepath, prefix, user_api, internal_api)
516 self.modules.append(module)
517
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in __init__(self, filepath, prefix, user_api, internal_api)
603 self.user_api = user_api
604 self.internal_api = internal_api
--> 605 self._dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
606 self.version = self.get_version()
607 self.num_threads = self.get_num_threads()
/usr/local/Cellar/python#3.9/3.9.1_4/Frameworks/Python.framework/Versions/3.9/lib/python3.9/ctypes/__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error, winmode)
372
373 if handle is None:
--> 374 self._handle = _dlopen(self._name, mode)
375 else:
376 self._handle = handle
OSError: image not already loaded
However, if I replace the above code with the following, it works fine:
clusters = range(1, 9)
meanDistortions = []
for k in clusters:
    model = KMeans(n_clusters=8)
Instead of passing "k" into KMeans(n_clusters=...), it works fine if I pass a literal integer. I'm not able to understand what is going wrong; any pointers would be greatly appreciated.
Thanks!
Try updating your scikit-learn package:
pip install -U scikit-learn
I assume it throws the error for k=1 and works for k > 1, which would also explain your working modification. So you could use range(2, 9) as a quick fix. I'm observing the same in my most recent scikit-learn environment (0.24.2).
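A minimal sketch of that quick fix, reusing subset_scaled_df from the question and assuming the rest of the setup is unchanged:

clusters = range(2, 9)  # skip k=1, where the error appears to be triggered
meanDistortions = []
for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(subset_scaled_df)
    distortion = (
        sum(np.min(cdist(subset_scaled_df, model.cluster_centers_, "euclidean"), axis=1))
        / subset_scaled_df.shape[0]
    )
    meanDistortions.append(distortion)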
I’m trying to use PyMC3 Minibatch ADVI for Bayesian Regression. The pm.fit function throws the following error and I’m not sure how to fix it.
It says that a 'str' object has no attribute 'type'. Where is the 'str' object the error message refers to? I've mapped float tensors for more_replacements to the best of my knowledge.
advi = pm.ADVI()
tracker = pm.callbacks.Tracker(mean=advi.approx.mean.eval,std=advi.approx.std.eval)
map_tensor_batch = {'x_tensor': pm.Minibatch(X_train, dtype=float),'y_tensor':pm.Minibatch(y_train['target'],dtype=float)}
approx = advi.fit(20000, obj_optimizer=pm.sgd(learning_rate=0.01), callbacks=[tracker], more_replacements = map_tensor_batch)
Your answers will be appreciated.
Addendum: If I just say
pm.Minibatch(np.array([tuple(y.iloc[i,[0]]) for i in train_index])).type()
(Or)
map_tensor_batch['y_tensor'].type()
I get the following result:
<TensorType(float64, matrix)>
Then why does it throw the attribute error below? Again, what’s my ‘str’ object?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-24-2824def803dc> in <module>
56 tracker = pm.callbacks.Tracker(mean=advi.approx.mean.eval,std=advi.approx.std.eval)
57 map_tensor_batch = {'x_tensor': pm.Minibatch(X_train, dtype=float),'y_tensor': pm.Minibatch(y_train['target'],dtype=float)}
---> 58 approx = advi.fit(20000, obj_optimizer=pm.sgd(learning_rate=0.01), callbacks=[tracker], more_replacements = map_tensor_batch)
59 fig = plt.figure(figsize=(16, 9))
60 mu_ax = fig.add_subplot(221)
/opt/conda/lib/python3.7/site-packages/pymc3/variational/inference.py in fit(self, n, score, callbacks, progressbar, **kwargs)
142 callbacks = []
143 score = self._maybe_score(score)
--> 144 step_func = self.objective.step_function(score=score, **kwargs)
145 if progressbar:
146 progress = progress_bar(range(n), display=progressbar)
/opt/conda/lib/python3.7/site-packages/theano/configparser.py in res(*args, **kwargs)
46 def res(*args, **kwargs):
47 with self:
---> 48 return f(*args, **kwargs)
49
50 return res
/opt/conda/lib/python3.7/site-packages/pymc3/variational/opvi.py in step_function(self, obj_n_mc, tf_n_mc, obj_optimizer, test_optimizer, more_obj_params, more_tf_params, more_updates, more_replacements, total_grad_norm_constraint, score, fn_kwargs)
358 more_updates=more_updates,
359 more_replacements=more_replacements,
--> 360 total_grad_norm_constraint=total_grad_norm_constraint,
361 )
362 if score:
/opt/conda/lib/python3.7/site-packages/pymc3/variational/opvi.py in updates(self, obj_n_mc, tf_n_mc, obj_optimizer, test_optimizer, more_obj_params, more_tf_params, more_updates, more_replacements, total_grad_norm_constraint)
244 more_obj_params=more_obj_params,
245 more_replacements=more_replacements,
--> 246 total_grad_norm_constraint=total_grad_norm_constraint,
247 )
248 resulting_updates.update(more_updates)
/opt/conda/lib/python3.7/site-packages/pymc3/variational/opvi.py in add_obj_updates(self, updates, obj_n_mc, obj_optimizer, more_obj_params, more_replacements, total_grad_norm_constraint)
284 more_replacements = dict()
285 obj_target = self(
--> 286 obj_n_mc, more_obj_params=more_obj_params, more_replacements=more_replacements
287 )
288 grads = pm.updates.get_or_compute_grads(obj_target, self.obj_params + more_obj_params)
/opt/conda/lib/python3.7/site-packages/theano/configparser.py in res(*args, **kwargs)
46 def res(*args, **kwargs):
47 with self:
---> 48 return f(*args, **kwargs)
49
50 return res
/opt/conda/lib/python3.7/site-packages/pymc3/variational/opvi.py in __call__(self, nmc, **kwargs)
401 m = 1.0
402 a = self.op.apply(self.tf)
--> 403 a = self.approx.set_size_and_deterministic(a, nmc, 0, kwargs.get("more_replacements"))
404 return m * self.op.T(a)
405
/opt/conda/lib/python3.7/site-packages/theano/configparser.py in res(*args, **kwargs)
46 def res(*args, **kwargs):
47 with self:
---> 48 return f(*args, **kwargs)
49
50 return res
/opt/conda/lib/python3.7/site-packages/pymc3/variational/opvi.py in set_size_and_deterministic(self, node, s, d, more_replacements)
1496 _node = node
1497 optimizations = self.get_optimization_replacements(s, d)
-> 1498 flat2rand = self.make_size_and_deterministic_replacements(s, d, more_replacements)
1499 node = theano.clone(node, optimizations)
1500 node = theano.clone(node, flat2rand)
/opt/conda/lib/python3.7/site-packages/pymc3/variational/opvi.py in make_size_and_deterministic_replacements(self, s, d, more_replacements)
1470 flat2rand = collections.OrderedDict()
1471 for g in self.groups:
-> 1472 flat2rand.update(g.make_size_and_deterministic_replacements(s, d, more_replacements))
1473 flat2rand.update(more_replacements)
1474 return flat2rand
/opt/conda/lib/python3.7/site-packages/pymc3/variational/opvi.py in make_size_and_deterministic_replacements(self, s, d, more_replacements)
1186 initial = tt.patternbroadcast(initial, self.symbolic_initial.broadcastable)
1187 if more_replacements:
-> 1188 initial = theano.clone(initial, more_replacements)
1189 return {self.symbolic_initial: initial}
1190
/opt/conda/lib/python3.7/site-packages/theano/scan/utils.py in clone(output, replace, strict, share_inputs)
212 )
213 )
--> 214 tmp_replace = [(x, x.type()) for x, y in items]
215 new_replace = [(x, y) for ((_, x), (_, y)) in zip(tmp_replace, items)]
216 _, _outs, _ = rebuild_collect_shared(
/opt/conda/lib/python3.7/site-packages/theano/scan/utils.py in <listcomp>(.0)
212 )
213 )
--> 214 tmp_replace = [(x, x.type()) for x, y in items]
215 new_replace = [(x, y) for ((_, x), (_, y)) in zip(tmp_replace, items)]
216 _, _outs, _ = rebuild_collect_shared(
AttributeError: 'str' object has no attribute 'type'
A workaround seems to be to use a tuple instead of a dictionary. The following doesn’t throw the error as before.
advi = pm.ADVI()
tracker = pm.callbacks.Tracker(mean=advi.approx.mean.eval,std=advi.approx.std.eval)
#map_tensor_batch = {'x_tensor': pm.Minibatch(x_tensor.eval()),'y_tensor': pm.Minibatch(y_tensor.eval())}
map_tensor_batch = (pm.Minibatch(x_tensor.eval()),pm.Minibatch(y_tensor.eval()))
approx = advi.fit(20000, obj_optimizer=pm.sgd(learning_rate=0.00001), callbacks=[tracker], more_replacements = map_tensor_batch)
But there’s another error as below:
TypeError: TensorType does not support iteration. Maybe you are using builtins.sum instead of theano.tensor.sum? (Maybe .max?)
Any fix to this?
This (https://alexioannides.com/2018/11/07/bayesian-regression-in-pymc3-using-mcmc-variational-inference/) is the example I’m trying to follow.
The blog post you are working from shows
import theano
y_tensor = theano.shared(train.y.values.astype('float64'))
x_tensor = theano.shared(train.x.values.astype('float64'))
map_tensor_batch = {y_tensor: pm.Minibatch(train.y.values, 100),
                    x_tensor: pm.Minibatch(train.x.values, 100)}
That is, map_tensor_batch should be a dict, but the keys are Theano tensors, not mere strings.
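Applied to the code in the question, a rough sketch (assuming X_train and y_train['target'] can be converted to NumPy float arrays, and that the model was built against shared variables named x_tensor and y_tensor inside the same model context as in the question) would look something like:

import numpy as np
import theano

# Shared variables the model is built against (names mirror the question).
x_tensor = theano.shared(np.asarray(X_train, dtype='float64'))
y_tensor = theano.shared(np.asarray(y_train['target'], dtype='float64'))

# Keys are the shared tensors themselves, not the strings 'x_tensor'/'y_tensor'.
map_tensor_batch = {x_tensor: pm.Minibatch(np.asarray(X_train), 100),
                    y_tensor: pm.Minibatch(np.asarray(y_train['target']), 100)}

advi = pm.ADVI()
approx = advi.fit(20000,
                  obj_optimizer=pm.sgd(learning_rate=0.01),
                  more_replacements=map_tensor_batch)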
My code in its simplest form looks as below. I'm trying to connect to Hive from a Jupyter notebook. The code works fine when I query a smaller number of rows, say select * from table limit 200, but it throws the error below when I run something like select * from table. The table is around 180 MB and can easily be loaded into memory, so I'm not sure why this is happening. Any help is much appreciated! I looked at other similar questions, but they were not helpful.
Package versions:
python - 3.7
pandas - 0.24.2
pyhive - 0.6.1
import pandas
import os
from pyhive import hive

def hiveconnection(query):
    conn = hive.Connection(host=HOST, port=10000, username=USER,
                           password=PASSWORD, auth='LDAP')
    cur = conn.cursor()
    cur.execute(query)
    result = cur.fetchall()
    conn.close()
    return result

query = """select * from table"""
df_new = hiveconnection(query)
The query runs fine and I get results for a smaller result set, say select * from table limit 200, but I get the following error when I query a larger number of rows.
---------------------------------------------------------------------------
TTransportException                       Traceback (most recent call last)
in
      1 query = """select * from sample_table"""
----> 2 df_new = hiveconnection(query)

in hiveconnection(query)
      7     cur = conn.cursor()
      8     cur.execute(query)
----> 9     result = cur.fetchall()
     10     conn.close()
     11

/usr/local/anaconda/lib/python3.7/site-packages/pyhive/common.py in fetchall(self)
    134             :py:meth:execute did not produce any result set or no call was issued yet.
    135         """
--> 136         return list(iter(self.fetchone, None))
    137
    138     @property

/usr/local/anaconda/lib/python3.7/site-packages/pyhive/common.py in fetchone(self)
    103
    104         # Sleep until we're done or we have some data to return
--> 105         self._fetch_while(lambda: not self._data and self._state != self._STATE_FINISHED)
    106
    107         if not self._data:

/usr/local/anaconda/lib/python3.7/site-packages/pyhive/common.py in _fetch_while(self, fn)
     43     def _fetch_while(self, fn):
     44         while fn():
---> 45             self._fetch_more()
     46             if fn():
     47                 time.sleep(self._poll_interval)

/usr/local/anaconda/lib/python3.7/site-packages/pyhive/hive.py in _fetch_more(self)
    384             maxRows=self.arraysize,
    385         )
--> 386         response = self._connection.client.FetchResults(req)
    387         _check_status(response)
    388         schema = self.description

/usr/local/anaconda/lib/python3.7/site-packages/TCLIService/TCLIService.py in FetchResults(self, req)
    712         """
    713         self.send_FetchResults(req)
--> 714         return self.recv_FetchResults()
    715
    716     def send_FetchResults(self, req):

/usr/local/anaconda/lib/python3.7/site-packages/TCLIService/TCLIService.py in recv_FetchResults(self)
    731             raise x
    732         result = FetchResults_result()
--> 733         result.read(iprot)
    734         iprot.readMessageEnd()
    735         if result.success is not None:

/usr/local/anaconda/lib/python3.7/site-packages/TCLIService/TCLIService.py in read(self, iprot)
   3468                 if ftype == TType.STRUCT:
   3469                     self.success = TFetchResultsResp()
-> 3470                     self.success.read(iprot)
   3471                 else:
   3472                     iprot.skip(ftype)

/usr/local/anaconda/lib/python3.7/site-packages/TCLIService/ttypes.py in read(self, iprot)
   6581                 if ftype == TType.STRUCT:
   6582                     self.results = TRowSet()
-> 6583                     self.results.read(iprot)
   6584                 else:
   6585                     iprot.skip(ftype)

/usr/local/anaconda/lib/python3.7/site-packages/TCLIService/ttypes.py in read(self, iprot)
   2865                     for _i114 in range(_size110):
   2866                         _elem115 = TColumn()
-> 2867                         _elem115.read(iprot)
   2868                         self.columns.append(_elem115)
   2869                     iprot.readListEnd()

/usr/local/anaconda/lib/python3.7/site-packages/TCLIService/ttypes.py in read(self, iprot)
   2727                 if ftype == TType.STRUCT:
   2728                     self.i64Val = TI64Column()
-> 2729                     self.i64Val.read(iprot)
   2730                 else:
   2731                     iprot.skip(ftype)

/usr/local/anaconda/lib/python3.7/site-packages/TCLIService/ttypes.py in read(self, iprot)
   2351                     (_etype79, _size76) = iprot.readListBegin()
   2352                     for _i80 in range(_size76):
-> 2353                         _elem81 = iprot.readI64()
   2354                         self.values.append(_elem81)
   2355                     iprot.readListEnd()

/usr/local/anaconda/lib/python3.7/site-packages/thrift/protocol/TBinaryProtocol.py in readI64(self)
    220
    221     def readI64(self):
--> 222         buff = self.trans.readAll(8)
    223         val, = unpack('!q', buff)
    224         return val

/usr/local/anaconda/lib/python3.7/site-packages/thrift/transport/TTransport.py in readAll(self, sz)
     60         have = 0
     61         while (have < sz):
---> 62             chunk = self.read(sz - have)
     63             chunkLen = len(chunk)
     64             have += chunkLen

/usr/local/anaconda/lib/python3.7/site-packages/thrift_sasl/__init__.py in read(self, sz)
    171             return ret
    172
--> 173         self._read_frame()
    174         return ret + self.__rbuf.read(sz - len(ret))
    175

/usr/local/anaconda/lib/python3.7/site-packages/thrift_sasl/__init__.py in _read_frame(self)
    188         else:
    189             # If the frames are not encoded, just pass it through
--> 190             decoded = self._trans.read(length)
    191         self.__rbuf = BufferIO(decoded)
    192

/usr/local/anaconda/lib/python3.7/site-packages/thrift/transport/TSocket.py in read(self, sz)
    141         if len(buff) == 0:
    142             raise TTransportException(type=TTransportException.END_OF_FILE,
--> 143                                       message='TSocket read 0 bytes')
    144         return buff
    145
TTransportException: TSocket read 0 bytes
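No root cause is visible in the question itself; TSocket read 0 bytes generally means the server side closed the connection while the client was still fetching (for example a HiveServer2 timeout or resource limit), which would be consistent with only large result sets failing. As a hedged workaround sketch, not a guaranteed fix, one thing worth trying is streaming the result in batches with the standard DB-API fetchmany() instead of materializing everything in one fetchall(); the hiveconnection_batched name below is made up for illustration:

def hiveconnection_batched(query, batch_size=10000):
    # Variant of hiveconnection() that pulls rows in chunks rather than all at once.
    conn = hive.Connection(host=HOST, port=10000, username=USER,
                           password=PASSWORD, auth='LDAP')
    cur = conn.cursor()
    cur.execute(query)
    rows = []
    while True:
        batch = cur.fetchmany(batch_size)  # standard DB-API call supported by PyHive
        if not batch:
            break
        rows.extend(batch)
    conn.close()
    return rows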
I'm making a bag from a plain txt file - it's got a bunch of reviews, delimited by two newlines. But sometimes - and I really can't predict when - it gives me FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Workspaces/Books/Dask/foods.txt' while processing it.
Here's the actual code:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import numpy as np
import dask.bag as bag
import os

def get_next_part(file, start_index, span_index=0, blocksize=1000):
    file.seek(start_index)
    buffer = file.read(blocksize + span_index).decode('cp1252')
    delimiter_position = buffer.find('\n\n')
    if delimiter_position == -1:
        return get_next_part(file, start_index, span_index + blocksize)
    else:
        file.seek(start_index)
        return start_index, delimiter_position

def get_item(filename, start_index, delimiter_position, encoding='cp1252'):
    with open(filename, 'rb') as file_handle:
        file_handle.seek(start_index)
        text = file_handle.read(delimiter_position).decode(encoding)
        return dict((element.split(': ')[0], element.split(': ')[1])
                    if len(element.split(': ')) > 1
                    else ('unknown', element)
                    for element in text.strip().split('\n'))

with open(f"{os.getcwd()}/foods.txt", 'rb') as file_handle:
    size = file_handle.seek(0, 2) - 1
    more_data = True
    output = []
    current_position = next_position = 0
    while more_data:
        if current_position >= size:
            more_data = False
        else:
            current_position, next_position = get_next_part(file_handle, current_position, 0)
            output.append((current_position, next_position))
            current_position = current_position + next_position + 2

with ProgressBar():
    reviews = (bag.from_sequence(output, npartitions=104)
               .map(lambda x: get_item(f"{os.getcwd()}/foods.txt",
                                       x[0],
                                       x[1]))
               .compute())
Sometimes it works fine, but other times it gives me something along these lines (different percentage every time):
[########## ] | 26% Completed | 54.3s
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-1-90a316620d10> in <module>()
42 with ProgressBar():
43 reviews = (bag.from_sequence(output, npartitions=104)
---> 44 .map(lambda x: get_item(f"{os.getcwd()}/foods.txt",
45 x[0],
46 x[1]))
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
396 keys = [x.__dask_keys__() for x in collections]
397 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398 results = schedule(dsk, keys, **kwargs)
399 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
400
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, **kwargs)
190 get_id=_process_get_id, dumps=dumps, loads=loads,
191 pack_exception=pack_exception,
--> 192 raise_exception=reraise, **kwargs)
193 finally:
194 if cleanup:
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
460 _execute_task(task, data) # Re-execute locally
461 else:
--> 462 raise_exception(exc, tb)
463 res, worker_id = loads(res_info)
464 state['cache'][key] = res
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/compatibility.py in reraise(exc, tb)
109 def reraise(exc, tb=None):
110 if exc.__traceback__ is not tb:
--> 111 raise exc.with_traceback(tb)
112 raise exc
113
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/local.py in execute_task()
228 try:
229 task, data = loads(task_info)
--> 230 result = _execute_task(task, data)
231 id = get_id()
232 result = dumps((result, id))
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/core.py in _execute_task()
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/bag/core.py in reify()
1589 def reify(seq):
1590 if isinstance(seq, Iterator):
-> 1591 seq = list(seq)
1592 if seq and isinstance(seq[0], Iterator):
1593 seq = list(map(list, seq))
~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/bag/core.py in map_chunk()
1749 else:
1750 for a in zip(*args):
-> 1751 yield f(*a)
1752
1753 # Check that all iterators are fully exhausted
<ipython-input-1-90a316620d10> in <lambda>()
44 .map(lambda x: get_item(f"{os.getcwd()}/foods.txt",
45 x[0],
---> 46 x[1]))
47 .compute())
<ipython-input-1-90a316620d10> in get_item()
18
19 def get_item(filename, start_index, delimiter_position, encoding='cp1252'):
---> 20 with open(filename, 'rb') as file_handle:
21 file_handle.seek(start_index)
22 text = file_handle.read(delimiter_position).decode(encoding)
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Workspaces/Books/Dask/foods.txt'
I've tried messing with the partition numbers - leaving it as default (101), or making sure it's a multiple of 4. Doesn't seem to have an effect.
Anyone know what's going on here? It usually works if I run it a second time, but that's still tough to deal with.
I'm using the latest version of Dask, installed with conda; it's all in JupyterLab, and I'm running it from Windows Subsystem for Linux.
Thanks!
Wasn't able to fix my initial read method, but I was able to find another way of doing the parallel read (with native Dask objects, too!).
Sections were delimited with \n\n, and the linedelimiter argument to bag.read_text didn't mean what I thought it meant, but with this I was able to figure out a way to get the sections I needed: Why `linedelimiter` does not work for bag.read_text?
reviews = (
    bag.read_text(
        f"{os.getcwd()}/foods.txt",
        encoding="cp1252",
        blocksize="10MB",
        linedelimiter="\n\n",
    )
    .map_partitions(lambda x: "".join(x).split("\n\n"))
)
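For completeness, a rough sketch of how the resulting bag of sections might then be parsed into dicts and materialized, reusing the same key: value splitting idea as get_item above (the parse_section helper name is made up for illustration):

def parse_section(text):
    # Same "key: value" splitting idea as get_item(), applied to one section.
    return dict((element.split(': ')[0], element.split(': ')[1])
                if len(element.split(': ')) > 1
                else ('unknown', element)
                for element in text.strip().split('\n'))

with ProgressBar():
    parsed_reviews = (
        bag.read_text(f"{os.getcwd()}/foods.txt",
                      encoding="cp1252",
                      blocksize="10MB",
                      linedelimiter="\n\n")
        .map_partitions(lambda x: "".join(x).split("\n\n"))
        .map(parse_section)
        .compute()
    )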