Expected binary or unicode string, got nan - tensorflow/pandas - python

I am fairly new to TensorFlow/machine learning and therefore have a few difficulties. I have a dataset in CSV format here and want to read it with pandas like here. The code worked on a different dataset, but I modified and extended it, and I think I am missing something important. Basically, all I am trying to do is predict the "overall" rating from the given dataset. Here's my code and the traceback I get:
import pandas as pd
import tensorflow as tf
import tempfile

COLUMNS = ["reviewerID", "asin", "reviewerName", "helpful_0", "helpful_1", "reviewText",
           "overall", "summary", "unixReviewTime"]
CATEGORICAL_COLUMNS = ["reviewerID", "reviewerName", "reviewText", "summary"]
CONTINUOUS_COLUMNS = ["helpful_0", "helpful_1", "unixReviewTime"]

df_train = pd.read_csv('Digital_Music_5.csv', names=COLUMNS, skipinitialspace=True,
                       low_memory=False, skiprows=1)
df_test = pd.read_csv('Digital_Music_5_test.csv', names=COLUMNS,
                      skipinitialspace=True, skiprows=1)

LABEL_COLUMN = "label"
df_train[LABEL_COLUMN] = df_train["overall"]
df_test[LABEL_COLUMN] = df_train["overall"]
print(df_train)

def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k)
    # to the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name
    # (k) to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        dense_shape=[df[k].size, 1]) for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

def train_input_fn():
    return input_fn(df_train)

def eval_input_fn():
    return input_fn(df_test)

reviewText = tf.contrib.layers.sparse_column_with_hash_bucket("reviewText", hash_bucket_size=100000)
reviewerID = tf.contrib.layers.sparse_column_with_hash_bucket("reviewerID", hash_bucket_size=100000)
reviewerName = tf.contrib.layers.sparse_column_with_hash_bucket("reviewerName", hash_bucket_size=100000)
summary = tf.contrib.layers.sparse_column_with_hash_bucket("summary", hash_bucket_size=100000)
asin = tf.contrib.layers.real_valued_column("asin")
helpful_0 = tf.contrib.layers.real_valued_column("helpful_0")
helpful_1 = tf.contrib.layers.real_valued_column("helpful_1")
unixReviewTime = tf.contrib.layers.real_valued_column("unixReviewTime")

# reviewText_x_summary = tf.contrib.layers.crossed_column([reviewText, summary], hash_bucket_size=100000)
# reviewerID_x_reviewerName = tf.contrib.layers.crossed_column([reviewerID, reviewerName], hash_bucket_size=100000)
# reviewText_x_reviewerID_x_reviewerName = tf.contrib.layers.crossed_column([reviewText, reviewerID, reviewerName], hash_bucket_size=100000)

model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.LinearClassifier(feature_columns=[reviewText, reviewerName, summary,
                                                       asin, helpful_0, helpful_1, unixReviewTime],
                                      optimizer=tf.train.FtrlOptimizer(
                                          learning_rate=0.1,
                                          l1_regularization_strength=1.0,
                                          l2_regularization_strength=1.0),
                                      model_dir=model_dir)
m.fit(input_fn=train_input_fn, steps=200)

# results = m.evaluate(input_fn=eval_input_fn, steps=1)
# for key in sorted(results):
#     print("{}: {}".format(key, results[key]))
Traceback:
Traceback (most recent call last):
File "amazon_reviews.py", line 78, in <module>
m.fit(input_fn=train_input_fn, steps=200)
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 280, in new_func
return func(*args, **kwargs)
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 426, in fit
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 932, in _train_model
features, labels = input_fn()
File "amazon_reviews.py", line 47, in train_input_fn
return input_fn(df_train)
File "amazon_reviews.py", line 36, in input_fn
dense_shape=[df[k].size, 1],) for k in CATEGORICAL_COLUMNS}
File "amazon_reviews.py", line 36, in <dictcomp>
dense_shape=[df[k].size, 1],) for k in CATEGORICAL_COLUMNS}
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/sparse_tensor.py", line 125, in __init__
values, name="values", as_ref=True)
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 702, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 110, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 99, in constant
tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 451, in make_tensor_proto
append_fn(tensor_proto, proto_values)
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 109, in SlowAppendObjectArrayToTensorProto
tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 109, in <listcomp>
tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
File "/home/cfritz/virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/util/compat.py", line 65, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got nan

Your input DataFrame contains empty reviewer names and review texts, which pd.read_csv() maps to NaN; TensorFlow, however, expects a string, not NaN.
Check for the empty cells using this command:
df_train[df_train.isnull().any(axis=1)]
You can simply convert these NaNs to an empty string using
df_train.fillna('', inplace=True)
or have pd.read_csv() keep empty strings instead of producing NaNs by disabling the default NaN detection with keep_default_na=False (optionally combined with na_values=[]):
df_train = pd.read_csv('Digital_Music_5.csv', names=COLUMNS,
                       skipinitialspace=True, low_memory=False,
                       skiprows=1, na_values=[], keep_default_na=False)
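If it helps, here is a minimal sketch (reusing the question's CATEGORICAL_COLUMNS and the two DataFrames; the loop itself is just an illustration) that fills only the string-valued columns and confirms no NaNs remain before building the input functions:
# Fill NaNs only in the string-valued columns and check that none are left.
for frame in (df_train, df_test):
    frame[CATEGORICAL_COLUMNS] = frame[CATEGORICAL_COLUMNS].fillna('')
    assert not frame[CATEGORICAL_COLUMNS].isnull().any().any()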

Related

Generating data via SDV GaussianCopula throws "numpy.linalg.LinAlgError: SVD did not converge" in Python

I am currently using SDV and GaussianCopula (https://sdv.dev/SDV/user_guides/single_table/gaussian_copula.html) to train my models. I have a given dataset which is loaded for training.
However, I get the following error message when generating the synthetic data:
Saving Model to path D:/.../GaussianCopula/model_MLB_1.pkl
Generating 22479 rows of synthetic data
Traceback (most recent call last):
File ".\generate_gaussian_model.py", line 47, in <module>
samples = gaussianCopula.sample(len(data.index))
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\base.py", line 442, in sample
return self._sample_batch(num_rows, max_retries, max_rows_multiplier)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\base.py", line 300, in _sample_batch
num_rows, conditions, transformed_conditions, float_rtol)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\base.py", line 228, in _sample_rows
sampled = self._sample(num_rows)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\copulas.py", line 319, in _sample
return self._model.sample(num_rows, conditions=conditions)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\copulas\__init__.py", line 36, in wrapper
return function(self, *args, **kwargs)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\copulas\multivariate\gaussian.py", line 249, in sample
samples = self._get_normal_samples(num_rows, conditions)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\copulas\multivariate\gaussian.py", line 223, in _get_normal_samples
samples = np.random.multivariate_normal(means, covariance, size=num_rows)
File "mtrand.pyx", line 4120, in numpy.random.mtrand.RandomState.multivariate_normal
File "<__array_function__ internals>", line 6, in svd
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\linalg\linalg.py", line 1660, in svd
u, s, vh = gufunc(a, signature=signature, extobj=extobj)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\linalg\linalg.py", line 97, in _raise_linalgerror_svd_nonconvergence
raise LinAlgError("SVD did not converge")
numpy.linalg.LinAlgError: SVD did not converge
I also checked out the following thread and tried to apply its solution (which you can see below), but it didn't work.
This is my script (generate_gaussian_model.py) and what I've tried so far:
from sdv.tabular import GaussianCopula
import pickle
import pandas as pd
from pandas.core.indexes.base import Index

header_import_path = "C:/Users/.../headers/all_headers.txt"
all_mlb_names = ['MLB_1', 'MLB_7', 'MLB_19', 'MLB_31', 'MLB_41', 'MLB_45', 'MLB_49', 'MLB_53', 'MLB_58']

with open(header_import_path, 'rb') as fp:
    all_headers = pickle.load(fp)

for mlb_file_name in all_mlb_names:
    # Create a separate model for each MLB table
    model_export_path = "D:/.../GaussianCopula/model_{0}.pkl".format(mlb_file_name)
    synth_data_export_path = "C:/Users/.../models/generated/{0}_samples.csv".format(mlb_file_name)
    data_import_path = "C:/Users/.../models/original/{0}.csv".format(mlb_file_name)
    headers = all_headers[mlb_file_name]

    print("Read data for table {0}".format(mlb_file_name))
    data = pd.read_csv(data_import_path, sep='|', names=headers)

    # This is necessary to remove invalid columns from my original dataset
    for colname in data.columns:
        if colname.startswith("Calculation"):
            data = data.drop(axis=1, labels=[colname])

    # Thought this would fix my issue but it didn't
    # https://stackoverflow.com/questions/21827594/raise-linalgerrorsvd-did-not-converge-linalgerror-svd-did-not-converge-in-m
    data.dropna(inplace=True)

    # print("Takes a third of the dataset")
    data = data.sample(frac=0.3)
    print(data)

    gaussianCopula = GaussianCopula()
    print("Start training of GaussianCopula Model")
    gaussianCopula.fit(data)

    print("Saving Model to path {0}".format(model_export_path))
    gaussianCopula.save(model_export_path)

    print("Generating {0} rows of synthetic data".format(len(data.index)))
    # Here it begins to crash
    samples = gaussianCopula.sample(len(data.index))
    samples.to_csv(synth_data_export_path, header=True, sep='|', index=False)
The following command would work, but that would not leave me with enough data: data = data.sample(n=1000)
I hope you can help me out and explain this error message to me.

Pandas Array Exception: Data must be 1-Dimensional

This is my Python script for using the Markov Blanket algorithm on my dataset:
# Imports inferred from the calls below (the original script omits them).
from pandas import read_csv
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from numpy.random import seed
from PyImpetus import PPIMBC

df1 = read_csv("input-binary-120-training.csv")
Y1 = df1[df1.CategoryL == 1].CategoryL
X1 = minmax_scale(df1[df1.CategoryL == 1].ix[:, 1:24], axis=0)
y_train = Y1.values

df2 = read_csv("input-binary-120-test.csv")
Y2 = df2[df2.CategoryL == 1].CategoryL
X2 = minmax_scale(df2[df2.CategoryL == 1].ix[:, 1:24], axis=0)
y_test = Y2.values
x_test = X2.reshape(X2.shape[0], X2.shape[1], 1)

seed(2017)
kfold = KFold(n_splits=5, random_state=27, shuffle=True)
scores = list()

# Create a PyImpetus classification object and initialize with required parameters
model = PPIMBC(LogisticRegression(random_state=27, max_iter=1000, class_weight="balanced"), cv=0, num_simul=20, simul_type=0, simul_size=0.2, random_state=27, sig_test_type="non-parametric", verbose=2, p_val_thresh=0.05)
x_train = model.fit_transform(X1, Y1)
x_test = model.transform(x_test)
print("Markov Blanket: ", model.MB)
But for the line x_train = model.fit_transform(X1, Y1) I get the exception:
Data must be 1-Dimensional.
I used X1.flatten(), but it didn't work. Could you please advise me about this issue?
Full error:
x_train = model.fit_transform(X1, Y1)
File "/home/osboxes/Downloads/Thesis/PyImpetus.py", line 326, in fit_transform
self.fit(data, Y)
File "/home/osboxes/Downloads/Thesis/PyImpetus.py", line 299, in fit
final_MB, final_feat_imp = self._find_MB(data.copy(), Y)
File "/home/osboxes/Downloads/Thesis/PyImpetus.py", line 221, in _find_MB
Y = np.reshape(Y, (-1, 1))
File "<__array_function__ internals>", line 6, in reshape
File "/home/osboxes/venv/lib/python3.6/site-packages/numpy/core/fromnumeric.py", line 299, in reshape
return _wrapfunc(a, 'reshape', newshape, order=order)
File "/home/osboxes/venv/lib/python3.6/site-packages/numpy/core/fromnumeric.py", line 55, in _wrapfunc
return _wrapit(obj, method, *args, **kwds)
File "/home/osboxes/venv/lib/python3.6/site-packages/numpy/core/fromnumeric.py", line 48, in _wrapit
result = wrap(result)
File "/home/osboxes/venv/lib/python3.6/site-packages/pandas/core/generic.py", line 1999, in __array_wrap__
return self._constructor(result, **d).__finalize__(self)
File "/home/osboxes/venv/lib/python3.6/site-packages/pandas/core/series.py", line 311, in __init__
data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
File "/home/osboxes/venv/lib/python3.6/site-packages/pandas/core/internals/construction.py", line 729, in sanitize_array
raise Exception("Data must be 1-dimensional")
Exception: Data must be 1-dimensional
Try reshaping Y1 to one dimension, either with Y1 = Y1[:, 0] or Y1 = Y1.ravel().
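As a minimal sketch (variable names from the question; the explicit conversion step is my addition): since Y1 here is a pandas Series, turning it into a plain 1-D NumPy array before fitting avoids the failing reshape inside PyImpetus:
import numpy as np

y1_arr = np.asarray(Y1).ravel()            # plain 1-D label array, shape (n_samples,)
x_train = model.fit_transform(X1, y1_arr)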

MemoryError when running python script on google cloud

I am trying to use Google Cloud to run a script that makes predictions for every line of a test.csv file. I use the cloud because it looks like Google Colab is going to take too long. However, when I run it, there is a memory error:
(pre_env) mikempc3@instance-1:~$ python predictSales.py
Traceback (most recent call last):
File "predictSales.py", line 7, in <module>
sales = pd.read_csv("sales_train.csv")
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 463, in _read
data = parser.read(nrows)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 1169, in read
df = DataFrame(col_dict, columns=columns, index=index)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/frame.py", line 411, in __init__
mgr = init_dict(data, index, columns, dtype=dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/construction.py", line 257, in init_dict
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/construction.py", line 87, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1694, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1764, in form_blocks
int_blocks = _multi_blockify(items_dict["IntBlock"])
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1846, in _multi_blockify
values, placement = _stack_arrays(list(tup_block), dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1874, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError: Unable to allocate 67.2 MiB for an array with shape (3, 2935849) and data type int64
Here is my script:
import statsmodels.tsa.arima.model as smt
import pandas as pd
import datetime
import numpy as np

sales = pd.read_csv("sales_train.csv")
test = pd.read_csv("test.csv")

sales.date = sales.date.apply(lambda x: datetime.datetime.strptime(x, "%d.%m.%Y"))
sales_monthly = sales.groupby(
    ["date_block_num", "shop_id", "item_id"])["date", "item_price",
                                              "item_cnt_day"].agg({
        "date": ["min", "max"],
        "item_price": "mean",
        "item_cnt_day": "sum"})

array = []
for i, row in test.iterrows():
    print("row['shop_id']: ", row['shop_id'], " row['item_id']: ", row['item_id'])
    print(statsmodels.__version__)
    ts = pd.DataFrame(sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :]['item_price'].values *
                      sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :][
                          'item_cnt_day'].values).T.iloc[0]
    print(ts.values)
    if ts.values != [] and len(ts.values) > 2:
        best_aic = np.inf
        best_order = None
        best_model = None
        ranges = range(1, 5)
        for difference in ranges:
            # try:
            tmp_model = smt.ARIMA(ts.values, order=(0, 1, 0), trend='t').fit()
            tmp_aic = tmp_model.aic
            if tmp_aic < best_aic:
                best_aic = tmp_aic
                best_difference = difference
                best_model = tmp_model
            # except Exception as e:
            #     print(e)
            #     continue
        if best_model is not None:
            y_hat = best_model.forecast()[0]
            if y_hat < 0:
                y_hat = 0
        else:
            y_hat = 0
    else:
        y_hat = 0
    print("predicted:", y_hat)
    d = {'id': row['ID'], 'item_cnt_month': y_hat}
    array.append(d)
    print("-------------------")

df = pd.DataFrame(array)
df.to_csv("submission.csv")
You can use the Fil memory profiler (https://pythonspeed.com/fil) to figure out which lines of code are responsible for peak memory use. It will also handle out-of-memory conditions and dump a report when you run out.
The only caveats are that (1) it requires Python 3.6 or later and (2) it only runs on Linux or macOS. We're up to Python 3.9 now, so it's probably time to upgrade regardless.
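For reference, a typical invocation (package and command names as given in the Fil documentation) looks like this, run from the virtualenv used for the script:
pip install filprofiler
fil-profile run predictSales.py
It then writes a report attributing peak memory usage to individual lines, which should point straight at the read_csv call here.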

TypeError: Could not build a TypeSpec for a column

I am trying to predict Global Sales from the columns 'Name', 'Platform', 'Genre', 'Publisher' and 'Year' of this dataset: https://www.kaggle.com/gregorut/videogamesales
This is my code for training the model:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
import tensorflow as tf

dftrain = pd.read_csv('./vgsales_eval.csv')
dfeval = pd.read_csv('./vgsales_train.csv')
print(dftrain[dftrain.isnull().any(axis=1)])

y_train = dftrain.pop('Global_Sales')
y_eval = dfeval.pop('Global_Sales')

CATEGORICAL_COLUMNS = ['Name', 'Platform', 'Genre', 'Publisher']
NUMERIC_COLUMNS = ['Year']

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = dftrain[feature_name].unique()  # gets a list of all unique values from the given feature column
    feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))
for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.int64))
print(feature_columns)

def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds
    return input_function

train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)

linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
linear_est.train(train_input_fn)
I get the following error:
Traceback (most recent call last):
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\util\structure.py", line 93, in normalize_element
spec = type_spec_from_value(t, use_fallback=False)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\util\structure.py", line 466, in type_spec_from_value
(element, type(element).__name__))
TypeError: Could not build a TypeSpec for 0 Tecmo Koei
1 Nippon Ichi Software
2 Ubisoft
3 Activision
4 Atari
...
6594 Kemco
6595 Infogrames
6596 Activision
6597 7G//AMES
6598 Wanadoo
Name: Publisher, Length: 6599, dtype: object with type Series
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:\Users\kuhn-\Documents\Github\Tensorflow_Test\VideoGameSales_Test\main.py", line 45, in <module>
linear_est.train(train_input_fn)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 349, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1175, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1201, in _train_model_default
self._get_features_and_labels_from_input_fn(input_fn, ModeKeys.TRAIN))
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1037, in _get_features_and_labels_from_input_fn
self._call_input_fn(input_fn, mode))
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1130, in _call_input_fn
return input_fn(**kwargs)
File "c:\Users\kuhn-\Documents\Github\Tensorflow_Test\VideoGameSales_Test\main.py", line 34, in input_function
ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 682, in from_tensor_slices
return TensorSliceDataset(tensors)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 3001, in __init__
element = structure.normalize_element(element)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\util\structure.py", line 98, in normalize_element
ops.convert_to_tensor(t, name="component_%d" % i))
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\framework\ops.py", line 1499, in convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\framework\constant_op.py", line 338, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\framework\constant_op.py", line 264, in constant
allow_broadcast=True)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\framework\constant_op.py", line 282, in _constant_impl
allow_broadcast=allow_broadcast))
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 563, in make_tensor_proto
append_fn(tensor_proto, proto_values)
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 155, in SlowAppendObjectArrayToTensorProto
tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 155, in <listcomp>
tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
File "C:\Users\kuhn-\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\util\compat.py", line 87, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got nan
What am I doing wrong here? Is it a problem with the dataset or do I have to read the values differently?
This is basically due to the null values present in the data you loaded; you need to handle them when loading the data.
I have made a couple of changes.
Instead of dropping the records with null values, you could also use df.fillna, choosing the fill value per column according to its data type.
I have also changed the Year column's datatype from float to int, since leaving it as float would lead to another problem with from_tensor_slices.
Below is the modified code with the same data you have taken.
df = pd.read_csv('/content/vgsales.csv')
# print(df.head())
print(df[df.isnull().any(axis=1)])

# df.fillna('', inplace=True)
df.dropna(how="any", inplace=True)
df.Year = df.Year.astype(int)
y_train = df.pop('Global_Sales')  # pop the label from the cleaned frame, as in the question's code

CATEGORICAL_COLUMNS = ['Name', 'Platform', 'Genre', 'Publisher']
NUMERIC_COLUMNS = ['Year']

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = df[feature_name].unique()  # gets a list of all unique values from the given feature column
    feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))
for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.int64))
print(feature_columns)

def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds
    return input_function

train_input_fn = make_input_fn(df, y_train)
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
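With the label popped from the cleaned DataFrame (the y_train line above), training then proceeds exactly as in the question:
linear_est.train(train_input_fn)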

How to use the Dataset API to read a TFRecords file of lists of variable length?

I want to use TensorFlow's Dataset API to read a TFRecords file of lists of variable length. Here is my code.
def _int64_feature(value):
    # value must be a numpy array.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def main1():
    # Write an array to TFRecord.
    # a is an array which contains lists of variant length.
    a = np.array([[0, 54, 91, 153, 177],
                  [0, 50, 89, 147, 196],
                  [0, 38, 79, 157],
                  [0, 49, 89, 147, 177],
                  [0, 32, 73, 145]])

    writer = tf.python_io.TFRecordWriter('file')
    for i in range(a.shape[0]):  # i = 0 ~ 4
        x_train = a[i]
        feature = {'i': _int64_feature(np.array([i])), 'data': _int64_feature(x_train)}
        # Create an example protocol buffer
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        # Serialize to string and write on the file
        writer.write(example.SerializeToString())
    writer.close()

    # Check TFRecord file.
    record_iterator = tf.python_io.tf_record_iterator(path='file')
    for string_record in record_iterator:
        example = tf.train.Example()
        example.ParseFromString(string_record)
        i = (example.features.feature['i'].int64_list.value)
        data = (example.features.feature['data'].int64_list.value)
        # data = np.fromstring(data_string, dtype=np.int64)
        print(i, data)

    # Use Dataset API to read the TFRecord file.
    def _parse_function(example_proto):
        keys_to_features = {'i'   : tf.FixedLenFeature([], tf.int64),
                            'data': tf.FixedLenFeature([], tf.int64)}
        parsed_features = tf.parse_single_example(example_proto, keys_to_features)
        return parsed_features['i'], parsed_features['data']

    ds = tf.data.TFRecordDataset('file')
    iterator = ds.map(_parse_function).make_one_shot_iterator()
    i, data = iterator.get_next()
    with tf.Session() as sess:
        print(i.eval())
        print(data.eval())
Check TFRecord file
[0] [0, 54, 91, 153, 177]
[1] [0, 50, 89, 147, 196]
[2] [0, 38, 79, 157]
[3] [0, 49, 89, 147, 177]
[4] [0, 32, 73, 145]
But it showed the following error when I tried to use the Dataset API to read the TFRecord file:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Name: , Key: data, Index: 0. Number of int64 values != expected. Values size: 5 but output shape: []
Thank you.
UPDATE:
I tried the following two versions of the code to read the TFRecord file with the Dataset API, but both of them failed.
def _parse_function(example_proto):
    keys_to_features = {'i'   : tf.FixedLenFeature([], tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']

ds = tf.data.TFRecordDataset('file')
iterator = ds.map(_parse_function).make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([i, data]))
or
def _parse_function(example_proto):
    keys_to_features = {'i'   : tf.VarLenFeature(tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']

ds = tf.data.TFRecordDataset('file')
iterator = ds.map(_parse_function).make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([i, data]))
And the error:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 468, in make_tensor_proto
str_values = [compat.as_bytes(x) for x in proto_values]
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 468, in <listcomp>
str_values = [compat.as_bytes(x) for x in proto_values]
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/compat.py", line 65, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "2tfrecord.py", line 126, in <module>
main1()
File "2tfrecord.py", line 72, in main1
iterator = ds.map(_parse_function).make_one_shot_iterator()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 712, in map
return MapDataset(self, map_func)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1385, in __init__
self._map_func.add_to_graph(ops.get_default_graph())
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/function.py", line 486, in add_to_graph
self._create_definition_if_needed()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/function.py", line 321, in _create_definition_if_needed
self._create_definition_if_needed_impl()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/function.py", line 338, in _create_definition_if_needed_impl
outputs = self._func(*inputs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1376, in tf_map_func
flattened_ret = [ops.convert_to_tensor(t) for t in nest.flatten(ret)]
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1376, in <listcomp>
flattened_ret = [ops.convert_to_tensor(t) for t in nest.flatten(ret)]
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 836, in convert_to_tensor
as_ref=False)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 926, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/constant_op.py", line 229, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/constant_op.py", line 208, in constant
value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 472, in make_tensor_proto
"supported type." % (type(values), values))
TypeError: Failed to convert object of type <class 'tensorflow.python.framework.sparse_tensor.SparseTensor'> to Tensor.
Contents: SparseTensor(indices=Tensor("ParseSingleExample/Slice_Indices_i:0", shape=(?, 1), dtype=int64), values=Tensor("ParseSingleExample/ParseExample/ParseExample:3", shape=(?,), dtype=int64), dense_shape=Tensor("ParseSingleExample/Squeeze_Shape_i:0", shape=(1,), dtype=int64)). Consider casting elements to a supported type.
Python version: 3.5.2
Tensorflow version: 1.4.1
After hours of searching and trying, I believe I have the answer. Below is my code.
def _int64_feature(value):
    # value must be a numpy array.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value.flatten()))

# Write an array to TFRecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')
for i in range(a.shape[0]):  # i = 0 ~ 4
    x_train = np.array(a[i])
    feature = {'i'   : _int64_feature(np.array([i])),
               'data': _int64_feature(x_train)}
    # Create an example protocol buffer
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    # Serialize to string and write on the file
    writer.write(example.SerializeToString())
writer.close()

# Check TFRecord file.
record_iterator = tf.python_io.tf_record_iterator(path='file')
for string_record in record_iterator:
    example = tf.train.Example()
    example.ParseFromString(string_record)
    i = (example.features.feature['i'].int64_list.value)
    data = (example.features.feature['data'].int64_list.value)
    print(i, data)

# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)

def _parse_function(example_proto):
    keys_to_features = {'i'   : tf.VarLenFeature(tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return tf.sparse_tensor_to_dense(parsed_features['i']), \
           tf.sparse_tensor_to_dense(parsed_features['data'])

# Parse the record into tensors.
dataset = dataset.map(_parse_function)
# Shuffle the dataset
dataset = dataset.shuffle(buffer_size=1)
# Repeat the input indefinitely
dataset = dataset.repeat()
# Generate batches
dataset = dataset.batch(1)
# Create a one-shot iterator
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()

with tf.Session() as sess:
    print(sess.run([i, data]))
    print(sess.run([i, data]))
    print(sess.run([i, data]))
There are a few things to note.
1. This SO question helps a lot.
2. tf.VarLenFeature returns a SparseTensor, so converting it to a dense tensor with tf.sparse_tensor_to_dense is necessary.
3. In my code, parse_single_example() can't be replaced with parse_example(), and it bugged me for a day. I don't know why parse_example() doesn't work. If anyone knows the reason, please enlighten me.
The error is very simple. Your data is not a FixedLenFeature, it is a VarLenFeature. Replace your line:
'data':tf.FixedLenFeature([], tf.int64)}
with
'data':tf.VarLenFeature(tf.int64)}
Also, when you call print(i.eval()) and print(data.eval()), you are calling the iterator twice. The first print will print 0, but the second one will print the value of the second row, [0, 50, 89, 147, 196]. You can do print(sess.run([i, data])) to get i and data from the same row.
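Put differently, a minimal sketch of the corrected session code (same i and data tensors as defined above):
with tf.Session() as sess:
    # A single run() advances the iterator once and returns matching values.
    i_val, data_val = sess.run([i, data])
    print(i_val, data_val)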
