Converting CSV files to TF Records - python

I have 258 CSV files that I want to convert to TFRecords. I wrote the following script, and it has now been running for more than 5 hours:
import argparse
import os
import sys

import standardize_data
import tensorflow as tf

FLAGS = None
PATH = '/home/darth/GitHub Projects/gru_svm/dataset/train'

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def convert_to(dataset, name):
    """Converts a dataset to tfrecords"""
    filename_queue = tf.train.string_input_producer(dataset)

    # TF reader
    reader = tf.TextLineReader()

    # default values, in case of empty columns
    record_defaults = [[0.0] for x in range(24)]

    key, value = reader.read(filename_queue)

    duration, service, src_bytes, dest_bytes, count, same_srv_rate, \
    serror_rate, srv_serror_rate, dst_host_count, dst_host_srv_count, \
    dst_host_same_src_port_rate, dst_host_serror_rate, dst_host_srv_serror_rate, \
    flag, ids_detection, malware_detection, ashula_detection, label, src_ip_add, \
    src_port_num, dst_ip_add, dst_port_num, start_time, protocol = \
        tf.decode_csv(value, record_defaults=record_defaults)

    features = tf.stack([duration, service, src_bytes, dest_bytes, count, same_srv_rate,
                         serror_rate, srv_serror_rate, dst_host_count, dst_host_srv_count,
                         dst_host_same_src_port_rate, dst_host_serror_rate, dst_host_srv_serror_rate,
                         flag, ids_detection, malware_detection, ashula_detection, src_ip_add,
                         src_port_num, dst_ip_add, dst_port_num, start_time, protocol])

    filename = os.path.join(FLAGS.directory, name + '.tfrecords')
    print('Writing {}'.format(filename))
    writer = tf.python_io.TFRecordWriter(filename)

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        try:
            while not coord.should_stop():
                example, l = sess.run([features, label])
                print('Writing {dataset} : {example}, {label}'.format(dataset=sess.run(key),
                                                                      example=example, label=l))
                example_to_write = tf.train.Example(features=tf.train.Features(feature={
                    'duration': _float_feature(example[0]),
                    'service': _int64_feature(int(example[1])),
                    'src_bytes': _float_feature(example[2]),
                    'dest_bytes': _float_feature(example[3]),
                    'count': _float_feature(example[4]),
                    'same_srv_rate': _float_feature(example[5]),
                    'serror_rate': _float_feature(example[6]),
                    'srv_serror_rate': _float_feature(example[7]),
                    'dst_host_count': _float_feature(example[8]),
                    'dst_host_srv_count': _float_feature(example[9]),
                    'dst_host_same_src_port_rate': _float_feature(example[10]),
                    'dst_host_serror_rate': _float_feature(example[11]),
                    'dst_host_srv_serror_rate': _float_feature(example[12]),
                    'flag': _int64_feature(int(example[13])),
                    'ids_detection': _int64_feature(int(example[14])),
                    'malware_detection': _int64_feature(int(example[15])),
                    'ashula_detection': _int64_feature(int(example[16])),
                    'label': _int64_feature(int(l)),
                    'src_ip_add': _float_feature(example[17]),
                    'src_port_num': _float_feature(example[18]),
                    'dst_ip_add': _float_feature(example[19]),
                    'dst_port_num': _float_feature(example[20]),
                    'start_time': _float_feature(example[21]),
                    'protocol': _int64_feature(int(example[22])),
                }))
                writer.write(example_to_write.SerializeToString())
            writer.close()
        except tf.errors.OutOfRangeError:
            print('Done converting -- EOF reached.')
        finally:
            coord.request_stop()
            coord.join(threads)

def main(unused_argv):
    files = standardize_data.list_files(path=PATH)
    convert_to(dataset=files, name='train')
This has me wondering whether it's stuck in an infinite loop. What I want to do is read every row of each of the 258 CSV files and write each row to a TFRecord (as a feature and a label, of course), then stop the loop once there are no rows left, i.e. once the CSV files have been exhausted.
standardize_data.list_files(path) is a function I wrote in a different module and re-used for this script. It returns a list of all the files found in PATH. Note that PATH contains only CSV files.
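(standardize_data isn't shown in the question; purely for context, a hypothetical stand-in for list_files might look like this:)

import os

def list_files(path):
    # Hypothetical helper: return the full path of every file directly under
    # `path` (in this setup, all of them are CSV files).
    return [os.path.join(path, f) for f in sorted(os.listdir(path))]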

Set num_epochs=1 in string_input_producer. Another note: converting these CSVs to TFRecords may not give you the advantage you are looking for from TFRecords; the overhead is very high for this kind of data (a large number of small individual features/labels per example). You may want to experiment with that part.
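For concreteness, a minimal sketch of the change in convert_to (the rest of the read/write loop stays as in the question); note that num_epochs is tracked by a local variable, which also has to be initialized:

filename_queue = tf.train.string_input_producer(dataset, num_epochs=1)
...
with tf.Session() as sess:
    # num_epochs is backed by a local variable, so initialize locals as well
    sess.run(tf.local_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    # ... same read/write loop as above; it now raises OutOfRangeError
    # (and stops) after every CSV file has been read exactly once.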

Related

How to get the MFCC of a song (.wav) using Python?

I have been trying to create the MFCCs of every file in my dataset.
I want to write a preprocessing function that takes source_path as input and returns a dictionary mydict with two keys, labels and mfcc.
I have tried the following function:
def preprocess_data(source_path):
    mydict = {
        "labels": [],
        "mfcc": []
    }
    music = ['reggae', 'jazz', 'country', 'hiphop', 'rock', 'metal', 'classical', 'disco', 'blues', 'pop']
    path = 'Data/genres_original/'
    i = 0
    for n in music:
        new_path = path + n
        song = os.listdir(new_path)
        for p in song:
            final_path = new_path + "/" + p
            melody, sr = librosa.load(final_path)
            mfcc = librosa.feature.mfcc(melody, sr=sr, n_mfcc=13)
            mydict["labels"].append(i)
            mydict["mfcc"].append(mfcc.tolist())
        i += 1
    return mydict
but this is not working; instead it shows the warning PySoundFile failed. Trying audioread instead. and then raises an error.
To resolve it I have also installed ffmpeg, but that didn't change anything.
Note: I am using Google Colab and expect code that runs on it.
Here is the link
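In case it helps narrow things down, here is a defensive variant of the inner loop (a sketch only): it skips files that librosa/audioread cannot decode instead of crashing the whole run, and passes the signal as a keyword argument, which newer librosa releases require.

for p in song:
    final_path = new_path + "/" + p
    try:
        melody, sr = librosa.load(final_path)
    except Exception as err:
        # Skip files the audio backends cannot decode instead of failing everything
        print("Skipping {}: {}".format(final_path, err))
        continue
    mfcc = librosa.feature.mfcc(y=melody, sr=sr, n_mfcc=13)
    mydict["labels"].append(i)
    mydict["mfcc"].append(mfcc.tolist())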

Tensorflow 2.3: How to parallelize reading text from big file?

I need to break my 4 GB dataset file down into small chunks. To optimize the time consumption, I would like to maximize parallel processing. Currently, I can observe that the CPU and GPU cores are under-utilized (see the attached output in the image).
My code snippet looks like this:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()  # BytesList won't unpack a string from an EagerTensor.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_row(text, rating):
    # Create a dictionary mapping the feature name to the tf.Example-compatible data type.
    feature = {
        'text': _bytes_feature(text),
        'rating': _float_feature(rating),
    }
    # Create a Features message using tf.train.Example.
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

def transform(example):
    str_example = example.decode("utf-8")
    json_example = json.loads(str_example)
    overall = json_example.get('overall', -99)
    text = json_example.get('reviewText', '')
    if type(text) is str:
        text = bytes(text, 'utf-8')
    tf_serialized_string = serialize_row(text, overall)
    return tf_serialized_string

line_dataset = tf.data.TextLineDataset(filenames=[file_path])
line_dataset = line_dataset.map(lambda row: tf.numpy_function(transform, [row], tf.string))
line_dataset = line_dataset.shuffle(2)
line_dataset = line_dataset.batch(NUM_OF_RECORDS_PER_BATCH_FILE)

'''
Perform batchwise transformation of the population.
'''
start = time.time()
for idx, line in line_dataset.enumerate():
    FILE_NAMES = 'test{0}.tfrecord'.format(idx)
    end = time.time()
    time_taken = end - start
    tf.print('Processing for file - {0}'.format(FILE_NAMES))
    DIRECTORY_URL = '/home/gaurav.gupta/projects/practice/'
    filepath = os.path.join(DIRECTORY_URL, 'data-set', 'electronics', FILE_NAMES)
    batch_ds = tf.data.Dataset.from_tensor_slices(line)
    writer = tf.data.experimental.TFRecordWriter(filepath)
    writer.write(batch_ds)
    tf.print('Processing for file - {0} took {1}'.format(FILE_NAMES, time_taken))
tf.print('Done')
Logs showing the execution flow:
Processing for file - test0.tfrecord took 14.350863218307495
Processing for file - test1.tfrecord took 12.695453882217407
Processing for file - test2.tfrecord took 12.904462575912476
Processing for file - test3.tfrecord took 12.344425439834595
Processing for file - test4.tfrecord took 11.188365697860718
Processing for file - test5.tfrecord took 11.319620609283447
Processing for file - test6.tfrecord took 11.285977840423584
Processing for file - test7.tfrecord took 11.169529438018799
Processing for file - test8.tfrecord took 11.289997816085815
Processing for file - test9.tfrecord took 11.431073188781738
Processing for file - test10.tfrecord took 11.428141593933105
Processing for file - test11.tfrecord took 3.223125457763672
Done
I have tried the num_parallel_reads argument but couldn't see much difference. I believe it is mainly useful when reading multiple files rather than a single big file.
I am looking for suggestions on how to parallelize this task to reduce the time consumption.
I would try something like this (I like to use joblib as it is quite simple to drop into existing code; you could probably do something similar with many other frameworks; also, joblib does not use the GPU, nor does it do any JIT compilation):
from joblib import Parallel, delayed
from tqdm import tqdm

...

def process_file(idx, line):
    FILE_NAMES = 'test{0}.tfrecord'.format(idx)
    end = time.time()
    time_taken = end - start
    tf.print('Processing for file - {0}'.format(FILE_NAMES))
    DIRECTORY_URL = '/home/gaurav.gupta/projects/practice/'
    filepath = os.path.join(DIRECTORY_URL, 'data-set', 'electronics', FILE_NAMES)
    batch_ds = tf.data.Dataset.from_tensor_slices(line)
    writer = tf.data.experimental.TFRecordWriter(filepath)
    writer.write(batch_ds)
    #tf.print('Processing for file - {0} took {1}'.format(FILE_NAMES, time_taken))
    return FILE_NAMES, time_taken

times = Parallel(n_jobs=12, prefer="processes")(
    delayed(process_file)(idx, line)
    for idx, line in tqdm(line_dataset.enumerate(), total=len(line_dataset)))
print('Done.')
This is untested code and I am also unsure how it will interact with the TF code, but I would give it a try.
The tqdm is totally unnecessary; it is just something I prefer to use because it provides a nice progress bar.
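Another option that stays inside tf.data (a sketch, assuming the transform step rather than the writing is the bottleneck): parallelize the map and add prefetching. Keep in mind that tf.numpy_function runs Python code under the GIL, which limits how much the map itself can be parallelized.

AUTOTUNE = tf.data.experimental.AUTOTUNE  # TF 2.3 spelling of AUTOTUNE

line_dataset = tf.data.TextLineDataset(filenames=[file_path])
line_dataset = line_dataset.map(
    lambda row: tf.numpy_function(transform, [row], tf.string),
    num_parallel_calls=AUTOTUNE)          # run several transform calls in parallel
line_dataset = line_dataset.batch(NUM_OF_RECORDS_PER_BATCH_FILE)
line_dataset = line_dataset.prefetch(AUTOTUNE)  # overlap preprocessing with writing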

How to read a Protobuf that was written by a TFRecordWriter

I am trying to read data that was written with tf.io.TFRecordWriter as shown below:
import tensorflow as tf
import numpy as np

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()  # BytesList won't unpack a string from an EagerTensor.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

tfrecord_filename = "test.tfrecord"
with tf.io.TFRecordWriter(tfrecord_filename) as writer:
    for i in range(4):
        a = np.random.uniform(-1, 1, 5)
        a = tf.convert_to_tensor(a, dtype=tf.float32)
        a = tf.io.serialize_tensor(a)
        feature = {
            'a': _bytes_feature(a),
        }
        example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example_proto.SerializeToString())
I am then using the schema as given by:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/example/feature.proto and https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/example/example.proto
along with protoc to decode it. The reading program is:
import test_pb2

parser = test_pb2.Example()
with open("test.tfrecord", "rb") as f:
    parser.ParseFromString(f.read())
print(parser.feature)
Using the ParseFromString method, I expect to be able to recover the written data after executing the above program, but I consistently get:
RuntimeWarning: Unexpected end-group tag: Not all data was converted
What am I doing wrong?
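One detail that matters here: a .tfrecord file is not one serialized Example. Each record is wrapped in TFRecord framing (a length, a CRC of the length, the payload, and a CRC of the payload), so calling ParseFromString on the raw file contents fails. A minimal sketch of reading the file back with TensorFlow itself, matching the writer code above:

import tensorflow as tf

dataset = tf.data.TFRecordDataset("test.tfrecord")
for raw_record in dataset:
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())  # one Example per framed record
    serialized_a = example.features.feature['a'].bytes_list.value[0]
    a = tf.io.parse_tensor(serialized_a, out_type=tf.float32)  # undo serialize_tensor
    print(a)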

Tensorflow hangs with high CPU usage for training a very small forest

I tried the following code in order to train a very small random forest as a test, but for some reason it hangs with high CPU usage. I think it might have to do with the hash table not being initialized, but I am not sure where it should be initialized so that it is accessible to both input_fn and the call to estimator.evaluate. Any advice?
import csv

import tensorflow as tf
from tensorflow.contrib.cloud.python.ops import bigquery_reader_ops
from tensorflow.contrib.lookup import KeyValueTensorInitializer, HashTable
from tensorflow.contrib.tensor_forest.client.random_forest import TensorForestEstimator
from tensorflow.contrib.tensor_forest.python.tensor_forest import ForestHParams
from tensorflow.python.training.input import string_input_producer

sess = tf.Session()

with open('event_classes.csv', mode='r') as infile:
    reader = csv.reader(infile)
    event_names, event_numbers = list(zip(*[(r[0], int(r[1])) for r in reader]))

def input_fn():
    # Create maps between event names and event numbers
    event_class_map = HashTable(KeyValueTensorInitializer(
        event_names, event_numbers, key_dtype=tf.string, value_dtype=tf.int64), int(0))
    reverse_event_class_map = HashTable(KeyValueTensorInitializer(
        event_numbers, event_names, key_dtype=tf.int64, value_dtype=tf.string), "Unknown")
    # Specify features to read
    features = {"time_{}".format(i): tf.FixedLenFeature([1], tf.string, default_value="") for i in range(4)}
    # Create a Reader.
    reader = bigquery_reader_ops.BigQueryReader(project_id="drivemode-com",
                                                dataset_id="temp_stephane",
                                                table_id="event_history",
                                                timestamp_millis=1497502522,
                                                num_partitions=1,
                                                features=features)
    # Populate a queue with the BigQuery Table partitions.
    queue = string_input_producer(reader.partitions())
    # Read and parse examples.
    row_id, examples_serialized = reader.read_up_to(queue, 100)
    examples = tf.parse_example(examples_serialized, features=features)
    # Process the Tensors example["name"], example["age"], etc...
    for i in range(4):
        col = "time_{}".format(i)
        examples[col] = event_class_map.lookup(examples[col])
    # event_class_map.init.run(session=sess)
    label = examples.pop("time_3")
    return examples, label

hparams = ForestHParams(num_classes=len(event_numbers), num_features=3, max_nodes=3, num_trees=1).fill()
estimator = TensorForestEstimator(hparams)
estimator.fit(input_fn=input_fn, steps=1)
tf.train.start_queue_runners(sess)
print(sess.run(estimator.evaluate(input_fn=input_fn)))
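As a point of reference only (it does not cover how TensorForestEstimator manages its sessions internally), a contrib HashTable in a plain tf.Session workflow is initialized with tf.tables_initializer() before any lookup; the keys and values below are made up for illustration:

import tensorflow as tf
from tensorflow.contrib.lookup import KeyValueTensorInitializer, HashTable

table = HashTable(KeyValueTensorInitializer(
    ["walk", "drive", "stop"], [0, 1, 2],
    key_dtype=tf.string, value_dtype=tf.int64), default_value=-1)
lookup = table.lookup(tf.constant(["drive", "unknown_event"]))

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # initializes every table in the graph
    print(sess.run(lookup))            # [1, -1]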

tensorflow using tf.train.string_input_producer

I'm using tf.train.string_input_producer to read data from a tfrecord file. I assume it creates a queue and a pipeline, and that the data will be loaded and fed into my model automatically. However, it gets stuck at the first batch and raises this exception:
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value input_producer/limit_epochs/epochs
My tfrecord was made with tf.train.SequenceExample instead of tf.train.Example, which doesn't have clear documentation in the official guide.
Here is a code snippet that reproduces my problem. (I believe the problem comes from the queue initialization or something similar, because the whole pipeline seems to hang.)
from config.config import get_config

init = tf.global_variables_initializer()
config = get_config()

filename_queue = tf.train.string_input_producer(['data0.tfrecord,data1.tfrecord'], 5, capacity=16384)
reader = tf.TFRecordReader()
(keys, values) = reader.read_up_to(filename_queue, config.batch_size)

context_features = {
    "seq_len": tf.FixedLenFeature([1], dtype=tf.int64),
}
audio_features = {
    "audio": tf.FixedLenSequenceFeature([config.num_features], dtype=tf.float32),
    "label": tf.FixedLenSequenceFeature([config.num_classes], dtype=tf.float32)
}

audio_list = []
label_list = []
len_list = []

for i in range(config.batch_size):
    print(i)
    context, sequence = tf.parse_single_sequence_example(
        serialized=values[i],
        context_features=context_features,
        sequence_features=audio_features
    )
    audio = sequence['audio']
    label = sequence['label']
    # seq_len = context['seq_len'][0]
    seq_len = tf.shape(audio)[0]
    audio_list.append(audio)
    label_list.append(label)
    len_list.append(seq_len)

audio_tensor = tf.stack(audio_list)
label_tenor = tf.stack(label_list)
len_tensor = tf.stack(len_list)

with tf.Session() as sess:
    sess.run(init)
    threads = tf.train.start_queue_runners(sess=sess)
    for i in range(3):
        x, y, z = sess.run([audio_tensor, label_tenor, len_tensor])
        print(z)
Try
init2 = tf.local_variables_initializer()
sess.run(init2)
The variables created internally by tf.train.string_input_producer() (for example, the epoch counter when num_epochs is set) are local variables. You have to initialize them with the local variable initializer, as shown above.
Let me know if this helped.
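Put together with the pipeline from the question, the session block would look roughly like this (a sketch; a Coordinator is also added so the queue-runner threads shut down cleanly):

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())  # covers input_producer/limit_epochs/epochs
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for i in range(3):
            x, y, z = sess.run([audio_tensor, label_tenor, len_tensor])
            print(z)
    finally:
        coord.request_stop()
        coord.join(threads)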
