How to convert MNIST images to images and labels variables - Python

I have code as below:
dataset = MNIST(path=data_path, download=True, shuffle=True)

if train:
    images, labels = dataset.get_train()
else:
    images, labels = dataset.get_test()

images, labels = images[:n_examples], labels[:n_examples]
images, labels = iter(images.view(-1, 784) / 255), iter(labels)
but when I run it, it gives me this error:
Traceback (most recent call last):
  File "C:\Users\Ati\Downloads\Compressed\bindsnet_experiments-master\experiments\mnist\two_layer_backprop.py", line 135, in <module>
    images, labels = dataset.get_train()
AttributeError: 'TorchvisionDatasetWrapper' object has no attribute 'get_train'
I think it's because get_train() is out of date and no longer supported by torchvision.
I have tested different ways of converting the MNIST data into images and labels variables.
Does anyone know how I could change this code now that get_train() doesn't work?
I would appreciate any help with this.

Yes, it looks like the class does not exist in the package anymore.
I was able to find the source code for the package you are looking for:
import os
import functools
import operator
import gzip
import struct
import array
import tempfile

try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve  # py2
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin

import numpy

__version__ = '0.2.2'


# `datasets_url` and `temporary_dir` can be set by the user using:
# >>> mnist.datasets_url = 'http://my.mnist.url'
# >>> mnist.temporary_dir = lambda: '/tmp/mnist'
datasets_url = 'http://yann.lecun.com/exdb/mnist/'
temporary_dir = tempfile.gettempdir


class IdxDecodeError(ValueError):
    """Raised when an invalid idx file is parsed."""
    pass


def download_file(fname, target_dir=None, force=False):
    """Download fname from the datasets_url, and save it to target_dir,
    unless the file already exists, and force is False.

    Parameters
    ----------
    fname : str
        Name of the file to download
    target_dir : str
        Directory where to store the file
    force : bool
        Force downloading the file, if it already exists

    Returns
    -------
    fname : str
        Full path of the downloaded file
    """
    target_dir = target_dir or temporary_dir()
    target_fname = os.path.join(target_dir, fname)

    if force or not os.path.isfile(target_fname):
        url = urljoin(datasets_url, fname)
        urlretrieve(url, target_fname)

    return target_fname
def parse_idx(fd):
    """Parse an IDX file, and return it as a numpy array.

    Parameters
    ----------
    fd : file
        File descriptor of the IDX file to parse

    Returns
    -------
    data : numpy.ndarray
        Numpy array with the dimensions and the data in the IDX file

    1. https://docs.python.org/3/library/struct.html
       #byte-order-size-and-alignment
    """
    DATA_TYPES = {0x08: 'B',  # unsigned byte
                  0x09: 'b',  # signed byte
                  0x0b: 'h',  # short (2 bytes)
                  0x0c: 'i',  # int (4 bytes)
                  0x0d: 'f',  # float (4 bytes)
                  0x0e: 'd'}  # double (8 bytes)

    header = fd.read(4)
    if len(header) != 4:
        raise IdxDecodeError('Invalid IDX file, '
                             'file empty or does not contain a full header.')

    zeros, data_type, num_dimensions = struct.unpack('>HBB', header)

    if zeros != 0:
        raise IdxDecodeError('Invalid IDX file, '
                             'file must start with two zero bytes. '
                             'Found 0x%02x' % zeros)

    try:
        data_type = DATA_TYPES[data_type]
    except KeyError:
        raise IdxDecodeError('Unknown data type '
                             '0x%02x in IDX file' % data_type)

    dimension_sizes = struct.unpack('>' + 'I' * num_dimensions,
                                    fd.read(4 * num_dimensions))

    data = array.array(data_type, fd.read())
    data.byteswap()  # looks like array.array reads data as little endian

    expected_items = functools.reduce(operator.mul, dimension_sizes)
    if len(data) != expected_items:
        raise IdxDecodeError('IDX file has wrong number of items. '
                             'Expected: %d. Found: %d' % (expected_items,
                                                          len(data)))

    return numpy.array(data).reshape(dimension_sizes)
def download_and_parse_mnist_file(fname, target_dir=None, force=False):
    """Download the IDX file named fname from the URL specified in datasets_url
    and return it as a numpy array.

    Parameters
    ----------
    fname : str
        File name to download and parse
    target_dir : str
        Directory where to store the file
    force : bool
        Force downloading the file, if it already exists

    Returns
    -------
    data : numpy.ndarray
        Numpy array with the dimensions and the data in the IDX file
    """
    fname = download_file(fname, target_dir=target_dir, force=force)
    fopen = gzip.open if os.path.splitext(fname)[1] == '.gz' else open
    with fopen(fname, 'rb') as fd:
        return parse_idx(fd)


def train_images():
    """Return train images from Yann LeCun MNIST database as a numpy array.
    Download the file, if not already found in the temporary directory of
    the system.

    Returns
    -------
    train_images : numpy.ndarray
        Numpy array with the images in the train MNIST database. The first
        dimension indexes each sample, while the other two index rows and
        columns of the image
    """
    return download_and_parse_mnist_file('train-images-idx3-ubyte.gz')
def test_images():
    """Return test images from Yann LeCun MNIST database as a numpy array.
    Download the file, if not already found in the temporary directory of
    the system.

    Returns
    -------
    test_images : numpy.ndarray
        Numpy array with the images in the test MNIST database. The first
        dimension indexes each sample, while the other two index rows and
        columns of the image
    """
    return download_and_parse_mnist_file('t10k-images-idx3-ubyte.gz')
def train_labels():
    """Return train labels from Yann LeCun MNIST database as a numpy array.
    Download the file, if not already found in the temporary directory of
    the system.

    Returns
    -------
    train_labels : numpy.ndarray
        Numpy array with the labels 0 to 9 in the train MNIST database.
    """
    return download_and_parse_mnist_file('train-labels-idx1-ubyte.gz')
def test_labels():
    """Return test labels from Yann LeCun MNIST database as a numpy array.
    Download the file, if not already found in the temporary directory of
    the system.

    Returns
    -------
    test_labels : numpy.ndarray
        Numpy array with the labels 0 to 9 in the test MNIST database.
    """
    return download_and_parse_mnist_file('t10k-labels-idx1-ubyte.gz')
You can store this in any file, import it, and use the functions as you need (without creating the MNIST object).
Hope this helps. Good luck.
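For example, your original snippet could be rewritten on top of these functions roughly as follows (a sketch, assuming the module above is saved as mnist.py and that train and n_examples are defined as in your code):
import torch
import mnist  # the module above, saved as mnist.py

if train:
    images, labels = mnist.train_images(), mnist.train_labels()
else:
    images, labels = mnist.test_images(), mnist.test_labels()

images, labels = images[:n_examples], labels[:n_examples]

# Convert the numpy arrays to torch tensors, then flatten and scale as before.
images = torch.from_numpy(images).float()
labels = torch.from_numpy(labels).long()
images, labels = iter(images.view(-1, 784) / 255), iter(labels)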

Related

Tensorflow dataset from lots of .npy files

I'm trying to create a tensorflow dataset from 6500 .npy files of shape [256,256].
My previous method (for fewer files) was to load and stack them into an np.array, and then use tf.data.Dataset.from_tensor_slices((stacked_data)).
With the current number of files I get ValueError: Cannot create a tensor proto whose content is larger than 2GB.
I'm now trying the following:
def data_generator():
    processed = []
    for i in range(len(onlyfiles)):
        processed.append(tf.convert_to_tensor(np.load(onlyfiles[i], mmap_mode='r')))
    yield iter(tf.concat(processed, 0))

_dataset = tf.data.Dataset.from_generator(generator=data_generator, output_types=tf.float32)
onlyfiles is the list of filenames.
I get multiple errors, one of which is the following:
2022-10-01 11:25:44.602505: W tensorflow/core/framework/op_kernel.cc:1639] Invalid argument: TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was float32, but the yielded element was <generator object Tensor.__iter__ at 0x7fe6d7d506d0>.
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 653, in generator_py_func
    ret_arrays.append(script_ops.FuncRegistry._convert(  # pylint: disable=protected-access
  File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/ops/script_ops.py", line 195, in _convert
    result = np.asarray(value, dtype=dtype, order="C")
TypeError: float() argument must be a string or a number, not 'generator'
What should I change? Is there another method to do this?
Since I created the dataset myself, is there a better way to prepare it for use with TensorFlow?
After a few days, I found this solution. I don't know how good it is, but I'll post it in case someone finds it useful:
#tf.function
def input_fn():
    tf.compat.v1.enable_eager_execution()
    mypath = 'tensorflow_datasets/Dataset_1/'
    list_of_file_names = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f))]

    def gen():
        for i in itertools.count(1):
            data1 = np.load(list_of_file_names[i % len(list_of_file_names)])
            data2 = np.where(data1 > 1, data1, 1)
            yield tf.convert_to_tensor(np.where(data2 > 0, 20 * np.log10(data2), 0))

    dataset = tf.data.Dataset.from_generator(gen, (tf.float32))
    return dataset.make_one_shot_iterator().get_next()
I usually do such things as follows:
dataset = tf.data.Dataset.from_tensor_slices(list_of_file_names)

# Optional
dataset = dataset.repeat().shuffle(...)

def read_file(file_name):
    full_path_to_image_file = ...  # build full path
    buffer = tf.io.read_file(full_path_to_image_file)
    tensor = ...  # convert from buffer to tensor
    return tensor

dataset = dataset.map(read_file, num_parallel_calls=...)
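For instance, if the files were PNG images, those two placeholder lines might be filled in like this (a sketch; the directory name is a stand-in):
def read_file(file_name):
    # build full path (directory is a stand-in for your own)
    full_path = tf.strings.join(["tensorflow_datasets/Dataset_1/", file_name])
    buffer = tf.io.read_file(full_path)
    # convert from buffer to tensor
    tensor = tf.io.decode_png(buffer, channels=1)
    return tf.image.convert_image_dtype(tensor, tf.float32)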
As an option, you can read the file with np.load inside tf.py_function (use decode("utf-8") to convert the byte string to an ordinary Python string), like
def read_file(file_path):
    tensor = tf.py_function(
        func=lambda path: np.load(path.numpy().decode("utf-8")),
        inp=[file_path],
        Tout=tf.float32
    )
    tensor.set_shape(img_shape)
    return tensor
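Putting those pieces together, a minimal end-to-end sketch might look like this (assuming TF 2.4+ for tf.data.AUTOTUNE; the directory name and the [256, 256] shape stand in for the asker's setup):
import os
import numpy as np
import tensorflow as tf

data_dir = "tensorflow_datasets/Dataset_1"  # stand-in directory of .npy files
file_names = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".npy")]

def load_npy(path):
    # path arrives as a byte-string tensor; decode it for np.load
    return np.load(path.numpy().decode("utf-8")).astype(np.float32)

def read_file(file_path):
    tensor = tf.py_function(func=load_npy, inp=[file_path], Tout=tf.float32)
    tensor.set_shape([256, 256])  # shape from the question
    return tensor

dataset = (tf.data.Dataset.from_tensor_slices(file_names)
           .shuffle(len(file_names))
           .map(read_file, num_parallel_calls=tf.data.AUTOTUNE)
           .batch(32)
           .prefetch(tf.data.AUTOTUNE))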

I am trying to convert a CSV to a WAV file, and my inexperience is causing a problem

To start, I know very little about Python. I am trying to convert a CSV to a WAV file using a script I found in another post. As best I can tell, it was written for an older version of Python than the one I am using. One error I am getting is due to the version difference; I am just not sure how to correct it. The other error may be due to my inexperience with Python, but I am not sure of that.
The first error is:
Python\CVS-WAV2.py:44: DeprecationWarning: 'U' mode is deprecated
for time, value in csv.reader(open(fname, 'U'), delimiter=','):
I know that in Python 3 'U' has been replaced with newline=, which takes None, '', '\n', '\r', or '\r\n'. After reading up on the newline parameter, I think None is the option I want.
Once I change 'U' to newline=None, my first error goes away, but I still get the following when I run the script:
File "\Python\CVS-WAV2.py", line 43, in
for time, value in csv.reader(open(fname, newline='\n'), delimiter=','):
ValueError: not enough values to unpack (expected 2, got 0)
I am not sure how to resolve this error though.
#!/usr/bin/python
import wave
import struct
import sys
import csv
import numpy
from scipy.io import wavfile
from scipy.signal import resample

def write_wav(data, filename, framerate, amplitude):
    wavfile = wave.open(filename, 'w')
    nchannels = 1
    sampwidth = 2
    framerate = framerate
    nframes = len(data)
    comptype = "NONE"
    compname = "not compressed"
    wavfile.setparams((nchannels,
                       sampwidth,
                       framerate,
                       nframes,
                       comptype,
                       compname))
    frames = []
    for s in data:
        mul = int(s * amplitude)
        frames.append(struct.pack('h', mul))
    frames = ''.join(frames)
    wavfile.writeframes(frames)
    wavfile.close()
    print("%s written" % (filename))

if __name__ == "__main__":
    if len(sys.argv) <= 1:
        print("You must supply a filename to generate")
        exit(-1)
    for fname in sys.argv[1:]:
        data = []
        for time, value in csv.reader(open(fname, newline=None), delimiter=','):
            try:
                data.append(float(value))  # Here you can see that the time column is skipped
            except ValueError:
                pass  # Just skip it
        arr = numpy.array(data)  # Just organize all your samples into an array
        # Normalize data
        arr /= numpy.max(numpy.abs(data))  # Divide all your samples by the max sample value
        filename_head, extension = fname.rsplit(".", 1)
        data_resampled = resample(arr, len(data))
        wavfile.write('rec.wav', 2000, data_resampled)  # resampling at 2kHz
        print("File written successfully!")
Any help would be greatly appreciated!!
There are probably empty lines in your CSV file. One way to skip over those is to use the following:
for record in csv.reader(open(fname, newline=None), delimiter=','):
    if not record:
        continue
    time, value = record
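Applied to the script above, the reading loop might become something like this (a sketch; newline='' is what the csv docs recommend for Python 3, and the rest of the script is unchanged):
data = []
with open(fname, newline='') as f:
    for record in csv.reader(f, delimiter=','):
        if not record:  # skip blank lines, which unpack to zero values
            continue
        time, value = record
        try:
            data.append(float(value))  # the time column is still skipped
        except ValueError:
            pass  # skip header rows or other non-numeric values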

problem using pycocotools for Scaled Yolo v4 in pytorch : numpy version dilemma

My setup is:
python 3.9
numpy 1.21.0
cuda 10.2
Right now I'm having a problem: I'm receiving two error messages.
One is:
ERROR: pycocotools unable to run: 'numpy.float64' object cannot be interpreted as an integer
The second is:
ERROR: pycocotools unable to run: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject
The situation is, I looked through closed questions and found the usual solutions.
For the first error, downgrading numpy (down to 1.16.5) was the solution for many people.
For the second error, upgrading numpy (up to 1.21.0) was the solution for many people.
So if I upgrade numpy, the first problem occurs; if I downgrade, the second problem occurs. Opposite solutions.
I've been trying to solve the first error without downgrading my numpy, but it hasn't gone very well.
This is the problem code below.
# Save JSON
if save_json and len(jdict):
    f = 'detections_val2017_%s_results.json' % \
        (weights.split(os.sep)[-1].replace('.pt', '') if isinstance(weights, str) else '')  # filename
    print('\nCOCO mAP with pycocotools... saving %s...' % f)
    with open(f, 'w') as file:
        json.dump(jdict, file)

    try:  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
        # THIS IS WHERE THE CODE STOPS WHEN THE SECOND ERROR OCCURS
        from pycocotools.coco import COCO
        from pycocotools.cocoeval import COCOeval

        imgIds = [int(Path(x).stem) for x in dataloader.dataset.img_files]
        cocoGt = COCO(glob.glob('../coco/annotations/instances_val*.json')[0])  # initialize COCO ground truth api
        cocoDt = cocoGt.loadRes(f)  # initialize COCO pred api
        # THIS IS WHERE THE CODE STOPS WHEN THE FIRST ERROR OCCURS
        cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
        cocoEval.params.imgIds = imgIds  # image IDs to evaluate
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        map, map50 = cocoEval.stats[:2]  # update results (mAP@0.5:0.95, mAP@0.5)
    except Exception as e:
        print('ERROR: pycocotools unable to run: %s' % e)
Below are the value and type of cocoGt and cocoDt:
<pycocotools.coco.COCO object at 0x000002351F4CE6A0>
<class 'pycocotools.coco.COCO'>
<pycocotools.coco.COCO object at 0x000002351F4E58E0>
<class 'pycocotools.coco.COCO'>
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
I think this is the most suspicious part of the code.
I tried wrapping the values (cocoGt, cocoDt) in int() but it didn't work, displaying the error:
int() argument must be a string, a bytes-like object or a number, not 'COCO'.
Below is the part of cocoeval.py that contains the definition behind cocoEval = COCOeval(cocoGt, cocoDt, 'bbox'), in case it might be useful information.
def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
    '''
    Initialize CocoEval using coco APIs for gt and dt
    :param cocoGt: coco object with ground truth annotations
    :param cocoDt: coco object with detection results
    :return: None
    '''
    if not iouType:
        print('iouType not specified. use default iouType segm')
    self.cocoGt = cocoGt                  # ground truth COCO API
    self.cocoDt = cocoDt                  # detections COCO API
    self.params = {}                      # evaluation parameters
    self.evalImgs = defaultdict(list)     # per-image per-category evaluation results [KxAxI] elements
    self.eval = {}                        # accumulated evaluation results
    self._gts = defaultdict(list)         # gt for evaluation
    self._dts = defaultdict(list)         # dt for evaluation
    self.params = Params(iouType=iouType) # parameters
    self._paramsEval = {}                 # parameters for evaluation
    self.stats = []                       # result summarization
    self.ious = {}                        # ious between all gts and dts
    if not cocoGt is None:
        self.params.imgIds = sorted(cocoGt.getImgIds())
        self.params.catIds = sorted(cocoGt.getCatIds())
Thank you very much for your time
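For what it's worth, the "'numpy.float64' object cannot be interpreted as an integer" error usually comes from Params in cocoeval.py passing a float sample count to np.linspace, which newer numpy versions reject. A widely reported workaround (not from this thread; verify against your installed pycocotools) is to cast that count to int:
# In pycocotools/cocoeval.py, Params.setDetParams (and setKpParams), change:
#   self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
#   self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True)
# to:
self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)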

ValueError: TensorFlow requires that the following symbols must be defined before the loop

I am trying to create an input pipeline using the tf.data API. I have 3D data and using normal NumPy operations I would've ended up with an array with dimensions [?,256x256x3x100], which one can think of as 100 frames each of 256x256x3 size.
import glob
import os
import numpy as np
import tensorflow.compat.v1 as tf

def readfile(filenames):
    flag = 0
    for name in filenames:
        string = tf.read_file(name)
        image = tf.image.decode_image(string, channels=3)
        if flag == 0:
            bunch = image
            flag = 1
        else:
            bunch = tf.concat([bunch, image], 1)
    return bunch

with tf.device("/cpu:0"):
    train_files = []
    for s in [x[0] for x in os.walk("path/to/data/folders")]:
        if s == "path/to/data/folders":
            continue
        train_files.append(glob.glob(s + "/*.png"))
    # shape of train_files is [5,100]
    train_dataset = tf.data.Dataset.from_tensor_slices(train_files)
    train_dataset = train_dataset.map(readfile, num_parallel_calls=16)
I think the error occurs because bunch changes size in the for loop. Error:
ValueError                                Traceback (most recent call last)
<ipython-input-13-c2f88ca344dc> in <module>
     22 train_dataset = train_dataset.map(
---> 23     readfile, num_parallel_calls=16)

ValueError: in converted code:

    ValueError: TensorFlow requires that the following symbols must be defined before the loop: ('bunch',)
How do I read the data correctly?
EDIT
What worked for me:
def readfile(filenames):
    flag = 0
    name = filenames[0]
    string = tf.read_file(name)
    image = tf.image.decode_image(string, channels=3)
    bunch = image
    for name in filenames:
        string = tf.read_file(name)
        image = tf.image.decode_image(string, channels=3)
        if flag == 0:
            bunch = image
            flag = 1
        else:
            bunch = tf.concat([bunch, image], 1)
    return bunch
So I'm not sure why it is necessary to initialise bunch before the loop, when the first iteration should take care of that (bunch = image). It might be because flag is not defined as a tensor, so AutoGraph cannot prove that bunch = image is ever actually run?
The variable bunch is first created inside the loop in readfile(), hence the error: when TensorFlow converts the function to a graph, it cannot handle variables that only come into existence inside a loop. A fix is to move the initialisation of bunch to before the loop. Code sample follows:
import glob
import os
import numpy as np
import tensorflow.compat.v1 as tf

def readfile(filenames):
    flag = 0
    bunch = <some_appropriate_initialization>
    for name in filenames:
        string = tf.read_file(name)
        image = tf.image.decode_image(string, channels=3)
        if flag == 0:
            bunch = image
            flag = 1
        else:
            bunch = tf.concat([bunch, image], 1)
    return bunch

# Rest of the code
# Rest of the code
You can't use arbitrary Python code inside a dataset.map function (readfile in your case). There are two ways to solve this:
1. Keep the readfile code as it is, but call it via tf.py_function instead. This executes eagerly, so you can write any Python logic as normal; see the sketch after this list.
2. Convert the code in readfile to use only TensorFlow functions to do the transformation. Performance-wise this is much better than using tf.py_function.
You can find examples of both at https://www.tensorflow.org/api_docs/python/tf/py_function
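A minimal sketch of the first option, reusing the question's readfile and pipeline (Tout=tf.uint8 assumes decode_image yields 8-bit images; adjust if yours differ):
def readfile_wrapper(filenames):
    # Run the plain-Python readfile eagerly via py_function
    return tf.py_function(func=readfile, inp=[filenames], Tout=tf.uint8)

train_dataset = train_dataset.map(readfile_wrapper, num_parallel_calls=16)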

Scipy: Trying to write wav file, AttributeError: 'list' object has no attribute 'dtype'

I am using Anaconda3 and SciPy to try to write a WAV file using an array:
wavfile.write("/Users/Me/Desktop/C.wav", 1000, array)
(I don't know how many samples per second; I'm planning on playing around with that. I'm betting on 1000, however.)
array holds 3000 integers, so the file would last 3 seconds.
However it gives me this error when trying to run:
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-21-ce3a8d3e4b4b> in <module>()
----> 1 wavfile.write("/Users/Me/Desktop/C.wav", 1000, fin)

/Users/Me/anaconda/lib/python3.4/site-packages/scipy/io/wavfile.py in write(filename, rate, data)
    213
    214     try:
--> 215         dkind = data.dtype.kind
    216         if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and data.dtype.itemsize == 1)):
    217             raise ValueError("Unsupported data type '%s'" % data.dtype)

AttributeError: 'list' object has no attribute 'dtype'
You are passing write an ordinary Python list, which does not have an attribute called dtype (you can get that info by studying the error message). The documentation of scipy.io.wavfile clearly states you should pass it a numpy array:
Definition: wavfile.write(filename, rate, data)
Docstring:
Write a numpy array as a WAV file
You can convert your ordinary Python list to a numpy array like so:
import numpy as np
arr = np.array(array)
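A minimal sketch of the corrected call (assuming your list holds integer samples that fit in 16 bits; the dtype must be one wavfile.write supports):
import numpy as np
from scipy.io import wavfile

arr = np.array(array, dtype=np.int16)  # 16-bit PCM samples
wavfile.write("/Users/Me/Desktop/C.wav", 1000, arr)  # 1000 samples/second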
I would like to add a bit of information in reply to user3151828's comment. I opened a file comprised of 32-bit signed float values (audio data not formatted as a proper wave file), created a normal Python list, converted it to a numpy array as Oliver W. describes, and printed the results.
import numpy as np
import os
import struct

file = open('audio.npa', 'rb')
datalist = []
for i in range(4):
    data = file.read(4)
    s = struct.unpack('f', data)
    datalist.append(s)
numpyarray = np.array(datalist)
print('datalist, normal python list is: ', datalist, '\n')
print('numpyarray is: ', numpyarray)
The output is:
datalist, normal python list is:  [(-0.000152587890625,), (-0.005126953125,), (-0.010284423828125,), (-0.009796142578125,)]
numpyarray is:
[[-0.00015259]
 [-0.00512695]
 [-0.01028442]
 [-0.00979614]]
So, there is the difference between the two.
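Note that the numpy array comes out with shape (4, 1) because struct.unpack returns a 1-tuple per sample; continuing the snippet above, a flat array could be obtained like so:
flat = np.array(datalist, dtype=np.float32).ravel()  # shape (4,) instead of (4, 1)
print('flat numpy array is: ', flat)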
