How to make a dataset similar to CIFAR10 with several images - python

I am trying to make a dataset similar to CIFAR-10. I found this tutorial:
How to create dataset similar to cifar-10
I can already build a dataset from a single image, but when I try to use several images I get this error:
tensorflow.python.framework.errors.InvalidArgumentError: Indices are not valid: not lexicographically sorted or containing repeats.
Can anyone help me solve this problem?
This is my code:
from PIL import Image
import numpy as np

out = np.empty([20, 7501])
for j in xrange(0, 10):
    im = Image.open('%d_receipt.jpg' % j)
    im = np.array(im)
    r = im[:, :, 0].flatten()
    g = im[:, :, 1].flatten()
    b = im[:, :, 2].flatten()
    label = [0]
    out[j] = np.array(list(label) + list(r) + list(g) + list(b), np.uint8)
for i in xrange(0, 10):
    im = Image.open('%d_news.jpg' % i)
    im = np.array(im)
    r = im[:, :, 0].flatten()
    g = im[:, :, 1].flatten()
    b = im[:, :, 2].flatten()
    label = [1]
    j = i + 10
    out[j] = np.array(list(label) + list(r) + list(g) + list(b), np.uint8)
out.tofile("data_batch.bin")
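One detail worth checking in the code above (an observation about the posted snippet, not necessarily the cause of the InvalidArgumentError): np.empty defaults to float64, so even though each row is built as uint8, out.tofile() writes 8-byte floats, while the CIFAR-10 binary reader expects exactly one byte per value. Allocating the buffer as uint8 keeps the file layout CIFAR-compatible:

import numpy as np

# uint8 buffer: tofile() then writes 1 byte per value, matching the
# CIFAR-style record layout (1 label byte + 7500 pixel bytes per 50x50 image)
out = np.empty([20, 7501], dtype=np.uint8)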

I do it like this:
import numpy as np
import scipy.io

mat = scipy.io.loadmat('train_32x32.mat')
data = mat['X']
label = mat['y']

R_data = data[:, :, 0, :]
G_data = data[:, :, 1, :]
B_data = data[:, :, 2, :]

R_data = np.transpose(R_data, (2, 0, 1))
G_data = np.transpose(G_data, (2, 0, 1))
B_data = np.transpose(B_data, (2, 0, 1))

R_data = np.reshape(R_data, (73257, 32 * 32))
G_data = np.reshape(G_data, (73257, 32 * 32))
B_data = np.reshape(B_data, (73257, 32 * 32))

outdata = np.concatenate((label, R_data, G_data, B_data), axis=1)

step = 10000
for i in range(1, 6):
    temp = outdata[i * step:(i + 1) * step, :]
    temp.tofile('SVHN_train_data_batch%d.bin' % i)
    print('save data %d' % i)
Then just feed the resulting files directly to the training code of the CIFAR-10 TensorFlow example.
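As a quick sanity check before training (a minimal sketch, assuming the batches were written as uint8 and each record is 1 label byte followed by the three 32x32 channel planes), you can read one record back and confirm the layout:

import numpy as np

RECORD_BYTES = 1 + 3 * 32 * 32  # label byte + R, G, B planes

raw = np.fromfile('SVHN_train_data_batch1.bin', dtype=np.uint8)
records = raw.reshape(-1, RECORD_BYTES)

label = records[0, 0]
image = records[0, 1:].reshape(3, 32, 32).transpose(1, 2, 0)  # to HxWxC for viewing
print(label, image.shape)  # expect an SVHN label and (32, 32, 3)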

I also tried to follow the tutorial posted in the question, but I couldn't get it to work, so I made my own solution. It can be found on my GitHub here: https://github.com/jdeepee/machine_learning/tree/master
The code is commented, so it should be easy enough to follow. Note that it iterates through a master directory containing multiple folders, which in turn contain the images.
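In the same spirit (a minimal sketch, not the linked repository's exact code; the folder layout and the 32x32 target size are assumptions), the master-directory approach looks roughly like this:

import os
import numpy as np
from PIL import Image

def build_batch(master_dir, out_file, size=(32, 32)):
    """Walk master_dir/<class_name>/* and write a CIFAR-style .bin file."""
    records = []
    for label, class_name in enumerate(sorted(os.listdir(master_dir))):
        class_dir = os.path.join(master_dir, class_name)
        if not os.path.isdir(class_dir):
            continue
        for fname in sorted(os.listdir(class_dir)):
            im = np.array(Image.open(os.path.join(class_dir, fname)).resize(size))
            r, g, b = (im[:, :, c].flatten() for c in range(3))
            # one record = label byte followed by the three channel planes
            records.append(np.concatenate(([label], r, g, b)).astype(np.uint8))
    np.stack(records).tofile(out_file)

build_batch('images', 'data_batch.bin')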

The snippet below is what I did to adapt CIFAR-10 to GTSRB. More details here: https://github.com/hashkanna/traffic-signs/blob/master/Traffic_Signs_Recognition_binFiles.ipynb
out = {}
for i in range(5):
    bin_val = (i % 5) + 1
    #im = Image.open(X_train[i])
    #im = np.array(im)
    im = X_train[i]
    r = im[:, :, 0].flatten()
    g = im[:, :, 1].flatten()
    b = im[:, :, 2].flatten()
    label = [y_train[i]]
    out[bin_val] = np.array(list(label) + list(r) + list(g) + list(b), np.uint8)
for i in range(5, len(X_train)):
    bin_val = (i % 5) + 1
    #im = Image.open(X_train[i])
    #im = np.array(im)
    im = X_train[i]
    r = im[:, :, 0].flatten()
    g = im[:, :, 1].flatten()
    b = im[:, :, 2].flatten()
    label = [y_train[i]]
    new_array = np.array(list(label) + list(r) + list(g) + list(b), np.uint8)
    out[bin_val] = np.append(out[bin_val], new_array, 0)
for bin_val in range(1, 6):
    out[bin_val].tofile("/Users/kanna/Downloads/data_batch_%s.bin" % bin_val)
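A small design note on the snippet above: np.append copies the entire accumulated array on every call, so building large batches this way is quadratic in the number of images. A sketch of the same logic (same X_train/y_train inputs assumed) that collects records in Python lists and concatenates once per bin:

import numpy as np

out_lists = {b: [] for b in range(1, 6)}

for i in range(len(X_train)):
    bin_val = (i % 5) + 1
    im = X_train[i]
    r, g, b = (im[:, :, c].flatten() for c in range(3))
    record = np.concatenate(([y_train[i]], r, g, b)).astype(np.uint8)
    out_lists[bin_val].append(record)          # cheap list append, no copying

for bin_val in range(1, 6):
    np.concatenate(out_lists[bin_val]).tofile("data_batch_%s.bin" % bin_val)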

Related

Data from three .txt files - loop them together into one and make a plot?

I want to end up with a scatter plot whose point colors differentiate between different values.
First I need to analyze my data. The problem is that I have an FE model that exports element numbers coupled with 4 nodes. These 4 nodes have 4 coordinate sets, but if 4 elements share 1 node, only 1 coordinate set is given for those 4 nodes.
I end up with three .txt files:
a .txt with the element numbers (and the data I am analyzing for the plot)
a .txt with the element numbers and node numbers
a .txt with the node coordinates
Is it possible to make a loop that connects these data points?
I would like to include an example, but I have not yet made one.
I have tried something like this:
from numpy import loadtxt
from fpdf import FPDF
Sek = ['Sektion_100']
I = 0
HeaderY =['nr.','LC','Element nr.','Myy','Nyy','MRd_y','URy']
HeaderX =['nr.','LC','Element nr.','Mxx','Nxx','MRd_x','URx']
#load the data files
#header = loadtxt("Loads\Area_233.txt", unpack=False, skiprows=1)
pdf = FPDF(orientation = 'P', unit = 'mm', format = 'A4')
MaxURx =[]
MaxURy =[]
data = loadtxt("Loads/Sektion_150.txt", unpack=False, skiprows=1)
nr = data[:,1]
Mxx = data[:,2]
Nxx = -data[:,4]
Myy = data[:,3]
Nyy = -data[:,5]
topologi = loadtxt("Loads/Sektion_150_topologi.txt", unpack=False, skiprows=1)
nr1 = topologi[:,0]
node1 = topologi[:,1]
node2 = topologi[:,2]
node3 = topologi[:,3]
node4 = topologi[:,4]
knuder = loadtxt("Loads/Sektion_150_knuder.txt", unpack=False, skiprows=1)
nr2 = knuder[:,0]
x = knuder[:,1]
y = knuder[:,2]
z = knuder[:,3]
Picture of dataset
I have included a picture of my dataset here. In "Sektion_150_Knuder", NR = node number.
I hope someone has some pointers in the right direction to solve this problem.
I found the answer.
import xlwings as xw
import matplotlib.pyplot as plt
from math import pi
from numpy import loadtxt
import numpy as np
from fpdf import FPDF
import matplotlib as mpl
from matplotlib.ticker import ScalarFormatter

Sek = ['Sektion_100','Sektion_110','Sektion_120','Sektion_130','Sektion_140','Sektion_150']
I = 0
HeaderY = ['nr.','LC','Element nr.','Myy','Nyy','MRd_y','URy']
HeaderX = ['nr.','LC','Element nr.','Mxx','Nxx','MRd_x','URx']

#load the data files
#header = loadtxt("Loads\Area_233.txt", unpack=False, skiprows=1)
pdf = FPDF(orientation='P', unit='mm', format='A4')
MaxURx = []
MaxURy = []

Elem = np.array(loadtxt("Element/Sektion_100_elements.txt", unpack=False, skiprows=1))
Node = np.array(loadtxt("Element/Sektion_100_nodes.txt", unpack=False, skiprows=1))
#Elem = np.array(loadtxt("Element/"+Sek+"_elements.txt", unpack=False, skiprows=1))
#Node = np.array(loadtxt("Element/"+Sek+"_nodes.txt", unpack=False, skiprows=1))

data = loadtxt("Loads/Sektion_100.txt", unpack=False, skiprows=1)
Mxx = data[:,2]
Nxx = -data[:,4]
Myy = data[:,3]
Nyy = -data[:,5]

R1x = []
R2x = []
MRdx = []
URx = []
R1y = []
R2y = []
MRdy = []
URy = []

min_nx = int(round(max(min(Nxx), -300), -1) - 10)
max_nx = int(round(max(Nxx), -1) + 10)
min_ny = int(round(min(Nyy), -1) - 10)
max_ny = int(round(max(Nyy), -1) + 10)
xrange = range(min_nx, max_nx + 50, round((max_nx + 50 - min_nx) / 20))
yrange = range(min_ny, max_ny + 50, round((max_ny + 50 - min_ny) / 20))
x2 = range(0, len(data), 1)

wbx = xw.Book("Capacity\\Sektion_100_L.xlsm", None, True)
sht_x1 = wbx.sheets["Beregning"]
for i in xrange:
    kx = sht_x1.range("N25").value = i
    Q1x = sht_x1["AV24"].value
    Q2x = sht_x1["BC24"].value
    R1x += [Q1x]
    R2x += [Q2x]

for i in x2:
    if Myy[i] <= 0:
        mrdx = np.interp(Nxx[i], xrange, R1x)
        urx = Mxx[i] / mrdx
    else:
        mrdx = np.interp(Nxx[i], xrange, R2x)
        urx = Mxx[i] / mrdx
    MRdx += [round(mrdx, 2)]
    URx += [round(urx, 2)]

TabelX = np.c_[data[:, [0, 1, 2, 4]], MRdx, URx]
sort_tabelX = np.flipud(TabelX[TabelX[:, 5].argsort()])

LimX = 25
for i in x2:
    if sort_tabelX[i, 5] > 1.05:
        LimX = i + 2
    else:
        break
LimX = max(25, LimX)

TABx = np.c_[list(range(1, LimX + 1)), sort_tabelX[0:LimX, :]]
TABx2 = np.unique(TABx[:, 2])
print(TABx2)
print(len(TABx2))
#OUE = np.array(TABx2)
#np.savetxt("array1.txt", TABx2)

# %%
NumOUE = len(TABx2)
NumElem = len(Elem)
EleRange = range(0, NumElem)
OUERange = range(0, NumOUE)

EO = np.searchsorted(Elem[:, 0], TABx2)
print(EO)

EleCorOx = []
EleCorOy = []
EleCorOz = []
EleCorUx = []
EleCorUy = []
EleCorUz = []
for i in EleRange:
    Na = np.searchsorted(Node[:, 0], Elem[i, 1])
    Nb = np.searchsorted(Node[:, 0], Elem[i, 2])
    Nc = np.searchsorted(Node[:, 0], Elem[i, 3])
    Nd = np.searchsorted(Node[:, 0], Elem[i, 4])
    print(Na, Nb, Nc, Nd)
    if i in EO:
        EleCorOx += [(Node[Na, 1] + Node[Nb, 1] + Node[Nc, 1] + Node[Nd, 1]) / 4]
        EleCorOy += [(Node[Na, 2] + Node[Nb, 2] + Node[Nc, 2] + Node[Nd, 2]) / 4]
        EleCorOz += [(Node[Na, 3] + Node[Nb, 3] + Node[Nc, 3] + Node[Nd, 3]) / 4]
    else:
        EleCorUx += [(Node[Na, 1] + Node[Nb, 1] + Node[Nc, 1] + Node[Nd, 1]) / 4]
        EleCorUy += [(Node[Na, 2] + Node[Nb, 2] + Node[Nc, 2] + Node[Nd, 2]) / 4]
        EleCorUz += [(Node[Na, 3] + Node[Nb, 3] + Node[Nc, 3] + Node[Nd, 3]) / 4]

fig = plt.figure()
fig.set_size_inches(20, 10)
ax = fig.add_subplot(projection='3d')
ax.scatter3D(EleCorUx, EleCorUy, EleCorUz, color='blue')
ax.scatter3D(EleCorOx, EleCorOy, EleCorOz, color='red')
ax.set_zlim(0, 27000)
plt.show()
This code produces the plot for Sektion 100; small changes give me the plots for 110, 120, 130 and so on. I hope someone can use it.
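Since only the section name changes between runs, those "small changes" can be folded into a loop over the Sek list (a minimal sketch, assuming every section follows the same file-naming pattern):

for sek in Sek:
    Elem = np.loadtxt("Element/%s_elements.txt" % sek, skiprows=1)
    Node = np.loadtxt("Element/%s_nodes.txt" % sek, skiprows=1)
    data = np.loadtxt("Loads/%s.txt" % sek, skiprows=1)
    wbx = xw.Book("Capacity\\%s_L.xlsm" % sek, None, True)
    # ... the rest of the processing and plotting from the answer above ...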

Annotating images from h5 file

Hi all, I have 70k images saved in an .h5 file, and with this script I want to read them from that file and write the text-instance annotations into a .json file. When I run the script it takes a very long time to annotate 1 image (roughly 2 hours).
When I do this with 15 images, the script works fine and annotates all 15 images in a few seconds.
With 70k images the .h5 file is 51 GB.
I don't know whether the problem is in the code or whether the h5 file is simply too big. The code works fine with a small number of images, but I'm working on a project where I need 70k or even 700k images.
from __future__ import division
import os
import os.path as osp
from re import U
import numpy as np
import matplotlib.pyplot as plt
import h5py
from common import *
import json
import cv2
from itertools import cycle
import js2py
#from gen import brojac
#from synthgen import imnames

global x
global y

def write_json(data, filename='annotation.json'):
    with open(filename, 'w') as file:
        json.dump(data, file, indent=4)

DATA_PATH = 'results'
DB_FNAME = osp.join(DATA_PATH, 'SynthText.h5')

def get_data():
    return h5py.File(DB_FNAME, 'r')

def viz_textbb(text_im, imageName, charBB_list, wordBB, textToList, alpha=1.0):
    """
    text_im : image containing text
    charBB_list : list of 2x4xn_i bounding-box matrices
    wordBB : 2x4xm matrix of word coordinates
    """
    plt.close(1)
    plt.figure(1)
    plt.imshow(text_im)
    H, W = text_im.shape[:2]
    global imnames

    # added from main()
    db = h5py.File('results/SynthText.h5', 'r')
    dsets = sorted(db['data'].keys())
    for k in dsets:
        db = get_data()
        imnames = sorted(db['data'].keys())

    start = 0
    count = 0
    coordinate = []
    coordinate1 = []
    name = []
    name1 = []
    final = []
    upperList = []
    downList = []
    counter = 0
    FinalFinal = []
    imageData = {}
    dictList = []

    for eachWord in textToList:
        length = len(eachWord)
        for i in range(0, 4):
            for j in range(start, length + start):
                coordinate.append([charBB_list[0][0][i][j], charBB_list[0][1][i][j]])
                coordinate1.append((charBB_list[0][0][i][j], charBB_list[0][1][i][j]))
            name.append(coordinate)
            name1.append(coordinate1)
            coordinate = []
        for j in range(0, length):
            for i in range(len(name)):
                final.append(name[i][j])
                # something for drawing, not important
                if i == 0 or i == 1:
                    upperList.append(name[i][j])
                if i == 2:
                    downList.append(name[i + 1][j])
                if i == 3:
                    downList.append(name[i - 1][j])
        down = reversed(downList)
        joinList = [*upperList, *down, upperList[0]]
        FinalFinal.append(joinList)
        imageData['transcription'] = eachWord
        imageData['language'] = "Latin"
        imageData['illegibility'] = False
        imageData['points'] = final
        dictionary_copy = imageData.copy()
        dictList.append(dictionary_copy)
        del dictionary_copy
        finalToList = np.array(final)
        name = []
        final = []
        upperList = []
        downList = []
        start = len(eachWord) + start

    #del(dictList[0])
    finalDict = {f'gt_{imageName}': dictList}

    with open("annotation.json") as json_file:
        data = json.load(json_file)
        temp = data["annotations"]
        #temp.append(finalDict)
        temp.update(finalDict)
        #temp['annotations'] = finalDict
        write_json(data)
    json_file.close()

    for list in FinalFinal:
        x, y = zip(*list)
        plt.plot(x, y)
        # points = tuple(zip(x, y))
        # # boundaries of the bounding box
        # left, right = min(points, key=lambda p: p[0]), max(points, key=lambda p: p[0])
        # bottom, top = min(points, key=lambda p: p[1]), max(points, key=lambda p: p[1])
        # # area
        # base = right[0] - left[0]
        # height = top[1] - bottom[1]
        # A = base * height

    for i in range(len(charBB_list)):
        # prints the x's of one vertex of every character instance, then the
        # 2nd, 3rd and 4th vertex, and then the y's separately
        bbs = charBB_list[i]
        ni = bbs.shape[-1]
        for j in range(ni):
            bb = bbs[:, :, j]
            bb = np.c_[bb, bb[:, 0]]  # appending bb[:,0] closes the polygon
            #plt.plot(bb[0,:], bb[1,:], 'r', alpha=alpha)

    # plot the word-BB:
    for i in range(wordBB.shape[-1]):
        bb = wordBB[:, :, i]  # wordBB coordinates
        bb = np.c_[bb, bb[:, 0]]  # join back to the leftmost vertex, needed
        #plt.plot(bb[0,:], bb[1,:], 'g', alpha=alpha)
        # visualize the individual vertices:
        vcol = ['r', 'g', 'b', 'k']
        #for j in range(4):
        #    plt.scatter(bb[0,j], bb[1,j], color=vcol[j])

    plt.gca().set_xlim([0, W - 1])
    plt.gca().set_ylim([H - 1, 0])
    plt.show(block=False)

def main(db_fname):
    db = h5py.File(db_fname, 'r')
    dsets = sorted(db['data'].keys())
    print("total number of images : ", colorize(Color.RED, len(dsets), highlight=True))
    for k in dsets:
        rgb = db['data'][k][...]
        charBB = db['data'][k].attrs['charBB']
        wordBB = db['data'][k].attrs['wordBB']
        txt = db['data'][k].attrs['txt']
        textToList = (db['data'][k].attrs['txt']).tolist()
        viz_textbb(rgb, k, [charBB], wordBB, textToList)
        print("image name : ", colorize(Color.RED, k, bold=True))
        print(" ** no. of chars : ", colorize(Color.YELLOW, charBB.shape[-1]))
        print(" ** no. of words : ", colorize(Color.YELLOW, wordBB.shape[-1]))
        print(" ** text : ", colorize(Color.GREEN, txt))
        # uncomment below to step through the images with Enter:
        # if 'q' in input("next? ('q' to exit) : "):
        #     break
    db.close()

if __name__ == '__main__':
    main('results/SynthText.h5')
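A hedged observation on the slowness (from reading the code, not profiled against the 51 GB file): main() loops over every image, and viz_textbb then reopens the database and iterates over all keys again, so the work grows quadratically with the number of images; annotation.json is also re-read and rewritten once per image. A sketch of hoisting both out of the per-image path, where build_annotations stands for the dict-building part of viz_textbb (a hypothetical helper, not in the original code):

def main(db_fname):
    db = h5py.File(db_fname, 'r')
    dsets = sorted(db['data'].keys())  # sort the key list once, not per image
    all_annotations = {}
    for k in dsets:
        charBB = db['data'][k].attrs['charBB']
        wordBB = db['data'][k].attrs['wordBB']
        textToList = db['data'][k].attrs['txt'].tolist()
        # accumulate in memory instead of re-reading/rewriting the JSON per image
        all_annotations['gt_' + k] = build_annotations(charBB, wordBB, textToList)
    db.close()
    write_json({'annotations': all_annotations})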

My python program sorts information incorrectly

I have no idea how to fix the sorter part. Could anyone help me out here? My program's sorter should output something like what is shown in the image:
It takes data from the given URL and calculates its area. It should also sort the information, but right now it doesn't.
import json
import urllib.request
import requests
f = open('katastritunnused.txt', 'r')
response = requests.get('https://geoportaal.maaamet.ee/url/xgis-ky.php?ky=41201:004:0067&out=json')
json_response = response.json()
print(json_response["1"].get("Pindala"))
This fetches the information from the URL and gets its area ("Pindala").
c = []
list2 = []

def bubblesort(c):
    n = len(c)
    for _ in range(n):
        jarjestatud = True
        for x in range(n - _ - 1):
            if c[x][1] > c[x + 1][1]:
                c[x][1], c[x + 1][1] = c[x + 1][1], c[x][1]
                jarjestatud = False
        if jarjestatud:
            break
    return c

list2 = []
for j, _ in enumerate(c):
    templist = [_, list2[j]]
    list2.append(templist)

desclist = bubblesort(list2)
desclist.reverse()

def writer(a):
    with open('sorteeritud.csv', mode='w') as csv:
        for b in a:
            csv.write(str(b[0]) + ';' + str(b[1]) + '\n')

writer(desclist)
f.close()
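A likely culprit (a guess from reading the code, since the input data is not shown): the bubble sort swaps only the second element of each pair, c[x][1], so the identifiers in c[x][0] keep their old positions and end up paired with the wrong areas. Swapping whole rows keeps each identifier attached to its area, and Python's built-in sorted does the same in one line:

def bubblesort(c):
    n = len(c)
    for _ in range(n):
        jarjestatud = True
        for x in range(n - _ - 1):
            if c[x][1] > c[x + 1][1]:
                # swap the entire rows, not just the second column
                c[x], c[x + 1] = c[x + 1], c[x]
                jarjestatud = False
        if jarjestatud:
            break
    return c

# or, without a hand-written sort, descending by area:
desclist = sorted(list2, key=lambda row: row[1], reverse=True)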

Run MATLAB script in Python

I'm looking for a way to execute MATLAB code from a Python file without using MATLAB. This is my Python code:
count = 0
for image_name in os.listdir(directory_segmentation):
    if count < n_displays:
        segmented_image = cv2.imread(directory_segmentation + "/" + image_name)
        segmented = Image.open(directory_segmentation + "/" + image_name)
        original = Image.open(directory_originals + "/" + image_name)
        font = ImageFont.truetype(font_path, font_size)
        d1 = ImageDraw.Draw(segmented)
        masked = self.create_mask_plaque(segmented_image, color_plaque)
        maxlevel = 3
        v = self.waveletdescr(masked, maxlevel)  # <-- this function is written in MATLAB
        print(v)
        count += 1
The function marked above is coded in MATLAB, so I need to pass it some parameters and get a vector back. Is it possible? Thank you!
PS: the structure of the MATLAB script looks like this:
function v = waveletdescr(im, maxlevel);
  im = rgb2gray(im)
  im = double(im);
  [m,n] = size(im);
  npix = m*n;
  ....
  v = zeros( 3*maxlevel+1, 1 );  % the descriptors
  end
  v(end) = sum(sum( im.^2 )) / npix;

function imf = filterh(im, l)
  d = size(im(end-1:-1:end-l,:))
  imf = 0.5*[im; im(end-1:-1:end-l,:)];
  imf = imf(1:end-l,:) + imf(l+1:end,:);

function imf = filterg(im, l)
  imf = 0.5*[im; im(end-1:-1:end-l,:)];
  imf = imf(l+1:end,:) - imf(1:end-l,:);
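Since the goal is to avoid MATLAB itself, one option is to run the .m file under GNU Octave via the oct2py package (a minimal sketch; it assumes the script is Octave-compatible and that Octave is installed):

# pip install oct2py  (requires GNU Octave on the system)
from oct2py import Oct2Py

oc = Oct2Py()
oc.eval('pkg load image')              # rgb2gray lives in Octave's image package
oc.addpath('path/to/matlab/scripts')   # folder containing waveletdescr.m

# oct2py converts NumPy arrays to Octave matrices and back automatically
v = oc.waveletdescr(masked, 3)         # masked: HxWx3 image array, maxlevel = 3
print(v)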

make_node requires 4D tensor of kernels

I have trained a CNN model and saved its parameters in five files, but when I use these params to test photos I get the error in the title: make_node requires 4D tensor of kernels.
The code of load_data is:
def load_data(pag_name):
    k = 0
    for filename in os.listdir(pag_name):
        if filename != '.DS_Store':
            k = k + 1
    num = k
    print k

    i = 0
    j = 0
    label = 0
    train_set = numpy.empty((num, 1, 56, 56))
    while j < 1:
        for filename in os.listdir(pag_name):
            if filename != '.DS_Store':
                filename = pag_name + '/' + filename
                image = Image.open(filename)
                img_ndarray = numpy.asarray(image, dtype='float64') / 256
                img_ndarray = numpy.asarray([img_ndarray])
                # train_set[i] = numpy.ndarray.flatten(img_ndarray)
                train_set[i] = img_ndarray
                i = i + 1
        j = j + 1

    def shared_dataset(data_x, borrow=True):
        shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        return shared_x

    train_set = shared_dataset(train_set)
    print train_set.get_value(borrow=True).shape
    return train_set
and the code of use_CNN is:
def use_CNN(pag_name, nkerns=[20, 40, 60]):
    data = load_data(pag_name)
    data_num = data.get_value(borrow=True).shape[0]
    layer0_params, layer01_params, layer1_params, layer2_params, layer3_params = load_params()
    x = T.matrix('x')
    layer0_input = x.reshape((data_num, 1, 56, 56))
    layer0 = LeNetConvPoolLayer(
        input=layer0_input,
        params_W=layer0_params[0],
        params_b=layer0_params[1],
        image_shape=(data_num, 1, 56, 56),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )
I hadn't met this problem before and didn't know where or how to change my code.
The cause of the error is that the loaded params are not 4D: the W for a layer should be 4D, e.g. (20, 1, 5, 5), but I was loading (1, 5, 5), which triggers this error.
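Following that diagnosis, it is worth asserting the shapes right after loading so the failure is caught before Theano builds the graph (a minimal check, assuming load_params() returns (W, b) pairs as in the snippets above):

import numpy

layer0_params, layer01_params, layer1_params, layer2_params, layer3_params = load_params()

W0 = numpy.asarray(layer0_params[0])
print('layer0 W shape: %s' % (W0.shape,))  # expect (20, 1, 5, 5) for nkerns[0] = 20
assert W0.ndim == 4, 'LeNetConvPoolLayer needs a 4D kernel tensor, got %dD' % W0.ndim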
