python program to convert mapper and reducer program for hadoop

python program to convert mapper and reducer program for hadoop - python

I have two programs: the first converts .h5 files to .tiff files, and the second predicts temperature using an ARIMA model.
Here is the first program, which converts the .h5 files to per-day minimum and maximum .tiff files (also called GeoTIFF).
from osgeo import gdal
import numpy as np
import os
import h5py
from collections import defaultdict
from osgeo import osr
import datetime
# Directory that is walked (via os.walk) to find the input .h5 files.
in_dir = r'/Users/sunnybhargav/Desktop/jan'
# Directory under which the MIN*/MAX* .tif output paths are built.
out_dir = r'/Users/sunnybhargav/Desktop/new_output'
# Interactive alternative to the two hard-coded paths above (disabled):
#in_dir = input('Enter input directory path where hdf files are stored: ')
#out_dir = input('Enter output directory path where geotiff files are to be stored: ')
def arrayToTif(array,tifFilePath,proj,transform,nodatavalue):
    """Write a 2-D array to disk as a single-band Float32 GeoTIFF.

    Parameters
    ----------
    array : 2-D array; ``shape[0]`` is the row count and ``shape[1]`` the
        column count of the raster.
    tifFilePath : path of the GeoTIFF to create.
    proj : projection string handed to ``SetProjection``; skipped when
        empty or ``None``.
    transform : 6-element GDAL geotransform handed to ``SetGeoTransform``;
        skipped when ``None``.
    nodatavalue : value registered as the band's no-data marker.

    Raises
    ------
    IOError
        If GDAL cannot create the output dataset.
    """
    driver = gdal.GetDriverByName('GTiff')
    out_ds = driver.Create(tifFilePath,
                           array.shape[1],   # raster width  (columns)
                           array.shape[0],   # raster height (rows)
                           1,                # number of bands
                           gdal.GDT_Float32)
    if out_ds is None:
        raise IOError('GDAL could not create ' + str(tifFilePath))

    # Georeferencing was accepted by the signature but never applied before.
    if transform is not None:
        out_ds.SetGeoTransform(transform)
    if proj:
        out_ds.SetProjection(proj)

    band = out_ds.GetRasterBand(1)
    band.WriteArray(array)
    band.SetNoDataValue(nodatavalue)
    band.FlushCache()

    # Dropping the references closes the dataset and flushes it to disk.
    band = None
    out_ds = None
# Group the .h5 files by the two-digit day found at filename[6:8].
# NOTE: only the bare file name is stored and it is later joined with in_dir,
# so files located in sub-directories of in_dir will not be found.
dates_dict = defaultdict(list)
for root, directories, filenames in os.walk(in_dir):
    for filename in filenames:
        if filename.endswith('.h5'):
            hdffileDate = filename[6:15]
            hdfdate = int(hdffileDate[0:2])
            dates_dict[hdfdate].append(filename)
            print(filename)

# Values shared by every output raster.
transform = (0, 1, 0, 0, 0, -1)
proj = None
nodatavalue = -999

os.makedirs(out_dir, exist_ok=True)

# For every day, reduce all of its files to one per-pixel MIN and one MAX grid.
for key in dates_dict.keys():
    file_list = dates_dict[key]
    min_lst = 1000 * np.ones((2816, 2805))
    max_lst = -1000 * np.ones((2816, 2805))
    for v in file_list:
        # Read-only, and closed as soon as the first LST layer is copied out.
        with h5py.File(os.path.join(in_dir, v), 'r') as hdf_ds:
            lst = np.array(hdf_ds['LST'])[0, :, :]
        lst = lst.astype(np.float32)
        max_lst = np.maximum(max_lst, lst)
        # Lift no-data (-999) to +999 so it can never win the minimum.
        lst[lst == -999] = 999
        min_lst = np.minimum(min_lst, lst)
    # Pixels that were no-data in every file are still 999: restore the marker.
    min_lst[min_lst == 999] = -999

    # Output names are derived from the last file processed for this day.
    tiffileDate = v[6:15]
    MinName = 'MIN' + v[0:2] + tiffileDate.lower() + '.tif'
    MaxName = 'MAX' + v[0:2] + tiffileDate.lower() + '.tif'
    # The call and its argument list must sit on one logical line; previously
    # the bare name and the tuple were two separate no-op statements.
    arrayToTif(max_lst, os.path.join(out_dir, MaxName), proj, transform, nodatavalue)
    arrayToTif(min_lst, os.path.join(out_dir, MinName), proj, transform, nodatavalue)
    del lst
    del min_lst
    del max_lst
Second program
The second program reads each .tiff file into an ndarray, collects the maximum temperature at one particular pixel from every file, and then forecasts the temperature for the next 5 days.
import pandas as pd
import seaborn as sns
import matplotlib
import numpy as np
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from numpy import genfromtxt
import csv
import datetime
from datetime import datetime
import time
from matplotlib import pyplot
from pandas import Series
from statsmodels.tsa.arima_model import ARIMA
import numpy
from statsmodels.tsa.stattools import adfuller
import matplotlib.pylab as plt
from statsmodels.tsa.stattools import acf, pacf
from sklearn.metrics import mean_squared_error
import numpy as np
import subprocess
import gdal,osr
from gdalconst import *
import os
import numpy as np
from PIL import Image
import scipy.misc
from datetime import datetime
# import timeseries as ts
# Collect one temperature sample (pixel row 793, column 1160) from every
# raster found under `source`; the samples form the series used for forecasting.
count = 1
max_temp = []
min_temp = []
filename = []
filenamer = []
max_temp_points = []
min_temp_points = []
source = r'/Volumes/bhargav 1/data/NEW_MAX'
for root, dirs, filenames in os.walk(source):
    print(filenames)
    # os.walk yields names in arbitrary order; sort so the series order is
    # deterministic (chronological only if the file names sort by date).
    for f in sorted(filenames):
        print (f)
        # Join with `root`, not `source`, so files in sub-folders open too.
        dataset = gdal.Open(os.path.join(root, f), gdal.GA_ReadOnly)
        if dataset is None:
            # gdal.Open returned nothing usable (e.g. not a raster): skip it
            # rather than crash on the attribute access below.
            print('Skipping unreadable file:', f)
            continue
        geotransform = dataset.GetGeoTransform()
        band = dataset.GetRasterBand(1)
        data = band.ReadAsArray(0, 0, dataset.RasterXSize, dataset.RasterYSize).astype(np.float64)
        print(np.shape(data))
        # Offset of 273 is applied to the raw value (presumably Kelvin -> Celsius).
        max_temp_point = data[793][1160] - 273
        print(max_temp_point)
        print("Count:", count)
        max_temp_points.append(max_temp_point)
        count = count + 1
print(np.shape(max_temp_points))
print(np.mean(max_temp_points))
count = 1
np.save("Max_temp_points_1", max_temp_points)
# Fit an ARIMA model of order (5, 0, 4) to the collected point temperatures.
X = max_temp_points
# NOTE(review): ARIMA comes from statsmodels.tsa.arima_model, a legacy module;
# confirm the installed statsmodels still provides it and the disp= argument.
model = ARIMA(X, order=(5,0,4))
model_fit = model.fit(disp=-1)
# print summary of fit model
print(model_fit.summary())
# predict() without arguments; presumably the in-sample predictions - confirm.
forecast = model_fit.predict()
print (forecast)
# Forecast range starting right after the last observation.
# NOTE(review): start..start+6 covers 7 steps if `end` is inclusive, while the
# description mentions a 5-day prediction - confirm the intended horizon.
start_index = len(X)
end_index = start_index + 6
predict_val = model_fit.predict(start=start_index, end=end_index)
print('Prediction:',predict_val)
# Plot the observed series with the predictions overlaid in red.
pyplot.plot(X)
pyplot.plot(forecast, color='red')
pyplot.show()

Related

Librosa Split .wav file into 15s intervals

I'm new to working with audio files. I have several 60 second long files that I want to split into 15 second files (or any length). I'm able to split files into 1 second long files (so 60 files) but can't seem to get 15 second intervals to work. How can I create the intervals I'm looking for?
import os
import numpy as np
import librosa
import librosa.display
import soundfile as sf  # provides sf.write, used when saving the pieces

# Input recording and output folder for the pieces.
audio_dir = r'data\acoustics\recordings'
out_dir = r'data\acoustics\splits'
os.makedirs(out_dir, exist_ok=True)
audio_file = os.path.join(audio_dir, 'rec_20220729T160547Z.wav')
# sr=None keeps the file's own sample rate.
wave, sr = librosa.load(audio_file, sr=None)

# One section per `sr` samples, i.e. per second of audio (rounded up).
num_sections = int(np.ceil(len(wave) / sr))
split = []
for i in range(num_sections):
    t = wave[i * sr : i * sr + sr]
    split.append(t)

# The base name does not change per section, so compute it once.
recording_name = os.path.basename(audio_file[:-4])
for i in range(num_sections):
    out_file = f"{recording_name}_{str(i)}.wav"
    sf.write(os.path.join(out_dir, out_file), split[i], sr)

What you have done is mostly correct. It just needs minor changes.
First, load the data, which you have already done correctly:
import os
import numpy as np
import librosa
import librosa.display
import soundfile as sf # Missing import
# Folder holding the source recordings and folder receiving the pieces.
audio_dir = r'data\acoustics\recordings'
out_dir = r'data\acoustics\splits'
# Create the output folder if needed; an existing folder is not an error.
os.makedirs(out_dir, exist_ok=True)
audio_file = os.path.join(audio_dir, 'rec_20220729T160547Z.wav')
# sr=None asks librosa to keep the file's native sample rate.
wave, sr = librosa.load(audio_file, sr=None)
Calculate the length of segment:
# Desired length of every output piece, in seconds (fractions are allowed).
segment_dur_secs = 15
# Number of samples per piece; int() keeps it usable as a slice bound and
# range() step even when the duration is not a whole number.
segment_length = int(sr * segment_dur_secs)
Breaking up the data and saving to file:
# Number of pieces, counting a trailing partial piece as one.
num_sections = int(np.ceil(len(wave) / segment_length))
# Consecutive, non-overlapping windows; the last one may be shorter.
split = [
    wave[i * segment_length: (i + 1) * segment_length]
    for i in range(num_sections)
]
# Source file name with its 4-character extension stripped.
recording_name = os.path.basename(audio_file[:-4])
for i, t in enumerate(split):
    out_file = f"{recording_name}_{str(i)}.wav"
    sf.write(os.path.join(out_dir, out_file), t, sr)
Alternatively:
# Step through the signal one piece at a time; slicing clips the final piece.
split = [
    wave[start: start + segment_length]
    for start in range(0, len(wave), segment_length)
]
# Source file name with its 4-character extension stripped.
recording_name = os.path.basename(audio_file[:-4])
for i, segment in enumerate(split):
    out_file = f"{recording_name}_{i}.wav"
    target = os.path.join(out_dir, out_file)
    sf.write(target, segment, sr)
Edit: There is an issue with the code here because sf is not defined. (Fixed the import)

normalizing mel spectrogram to unit peak amplitude?

I am new to both python and librosa. I am trying to follow this method for a speech recognizer: acoustic front end
My code:
import librosa
import librosa.display
import numpy as np
# Load the audio; sr=None keeps the file's own sample rate.
y, sr = librosa.load('test.wav', sr = None)
# Scale the waveform with librosa's normalize helper (default arguments).
normalizedy = librosa.util.normalize(y)
stft = librosa.core.stft(normalizedy, n_fft = 256, hop_length=16)
# NOTE(review): the raw STFT is passed as S= with no magnitude/power step and
# sr is not forwarded; presumably melspectrogram expects a real-valued
# spectrogram and the true sample rate - confirm against the librosa docs.
mel = librosa.feature.melspectrogram(S=stft, n_mels=32)
# NOTE(review): normalize is called with default arguments; presumably it
# scales along one axis rather than over the whole array - confirm.
melnormalized = librosa.util.normalize(mel)
# The log is taken AFTER normalizing and then shifted by -log(1e-5), so mellog
# itself is not confined to [-1, 1] (an input of 1 maps to about 11.5).
mellog = np.log(melnormalized) - np.log(10**-5)
The problem is that when I apply librosa.util.normalize to variable mel, I expect values to be between 1 and -1, which they aren't. What am I missing?

If you want your output to be log-scaled and normalized to between -1 and +1, you should log-scale first, then normalize:
import librosa
import librosa.display
import numpy as np
# Load at the file's native sample rate and peak-normalize the waveform.
y, sr = librosa.load('test.wav', sr=None)
normalizedy = librosa.util.normalize(y)
stft = librosa.stft(normalizedy, n_fft=256, hop_length=16)
# The STFT is complex-valued; the mel filter bank must be applied to a real
# power spectrogram. sr is forwarded because the audio was loaded with
# sr=None, so librosa's default rate would place the mel bands incorrectly.
mel = librosa.feature.melspectrogram(S=np.abs(stft) ** 2, sr=sr, n_mels=32)
# Log-scale first; the small offset avoids log(0) on silent bins.
mellog = np.log(mel + 1e-9)
# Then normalize, so the final values lie within [-1, 1].
melnormalized = librosa.util.normalize(mellog)
# use melnormalized
# use melnormalized

Insert images in a folder into dataframe

I'm trying to read images from folders into a dataframe, where each row in the dataframe holds all the images of one folder:
import cv2
import os,glob
import matplotlib.pylab as plt
from os import listdir,makedirs
from os.path import isfile,join
import pandas as pd
import PIL
import numpy as np
from scipy.ndimage import imread
# Root directory; every entry returned by os.listdir(pth) is treated as a folder of images.
pth = 'C:/Users/Documents/myfolder/'
folders = os.listdir(pth)
# Accumulator that is meant to receive one row per folder.
videos = pd.DataFrame()
for folder in folders:
pth_upd = pth + folder + '/'
allfiles = os.listdir(pth_upd)
files = []
# columns and index are assigned here but are not used by the code below.
columns = ['data']
index = [folders]
# Keep only the names that contain '.bmp'.
for file in allfiles:
files.append(file) if ('.bmp' in file) else None
# Start with zero images of 64x64 and grow along axis 0.
samples = np.empty((0,64,64))
for file in files:
img = cv2.imread(os.path.join(pth_upd,file),cv2.IMREAD_GRAYSCALE)
# reshape(1,64,64) only succeeds when the image has exactly 64*64 pixels.
img = img.reshape(1,64,64)
samples = np.append(samples, img, axis=0)
# [samples] wraps the (n, 64, 64) array in a list, i.e. more than two dimensions;
# this is the line the traceback reports with "Must pass 2-d input".
result = pd.DataFrame([samples], index=[folder], columns=['videos'])
videos = videos.append(result)
After reading all the images of each folder into the samples array, how can I insert the images for each folder into one dataframe row? Currently I get:
ValueError Traceback (most recent call last)
in
17 samples = np.append(samples, img, axis=0)
18
---> 19 result = pd.DataFrame([samples], index=[folder], columns=['videos'])
20 videos = videos.append(result)
ValueError: Must pass 2-d input
:

It's certainly possible to put strings of the resized images into pandas, but there are much better ways to accomplish CNN training. I adapted your image processing code to show how you could do what you asked:
import io
import pandas as pd
import numpy as np
import sklearn
import requests
import tempfile
import os
import cv2
# Image processing for the df
def process_imgfile(x):
    """Return the text form of a row's image after a 64x64 grayscale resize.

    ``x`` is a row exposing ``Folder`` and ``image`` attributes; together
    they give the path of the file that is read.
    """
    image_path = os.path.join(x.Folder, x.image)
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    resized = cv2.resize(gray, (64, 64))
    return str(resized)
# Simulate folders with images in them
with tempfile.TemporaryDirectory() as f:
    f1 = os.path.join(f, "Folder1")
    f2 = os.path.join(f, "Folder2")
    os.mkdir(f1)
    os.mkdir(f2)

    # Fetch the sample picture once; every simulated file gets the same bytes.
    r = requests.get(
        'https://upload.wikimedia.org/wikipedia/en/a/a9/Example.jpg',
        stream=True)
    # Report the status only after `r` exists (it used to be printed before
    # the first request, which raised NameError).
    print(r.status_code)
    payload = r.content

    for x in range(20):
        with open(os.path.join(f1, "f1-{}.jpg".format(x)), "wb") as file1, open(
                os.path.join(f2, "f2-{}.jpg".format(x)), "wb") as file2:
            file1.write(payload)  # File writing...
            file2.write(payload)

    # os.walk yields the root first, then one (path, dirs, files) entry per
    # sub-folder; each path is kept together with its own file list.
    result = [x for x in os.walk(f)]
    folder1 = result[1][2]
    folder2 = result[2][2]

    # Generate dataframe data
    j = {"Folder": [], "image": []}
    for x in folder1:
        j["Folder"].append(result[1][0])
        j["image"].append(x)
    for x in folder2:
        j["Folder"].append(result[2][0])
        j["image"].append(x)

    # Use the process_imgfile function to append image data.
    # This must run inside the `with` block: the temporary files are deleted
    # as soon as the block exits.
    df = pd.DataFrame(j)
    df["imgdata"] = df.apply(process_imgfile, axis=1)
But on a large set of images this is not going to work. Instead, check out ImageDataGenerator which can let you load images at train and test time. It can also help you apply augmentation or synthesize data.

Dask delayed + Matplotlib.savefig() -> FAIL

My goal is to produce multiple png files from respectively multiple numpy arrays, loaded from medical images in my HD.
To make things quicker, I'm using dask delayed.
Here's my working code:
import os.path
from glob import glob
import nibabel as nib
import numpy as np
from dask import delayed
# Render one 2-D slice of a loaded image to a PNG file and return the PNG's path.
def process(data):
# Need to have the import inside so that multiprocessing works.
# Apparently doesn't solve the issue anyway..
import matplotlib.pyplot as plt
outpath = '/Users/user/outputdir/'
# Output name = input file name up to its first '.', plus '.png'.
name = os.path.basename(data.get_filename())
savename = name[:name.index('.')] + '.png'
# NOTE(review): plt.imshow / plt.savefig / plt.close all act on pyplot's implicit
# "current figure". Presumably dask runs several of these tasks at the same time,
# so one task can save or close the figure another task is drawing on, which
# would explain the empty PNGs - confirm; an explicit per-call
# matplotlib.figure.Figure would avoid the shared state.
plt.imshow(np.rot90(data.get_data()[15:74, 6:82, 18, 0]),
extent=[0, 1, 0, 1], aspect=1.28, cmap='gray')
plt.axis('off')
out = os.path.join(outpath, savename)
plt.savefig(out)
plt.close()
return out
# Build one lazy "load -> process" chain per matching input file.
L = []
for fn in glob("/Users/user/imagefiles/mb*.nii.gz"):
nifti = delayed(nib.load)(fn)
outpng = delayed(process)(nifti)
L.append(outpng)
# print is wrapped in delayed as well, so a single compute() call evaluates every
# chain in L and then prints the list of returned output paths.
results = delayed(print)(L)
results.compute()
My problem is that after each run some of the output images are empty (nothing in the png), and which images are empty seems pretty random, since all input data is valid.
I suspect this is a problem with multiprocessing and matplotlib, as seen in another related thread.
Does anyone have a suggestion on how to get this working with dask?
EDIT: Minimal working example
import os.path
import random
import string
import numpy as np
from dask import delayed
# Placeholder for the loading step: ignores fn and returns None.
def gendata(fn):
return
# Draw one slice of a random array to a randomly named PNG and return its path.
def process(data):
# Need to have the import inside so that multiprocessing works.
import matplotlib.pyplot as plt
outpath = '/Users/user/Pictures/test/'
# Ten random lowercase letters form the file name.
name = ''.join(random.choices(string.ascii_lowercase, k=10))
savename = name + '.png'
# The incoming argument is discarded here and replaced by random test data.
data = np.random.randint(0, 255, size=(100,100,20,2))
# NOTE(review): these pyplot calls share the implicit "current figure";
# presumably concurrent tasks interfere through it - confirm.
plt.imshow(np.rot90(data[15:74, 6:82, 18, 0]),
extent=[0, 1, 0, 1], aspect=1.28, cmap='gray')
plt.axis('off')
out = os.path.join(outpath, savename)
plt.savefig(out)
plt.close()
return out
# Ten independent lazy gendata -> process chains.
L = []
for fn in range(0, 10):
nifti = delayed(gendata)(fn)
outpng = delayed(process)(nifti)
L.append(outpng)
# A single compute() evaluates all chains and prints the returned paths.
results = delayed(print)(L)
results.compute()

Segmentation fault (core dumped) in scikit-learn Dictionary Learning

I have written code for dictionary learning. It works fine for 100 images, but if I use 200 images to learn the dictionary I get a segmentation fault.
Here is my code:
from time import time
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.feature_extraction.image import extract_patches_2d
from sklearn.feature_extraction.image import reconstruct_from_patches_2d
from sklearn.utils.fixes import sp_version
from sklearn.datasets import load_sample_image
from scipy import ndimage
from skimage import color
from skimage import io
from PIL import Image
from sklearn.decomposition import SparseCoder
from sklearn.decomposition import sparse_encode
#from skimage import data,restoration
from scipy.misc import imfilter, imread
from scipy.signal import convolve2d as conv2
from skimage import data, img_as_float
from scipy import ndimage as ndi
from skimage import feature
from scipy.misc import imsave
from os import listdir
# Script: gather 8x8 patches from up to ~200 images, standardize them and learn
# a 100-atom dictionary with MiniBatchDictionaryLearning.
patch_size = (8,8)
reshape_size = (512,512)
# Seed the patch array with the patches of one fixed image.
c = np.asarray(Image.open('test/ILSVRC2012_test_00016644.JPEG').resize((512,512), Image.ANTIALIAS))
data = extract_patches_2d(c,(8,8))
total_img = 0
for f in listdir('test/'):
if f.endswith('.JPEG') and f.startswith('I'):
# The test passes for total_img = 0..200, i.e. up to 201 files are appended
# on top of the seed image above.
if (total_img <= 200):
total_img = total_img + 1
print(f)
print('data shape : ',data.shape)
print('total file : ',total_img)
A = np.asarray(Image.open('test/'+f).resize(reshape_size, Image.ANTIALIAS))
X = extract_patches_2d(A,patch_size)
# np.append returns a new array, so the whole patch array is copied once per image.
data = np.append(data,X,axis=0)
print('patch shape',X.shape)
else:
break
print('Total number of Image : ',total_img)
print('Total Patches : ',data.shape)
print('total size of the array : ',data.nbytes)
# Build a noisy copy of a separate image and save it; n0 is not used again below.
n0 = np.asarray(Image.open('data2/004.jpg').resize((512,512), Image.ANTIALIAS))
n0 = n0 / 255
height, width, channel = n0.shape
n0 = n0 + 0.075 * np.random.randn(height, width,3)
n0 = n0 * 255
imsave('gray.png',n0)
patchsize = (8,8)
t0 = time()
# Flatten every patch to one row.
data = data.reshape(data.shape[0], -1)
print('Extract patch shape :',data.shape)
# NOTE(review): subtracting the float mean presumably promotes an integer patch
# array to a much larger floating-point array; with ~200 images that temporary
# may be what exhausts memory or overflows an index - confirm.
data = data - np.mean(data, axis=0)
# A column whose standard deviation is 0 is divided by zero here.
data = data / np.std(data, axis=0)
t1 = time()
print('Total time : ',round((t1-t0),2),' sec')
print('Learning the dictionary ....')
t2 = time()
n_iter = 5000
dico = MiniBatchDictionaryLearning(n_components=100,alpha=3,n_iter=n_iter)
# Fit on the standardized patches and keep the learned components.
V = dico.fit(data).components_
print('Dic shape : ',V.shape)
t3 = time()
print('No of iteration : ',n_iter)
print('Total time taken for Dictionary learning : ',round((t3-t2),2),' sec')
I have sufficient RAM (around 500 GB), but I still get a segmentation fault.
How do I fix this?

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

python program to convert mapper and reducer program for hadoop - python

Related

Librosa Split .wav file into 15s intervals

normalizing mel spectrogram to unit peak amplitude?

Insert images in a folder into dataframe

Dask delayed + Matplotlib.savefig() -> FAIL

Segmentation fault (core dumped) in scikit-learn Dictionary Learning

Categories

Resources