Clipping multiple netCDF4 files - Python

I am trying to crop a bunch of netCDF4 files and then combine them into one CSV file. I have tried the code below, but I can't seem to get the for loop to work.
import os
import xarray as xr
import netCDF4
import pandas as pd
import numpy as np
import glob

max_lat = -15
min_lat = -20
max_lon = -68
min_lon = -71

#for filename in glob.glob("/path/to/assignment/file*.txt"):
for filename in glob.glob("D:\\Precip\\*.nc"):
    print(filename)
    data = xr.open_dataset(filename, decode_times=False)
    mask_lon = (data.longitude >= min_lon) & (data.longitude <= max_lon)
    mask_lat = (data.latitude >= min_lat) & (data.latitude <= max_lat)
    cropped_data = data.where(mask_lon & mask_lat, drop=True)
    print(cropped_data['E'])
    df = data['P'].to_dataframe()
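
For reference, a minimal working sketch of the loop. It assumes, as in the snippet above, that each file carries a variable 'P' and coordinates named latitude/longitude. Two fixes over the original: convert the cropped dataset (not the full data), and collect the per-file DataFrames so they can be concatenated into one CSV after the loop instead of overwriting df on every iteration.

import glob
import pandas as pd
import xarray as xr

max_lat, min_lat = -15, -20
max_lon, min_lon = -68, -71

frames = []
for filename in glob.glob("D:\\Precip\\*.nc"):
    data = xr.open_dataset(filename, decode_times=False)
    # Boolean masks over the coordinate arrays
    mask_lon = (data.longitude >= min_lon) & (data.longitude <= max_lon)
    mask_lat = (data.latitude >= min_lat) & (data.latitude <= max_lat)
    cropped = data.where(mask_lon & mask_lat, drop=True)
    # Convert the *cropped* dataset, and keep it for later
    frames.append(cropped['P'].to_dataframe())

# One CSV from all files, written once after the loop
pd.concat(frames).to_csv("precip_cropped.csv")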


Getting wrong results from a loop calculation and an invalid syntax error

I have the arrays below and need to calculate req[i] and JJ[i]. All the values come out as zeros except req[2] and req[8], and even those are wrong. Why is that happening?
The second issue is with the second line of the JJ calculation (JJ[i] = 0.00633*(2*np.pi*math.sqrt(kz[i]*ky[i])*dx[i])/(pvt['muo']*(np.log(req/well['rw']) + PP['s_h']))): it says invalid syntax. What's wrong there?
These are the arrays I'm using:
And this is the code:
import numpy as np
#%% Importing modules Loading .yml/.yaml file
import yaml
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy.special as sc
import pandas as pd

# Productivity index:
# i=len(n)
# req=np.zeros((n))
# JJ=np.zeros((n))
def Prod_index(res, pvt, num, well, PP, w, well_drc, i):
    for i in range(n):
        if well_drc == 'v':
            req[i] = 0.28*math.sqrt(math.sqrt(ky[i]/kx[i])*dx[i]**2 + math.sqrt(kx[i]/ky[i])*dy[i]**2)/((ky[i]/kx[i])**0.25 + (kx[i]/ky[i])**0.25)
            JJ[i] = 0.00633*(2*np.pi*math.sqrt(kx[i]*kz[i])*res['h'])/(pvt['muo']*(np.log(req[i]/well['rw'])) + PP['s_v'])
        elif well_drc == 'h':
            req[i] = 0.28*math.sqrt(math.sqrt(kx[i]*ky[i])*dz[i]**2 + math.sqrt(kz[i]/ky[i]*dy[i]**2)/((ky[i]/kz[i])**0.25 + (kz[i]/ky[i])**0.25)
            JJ[i] = 0.00633*(2*np.pi*math.sqrt(kz[i]*ky[i])*dx[i])/(pvt['muo']*(np.log(req/well['rw']) + PP['s_h']))
    return req, JJ
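
A sketch of a corrected version, for reference. The "invalid syntax" is almost certainly the unbalanced parentheses on the req[i] line of the 'h' branch: it opens one more parenthesis than it closes, so Python only complains on the following line. The zeros suggest req and JJ are the commented-out module-level arrays; allocating them inside the function and passing n explicitly avoids that, and np.log should receive the scalar req[i], not the whole req array. The arrays kx, ky, kz, dx, dy, dz are assumed to be in scope as in the question, and the balanced 'h'-branch grouping below is a guess modeled on the 'v' branch.

import math
import numpy as np

def prod_index(res, pvt, well, PP, well_drc, n, kx, ky, kz, dx, dy, dz):
    # Allocate the outputs here instead of relying on globals
    req = np.zeros(n)
    JJ = np.zeros(n)
    for i in range(n):
        if well_drc == 'v':
            req[i] = 0.28*math.sqrt(math.sqrt(ky[i]/kx[i])*dx[i]**2 + math.sqrt(kx[i]/ky[i])*dy[i]**2) \
                     / ((ky[i]/kx[i])**0.25 + (kx[i]/ky[i])**0.25)
            # s_v moved inside the parentheses to match the 'h' branch form
            JJ[i] = 0.00633*(2*np.pi*math.sqrt(kx[i]*kz[i])*res['h']) \
                    / (pvt['muo']*(np.log(req[i]/well['rw']) + PP['s_v']))
        elif well_drc == 'h':
            # Closing parenthesis added after dy[i]**2 (the original opened
            # math.sqrt(...) without closing it before the division)
            req[i] = 0.28*math.sqrt(math.sqrt(kx[i]*ky[i])*dz[i]**2 + math.sqrt(kz[i]/ky[i])*dy[i]**2) \
                     / ((ky[i]/kz[i])**0.25 + (kz[i]/ky[i])**0.25)
            # req[i] (a scalar), not req (the whole array), goes into the log
            JJ[i] = 0.00633*(2*np.pi*math.sqrt(kz[i]*ky[i])*dx[i]) \
                    / (pvt['muo']*(np.log(req[i]/well['rw']) + PP['s_h']))
    return req, JJ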

Add PNG object to a pandas dataframe

I tried to add a PNG object to a pandas dataframe, and it does not work at all:
!pip install rdkit-pypi
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

smiles_list = ['N[C@H](C(=O)O)C']
mol_list = []
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    mol_list.append(mol)
img = Draw.MolsToGridImage(mol_list, molsPerRow = 4)
glycine = mol_list[0]
figure_list = []
bi = {}
fp = AllChem.GetMorganFingerprintAsBitVect(glycine, 2, nBits = 1024, bitInfo = bi)
fp_arr = np.zeros((1,))
DataStructs.ConvertToNumpyArray(fp, fp_arr)
np.nonzero(fp_arr)
list(fp.GetOnBits())
prints = [(glycine, x, bi) for x in fp.GetOnBits()]
figure = Draw.DrawMorganBits(prints, molsPerRow = 4, legends = [str(x) for x in fp.GetOnBits()])
figure_list.append(figure)
df = pd.DataFrame({'smiles': smiles_list[0]}, index = [0])
PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'Molecule')
df['Fragments'] = figure_list
df
Instead of displaying the PNG image inside the pandas dataframe, it shows the message "<PIL.PngImagePlugin.PngImageFile image mode=RG...".
I would expect to add the PNG object into a pandas dataframe.
I used the following configuration in Google Colab:
RDKit version: 2022.03.5
OS: Windows
Python version: Python 3.7.15
Are you using conda? No
If you are using conda, which channel did you install the rdkit from? Not applied
If you are not using conda: how did you install the RDKit? !pip install rdkit-pypi
The PNG can be displayed inline in the pandas dataframe, as shown below, with the help of a few helper functions.
from IPython.display import HTML
import base64
from io import BytesIO
from PIL import Image

def get_thumbnail(path):
    i = Image.open(path)
    i.thumbnail((150, 150), Image.LANCZOS)
    return i

def image_base64(im):
    if isinstance(im, str):
        im = get_thumbnail(im)
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im):
    return f'<img src="data:image/jpeg;base64,{image_base64(im)}">'
And finally, the code below returns the dataframe with the image rendered inline.
HTML(df.to_html(formatters={'Fragments': image_formatter}, escape=False))

Reading compressed EXR scan-line files by loading one scanline at a time into memory

I have some compressed EXR scan-line files that cannot be read with minexr (it fails with assert self.compr == 0x00, 'Compression not supported.'). Additionally, I want to read them one scanline at a time, so as not to overload memory. I can't find out how to do this anywhere. Any ideas? Here's my code so far:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import minexr
import os

ETC_PATH = Path(__file__).parent / 'etc'
os.chdir(r"C:\Users\DELL\Documents\Quixel\Trial task for hire evaluation\test1")

def main():
    with open('u1_v1.exr', 'rb') as fp:
        reader = minexr.load(fp)
UPDATE:
I'm now using OpenEXR; however, the OpenEXR Python module has different classes and methods than the documentation shows. How would I use OpenEXR to read the EXR file one scanline at a time? This is my updated code:
import sys
import array
import OpenEXR
import Imath

if len(sys.argv) != 3:
    print("usage: exrnormalize.py exr-input-file exr-output-file")
    sys.exit(1)

# Open the input file
file = OpenEXR.InputFile(sys.argv[1])

# Compute the size
dw = file.header()['dataWindow']
sz = (dw.max.x - dw.min.x + 1, dw.max.y - dw.min.y + 1)

# Read the three color channels as 32-bit floats
FLOAT = Imath.PixelType(Imath.PixelType.FLOAT)
(R, G, B) = [array.array('f', file.channel(Chan, FLOAT)).tolist() for Chan in ("R", "G", "B")]
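
In case it is useful, a sketch of reading one scanline at a time. In the classic OpenEXR Python bindings, InputFile.channel() accepts optional scanLine1/scanLine2 arguments that restrict the read to a range of scanlines, with the library handling decompression internally; if your build differs, check help(OpenEXR.InputFile.channel). The filename is the one from the question.

import array
import OpenEXR
import Imath

FLOAT = Imath.PixelType(Imath.PixelType.FLOAT)

exr = OpenEXR.InputFile('u1_v1.exr')
dw = exr.header()['dataWindow']
width = dw.max.x - dw.min.x + 1

for y in range(dw.min.y, dw.max.y + 1):
    # Read a single scanline per channel instead of the whole image
    row = {c: array.array('f', exr.channel(c, FLOAT, y, y)) for c in ("R", "G", "B")}
    # row["R"] now holds `width` floats for scanline y; process it, then let it go
exr.close()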

Insert images in a folder into dataframe

I'm trying to read images from folders into a dataframe, where each row in the dataframe holds all the images for one folder:
import cv2
import os, glob
import matplotlib.pylab as plt
from os import listdir, makedirs
from os.path import isfile, join
import pandas as pd
import PIL
import numpy as np
from scipy.ndimage import imread

pth = 'C:/Users/Documents/myfolder/'
folders = os.listdir(pth)
videos = pd.DataFrame()
for folder in folders:
    pth_upd = pth + folder + '/'
    allfiles = os.listdir(pth_upd)
    files = []
    columns = ['data']
    index = [folders]
    for file in allfiles:
        files.append(file) if ('.bmp' in file) else None
    samples = np.empty((0, 64, 64))
    for file in files:
        img = cv2.imread(os.path.join(pth_upd, file), cv2.IMREAD_GRAYSCALE)
        img = img.reshape(1, 64, 64)
        samples = np.append(samples, img, axis=0)
    result = pd.DataFrame([samples], index=[folder], columns=['videos'])
    videos = videos.append(result)
After reading all the images in each folder into the samples array, how can I insert the images for each folder into one dataframe row? I get this error:

ValueError                                Traceback (most recent call last)
     17         samples = np.append(samples, img, axis=0)
     18
---> 19     result = pd.DataFrame([samples], index=[folder], columns=['videos'])
     20     videos = videos.append(result)
ValueError: Must pass 2-d input
It's certainly possible to put strings of the resized images into pandas, but there are much better ways to accomplish CNN training. I adapted your image processing code to show how you could do what you asked:
import io
import pandas as pd
import numpy as np
import sklearn
import requests
import tempfile
import os
import cv2

# Image processing for the df
def process_imgfile(x):
    img = cv2.imread(os.path.join(x.Folder, x.image), cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (64, 64))
    img = str(img)
    return img

# Simulate folders with images in them
with tempfile.TemporaryDirectory() as f:
    f1 = os.path.join(f, "Folder1")
    f2 = os.path.join(f, "Folder2")
    os.mkdir(f1)
    os.mkdir(f2)
    for x in range(20):
        with open(os.path.join(f1, "f1-{}.jpg".format(x)), "wb") as file1, open(
                os.path.join(f2, "f2-{}.jpg".format(x)), "wb") as file2:
            r = requests.get(
                'https://upload.wikimedia.org/wikipedia/en/a/a9/Example.jpg',
                stream=True)
            print(r.status_code)  # moved after the request so `r` exists
            for chunk in r.iter_content(16):  # File writing...
                file1.write(chunk)
                file2.write(chunk)
    result = [x for x in os.walk(f)]
    folder1 = result[1][2]
    folder2 = result[2][2]

    # Generate dataframe data
    j = {"Folder": [], "image": []}
    for x in folder1:
        j["Folder"].append(result[1][0])
        j["image"].append(x)
    for x in folder2:
        j["Folder"].append(result[2][0])
        j["image"].append(x)

    # Use the process_imgfile function to append image data
    df = pd.DataFrame(j)
    df["imgdata"] = df.apply(process_imgfile, axis=1)
But on a large set of images this is not going to work. Instead, check out ImageDataGenerator, which lets you load images at train and test time, and can also help you apply augmentation or synthesize data.
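
As a footnote on the original ValueError: the pd.DataFrame constructor rejects [samples] because samples is 3-D and the constructor only accepts 2-D input. A minimal sketch of a direct fix (the folder name and array shape below are stand-ins) is to wrap the array in a list inside a dict, so the whole stack of frames lands in a single object-dtype cell:

import numpy as np
import pandas as pd

samples = np.zeros((2, 64, 64))  # stand-in: two 64x64 grayscale frames
folder = "folder_a"              # stand-in for the loop variable

# Wrapping `samples` in a list stores the entire 3-D array in one cell
result = pd.DataFrame({'videos': [samples]}, index=[folder])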

Converting Python programs into mapper and reducer programs for Hadoop

I have two programs: the first converts .h5 files to .tiff files, and the second predicts temperature using an ARIMA model. How can I convert them into mapper and reducer programs for Hadoop?
Here is the first program. It converts .h5 files to .tiff files, writing daily min and max GeoTIFFs.
from osgeo import gdal
import numpy as np
import os
import h5py
from collections import defaultdict
from osgeo import osr
import datetime

in_dir = r'/Users/sunnybhargav/Desktop/jan'
out_dir = r'/Users/sunnybhargav/Desktop/new_output'
#in_dir = input('Enter input directory path where hdf files are stored: ')
#out_dir = input('Enter output directory path where geotiff files are to be stored: ')

def arrayToTif(array, tifFilePath, proj, transform, nodatavalue):
    with open(tifFilePath, 'a') as file:
        pass
    # write raster
    out_ds = gdal.GetDriverByName('GTiff').Create(tifFilePath,
                                                  array.shape[1],
                                                  array.shape[0],
                                                  1,  # Number of bands
                                                  gdal.GDT_Float32)
    out_ds.GetRasterBand(1).WriteArray(array)
    out_ds.GetRasterBand(1).SetNoDataValue(nodatavalue)
    # close tif to write into disk (free tif file)
    out_ds = None

dates_dict = defaultdict(list)
for root, directories, filenames in os.walk(in_dir):
    for filename in filenames:
        if filename.endswith('.h5'):
            hdffileDate = filename[6:15]
            hdfdate = int(hdffileDate[0:2])
            dates_dict[hdfdate].append(filename)
            print(filename)

for key in dates_dict.keys():
    file_list = dates_dict[key]
    min_lst = 1000*np.ones((2816, 2805))
    max_lst = -1000*np.ones((2816, 2805))
    for v in file_list:
        hdf_ds = h5py.File(os.path.join(in_dir, v))
        lst = np.array(hdf_ds['LST'])[0, :, :]
        hdf_ds = gdal.Open(os.path.join(in_dir, v))
        metadata = hdf_ds.GetMetadata_Dict()
        lst = lst.astype('Float32')
        max_lst = np.maximum(max_lst, lst)
        lst[lst == -999] = 999
        min_lst = np.minimum(min_lst, lst)
    min_lst[min_lst == 999] = -999
    transform = (0, 1, 0, 0, 0, -1)
    proj = None
    nodatavalue = -999
    tiffileDate = v[6:15]
    MinName = 'MIN' + v[0:2] + str.lower(tiffileDate) + '.tif'
    MaxName = 'MAX' + v[0:2] + str.lower(tiffileDate) + '.tif'
    arrayToTif(max_lst, os.path.join(out_dir, MaxName), proj, transform, nodatavalue)
    arrayToTif(min_lst, os.path.join(out_dir, MinName), proj, transform, nodatavalue)
    del lst
    del min_lst
    del max_lst
Second program
The second program reads the TIFFs back as ndarrays, extracts the maximum temperature for a particular point, and predicts the next five days.
import pandas as pd
import seaborn as sns
import matplotlib
import numpy as np
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from numpy import genfromtxt
import csv
import datetime
from datetime import datetime
import time
from matplotlib import pyplot
from pandas import Series
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from sklearn.metrics import mean_squared_error
import matplotlib.pylab as plt
import subprocess
import gdal, osr
from gdalconst import *
import os
from PIL import Image
import scipy.misc
# import timeseries as ts

count = 1
max_temp = []
min_temp = []
filename = []
filenamer = []
max_temp_points = []
min_temp_points = []

source = r'/Volumes/bhargav 1/data/NEW_MAX'
for root, dirs, filenames in os.walk(source):
    print(filenames)
    for f in filenames:
        print(f)
        dataset = gdal.Open(source + '//' + f, gdal.GA_ReadOnly)
        #print(dataset)
        geotransform = dataset.GetGeoTransform()
        band = dataset.GetRasterBand(1)
        data = band.ReadAsArray(0, 0, dataset.RasterXSize, dataset.RasterYSize).astype(np.float64)
        #print(np.histogram(data, bins=500))
        print(np.shape(data))
        max_temp_point = data[793][1160]
        max_temp_point = max_temp_point - 273
        print(max_temp_point)
        print("Count:", count)
        max_temp_points.append(max_temp_point)
        count = count + 1

print(np.shape(max_temp_points))
print(np.mean(max_temp_points))
count = 1
np.save("Max_temp_points_1", max_temp_points)

X = max_temp_points
model = ARIMA(X, order=(5, 0, 4))
model_fit = model.fit(disp=-1)
# print summary of fit model
print(model_fit.summary())
forecast = model_fit.predict()
print(forecast)

# plot
start_index = len(X)
end_index = start_index + 6
predict_val = model_fit.predict(start=start_index, end=end_index)
print('Prediction:', predict_val)
pyplot.plot(X)
pyplot.plot(forecast, color='red')
pyplot.show()
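
Since the goal is Hadoop, here is a minimal sketch of the Hadoop Streaming shape the two programs would need to fit: a mapper that reads lines from stdin and emits tab-separated key/value pairs, and a reducer that aggregates consecutive lines sharing a key (Hadoop sorts by key between the two stages). The date/temperature line format is a hypothetical stand-in for the per-point extraction above.

#!/usr/bin/env python
# mapper.py -- emit one "date<TAB>temperature" pair per input line
import sys

for line in sys.stdin:
    parts = line.split()            # hypothetical input: "<date> <temp>"
    if len(parts) == 2:
        date, temp = parts
        print('%s\t%s' % (date, temp))

#!/usr/bin/env python
# reducer.py -- keys arrive sorted, so track a running max per key
import sys

current_key, current_max = None, None
for line in sys.stdin:
    key, value = line.rstrip('\n').split('\t', 1)
    temp = float(value)
    if key == current_key:
        current_max = max(current_max, temp)
    else:
        if current_key is not None:
            print('%s\t%s' % (current_key, current_max))
        current_key, current_max = key, temp
if current_key is not None:
    print('%s\t%s' % (current_key, current_max))

Both scripts would then be handed to Hadoop Streaming with something like hadoop jar hadoop-streaming.jar -files mapper.py,reducer.py -mapper mapper.py -reducer reducer.py -input <input> -output <output> (the jar location and paths depend on your installation).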
