Add PNG object to a pandas dataframe - python

I tried to add a PNG object to a pandas dataframe, and it does not work at all:
!pip install rdkit-pypi
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

smiles_list = ['N[C@@H](C(=O)O)C']
mol_list = []
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    mol_list.append(mol)
img = Draw.MolsToGridImage(mol_list, molsPerRow=4)
glycine = mol_list[0]
figure_list = []
bi = {}
fp = AllChem.GetMorganFingerprintAsBitVect(glycine, 2, nBits=1024, bitInfo=bi)
fp_arr = np.zeros((1,))
DataStructs.ConvertToNumpyArray(fp, fp_arr)
np.nonzero(fp_arr)
list(fp.GetOnBits())
prints = [(glycine, x, bi) for x in fp.GetOnBits()]
figure = Draw.DrawMorganBits(prints, molsPerRow=4, legends=[str(x) for x in fp.GetOnBits()])
figure_list.append(figure)
df = pd.DataFrame({'smiles': smiles_list[0]}, index=[0])
PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'Molecule')
df['Fragments'] = figure_list
df
Instead of displaying the PNG image inside the pandas dataframe, it shows the text "<PIL.PngImagePlugin.PngImageFile image mode=RG...".
I would expect the PNG object to be displayed inside the pandas dataframe.
I used the following configuration in Google Colab:
RDKit version: 2022.03.5
OS: Windows
Python version: Python 3.7.15
Are you using conda? No
If you are using conda, which channel did you install the rdkit from? Not applicable
If you are not using conda: how did you install the RDKit? !pip install rdkit-pypi

The PNG file can be displayed inline in the pandas dataframe as shown below, with the help of a few helper functions.
from IPython.display import HTML
import base64
from io import BytesIO
from PIL import Image

def get_thumbnail(path):
    # Load an image from disk and shrink it to a 150x150 thumbnail
    i = Image.open(path)
    i.thumbnail((150, 150), Image.LANCZOS)
    return i

def image_base64(im):
    # Accept either a path or a PIL image and return it base64-encoded
    if isinstance(im, str):
        im = get_thumbnail(im)
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im):
    # Embed the encoded image in an HTML <img> tag
    return f'<img src="data:image/jpeg;base64,{image_base64(im)}">'
And finally, the code below returns the dataframe with the images rendered inline.
HTML(df.to_html(formatters={'Fragments': image_formatter}, escape=False))
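Alternatively (a minimal sketch, assuming the same image_formatter helper above and a Jupyter-style frontend), pandas' Styler can apply the formatter without building the HTML by hand; Styler.format does not HTML-escape the returned string, so the <img> tag renders directly:
# Hedged alternative sketch: let the pandas Styler apply the formatter
# to the 'Fragments' column when the dataframe is displayed.
df.style.format({'Fragments': image_formatter})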


How to save rdkit DrawMorganBit output as image?

Code:
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw, AllChem, PandasTools, DataStructs

mol = Chem.MolFromSmiles('O=C1N([C@@H](C)C2CC2)CC3=CC(C4=C(C)N=C(NC(C)=O)S4)=CC(S(=O)(C)=O)=C31')
bi = {}
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, bitInfo=bi)
fp_arr = np.zeros((1,))
DataStructs.ConvertToNumpyArray(fp, fp_arr)
fp_arr = np.nonzero(fp_arr)[0]
for ar in fp_arr:
    img = Draw.DrawMorganBit(mol, ar, bi, useSVG=True)
    img.save("submol.png")
I want to create an image using DrawMorganBit to see how a molecule's fingerprint bit was generated (using PyCharm instead of Jupyter Notebook).
However, two problems have arisen: a kekulize problem and an image storage problem.
I don't know what is causing the kekulize problem, or how to save the DrawMorganBit image.
rdkit version 2022.3.4
Error
kekulize problem:
rdkit.Chem.rdchem.KekulizeException: Can't kekulize mol. Unkekulized atoms: 5 6 8 9 14
save problem:
AttributeError: 'str' object has no attribute 'save'
The solution for the kekulization problem can be found here:
https://github.com/rdkit/rdkit/issues/5129
Your problem with saving the image is that you are trying to save an SVG as a PNG: with useSVG=True, DrawMorganBit returns the SVG as a plain string, and strings have no save method.
With the code below you should get an image with all Morgan bits.
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, Draw
from rdkit.Chem.Draw import IPythonConsole

# Skip the drawing-time preparation step that triggers the kekulize error
drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
drawOptions.prepareMolsBeforeDrawing = False

mol = Chem.MolFromSmiles('O=C1N([C@@H](C)C2CC2)CC3=CC(C4=C(C)N=C(NC(C)=O)S4)=CC(S(=O)(C)=O)=C31')
bi = {}
fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, bitInfo=bi)
tpls = [(mol, x, bi) for x in fp.GetOnBits()]
p = Draw.DrawMorganBits(tpls, molsPerRow=5, legends=[str(x) for x in fp.GetOnBits()], drawOptions=drawOptions)
p.save('submol.png')
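If you do want the per-bit SVG output from the question, the string that DrawMorganBit returns with useSVG=True can simply be written out as text. A minimal sketch, reusing mol, ar, and bi from the question's loop:
# Sketch: useSVG=True yields an SVG string, so write it as a .svg file
# instead of calling .save() (which only exists on PIL images).
svg = Draw.DrawMorganBit(mol, ar, bi, useSVG=True)
with open("submol.svg", "w") as fh:
    fh.write(svg)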

I get the error "module 'keygen' has no attribute 'keygen'"

I have also installed the appropriate libraries, but the error still shows. The code is written below:
import keygen as kg
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
img = mpimg.imread('Images/111.png')
plt.imshow(img)
#plt.show()
#Now generating the chaotic key
height = img.shape[0]
width = img.shape[1]
key = kg.keygen(0.01,3.951,height*width)
I get the error at the last line.
Here, keygen is a function, not a library; the code for that function is:
def keygen(x, r, size):
    key = []
    for i in range(size):
        x = r*x*(1-x)
        key.append(int((x*pow(10, 16)) % 256))
    return key
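A common cause of this AttributeError is a name collision: if keygen is defined in your own keygen.py but another file or package named keygen shadows it on the import path, kg.keygen will not exist. A minimal self-contained sketch (an assumption about the cause, not a confirmed diagnosis) that sidesteps the import entirely by defining the function locally:
# Hypothetical self-contained version: define keygen in the same script
# instead of importing it, avoiding any shadowed module named "keygen".
import matplotlib.image as mpimg

def keygen(x, r, size):
    # Logistic map x_{n+1} = r * x_n * (1 - x_n), mapped to bytes 0..255
    key = []
    for _ in range(size):
        x = r * x * (1 - x)
        key.append(int((x * pow(10, 16)) % 256))
    return key

img = mpimg.imread('Images/111.png')  # path from the question
height, width = img.shape[0], img.shape[1]
key = keygen(0.01, 3.951, height * width)
print(len(key), key[:8])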

Insert images in a folder into dataframe

I'm trying to read images from folders into a dataframe, where each row in the dataframe holds all the images of one folder:
import cv2
import os, glob
import matplotlib.pylab as plt
from os import listdir, makedirs
from os.path import isfile, join
import pandas as pd
import PIL
import numpy as np
from scipy.ndimage import imread

pth = 'C:/Users/Documents/myfolder/'
folders = os.listdir(pth)
videos = pd.DataFrame()
for folder in folders:
    pth_upd = pth + folder + '/'
    allfiles = os.listdir(pth_upd)
    files = []
    columns = ['data']
    index = [folders]
    for file in allfiles:
        files.append(file) if ('.bmp' in file) else None
    samples = np.empty((0, 64, 64))
    for file in files:
        img = cv2.imread(os.path.join(pth_upd, file), cv2.IMREAD_GRAYSCALE)
        img = img.reshape(1, 64, 64)
        samples = np.append(samples, img, axis=0)
    result = pd.DataFrame([samples], index=[folder], columns=['videos'])
    videos = videos.append(result)
After reading all the images of each folder into the samples array, how can I insert the images for each folder into one dataframe row? I get:
ValueError                        Traceback (most recent call last)
     17         samples = np.append(samples, img, axis=0)
     18
---> 19     result = pd.DataFrame([samples], index=[folder], columns=['videos'])
     20     videos = videos.append(result)

ValueError: Must pass 2-d input
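A note on the error: pd.DataFrame only accepts 1-d or 2-d inputs, so the 3-d (n, 64, 64) stack cannot be passed directly. A minimal sketch of the direct workaround, assuming the samples array and folder variable from the loop above, is to store the whole stack as a single object cell:
# Sketch: keep the entire 3-d image stack in one object cell, so each
# dataframe row holds all images of one folder.
result = pd.DataFrame({'videos': [samples]}, index=[folder])
videos = pd.concat([videos, result])  # DataFrame.append is deprecated in newer pandas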
It's certainly possible to put string representations of the resized images into pandas, but there are much better ways to feed a CNN for training. I adapted your image-processing code to show how you could do what you asked:
import pandas as pd
import requests
import tempfile
import os
import cv2

# Image processing for the df
def process_imgfile(x):
    img = cv2.imread(os.path.join(x.Folder, x.image), cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (64, 64))
    img = str(img)
    return img

# Simulate folders with images in them
with tempfile.TemporaryDirectory() as f:
    f1 = os.path.join(f, "Folder1")
    f2 = os.path.join(f, "Folder2")
    os.mkdir(f1)
    os.mkdir(f2)
    for x in range(20):
        with open(os.path.join(f1, "f1-{}.jpg".format(x)), "wb") as file1, open(
                os.path.join(f2, "f2-{}.jpg".format(x)), "wb") as file2:
            r = requests.get(
                'https://upload.wikimedia.org/wikipedia/en/a/a9/Example.jpg',
                stream=True)
            print(r.status_code)  # confirm the download succeeded
            for chunk in r.iter_content(16):  # File writing...
                file1.write(chunk)
                file2.write(chunk)
    result = [x for x in os.walk(f)]
    folder1 = result[1][2]
    folder2 = result[2][2]
    # Generate dataframe data
    j = {"Folder": [], "image": []}
    for x in folder1:
        j["Folder"].append(result[1][0])
        j["image"].append(x)
    for x in folder2:
        j["Folder"].append(result[2][0])
        j["image"].append(x)
    # Use the process_imgfile function to append image data
    df = pd.DataFrame(j)
    df["imgdata"] = df.apply(process_imgfile, axis=1)
But on a large set of images this is not going to work. Instead, check out ImageDataGenerator, which can load images lazily at train and test time. It can also apply augmentation or synthesize data.
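For illustration, a minimal sketch of that approach (assuming Keras/TensorFlow is installed and the folder layout from the question, one subfolder per class):
# Hedged sketch: stream grayscale 64x64 images straight from subfolders
# instead of packing them into a dataframe.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(rescale=1.0 / 255)  # simple pixel scaling
batches = datagen.flow_from_directory(
    'C:/Users/Documents/myfolder/',  # path from the question
    target_size=(64, 64),
    color_mode='grayscale',
    class_mode='categorical',
    batch_size=32)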

python program to convert mapper and reducer program for hadoop

I have two programs: the first converts .h5 files to .tiff files, and the second predicts temperature using an ARIMA model.
Here is the first program, which converts each .h5 file to .tiff and writes out min and max GeoTIFFs.
from osgeo import gdal
import numpy as np
import os
import h5py
from collections import defaultdict
from osgeo import osr
import datetime

in_dir = r'/Users/sunnybhargav/Desktop/jan'
out_dir = r'/Users/sunnybhargav/Desktop/new_output'
#in_dir = input('Enter input directory path where hdf files are stored: ')
#out_dir = input('Enter output directory path where geotiff files are to be stored: ')

def arrayToTif(array, tifFilePath, proj, transform, nodatavalue):
    with open(tifFilePath, 'a') as file:
        pass
    # write raster
    out_ds = gdal.GetDriverByName('GTiff').Create(tifFilePath,
                                                  array.shape[1],
                                                  array.shape[0],
                                                  1,  # number of bands
                                                  gdal.GDT_Float32)
    out_ds.GetRasterBand(1).WriteArray(array)
    out_ds.GetRasterBand(1).SetNoDataValue(nodatavalue)
    # close tif to write into disk (free tif file)
    out_ds = None

# Group the .h5 files by date
dates_dict = defaultdict(list)
for root, directories, filenames in os.walk(in_dir):
    for filename in filenames:
        if filename.endswith('.h5'):
            hdffileDate = filename[6:15]
            hdfdate = int(hdffileDate[0:2])
            dates_dict[hdfdate].append(filename)
            print(filename)

# Accumulate per-date min and max LST, then write them as GeoTIFFs
for key in dates_dict.keys():
    file_list = dates_dict[key]
    min_lst = 1000*np.ones((2816, 2805))
    max_lst = -1000*np.ones((2816, 2805))
    for v in file_list:
        hdf_ds = h5py.File(os.path.join(in_dir, v))
        lst = np.array(hdf_ds['LST'])[0, :, :]
        hdf_ds = gdal.Open(os.path.join(in_dir, v))
        metadata = hdf_ds.GetMetadata_Dict()
        lst = lst.astype('Float32')
        max_lst = np.maximum(max_lst, lst)
        lst[lst == -999] = 999
        min_lst = np.minimum(min_lst, lst)
        min_lst[min_lst == 999] = -999
    transform = (0, 1, 0, 0, 0, -1)
    proj = None
    nodatavalue = -999
    tiffileDate = v[6:15]
    MinName = 'MIN' + v[0:2] + str.lower(tiffileDate) + '.tif'
    MaxName = 'MAX' + v[0:2] + str.lower(tiffileDate) + '.tif'
    arrayToTif(max_lst, os.path.join(out_dir, MaxName), proj, transform, nodatavalue)
    arrayToTif(min_lst, os.path.join(out_dir, MinName), proj, transform, nodatavalue)
    del lst
    del min_lst
    del max_lst
Second program
The second program reads the .tiff files into ndarrays, collects the maximum temperature for a particular point, and predicts the next 5 days.
import pandas as pd
import seaborn as sns
import matplotlib
import numpy as np
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from numpy import genfromtxt
import csv
import datetime
from datetime import datetime
import time
from matplotlib import pyplot
import matplotlib.pylab as plt
from pandas import Series
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller, acf, pacf
import subprocess
import gdal, osr
from gdalconst import *
import os
from PIL import Image
import scipy.misc
# import timeseries as ts

count = 1
max_temp = []
min_temp = []
filename = []
filenamer = []
max_temp_points = []
min_temp_points = []
source = r'/Volumes/bhargav 1/data/NEW_MAX'

# Read the max-temperature value at one pixel from every TIFF
for root, dirs, filenames in os.walk(source):
    print(filenames)
    for f in filenames:
        print(f)
        dataset = gdal.Open(source + '//' + f, gdal.GA_ReadOnly)
        #print(dataset)
        geotransform = dataset.GetGeoTransform()
        band = dataset.GetRasterBand(1)
        data = band.ReadAsArray(0, 0, dataset.RasterXSize, dataset.RasterYSize).astype(np.float64)
        #print(np.histogram(data, bins=500))
        print(np.shape(data))
        max_temp_point = data[793][1160]
        max_temp_point = max_temp_point - 273
        print(max_temp_point)
        print("Count:", count)
        max_temp_points.append(max_temp_point)
        count = count + 1

print(np.shape(max_temp_points))
print(np.mean(max_temp_points))
count = 1
np.save("Max_temp_points_1", max_temp_points)

# Fit an ARIMA model and forecast the next days
X = max_temp_points
model = ARIMA(X, order=(5, 0, 4))
model_fit = model.fit(disp=-1)
# print summary of fit model
print(model_fit.summary())
forecast = model_fit.predict()
print(forecast)

# plot
start_index = len(X)
end_index = start_index + 6
predict_val = model_fit.predict(start=start_index, end=end_index)
print('Prediction:', predict_val)
pyplot.plot(X)
pyplot.plot(forecast, color='red')
pyplot.show()
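Since the title asks how to turn these into mapper and reducer programs, for reference: Hadoop Streaming runs any executables that read records from stdin and write key/value lines to stdout. Below is a minimal hypothetical skeleton of that shape; the file names, CSV input format, and date key are assumptions for illustration, not taken from the programs above.
# mapper.py (hypothetical): emit "date<TAB>temperature" per input record
import sys
for line in sys.stdin:
    date, temp = line.strip().split(',')  # assumed CSV records: date,temp
    print('{}\t{}'.format(date, temp))

# reducer.py (hypothetical): keep the maximum temperature per date
import sys
current_date, current_max = None, float('-inf')
for line in sys.stdin:
    date, temp = line.strip().split('\t')
    if date != current_date:
        if current_date is not None:
            print('{}\t{}'.format(current_date, current_max))
        current_date, current_max = date, float(temp)
    else:
        current_max = max(current_max, float(temp))
if current_date is not None:
    print('{}\t{}'.format(current_date, current_max))
Such a pair would be launched with the hadoop-streaming jar (-mapper mapper.py -reducer reducer.py -input ... -output ...); the ARIMA forecasting step would typically run outside the job on the reducer output.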

Dask delayed + Matplotlib.savefig() -> FAIL

My goal is to produce multiple PNG files from multiple numpy arrays, loaded from medical images on my HD.
To make things quicker, I'm using dask.delayed.
Here's my working code:
import os.path
from glob import glob
import nibabel as nib
import numpy as np
from dask import delayed

def process(data):
    # Need to have the import inside so that multiprocessing works.
    # Apparently doesn't solve the issue anyway..
    import matplotlib.pyplot as plt
    outpath = '/Users/user/outputdir/'
    name = os.path.basename(data.get_filename())
    savename = name[:name.index('.')] + '.png'
    plt.imshow(np.rot90(data.get_data()[15:74, 6:82, 18, 0]),
               extent=[0, 1, 0, 1], aspect=1.28, cmap='gray')
    plt.axis('off')
    out = os.path.join(outpath, savename)
    plt.savefig(out)
    plt.close()
    return out

L = []
for fn in glob("/Users/user/imagefiles/mb*.nii.gz"):
    nifti = delayed(nib.load)(fn)
    outpng = delayed(process)(nifti)
    L.append(outpng)

results = delayed(print)(L)
results.compute()
My problem is that after each run some of the output images are empty (nothing in the PNG), and which images are empty seems pretty random, since all the input data is valid.
I suspect this is a problem with multiprocessing and matplotlib, as seen in other related threads.
Does anyone have a suggestion on how to get this working with dask?
EDIT: Minimal working example
import os.path
import random
import string
import numpy as np
from dask import delayed

def gendata(fn):
    return

def process(data):
    # Need to have the import inside so that multiprocessing works.
    import matplotlib.pyplot as plt
    outpath = '/Users/user/Pictures/test/'
    name = ''.join(random.choices(string.ascii_lowercase, k=10))
    savename = name + '.png'
    data = np.random.randint(0, 255, size=(100, 100, 20, 2))
    plt.imshow(np.rot90(data[15:74, 6:82, 18, 0]),
               extent=[0, 1, 0, 1], aspect=1.28, cmap='gray')
    plt.axis('off')
    out = os.path.join(outpath, savename)
    plt.savefig(out)
    plt.close()
    return out

L = []
for fn in range(0, 10):
    nifti = delayed(gendata)(fn)
    outpng = delayed(process)(nifti)
    L.append(outpng)

results = delayed(print)(L)
results.compute()
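One workaround worth trying: pyplot keeps global state that is not thread-safe under dask's default threaded scheduler, so build each figure through matplotlib's object-oriented API with a private Agg canvas instead. A minimal sketch of process rewritten that way (the slicing, aspect, and output path are the question's own; the fixed file name is an assumption):
def process(data):
    # Sketch: avoid pyplot entirely; a private Figure + Agg canvas holds
    # no global state, so concurrent workers cannot clobber each other.
    from matplotlib.figure import Figure
    from matplotlib.backends.backend_agg import FigureCanvasAgg
    import numpy as np
    import os.path
    outpath = '/Users/user/Pictures/test/'
    arr = np.random.randint(0, 255, size=(100, 100, 20, 2))
    fig = Figure()
    FigureCanvasAgg(fig)  # attach an Agg canvas so savefig can render
    ax = fig.add_subplot(111)
    ax.imshow(np.rot90(arr[15:74, 6:82, 18, 0]),
              extent=[0, 1, 0, 1], aspect=1.28, cmap='gray')
    ax.set_axis_off()
    out = os.path.join(outpath, 'demo.png')
    fig.savefig(out)
    return out
Alternatively, forcing processes instead of threads (results.compute(scheduler='processes')) is another commonly suggested mitigation, since each worker process then gets its own matplotlib state.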
