Convert .png file to pandas DataFrame - python

I have a 512x512 .png radar image named png_image.
I also have a CSV, rainfall_colour_table, whose B, G and R columns link colour values to rainfall intensity values in mm.
What is the best way to read png_image and convert it to a 512x512 pandas DataFrame where the values are the rainfall intensities from rainfall_colour_table?
I have used the following approach:
from urllib.request import urlopen, Request
import cv2
import os
import numpy as np
import pandas as pd
# NOTE: the ftp source of the png changes every 30 min
# get any current .png file name from the website ftp://ftp.bom.gov.au/anon/gen/radar//
# and set the below variable
file_name =
# get connection to file
input_url = "ftp://ftp.bom.gov.au/anon/gen/radar//" + file_name
req = Request(input_url)
req_html = urlopen(req).read()
# read file
radar_image = np.frombuffer(req_html, np.uint8) # read byte image (np.fromstring is deprecated)
radar_image = cv2.imdecode(radar_image, cv2.IMREAD_COLOR) # convert to numpy array
# OS agnostic relative file path
# get the current directory path
base_dir = os.path.dirname(__file__)
# load colour to mm/hr conversion table
rainfall_colour_table = os.path.join(base_dir, 'sample_data', 'radar_colours.csv')
rainfall_colour_df = pd.read_csv(rainfall_colour_table)
rainfall_colour_df.set_index(['B', 'G', 'R'], inplace=True)
# switch colours with rain intensity
radar_df = pd.DataFrame(rainfall_colour_df.loc[list(map(tuple, pin))].rainfall.values for pin in radar_image)
radar_df.columns = ['pixel_col_' + str(col) for col in radar_df.columns]
The contents of rainfall_colour_table.csv:
colour_id,rainfall,B,G,R
2,1.5,255,180,180
3,2.5,255,120,120
4,4,255,20,20
5,6,195,216,0
6,10,144,150,0
7,15,102,102,0
8,20,0,255,255
9,35,0,200,255
10,50,0,150,255
11,80,0,100,255
12,120,0,0,255
13,200,0,0,200
14,300,0,0,120
15,360,0,0,40
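For reference, a per-pixel dictionary lookup is a simple way to do the colour-to-rainfall swap. This is a sketch, not a tested answer; it assumes radar_image and rainfall_colour_df have been built as above, and that pixels whose colour is not in the table (background, map outlines) should read as 0.0 mm:
import numpy as np
import pandas as pd
# assumes radar_image (512x512x3 BGR uint8) and rainfall_colour_df
# (indexed by B, G, R) exist as built above
colour_to_mm = rainfall_colour_df['rainfall'].to_dict()  # {(B, G, R): rainfall}
pixels = radar_image.reshape(-1, 3)  # one row per pixel
rainfall = np.array([colour_to_mm.get(tuple(p), 0.0) for p in pixels])
# reshape back to 512x512 and wrap in a DataFrame
radar_df = pd.DataFrame(rainfall.reshape(radar_image.shape[:2]))
radar_df.columns = ['pixel_col_' + str(col) for col in radar_df.columns]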

Related

Python : Get features of several images

I would like to get the features of several images located in the same folder.
My code is as follows. Prerequisites (libraries needed):
import numpy as np
from PIL import Image
import glob
import cv2
import os
Definition of the folder where the images are located (around 6000):
images_dir = "TrainImages"
Creation of a function that defines the different variables and computes them:
def get_data_from_image(image_path):
    cv_img = cv2.imread(image_path)
    (means, stds) = cv2.meanStdDev(cv_img)
    stats = np.concatenate([means, stds]).flatten()
    image_features_list = [stats.tolist()]
    return image_features_list
Creation of a variable that scans the folder and collects the image paths:
image_files = [x.path for x in os.scandir(images_dir)]
Creation of a loop:
i = 0
mylist = []
for i in range(4):  # I test only 4 images, could be more
    mylist.append(get_data_from_image(image_files[i]))
Running the stuff
image_features_list = get_data_from_image(image_files[i])
Look at the output
image_features_list
The output provides only the features of one image, instead of all the images located in the folder.
[Out]:
[[114.31548828125001,
139.148388671875,
139.57832682291667,
50.54138521536725,
53.82290182999255,
51.946187641459595]]
I would be grateful for a solution on how to get the features of all images (not only one). To that end, do not hesitate to correct the code.
Thanks and kindest regards
After some comments from friendly people, here is additional information for those who would be interested in the answer: the output to look at is mylist.
mylist
[Out]:
[[[144.28788548752834,
151.28145691609978,
148.6195351473923,
51.50620316379085,
53.36979275398226,
52.2493589172815]],
[[56.220865079365076,
59.99653968253968,
60.28386507936508,
66.72797279655177,
65.24673515467009,
64.93141350917332]],
[[125.2066064453125,
118.1168994140625,
145.0827685546875,
68.95463582009148,
52.65138276425348,
56.68269683130363]],
[[114.31548828125001,
139.148388671875,
139.57832682291667,
50.54138521536725,
53.82290182999255,
51.946187641459595]]]
Thanks for your help. It is a great forum here!
Try this approach and tell me if it's successful:
import os, os.path
import numpy as np
from PIL import Image
import cv2

def get_data_from_image(image_path):
    cv_img = cv2.imread(image_path)
    (means, stds) = cv2.meanStdDev(cv_img)
    stats = np.concatenate([means, stds]).flatten()
    image_features_list = [stats.tolist()]
    return image_features_list

images_dir = 'C:\\Users\\User\\Directory\\TrainImages\\'
images_names = []
with os.scandir(images_dir) as dirs:
    for entry in dirs:
        images_names.append(entry.name)

for image in images_names:
    path = images_dir + image
    image_features_list = get_data_from_image(path)
    print(image_features_list)
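If you want all the features in a single structure rather than printed one by one, a small variation of the same loop (a sketch reusing the helper above) collects them into a list:
all_features = []
for image in images_names:
    path = images_dir + image
    all_features.append(get_data_from_image(path)[0])  # unwrap the inner one-element list
# all_features now holds one row of 3 means + 3 standard deviations per image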

From numpy array to DICOM

My code reads a DICOM file, takes the pixel information into a numpy array, then modifies the numpy array. It uses lists because I'm trying to operate on multiple DICOM files at the same time.
I haven't found any information on how to take my modified numpy array and make it a DICOM file again so I can use it outside Python.
#IMPORT
import cv2
import numpy as np
import matplotlib.pyplot as plt
import SimpleITK as sitk
from glob import glob
import pydicom as dicom

data_path = "C:\\Users\\oliva\\Desktop\\Py tesis\\dicom\\"
output_path = working_path = "C:\\Users\\oliva\\Desktop\\Py tesis\\dicom1\\"
path = glob(data_path + '/*.dcm')

#Checks if we are in the correct path
print("Total of %d DICOM images.\nFirst 5 filenames:" % len(path))
print('\n'.join(path[:5]))

data_set = []
for element in path:
    imagen = sitk.ReadImage(element)
    #imagen = cv2.imread(element)
    array_imagen = sitk.GetArrayViewFromImage(imagen)
    array2_imagen = array_imagen[0]
    imagen_array_norm = np.uint8(cv2.normalize(array2_imagen, None, 0, 255, cv2.NORM_MINMAX))
    data_set.append(imagen_array_norm)
#Check
print(len(data_set))
print(type(data_set[1]))
plt.imshow(data_set[4], cmap=plt.cm.gray)
#Equalization
data_set_eq = equal(data_set)
print(len(data_set_eq))
print(type(data_set_eq[6]))
plt.imshow(data_set_eq[7], cmap=plt.cm.gray)
#Filtering
data_set_m = median(data_set)
print(len(data_set_m))
print(type(data_set_m[6]))
plt.imshow(data_set_m[8], cmap=plt.cm.gray)
#Functions
def equal(data):
    data_set_eq = []
    for element in data:  # iterate over the parameter, not the global data_set
        imagen_array_eq = cv2.equalizeHist(element)
        data_set_eq.append(imagen_array_eq)
    return data_set_eq

def median(data):
    data_set_m = []
    for element in data:
        imagen_array_m = cv2.medianBlur(element, 5)
        data_set_m.append(imagen_array_m)
    return data_set_m
I would like some enlightenment on how to produce a DICOM file from my modified numpy array.
You can convert the numpy array back to a SimpleITK image, and then write it out as Dicom. The code would look something like this:
for i, x in enumerate(data_set):
    img = sitk.GetImageFromArray(x)
    sitk.WriteImage(img, "your_image_name_here_%d.dcm" % i)  # a distinct name per image, so files aren't overwritten
From the file name suffix, SimpleITK knows to write Dicom.
Note that the filtering you are doing can be accomplished within SimpleITK. You don't really need to use OpenCV. Check out the following filters in SimpleITK: IntensityWindowingImageFilter, AdaptiveHistogramEqualizationImageFilter, and MedianImageFilter.
https://itk.org/SimpleITKDoxygen/html/classitk_1_1simple_1_1IntensityWindowingImageFilter.html
https://itk.org/SimpleITKDoxygen/html/classitk_1_1simple_1_1AdaptiveHistogramEqualizationImageFilter.html
https://itk.org/SimpleITKDoxygen/html/classitk_1_1simple_1_1MedianImageFilter.html
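If you also need to keep the original DICOM header (patient and acquisition tags), pydicom, which you already import, is another option. A sketch under placeholder names ('source.dcm', 'modified.dcm') and a hypothetical modified_array standing for one of your processed arrays:
import pydicom as dicom

# read the original file so its header tags are preserved
ds = dicom.dcmread('source.dcm')
# swap in the modified pixels; the array dtype has to agree with the
# header's BitsAllocated, so after the uint8 normalisation above those
# header tags may need updating too
ds.PixelData = modified_array.tobytes()
ds.Rows, ds.Columns = modified_array.shape
ds.save_as('modified.dcm')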

How to convert Image files to CSV with label

The Test folder has folders named 0 to 9. The 0-9 folders contain the respective handwritten digit images. I want to convert the images to a single test.csv file such that the first column gives the label of the digit (i.e. 0-9) and the remaining columns give the pixel values of the image.
I created the csv, but the first column for the label comes out empty.
import numpy as np
import pandas as pd
import os
import imageio
import glob

root = './test'
# go through each directory in the root folder given above
for directory, subdirectories, files in os.walk(root):
    # go through each file in that directory
    for file in files:
        # read the image file and extract its pixels
        # (imageio.imread replaces the removed scipy.misc.imread)
        im = imageio.imread(os.path.join(directory, file))
        value = im.flatten()
        value = np.hstack((directory[8:], value))
        df = pd.DataFrame(value).T
        df = df.sample(frac=1)  # shuffle the dataset
        with open('test.csv', 'a') as dataset:
            df.to_csv(dataset, header=False, index=False)
import numpy as np
import pandas as pd
import imageio
import pathlib

v = []
for i, files in enumerate(pathlib.Path('./Train').glob('*/**/*.png')):
    # (imageio.imread replaces the removed scipy.misc.imread)
    im = imageio.imread(files.as_posix())
    value = im.flatten()
    value = np.hstack((int(files.parent.name), value))  # the label is the digit folder's name
    v.append(value)

df = pd.DataFrame(v)
df = df.sample(frac=1)  # shuffle the dataset
df.to_csv('train.csv', header=False, index=False)
This is how I corrected my code.
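A quick sanity check (a sketch, assuming train.csv was written as above) that the first column now carries the labels:
import pandas as pd

# read the CSV back without a header row; column 0 holds the digit labels
df = pd.read_csv('train.csv', header=None)
print(df[0].value_counts())  # expect one count per digit 0-9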

loop error in converting shapefiles to raster for entire folder set

My code creates a raster from a single shapefile. I have now tried to get it to loop through all the shapefiles in a particular folder, but I am getting an error in the loop. Can someone please take a look?
This is the error I get:
RuntimeError: not a string.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 14 14:17:53 2018
#author: me
"""
from osgeo import ogr, gdal
import subprocess
import os
#change directory
os.chdir('/Users/SpatialDataET')
#Name of folder containing all shapefiles to be transformed
folder = 'region_shapes'
#Accesses all shapefiles in the folder (even if there are 100 or 1000 shapefiles)
shapefiles = [folder + '/' + file for file in os.listdir(folder) if 'shp' in file]
#creates object/folder for storing rasterized version (fills in later)
OutputImages = 'Imagefolder'
#Create an output directory (puts the new geotiffs into a separate folder) if none exists
if not os.path.exists(OutputImages):
    os.mkdir(OutputImages)
#reference with which to grab resolution (x/y spacing, projection and geotransformation)
RefImage = '/Users/ETa_CMRSET_mm-month-1_monthly_2000.01.01.tif'
gdalformat = 'GTiff'
datatype = gdal.GDT_Byte
burnVal = 1 #value for the output image pixels
# Get projection info from reference image
Image = gdal.Open(RefImage, gdal.GA_ReadOnly)
for i in shapefiles:
    shapefiles[i][-9:-3] = ogr.Open(shapefiles)
    Shapefile_layer = Shapefile.GetLayer()
    # Rasterize
    print("Rasterising shapefile...")
    Output = gdal.GetDriverByName(gdalformat).Create(OutputImages, Image.RasterXSize, Image.RasterYSize, 1, datatype, options=['COMPRESS=DEFLATE'])
    Output.SetProjection(Image.GetProjectionRef())
    Output.SetGeoTransform(Image.GetGeoTransform())
    # Write data to band 1
    Band = Output.GetRasterBand(1)
    Band.SetNoDataValue(0)
    gdal.RasterizeLayer(Output, [1], Shapefile_layer, burn_values=[burnVal])
    # Close datasets
    Band = None
    Output = None
    Shapefile = None
Image = None
# Build image overviews
subprocess.call("gdaladdo --config COMPRESS_OVERVIEW DEFLATE " + OutputImages + " 2 4 8 16 32 64", shell=True)
print("Done.")
Your loop should start like this:
for shapefile in shapefiles:
    ds = ogr.Open(shapefile)
    ds_layer = ds.GetLayer()
I renamed it to ds since it will no longer be a shapefile but an ogr data source once loaded with ogr.
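For completeness, a fuller sketch of the corrected loop (assuming Image, OutputImages, gdalformat, datatype and burnVal are defined as above). It also builds a distinct output path per shapefile, since the posted code passes the folder name OutputImages to Create() and would write every raster to the same destination:
for shapefile in shapefiles:
    ds = ogr.Open(shapefile)
    ds_layer = ds.GetLayer()
    # e.g. region_shapes/foo.shp -> Imagefolder/foo.tif
    out_name = os.path.splitext(os.path.basename(shapefile))[0] + '.tif'
    out_path = os.path.join(OutputImages, out_name)
    output = gdal.GetDriverByName(gdalformat).Create(out_path, Image.RasterXSize, Image.RasterYSize, 1, datatype, options=['COMPRESS=DEFLATE'])
    output.SetProjection(Image.GetProjectionRef())
    output.SetGeoTransform(Image.GetGeoTransform())
    band = output.GetRasterBand(1)
    band.SetNoDataValue(0)
    gdal.RasterizeLayer(output, [1], ds_layer, burn_values=[burnVal])
    # close the per-shapefile datasets
    band = None
    output = None
    ds = None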

OpenCV and Content Based Image retrieval - Is there a way to work with an online database of images without downloading them

I'm trying to build a CBIR system and recently wrote a program in Python using OpenCV functions that lets me query a local database of images and return a result (followed this tutorial). I now need to link this up with another web scraping module (built with Scrapy) that outputs ~1000 links to images online. These images are scattered throughout the web and should be input to the first OpenCV module. Is it possible to perform calculations on this online image set without downloading it?
These are the steps I followed for the OpenCV module
1) Define the region-based color image descriptor
2) Extract features from dataset (Indexing) (dataset to be passed as command line argument)
# import the necessary packages
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
from colordescriptor import ColorDescriptor
import argparse
import glob
import cv2
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True,
                help="Path to the directory that contains the images to be indexed")
ap.add_argument("-i", "--index", required=True,
                help="Path to where the computed index will be stored")
args = vars(ap.parse_args())
# initialize the color descriptor
cd = ColorDescriptor((8, 12, 3))
# open the output index file for writing
output = open(args["index"], "w")
# use glob to grab the image paths and loop over them
for imagePath in glob.glob(args["dataset"] + "/*.jpg"):
    # extract the image ID (i.e. the unique filename) from the image
    # path and load the image itself
    imageID = imagePath[imagePath.rfind("/") + 1:]
    image = cv2.imread(imagePath)
    # describe the image
    features = cd.describe(image)
    # write the features to file
    features = [str(f) for f in features]
    output.write("%s,%s\n" % (imageID, ",".join(features)))
# close the index file
output.close()
3) Defining the similarity metric
# import the necessary packages
import numpy as np
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import csv

class Searcher:
    def __init__(self, indexPath):
        # store our index path
        self.indexPath = indexPath

    def search(self, queryFeatures, limit=5):
        # initialize our dictionary of results
        results = {}
        # open the index file for reading
        with open(self.indexPath) as f:
            # initialize the CSV reader
            reader = csv.reader(f)
            # loop over the rows in the index
            for row in reader:
                # parse out the image ID and features, then compute the
                # chi-squared distance between the features in our index
                # and our query features
                features = [float(x) for x in row[1:]]
                d = self.chi2_distance(features, queryFeatures)
                # now that we have the distance between the two feature
                # vectors, we can update the results dictionary -- the
                # key is the current image ID in the index and the
                # value is the distance we just computed, representing
                # how 'similar' the image in the index is to our query
                results[row[0]] = d
        # sort our results, so that the smaller distances (i.e. the
        # more relevant images) are at the front of the list
        results = sorted([(v, k) for (k, v) in results.items()])
        # return our (limited) results
        return results[:limit]

    def chi2_distance(self, histA, histB, eps=1e-10):
        # compute the chi-squared distance
        d = 0.5 * np.sum([((a - b) ** 2) / (a + b + eps)
                          for (a, b) in zip(histA, histB)])
        # return the chi-squared distance
        return d
4) Perform the actual search
# import the necessary packages
from colordescriptor import ColorDescriptor
from searcher import Searcher
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import argparse
import cv2

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--index", required=True,
                help="Path to where the computed index will be stored")
ap.add_argument("-q", "--query", required=True,
                help="Path to the query image")
ap.add_argument("-r", "--result-path", required=True,
                help="Path to the result path")
args = vars(ap.parse_args())

# initialize the image descriptor
cd = ColorDescriptor((8, 12, 3))
# load the query image and describe it
query = cv2.imread(args["query"])
features = cd.describe(query)
# perform the search
searcher = Searcher(args["index"])
results = searcher.search(features)
# display the query
cv2.imshow("Query", query)
# loop over the results
for (score, resultID) in results:
    # load the result image and display it
    result = cv2.imread(args["result_path"] + "/" + resultID)
    cv2.imshow("Result", result)
    cv2.waitKey(0)
And the final command line command is:
python search.py --index index.csv --query query.png --result-path dataset
where index.csv is the file generated after step 2 on the database of images. query.png is my query image and dataset is the folder containing the ~100 images.
So is it possible to modify the indexing so that I don't need a local dataset, and the querying and indexing can be done directly from the list of URLs?
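One way (a sketch, not the tutorial's code): fetch each image into memory and decode it with cv2.imdecode, so the indexing works straight from the scraped links without saving files. Here urls is a hypothetical list of the ~1000 image links from the Scrapy module, and Python 3's urllib.request is used even though the code above targets Python 2:
import urllib.request
import numpy as np
import cv2
from colordescriptor import ColorDescriptor

cd = ColorDescriptor((8, 12, 3))
urls = []  # hypothetical: fill with the image links output by the scraper
with open('index.csv', 'w') as output:
    for url in urls:
        # download the raw bytes and decode them in memory
        data = urllib.request.urlopen(url).read()
        image = cv2.imdecode(np.frombuffer(data, np.uint8), cv2.IMREAD_COLOR)
        if image is None:
            continue  # skip links that do not decode to an image
        features = [str(f) for f in cd.describe(image)]
        # use the URL itself as the image ID in the index
        output.write("%s,%s\n" % (url, ",".join(features)))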
