How to convert a CSV table into COCO format in Python?

I have a CSV table with the following columns:
column_names = ['image_id', 'xmin', 'ymin', 'width', 'height', 'xmax','ymax']
where xmin, ymin, xmax and ymax represent the bounding box that encloses some object; width and height, the image dimensions; and image_id, the file name (.JPG file). Since I want to do object detection, I need to convert this table into COCO format. Amazingly enough, I can't find any answer to this question on the internet.

I had the same issue before, and then I found this code, which is very helpful. You will need to change the column names to the following and update the CSV file:
column_names = ['filename', 'class', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']
Then try this code:
import numpy as np
import json
import pandas as pd

path = 'annotations.csv'          # the path to the CSV file
save_json_path = 'traincoco.json'

data = pd.read_csv(path)

images = []
categories = []
annotations = []

category = {}
category["supercategory"] = 'none'
category["id"] = 0
category["name"] = 'None'
categories.append(category)

data['fileid'] = data['filename'].astype('category').cat.codes
data['categoryid'] = pd.Categorical(data['class'], ordered=True).codes
data['categoryid'] = data['categoryid'] + 1
data['annid'] = data.index

def image(row):
    image = {}
    image["height"] = row.height
    image["width"] = row.width
    image["id"] = row.fileid
    image["file_name"] = row.filename
    return image

def category(row):  # rebinds the name 'category'; the placeholder dict above was already appended
    category = {}
    category["supercategory"] = 'None'
    category["id"] = row.categoryid
    category["name"] = row[2]  # row[2] is the 'class' column ('class' is a Python keyword, so it can't be accessed as an attribute)
    return category

def annotation(row):
    annotation = {}
    area = (row.xmax - row.xmin) * (row.ymax - row.ymin)
    annotation["segmentation"] = []
    annotation["iscrowd"] = 0
    annotation["area"] = area
    annotation["image_id"] = row.fileid
    annotation["bbox"] = [row.xmin, row.ymin, row.xmax - row.xmin, row.ymax - row.ymin]
    annotation["category_id"] = row.categoryid
    annotation["id"] = row.annid
    return annotation

for row in data.itertuples():
    annotations.append(annotation(row))

imagedf = data.drop_duplicates(subset=['fileid']).sort_values(by='fileid')
for row in imagedf.itertuples():
    images.append(image(row))

catdf = data.drop_duplicates(subset=['categoryid']).sort_values(by='categoryid')
for row in catdf.itertuples():
    categories.append(category(row))

data_coco = {}
data_coco["images"] = images
data_coco["categories"] = categories
data_coco["annotations"] = annotations

json.dump(data_coco, open(save_json_path, "w"), indent=4)
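If you want to sanity-check the resulting file, it can be loaded back with pycocotools (an optional verification sketch, assuming pycocotools is installed; it is not part of the conversion itself):
from pycocotools.coco import COCO

coco = COCO(save_json_path)                  # parses the JSON and builds an index
print(len(coco.getImgIds()), 'images')       # should equal the number of unique filenames
print(len(coco.getAnnIds()), 'annotations')  # should equal the number of CSV rows
print(coco.loadCats(coco.getCatIds()))       # the category entries, including the placeholder id 0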


Blank Strings Are Returned in Python Dataframe

I wrote code to convert a PDF to CSV, read the CSV file, and export only the relevant information from it. The function is supposed to return filtered information such as english_name: 'someones name', original_language_name: 'someones name', etc., but instead it returned english_name: '', original_language_name: '', etc. Below is the code that I wrote:
import pandas as pd
import tabula
from pandas import DataFrame
from backend.classes import Shareholder, Officer
from typing import List

def strip_string(string):
    return str(string).strip()

def get_float_without_thousands_separator(string, thousands_separator):
    return float(string.replace(thousands_separator, ''))

def extract_officers_and_shareholders_lists_from_df(df, total_number_of_shares, no_data_placeholder, number_of_shares, thousands_separator):
    officers = []
    shareholders = []
    NAME = 'Nama'
    POSITION = 'Jabatan'
    for row in range(df.shape[0]):
        if str(df[POSITION][row]).strip() != no_data_placeholder:
            original_language_name = strip_string(df[NAME][row])
            english_name = strip_string(df[NAME][row])
            position = strip_string(df[POSITION][row])
            officer = Officer(english_name=english_name, original_language_name=original_language_name, position=position)
            officers.append(officer)
        elif str(df[number_of_shares][row]).strip() != no_data_placeholder:
            original_language_name = strip_string(df[NAME][row])
            english_name = strip_string(df[NAME][row])
            number_of_shares_string = strip_string(df[number_of_shares][row])
            number_of_shares_number = get_float_without_thousands_separator(number_of_shares_string, thousands_separator)
            shareholding_percentage = (number_of_shares_number / total_number_of_shares) * 100
            shareholder = Shareholder(english_name=english_name, original_language_name=original_language_name, shareholding_percentage=shareholding_percentage)
            shareholders.append(shareholder)
    return officers, shareholders

def get_officers_and_shareholders_lists(pdf_input_file):
    NO_DATA_PLACEHOLDER = '-'
    NUMBER_OF_SHARES = 'Jumlah Lembar Saham'
    THOUSANDS_SEPARATOR = '.'
    output_file_path = 'CSV/Officers_and_Shareholders.csv'
    tabula.convert_into(pdf_input_file, output_file_path, output_format='csv', pages='all')
    df = pd.read_csv(output_file_path, header=3, on_bad_lines='skip')
    all_shares = df[NUMBER_OF_SHARES].to_list()
    all_shares_strings = [strip_string(shares) for shares in all_shares if strip_string(shares) != NO_DATA_PLACEHOLDER]
    all_shares_numbers = [get_float_without_thousands_separator(shares, THOUSANDS_SEPARATOR) for shares in all_shares_strings]
    total_number_of_shares = sum(all_shares_numbers)
    return extract_officers_and_shareholders_lists_from_df(
        df=df,
        total_number_of_shares=total_number_of_shares,
        number_of_shares=NUMBER_OF_SHARES,
        no_data_placeholder=NO_DATA_PLACEHOLDER,
        thousands_separator=THOUSANDS_SEPARATOR)
I run the code above with python3 -m backend.officers_and_shareholders. What do I need to change so that english_name and original_language_name return actual names instead of empty strings?
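A quick way to narrow this down is to inspect what read_csv actually produces before any filtering. This is only a suggested diagnostic sketch, reusing the output path and header setting from the code above:
import pandas as pd

df = pd.read_csv('CSV/Officers_and_Shareholders.csv', header=3, on_bad_lines='skip')
print(df.columns.tolist())  # check whether 'Nama' and 'Jabatan' survived header=3
print(df.head(10))          # check whether the name cells are empty strings, NaN, or shifted into other columns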

pylucene cannot find a word that is present in a text that was indexed earlier

I use pylucene 9.4.1 to index a document and I just noticed a weird problem. There are some words, e.g. 'baby', that are present in the document, but pylucene is unable to find them in the index.
This is my code to index the document (the document can be downloaded from here):
# (assumes the usual PyLucene imports and lucene.initVM() have already been run)
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'

def indexDocument(title, year, plot):
    ft = FieldType()
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    doc = document.Document()
    doc.add(document.Field("Title", title, ft))
    doc.add(document.Field("Plot", plot, ft))
    writer.addDocument(doc)

def CloseWriter():
    writer.close()

def makeInvertedIndex(file_path):
    df = pd.read_csv(file_path)
    print(df.columns)
    docid = 0
    for i in df.index:
        print(docid, '-', df['Title'][i])
        indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
        docid += 1

indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
keyword = 'baby'
fieldname = 'Title'
result = list()
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2,0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))
Any help to understand the problem is appreciated.
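One thing worth checking (a hedged guess, not a confirmed fix): the index is written with EnglishAnalyzer, which stems terms, while the search uses StandardAnalyzer, which does not, so a query for 'baby' may not match the stemmed term that was actually stored. A minimal sketch that parses the query with the same analyzer used at index time, reusing the objects from the search snippet above:
analyzer = EnglishAnalyzer()  # same analyzer as the IndexWriterConfig above
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
print('#-docs:', searcher.count(query))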

Retrieving data from the Air Quality Index (AQI) website through the API and only receiving a small number of stations

I'm working on a personal project and I'm trying to retrieve air quality data from the https://aqicn.org website using their API.
I've used this code, which I've copied and adapted for the city of Bucharest as follows:
import pandas as pd
import folium
import requests

# GET data from AQI website through the API
base_url = "https://api.waqi.info"
path_to_file = "~/path"

# Got token from:- https://aqicn.org/data-platform/token/#/
with open(path_to_file) as f:
    contents = f.readlines()
key = contents[0]

# (lat, lon) -> bottom left, (lat, lon) -> top right
latlngbox = "44.300264,25.920181,44.566991,26.297836"  # For Bucharest
trail_url = f"/map/bounds/?token={key}&latlng={latlngbox}"
my_data = pd.read_json(base_url + trail_url)  # Joined parts of URL
print('columns->', my_data.columns)  # 2 cols: 'status' and 'data' (JSON)

### Built a dataframe from the json file
all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'],
                     each_row['lat'],
                     each_row['lon'],
                     each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])

# Cleaned the DataFrame
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce')  # Invalid parsing to NaN
# Remove NaN entries in col
df1 = df.dropna(subset=['aqi'])
Unfortunately, it only retrieves 4 stations, whereas there are many more available on the actual site. The only limitation I saw in the API documentation was "1,000 (one thousand) requests per second", so why can't I get more of them?
Also, I've tried to modify the lat-long values and managed to get more stations, but they were outside the city I was interested in.
Here is a view of the actual perimeter I've used in the embedded code.
If you have any suggestions as of how I can solve this issue, I'd be very happy to read your thoughts. Thank you!
Try using waqi through aqicn... not exactly a clean API, but I found it to work quite well.
import pandas as pd
import folium
from folium.plugins import HeatMap  # needed for the HeatMap layer used below

url1 = 'https://api.waqi.info'
# Get token from:- https://aqicn.org/data-platform/token/#/
token = 'XXX'
box = '113.805332,22.148942,114.434299,22.561716'  # polygon around Hong Kong via bboxfinder.com
url2 = f'/map/bounds/?latlng={box}&token={token}'
my_data = pd.read_json(url1 + url2)

all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'], each_row['lat'], each_row['lon'], each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])
From there it's easy to plot:
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce')
print('with NaN->', df.shape)
df1 = df.dropna(subset=['aqi'])
df2 = df1[['lat', 'lon', 'aqi']]
init_loc = [22.396428, 114.109497]
max_aqi = int(df1['aqi'].max())
print('max_aqi->', max_aqi)

m = folium.Map(location=init_loc, zoom_start=5)
heat_aqi = HeatMap(df2, min_opacity=0.1, max_val=max_aqi,
                   radius=60, blur=20, max_zoom=2)
m.add_child(heat_aqi)
m
Or as such:
centre_point = [22.396428, 114.109497]
m2 = folium.Map(location=centre_point, tiles='Stamen Terrain', zoom_start=6)
for idx, row in df1.iterrows():
    lat = row['lat']
    lon = row['lon']
    station = row['station_name'] + ' AQI=' + str(row['aqi'])
    station_aqi = row['aqi']
    if station_aqi > 300:
        pop_color = 'red'
    elif station_aqi > 200:
        pop_color = 'orange'
    else:
        pop_color = 'green'
    folium.Marker(location=[lat, lon],
                  popup=station,
                  icon=folium.Icon(color=pop_color)).add_to(m2)
m2
Checking for stations within HK returns 19:
df[df['station_name'].str.contains('HongKong')]
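If the maps render as expected, they can also be written out as standalone HTML files for sharing (a small usage note; the file names here are just examples):
m.save('hk_aqi_heatmap.html')   # the HeatMap version
m2.save('hk_aqi_markers.html')  # the per-station marker version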

Read multiple csv files and write multiple netCDF files

I have the following Python code that works perfectly fine for converting a single .csv file into a netCDF file.
But I have multiple files (365), such as 'TRMM_1998_01_02_newntcl.csv', 'TRMM_1998_01_03_newntcl.csv', ... up to 'TRMM_1998_12_31_newntcl.csv'.
Can somebody help me write a loop over all the CSV files that creates 365 netCDF files using this code?
Any help is appreciated.
Thanks in advance.
import os
import datetime
import numpy as np
import netCDF4  # these imports are needed by the code below

def convert_file(filename):
    data = np.loadtxt(fname=filename, delimiter=',')
    # filename = "TRMM_{}_{}_{}_newntcl.csv".format(d.year,d.month,d.day)
    Lat_data = np.loadtxt('Latitude.csv', delimiter=',')
    Lon_data = np.loadtxt('Longitude.csv', delimiter=',')
    # create a netcdf Data object
    with netCDF4.Dataset('TEST_file.nc', mode="w", format='NETCDF4') as ds:
        # some file-level meta-data attributes:
        ds.Conventions = "CF-1.6"
        ds.title = 'precipitation'
        ds.institution = 'Institute'
        ds.author = 'Author'

        lat_arr = data[:, 0]     # the first column
        lon_arr = data[:, 1]     # the second column
        precip_arr = data[:, 2]  # the third column

        nlat = lat_arr.reshape((161, 321))
        nlon = lon_arr.reshape((161, 321))

        # ds.createDimension('time', 0)
        ds.createDimension('latitude', 161)
        ds.createDimension('longitude', 321)

        precip = ds.createVariable('precip', 'f4', ('latitude', 'longitude'))
        precip[:] = data[:, 2]
        ## adds some attributes
        precip.units = 'mm'
        precip.long_name = 'Precipitation'

        lat = ds.createVariable('lat', 'f4', ('latitude'))
        lat[:] = Lat_data[:]
        ## adds some attributes
        lat.units = 'degrees_South'
        lat.long_name = 'Latitude'

        lon = ds.createVariable('lon', 'f4', ('longitude'))
        lon[:] = Lon_data[:]
        ## adds some attributes
        lon.units = 'degrees_East'
        lon.long_name = 'Longitude'

        print ds
        # print filename

# load the data
path = 'C:\Users\.spyder2'
os.chdir(path)

d = datetime.date(1998, 01, 01)
while d.year == 1998:
    d += datetime.timedelta(days=1)
    convert_file("TRMM_{}_{}_{}_newntcl.csv".format(d.year, d.month, d.day))
It looks like you can use a datetime.date object to loop through all of the days in a year. First, you should put the code you have in a function that takes a filename. Then, you can just make a date object and call the function in a loop:
import datetime

d = datetime.date(1998, 1, 2)  # the files start on 1998_01_02
while d.year == 1998:
    # zero-pad month and day so the name matches files like TRMM_1998_01_02_newntcl.csv
    convert_file("TRMM_{}_{:02d}_{:02d}_newntcl.csv".format(d.year, d.month, d.day))
    d += datetime.timedelta(days=1)
If I read your question correctly, there is an easier way of using os in this case. You can just take in the file names and use them in a loop:
import os
import numpy as np
import netCDF4

main_fp = "C:\\Users\\spyder2"
path, dirs, files = os.walk(main_fp).next()

for f_path in files:
    data = np.loadtxt(os.path.join(path, f_path), delimiter=',')
    Lat_data = np.loadtxt('Latitude.csv', delimiter=',')  # put lat and long csv's in a separate folder, so you don't read them into the loop
    Lon_data = np.loadtxt('Longitude.csv', delimiter=',')

    # strip the .csv extension
    new_fname = os.path.splitext(f_path)[0]

    with netCDF4.Dataset(new_fname + '.nc', mode="w", format='NETCDF4') as ds:
        # some file-level meta-data attributes:
        ds.Conventions = "CF-1.6"
        ds.title = 'Non TC precipitation'
        ds.institution = 'AIR-Worldwide'
        ds.author = 'Dr. Dumindu Jayasekera'

        lat_arr = data[:, 0]     # the first column
        lon_arr = data[:, 1]     # the second column
        precip_arr = data[:, 2]  # the third column

        nlat = lat_arr.reshape((161, 321))
        nlon = lon_arr.reshape((161, 321))

        ds.createDimension('latitude', 161)
        ds.createDimension('longitude', 321)

        precip = ds.createVariable('precip', 'f4', ('latitude', 'longitude'))
        precip[:] = data[:, 2]
        ## adds some attributes
        precip.units = 'mm'
        precip.long_name = 'Precipitation'

        lat = ds.createVariable('lat', 'f4', ('latitude'))
        lat[:] = Lat_data[:]
        ## adds some attributes
        lat.units = 'degrees_South'
        lat.long_name = 'Latitude'

        lon = ds.createVariable('lon', 'f4', ('longitude'))
        lon[:] = Lon_data[:]
        ## adds some attributes
        lon.units = 'degrees_East'
        lon.long_name = 'Longitude'

        print ds
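If Latitude.csv and Longitude.csv sit in the same folder as the daily files, a glob pattern is a simple alternative to os.walk for picking up only the TRMM CSVs. A sketch under that assumption, reusing convert_file from the question (which would still need to derive its output .nc name from the input file name rather than writing TEST_file.nc):
import glob
import os

main_fp = "C:\\Users\\spyder2"
for f_path in sorted(glob.glob(os.path.join(main_fp, "TRMM_1998_*_newntcl.csv"))):
    convert_file(f_path)  # only the daily TRMM files match this pattern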

convert contents of metadata file into variables list

Hi, I want to convert the contents of a file (in this case a Landsat 7 metadata file) into a series of variables defined by the contents of the file, using Python 2.7. The file contents look like this:
GROUP = L1_METADATA_FILE
GROUP = METADATA_FILE_INFO
ORIGIN = "Image courtesy of the U.S. Geological Survey"
REQUEST_ID = "0101305309253_00043"
LANDSAT_SCENE_ID = "LE71460402010069SGS00"
FILE_DATE = 2013-06-02T11:19:59Z
STATION_ID = "SGS"
PROCESSING_SOFTWARE_VERSION = "LPGS_12.2.1"
DATA_CATEGORY = "NOMINAL"
END_GROUP = METADATA_FILE_INFO
GROUP = PRODUCT_METADATA
DATA_TYPE = "L1T"
ELEVATION_SOURCE = "GLS2000"
OUTPUT_FORMAT = "GEOTIFF"
EPHEMERIS_TYPE = "DEFINITIVE"
SPACECRAFT_ID = "LANDSAT_7"
SENSOR_ID = "ETM"
SENSOR_MODE = "BUMPER"
WRS_PATH = 146
WRS_ROW = 040
DATE_ACQUIRED = 2010-03-10
GROUP = IMAGE_ATTRIBUTES
CLOUD_COVER = 0.00
IMAGE_QUALITY = 9
SUN_AZIMUTH = 137.38394502
SUN_ELEVATION = 48.01114126
GROUND_CONTROL_POINTS_MODEL = 55
GEOMETRIC_RMSE_MODEL = 3.790
GEOMETRIC_RMSE_MODEL_Y = 2.776
GEOMETRIC_RMSE_MODEL_X = 2.580
END_GROUP = IMAGE_ATTRIBUTES
Example of the variable items I am interested in:
GROUP = MIN_MAX_RADIANCE
RADIANCE_MAXIMUM_BAND_1 = 293.700
RADIANCE_MINIMUM_BAND_1 = -6.200
RADIANCE_MAXIMUM_BAND_2 = 300.900
RADIANCE_MINIMUM_BAND_2 = -6.400
RADIANCE_MAXIMUM_BAND_3 = 234.400
RADIANCE_MINIMUM_BAND_3 = -5.000
RADIANCE_MAXIMUM_BAND_4 = 241.100
RADIANCE_MINIMUM_BAND_4 = -5.100
RADIANCE_MAXIMUM_BAND_5 = 47.570
RADIANCE_MINIMUM_BAND_5 = -1.000
RADIANCE_MAXIMUM_BAND_6_VCID_1 = 17.040
RADIANCE_MINIMUM_BAND_6_VCID_1 = 0.000
RADIANCE_MAXIMUM_BAND_6_VCID_2 = 12.650
RADIANCE_MINIMUM_BAND_6_VCID_2 = 3.200
RADIANCE_MAXIMUM_BAND_7 = 16.540
RADIANCE_MINIMUM_BAND_7 = -0.350
RADIANCE_MAXIMUM_BAND_8 = 243.100
RADIANCE_MINIMUM_BAND_8 = -4.700
END_GROUP = MIN_MAX_RADIANCE
I am open to other ideas, as I don't need all entries as variables, just a selection. I also see that some headers are listed more than once, e.g. GROUP is used multiple times. I need to be able to select certain variables (integer values) and use them in formulas elsewhere in the code. Any help would be appreciated (novice Python coder).
I'm not sure exactly what you are looking for, but maybe something like this:
s = '''GROUP = L1_METADATA_FILE
GROUP = METADATA_FILE_INFO
ORIGIN = "Image courtesy of the U.S. Geological Survey"
REQUEST_ID = "0101305309253_00043"
LANDSAT_SCENE_ID = "LE71460402010069SGS00"
FILE_DATE = 2013-06-02T11:19:59Z
STATION_ID = "SGS"
PROCESSING_SOFTWARE_VERSION = "LPGS_12.2.1"
DATA_CATEGORY = "NOMINAL"
END_GROUP = METADATA_FILE_INFO
GROUP = PRODUCT_METADATA
DATA_TYPE = "L1T"
ELEVATION_SOURCE = "GLS2000"
OUTPUT_FORMAT = "GEOTIFF"
EPHEMERIS_TYPE = "DEFINITIVE"
SPACECRAFT_ID = "LANDSAT_7"
SENSOR_ID = "ETM"
SENSOR_MODE = "BUMPER"
WRS_PATH = 146
WRS_ROW = 040
DATE_ACQUIRED = 2010-03-10'''
output = {}  # dict
for line in s.split("\n"):  # iterate through every line in the string
    l = line.split("=")  # separate by "=" and put into a list
    output[l[0].strip()] = l[1].strip()  # first word is the key, second word is the value
print output  # a dictionary containing all key-value pairs in your metadata, separated by "="
print output["SENSOR_ID"]  # outputs "ETM"
==============
Edited:
f = open('metadata.txt', 'r')  # open file for reading

def build_data(f):  # build dictionary
    output = {}  # dict
    for line in f.readlines():  # iterate through every line in the file
        if "=" in line:  # make sure the line has data as wanted
            l = line.split("=")  # separate by "=" and put into a list
            output[l[0].strip()] = l[1].strip()  # first word is the key, second word is the value
    return output  # returns a dictionary with the key-value pairs

data = build_data(f)
print data["IMAGE_QUALITY"]  # prints 9
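Once the dictionary is built, individual entries can be converted to numbers for use in formulas; note that quoted values keep their quotes, so they need stripping first. A small sketch using keys from the metadata shown above:
sun_elevation = float(data["SUN_ELEVATION"])  # 48.01114126
cloud_cover = float(data["CLOUD_COVER"])      # 0.00
station_id = data["STATION_ID"].strip('"')    # quoted strings keep their quotes in the dict
print sun_elevation, cloud_cover, station_id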
