I used the TensorFlow Object Detection API to train my own object detector. At that time the images were annotated with labelImg, which creates an XML file for each image. Now I have labelled images that come with a JSON file for each image. How do I use these JSON files to create TFRecords?
First I created CSV files with my own script.
import os
import glob
import json
import pickle
import pandas as pd

def json_to_csv():
    path_to_json = 'images/train/'
    json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
    path_to_jpeg = 'images/train/'
    jpeg_files = [pos_jpeg for pos_jpeg in os.listdir(path_to_jpeg) if pos_jpeg.endswith('.jpeg')]
    fjpeg = list(reversed(jpeg_files))
    n = 0
    csv_list = []
    labels = []
    for j in json_files:
        with open('images/train/{}'.format(j)) as data_file:
            data = json.load(data_file)
        width, height = data['display_width'], data['display_height']
        for item in data['items']:
            box = item['bounding_box']
            if item['upc'] != 'None':
                name = item['upc']
                labels.append(name)
                xmin = box['left']
                ymin = box['top']
                xmax = box['right']
                ymax = box['bottom']
                value = (fjpeg[n], width, height, name, xmin, ymin, xmax, ymax)
                csv_list.append(value)
        n = n + 1
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    csv_df = pd.DataFrame(csv_list, columns=column_name)
    labels_train = list(set(labels))
    with open("train_labels.txt", "wb") as fp:  # pickle the label list
        pickle.dump(labels_train, fp)
    return csv_df

def main():
    for directory in ['train']:
        csv_df = json_to_csv()
        csv_df.to_csv('data/{}_labels.csv'.format(directory), index=None)
        print('Successfully converted json to csv.')

main()
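For reference, the JSON shape this script expects (reconstructed from the keys it reads; the real files may carry extra fields, and the values here are made up) looks like:

{
    "display_width": 1024,
    "display_height": 768,
    "items": [
        {
            "upc": "012345678905",
            "bounding_box": {"left": 10, "top": 20, "right": 110, "bottom": 220}
        }
    ]
}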
Then I use this script to create the TFRecords.
We have some documentation on the subject.
Note that labelimg should produce outputs similar to the PASCAL VOC datasets we use, so those scripts may also be of use.
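If it helps, here is a condensed sketch of that conversion, modelled on the community generate_tfrecord.py pattern that accompanies this API. It groups the CSV rows by filename and writes one tf.train.Example per image; class_text_to_int is a placeholder you would implement against your own label map, and the paths are assumptions based on the script above:

import io
import os
import pandas as pd
import tensorflow as tf
from PIL import Image
from object_detection.utils import dataset_util

def class_text_to_int(label):
    # Placeholder: map a class name to its integer id from your label map.
    raise NotImplementedError

def create_tf_example(filename, rows, img_dir):
    # Read the encoded image bytes so the record is self-contained.
    with tf.io.gfile.GFile(os.path.join(img_dir, filename), 'rb') as f:
        encoded_jpg = f.read()
    width, height = Image.open(io.BytesIO(encoded_jpg)).size
    feature = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename.encode('utf8')),
        'image/source_id': dataset_util.bytes_feature(filename.encode('utf8')),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature(b'jpg'),
        # Box coordinates are stored normalised to [0, 1].
        'image/object/bbox/xmin': dataset_util.float_list_feature((rows['xmin'] / width).tolist()),
        'image/object/bbox/xmax': dataset_util.float_list_feature((rows['xmax'] / width).tolist()),
        'image/object/bbox/ymin': dataset_util.float_list_feature((rows['ymin'] / height).tolist()),
        'image/object/bbox/ymax': dataset_util.float_list_feature((rows['ymax'] / height).tolist()),
        'image/object/class/text': dataset_util.bytes_list_feature(
            [c.encode('utf8') for c in rows['class']]),
        'image/object/class/label': dataset_util.int64_list_feature(
            [class_text_to_int(c) for c in rows['class']]),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

writer = tf.io.TFRecordWriter('data/train.record')
examples = pd.read_csv('data/train_labels.csv')
for filename, rows in examples.groupby('filename'):
    writer.write(create_tf_example(filename, rows, 'images/train/').SerializeToString())
writer.close()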
I have to convert JSON files, as I said. Here is the code:
import os
import json
import pandas as pd

def AnalysisJson():
    file_path = 'my_file'
    for root, dirs, files in os.walk(file_path):
        for file in files:
            InputPath = open(file_path + '\\' + file, encoding="utf-8")
            for i in files:
                df = json.load(InputPath)
                demo = pd.json_normalize(df, record_path='label_annotations')
                demo.to_csv('files.csv')
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
I want to convert these files; the code as written fails to run, so I would be grateful for any advice. Thanks!
I am not sure that I understand correctly what you want, but here is an answer based on my interpretation of your question. (As an aside, the JSONDecodeError above most likely comes from calling json.load more than once on the same open file handle: the first call consumes it, so the next call starts at end-of-file and finds no JSON value.)
import json
import os
from glob import glob
import pandas as pd

def json_to_csv(dir_path: str) -> None:
    for file_path in glob(os.path.join(dir_path, '*.json')):
        with open(file_path, encoding='utf-8') as f:
            data = json.load(f)
        df = pd.json_normalize(data, record_path='label_annotations')
        df.to_csv(file_path.replace('.json', '.csv'), index=False)
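To illustrate what record_path='label_annotations' does (the field names in this toy input are hypothetical), pd.json_normalize emits one row per list entry:

import pandas as pd

data = {'image': 'img_001.jpg',
        'label_annotations': [{'description': 'cat', 'score': 0.98},
                              {'description': 'dog', 'score': 0.83}]}
df = pd.json_normalize(data, record_path='label_annotations', meta=['image'])
print(df)
#   description  score        image
# 0         cat   0.98  img_001.jpg
# 1         dog   0.83  img_001.jpg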
I have .txt files generated by labelImg that need to be converted into one CSV table with x_center, y_center, height and width. Some .txt files contain more than one line (one object per line),
so the same image_id should produce one set of rows per line. I was able to read a single line from each .txt file, but I am unable to read more than one set of values per file.
import os
import glob
import pandas as pd
import numpy as np

os.chdir(r'D:\karami\Labeled\train1\labels')
myFiles = glob.glob('*.txt')
width = 1024
height = 1024
image_id = 0
final_df = []
for item in myFiles:
    row = []
    bbox_temp = []
    with open(item, 'rt') as fd:
        first_line = fd.readline()
        splited = first_line.split()
    row.append(image_id)
    row.append(width)
    row.append(height)
    try:
        bbox_temp.append(float(splited[1]) * width)
        bbox_temp.append(float(splited[2]) * height)
        bbox_temp.append(float(splited[3]) * width)
        bbox_temp.append(float(splited[4]) * height)
        row.append(bbox_temp)
        final_df.append(row)
    except:
        print("file is not in YOLO format!")
df = pd.DataFrame(final_df, columns=['image_id', 'width', 'height', 'bbox'])
df.to_csv("saved.csv", index=False)
Reading every line of each file (and resetting the per-row lists inside the inner loop) gives one CSV row per bounding box:
import os
import glob
import pandas as pd

os.chdir(r'D:\karami\Labeled\train1\labels')
myFiles = glob.glob('*.txt')
width = 1024
height = 1024
image_id = 0
final_df = []
for item in myFiles:
    image_id += 1
    with open(item, 'rt') as fd:
        for line in fd.readlines():
            row = []
            bbox_temp = []
            splited = line.split()
            row.append(image_id)
            row.append(width)
            row.append(height)
            try:
                bbox_temp.append(float(splited[1]) * width)
                bbox_temp.append(float(splited[2]) * height)
                bbox_temp.append(float(splited[3]) * width)
                bbox_temp.append(float(splited[4]) * height)
                row.append(bbox_temp)
                final_df.append(row)
            except (IndexError, ValueError):
                print("file is not in YOLO format!")
df = pd.DataFrame(final_df, columns=['image_id', 'width', 'height', 'bbox'])
df.to_csv("saved.csv", index=False)
If you are trying to convert between formal data annotation types such as COCO, Pascal VOC and YOLO, consider a Python library such as imgann to reduce coding errors and save time.
'''This file is used to convert annotations from .txt files to the TensorFlow CSV format.'''
import os
import os.path
import argparse
import pandas as pd
from PIL import Image

def write_to_csv(ann_path, img_path, class_dict):
    annos = []
    # Read txt files
    for files in os.walk(ann_path):
        for file in files[2]:
            print(file + " --> start!")
            # Read the image and get its size attributes
            img_name = os.path.splitext(file)[0] + '.jpg'
            fileimgpath = os.path.join(img_path, img_name)
            im = Image.open(fileimgpath)
            w = int(im.size[0])
            h = int(im.size[1])
            # Read the txt file
            with open(os.path.join(ann_path, file), "r") as filelabel:
                lines = filelabel.read().split('\n')
            obj = lines[:len(lines) - 1]
            for i in range(len(obj)):
                objbud = obj[i].split(' ')
                name = class_dict[objbud[0]]
                # YOLO stores normalised centre coordinates and sizes;
                # convert them to absolute corner coordinates.
                x1 = float(objbud[1])
                y1 = float(objbud[2])
                w1 = float(objbud[3])
                h1 = float(objbud[4])
                xmin = int((x1 * w) - (w1 * w) / 2.0)
                ymin = int((y1 * h) - (h1 * h) / 2.0)
                xmax = int((x1 * w) + (w1 * w) / 2.0)
                ymax = int((y1 * h) + (h1 * h) / 2.0)
                annos.append([img_name, w, h, name, xmin, ymin, xmax, ymax])
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    df = pd.DataFrame(annos, columns=column_name)
    print(annos[:10])
    return df

if __name__ == "__main__":
    # Argument parser
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--input", required=True, help="txt labels path")
    ap.add_argument("-img", "--image", required=True, help="images path")
    ap.add_argument("-o", "--output", required=True, help="output csv path")
    args = vars(ap.parse_args())
    # Map class numbers to names, matching the classes in the .txt files
    class_dict = {'0': "autorickshaw",
                  '1': "bus",
                  '2': "car",
                  '3': "motorcycle",
                  '4': "truck",
                  '5': "vehicle fallback",
                  }
    # Assign paths
    ann_path = args["input"]
    img_path = args["image"]
    csv_path = args["output"]
    data = write_to_csv(ann_path, img_path, class_dict)
    data.to_csv(csv_path, index=None)
    print('Successfully converted txt to csv. Your output file is {}'.format(args["output"]))

# Command to run the code:
# python3 txt_to_csv.py -i path_of_text_labels_directory -img data\images -o data\data.csv
# Output columns: filename, width, height, class, xmin, ymin, xmax, ymax
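As a quick sanity check of the centre-to-corner arithmetic above (the numbers are invented), a YOLO line "2 0.5 0.5 0.25 0.5" on a 640x480 image converts as follows:

# class=2, x_center=0.5, y_center=0.5, box_width=0.25, box_height=0.5
w, h = 640, 480
x1, y1, w1, h1 = 0.5, 0.5, 0.25, 0.5
xmin = int(x1 * w - (w1 * w) / 2.0)   # 240
ymin = int(y1 * h - (h1 * h) / 2.0)   # 120
xmax = int(x1 * w + (w1 * w) / 2.0)   # 400
ymax = int(y1 * h + (h1 * h) / 2.0)   # 360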
I have about 5,000 .gzip files (~1MB each). Each of these files contains data in a jsonlines format. Here's what it looks like:
{"category_id":39,"app_id":12731}
{"category_id":45,"app_id":12713}
{"category_id":6014,"app_id":13567}
I want to parse these files and convert them to a pandas dataframe. Is there a way to speed up this process? Here's my code, but it's quite slow (about 0.5 s per file):
import pandas as pd
import jsonlines
import gzip
import os
import io

path = 'data/apps/'
files = os.listdir(path)
result = []
for n, file in enumerate(files):
    print(n, file)
    with open(f'{path}/{file}', 'rb') as f:
        data = f.read()
    unzipped_data = gzip.decompress(data)
    decoded_data = io.BytesIO(unzipped_data)
    reader = jsonlines.Reader(decoded_data)
    for line in reader:
        if line['category_id'] == 6014:
            result.append(line)
df = pd.DataFrame(result)
Opening the archive with gzip.open lets you read each line as a stream, without decompressing the whole file into memory first.
import pandas as pd
import json
import gzip
import os

path = 'data/apps/'
files = os.listdir(path)
result = []
for n, file in enumerate(files):
    print(n, file)
    with gzip.open(f'{path}/{file}') as f:
        for line in f:
            data = json.loads(line)
            if data['category_id'] == 6014:
                result.append(data)
df = pd.DataFrame(result)
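Since the files are independent of each other, the loop also parallelises naturally across processes. A minimal sketch with the standard library (the path and category filter are taken from the question; everything else is an assumption about your setup):

import gzip
import json
import os
from concurrent.futures import ProcessPoolExecutor
import pandas as pd

def parse_file(filepath):
    # Collect the matching records from a single gzipped jsonlines file.
    rows = []
    with gzip.open(filepath, 'rt') as f:
        for line in f:
            data = json.loads(line)
            if data['category_id'] == 6014:
                rows.append(data)
    return rows

if __name__ == '__main__':
    path = 'data/apps/'
    files = [os.path.join(path, name) for name in os.listdir(path)]
    with ProcessPoolExecutor() as pool:
        results = pool.map(parse_file, files)
    df = pd.DataFrame([row for rows in results for row in rows])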
I have a script which pulls in data from a CSV file, does some manipulation, and creates an Excel output file. But it's a tedious process, as I need to do it for each file individually.
Question: Is there a way for me to run this script across multiple csv files together and create a separate excel file output for each input file?
I'm not sure what to try out here. I've read that I need to use a module called glob but I'm not sure how to go about it.
This script works for a single file:
# Import libraries
import pandas as pd
import xlsxwriter

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
INPUT_FILE = 'rawData.csv'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
OUTPUT_FILE = 'rawDataOutput.xlsx'

# Get data
df = pd.read_csv(INPUT_PATH + INPUT_FILE)

# Clean data
cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
                  'Orders','Sales(INR)','NTB orders','NTB sales']]
cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                  ascending=False).reset_index()
cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                         cleanedData['Impressions']).astype(float).map("{:.2%}".format)
cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                          cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                  cleanedData['Orders']).astype(float).map("{:.2%}".format)
cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                 cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
                           'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
                           'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]

# Create summary
summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

# Push to excel
writer = pd.ExcelWriter(OUTPUT_PATH + OUTPUT_FILE, engine='xlsxwriter')
summaryData.to_excel(writer, sheet_name='Summary')
cleanedData.to_excel(writer, sheet_name='Overall Report')
writer.save()
I've never tried anything like this before and I would appreciate your help in figuring this out.
You can use Python's glob.glob() to get all of the CSV files from a given folder. For each filename that is returned, you could derive a suitable output filename. The file processing could be moved into a function as follows:
# Import libraries
import pandas as pd
import xlsxwriter
import glob
import os

def process_csv(input_filename, output_filename):
    # Get data
    df = pd.read_csv(input_filename)
    # Clean data
    cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
                      'Orders','Sales(INR)','NTB orders','NTB sales']]
    cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                      ascending=False).reset_index()
    cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
    cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                             cleanedData['Impressions']).astype(float).map("{:.2%}".format)
    cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
    cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                              cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                      cleanedData['Orders']).astype(float).map("{:.2%}".format)
    cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                     cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
                               'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
                               'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]
    # Create summary
    summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
    summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
    summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']
    # Push to excel
    writer = pd.ExcelWriter(output_filename, engine='xlsxwriter')
    summaryData.to_excel(writer, sheet_name='Summary')
    cleanedData.to_excel(writer, sheet_name='Overall Report')
    writer.save()

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
for csv_filename in glob.glob(os.path.join(INPUT_PATH, "*.csv")):
    name, ext = os.path.splitext(os.path.basename(csv_filename))
    # Create an output filename based on the input filename
    output_filename = os.path.join(OUTPUT_PATH, f"{name}Output.xlsx")
    process_csv(csv_filename, output_filename)
os.path.join() can be used as a safer way to join file paths together.
Something like:
import os
import glob
import pandas as pd

os.chdir(r'path\to\folder')    # change the working dir to the folder
filelist = glob.glob('*.csv')  # create a list of all csv files
for file in filelist:          # loop through the files
    df = pd.read_csv(file, ...)
    # Do something and create a final_df
    final_df.to_excel(file[:-4] + '_output.xlsx', index=False)  # excel with same name + output
You can run this script inside a for loop:
for file in os.listdir(INPUT_PATH):
    if file.endswith('.csv') or file.endswith('.CSV'):
        INPUT_FILE = INPUT_PATH + '/' + file
        OUTPUT_FILE = INPUT_PATH + '/Outputs/' + file[:-4] + '.xlsx'
Try this:
import glob

files = glob.glob(INPUT_PATH + "*.csv")
for file in files:
    # Get data
    df = pd.read_csv(file)
    # Clean data
    # your cleaning code
    # Push to excel
    writer = pd.ExcelWriter(OUTPUT_PATH + file.split("/")[-1].replace(".csv", "_OUTPUT.xlsx"),
                            engine='xlsxwriter')
I am making a document classifier and here is my code:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfTransformer, TfidfVectorizer)
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)
            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message

def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'resume': message, 'class': classification})
        index.append(filename)
    return DataFrame(rows, index=index)

data = DataFrame({'resume': [], 'class': []})
data = data.append(dataFrameFromDirectory(r'<path>', 'Yes'))
data = data.append(dataFrameFromDirectory(r'<path>', 'No'))
Then I split the data and used TfidfVectorizer:
tf=TfidfVectorizer(min_df=1, stop_words='english')
data_traintf=tf.fit_transform(data_train)
mnb=MultinomialNB()
mnb.fit(data_traintf,class_train)
After training and testing, I saved my classifier as a pickle file:
import pickle
with open(r'clf.pkl', 'wb') as f:
    pickle.dump(mnb, f)
But when I load it again and try to use the classifier, I get a "TfidfVectorizer - Vocabulary wasn't fitted" error. So I tried using a Pipeline and saved my vectorizer as well:
from sklearn.pipeline import Pipeline

classifier = Pipeline([('tfidf', tf), ('multiNB', mnb)])
with open(r'clf_1.pkl', 'wb') as f:
    pickle.dump(classifier, f)
But still I get the same error. What might be going wrong?
EDIT: The pickle file was stored successfully and on the other end, I loaded the file:
import pickle
with open(r'clf_1.pkl', 'rb') as f:
    clf = pickle.load(f)
And created a test data frame. When I do test_tf = tf.fit(test['resume']) it works fine, but pred = clf.predict(test_tf) gives TypeError: 'TfidfVectorizer' object is not iterable.
Do I need to loop through the data frame, which has around 15 objects?
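For what it's worth, tf.fit(...) returns the fitted vectorizer itself rather than a transformed matrix, which is most likely why clf.predict(test_tf) complains that a 'TfidfVectorizer' object is not iterable. Since the pickled Pipeline already contains the fitted vectorizer as its first step, it expects raw text. A minimal sketch of the usual usage, assuming test['resume'] holds the raw resume strings:

import pickle

with open(r'clf_1.pkl', 'rb') as f:
    clf = pickle.load(f)

# The pipeline's first step is the fitted TfidfVectorizer, so pass raw
# text straight in; there is no need to fit or transform the test data
# yourself.
pred = clf.predict(test['resume'])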