Text Extraction from a folder having multiple images to CSV

Text Extraction from a folder having multiple images to CSV - python

I have a file having multiple images and I want to extract the text from them and then to store it in CSV . I tried but not getting satisfied result.
this is my code
enter image description here
from PIL import Image
from pytesseract import pytesseract
import os
#Define path to tessaract.exe
path_to_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
#Define path to image
path_to_images = r'all_img_iteration_demo/'
#Point tessaract_cmd to tessaract.exe
pytesseract.tesseract_cmd = path_to_tesseract
#Get the file names in the directory
for root, dirs, file_names in os.walk(path_to_images):
#Iterate over each file_name in the folder
for file_name in file_names:
#Open image with PIL
img = Image.open(path_to_images + file_name)
#Extract text from image
text = pytesseract.image_to_string(img)
list_1 = text.split()
print(list_1)
jdemo = "datarecords.csv"
with open(jdemo, 'w') as csvfile:
csvwriter = csv.writer(csvfile)
csvwriter.writerow(list_1)
this is the output
enter image description here
what i am getting right now is is this..
i tried to store the extracted text to csv but only last value i am getting in the file.
enter image description here
i want output of each extracted text into new row of csv file . and then iterate over next image and so on..

Related

I want to replace all the images embedded in various xlsx files with a different image

currently the images are not being replaced in the modified xlsx files.
Also I would like some help in modifying this code so that it can find and replace the images in any cell and not just in cell A1
This my code:
import os
from openpyxl import load_workbook
from openpyxl.drawing.image import Image
from PIL import Image as PILImage
# Get folder path and image file path from user
folder_path = input("Enter path to folder containing xlsx files: ").replace('"','')
image_file_path = input("Enter path to image file: ").replace('"','')
# Load the image file using Pillow
image = PILImage.open(image_file_path)
# Loop through all xlsx files in the folder
for filename in os.listdir(folder_path):
if filename.endswith('.xlsx'):
# Load the xlsx file using openpyxl
workbook = load_workbook(os.path.join(folder_path, filename))
# Loop through all sheets in the workbook
for sheet_name in workbook.sheetnames:
sheet = workbook[sheet_name]
# Replace the image in cell A1 with the new image provided by the user
if sheet['A1']._value is not None and hasattr(sheet['A1']._value, 'img'):
old_image = sheet['A1']._value.img
old_image.ref = sheet['A1'].coordinate
new_image = Image(image_file_path)
new_image.width = image.width
new_image.height = image.height
sheet._images.remove(old_image)
sheet._images.append(new_image)
# Save the modified workbook in the same folder
parent_folder_path = os.path.dirname(os.path.join(folder_path, filename))
new_file_path = os.path.join(parent_folder_path, 'modified_' + filename)
workbook.save(new_file_path)

Hexadecimal to Image Conversion

I am converting the hexadecimal files to images. The input files are converted to byte string using binascii library. The problem arises when the byte string is written to form an image. The output of all the hexadecimal files is same. I will be grateful if someone provides me a solution.
Here is my code:
import binascii
import cv2
import os
from tkinter import *
from tkinter import filedialog
#Hide the root window that comes by default
root=Tk()
root.withdraw()
#Browse and select txt files
dir=[]
dir=filedialog.askopenfilenames(
initialdir="C:\Binaries\Hexadecimal_Text_Files",
title="Open Text file",
filetypes=(("Text Files", "*.txt"),)
)
#Reading data in txt files and decoding hexadecimal characters
for x in dir:
tf=open(x)#Open file
data=tf.read()#Read data in file
data=data.replace(' ','')#Remove whitespaces
data=data.replace('\n','')#Remove breaks in lines
data=binascii.a2b_hex(data)
tf.close()
#Extract txt filename without extension
pathname, extension = os.path.splitext(f"{x}")#Split path into filename and extenion
filename = pathname.split('/')#Get filename without txt extension
filepath=f"C:\Binaries\Images\{filename[-1]}.png"#Defining name of image file same as txt file
#Write data into image
with open(filepath, 'wb') as image_file:
img=image_file.write(data)
#Resizing Image
img=cv2.resize(img,(500,500))
cv2.imwrite(filepath,img)
Output:

I made my own version because I could not get yours to work, but if you want to make yours work, at least one problem with I found is with this line:
img=cv2.resize(img,(500,500))
by printing all the variables after the supposed "conversion", I found that your variable img in the previous line is not an image but the result of image_file.write(data) which returns the number of bytes written to the file and not the image itself, which is probably why it always prints the same image.
Here is my version
root=Tk()
root.withdraw()
file_path = filedialog.askopenfilename(
initialdir = "C:\Binaries\Images",
title = "Select Hexadecimal Text File",
filetypes = (("Text Files", "*.txt"),)
)
with open(file_path, "r") as hex_file:
hex_data = hex_file.read().replace("\n", "")
#replaces white spaces and new lines from file
binary_data = binascii.a2b_hex(hex_data)
#converts the hexadecimal data to binary
pathname, extension = os.path.splitext(file_path)
image_path = pathname + ".png"
#image path and format
with open(image_path, "wb") as image_file:
image_file.write(binary_data)
#writing the binary data to image file
img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
#if txt file is empty
if img is None:
print("Error: Image not loaded!")
else:
cv2.imshow("image", img)
#waits for key input and closes when pressing any key
cv2.waitKey(0)
cv2.destroyAllWindows()

I have converted the hexadecimal files into images by using numpy array and Pillow. Now I am getting different images.
import numpy as np
import binascii
import os
from PIL import Image as im
from tkinter import *
from tkinter import filedialog
# Hide the root window that comes by default
root = Tk()
root.withdraw()
# Browse and select txt files
dir = []
dir = filedialog.askopenfilenames(
initialdir="C:\Binaries\Folder_3",
title="Open Text file",
filetypes=(("Text Files", "*.txt"),)
)
# Reading data in txt files and decoding hexadecimal characters
for temp in dir:
tf = open(temp) # Open file
data = tf.read() # Read data in file
data= data.replace('\'','') #Remove label
data = data.replace(' ', '') # Remove whitespaces
data = data.replace('\n', '') # Remove breaks in lines
data = binascii.a2b_hex(data)
tf.close()
#Converting bytes array to numpy array
a = np.frombuffer(data, dtype='uint8')
#print(a) //Display array
#Finding optimal factor pair for size of image
x = len(a)
val1=0
val2=0
for i in range(1, int(pow(x, 1 / 2))+1):
if x % i == 0:
val1=i
val2=int(x / i)
#Converting 1-D to 2-D numpy array
a = np.reshape(a, (val1, val2))
#print(a) #Display 2-D array
#Writing array to image
data = im.fromarray(a)
# Split path into filename and extenion
pathname, extension = os.path.splitext(f"{temp}")
filename = pathname.split('/') # Get filename without txt extension
# Defining name of image file same as txt file
filepath = f"C:\Binaries\Images_3\{filename[-1]}.png"
#Resize image
data=data.resize((500,500))
#Saving image into path
data.save(filepath)

How to read all pdf files in a directory and convert to text file using tesseract python 3?

How to read all pdf files in a directory and convert to text file using tesseract python 3?
The below code is for reading one pdf file and convert to text file.
But i want to read all pdf files in a directory and convert to text file using tesseract python 3
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
pdf_filename = "pdffile_name.pdf"
txt_filename = "text_file_created.txt"
def tesseract(pdf_filename,txt_filename):
PDF_file = pdf_filename
pages = convert_from_path(PDF_file, 500)
image_counter = 1
for page in pages:
pdf_filename = "page_"+str(image_counter)+".jpg"
page.save(pdf_filename, 'JPEG')
image_counter = image_counter + 1
filelimit = image_counter-1
outfile = txt_filename
f = open(outfile, "a",encoding = "utf-8")
for i in range(1, filelimit + 1):
pdf_filename = "page_"+str(i)+".jpg"
text = str(((pytesseract.image_to_string(Image.open(pdf_filename)))))
text = text.replace('-\n', '')
f.write(text)
f.close()
f1 = open(outfile, "r",encoding = "utf-8")
text_list = f1.readlines()
return text_list
tesseract(pdf_filename,txt_filename)`enter code here`
i have code for reading pdf files in a directory but i dont know to combine this code with above code
def readfiles():
os.chdir(path)
pdfs = []
for file_list in glob.glob("*.pdf"):
print(file_list)
pdfs.append(file_list)
readfiles()

Simply convert the variable pdf_filename to a list using this code snippet:
import glob
pdf_filename = [f for f in glob.glob("your_preferred_path/*.pdf")]
which will get you all the pdf files you want and store it into a list.
Or simply use any of the methods posted here:
How do I list all files of a directory?
Once you do that, you now have a list of pdf files.
Now iterate over the list of pdfs, one at a time, which will give you a list of test files.
You can use it something like this code snippet:
for one_pdf in pdf_filename:
#* your code to convert the files *#
Hope this helps.

Cache error while doing OCR on a directory of pdf's in python

I am trying to OCR an entire directory of pdf files using pytesseract and imagemagick but the issue is that imagemagick is consuming all my Temp folder space and finally I'm getting a cache error i.e "CacheError: unable to extend cache 'C:/Users/Azu/AppData/Local/Temp/magick-18244WfgPyAToCsau11': No space left on device # error/cache.c/OpenPixelCache/3883" I have also written a code to delete the temp folder content once OCR'd but still facing the same issue.
Here's the code till now:
import io
import os
import glob
from PIL import Image
import pytesseract
from wand.image import Image as wi
files = glob.glob(r"D:\files\**")
tempdir = r"C:\Users\Azu\AppData\Local\Temp"
filesall = os.listdir(tempdir)
for file in files:
name = os.path.basename(file).split('.')[0]
#print(file)
pdf = wi(filename = file, resolution = 300)
pdfImg = pdf.convert('jpeg')
imgBlobs = []
for img in pdfImg.sequence:
page = wi(image = img)
imgBlobs.append(page.make_blob('jpeg'))
extracted_texts = []
for imgBlob in imgBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng')
extracted_texts.append(text)
with open("D:\\extracted_text\\"+ name + ".txt", 'w') as f:
f.write(str(extracted_texts))
for ifile in filesall:
if "magick" in ifile:
os.remove(os.path.join(tempdir,ifile))

Converting images to csv file in python

I have converted my image into a csv file and it's like a matrix but I want it to be a single row.
How can I convert all of the images in dataset into a csv file (each image into one line).
Here's the code I've used:
from PIL import Image
import numpy as np
import os, os.path, time
format='.jpg'
myDir = "Lotus1"
def createFileList(myDir, format='.jpg'):
fileList = []
print(myDir)
for root, dirs, files in os.walk(myDir, topdown=False):
for name in files:
if name.endswith(format):
fullName = os.path.join(root, name)
fileList.append(fullName)
return fileList
fileList = createFileList(myDir)
fileFormat='.jpg'
for fileFormat in fileList:
format = '.jpg'
# get original image parameters...
width, height = fileList.size
format = fileList.format
mode = fileList.mode
# Make image Greyscale
img_grey = fileList.convert('L')
# Save Greyscale values
value = np.asarray(fileList.getdata(),dtype=np.float64).reshape((fileList.size[1],fileList.size[0]))
np.savetxt("img_pixels.csv", value, delimiter=',')
input :
http://uupload.ir/files/pto0_lotus1_1.jpg
output:http://uupload.ir/files/huwh_output.png

From your question, I think you want to know about numpy.flatten(). You want to add
value = value.flatten()
right before your np.savetxt call. It will flatten the array to only one dimension and it should then print out as a single line.
The rest of your question is unclear bit it implies you have a directory full of jpeg images and you want a way to read through them all. So first, get a file list:
def createFileList(myDir, format='.jpg'):
fileList = []
print(myDir)
for root, dirs, files in os.walk(myDir, topdown=False):
for name in files:
if name.endswith(format):
fullName = os.path.join(root, name)
fileList.append(fullName)
return fileList
The surround your code with a for fileName in fileList:
Edited to add complete example
Note that I've used csv writer and changed your float64 to ints (which should be ok as pixel data is 0-255
from PIL import Image
import numpy as np
import sys
import os
import csv
#Useful function
def createFileList(myDir, format='.jpg'):
fileList = []
print(myDir)
for root, dirs, files in os.walk(myDir, topdown=False):
for name in files:
if name.endswith(format):
fullName = os.path.join(root, name)
fileList.append(fullName)
return fileList
# load the original image
myFileList = createFileList('path/to/directory/')
for file in myFileList:
print(file)
img_file = Image.open(file)
# img_file.show()
# get original image parameters...
width, height = img_file.size
format = img_file.format
mode = img_file.mode
# Make image Greyscale
img_grey = img_file.convert('L')
#img_grey.save('result.png')
#img_grey.show()
# Save Greyscale values
value = np.asarray(img_grey.getdata(), dtype=np.int).reshape((img_grey.size[1], img_grey.size[0]))
value = value.flatten()
print(value)
with open("img_pixels.csv", 'a') as f:
writer = csv.writer(f)
writer.writerow(value)

How about you convert your images to 2D numpy arrays and then write them as txt files with .csv extensions and , as delimiters?
Maybe you could use a code like following:
np.savetxt('np.csv', image, delimiter=',')

import numpy as np
import cv2
import os
IMG_DIR = '/home/kushal/Documents/opencv_tutorials/image_reading/dataset'
for img in os.listdir(IMG_DIR):
img_array = cv2.imread(os.path.join(IMG_DIR,img), cv2.IMREAD_GRAYSCALE)
img_array = (img_array.flatten())
img_array = img_array.reshape(-1, 1).T
print(img_array)
with open('output.csv', 'ab') as f:
np.savetxt(f, img_array, delimiter=",")

import os
import pandas as pd
path = 'path-to-the-folder'
os.chdir(path)
lists = os.listdir(path)
labels = []
file_lst = []
for folder in lists:
files = os.listdir(path +"/"+folder)
for file in files:
path_file = path + "/" + folder + "/" + file
file_lst.append(path_file)
labels.append(folder)
dictP_n = {"path": file_lst,
"label_name": labels,
"label": labels}
data = pd.DataFrame(dictP_n, index = None)
data = data.sample(frac=1)
data['label'] = data['label'].replace({"class1": 0, "class2": 1 })
data.to_csv("path-to-save-location//file_name.csv", index =None)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Text Extraction from a folder having multiple images to CSV - python

Related

I want to replace all the images embedded in various xlsx files with a different image

Hexadecimal to Image Conversion

How to read all pdf files in a directory and convert to text file using tesseract python 3?

Cache error while doing OCR on a directory of pdf's in python

Converting images to csv file in python

Categories

Resources