image to text conversion using Tesseract - python

I am trying to load all the images in a folder and extract the text from them. I keep getting an error message in the second for loop. For example:
AttributeError: 'numpy.ndarray' object has no attribute 'read'
It seems I cannot access the list img. Any ideas?
# import OpenCV, Numpy, Python image library, Tesseract OCR
import os
import cv2
import numpy
from PIL import Image
import pytesseract
import glob
#set tesseract path
pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe'
#read all image with .jpg format in a specifying folder
img = []
for i in glob.glob("C:\\Users\\daizhang\\Desktop\\Deloitte Development\\Python\\Reports\\Image\\*.jpg"):
n= cv2.imread(i,0) #convert image to grayscale
print(i)
img.append(n)
for j in img:
im = Image.open(j)
text = pytesseract.image_to_string (j, lang='eng')
with open("C:\\Users\\daizhang\\Desktop\\Deloitte Development\\Python\Reports\\Image\\test.txt", "w") as f:
f.write(text.encode('utf8'))

I am on macOS, but you can adapt this code to Windows by changing the file paths.
import os
from os import path
from glob import glob
from pytesseract import image_to_string
from PIL import Image, ImageEnhance, ImageFilter
def enhance_img(filename, newfilename=None):
    """Denoise, boost the contrast of and binarise an image, then save it.

    Args:
        filename: Path of the image to enhance.
        newfilename: Path to save the enhanced image to. Defaults to the
            input path with its extension replaced by ``_1.png``.

    Returns:
        The path the enhanced image was saved to.
    """
    if newfilename is None:
        newfilename = path.splitext(filename)[0] + '_1.png'
    # The original never opened the file, so `im` was undefined on first use.
    with Image.open(filename) as source:
        im = source.filter(ImageFilter.MedianFilter())
    enhancer = ImageEnhance.Contrast(im)
    im = enhancer.enhance(2)
    im = im.convert('1')  # mode '1' is 1-bit black and white
    # Save to the requested path rather than the literal string 'newfilename',
    # which has no extension for Pillow to pick an output format from.
    im.save(newfilename)
    return newfilename
def convert_img(filename):
    """Run OCR on the image at `filename` and append the text to parsing.txt.

    Args:
        filename: Path of the image to convert to text.
    """
    # Convert image to text
    with Image.open(filename) as image:
        text = image_to_string(image)
    # `with` closes the file even if the write fails; the original wrote
    # `file.close` without parentheses, so close() was never called.
    # UTF-8 keeps non-ASCII characters in the OCR output from raising on
    # platforms whose default encoding cannot represent them.
    with open('parsing.txt', 'a', encoding='utf-8') as out:
        out.write(text)
def find_ext(dir, ext):
    """Return the paths directly inside `dir` that end in the given extension.

    Args:
        dir: Directory to search; "" searches the current working directory.
        ext: File extension, with or without a leading dot ("png" or ".png").

    Returns:
        A sorted list of matching paths (empty when nothing matches).
    """
    # Accept ".png" as well as "png"; the former used to build "*..png".
    suffix = str(ext).lstrip(".")
    # glob() returns matches in arbitrary order, so sort for a stable result.
    return sorted(glob(path.join(dir, f"*.{suffix}")))
# use the following for change directory
# os.chdir(path)
# An empty directory keeps the glob pattern relative, so this collects the
# PNG files in the current working directory.
filename = find_ext("","png")
for file in filename:
    # convert image to text
    convert_img(file)
If you want to enhance the image then include the following block and adjust the code above to loop through the new filenames.
def enhance_img(filename, newfilename=None):
    """Denoise, boost the contrast of and binarise an image, then save it.

    Args:
        filename: Path of the image to enhance.
        newfilename: Path to save the enhanced image to. Defaults to the
            input path with its extension replaced by ``_1.png``.

    Returns:
        The path the enhanced image was saved to.
    """
    if newfilename is None:
        newfilename = path.splitext(filename)[0] + '_1.png'
    # The original never opened the file, so `im` was undefined on first use.
    with Image.open(filename) as source:
        im = source.filter(ImageFilter.MedianFilter())
    enhancer = ImageEnhance.Contrast(im)
    im = enhancer.enhance(2)
    im = im.convert('1')  # mode '1' is 1-bit black and white
    # Save to the requested path rather than the literal string 'newfilename',
    # which has no extension for Pillow to pick an output format from.
    im.save(newfilename)
    return newfilename


for file in filename:
    # to enhance image if needed
    # Build the output name from the current file, not from the list of
    # files, and hand it to enhance_img so it is actually used.
    newfilename = path.splitext(file)[0] + '_1.png'
    enhance_img(file, newfilename)

Related

I am trying to install "pytesseract" but it doesn't work

I am trying to install "pytesseract" but the code after that doesn't work
from PIL import Image
from pytesseract import pytesseract
#Define path to tessaract.exe
path_to_tesseract = r'C:\Python310\Scripts\pytesseract.exe'
#Define path to image
path_to_image = 'extract.png'
#Point tessaract_cmd to tessaract.exe
pytesseract.tesseract_cmd = path_to_tesseract
#Open image with PIL
img = Image.open(path_to_image)
#Extract text from image
text = pytesseract.image_to_string(img)
print(text)
When I run the code, it shows this error:
(ModuleNotFoundError: No module named 'pytesseract')

Python Attribute Error Raised while using Thumbnail method of PIL

I am using PIL to build an application that opens all the images in a folder. I looked for PIL tutorials that work with a list of images, but I could not find a suitable one: the ones I found required listing the file locations beforehand, which annoyed me. Instead, I want the user to choose a folder and have the application load all of its images. But while making thumbnails for the list of images, I got an error that I'm not familiar with. This is the exact error:
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\tkinter\__init__.py", line
1892, in __call__
return self.func(*args)
File "f:\OG\Python\ImageViewer.py", line 47, in openFolder
GetFiles()
File "f:\OG\Python\ImageViewer.py", line 87, in GetFiles
with Image.open(i) as img:
prefix = fp.read(16)
raise AttributeError(name)
The minimal code to get this error is:
import glob
from PIL import Image, ImageTk
fileDir = "Your Folder"
imageList = []
image_list = []
for filename in glob.glob(fileDir + '/*.jpg'): # gets jpg
im = Image.open(filename)
imageList.append(im)
for i in imageList:
with Image.open(i) as img: # This raises the error
imageList[i] = img.thumbnail((550, 450))
for i in image_list: # Would this work?
image_list[i] = ImageTk.PhotoImage(imageList[i])
I would like to know if the code that is commented with 'Would this work?' would work or not.
Just remove the second read, which doesn't make sense: the images were already opened with Image.open in the first loop.
import glob
from PIL import Image, ImageTk
fileDir =r"your path"
imageList = []
for filename in glob.glob(fileDir + '/*.jpg'): # gets jpg
im = Image.open(filename)
imageList.append(im)
imageList will look like this :
[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=200x200 at 0x25334A87D90>]
here is the blockbuster solution
import glob
from PIL import Image, ImageTk
import PIL
from pathlib import Path
fileDir = r"your_path_here"
imageList = []
for filename in glob.glob(fileDir + '/*.jpg'):  # gets jpg
    im = Image.open(filename)
    imageList.append(im)
    # thumbnail() shrinks `im` in place and returns None, so the object
    # already stored in imageList is the thumbnail.
    im.thumbnail((550, 450))
    # Path.stem drops only the final suffix, so "trip.2021.jpg" is saved as
    # "trip.2021_thumbnail.png" instead of colliding with other "trip.*" files.
    im.save(Path(fileDir) / (Path(filename).stem + '_thumbnail.png'))
I solved it, I edited the code as follows:
import glob
from PIL import Image, ImageTk
fileDir = "Your Folder"
imageList = []
image_list = []
count = 0
for filename in glob.glob(fileDir + '/*.jpg'): # gets jpg
imageList.append(filename)
for i in imageList:
with Image.open(i) as img:
i = img.thumbnail((550, 450))
for i in imageList: # This gives a Key Error Now
image_list.append(ImageTk.PhotoImage(imageList[count]))
count = count + 1
Basically, I introduced a new variable count with an initial value of 0, removed Image.open from the first for loop, used the append method in the last for loop, and incremented count by 1 each time :)

read images from zipfile in python throwing errors

I am trying to get metadata from images inside a zip file, and it's throwing this error:
PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x00000157389C2130>
here is my code:
import zipfile
from PIL import Image
from PIL.ExifTags import TAGS
from io import BytesIO
with zipfile.ZipFile("samples.zip", "r") as f:
for name in f.namelist():
image_data = f.read(name)
image = Image.open(BytesIO(image_data))
exif_data = image.getexif()
width, height = image.size
print(width, height)
I tried every solution I could find and still get the error. Please help.

How to put text on multiple images using python?

from PIL import Image, ImageDraw, ImageFont
import glob
import os
images = glob.glob("directory_path/*.jpg")
for img in images:
images = Image.open(img)
draw = ImageDraw.Draw(images)
font = ImageFont.load_default() #Downloaded Font from Google font
text = "Text on all images from directory"
draw.text((0,150),text,(250,250,250),font=font)
images.save(img)
I have to put text on all the images. I have tried the code above, but it's not working.
This code worked for me just fine, but the text was hard to read because it was small and white. I did change directory_path to images and put my images in there. The images looked like this, the text is small and on the left side:
Here is the solution
from PIL import Image,ImageDraw,ImageFont
import glob
import os
images=glob.glob("path/*.jpg")
for img in images:
images=Image.open(img)
draw=ImageDraw.Draw(images)
font=ImageFont.load_default()
text="Whatever text"
draw.text((0,240),text,(250,250,250),font=font)
images.save(img)
One possible problem with the code may be that you are using the images variable both for the list of image paths and for each opened image inside the loop.
Try this code, this will work for sure.
from PIL import Image, ImageDraw, ImageFont
import glob
import os
images = glob.glob("new_dir/*.jpg")
print(images)
for img in images:
image = Image.open(img)
draw = ImageDraw.Draw(image)
font = ImageFont.load_default() #Downloaded Font from Google font
text = "Text on all images from directory"
draw.text((0,150),text,fill = 'red' ,font=font)
image.save(img)

Tesseract OCR fails on TIFF files

I have a multi-page .tif file. I am trying to extract text from it using Tesseract OCR, but I am getting this error:
TypeError: Unsupported image object
Code
from PIL import Image
import pytesseract
img = Image.open('Group 1/1_CHE_MDC_1.tif')
text = pytesseract.image_to_string(img.seek(0)) # OCR on 1st Page
text = ' '.join(text.split())
print(text)
ERROR
Any idea why it's happening?
Image.seek does not have a return value so you're essentially running:
pytesseract.image_to_string(None)
Instead do:
img.seek(0)
text = pytesseract.image_to_string(img)
I had the same question; I tried the code below and it worked for me:
import glob
import pytesseract
import os
os.chdir("Set your Tesseract-OCR .exe file path")
# You can use a *.jpg pattern instead for JPEG images.
# Every path returned by glob already matches the pattern, so no further
# check is needed. The original re-ran glob('*.tif') on the working
# directory (the Tesseract folder after chdir), which could skip every image.
b = ''.join(
    pytesseract.image_to_string(i)
    for i in glob.glob('Fullpath of your image directory/*.tif')
)
print(b)
Happy learning !

Categories