Tesseract OCR fails on TIFF files - python

I have a multiple page .tif file, I am trying to extract text from it using Tesseract OCR but I am getting this error
TypeError: Unsupported image object
Code
from PIL import Image
import pytesseract
img = Image.open('Group 1/1_CHE_MDC_1.tif')
text = pytesseract.image_to_string(img.seek(0)) # OCR on 1st Page
text = ' '.join(text.split())
print(text)
ERROR
Any idea why its happening

Image.seek does not have a return value so you're essentially running:
pytesseract.image_to_string(None)
Instead do:
img.seek(0)
text = pytesseract.image_to_string(img)

I had a same question and i have tried below code and it worked for me :-
import glob
import pytesseract
import os
os.chdir("Set your Tesseract-OCR .exe file path")
b = ''
for i in glob.glob('Fullpath of your image directory/*.tif'): <-- you can give *.jpg extension in case of jpg image
if glob.glob('*.tif'):
b = b + (pytesseract.image_to_string(i))
print(b)
Happy learning !

Related

blank output after solving the captcha using pytesseract?

I am trying to solve a captcha :
and run a script :
from PIL import Image
from pytesseract import pytesseract
path_to_tesseract = r"/usr/local/Cellar/tesseract/5.0.1/bin/tesseract"
image_path2 = r"captcha2.jpg"
img = Image.open(image_path2)
pytesseract.tesseract_cmd = path_to_tesseract
text = pytesseract.image_to_string(img)
print(text[:-1])
captchaText=text[:-1]
but output is blank and when I use the same script with the following captcha:
it works great.

how to extract tiff images from the subfolder into. hocr format in another similar subfolder

For me, the code is running perfectly but unable to get the output. I am extracting multiple-page TIFF to text and later to HOCR. Please suggest, where it went wrong
from PIL import Image
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract- OCR\tesseract.exe"
image = Image.open(r"C:\Users\multipage.tiff")
config = ("--oem 3 --psm 6")
txt = ''
for frame in range(image.n_frames):
image.seek(frame)
txt += pytesseract.image_to_string(image, config = config, lang='eng') + '\n'
print(txt)
with open(r"C:\Users\multipage_output.txt", mode = 'w') as f:
f.write(txt)
My Query is how to extract tiff images from the subfolder into. hocr format in another similar subfolder. The above code is working fine to retrieve tiff to a text file.
Example
D:\\subfolder\Subfolder1\tiff image to E:\subfolder\Subfolder1\Hocr image

image to text conversion using Tesseract

I am trying to load all images in a folder and extract text from images. I keep getting error message for the second for loop. For example,
AttributeError: 'numpy.ndarray' object has no attribute 'read'
It seems I cannot access list Img. Any idea?
# import OpenCV, Numpy, Python image library, Tesseract OCR
import os
import cv2
import numpy
from PIL import Image
import pytesseract
import glob
#set tesseract path
pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe'
#read all image with .jpg format in a specifying folder
img = []
for i in glob.glob("C:\\Users\\daizhang\\Desktop\\Deloitte Development\\Python\\Reports\\Image\\*.jpg"):
n= cv2.imread(i,0) #convert image to grayscale
print(i)
img.append(n)
for j in img:
im = Image.open(j)
text = pytesseract.image_to_string (j, lang='eng')
with open("C:\\Users\\daizhang\\Desktop\\Deloitte Development\\Python\Reports\\Image\\test.txt", "w") as f:
f.write(text.encode('utf8'))
I have Mac OSX but you can adjust this code to file Window's path directory.
import os
from os import path
from glob import glob
from pytesseract import image_to_string
from PIL import Image, ImageEnhance, ImageFilter
def enhance_img(filename):
# Enhance image and save as under new name
im = im.filter(ImageFilter.MedianFilter())
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(2)
im = im.convert('1')
im.save('newfilename')
def convert_img(filename):
image = Image.open(filename)
# Convert image to text
file = open ('parsing.txt', 'a')
file.write(image_to_string(image))
file.close
def find_ext(dir, ext):
return glob(path.join(dir, "*.{}".format(ext)))
# use the following for change directory
# os.chdir(path)
filename = find_ext("","png")
for file in filename:
# convert image to text
convert_img(file)
If you want to enhance the image then include the following block and adjust the code above to loop through the new filenames.
def enhance_img(filename):
# Enhance image and save as under new name
im = im.filter(ImageFilter.MedianFilter())
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(2)
im = im.convert('1')
im.save('newfilename')
For file in filename:
# to enhance image if needed
newfilename = filename[-3] + '_1.png'
enhance_img(file)

Get text from image

I need to use pytesseract to extract text from this picture: enter image description here
However, i used pytesseract. It wont work.Here is my code:
try:
import Image
except ImportError:
from PIL import Image
import pytesseract
print(pytesseract.image_to_string(Image.open('1.png')))
you have to install textract using this we can extract text from any extension.
# some python file
import textract
text = textract.process("path/to/file.extension")
you can provide file as image or pdf or any docs.
for your code
text = textract.process("1.png")

How to get letters from an image using python

i want capture the letters(characters & Numbers) from an image using python please help me how can i do it explain me with any sample code.
I hope this will help you out if your image is clear (positively less Noise).
Use "PyTesser" Project of Google in this Case.
PyTesser is an Optical Character Recognition module for Python.
It takes as input an image or image file and outputs a string.
You can get PyTesser from this link.
Here's an example:
>>> from pytesser import *
>>> image = Image.open('fnord.tif') # Open image object using PIL
>>> print image_to_string(image) # Run tesseract.exe on image
fnord
>>> print image_file_to_string('fnord.tif')
fnord
I use tesseract for this.
There is also a Python library for it: https://code.google.com/p/python-tesseract/
Example from the main page:
import tesseract
api = tesseract.TessBaseAPI()
api.Init(".","eng",tesseract.OEM_DEFAULT)
api.SetVariable("tessedit_char_whitelist", "0123456789abcdefghijklmnopqrstuvwxyz")
api.SetPageSegMode(tesseract.PSM_AUTO)
mImgFile = "eurotext.jpg"
mBuffer=open(mImgFile,"rb").read()
result = tesseract.ProcessPagesBuffer(mBuffer,len(mBuffer),api)
print "result(ProcessPagesBuffer)=",result
Here is my code for Python3 not using the tesseract library but the .exe file:
import os
import tempfile
def tesser_exe():
path = os.path.join(os.environ['Programfiles'], 'Tesseract-OCR', 'tesseract.exe')
if not os.path.exists(path):
raise NotImplementedError('You must first install tesseract from https://code.google.com/p/tesseract-ocr/downloads/detail?name=tesseract-ocr-setup-3.02.02.exe&can=2&q=')
return path
def text_from_image_file(image_name):
assert image_name.lower().endswith('.bmp')
output_name = tempfile.mktemp()
exe_file = tesser_exe() # path to the tesseract.exe file from
return_code = subprocess.call([exe_file, image_name, output_name, '-psm', '7'])
if return_code != 0:
raise NotImplementedError('error handling not implemented')
return open(output_name + '.txt', encoding = 'utf8').read()

Categories