Get text from image - python

I need to use pytesseract to extract text from this picture: enter image description here
However, i used pytesseract. It wont work.Here is my code:
try:
import Image
except ImportError:
from PIL import Image
import pytesseract
print(pytesseract.image_to_string(Image.open('1.png')))

you have to install textract using this we can extract text from any extension.
# some python file
import textract
text = textract.process("path/to/file.extension")
you can provide file as image or pdf or any docs.
for your code
text = textract.process("1.png")

Related

blank output after solving the captcha using pytesseract?

I am trying to solve a captcha :
and run a script :
from PIL import Image
from pytesseract import pytesseract
path_to_tesseract = r"/usr/local/Cellar/tesseract/5.0.1/bin/tesseract"
image_path2 = r"captcha2.jpg"
img = Image.open(image_path2)
pytesseract.tesseract_cmd = path_to_tesseract
text = pytesseract.image_to_string(img)
print(text[:-1])
captchaText=text[:-1]
but output is blank and when I use the same script with the following captcha:
it works great.

NameError: name 'img_new' is not defined, how to fix?

Im trying to get pytesseract to work at identifying an image as single characters and not words.
Using code: This works, but only for detecting words not single characters in the image.
#importing modules
import pytesseract
from PIL import Image
# If you don't have tesseract executable in your PATH, include the following:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
#converting image to text
print(pytesseract.image_to_string(Image.open('C:\Program Files\Tesseract-OCR\image2.png')))
Attempting to view single characters Code:
#importing modules
import pytesseract
from PIL import Image
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
#converting image to text
text = pytesseract.image_to_string(img_new, lang='eng', config='--psm 10')
print(pytesseract.image_to_string(Image.open('C:\Program Files\Tesseract-OCR\image2.png')))
I get error
text = pytesseract.image_to_string(img_new, lang='eng', config='--psm 10')
NameError: name 'img_new' is not defined

Extracting text from scanned PDF without saving the scan as a new file image

I would like to extract text from scanned PDFs.
My "test" code is as follows:
from pdf2image import convert_from_path
from pytesseract import image_to_string
from PIL import Image
converted_scan = convert_from_path('test.pdf', 500)
for i in converted_scan:
i.save('scan_image.png', 'png')
text = image_to_string(Image.open('scan_image.png'))
with open('scan_text_output.txt', 'w') as outfile:
outfile.write(text.replace('\n\n', '\n'))
I would like to know if there is a way to extract the content of the image directly from the object converted_scan, without saving the scan as a new "physical" image file on the disk?
Basically, I would like to skip this part:
for i in converted_scan:
i.save('scan_image.png', 'png')
I have a few thousands scans to extract text from. Although all the generated new image files are not particularly heavy, it's not negligible and I find it a bit overkill.
EDIT
Here's a slightly different, more compact approach than Colonder's answer, based on this post. For .pdf files with many pages, it might be worth adding a progress bar to each loop using e.g. the tqdm module.
from wand.image import Image as w_img
from PIL import Image as p_img
import pyocr.builders
import regex, pyocr, io
infile = 'my_file.pdf'
tool = pyocr.get_available_tools()[0]
tool = tools[0]
req_image = []
txt = ''
# to convert pdf to img and extract text
with w_img(filename = infile, resolution = 200) as scan:
image_png = scan.convert('png')
for i in image_png.sequence:
img_page = w_img(image = i)
req_image.append(img_page.make_blob('png'))
for i in req_image:
content = tool.image_to_string(
p_img.open(io.BytesIO(i)),
lang = tool.get_available_languages()[0],
builder = pyocr.builders.TextBuilder()
)
txt += content
# to save the output as a .txt file
with open(infile[:-4] + '.txt', 'w') as outfile:
full_txt = regex.sub(r'\n+', '\n', txt)
outfile.write(full_txt)
UPDATE MAY 2021
I realized that although pdf2image is simply calling a subprocess, one doesn't have to save images to subsequently OCR them. What you can do is just simply (you can use pytesseract as OCR library as well)
from pdf2image import convert_from_path
for img in convert_from_path("some_pdf.pdf", 300):
txt = tool.image_to_string(img,
lang=lang,
builder=pyocr.builders.TextBuilder())
EDIT: you can also try and use pdftotext library
pdf2image is a simple wrapper around pdftoppm and pdftocairo. It internally does nothing more but calls subprocess. This script should do what you want, but you need a wand library as well as pyocr (I think this is a matter of preference, so feel free to use any library for text extraction you want).
from PIL import Image as Pimage, ImageDraw
from wand.image import Image as Wimage
import sys
import numpy as np
from io import BytesIO
import pyocr
import pyocr.builders
def _convert_pdf2jpg(in_file_path: str, resolution: int=300) -> Pimage:
"""
Convert PDF file to JPG
:param in_file_path: path of pdf file to convert
:param resolution: resolution with which to read the PDF file
:return: PIL Image
"""
with Wimage(filename=in_file_path, resolution=resolution).convert("jpg") as all_pages:
for page in all_pages.sequence:
with Wimage(page) as single_page_image:
# transform wand image to bytes in order to transform it into PIL image
yield Pimage.open(BytesIO(bytearray(single_page_image.make_blob(format="jpeg"))))
tools = pyocr.get_available_tools()
if len(tools) == 0:
print("No OCR tool found")
sys.exit(1)
# The tools are returned in the recommended order of usage
tool = tools[0]
print("Will use tool '%s'" % (tool.get_name()))
# Ex: Will use tool 'libtesseract'
langs = tool.get_available_languages()
print("Available languages: %s" % ", ".join(langs))
lang = langs[0]
print("Will use lang '%s'" % (lang))
# Ex: Will use lang 'fra'
# Note that languages are NOT sorted in any way. Please refer
# to the system locale settings for the default language
# to use.
for img in _convert_pdf2jpg("some_pdf.pdf"):
txt = tool.image_to_string(img,
lang=lang,
builder=pyocr.builders.TextBuilder())

Tesseract OCR fails on TIFF files

I have a multiple page .tif file, I am trying to extract text from it using Tesseract OCR but I am getting this error
TypeError: Unsupported image object
Code
from PIL import Image
import pytesseract
img = Image.open('Group 1/1_CHE_MDC_1.tif')
text = pytesseract.image_to_string(img.seek(0)) # OCR on 1st Page
text = ' '.join(text.split())
print(text)
ERROR
Any idea why its happening
Image.seek does not have a return value so you're essentially running:
pytesseract.image_to_string(None)
Instead do:
img.seek(0)
text = pytesseract.image_to_string(img)
I had a same question and i have tried below code and it worked for me :-
import glob
import pytesseract
import os
os.chdir("Set your Tesseract-OCR .exe file path")
b = ''
for i in glob.glob('Fullpath of your image directory/*.tif'): <-- you can give *.jpg extension in case of jpg image
if glob.glob('*.tif'):
b = b + (pytesseract.image_to_string(i))
print(b)
Happy learning !

Python Image Library can't convert to pdf

I have a function to convert a bmp to pdf with PILLOW, this script I have it in non compiled version and compiled version (.exe). In the first one it works correctly, but in the second PILLOW throws an exception ('PDF'). Specifically fails in the .save ()
Paths and filename with extension are correct.
from PIL import Image
def bmp2pdf(self, file):
''' Convert a bmp file to PDF file, and delete old bmp file '''
img = Image.open(file)
output = file.replace('.bmp', '.pdf')
try:
img.save(output, "PDF", resolution=100.0)
remove(file)
except Exception as err:
print(err)
In the compiled version the output is:
'PDF'
Thx.
Follow this code.It works. 3 line code.
from PIL import Image
def bmp2pdf(self,path):
img = Image.open(path)
img.save('image.pdf','pdf')
I got a file named image.pdf with the image in it.
I had to add in my setup to generate the .exe I should import PIL and not PIL.IMAGE, so the whole module is loaded and the pdf feature is available
I'm using cx_freeze:
'Packages': [
'PyQt5.uic',
"...",
'PIL',
]

Categories