Running OCR Python - python

I am trying to make some OCR with Python. I found this code on the internet, which did what I want to. But when I try to run it, I receive this error message.
Leave my code Here:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
pdf = wi(filename="test1(citibank).pdf", resolution=300)
pdfImage = pdf.convert('jpeg')
imageBlobs = []
for img in pdfImage.sequence:
imgPage=wi(image=img)
imageBlobs.append(imgPage.make_blob('jpeg'))
recognisedtext = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang='es')
recognisedtext.append(text)
print(recognisedtext[1])

Related

Python Image extraction sequence from pdf

I was trying to extract images from a pdf using PyMuPDF (fitz). My pdf has multiple images in a single page. I am maintaining a proper sequence number while saving my images. I saw that the images being extracted don't follow a proper sequence. Sometimes it is starting to extract from the bottom, sometimes from the top and so on. Is there a way to modify my code so that the extraction follow a proper sequence?
Given below is the code I am using :
import fitz
from PIL import Image
filename = "document.pdf"
doc = fitz.open(filename)
for i in range(len(doc)):
img_num = 0
p_no = 1
for img in doc.getPageImageList(i):
xref = img[0]
pix = fitz.Pixmap(doc, xref)
if pix.n - pix.alpha < 4:
img_num += 1
pix.writeImage("%s-%s.jpg" % (str(p_no),str(img_num)))
else:
img_num += 1
pix1 = fitz.Pixmap(fitz.csRGB, pix)
pix1.writeImage("%s-%s.jpg" % (str(p_no),str(img_num)))
pix1 = None
pix = None
p_no += 1
Given below is a sample page of the pdf
I have the same problem I've used the following code:
import fitz
import io
from PIL import Image
file = "file_path"
pdf_file = fitz.open(file)
for page_index in range(len(pdf_file)):
# get the page itself
page = pdf_file[page_index]
image_list = page.getImageList()
# printing number of images found in this page
if image_list:
print(f"[+] Found {len(image_list)} images in page {page_index}")
else:
print("[!] No images found on the given pdf page", page_index)
for image_index, img in enumerate(page.getImageList(), start=1):
print(img)
print(image_index)
# get the XREF of the image
xref = img[0]
# extract the image bytes
base_image = pdf_file.extractImage(xref)
image_bytes = base_image["image"]
# get the image extension
image_ext = base_image["ext"]
# load it to PIL
image = Image.open(io.BytesIO(image_bytes))
# save it to local disk
image.save(open(f"image{page_index+1}_{image_index}.{image_ext}", "wb"))
The most probable way is to locate the 'img' var and order them.
I'd love to hear any further sggestions or if you found better idea/solution.

Cache error when using pytesseract for OCR

I am trying to use pytesseract OCR to extract text from all the PDFs in a directory, but I am getting an error message that there is not enough space on my device.
I would like to delete each image from the cache after it is no longer required, as this user was advised to do, but I can't find anything in the pytesseract documentation explaining how to do this.
Here is my code:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
def extract_text_from_image(path):
pdfFile = wi(filename = path, resolution = 300)
image = pdfFile.convert('jpeg')
imageBlobs = []
for img in image.sequence:
imgPage = wi(image = img)
imageBlobs.append(imgPage.make_blob('jpeg'))
extract = []
for imgBlob in imageBlobs:
image = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(image, lang = 'eng')
extract.append(text)
return extract
Here is the error message:
CacheError: unable to extend cache 'C:/Users/b00kgrrl/AppData/Local/Temp/magick-11952ORBzkae3wXX_18': No space left on device # error/cache.c/OpenPixelCache/3889
I solved this myself using code found here and here:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import winshell
def extract_text_from_image(path):
pdfFile = wi(filename = path, resolution = 300)
image = pdfFile.convert('jpeg')
tempdir = r"C:\Users\b00kgrrl\AppData\Local\Temp"
cache = os.listdir( tempdir )
imageBlobs = []
for img in image.sequence:
imgPage = wi(image = img)
imageBlobs.append(imgPage.make_blob('jpeg'))
extract = []
for imgBlob in imageBlobs:
image = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(image, lang = 'eng')
extract.append(text)
for item in cache:
if item.endswith(".jpg") or item.startswith("magick-"):
os.remove( os.path.join( tempdir, item ) )
winshell.recycle_bin().empty(confirm=False, show_progress=False, sound=False)
return extract

Cache error while doing OCR on a directory of pdf's in python

I am trying to OCR an entire directory of pdf files using pytesseract and imagemagick but the issue is that imagemagick is consuming all my Temp folder space and finally I'm getting a cache error i.e "CacheError: unable to extend cache 'C:/Users/Azu/AppData/Local/Temp/magick-18244WfgPyAToCsau11': No space left on device # error/cache.c/OpenPixelCache/3883" I have also written a code to delete the temp folder content once OCR'd but still facing the same issue.
Here's the code till now:
import io
import os
import glob
from PIL import Image
import pytesseract
from wand.image import Image as wi
files = glob.glob(r"D:\files\**")
tempdir = r"C:\Users\Azu\AppData\Local\Temp"
filesall = os.listdir(tempdir)
for file in files:
name = os.path.basename(file).split('.')[0]
#print(file)
pdf = wi(filename = file, resolution = 300)
pdfImg = pdf.convert('jpeg')
imgBlobs = []
for img in pdfImg.sequence:
page = wi(image = img)
imgBlobs.append(page.make_blob('jpeg'))
extracted_texts = []
for imgBlob in imgBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng')
extracted_texts.append(text)
with open("D:\\extracted_text\\"+ name + ".txt", 'w') as f:
f.write(str(extracted_texts))
for ifile in filesall:
if "magick" in ifile:
os.remove(os.path.join(tempdir,ifile))

how to add multiple pdfs to be converted into excel?

I have program which converts pdf to excel, Now i want add multiple inputs i.e. multiple pdfs to be converted one by one.
my code is below:
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import os
import cv2
import pandas as pd
import re
import numpy as np
import os
pdf = wi(filename= "pdfs/jaalna.pdf", resolution =300)
pdfImage = pdf.convert("jpg")
imageBlobs = []
for img in pdfImage.sequence:
imgPage = wi(image = img)
#img.filter(ImageFilter.EDGE_ENHANCE_MORE )
imageBlobs.append(imgPage.make_blob('jpg'))
recognized_text = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng1+mar1')
recognized_text.append(text)
newfile = open('aama.txt','w')
newfile.write(",".join(recognized_text))
#add a folder as input.
you can use loop
for name in ["pdfs/jaalna.pdf", "other/file.pdf"]:
pdf = wi(filename=name, resolution=300)
# rest of code
or you can use sys.argv to get names as
script.py pdfs/jaalna.pdf other/file.pdf other/third.pdf
and code
import sys
for name in sys.argv[1:]:
pdf = wi(filename=name, resolution=300)
# rest of code
Try the code below. This will loop through every PDF file in the folder directory you define. Be sure to update your file_path to be where your PDFs are saved, making sure you use double backslashs in place of single backslashes.
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import cv2
import pandas as pd
import re
import numpy as np
import os
file_path = "C:\\Users\\..."
for file in os.listdir(file_path):
if file.endswith(".pdf"):
pdf = wi(file, resolution =300)
pdfImage = pdf.convert("jpg")
imageBlobs = []
for img in pdfImage.sequence:
imgPage = wi(image = img)
#img.filter(ImageFilter.EDGE_ENHANCE_MORE )
imageBlobs.append(imgPage.make_blob('jpg'))
recognized_text = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng1+mar1')
recognized_text.append(text)
newfile = open(file+'.txt','w')
newfile.write(",".join(recognized_text))
#add a folder as input.

How to get text from image using pytesseract in images

I need to use pytesseract to extract text from this picture. This is possible?
ex.images:
Code:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
im = Image.open("4.jpg") # the second one
im = im.filter(ImageFilter.MedianFilter())
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(2)
im = im.convert('1')
im.save('4temp.jpg')
text = pytesseract.image_to_string(Image.open('4temp.jpg'))
print(text)
Decoded text:
1 - EN
2- f-ybizc
3- CALE NGS
4- [BLANK]
Thanks!

Categories