Cache error when using pytesseract for OCR - python

I am trying to use pytesseract OCR to extract text from all the PDFs in a directory, but I am getting an error message that there is not enough space on my device.
I would like to delete each image from the cache after it is no longer required, as this user was advised to do, but I can't find anything in the pytesseract documentation explaining how to do this.
Here is my code:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
def extract_text_from_image(path):
pdfFile = wi(filename = path, resolution = 300)
image = pdfFile.convert('jpeg')
imageBlobs = []
for img in image.sequence:
imgPage = wi(image = img)
imageBlobs.append(imgPage.make_blob('jpeg'))
extract = []
for imgBlob in imageBlobs:
image = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(image, lang = 'eng')
extract.append(text)
return extract
Here is the error message:
CacheError: unable to extend cache 'C:/Users/b00kgrrl/AppData/Local/Temp/magick-11952ORBzkae3wXX_18': No space left on device # error/cache.c/OpenPixelCache/3889

I solved this myself using code found here and here:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import winshell
def extract_text_from_image(path):
pdfFile = wi(filename = path, resolution = 300)
image = pdfFile.convert('jpeg')
tempdir = r"C:\Users\b00kgrrl\AppData\Local\Temp"
cache = os.listdir( tempdir )
imageBlobs = []
for img in image.sequence:
imgPage = wi(image = img)
imageBlobs.append(imgPage.make_blob('jpeg'))
extract = []
for imgBlob in imageBlobs:
image = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(image, lang = 'eng')
extract.append(text)
for item in cache:
if item.endswith(".jpg") or item.startswith("magick-"):
os.remove( os.path.join( tempdir, item ) )
winshell.recycle_bin().empty(confirm=False, show_progress=False, sound=False)
return extract

Related

why PyTesseract detect 8 as 3 .?

I wrote a simple function in my app, which converts certain very simple images containing numbers into numbers . thats it ! . but if image has 8 its show's 3 . any help ? this is image example
also i tried convert my image also to sharpness . its not work perfectly for other numbers . how i can fix it ?
import tkinter
from tkinter import font
from pyasn1.type.univ import Null
import requests
from tkinter import *
import tkinter as tk
import sys
import os
from PIL import Image
from pytesseract import pytesseract
def resource_path(relative_path):
""" Get absolute path to resource, works for dev and for PyInstaller """
try:
# PyInstaller creates a temp folder and stores path in _MEIPASS
base_path = sys._MEIPASS
except Exception:
base_path = os.path.abspath(".")
return os.path.join(base_path, relative_path)
path_to_tesseract =resource_path(r"Tesseract-OCR\\tesseract.exe")
Startbalance_window = Tk()
#Setting title of screen
Startbalance_window.title("SS")
#setting height and width of screen
Startbalance_window.geometry("500x180")
Startbalance_window.configure(background=("#091c2d"))
path_to_tesseract =resource_path(r"Tesseract-OCR\\tesseract.exe")
pytesseract.tesseract_cmd = path_to_tesseract
def get_wallet():
urlentryGP_text = urlentry_GP.get()
pytesseract.tesseract_cmd = path_to_tesseract
imgURL_GP= Image.open(requests.get(urlentryGP_text, stream=True).raw)
response = requests.get(urlentryGP_text)
width, height = imgURL_GP.size # get image size
imgURL_GP = imgURL_GP.resize((800, int(800/(width/height))), Image.ANTIALIAS)
im_s_1 = imgURL_GP.convert("L") #black&white
custom_config = r'--oem 3 --psm 6 outputbase digits'
query = pytesseract.image_to_string(im_s_1, lang='eng+fra',nice=1,config=custom_config) # getting all text in the image
image_list2 = query.split()
OSBALANCE_image21 = image_list2[-1].replace(',',"") #getting last value and remove comma from it
OSBALANCE_image21 = OSBALANCE_image21.replace('#',"0")
OSBALANCE_image21 = OSBALANCE_image21.replace(';',"")
OSBALANCE_image21 = OSBALANCE_image21.replace(':',"")
OSBALANCE_image21 = OSBALANCE_image21.replace('/',"")
OSBALANCE_image21 = OSBALANCE_image21.replace('L',"")
OSBALANCE_image21 = OSBALANCE_image21.replace('J',"")
OSBALANCE_image21 = OSBALANCE_image21.replace('I',"")
print(OSBALANCE_image21)
sum_balance_text.set(f" Your Balance is: {OSBALANCE_image21} ")
sum_balance.place(x=265,y=120)
global urlentry_GP
urlentry_GP = StringVar()
URL_Entry_GP = Entry(Startbalance_window, textvariable=urlentry_GP).place(x=300,y=50)
submit_imagebutton= Button(Startbalance_window, text="Extract Balance", width=10, height=1, bg="orange",command=get_wallet,font='lato 8 bold',cursor="hand2")
submit_imagebutton.place(x=290,y=80, width=148,height=20)
global sum_balance_text
sum_balance_text = StringVar()
sum_balance=Label(Startbalance_window,textvariable=sum_balance_text,bg="#091c2d",fg="white",font=("default",9,"bold"))
Startbalance_window.resizable(0,0)
Startbalance_window.mainloop()
import pytesseract
from PIL import Image
pytesseract.pytesseract.tesseract_cmd = 'bin/tesseract.exe'
tessdata = 'tessdata'
filename = 'WENWg.png'
im = Image.open(filename)
width, height = im.size
scale_factor = 3
im_scaled = im.resize((width*scale_factor, height*scale_factor))
print(pytesseract.image_to_string(im_scaled, config=f'--psm 6 --tessdata-dir "{tessdata}"').replace('\n\f', ''))
Output:
Total quide price:
123,802,858

Cache error while doing OCR on a directory of pdf's in python

I am trying to OCR an entire directory of pdf files using pytesseract and imagemagick but the issue is that imagemagick is consuming all my Temp folder space and finally I'm getting a cache error i.e "CacheError: unable to extend cache 'C:/Users/Azu/AppData/Local/Temp/magick-18244WfgPyAToCsau11': No space left on device # error/cache.c/OpenPixelCache/3883" I have also written a code to delete the temp folder content once OCR'd but still facing the same issue.
Here's the code till now:
import io
import os
import glob
from PIL import Image
import pytesseract
from wand.image import Image as wi
files = glob.glob(r"D:\files\**")
tempdir = r"C:\Users\Azu\AppData\Local\Temp"
filesall = os.listdir(tempdir)
for file in files:
name = os.path.basename(file).split('.')[0]
#print(file)
pdf = wi(filename = file, resolution = 300)
pdfImg = pdf.convert('jpeg')
imgBlobs = []
for img in pdfImg.sequence:
page = wi(image = img)
imgBlobs.append(page.make_blob('jpeg'))
extracted_texts = []
for imgBlob in imgBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng')
extracted_texts.append(text)
with open("D:\\extracted_text\\"+ name + ".txt", 'w') as f:
f.write(str(extracted_texts))
for ifile in filesall:
if "magick" in ifile:
os.remove(os.path.join(tempdir,ifile))

how to add multiple pdfs to be converted into excel?

I have program which converts pdf to excel, Now i want add multiple inputs i.e. multiple pdfs to be converted one by one.
my code is below:
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import os
import cv2
import pandas as pd
import re
import numpy as np
import os
pdf = wi(filename= "pdfs/jaalna.pdf", resolution =300)
pdfImage = pdf.convert("jpg")
imageBlobs = []
for img in pdfImage.sequence:
imgPage = wi(image = img)
#img.filter(ImageFilter.EDGE_ENHANCE_MORE )
imageBlobs.append(imgPage.make_blob('jpg'))
recognized_text = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng1+mar1')
recognized_text.append(text)
newfile = open('aama.txt','w')
newfile.write(",".join(recognized_text))
#add a folder as input.
you can use loop
for name in ["pdfs/jaalna.pdf", "other/file.pdf"]:
pdf = wi(filename=name, resolution=300)
# rest of code
or you can use sys.argv to get names as
script.py pdfs/jaalna.pdf other/file.pdf other/third.pdf
and code
import sys
for name in sys.argv[1:]:
pdf = wi(filename=name, resolution=300)
# rest of code
Try the code below. This will loop through every PDF file in the folder directory you define. Be sure to update your file_path to be where your PDFs are saved, making sure you use double backslashs in place of single backslashes.
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import cv2
import pandas as pd
import re
import numpy as np
import os
file_path = "C:\\Users\\..."
for file in os.listdir(file_path):
if file.endswith(".pdf"):
pdf = wi(file, resolution =300)
pdfImage = pdf.convert("jpg")
imageBlobs = []
for img in pdfImage.sequence:
imgPage = wi(image = img)
#img.filter(ImageFilter.EDGE_ENHANCE_MORE )
imageBlobs.append(imgPage.make_blob('jpg'))
recognized_text = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng1+mar1')
recognized_text.append(text)
newfile = open(file+'.txt','w')
newfile.write(",".join(recognized_text))
#add a folder as input.

Running OCR Python

I am trying to make some OCR with Python. I found this code on the internet, which did what I want to. But when I try to run it, I receive this error message.
Leave my code Here:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
pdf = wi(filename="test1(citibank).pdf", resolution=300)
pdfImage = pdf.convert('jpeg')
imageBlobs = []
for img in pdfImage.sequence:
imgPage=wi(image=img)
imageBlobs.append(imgPage.make_blob('jpeg'))
recognisedtext = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang='es')
recognisedtext.append(text)
print(recognisedtext[1])

"!empty() in function 'cv::CascadeClassifier::detectMultiScale'"

I am trying to work on a Face Recognition system in Python OpenCV but I keep getting the following error
"!empty() in function 'cv::CascadeClassifier::detectMultiScale'"
This is the code that I'm using:
import cv2
import os
import numpy as np
from PIL import Image
import pickle
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
image_dir = os.path.join(BASE_DIR, "foto")
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_alt2.xml')
recognizer = cv2.face.LBPHFaceRecognizer_create()
current_id = 0
label_ids = {}
y_labels = []
x_train = []
for root, dirs, files in os.walk(image_dir):
for file in files:
if file.endswith("png") or file.endswith("jpg"):
path = os.path.join(root, file)
label = os.path.basename(root).replace(" ", "-").lower()
#print(label, path)
if not label in label_ids:
label_ids[label] = current_id
current_id += 1
id_ = label_ids[label]
#print(label_ids)
#y_labels.append(label) # some number
#x_train.append(path) # verify this image, turn into a NUMPY
arrray, GRAY
pil_image = Image.open(path).convert("L") # grayscale
size = (550, 550)
final_image = pil_image.resize(size, Image.ANTIALIAS)
image_array = np.array(final_image, "uint8")
#print(image_array)
faces = face_cascade.detectMultiScale(image_array, scaleFactor=1.5, minNeighbors=5)
for (x,y,w,h) in faces:
roi = image_array[y:y+h, x:x+w]
x_train.append(roi)
y_labels.append(id_)
#print(y_labels)
#print(x_train)
with open("pickles/face-labels.pickle", 'wb') as f:
pickle.dump(label_ids, f)
recognizer.train(x_train, np.array(y_labels))
recognizer.save("recognizers/face-trainner.yml")
What am I doing wrong?
You need to put the full path to the file.
Example:
face_cascade = cv2.CascadeClassifier('C:\\working_Dir\\data\\codes\\OpenCV\\classifiers\\haarcascade_frontalface_alt2.xml')
You can download these codes from the github Repo here : Face Detection with Python using OpenCV
I had the same issue, you need to add double slashes instead of single ones.
git clone https://github.com/opencv/opencv.git
faceCascade = cv2.CascadeClassifier('opencv/data/haarcascades/haarcascade_frontalface_default.xml')
you have to give full path of your haarcascade_frontalface_alt2.xml file
like this:-"C:\Python39\Lib\site-packages\cv2\data\haarcascade_frontalface_alt2.xml"
Xml file is missing.
Try to give full path directly like this.
face_cascade = cv2.CascadeClassifier('C:\opencv\sources\data\haarcascades\haarcascade_frontalface_default.xml')
more importantly the file should be in C Directory

Categories