how to add multiple pdfs to be converted into excel?

how to add multiple pdfs to be converted into excel? - python

I have a program which converts a PDF to Excel. Now I want to add multiple inputs, i.e. multiple PDFs to be converted one by one.
my code is below:
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import os
import cv2
import pandas as pd
import re
import numpy as np
import os
# Open the PDF with wand at resolution=300 and convert its pages to JPEG.
pdf = wi(filename="pdfs/jaalna.pdf", resolution=300)
pdfImage = pdf.convert("jpg")

# Serialise every page of the converted document into an in-memory JPEG blob.
imageBlobs = []
for img in pdfImage.sequence:
    imgPage = wi(image=img)
    #img.filter(ImageFilter.EDGE_ENHANCE_MORE )
    imageBlobs.append(imgPage.make_blob('jpg'))

# Run OCR on each page blob and collect the recognised text, one entry per page.
recognized_text = []
for imgBlob in imageBlobs:
    im = Image.open(io.BytesIO(imgBlob))
    text = pytesseract.image_to_string(im, lang='eng1+mar1')
    recognized_text.append(text)

# Write all pages, comma-separated, to a single text file. The `with` block
# guarantees the file is flushed and closed (the handle was previously leaked).
# UTF-8 is requested explicitly because the OCR output is presumably not
# limited to ASCII (see the 'mar1' language model) and the platform default
# encoding may not be able to represent it.
with open('aama.txt', 'w', encoding='utf-8') as newfile:
    newfile.write(",".join(recognized_text))
#add a folder as input.

you can use loop
# Run the same conversion once for each PDF path listed here.
pdf_paths = ("pdfs/jaalna.pdf", "other/file.pdf")
for name in pdf_paths:
    pdf = wi(filename=name, resolution=300)
    # rest of code
or you can use sys.argv to get names as
script.py pdfs/jaalna.pdf other/file.pdf other/third.pdf
and code
import sys
# Everything after the script name on the command line is taken as a PDF path.
pdf_args = sys.argv[1:]
for name in pdf_args:
    pdf = wi(filename=name, resolution=300)
    # rest of code

Try the code below. This will loop through every PDF file in the folder you define. Be sure to update your file_path to point to where your PDFs are saved, making sure you use double backslashes in place of single backslashes.
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import cv2
import pandas as pd
import re
import numpy as np
import os
# Folder that holds the PDFs to convert; change this to your own location.
file_path = "C:\\Users\\..."

for file in os.listdir(file_path):
    # Only PDF files are converted; everything else in the folder is skipped.
    if not file.endswith(".pdf"):
        continue

    # os.listdir() returns bare names, so the folder must be joined back on.
    # The path also has to be passed as `filename=`: wand's first positional
    # parameter is `image`, not a file name.
    pdf = wi(filename=os.path.join(file_path, file), resolution=300)
    pdfImage = pdf.convert("jpg")

    # Serialise every page of this PDF into an in-memory JPEG blob.
    imageBlobs = []
    for img in pdfImage.sequence:
        imgPage = wi(image=img)
        #img.filter(ImageFilter.EDGE_ENHANCE_MORE )
        imageBlobs.append(imgPage.make_blob('jpg'))

    # OCR each page blob and collect the text, one entry per page.
    recognized_text = []
    for imgBlob in imageBlobs:
        im = Image.open(io.BytesIO(imgBlob))
        text = pytesseract.image_to_string(im, lang='eng1+mar1')
        recognized_text.append(text)

    # One output file per PDF, named "<pdf name>.txt" in the current working
    # directory. `with` closes the file after each PDF instead of leaking one
    # handle per iteration; UTF-8 is requested explicitly because the OCR
    # output is presumably not limited to ASCII (see the 'mar1' model).
    with open(file + '.txt', 'w', encoding='utf-8') as newfile:
        newfile.write(",".join(recognized_text))
#add a folder as input.

Related

Image does not exist

I want to print images in word so that I can print them easier, same size, etc.
I use this piece of code:
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
import os
# Render the Word template once per entry in the image folder and save the
# result next to the image as "<image name>.docx".
folder = "./mercedes/"
template_path = "C:\\Users\\jpp08\\Documents\\MIDDELKOOP\\testimg.docx"
for img_name in os.listdir(folder):
    doc = DocxTemplate(template_path)
    # NOTE(review): img_name is the bare name returned by os.listdir(), not a
    # path inside `folder` - presumably the reason for the reported
    # "does not exist" error; confirm.
    img1 = InlineImage(doc, image_descriptor=f'{img_name}', width=Mm(20), height=Mm(10))
    doc.render({'img1': img1})
    doc.save(f"{folder}/{img_name}" + ".docx")
But I got the error that "name of picture".jpg does not exist. But it's the right name and right location.

How could I pass my file name through this method?

I am taking a screenshot, and then I need to reference the shot I just took so I can translate what's inside it.
When I directly pass a file location, e.g "filex.png", to readtext, it works, but I just need it to pass the written file into it.
import easyocr
import pyautogui
import time
import cv2
import numpy as np
reader = easyocr.Reader(['en'])
tag = 1
for i in range(2):
    time.sleep(4)
    # Capture a fixed screen region and convert it from RGB to BGR.
    shot = pyautogui.screenshot(region=(630,400,650,130))
    image = cv2.cvtColor(np.array(shot), cv2.COLOR_RGB2BGR)
    tag += 1
    # NOTE(review): `img` holds whatever cv2.imwrite returns, not the file
    # name that was just written - this is the behaviour the question asks
    # about, so it is kept as posted.
    img = cv2.imwrite(f"image{tag}.png", image)
    results = reader.readtext(img)
    # Concatenate the second field of every result, each followed by a space.
    text = "".join(result[1] + " " for result in results)
    print(text)

In answer to your specific question, I think you're looking for something like:
import easyocr
import pyautogui
import time
import cv2
import numpy as np
reader = easyocr.Reader(['en'])
tag = 1
for i in range(2):
    time.sleep(4)
    # Capture a fixed screen region and convert it from RGB to BGR.
    shot = np.array(pyautogui.screenshot(region=(630, 400, 650, 130)))
    image = cv2.cvtColor(shot, cv2.COLOR_RGB2BGR)
    tag += 1
    # Keep the file name in a variable so the same name is used for writing
    # the image and for reading it back.
    f_name = f"image{tag}.png"
    cv2.imwrite(f_name, image)
    results = reader.readtext(f_name)
    # Concatenate the second field of every result, each followed by a space.
    text = "".join(result[1] + " " for result in results)
    print(text)
You can just store your file name in a variable and pass it to both imwrite and readtext.
There are other options as well, depending on what information you need access to within the program, and how quickly you need to process your data.
Option: Pass the np.array directly to readtext
# Screenshot the region, convert it to a BGR array and hand that array
# straight to readtext - no file is written in this variant.
shot = pyautogui.screenshot(region=(630, 400, 650, 130))
image = cv2.cvtColor(np.array(shot), cv2.COLOR_RGB2BGR)
results = reader.readtext(image)
Option: Pass the data from the written file to the readtext function.
f_name = f"image{tag}.png"
cv2.imwrite(f_name, image)
# Read the written file back as raw bytes and pass those bytes to readtext.
with open(f_name, 'rb') as f:
    png_bytes = f.read()
results = reader.readtext(png_bytes)

import pyautogui
import easyocr
import numpy as np
# CPU-only reader for English text.
reader = easyocr.Reader(['en'], gpu=False)
# Capture the screen region (the closing parenthesis of this call was missing,
# which made the snippet a SyntaxError).
im = pyautogui.screenshot(region=(630, 400, 650, 130))
# Pass the screenshot to readtext as a NumPy array; detail=0 is forwarded to
# readtext unchanged.
result = reader.readtext(np.array(im), detail=0)
just pass the pyautogui image as np.array

PIL Image Open is not able to open some files in a zip folder

I have about 300000 image files in a zip folder. Some of those files have paths starting with '__'. The PIL function Image.open() is not able to open these files. Please suggest a way to open them. My code is below:
import pandas as pd
import numpy as np
from zipfile import ZipFile
from io import BytesIO
from PIL import Image
from PIL import UnidentifiedImageError
problem_files = []   # archive paths that PIL could not identify as images
file_paths = []      # archive paths whose name contains '.png'
img_list = []        # one (1, N) uint8 array per successfully loaded image
img_size = (128, 128)

with ZipFile('/XXX/YYY/ZZZ/AI_ML/Project2/words.zip') as myzip:
    contents = myzip.namelist()
    # Consider every archive entry; the previous range(0, len(contents)-1)
    # silently skipped the last one.
    file_paths.extend(name for name in contents if '.png' in str(name))

    for path in file_paths:
        img = myzip.read(path)
        try:
            img_data = Image.open(BytesIO(img))
        except UnidentifiedImageError:
            problem_files.append(path)
            # Skip to the next file. Without this, the code below would reuse
            # img_data from an earlier iteration (appending a duplicate image)
            # or fail with NameError if no image had been opened yet.
            continue
        # Convert to mode 'L', resize to img_size and flatten to a single row.
        img_data = img_data.convert('L')
        img_data = img_data.resize(img_size)
        image_as_array = np.array(img_data, np.uint8)
        image_as_array = np.reshape(image_as_array, (1, -1))
        img_list.append(image_as_array)
This puts all the files with path starting with '__' into problem_files list
problem_files[-10:]
['__MACOSX/words/j04/j04-070/._j04-070-08-07.png',
'__MACOSX/words/j04/j04-070/._j04-070-04-07.png',
'__MACOSX/words/j04/j04-070/._j04-070-04-06.png',
'__MACOSX/words/j04/j04-070/._j04-070-08-06.png',
'__MACOSX/words/j04/j04-070/._j04-070-06-03.png',
'__MACOSX/words/j04/j04-070/._j04-070-06-01.png',
'__MACOSX/words/j04/j04-070/._j04-070-08-04.png',
'__MACOSX/words/j04/j04-070/._j04-070-04-04.png',
'__MACOSX/words/j04/j04-070/._j04-070-04-05.png',
'__MACOSX/words/j04/j04-070/._j04-070-08-05.png']
There are about 100000 images in problem_files list

when i run the below code to convert pdf image to jpg policy error popped up

import os
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import gc
# Open the PDF with wand at resolution=300. In the asker's environment this
# call fails with the PolicyError shown below.
pdfim=wi(filename="salem-father.pdf",resolution=300)
PolicyError: not authorized `salem-father.pdf' #
error/constitute.c/ReadImage/412

You can extract the images from a PDF file and save them as JPG or in their own format using the code below.
requirements.txt :
PyMuPDF==1.16.5
python-dateutil==2.8.0
pytz==2019.3
six==1.12.0
code:
import fitz
import random, string
pdf_path = "mypdf.pdf"  # path to pdf file
doc = fitz.open(pdf_path)

# The page index is hard-coded; change 4 to the page you want.
page = doc.loadPage(4)
page_dict = page.getText('dict')  # dict format of the page

# Keep only the blocks whose "type" is 1; these are treated as image blocks.
image_blocks = [block for block in page_dict["blocks"] if block["type"] == 1]

# Random 16-character prefix (letters and digits) shared by all output files.
alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits
prefix = ''.join(random.choice(alphabet) for _ in range(16))

for index, img in enumerate(image_blocks):
    # Each image is written twice with identical bytes: once under its own
    # extension and once under a ".jpg" name.
    output_names = (
        "%s-%s.%s" % (prefix, index, img['ext']),  # png
        "%s-%s.jpg" % (prefix, index),  # jpg
    )
    for output_name in output_names:
        with open(output_name, 'wb') as f:
            f.write(img['image'])

Cache error while doing OCR on a directory of pdf's in python

I am trying to OCR an entire directory of pdf files using pytesseract and imagemagick but the issue is that imagemagick is consuming all my Temp folder space and finally I'm getting a cache error i.e "CacheError: unable to extend cache 'C:/Users/Azu/AppData/Local/Temp/magick-18244WfgPyAToCsau11': No space left on device # error/cache.c/OpenPixelCache/3883" I have also written a code to delete the temp folder content once OCR'd but still facing the same issue.
Here's the code till now:
import io
import os
import glob
from PIL import Image
import pytesseract
from wand.image import Image as wi
# OCR every path matched by the glob pattern and write the extracted text of
# each one to D:\extracted_text\<name>.txt, then try to remove ImageMagick
# temp files.
files = glob.glob(r"D:\files\**")
tempdir = r"C:\Users\Azu\AppData\Local\Temp"
# Snapshot of the temp directory, taken once before any PDF is processed.
# NOTE(review): files created in tempdir after this line are not in the list,
# so the cleanup loop at the bottom presumably never sees them - confirm.
filesall = os.listdir(tempdir)
for file in files:
# Output name: the part of the file name before its first '.'.
name = os.path.basename(file).split('.')[0]
#print(file)
# NOTE(review): pdf, pdfImg and page are never closed explicitly - possibly
# why the magick-* cache files keep accumulating; confirm.
pdf = wi(filename = file, resolution = 300)
pdfImg = pdf.convert('jpeg')
# One in-memory JPEG blob per page.
imgBlobs = []
for img in pdfImg.sequence:
page = wi(image = img)
imgBlobs.append(page.make_blob('jpeg'))
# OCR text, one entry per page blob.
extracted_texts = []
for imgBlob in imgBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng')
extracted_texts.append(text)
# Writes the str() of the whole list (brackets and quotes included), not the
# joined page texts.
with open("D:\\extracted_text\\"+ name + ".txt", 'w') as f:
f.write(str(extracted_texts))
# Delete every entry from the earlier snapshot whose name contains "magick".
for ifile in filesall:
if "magick" in ifile:
os.remove(os.path.join(tempdir,ifile))

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

how to add multiple pdfs to be converted into excel? - python

Related

Image does not exist

How could I pass my file name through this method?

PIL Image Open is not able to open some files in a zip folder

when i run the below code to convert pdf image to jpg policy error popped up

Cache error while doing OCR on a directory of pdf's in python

Categories

Resources