OCR output save to one text file automatically to folder - python

I am doing OCR on image.
from PIL import Image
import pytesseract
from pytesseract import image_to_string
# Load the image to run OCR on.
img1=Image.open('my.png')
# The recognised text is only printed here; nothing is written to disk.
print(image_to_string(img1))
How can I save the extracted text into a text file called "Output.txt"?

I found the easy way to save OCR output into text file
import os


def ocr(file_to_ocr):
    """Return the text Tesseract recognises in the image at *file_to_ocr*."""
    # The context manager closes the underlying image file once OCR is done.
    with Image.open(file_to_ocr) as im:
        return pytesseract.image_to_string(im)


directory = os.path.join("Your_path")
for root, dirs, files in os.walk(directory):
    for file in files:
        if file.endswith(".jpg"):
            # os.walk yields bare file names; join with `root` so the image
            # is found regardless of the current working directory.
            txt = ocr(os.path.join(root, file))
            # splitext drops only the extension; os.path.join picks the
            # right separator instead of a hard-coded backslash.
            pre_fix = os.path.splitext(file)[0]
            out_path = os.path.join(directory, pre_fix + ".txt")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(str(txt))

You could create a unique folder name using uuid, and then write the output.txt to it like so?:
from uuid import uuid4
import os

# A random UUID gives a folder name that will not clash with earlier runs.
folder_name = str(uuid4())
os.makedirs(folder_name)
# image_to_string returns str, so the file must be opened in text mode;
# writing str to a file opened with 'wb' raises TypeError on Python 3.
with open(os.path.join('.', folder_name, 'output.txt'), 'w', encoding='utf-8') as f:
    f.write(image_to_string(img1))

Related

Python: pdf2image doesn't write .jpg - no error message

I'm working on a python script that checks the .pdf files in a directory, creates a new directory for each file, converts the .pdf into images, and writes the images as jpg into the new directory. I'm using pdf2image and have the following code:
import os
#import main
import glob
#import cv2
import matplotlib.pyplot as plt
from pdf2image import convert_from_path
from PIL import Image
# Folder that is scanned for PDF files; one sub-folder per PDF is created in it.
path = "C:/Users/d/Desktop/Reis/"
for file in glob.iglob(path + "*.pdf"):
    print(file)
    name = os.path.basename(file)
    # Text before the first dot of the file name.
    filename = name.split(".")[0]
    print(filename)
    # NOTE(review): os.mkdir returns None, so `images` is None, not the new folder's path.
    images = os.mkdir(path + filename)
    # NOTE(review): the PDF path below is hard-coded; the loop variable `file` is not used here.
    pages = convert_from_path("C:/Users/d/Desktop/Reis/Reis_Wasser_Verhaeltnis.pdf",
                              350,
                              poppler_path=r'C:/Program Files/poppler-22.04.0/Library/bin',
                              output_folder=images)
    # The file names are relative, so the JPEGs are saved relative to the
    # current working directory rather than into the folder created above.
    for i in range(len(pages)):
        pages[i].save('page' + str(i) + '.jpg', 'JPEG')
When I run my code I don't get an error message, but I don't get any images either. Does anyone have an idea what I'm overlooking?
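os.mkdir creates the folder, but it returns None rather than the path of the folder it created. Thus:
images = os.mkdir(path + filename)
leaves images set to None, which cannot be used as the output folder. As a result, my script wrote the images into the default project directory (the current working directory). Build the folder path first, create it with os.mkdir, and pass that path as output_folder.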

how to get the file name from directory in python

I want to get the name of a file in my directory so that I can use it dynamically in my ML model.
import os
# os.listdir returns bare entry names (no directory part); take the first one.
i = os.listdir('./upload')[0]
print(i)
# ...read data from file `i`...
The problem with your approach is that os.listdir returns file names not file paths. So you need to add the directory name to your filepath:
# Prefix the bare name with its directory so the path can actually be opened.
cv2.imread(os.path.join('upload', i))
Or you can use glob which returns paths:
import glob
import cv2
# glob.glob returns paths that already include the './upload/' prefix.
for image_path in glob.glob('./upload/*.jpg'):
    image = cv2.imread(image_path)

how to open multiple files in pdfplumber?

I have multiple PDF files created with Access DB forms. The only way I can extract text from them is using pdfplumber. Here is my code and it works perfectly for just 1 file.
import pdfplumber
# Open a single PDF and read the text of its first page.
with pdfplumber.open('CS_page_1.pdf') as pdf:
    page = pdf.pages[0]
    string = page.extract_text()
    # Characters 43..47 of the page text are taken as the new file name.
    file_name = string[43:48]
    print(file_name)
I need to use this extracted string to rename this file and the 100 other files in the folder.
What would be the best way to do it?
Would first build a list of all the pdfs in your folder using glob (https://docs.python.org/3/library/glob.html).
Then iterate through each of them- pdfplumb them to obtain the desired string (which you want to rename the file to)- and then rename each individually (https://www.tutorialspoint.com/python/os_rename.htm). Something like this:
import glob
import pdfplumber
import os
# Collect every PDF in the folder, then rename each one after the text
# found at a fixed position on its first page.
arr_of_files = glob.glob("/path/to/pdfs/*.pdf")
for file in arr_of_files:
    with pdfplumber.open(file) as pdf:
        page = pdf.pages[0]
        string = page.extract_text()
    # Slice outside the `with` block so the PDF is closed before renaming;
    # Windows refuses to rename a file that is still open.
    file_name = (string or "")[43:48]
    if not file_name:
        # No usable text at that position: leave this file untouched.
        continue
    # Keep the file in its own folder and keep the .pdf extension; a bare
    # `file_name` would move it into the current working directory.
    new_path = os.path.join(os.path.dirname(file), file_name + ".pdf")
    os.rename(file, new_path)
import pdfplumber
import glob
from tqdm.auto import tqdm
# tqdm wraps the list of matches to show a progress bar while iterating.
# Raw string: backslashes in a Windows path must not be read as escapes,
# and the `*` wildcard is needed for the pattern to match every PDF.
for current_pdf_file in tqdm(glob.glob(r"<pathname>\*.pdf")):
    with pdfplumber.open(current_pdf_file) as my_pdf:
        # do other things here?
        pass  # placeholder so the `with` block is syntactically valid

Generate .txt files from pdf files keeping the name same as in pdf using python

I have a directory containing pdf files. I have written code that performs OCR when you pass a filename to an object of the wand.image class. What I want to do now is loop over the directory of pdf files, generate an OCR'd txt file for each pdf, and save it to some directory. The code that I have written so far is as follows:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
# Render the PDF at 300 dpi and convert its pages to JPEG.
pdf = wi(filename=r"D:\files\aba7d525-04b8-4474-a40d-e94f9656ed42.pdf", resolution=300)
pdfImg = pdf.convert('jpeg')

# One JPEG blob per page of the converted document.
imgBlobs = [wi(image=page_img).make_blob('jpeg') for page_img in pdfImg.sequence]

# Run Tesseract on each page blob, keeping the page order.
extracted_text = [
    pytesseract.image_to_string(Image.open(io.BytesIO(blob)), lang='eng')
    for blob in imgBlobs
]

# Show the text of the first page only.
print(extracted_text[0])
The thing is, as you can see in my code ("pdf = .."), I have hardcoded a filename, but I need to pass a directory there so that all the files in that directory can be OCR'd. I also need the output files to keep the same filenames as the inputs, with just .pdf replaced by .txt. How can I do that?
You can use glob
Example:
import os
import glob
from wand.image import Image as wi
# Raw strings keep the Windows backslashes literal: in a normal string "\f"
# is a form feed, and a trailing "\" would escape the closing quote.
files = glob.glob(r"D:\files\*")
for file in files:
    pdf = wi(filename = file, resolution = 300)
    # write your code
    # File name without directory or extension, reused for the .txt output.
    name = os.path.splitext(os.path.basename(file))[0]
    with open(os.path.join(r"D:\extracted_files", name + ".txt"), 'w') as f:
        # `extracted_text` is the list of per-page strings built by the OCR
        # code above; f.write needs a single str, so join the pages.
        f.write("\n".join(extracted_text))

Create a new .txt file for each .pdf files in a directory in python

My code is supposed to take each pdf from a directory, OCR it and return a .txt file for each OCR'd pdf. The name of the pdf and the .txt file should be the same except .pdf being changed to .txt. I am stuck in the part of splitting the input pdf name to generate the same name with a .txt extension for a OCR'd file. A sample file in the directory looks like this : "000dbf9d-d53f-465f-a7ce-722722136fb7465.pdf". I need output as "000dbf9d-d53f-465f-a7ce-722722136fb7465.txt". Also my code doesn't create new .txt files but overwrites on one file for each iteration. I need a new .txt file for each OCR'd .pdf files. Code till now:
import io
import glob
from PIL import Image
import pytesseract
from wand.image import Image as wi
# Every entry matched under D:\files is treated as a PDF to OCR.
files = glob.glob(r"D:\files\**")
for file in files:
    #print(file)
    # Render the PDF at 300 dpi and convert its pages to JPEG.
    pdf = wi(filename = file, resolution = 300)
    pdfImg = pdf.convert('jpeg')
    # One JPEG blob per page.
    imgBlobs = []
    for img in pdfImg.sequence:
        page = wi(image = img)
        imgBlobs.append(page.make_blob('jpeg'))
    # OCR text of each page, in page order.
    extracted_texts = []
    for imgBlob in imgBlobs:
        im = Image.open(io.BytesIO(imgBlob))
        text = pytesseract.image_to_string(im, lang = 'eng')
        extracted_texts.append(text)
    # NOTE(review): the output name is fixed, so each iteration overwrites the same file.
    with open("D:\\extracted_text\\"+ "\\file1.txt", 'w') as f:
        # str() of a list writes its repr (brackets, quotes, escaped newlines).
        f.write(str(extracted_texts))
You just need to keep track of your file name and re-use it in the last two lines:
# ...
import os
files = glob.glob(r"D:\files\**")
for file in files:
    #print(file)
    # Get the name of the file less its extension; splitext removes only the
    # final suffix, so names that contain other dots are not truncated.
    name = os.path.splitext(os.path.basename(file))[0]
    # ...
    # Use `name` from above to name your text file
    with open("D:\\extracted_text\\" + name + ".txt", 'w', encoding="utf-8") as f:
        # Join the per-page strings; str(list) would write the list's repr
        # (brackets, quotes, escaped newlines) instead of the text itself.
        f.write("\n".join(extracted_texts))

Categories