I have Optical Character Recognition (OCR) project. I am generating an API using Django Framework. The API should look like as below:
{
"id": 1,
"title": "PDF Title",
"input": "input.pdf",
"output": "output.pdf"
}
My models.py file as below:
from django.db import models
from .create_pdf_output import *
# Create your models here.
class Document(models.Model):
title = models.CharField(max_length=255)
pdf_input = models.FileField(upload_to='documents/inputs', max_length=200, blank=False)
pdf_output = models.FileField(upload_to='documents/outputs', max_length=200, blank=True)
def save(self):
self.pdf_output = create_pdf_output(self.pdf_input)
super(Document, self).save()
def __str__(self):
return self.title
In models.py file I call create_pdf_file.py which should perform OCR operations on pdf input
create_pdf_output.py
import shutil
import cv2
import numpy as np
import img2pdf
from pdf2image import convert_from_path
import os
def create_pdf_output(pdf_input):
pdf = str(pdf_input)
pdf_name = os.path.splitext(pdf)[0] + "_out.pdf"
pages = convert_from_path(pdf_input, 500)
if not os.path.exists('images'):
os.mkdir('images')
for ind, page in enumerate(pages):
page.save(f'images/out{ind}.jpg', 'JPEG')
for ind, img in enumerate(os.listdir('images')):
img_rgb = cv2.imread(os.path.join('images', img))
img_gray = cv2.cvtColor(img_rgb, cv2.COLOR_BGR2GRAY)
template = cv2.imread('templates.jpg', 0)
w, h = template.shape[::-1]
res = cv2.matchTemplate(img_gray, template, cv2.TM_CCOEFF_NORMED)
threshold = 0.8
loc = np.where(res >= threshold)
for pt in zip(*loc[::-1]):
cropped_image = img_rgb[pt[1]:pt[1]+h, pt[0]:pt[0]+ 4*h]
blurred = cv2.blur(cropped_image, (50,50))
img_rgb[pt[1]:pt[1]+h, pt[0]:pt[0]+ 4*h] = blurred
if not os.path.exists('results'):
os.mkdir('results')
cv2.imwrite(f'results/res{ind}.jpg', img_rgb)
dirname = 'images'
with open(pdf_name, 'wb') as f:
imgs = []
for fname in os.listdir(dirname):
if not fname.endswith('.jpg'):
continue
path = os.path.join(dirname, fname)
if os.path.isdir(path):
continue
imgs.append(path)
f.write(img2pdf.convert(imgs))
shutil.rmtree('images')
shutil.rmtree('results')
return pdf_name
However, since create_input_output function takes FieldFile as input it generates an error as below.
expected str, bytes or os.PathLike object, not FieldFile
My question is how can I work with input file, so I can generate output pdf file?
views.py
# pdf api
class OcrPDfDataApiView(APIView):
parser_class = [MultiPartParser,]
def post(self, request, format=None):
if 'data' not in request.data:
raise ParseError("Empty content")
filename = "whatever.pdf" # received file name
file_obj = request.data['data']
with default_storage.open('tmp/'+filename, 'wb+') as destination:
for chunk in file_obj.chunks():
destination.write(chunk)
dirName = os.path.dirname(__file__)
cwd = Path.cwd()
filename = os.path.join(cwd, 'media','tmp','whatever.pdf')
texts = print_pages(filename)
return Response(texts)
utils.py
import pytesseract
from PIL import Image
from PIL import ImageFilter
import requests
import docx2txt
import pdf2image
import PyPDF2
import pyttsx3
import os
import io
def print_pages(pdf_file):
images = pdf2image.convert_from_path(pdf_file)
for pages,img in enumerate(images):
text = pytesseract.image_to_string(img)
print(text)
return text
Related
I am trying to crop a pdf within and lambda and save the file. Ideally I just want to zoom in as otherwise the OCR package does not recognize some of the fonts. The rectangle I am using just seems to shift the margins versus actually cropping or zooming in.
Thanks!
import os
import json
import boto3
from urllib.parse import unquote_plus
import fitz, sys
from io import BytesIO
OUTPUT_BUCKET_NAME = os.environ["OUTPUT_BUCKET_NAME"]
OUTPUT_S3_PREFIX = os.environ["OUTPUT_S3_PREFIX"]
SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]
SNS_ROLE_ARN = os.environ["SNS_ROLE_ARN"]
def lambda_handler(event, context):
if event:
file_obj = event["Records"][0]
bucketname = str(file_obj["s3"]["bucket"]["name"])
filename = unquote_plus(str(file_obj["s3"]["object"]["key"]))
doc = fitz.open()
s3 = boto3.resource('s3')
obj = s3.Object(bucketname, filename)
fs = obj.get()['Body'].read()
pdf=fitz.open("pdf", stream=BytesIO(fs))
rect=fitz.Rect(50, 50, 545, 792)
page = pdf[0]
page1 = doc.new_page(width = rect.width, # new page with ...
height = rect.height)
page1.show_pdf_page(rect, pdf, 0)
new_bytes = doc.write()
bucketname1='modified'
s3.Bucket(bucketname1).put_object(Key=filename, Body=new_bytes)
I am trying to write a program that reads in a folder of photos, analyses their heights and widths, and resizes them accordingly, then sends them to a word document. I keep getting this error and I am unsure what is causing it:
from docx import Document
import cv2
from PIL import Image
import glob
import os
import numpy
document = Document()
img_dir = "C:/Users/27832/Desktop/Report Images"
data_path = os.path.join(img_dir,'*g')
files = glob.glob(data_path)
photos = []
for pic in files:
imagg = cv2.imread(pic)
photos.append(imagg)
for i in range(0, len(photos)):
if 0.85*(photos[i].shape[0]) < (photos[i].shape[1]) < 1.15*(photos[i].shape[0]):
resized_image = photos[i].resize((314, 314))
document.add_picture(resized_image)
elif (photos[i].shape[1]) >= 0.85*(photos[i].shape[0]):
resized_image = photos[i].resize((257, 382))
document.add_picture(resized_image)
elif (photos[i].shape[1]) <= 1.15*(photos[i].shape[0]):
resized_image = photos[i].resize((401, 325))
document.add_picture(resized_image)
document.save("C:/Users/27832/Desktop/Word Macro Program/Report.docx")
I am trying to use pytesseract OCR to extract text from all the PDFs in a directory, but I am getting an error message that there is not enough space on my device.
I would like to delete each image from the cache after it is no longer required, as this user was advised to do, but I can't find anything in the pytesseract documentation explaining how to do this.
Here is my code:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
def extract_text_from_image(path):
pdfFile = wi(filename = path, resolution = 300)
image = pdfFile.convert('jpeg')
imageBlobs = []
for img in image.sequence:
imgPage = wi(image = img)
imageBlobs.append(imgPage.make_blob('jpeg'))
extract = []
for imgBlob in imageBlobs:
image = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(image, lang = 'eng')
extract.append(text)
return extract
Here is the error message:
CacheError: unable to extend cache 'C:/Users/b00kgrrl/AppData/Local/Temp/magick-11952ORBzkae3wXX_18': No space left on device # error/cache.c/OpenPixelCache/3889
I solved this myself using code found here and here:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import winshell
def extract_text_from_image(path):
pdfFile = wi(filename = path, resolution = 300)
image = pdfFile.convert('jpeg')
tempdir = r"C:\Users\b00kgrrl\AppData\Local\Temp"
cache = os.listdir( tempdir )
imageBlobs = []
for img in image.sequence:
imgPage = wi(image = img)
imageBlobs.append(imgPage.make_blob('jpeg'))
extract = []
for imgBlob in imageBlobs:
image = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(image, lang = 'eng')
extract.append(text)
for item in cache:
if item.endswith(".jpg") or item.startswith("magick-"):
os.remove( os.path.join( tempdir, item ) )
winshell.recycle_bin().empty(confirm=False, show_progress=False, sound=False)
return extract
I use django-dropbox for the image storage on my website.
When i upload the image i want to automatically create an thumbnail and save it.
I tryed different methods but with no success
Any help will be apreciated
I get this exception error:
cannot identify image file
Here is the code:
class Product(models.Model):
image = models.ImageField(upload_to='images', storage=STORAGE, null=True, blank=True)
thumb = models.ImageField(upload_to='thumbs', storage=STORAGE, null=True, blank=True)
def __unicode__(self):
return "%s,%s" %(self.id,self.title)
def save(self):
# create a thumbnail
self.create_thumbnail()
super(Product, self).save()
def create_thumbnail(self):
# create a thumbnail
#self.create_thumbnail()
from PIL import Image
from cStringIO import StringIO
from django.core.files.uploadedfile import SimpleUploadedFile
import os
try:
DJANGO_TYPE = self.image.file.content_type
print DJANGO_TYPE
if DJANGO_TYPE == 'image/jpeg':
PIL_TYPE = 'jpeg'
FILE_EXTENSION = 'jpg'
elif DJANGO_TYPE == 'image/png':
PIL_TYPE = 'png'
FILE_EXTENSION = 'png'
else:
print error
print "Working1"
print StringIO(self.image.read())
im = Image.open(StringIO(self.image.read()))
size = 128, 128
im.thumbnail(size, Image.ANTIALIAS)
# Save the thumbnail
print "Working2"
temp_handle = StringIO()
im.save(temp_handle, PIL_TYPE)
temp_handle.seek(0)
# Save image to a SimpleUploadedFile which can be saved into
# ImageField
print "Working3"
suf = SimpleUploadedFile(os.path.split(self.image.name)[-1],
temp_handle.read(), content_type=DJANGO_TYPE)
# Save SimpleUploadedFile into image field
self.thumb.save('%s_thumbnail.%s'%(os.path.splitext(suf.name)[0],FILE_EXTENSION), suf, save=True)
print "Working4"
except Exception as e:
print e
Code is like this:
class Document(models.Model):
docfile = models.FileField(upload_to='documents/big/')
thumbnail = models.FileField(upload_to='documents/small/')
def create_thumbnail(self):
if not self.image:
return
from PIL import Image
from cStringIO import StringIO
from django.core.files.uploadedfile import SimpleUploadedFile
import os
THUMBNAIL_SIZE = (200,200)
DJANGO_TYPE = self.image.file.content_type
if DJANGO_TYPE == 'image/jpeg':
PIL_TYPE = 'jpeg'
FILE_EXTENSION = 'jpg'
elif DJANGO_TYPE == 'image/png':
PIL_TYPE = 'png'
FILE_EXTENSION = 'png'
image = Image.open(StringIO(self.image.read()))
image.thumbnail(THUMBNAIL_SIZE, Image.ANTIALIAS)
temp_handle = StringIO()
image.save(temp_handle, PIL_TYPE)
temp_handle.seek(0)
suf = SimpleUploadedFile(os.path.split(self.image.name)[-1],
temp_handle.read(), content_type=DJANGO_TYPE)
self.thumbnail.save('%s_thumbnail.%s'%(os.path.splitext(suf.name)[0],FILE_EXTENSION), suf, save=False)
def save(self):
self.create_thumbnail()
super(ImageWithThumbnail, self).save()
Now I want to save the thumbnail image in documents/small/. But how to call this create_thumbnail function from view or model when uploading the image to create the thumbnail?
Did u mean you want to call the function in your views.
Then you can do
from urappname.models import Document
and call using
Document.create_thumbnail()
You can do this by overriding the default save() method of django.
For that, define this save() function under your model class as below:
def save(self, *args, **kwargs):
self.create_thumbnail()
super(Document, self).save(*args, **kwargs)