I have an Optical Character Recognition (OCR) project. I am building an API using the Django framework. The API response should look like this:
"id": 1,
"title": "PDF Title",
"input": "input.pdf",
"output": "output.pdf"
My models.py file is as below:
from django.db import models
from .create_pdf_output import *
# Create your models here.
class Document(models.Model):
    """An uploaded PDF together with the processed PDF generated from it."""

    title = models.CharField(max_length=255)
    pdf_input = models.FileField(upload_to='documents/inputs', max_length=200, blank=False)
    pdf_output = models.FileField(upload_to='documents/outputs', max_length=200, blank=True)

    def save(self, *args, **kwargs):
        """Generate ``pdf_output`` from ``pdf_input`` and then persist the row.

        Accepts and forwards every positional/keyword argument so that calls
        such as ``save(update_fields=...)`` or ``save(using=...)`` made by
        Django or third-party code keep working.
        """
        # create_pdf_output receives the FieldFile itself, not a path string.
        self.pdf_output = create_pdf_output(self.pdf_input)
        super(Document, self).save(*args, **kwargs)

    def __str__(self):
        """Return the document title for admin/shell display."""
        return self.title
In the models.py file I call create_pdf_output.py, which should perform OCR operations on the PDF input:
import shutil
import cv2
import numpy as np
import img2pdf
from pdf2image import convert_from_path
import os
# Derive "<input stem>_out.pdf" from the input, rasterise the PDF pages to
# JPEGs, blur regions that match a template image, and return the output name.
# NOTE(review): this listing has lost its indentation and several statements
# appear to be missing (some `if`/`for` headers below have no visible body).
# The comments describe only what the visible lines do.
def create_pdf_output(pdf_input):
# The string form of the input is only used to build the output file name.
pdf = str(pdf_input)
pdf_name = os.path.splitext(pdf)[0] + "_out.pdf"
# NOTE(review): the raw `pdf_input` object (not the `pdf` string) is passed
# here; presumably this is the call that fails when the caller hands in a
# Django FieldFile instead of a filesystem path - confirm.
# 500 is the second positional argument (dpi in pdf2image - TODO confirm).
pages = convert_from_path(pdf_input, 500)
# NOTE(review): no statement is visible under this `if`; presumably the
# 'images' directory was created here - confirm against the original.
if not os.path.exists('images'):
# Write each rendered page to images/out<N>.jpg.
for ind, page in enumerate(pages):
page.save(f'images/out{ind}.jpg', 'JPEG')
# Process every file found in 'images' (os.listdir order is arbitrary).
for ind, img in enumerate(os.listdir('images')):
img_rgb = cv2.imread(os.path.join('images', img))
img_gray = cv2.cvtColor(img_rgb, cv2.COLOR_BGR2GRAY)
# Flag 0 loads the template as a single-channel (grayscale) image.
template = cv2.imread('templates.jpg', 0)
# shape is (rows, cols), so reversing it yields (width, height).
w, h = template.shape[::-1]
res = cv2.matchTemplate(img_gray, template, cv2.TM_CCOEFF_NORMED)
# Keep only locations whose normalised match score is at least 0.8.
threshold = 0.8
loc = np.where(res >= threshold)
# loc is (row_indices, col_indices); reversing gives (x, y) points.
for pt in zip(*loc[::-1]):
# The blurred region is h rows tall and 4*h columns wide; `w` is not used.
cropped_image = img_rgb[pt[1]:pt[1]+h, pt[0]:pt[0]+ 4*h]
blurred = cv2.blur(cropped_image, (50,50))
img_rgb[pt[1]:pt[1]+h, pt[0]:pt[0]+ 4*h] = blurred
# NOTE(review): no statement is visible under this `if`; presumably the
# 'results' directory was created here - confirm.
if not os.path.exists('results'):
cv2.imwrite(f'results/res{ind}.jpg', img_rgb)
# NOTE(review): the files collected below come from 'images', not from
# 'results' where the blurred pages were written - verify this is intended.
dirname = 'images'
with open(pdf_name, 'wb') as f:
imgs = []
for fname in os.listdir(dirname):
# NOTE(review): the bodies of the two `if` statements below and the code that
# fills `imgs` and writes to `f` are not visible in this listing.
if not fname.endswith('.jpg'):
path = os.path.join(dirname, fname)
if os.path.isdir(path):
return pdf_name
However, since the create_pdf_output function receives a FieldFile as input, it raises the error below.
expected str, bytes or os.PathLike object, not FieldFile
My question is: how can I work with the input file so that I can generate the output PDF file?
# pdf api
class OcrPDfDataApiView(APIView):
    """Accept a PDF upload in the ``data`` form field and return its OCR text."""

    # DRF reads the plural ``parser_classes`` attribute; the previous
    # ``parser_class`` spelling is not the attribute DRF looks up.
    parser_classes = [MultiPartParser]

    def post(self, request, format=None):
        """Store the uploaded file under ``tmp/`` and respond with its OCR text.

        Raises:
            ParseError: if the request carries no ``data`` part.
        """
        if 'data' not in request.data:
            raise ParseError("Empty content")

        filename = "whatever.pdf"  # received file name
        file_obj = request.data['data']
        with default_storage.open('tmp/' + filename, 'wb+') as destination:
            # Stream the upload chunk by chunk so large files are never held
            # in memory all at once.
            for chunk in file_obj.chunks():
                destination.write(chunk)

        # NOTE(review): this assumes default_storage is rooted at <cwd>/media;
        # confirm against the MEDIA_ROOT setting.
        filename = os.path.join(Path.cwd(), 'media', 'tmp', 'whatever.pdf')
        texts = print_pages(filename)
        return Response(texts)
import pytesseract
from PIL import Image
from PIL import ImageFilter
import requests
import docx2txt
import pdf2image
import PyPDF2
import pyttsx3
import os
import io
def print_pages(pdf_file):
    """Run OCR over every page of *pdf_file* and return the combined text.

    Args:
        pdf_file: Filesystem path of the PDF to read.

    Returns:
        The recognised text of all pages joined with newlines; an empty
        string when the PDF yields no pages.
    """
    images = pdf2image.convert_from_path(pdf_file)
    # Collect the text of every page. The previous version kept a single
    # page's text and raised NameError when the PDF produced no images.
    return "\n".join(pytesseract.image_to_string(img) for img in images)
I am trying to crop a PDF within an AWS Lambda function and save the file. Ideally I just want to zoom in, as otherwise the OCR package does not recognize some of the fonts. The rectangle I am using just seems to shift the margins rather than actually cropping or zooming in.
import os
import json
import boto3
from urllib.parse import unquote_plus
import fitz, sys
from io import BytesIO
SNS_ROLE_ARN = os.environ["SNS_ROLE_ARN"]
def lambda_handler(event, context):
    """Crop the first page of the PDF named in an S3 event and upload the result.

    Args:
        event: S3 notification event; only the first record is processed.
        context: Lambda context object (unused).

    Returns:
        None.
    """
    if not event:
        return

    file_obj = event["Records"][0]
    bucketname = str(file_obj["s3"]["bucket"]["name"])
    # Object keys arrive URL-encoded in S3 events.
    filename = unquote_plus(str(file_obj["s3"]["object"]["key"]))

    s3 = boto3.resource('s3')
    obj = s3.Object(bucketname, filename)
    fs = obj.get()['Body'].read()

    pdf = fitz.open(stream=fs, filetype="pdf")
    rect = fitz.Rect(50, 50, 545, 792)  # area of the source page to keep

    doc = fitz.open()
    page1 = doc.new_page(width=rect.width,  # new page with ...
                         height=rect.height)
    # The first argument is where to draw on the *target* page; the region
    # taken from the *source* page is selected with ``clip``. Passing the crop
    # rectangle as the target only shifted the page instead of cropping it.
    page1.show_pdf_page(page1.rect, pdf, 0, clip=rect)

    new_bytes = doc.write()
    # NOTE(review): ``bucketname1`` is not defined in the visible code; it is
    # presumably a module-level destination bucket name - confirm.
    s3.Bucket(bucketname1).put_object(Key=filename, Body=new_bytes)
I am trying to write a program that reads in a folder of photos, analyses their heights and widths, and resizes them accordingly, then sends them to a word document. I keep getting this error and I am unsure what is causing it:
from docx import Document
import cv2
from PIL import Image
import glob
import os
import numpy
# Script: load every image in a folder, pick a target size from its aspect
# ratio, and save a Word document.
# NOTE(review): this listing has lost its indentation and some statements
# appear to be missing; the comments describe only the visible lines.
document = Document()
img_dir = "C:/Users/27832/Desktop/Report Images"
# '*g' matches any file name ending in "g" (e.g. .jpg, .jpeg, .png).
data_path = os.path.join(img_dir,'*g')
files = glob.glob(data_path)
photos = []
for pic in files:
# NOTE(review): the loaded image is never appended to `photos` in the visible
# code, so the loop below would see an empty list - confirm.
imagg = cv2.imread(pic)
for i in range(0, len(photos)):
# shape[0] is the row count (height) and shape[1] the column count (width).
# First branch: width within 15% of height, i.e. roughly square.
# NOTE(review): cv2.imread returns a NumPy array, and ndarray.resize() works
# in place and returns None; cv2.resize(...) is presumably what was intended
# on the three .resize(...) calls below - confirm.
if 0.85*(photos[i].shape[0]) < (photos[i].shape[1]) < 1.15*(photos[i].shape[0]):
resized_image = photos[i].resize((314, 314))
# NOTE(review): once the first branch has failed, this condition is true only
# when width >= 1.15 * height; verify that (257, 382) is the intended size.
elif (photos[i].shape[1]) >= 0.85*(photos[i].shape[0]):
resized_image = photos[i].resize((257, 382))
elif (photos[i].shape[1]) <= 1.15*(photos[i].shape[0]):
resized_image = photos[i].resize((401, 325))
# NOTE(review): no visible statement adds `resized_image` to `document`.
document.save("C:/Users/27832/Desktop/Word Macro Program/Report.docx")
I am trying to use pytesseract OCR to extract text from all the PDFs in a directory, but I am getting an error message that there is not enough space on my device.
I would like to delete each image from the cache after it is no longer required, as this user was advised to do, but I can't find anything in the pytesseract documentation explaining how to do this.
Here is my code:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
def extract_text_from_image(path):
    """OCR every page of the PDF at *path*.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        A list with one recognised-text string per page, in page order.
    """
    extract = []
    # Wand images hold ImageMagick pixel-cache resources (temp files on disk)
    # until they are closed. The context managers release them as soon as
    # each page is done instead of keeping them for the whole run.
    with wi(filename=path, resolution=300) as pdfFile:
        with pdfFile.convert('jpeg') as image:
            for img in image.sequence:
                with wi(image=img) as imgPage:
                    imgBlob = imgPage.make_blob('jpeg')
                # Handle one page at a time so only a single JPEG blob is
                # held in memory.
                page_image = Image.open(io.BytesIO(imgBlob))
                extract.append(pytesseract.image_to_string(page_image, lang='eng'))
    return extract
Here is the error message:
CacheError: unable to extend cache 'C:/Users/b00kgrrl/AppData/Local/Temp/magick-11952ORBzkae3wXX_18': No space left on device # error/cache.c/OpenPixelCache/3889
I solved this myself using code found here and here:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import winshell
def extract_text_from_image(path):
    """OCR every page of the PDF at *path* and clean up temporary files.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        A list with one recognised-text string per page, in page order.

    Side effects:
        Deletes ``*.jpg`` and ``magick-*`` entries from the hard-coded temp
        directory and empties the Windows recycle bin.
    """
    tempdir = r"C:\Users\b00kgrrl\AppData\Local\Temp"
    extract = []
    # Closing the Wand images releases ImageMagick's pixel cache promptly,
    # which is what stops the temp directory from filling up.
    with wi(filename=path, resolution=300) as pdfFile:
        with pdfFile.convert('jpeg') as image:
            # Snapshot taken after conversion so the temp files created for
            # this document are part of the listing.
            cache = os.listdir(tempdir)
            for img in image.sequence:
                with wi(image=img) as imgPage:
                    imgBlob = imgPage.make_blob('jpeg')
                page_image = Image.open(io.BytesIO(imgBlob))
                extract.append(pytesseract.image_to_string(page_image, lang='eng'))

    for item in cache:
        if item.endswith(".jpg") or item.startswith("magick-"):
            try:
                os.remove(os.path.join(tempdir, item))
            except OSError:
                # Best-effort cleanup: the file may already have been removed
                # by ImageMagick or may be locked by another process.
                pass
    # NOTE(review): this purges the user's whole recycle bin, and os.remove
    # does not send files there; consider dropping this call.
    winshell.recycle_bin().empty(confirm=False, show_progress=False, sound=False)
    return extract
I use django-dropbox for the image storage on my website.
When I upload an image, I want to automatically create a thumbnail and save it.
I tried different methods, but with no success.
Any help will be appreciated.
I get this exception error:
cannot identify image file
Here is the code:
# Model holding an uploaded image and a thumbnail stored through STORAGE.
# NOTE(review): this is Python 2 code (print statements, cStringIO). The
# listing has lost its indentation and some statements appear to be missing:
# there is an `except` below with no visible `try`, and a `print error` with
# no visible `else`. The comments describe only the visible lines.
class Product(models.Model):
image = models.ImageField(upload_to='images', storage=STORAGE, null=True, blank=True)
thumb = models.ImageField(upload_to='thumbs', storage=STORAGE, null=True, blank=True)
def __unicode__(self):
# NOTE(review): no `title` field is declared in the visible class body.
return "%s,%s" %(self.id,self.title)
def save(self):
# create a thumbnail
# NOTE(review): no call to create_thumbnail() is visible here; only the
# parent save runs.
super(Product, self).save()
def create_thumbnail(self):
# create a thumbnail
from PIL import Image
from cStringIO import StringIO
from django.core.files.uploadedfile import SimpleUploadedFile
import os
# Map the upload's MIME type to the format name passed to PIL's save().
DJANGO_TYPE = self.image.file.content_type
if DJANGO_TYPE == 'image/jpeg':
PIL_TYPE = 'jpeg'
elif DJANGO_TYPE == 'image/png':
PIL_TYPE = 'png'
# NOTE(review): `error` is not defined anywhere in the visible code.
print error
print "Working1"
# NOTE(review): this debug print consumes the file with read(); the second
# read() on the next line then presumably returns no data unless the file is
# rewound, which would explain "cannot identify image file" - confirm.
print StringIO(self.image.read())
im = Image.open(StringIO(self.image.read()))
size = 128, 128
# thumbnail() shrinks the image in place to fit within `size`.
im.thumbnail(size, Image.ANTIALIAS)
# Save the thumbnail
print "Working2"
temp_handle = StringIO()
im.save(temp_handle, PIL_TYPE)
# Save image to a SimpleUploadedFile which can be saved into
# ImageField
print "Working3"
# NOTE(review): temp_handle is read right after being written without a
# visible seek(0), so read() presumably returns nothing - confirm.
suf = SimpleUploadedFile(os.path.split(self.image.name)[-1],
temp_handle.read(), content_type=DJANGO_TYPE)
# Save SimpleUploadedFile into image field
# NOTE(review): FILE_EXTENSION is not defined in the visible code. save=True
# also makes the field call the model's save() again.
self.thumb.save('%s_thumbnail.%s'%(os.path.splitext(suf.name)[0],FILE_EXTENSION), suf, save=True)
print "Working4"
# Any exception is printed, not re-raised.
except Exception as e:
print e
Code is like this:
# Model holding an uploaded file and a thumbnail file.
# NOTE(review): this is Python 2 code (cStringIO). The listing has lost its
# indentation and some statements appear to be missing; the comments describe
# only the visible lines.
class Document(models.Model):
docfile = models.FileField(upload_to='documents/big/')
thumbnail = models.FileField(upload_to='documents/small/')
def create_thumbnail(self):
# NOTE(review): the class declares `docfile` and `thumbnail`, but this method
# reads `self.image`, which is not declared in the visible code. No statement
# is visible under this `if` (presumably an early return) - confirm.
if not self.image:
from PIL import Image
from cStringIO import StringIO
from django.core.files.uploadedfile import SimpleUploadedFile
import os
THUMBNAIL_SIZE = (200,200)
# Map the upload's MIME type to the format name passed to PIL's save().
# PIL_TYPE stays unset for any other content type.
DJANGO_TYPE = self.image.file.content_type
if DJANGO_TYPE == 'image/jpeg':
PIL_TYPE = 'jpeg'
elif DJANGO_TYPE == 'image/png':
PIL_TYPE = 'png'
image = Image.open(StringIO(self.image.read()))
# thumbnail() shrinks the image in place to fit within THUMBNAIL_SIZE.
image.thumbnail(THUMBNAIL_SIZE, Image.ANTIALIAS)
temp_handle = StringIO()
image.save(temp_handle, PIL_TYPE)
# NOTE(review): temp_handle is read right after being written without a
# visible seek(0), so read() presumably returns nothing - confirm.
suf = SimpleUploadedFile(os.path.split(self.image.name)[-1],
temp_handle.read(), content_type=DJANGO_TYPE)
# save=False stores the file on the field without saving the model instance.
# NOTE(review): FILE_EXTENSION is not defined in the visible code.
self.thumbnail.save('%s_thumbnail.%s'%(os.path.splitext(suf.name)[0],FILE_EXTENSION), suf, save=False)
def save(self):
# NOTE(review): super() names ImageWithThumbnail, but the enclosing class is
# Document; presumably this should be super(Document, self) - confirm. No
# call to create_thumbnail() is visible here either.
super(ImageWithThumbnail, self).save()
Now I want to save the thumbnail image in documents/small/. But how to call this create_thumbnail function from view or model when uploading the image to create the thumbnail?
Did you mean that you want to call the function in your views?
Then you can do
from urappname.models import Document
and call using
You can do this by overriding the default save() method of django.
For that, define this save() function under your model class as below:
# Override of the model's save() that accepts any positional/keyword
# arguments and forwards them unchanged to the parent implementation.
def save(self, *args, **kwargs):
# NOTE(review): only the parent call is visible; the thumbnail-creation call
# this answer describes is presumably meant to go before it - confirm.
super(Document, self).save(*args, **kwargs)