I've written a script in Python, in combination with pytesseract, to extract a word from an image. There is only a single word, TOOLS, in that image, and that is what I'm after. Currently my script below gives the wrong output, which is WIS. What can I do to get the right text?
Link to that image: http://facweb.cs.depaul.edu/sgrais/images/Type/Tools.jpg
This is my script:
import requests, io, pytesseract
from PIL import Image
response = requests.get('http://facweb.cs.depaul.edu/sgrais/images/Type/Tools.jpg')
img = Image.open(io.BytesIO(response.content))
img = img.resize([100,100], Image.ANTIALIAS)
img = img.convert('L')
img = img.point(lambda x: 0 if x < 170 else 255)
imagetext = pytesseract.image_to_string(img)
print(imagetext)
# img.show()
This is what the modified image looks like when I run the above script:
The output I get:
WIS
Expected output:
TOOLS
The key is matching the image transformation to Tesseract's abilities. Your main problem is that the font is not a usual one. All you need is:
import requests, io, pytesseract
from PIL import Image, ImageEnhance, ImageFilter
response = requests.get('http://facweb.cs.depaul.edu/sgrais/images/Type/Tools.jpg')
img = Image.open(io.BytesIO(response.content))
# remove texture
enhancer = ImageEnhance.Color(img)
img = enhancer.enhance(0) # decolorize
img = img.point(lambda x: 0 if x < 250 else 255) # set threshold
img = img.resize([300, 100], Image.LANCZOS) # resize to remove noise
img = img.point(lambda x: 0 if x < 250 else 255) # get rid of remains of noise
# adjust font weight
img = img.filter(ImageFilter.MaxFilter(11)) # lighten the font ;)
imagetext = pytesseract.image_to_string(img)
print(imagetext)
And voilà,
TOOLS
is recognized.
The key issue with your implementation lies here:
img = img.resize([100,100], Image.ANTIALIAS)
img = img.point(lambda x: 0 if x < 170 else 255)
You could try different sizes and different thresholds:
import requests, io, pytesseract
from PIL import Image
from PIL import ImageFilter
response = requests.get('http://facweb.cs.depaul.edu/sgrais/images/Type/Tools.jpg')
img = Image.open(io.BytesIO(response.content))
filters = [
    # ('nearest', Image.NEAREST),
    ('box', Image.BOX),
    # ('bilinear', Image.BILINEAR),
    # ('hamming', Image.HAMMING),
    # ('bicubic', Image.BICUBIC),
    ('lanczos', Image.LANCZOS),
]
subtle_filters = [
    # 'BLUR',
    # 'CONTOUR',
    'DETAIL',
    'EDGE_ENHANCE',
    'EDGE_ENHANCE_MORE',
    # 'EMBOSS',
    'FIND_EDGES',
    'SHARPEN',
    'SMOOTH',
    'SMOOTH_MORE',
]
for name, filt in filters:
    for subtle_filter_name in subtle_filters:
        for s in range(220, 250, 10):
            for threshold in range(250, 253, 1):
                img_temp = img.copy()
                img_temp.thumbnail([s, s], filt)
                img_temp = img_temp.convert('L')
                img_temp = img_temp.point(lambda x: 0 if x < threshold else 255)
                img_temp = img_temp.filter(getattr(ImageFilter, subtle_filter_name))
                imagetext = pytesseract.image_to_string(img_temp)
                print(s, threshold, name, subtle_filter_name, imagetext)
                with open('thumb%s_%s_%s_%s.jpg' % (s, threshold, name, subtle_filter_name), 'wb') as g:
                    img_temp.save(g)
and see what works for you.
I would suggest you resize your image while keeping the original ratio. You could also try some alternatives to img_temp.convert('L').
Best so far: TWls and T0018
You can try to manipulate the image manually and see if you can find an edit that provides a better output (for instance http://gimpchat.com/viewtopic.php?f=8&t=1193).
By knowing in advance the font you could probably achieve a better result too.
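Building on that last point: if you already know the text is a single uppercase word, you can also constrain Tesseract itself. A small sketch (the --psm and tessedit_char_whitelist options are standard Tesseract configuration, not part of the original script; note the whitelist is honored by the legacy engine and by LSTM builds from 4.1 onward):

import pytesseract

# treat the image as a single text line and allow only uppercase letters
config = '--psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ'
imagetext = pytesseract.image_to_string(img, config=config)
print(imagetext)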
Getting error: line 50, in
faceA = preprocess_face(rgbA[boxesA[0][1]:boxesA[0][3], boxesA[0][0]:boxesA[0][2]])
TypeError: only integer scalar arrays can be converted to a scalar index
import os
import argparse
import cv2
from deepface import DeepFace
import numpy as np

# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-f", "--first", required=True,
                help="first input image")
ap.add_argument("-d", "--directory", required=True,
                help="directory of images to compare")
args = vars(ap.parse_args())

# load the first input image
imageA = cv2.imread(args["first"])
rgbA = cv2.cvtColor(imageA, cv2.COLOR_BGR2RGB)

# detect the face in the first image
boxesA = DeepFace.detectFace(rgbA)

# make sure there is a face in the first image
if len(boxesA) == 0:
    print("No face detected in the first image")
    exit()

def preprocess_face(face, size=(96, 96)):
    # extract the face ROI and resize it to the desired size
    face = cv2.resize(face, size)
    # compute the scaling factor for the images
    factor_0 = size[0] / face.shape[0]
    factor_1 = size[1] / face.shape[1]
    factor = np.min([factor_0, factor_1])
    # stretch the face ROI to the desired size
    face = cv2.resize(face, None, fx=factor, fy=factor)
    # convert the face ROI to grayscale
    gray = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
    # normalize the grayscale image
    gray = gray / 255.0
    # return the preprocessed face
    return [gray]

boxesA = boxesA.astype(int)
# extract the face encoding for the first image
faceA = preprocess_face(rgbA[boxesA[0][1]:boxesA[0][3], boxesA[0][0]:boxesA[0][2]])
encodingA = DeepFace.detectFace(faceA, boxesA)[0]

# initialize a dictionary to store the image names and scores
scores = {}

# loop over the images in the directory
for image_name in os.listdir(args["directory"]):
    # load the image
    imageB = cv2.imread(os.path.join(args["directory"], image_name))
    rgbB = cv2.cvtColor(imageB, cv2.COLOR_BGR2RGB)
    # detect the face in the image
    boxesB = DeepFace.detectFace(rgbB, enforce_detection=False)
    # make sure there is a face in the image
    if len(boxesB) == 0:
        continue
    boxesB = boxesB.astype(int)
    # extract the face encodings for the image
    facesB = []
    for i in range(len(boxesB)):
        faceB = preprocess_face(rgbB[boxesB[i][1]:boxesB[i][3], boxesB[i][0]:boxesB[i][2]])
        facesB.extend(faceB)
    encodingB = DeepFace.detectFace(facesB, boxesB)[0]
    # compare the face encodings
    score = DeepFace.verifyFace(encodingA, encodingB)
    similarity_percentage = score * 100
    # store the image name and score in the dictionary
    scores[image_name] = score

# sort the scores in descending order
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

# display the top 10 scores
for i in range(10):
    image_name, score = sorted_scores[i]
    print("{}: {}".format(image_name, score))
Usage of the script is: python Image_Comparison_Deepface.py -f C:/folder/image.png -d C:/testpics
Script was created with ChatGPT.
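For what it's worth, the TypeError above is consistent with DeepFace.detectFace returning the detected face as a pixel array rather than box coordinates: boxesA[0][1] is then itself an array, and arrays can't be used as slice bounds. A minimal sketch of obtaining the integer boxes that the slicing expects, using OpenCV's bundled Haar cascade (an assumption for illustration, not part of the original script):

import cv2

# the Haar cascade detector returns integer (x, y, w, h) boxes
cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
gray = cv2.cvtColor(imageA, cv2.COLOR_BGR2GRAY)
boxes = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
for (x, y, w, h) in boxes:
    faceA = rgbA[y:y + h, x:x + w]  # integer indices, so no TypeError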
My output doesn't show anything and I honestly can't figure out why.
This is the full code, but I think the problem is where I pass the argument in aRed, aGreen, aBlue, originalImage = openImage(response.content).
When I run this code in a Colab Python notebook, my image isn't showing up for some reason! Maybe it's the way I'm passing the URL as an argument in the line above?
import numpy
from PIL import Image
import requests
from io import BytesIO
# FUNCTION DEFINITIONS:

# open the image and return 3 matrices, each corresponding to one channel (R, G and B channels)
def openImage(imagePath):
    imOrig = Image.open(BytesIO(imagePath))
    im = numpy.array(imOrig)
    aRed = im[:, :, 0]
    aGreen = im[:, :, 1]
    aBlue = im[:, :, 2]
    return [aRed, aGreen, aBlue, imOrig]

# compress the matrix of a single channel
def compressSingleChannel(channelDataMatrix, singularValuesLimit):
    uChannel, sChannel, vhChannel = numpy.linalg.svd(channelDataMatrix)
    aChannelCompressed = numpy.zeros((channelDataMatrix.shape[0], channelDataMatrix.shape[1]))
    k = singularValuesLimit
    leftSide = numpy.matmul(uChannel[:, 0:k], numpy.diag(sChannel)[0:k, 0:k])
    aChannelCompressedInner = numpy.matmul(leftSide, vhChannel[0:k, :])
    aChannelCompressed = aChannelCompressedInner.astype('uint8')
    return aChannelCompressed
# MAIN PROGRAM:
response = requests.get('https://i.imgur.com/BIOFZNo.png')
print ('*** Image Compression using SVD - a demo')
aRed, aGreen, aBlue, originalImage = openImage(response.content)
# image width and height:
imageWidth = 1000
imageHeight = 1000
#number of singular values to use for reconstructing the compressed image
singularValuesLimit = 160
aRedCompressed = compressSingleChannel(aRed, singularValuesLimit)
aGreenCompressed = compressSingleChannel(aGreen, singularValuesLimit)
aBlueCompressed = compressSingleChannel(aBlue, singularValuesLimit)
imr = Image.fromarray(aRedCompressed, mode=None)
img = Image.fromarray(aGreenCompressed, mode=None)
imb = Image.fromarray(aBlueCompressed, mode=None)
newImage = Image.merge("RGB", (imr, img, imb))
originalImage.show()
newImage.show()
The program runs without errors; it just doesn't display anything.
Thank you all!
Here is the link to my file: https://colab.research.google.com/drive/12K0nWKRdOpZ3gSfTn0wuP8Y0_UUeUxEE
You don't need to call .show() in interactive mode. Just remove that part and it will work fine.
Use the same code as above, but drop the two .show() calls at the end and finish the cell with the bare expression:
originalImage
The original image will be displayed. For the new image, put this in the next code cell:
newImage
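If you would rather render both images from a single cell, IPython's display helper (which Colab ships with) also works; a small sketch:

from IPython.display import display

display(originalImage)  # renders the PIL image inline
display(newImage)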
I tried to detect text in images, especially images with quotes, using OpenCV and Python. For that I first train on some text images, detecting each character of the text in the training image. For images with a proper word style the characters are detected properly, but for some images the text (character) areas can't be detected properly. I attached the code below. How can I modify the code so that the characters are detected properly?
import sys
import numpy as np
import cv2
import os
MIN_CONTOUR_AREA = 100
RESIZED_IMAGE_WIDTH = 20
RESIZED_IMAGE_HEIGHT = 30
def main():
    imgTrainingNumbers = cv2.imread("E:\God - Level 4 Research Project\Testings\Tharu\godd/jbpoetry.png")

    if imgTrainingNumbers is None:
        print("error: image not read from file \n\n")
        os.system("pause")
        return

    imgGray = cv2.cvtColor(imgTrainingNumbers, cv2.COLOR_BGR2GRAY)
    imgBlurred = cv2.GaussianBlur(imgGray, (5, 5), 0)
    imgThresh = cv2.adaptiveThreshold(imgBlurred,
                                      255,
                                      cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                      cv2.THRESH_BINARY_INV,
                                      11,
                                      2)
    cv2.imshow("imgThresh", imgThresh)

    imgThreshCopy = imgThresh.copy()
    imgContours, npaContours, npaHierarchy = cv2.findContours(imgThreshCopy,
                                                              cv2.RETR_EXTERNAL,
                                                              cv2.CHAIN_APPROX_SIMPLE)
    npaFlattenedImages = np.empty((0, RESIZED_IMAGE_WIDTH * RESIZED_IMAGE_HEIGHT))
    intClassifications = []
    intValidChars = [ord('0'), ord('1'), ord('2'), ord('3'), ord('4'), ord('5'), ord('6'), ord('7'), ord('8'), ord('9'),
                     ord('A'), ord('B'), ord('C'), ord('D'), ord('E'), ord('F'), ord('G'), ord('H'), ord('I'), ord('J'),
                     ord('K'), ord('L'), ord('M'), ord('N'), ord('O'), ord('P'), ord('Q'), ord('R'), ord('S'), ord('T'),
                     ord('U'), ord('V'), ord('W'), ord('X'), ord('Y'), ord('Z'), ord('a'), ord('b'), ord('c'), ord('d'),
                     ord('e'), ord('f'), ord('g'), ord('h'), ord('i'), ord('j'), ord('k'), ord('l'), ord('m'), ord('n'), ord('o'),
                     ord('p'), ord('q'), ord('r'), ord('s'), ord('t'), ord('u'), ord('v'), ord('w'), ord('x'), ord('y'), ord('z')]

    for npaContour in npaContours:
        if cv2.contourArea(npaContour) > MIN_CONTOUR_AREA:
            [intX, intY, intW, intH] = cv2.boundingRect(npaContour)
            cv2.rectangle(imgTrainingNumbers,
                          (intX, intY),
                          (intX + intW, intY + intH),
                          (0, 0, 255),
                          2)
            imgROI = imgThresh[intY:intY+intH, intX:intX+intW]
            imgROIResized = cv2.resize(imgROI, (RESIZED_IMAGE_WIDTH, RESIZED_IMAGE_HEIGHT))
            cv2.imshow("imgROI", imgROI)
            cv2.imshow("imgROIResized", imgROIResized)
            cv2.imshow("training_numbers.png", imgTrainingNumbers)

            intChar = cv2.waitKey(0)
            if intChar == 27:
                sys.exit()
            elif intChar in intValidChars:
                intClassifications.append(intChar)
                print(intChar)
                npaFlattenedImage = imgROIResized.reshape((1, RESIZED_IMAGE_WIDTH * RESIZED_IMAGE_HEIGHT))
                npaFlattenedImages = np.append(npaFlattenedImages, npaFlattenedImage, 0)

    fltClassifications = np.array(intClassifications, np.float32)
    npaClassifications = fltClassifications.reshape((fltClassifications.size, 1))
    print("\n\ntraining complete !!\n")
    np.savetxt("classificationsNEWG.txt", npaClassifications)
    np.savetxt("flattened_imagesNEWG.txt", npaFlattenedImages)
    cv2.destroyAllWindows()
    return

if __name__ == "__main__":
    main()
What you are trying to do is a very naive approach; just applying a threshold and detecting contours won't work here. A lot of research papers have been published on this task. You may refer to those and try to implement them, or you can use the image_to_boxes function of the well-known Tesseract OCR. You can download it from here, and since you are using Python you can install pytesseract, the Python wrapper for Tesseract, from here, and use the following code to achieve what you are expecting.
import pytesseract
import cv2
originalImg = cv2.imread('tp.png')
originalImg = cv2.resize(originalImg, None, fx=2.5, fy=2.5)
img = cv2.cvtColor(originalImg, cv2.COLOR_BGR2GRAY)
_,img = cv2.threshold(img,100,255,cv2.THRESH_BINARY)
h, w = img.shape
letters = pytesseract.image_to_boxes(img)
letters = letters.split('\n')
letters = [letter.split() for letter in letters if letter]  # drop the empty trailing line
for letter in letters:
    cv2.rectangle(originalImg, (int(letter[1]), h - int(letter[2])), (int(letter[3]), h - int(letter[4])), (0, 0, 255), 1)
cv2.imshow('', originalImg)
cv2.waitKey(0)
The resultant image
Note that there are many false detections; you will need to ignore them in your training process.
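One hedged way to filter them out: pytesseract's image_to_data reports a per-word confidence that image_to_boxes does not, so you can drop low-confidence hits (the threshold of 60 below is an arbitrary illustration, not a value from the original answer):

import cv2
import pytesseract
from pytesseract import Output

data = pytesseract.image_to_data(img, output_type=Output.DICT)
for i in range(len(data['text'])):
    if float(data['conf'][i]) > 60:  # skip likely false detections
        x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
        cv2.rectangle(originalImg, (x, y), (x + w, y + h), (0, 255, 0), 1)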
How do I write the code? I could only come up with this:
def rotateImage90CW(image):
    pic = FileImage(image)
    oldw = pic.getWidth()
    oldh = pic.getHeight()
    newIm = EmptyImage(oldw, oldh)
    for row in range(oldh):
        for col in range(oldw):
            oldPixel = pic.getPixel(col, row)
            newIm.setPixel(oldw-row, col, oldPixel)
    newIm.draw(myWin)
If you use PIL/Pillow (note that rotate() turns counter-clockwise, so a negative angle gives the clockwise rotation you want):
from PIL import Image

im = Image.open(image)
im.rotate(-90, expand=True).show()  # expand=True keeps non-square images uncropped
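Pillow's transpose constants give the same quarter turn losslessly; a small sketch (Image.ROTATE_270 means 270° counter-clockwise, i.e. 90° clockwise):

from PIL import Image

im = Image.open(image)
im.transpose(Image.ROTATE_270).show()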
In your example, oldw-row should be row.
In my case, there are two ways of getting the image to resize/crop:
1. uploading a normal image file
2. passing base64 string data of the image
In the first case, resize and crop work well:
f = Image.open(uploaded_image)
new_width, new_height = 1200, 630
wpercent = (new_width / float(f.size[0]))
hsize = int((float(f.size[1]) * float(wpercent)))
if f.mode != "RGB":
    f = f.convert('RGB')
og_img = None
if f.size[0] < new_width:
    # upscale
    og_img = f.resize((new_width, hsize), Image.BICUBIC)
elif f.size[0] >= new_width:
    # downscale
    og_img = f.resize((new_width, hsize), Image.ANTIALIAS)
og_img = og_img.crop((0, 0, 1200, 630))
resized/cropped image:
In the second case, the code is the same as above, with a slight change:
base64_image = str(request.POST.get('base64_image')).split(',')[1]
imgfile = open('/'.join([settings.MEDIA_ROOT, 'test.png' ]), 'w+b')
imgfile.write(decodestring(base64_image))
imgfile.seek(0)
f = Image.open(imgfile)
#.. as above
but the resized/cropped image:
Why is the result in the second case bad in quality and size (the black bottom part)? What am I doing wrong? Am I reading the base64 string in the wrong way?
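One thing worth ruling out (an assumption on my part, not a confirmed diagnosis): going through a temp file can hand PIL a truncated stream if the file isn't fully flushed, and a truncated JPEG/PNG classically renders with a black bottom strip. Decoding the payload straight into memory sidesteps the file entirely; a small sketch using the standard base64 and io modules:

import base64
from io import BytesIO
from PIL import Image

payload = str(request.POST.get('base64_image')).split(',')[1]
f = Image.open(BytesIO(base64.b64decode(payload)))
# ...then resize/crop exactly as in the first case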
I found a website which has many interesting things on it. It has two tools (among many) that may help you. The first converts an image to base64, and the second minifies an image's size (saving up to 70%).
http://www.w3docs.com/tools/minimage/
http://www.w3docs.com/tools/image-base64