Python: pdf2image doesn't write .jpg - no error message - python

I'm working on a python script that checks the .pdf files in a directory, creates a new directory for each file, converts the .pdf into images, and writes the images as jpg into the new directory. I'm using pdf2image and have the following code:
import os
#import main
import glob
#import cv2
import matplotlib.pyplot as plt
from pdf2image import convert_from_path
from PIL import Image
# Directory that is scanned for PDF files (note the trailing slash: it is
# concatenated with file names below).
path = "C:/Users/d/Desktop/Reis/"
# Visit every *.pdf located directly inside `path`.
for file in glob.iglob(path + "*.pdf"):
print(file)
# File name without its directory, then everything before the first dot.
name = os.path.basename(file)
filename = name.split(".")[0]
print(filename)
# NOTE(review): os.mkdir() creates the directory but returns None, so
# `images` is None and not the path of the new directory.
images = os.mkdir(path + filename)
# NOTE(review): the PDF path is hard-coded here; the loop variable `file`
# is not used. 350 is the second positional argument (presumably the dpi
# of pdf2image.convert_from_path - confirm against the pdf2image docs).
pages = convert_from_path("C:/Users/d/Desktop/Reis/Reis_Wasser_Verhaeltnis.pdf",
350,
poppler_path=r'C:/Program Files/poppler-22.04.0/Library/bin',
output_folder=images)
# Each page is saved under a relative name ('page0.jpg', ...), i.e. into
# the current working directory rather than the directory created above.
for i in range(len(pages)):
pages[i].save('page' + str(i) + '.jpg', 'JPEG')
When I run my code I don't get an error message, but I don't get any images either. Does anyone have an idea what I'm overlooking?

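os.mkdir creates the folder, but it does not return the folder's path; its return value is None. Thus:
images = os.mkdir(path + filename)
assigns None to images, which cannot be used as the output folder. Because the pages are then saved under relative file names ('page0.jpg', ...), my script writes the images into the default project directory (the current working directory) instead of the new folder.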

Related

Is there any faster way to find every image from the entire computer?

I'm using this function to look for images, but it is actually very slow. I wonder if there is a faster way to do that.
import os
import cv2
# Module-level accumulator: every successfully decoded image is appended here.
images = []
# Recursively walk `folder` and load every file that cv2.imread() can decode.
# Results are collected in the global `images` list; nothing is returned.
def load_images_from_folder(folder):
global images
# NOTE(review): changes the working directory of the whole process on every
# (recursive) call; all paths below are built with os.path.join(folder, ...).
os.chdir(folder)
for filename in os.listdir(folder):
if os.path.isdir(os.path.join(folder,filename)):
try:
# Descend into the sub-directory.
load_images_from_folder(os.path.join(folder, filename))
except:
# NOTE(review): bare except silently ignores every error raised while
# processing the sub-directory.
pass
# cv2.imread() is attempted on every entry; the `is not None` check below
# filters out the entries for which it returned None.
img = cv2.imread(os.path.join(folder,filename))
if img is not None:
images.append(img)
# Start the scan at the root of drive C:.
load_images_from_folder("C:\\")
As recommended in this answer, you can use the built-in imghdr module to test whether a file is an image (note that imghdr was deprecated in Python 3.11 and removed in Python 3.13):
import imghdr
import os
def get_images(root_directory):
    """Return the paths of all image files found below *root_directory*.

    The directory tree is walked with ``os.walk`` and every file is probed
    with ``imghdr.what``, which inspects the file header instead of the
    file name.

    Args:
        root_directory: Directory at which the recursive search starts.

    Returns:
        A list of paths (``root_directory`` joined with the relative
        location) of every file whose header ``imghdr`` recognises.
    """
    images = []
    for root, _dirs, filenames in os.walk(root_directory):
        for filename in filenames:
            path = os.path.join(root, filename)
            try:
                kind = imghdr.what(path)
            except OSError:
                # imghdr has to open the file; when scanning a whole disk
                # some files are locked or not readable. Skip them instead
                # of aborting the entire scan.
                continue
            if kind is not None:
                images.append(path)
    return images
But the imghdr module only detects a handful of image types; according to the documentation it cannot detect SVG, for example.
As @NiklasMertsch suggested, you can just check for image file extensions like this:
import os
# File name suffixes that are treated as images (compared case-insensitively).
extensions = [
    '.png',
    '.jpg',
    '.jpeg',
    '.bmp',
    '.gif',
    '.tiff',
    '.svg',
]


def get_images(root_directory):
    """Return the paths of all files below *root_directory* with an image suffix.

    Only the file name is inspected, so no file has to be opened; this is
    what makes the approach fast.

    Args:
        root_directory: Directory at which the recursive search starts.

    Returns:
        A list of paths of every file whose name ends with one of the
        entries of ``extensions``, ignoring case (``photo.JPG`` matches
        ``'.jpg'``).
    """
    # str.endswith accepts a tuple, which replaces the inner loop + break.
    suffixes = tuple(extension.lower() for extension in extensions)
    images = []
    for root, _dirs, filenames in os.walk(root_directory):
        for filename in filenames:
            if filename.lower().endswith(suffixes):
                images.append(os.path.join(root, filename))
    return images
You can use the library treeHandler to achieve this. You can install it using pip install treeHandler
from treeHandler import treeHandler
import os
import cv2
# Folder that is handed to getFiles() below.
inputFolder='SampleData'
# Create the treeHandler object.
th=treeHandler()
# Call getFiles() to collect all jpg files; pass more extensions,
# e.g. ['jpg','png','bmp'], if required.
fileTuple=th.getFiles(inputFolder,['jpg'])
Here is what fileTuple sample looks like:
[('image01.jpg', 'sampleData'), ('image02.jpg', 'sampleData'), ('image03.jpg', 'sampleData'), ('image04.jpg', 'sampleData'), ('image11.jpg', 'sampleData/folder1'), ('image12.jpg', 'sampleData/folder1'), ('image111.jpg', 'sampleData/folder1/folder11'), ('image112.jpg', 'sampleData/folder1/folder11'), ('image1111.jpg', 'sampleData/folder1/folder11/folder111'), ('image1112.jpg', 'sampleData/folder1/folder11/folder111'), ('image11111.jpg', 'sampleData/folder1/folder11/folder111/folder1111')....]
I have written a blog post on handling and processing a large number of files:
https://medium.com/@sreekiranar/directory-and-file-handling-in-python-for-real-world-projects-9bc8baf6ba89
You can refer to it to get a fair idea of how to collect all files from folders and subfolders.

OS Error:[WinError 123] The filename, directory name, or volume label syntax is incorrect

I am trying to save images after converting them into grayscale from one folder to another. when I run my code, it keeps saving the file in the same folder and making duplicates of all the images. Here is my code, Please guide where my problem lies...
import glob
import cv2
import os
# Source and destination; both values are glob patterns (they end in
# "\*.png"), not plain directory paths.
spath=r"C:\Users\usama\Documents\FYP-Data\FYP Project Data\hamza\*.png"
dpath=r"C:\Users\usama\Documents\FYP-Data\FYP Project Data\grayscale images\*.png"
# NOTE(review): os.listdir() is given a pattern containing "*" instead of a
# directory, and `files` is never used below. Presumably this call is what
# raises WinError 123 - verify.
files = os.listdir(spath)
# glob.glob() returns the full path of every matching file.
for filename in glob.glob(r'C:\Users\usama\Documents\FYP-Data\FYP Project Data\hamza\*.png'):
print(filename)
img=cv2.imread(filename)
# Resize to 40x50 and convert from BGR to grayscale.
rl=cv2.resize(img, (40,50))
gray_image = cv2.cvtColor(rl, cv2.COLOR_BGR2GRAY)
# NOTE(review): `filename` is already an absolute path and `dpath` still
# contains the "*.png" pattern when they are joined here.
cv2.imwrite(os.path.join(dpath,filename), gray_image)
If you pass a full pathname to glob.glob(), then it will return the full path of the result files, not just the filenames.
That means in this loop in your code:
for filename in glob.glob(r'C:\Users\usama\Documents\FYP-Data\FYP Project Data\hamza\*.png'):
filename is a full path such as C:\Users\usama\Documents\FYP-Data\FYP Project Data\hamza\myfile1.png.
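Then, later in the loop when you call cv2.imwrite(os.path.join(dpath,filename), gray_image), you're trying to join together C:\Users\usama\Documents\FYP-Data\FYP Project Data\grayscale images\*.png and C:\Users\usama\Documents\FYP-Data\FYP Project Data\hamza\myfile1.png. Because the second argument is already an absolute path, os.path.join() discards the first one and returns the source path unchanged, so the image is written back into the source folder, which is the cause of your problem.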
glob() is convenient to get the full path of just the files you want, but then you have to separate the filename from the directory.
Try using listdir() instead of glob():
import glob
import cv2
import os
# Plain directories (no wildcards): the source images and the destination
# for the grayscale copies.
sdir = r"C:\Users\usama\Documents\FYP-Data\FYP Project Data\hamza"
ddir = r"C:\Users\usama\Documents\FYP-Data\FYP Project Data\grayscale images"

# Make sure the destination exists before writing into it.
os.makedirs(ddir, exist_ok=True)

for filename in os.listdir(sdir):
    # os.listdir() returns bare file names; keep only the PNG files.
    if not filename.lower().endswith(".png"):
        continue
    print(filename)
    img = cv2.imread(os.path.join(sdir, filename))
    if img is None:
        # cv2.imread() returns None instead of raising when a file cannot
        # be decoded; skip it so cv2.resize() does not fail on None.
        print("skipping unreadable image:", filename)
        continue
    # Resize to 40x50 and convert from BGR to grayscale.
    rl = cv2.resize(img, (40, 50))
    gray_image = cv2.cvtColor(rl, cv2.COLOR_BGR2GRAY)
    # Same file name, but inside the destination directory.
    cv2.imwrite(os.path.join(ddir, filename), gray_image)

Generate .txt files from pdf files keeping the name same as in pdf using python

I have a directory containing pdf files. I have written code that performs OCR when you pass a filename to an object of the wand.image class. What I want to do now is loop over the directory of pdf files, generate an OCR'd txt file for each pdf, and save it to some directory. The code that I have written so far is as follows:
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
# Open one hard-coded PDF with wand at resolution 300.
pdf = wi(filename = r"D:\files\aba7d525-04b8-4474-a40d-e94f9656ed42.pdf", resolution = 300)
pdfImg = pdf.convert('jpeg')
# One JPEG blob (bytes) per entry of pdfImg.sequence.
imgBlobs = []
for img in pdfImg.sequence:
page = wi(image = img)
imgBlobs.append(page.make_blob('jpeg'))
# OCR every blob with pytesseract (English) and collect the resulting strings.
extracted_text = []
for imgBlob in imgBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng')
extracted_text.append(text)
# Only the text of the first entry is printed.
print(extracted_text[0])
The thing is, as you can see in my code ("pdf = .."), I have hardcoded a filename, but I need to pass a directory there so that all the files in that directory are OCR'd. I also need the output files to keep the original filenames, with just .pdf replaced by .txt. How can I do that?
You can use glob
Example:
import glob
import io
import os

import pytesseract
from PIL import Image
from wand.image import Image as wi

# Raw strings are required for these Windows paths: in a normal string
# literal "\f" is a form feed, and a backslash right before the closing
# quote escapes that quote.
input_dir = r"D:\files"
output_dir = r"D:\extracted_files"
os.makedirs(output_dir, exist_ok=True)

# Only pick up the PDF files of the input directory.
files = glob.glob(os.path.join(input_dir, "*.pdf"))
for file in files:
    pdf = wi(filename=file, resolution=300)

    # OCR the document page by page and collect one string per page.
    extracted_text = []
    for img in pdf.convert('jpeg').sequence:
        blob = wi(image=img).make_blob('jpeg')
        page = Image.open(io.BytesIO(blob))
        extracted_text.append(pytesseract.image_to_string(page, lang='eng'))

    # Same base name as the PDF, with ".pdf" replaced by ".txt".
    # splitext() only strips the last suffix, so dots inside the name survive.
    base_name = os.path.splitext(os.path.basename(file))[0]
    out_path = os.path.join(output_dir, base_name + ".txt")
    with open(out_path, 'w', encoding='utf-8') as f:
        # write() needs a single string, not the list of pages.
        f.write("\n".join(extracted_text))

How to loop through folder, containing subfolders to change jpgs to tiffs

I have folders (river reach, eg W&S_River) which contain sub folders (one per gravel bar eg GravelBar_18) which contain images (50 - 300 per gravel bar). I'm trying to convert the images from jpg to tiff. I've got some code that does the conversion, but it takes some time and doesn't loop through the directory folder (river reach). I'm hoping to define the reach folder and have some code that opens each sub folder and converts each sub folder.
I've been trying to use os.walk based on what I've read here. I'm not getting any error messages, but it's not actually doing anything. Below is what I'm currently using to update the image in each sub folder.
import os
import os.path
from PIL import Image
import glob
# Work inside one hard-coded gravel-bar folder.
os.chdir('E:/W&S_River/GravelBar_18')
# Convert every *.jpg of the current directory; sub folders are not visited.
for infile in glob.glob("*.jpg"):
# `file` is the name without its extension.
file, ext = os.path.splitext(infile)
im = Image.open(infile)
# Write a TIFF with the same base name into the same directory.
im.save(file+".tiff", 'TIFF')
print("done")
# Run this from the reach folder (e.g. after os.chdir('E:/W&S_River')).
# The pattern must be relative: a leading "/" would anchor it at the root
# of the file system instead of the current directory.
for infile in glob.glob("*/*.jpg"):  # the leading "*/" is important: it matches every sub folder
    ...
#https://stackoverflow.com/a/36426997/11343720
#https://docs.python.org/3/library/glob.html#glob.glob
# ../../Tools/*/*.gif
import os
import os.path
from PIL import Image
import glob
def jpgToTIFF(folder):
    """Convert every ``*.jpg`` located directly inside *folder* to TIFF.

    Each TIFF is written next to its source image with the same base name
    and the extension ``.tiff``. The working directory of the process is
    left untouched, so the function can be called repeatedly with relative
    folder names.

    Args:
        folder: Directory whose JPEG files are converted (not recursive).
    """
    # glob.escape() keeps characters such as "[" in the folder name from
    # being interpreted as part of the pattern.
    pattern = os.path.join(glob.escape(folder), "*.jpg")
    for infile in glob.glob(pattern):
        file, _ext = os.path.splitext(infile)
        # The context manager closes the source file after saving, which
        # matters when hundreds of images are converted in one run.
        with Image.open(infile) as im:
            im.save(file + ".tiff", 'TIFF')
# Every direct sub folder of the reach folder (one per gravel bar).
subfolders = [f.path for f in os.scandir('f:/work_rpi') if f.is_dir()]
for folder in subfolders:
    print(folder)
    # Convert the images of this sub folder.
    jpgToTIFF(folder)
https://stackoverflow.com/a/40347279/11343720

Opening images from a directory from a list of filenames

I want to be able to open/move/rename the images from a list of filenames in a directory with hundreds of images. Unfortunately there are no wildcards; they are all .jpg and the image names are not sequential.
e.g
# Example data: relative paths of the images that should be processed.
# NOTE(review): the name `list` shadows the built-in list type.
list = ['media\\1520298987567.jpg',
'media\\1520298997109.jpg',
'media\\1520299004063.jpg',
'media\\1520299010082.jpg',
'media\\1520299015452.jpg',
'media\\1520299020690.jpg',
'media\\1520299026092.jpg']
Does anyone know how to do this?
Thank you in advance.
This may help you, (open/ move/ rename) the image files.
from PIL import Image
import glob, os
# Destination directory and the file names that should be processed.
path = '/path/to/move/'
list_of_files = ['testfile1.jpg', 'testfile2.jpg']

for infile in glob.glob("*.jpg"):
    # Only handle the files that were explicitly requested.
    if infile not in list_of_files:
        continue
    # splitext() keeps the leading dot in `ext` ('.jpg'), so no extra '.'
    # must be inserted when the name is put back together.
    filename, ext = os.path.splitext(infile)
    # Rename the image
    renamed_filename = filename + '_renamed' + ext
    # Open the image
    with Image.open(infile) as im:
        # Move the image: a copy is saved under the new name in `path`.
        # Pillow's name for the format is 'JPEG'; 'jpg' is not accepted.
        im.save(os.path.join(path, renamed_filename), 'JPEG')
You can use glob package in python to get list of jpg files and os package to perform open/move/rename operations
import glob
# All jpg files inside the "media" directory (no trailing colon: this is a
# plain assignment, not a block header).
list_of_files = glob.glob("media/*.jpg")
for file in list_of_files:
    # do required operation with the help of os package
    pass

Categories