Image does not exist - python

I want to print images in word so that I can print them easier, same size, etc.
I use this piece of code:
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
import os
folder = "./mercedes/"
for img_name in os.listdir(folder):
doc = DocxTemplate("C:\\Users\\jpp08\\Documents\\MIDDELKOOP\\testimg.docx")
img1 = InlineImage(doc, image_descriptor=f'{img_name}', width=Mm(20), height=Mm(10))
context = {'img1': img1}
doc.render(context)
doc.save(f"{folder}/{img_name}" +".docx")
But I got the error that "name of picture".jpg does not exist. But it's the right name and right location.

Related

Model card fails to show plots

I am trying to add a .png image to my model card. But it keeps giving a blank picture. I followed the instruction given by Google.
Here is what my code looks like,
from io import BytesIO
import base64
from google.colab import files
def plot_to_str():
img = BytesIO()
plt.savefig(img, format='png')
return base64.encodebytes(img.getvalue()).decode('utf-8')
my_plot = files.upload()
my_plot = plot_to_str()
And for the model card:
mct = mctlib.ModelCardToolkit()
model_card = mct.scaffold_assets()
# skipped the beginning of the model_card
model_card.model_parameters.data.append(mctlib.Dataset())
model_card.model_parameters.data[0].graphics.collection = [
mctlib.Graphic(image=my_plot)
]
mct.update_model_card(model_card)
html = mct.export_format()
display.display(display.HTML(html))
As a result I get this, a blan plot:

builtins.AttributeError: 'NoneType' object has no attribute 'seek'

I am trying to write a program that reads in a folder of photos, analyses their heights and widths, and resizes them accordingly, then sends them to a word document. I keep getting this error and I am unsure what is causing it:
from docx import Document
import cv2
from PIL import Image
import glob
import os
import numpy
document = Document()
img_dir = "C:/Users/27832/Desktop/Report Images"
data_path = os.path.join(img_dir,'*g')
files = glob.glob(data_path)
photos = []
for pic in files:
imagg = cv2.imread(pic)
photos.append(imagg)
for i in range(0, len(photos)):
if 0.85*(photos[i].shape[0]) < (photos[i].shape[1]) < 1.15*(photos[i].shape[0]):
resized_image = photos[i].resize((314, 314))
document.add_picture(resized_image)
elif (photos[i].shape[1]) >= 0.85*(photos[i].shape[0]):
resized_image = photos[i].resize((257, 382))
document.add_picture(resized_image)
elif (photos[i].shape[1]) <= 1.15*(photos[i].shape[0]):
resized_image = photos[i].resize((401, 325))
document.add_picture(resized_image)
document.save("C:/Users/27832/Desktop/Word Macro Program/Report.docx")

Cache error while doing OCR on a directory of pdf's in python

I am trying to OCR an entire directory of pdf files using pytesseract and imagemagick but the issue is that imagemagick is consuming all my Temp folder space and finally I'm getting a cache error i.e "CacheError: unable to extend cache 'C:/Users/Azu/AppData/Local/Temp/magick-18244WfgPyAToCsau11': No space left on device # error/cache.c/OpenPixelCache/3883" I have also written a code to delete the temp folder content once OCR'd but still facing the same issue.
Here's the code till now:
import io
import os
import glob
from PIL import Image
import pytesseract
from wand.image import Image as wi
files = glob.glob(r"D:\files\**")
tempdir = r"C:\Users\Azu\AppData\Local\Temp"
filesall = os.listdir(tempdir)
for file in files:
name = os.path.basename(file).split('.')[0]
#print(file)
pdf = wi(filename = file, resolution = 300)
pdfImg = pdf.convert('jpeg')
imgBlobs = []
for img in pdfImg.sequence:
page = wi(image = img)
imgBlobs.append(page.make_blob('jpeg'))
extracted_texts = []
for imgBlob in imgBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng')
extracted_texts.append(text)
with open("D:\\extracted_text\\"+ name + ".txt", 'w') as f:
f.write(str(extracted_texts))
for ifile in filesall:
if "magick" in ifile:
os.remove(os.path.join(tempdir,ifile))

how to add multiple pdfs to be converted into excel?

I have program which converts pdf to excel, Now i want add multiple inputs i.e. multiple pdfs to be converted one by one.
my code is below:
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import os
import cv2
import pandas as pd
import re
import numpy as np
import os
pdf = wi(filename= "pdfs/jaalna.pdf", resolution =300)
pdfImage = pdf.convert("jpg")
imageBlobs = []
for img in pdfImage.sequence:
imgPage = wi(image = img)
#img.filter(ImageFilter.EDGE_ENHANCE_MORE )
imageBlobs.append(imgPage.make_blob('jpg'))
recognized_text = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng1+mar1')
recognized_text.append(text)
newfile = open('aama.txt','w')
newfile.write(",".join(recognized_text))
#add a folder as input.
you can use loop
for name in ["pdfs/jaalna.pdf", "other/file.pdf"]:
pdf = wi(filename=name, resolution=300)
# rest of code
or you can use sys.argv to get names as
script.py pdfs/jaalna.pdf other/file.pdf other/third.pdf
and code
import sys
for name in sys.argv[1:]:
pdf = wi(filename=name, resolution=300)
# rest of code
Try the code below. This will loop through every PDF file in the folder directory you define. Be sure to update your file_path to be where your PDFs are saved, making sure you use double backslashs in place of single backslashes.
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import cv2
import pandas as pd
import re
import numpy as np
import os
file_path = "C:\\Users\\..."
for file in os.listdir(file_path):
if file.endswith(".pdf"):
pdf = wi(file, resolution =300)
pdfImage = pdf.convert("jpg")
imageBlobs = []
for img in pdfImage.sequence:
imgPage = wi(image = img)
#img.filter(ImageFilter.EDGE_ENHANCE_MORE )
imageBlobs.append(imgPage.make_blob('jpg'))
recognized_text = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng1+mar1')
recognized_text.append(text)
newfile = open(file+'.txt','w')
newfile.write(",".join(recognized_text))
#add a folder as input.

Incorrect output result: Text extraction for .pdf, .pptx, and .docx in python

I created a function that will open each file in a directory and extract the text from each file and output it in an excel sheet using Pandas. The indexing for each file type seems to be working just fine.However when the text gets extracted from the first file in the path directory It seems to be replacing the other extracted text from the other files with the first file's extracted text. Please help, thank you!
from pathlib import Path
import shutil
from datetime import datetime
import time
from configparser import ConfigParser
import glob
import fileinput
import pandas as pd
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import docx2txt
from pptx import Presentation
p = Path('C:/Users/XXXX/Desktop/test_folder')
txt_files = list(p.rglob('*txt'))
PDF_files = list(p.rglob('*pdf'))
csv_files = list(p.rglob('*csv'))
docx_files = list(p.rglob('*docx'))
pptx_files = list(p.rglob('*pptx'))
def loader(path):
with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
docx_out,pptx_out = [],[]
data = []
print(pptx_files)
if path.suffix == ".pdf":
for name1 in PDF_files:
data.append(pdf_to_text(name1))
return data
elif path.suffix == ".docx":
for name2 in docx_files:
docx_out = (docx2txt.process(name2))
return docx_out
elif path.suffix == ".pptx":
for file in pptx_files:
prs = Presentation(file)
for slide in prs.slides:
for shape in slide.shapes:
if not shape.has_text_frame:
continue
for paragraph in shape.text_frame.paragraphs:
for run in paragraph.runs:
pptx_out.append(run.text)
return pptx_out
else:
return f.readlines()
Example of the output is:
Text content file name
this is a test first_pdf.pdf
this is a test second_pdf.pdf
the "second_pdf.pdf" does not contain "this is a test" but for some reason it takes in whatever text that gets extracted from the first pdf. (same goes for all files types.
This block
if path.suffix == ".pdf":
for name1 in PDF_files:
data.append(pdf_to_text(name1))
return data
returns from your function after appending the first PDF file. It never gets to the second one because you are returning from inside the for loop. This should fix it:
if path.suffix == ".pdf":
for name1 in PDF_files:
data.append(pdf_to_text(name1))
return data

Categories