How do i add path in python docx? - python

I am trying to add a folder path to this code. How do I do it? I don't understand how.
# Folder path. A forward slash is used on purpose: inside a normal string
# literal "\b" is the backspace escape, which would silently corrupt the path.
data_folder = Path("Desktop/biologi")
if y == 'biologi':
    # Build the file name once instead of repeating the expression.
    file_name = y + x.strftime(" %Y-%m-%d") + ".docx"
    document = Document()  # create blank document
    document.save(file_name)  # save blank document; the folder path should go in here
    document = Document(file_name)  # open document
    p = document.add_paragraph()
    p.add_run(str(y + x.strftime(" %Y-%m-%d")))  # edit words
    document.save(file_name)  # save edited document
´´´

Use the os library.
import os

# Expand "~" on its own and let os.path.join() insert the separator.
# The literal "~\Desktop" contains the invalid escape "\D" and is left
# unexpanded on POSIX systems, where a backslash is not a path separator.
DESKTOP_PATH = os.path.join(os.path.expanduser("~"), "Desktop")
data_folder = os.path.join(DESKTOP_PATH, "biologi")
print(data_folder)
# prints (on Windows)
# C:\Users\<Username>\Desktop\biologi

Related

Deleting pdf files from a folder if the search word is present using python

Hi, I am trying to delete the PDF files in a folder that contain the phrase "Publications périodiques" on the first page. So far I am able to search for the phrase, but I don't know how to delete the files.
Code used to search for the word in pdf files
import PyPDF2
import re

# Search every page of a single PDF for a fixed phrase and print the result.
reader = PyPDF2.PdfFileReader("202105192101394-60.pdf")
NumPages = reader.getNumPages()  # page count that drives the loop below
String = "Publications périodiques"
for i in range(0, NumPages):
    PageObj = reader.getPage(i)
    print("this is page " + str(i))
    Text = PageObj.extractText()
    # print(Text)
    ResSearch = re.search(String, Text)  # match object, or None if absent
    print(ResSearch)
Also, how can I loop this over multiple files?
You can delete any file using:
import os
os.remove("C://fake/path/to/file.pdf")
In order to delete a file use
import os
os.unlink(file_path)
where file_path is the path to the relevant file
For browsing through files:
from os import walk
mypath= "./"
_, _, filenames = next(walk(mypath))
Process each file:
import os

for file in filenames:
    # walk() yields bare file names, so prefix the directory they came from.
    file_path = os.path.join(mypath, file)
    foundWord = yourFunction(file_path)
    if foundWord:
        os.remove(file_path)  # Delete the file
Write yourFunction() such that it returns true/false.
I suppose your re.search() is already functional? Or is that part of your question?
If functional, you could just use os to get all the files, perhaps filter them through a list comprehension to only get the pdf-files like so:
import os

all_files = os.listdir("C:/../or_whatever_path")
# Test the extension itself: a substring check would also accept names such
# as "report.pdf.bak". lower() lets "FILE.PDF" through as well.
only_pdf_files = [file for file in all_files if file.lower().endswith(".pdf")]
From that point on, you can iterate through all the PDF files and run the code you have already written on each one. When "ResSearch" is truthy (that is, re.search() returned a match), delete the file with the os.remove() method:
String = "Publications périodiques"
for file in only_pdf_files:
    reader = PyPDF2.PdfFileReader(file)
    NumPages = reader.getNumPages()  # page count of the current file
    for i in range(0, NumPages):
        PageObj = reader.getPage(i)
        print("this is page " + str(i))
        Text = PageObj.extractText()
        # print(Text)
        ResSearch = re.search(String, Text)
        if ResSearch:
            os.remove(file)
            # Stop scanning: the file is gone, and removing it again on a
            # later matching page would raise FileNotFoundError.
            break
EDIT:
When your PDF files aren't in the same directory as your Python script, the directory has to be prepended to the file name that is passed to PdfFileReader() and to the os.remove() method.
pdf_dir = "C:/../or_whatever_path"  # the folder that was passed to os.listdir()
String = "Publications périodiques"
for file in only_pdf_files:
    # os.listdir() returns bare names; build the full path before using it.
    file_path = os.path.join(pdf_dir, file)
    reader = PyPDF2.PdfFileReader(file_path)
    NumPages = reader.getNumPages()
    for i in range(0, NumPages):
        PageObj = reader.getPage(i)
        Text = PageObj.extractText()
        # print(Text)
        ResSearch = re.search(String, Text)
        if ResSearch:
            os.remove(file_path)
            # The file no longer exists, so do not look at further pages.
            break

How to add a relative file path inside a pdf using pypdf

Context
I have a pdf with links.
I want to replace all the external links with local files in the same folder.
Is there a way to do that in pypdf or python
e.g.
import os

outputStream = open("destination.pdf", "wb")
key = '/Annots'
uri = '/URI'
ank = '/A'
dir_path = os.path.dirname(os.path.realpath(__file__))
cwd = os.getcwd()
for x in range(existing_pdf.getNumPages()):
    page = existing_pdf.getPage(x)
    page_object = page.getObject()
    if key in page_object:
        ann = page_object[key]
        for a in ann:
            u = a.getObject()
            # Not every annotation carries an action dictionary, so check
            # for '/A' before looking inside it.
            if ank in u and uri in u[ank]:
                # Point the link at a file next to the PDF (relative URI).
                u[TextStringObject(ank)][TextStringObject(uri)] = TextStringObject("file:./foo1.pdf")
    output.addPage(page)
# finally, write "output" to a real file
output.write(outputStream)
outputStream.close()
The above does not work, i.e. foo1.pdf is not linked properly.
If I use "file:///{CWD}/foo1.pdf" instead, it works.
Is there a way to use only a relative path?
After reading through the pdf structure and documentation I was able to write the following and it works as expected.
for x in range(existing_pdf.getNumPages()):
    page = existing_pdf.getPage(x)
    page_object = page.getObject()
    if key in page_object:
        for a in page_object[key]:
            u = a.getObject()
            # Only annotations that have an action dictionary with a URI
            # entry are external links.
            if ank in u and uri in u[ank]:
                action = u[ank]
                # Turn the URI action into a Launch action that targets a
                # file given relative to the PDF.
                del action[uri]
                action[NameObject('/F')] = TextStringObject("./sheets/sheet1.pdf")
                action[NameObject('/S')] = NameObject("/Launch")
                # Pass a real boolean rather than the string "true".
                action[NameObject('/NewWindow')] = BooleanObject(True)
    output.addPage(page)
# finally, write "output" to a real file
output.write(outputStream)
outputStream.close()

How to read all pdf files in a directory and convert to text file using tesseract python 3?

How to read all pdf files in a directory and convert to text file using tesseract python 3?
The code below reads one PDF file and converts it to a text file.
But I want to read all the PDF files in a directory and convert each of them to a text file using Tesseract with Python 3.
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
pdf_filename = "pdffile_name.pdf"
txt_filename = "text_file_created.txt"


def tesseract(pdf_filename, txt_filename):
    """OCR every page of a PDF and append the recognised text to a file.

    Each page is rendered to an image, saved in the current directory as
    ``page_<n>.jpg`` and passed through Tesseract. The text is appended to
    *txt_filename*, so content already in that file is kept.

    Args:
        pdf_filename: Path of the PDF to read.
        txt_filename: Path of the text file to append to.

    Returns:
        All lines of *txt_filename* after the new text has been written.
    """
    pages = convert_from_path(pdf_filename, 500)

    # Step 1: store each rendered page as a JPEG file.
    image_names = []
    for page_number, page in enumerate(pages, start=1):
        image_name = "page_" + str(page_number) + ".jpg"
        page.save(image_name, 'JPEG')
        image_names.append(image_name)

    # Step 2: recognise the text of each image and append it to the output.
    with open(txt_filename, "a", encoding="utf-8") as f:
        for image_name in image_names:
            with Image.open(image_name) as image:
                text = str(pytesseract.image_to_string(image))
            # Drop "hyphen + line break" pairs so split words are rejoined.
            text = text.replace('-\n', '')
            f.write(text)

    # Step 3: read the finished file back as a list of lines.
    with open(txt_filename, "r", encoding="utf-8") as f1:
        return f1.readlines()


tesseract(pdf_filename, txt_filename)
I have code for reading the PDF files in a directory, but I don't know how to combine it with the code above.
def readfiles():
    """Change into ``path`` and return the names of the PDF files found there.

    Expects a module-level variable ``path`` naming the directory to scan.
    Note that the working directory of the process is changed as a side
    effect, and the returned names are relative to that directory.

    Returns:
        A list with the file name of every ``*.pdf`` entry in ``path``.
    """
    import glob  # kept local so the snippet does not rely on a top-level import

    os.chdir(path)
    pdfs = []
    for file_list in glob.glob("*.pdf"):
        print(file_list)
        pdfs.append(file_list)
    # Hand the collected names back so the caller can process them.
    return pdfs


readfiles()
Simply convert the variable pdf_filename to a list using this code snippet:
import glob

# glob.glob() already returns a list; sorting makes the processing order
# reproducible, because the order of a directory listing is not guaranteed.
pdf_filename = sorted(glob.glob("your_preferred_path/*.pdf"))
which will get you all the pdf files you want and store it into a list.
Or simply use any of the methods posted here:
How do I list all files of a directory?
Once you do that, you now have a list of pdf files.
Now iterate over the list of PDFs, one at a time, and convert each of them; this gives you one text file per PDF.
You can use it something like this code snippet:
for one_pdf in pdf_filename:
    # One text file per PDF, named after it (report.pdf -> report.txt).
    one_txt = os.path.splitext(one_pdf)[0] + ".txt"
    tesseract(one_pdf, one_txt)
Hope this helps.

DOCX file to text file conversion using Python

I wrote the following code to convert my docx file to text file. The output that I get printed in my text file is the last paragraph/part of the whole file and not the complete content. The code is as follows:
from docx import Document
import io
import os
import shutil


def convertDocxToText(path):
    """Write the text of every .docx file in *path* to a .txt file.

    *path* must end with a path separator because file names are appended
    to it directly.
    """
    for d in os.listdir(path):
        fileExtension = d.split(".")[-1]
        if fileExtension == "docx":
            docxFilename = path + d
            print(docxFilename)
            document = Document(docxFilename)
            # for printing the complete document
            print('\nThe whole content of the document:->>>\n')
            for para in document.paragraphs:
                textFilename = path + d.split(".")[0] + ".txt"
                # NOTE: the file is re-opened in "w" mode for every single
                # paragraph, so each write replaces the previous one. This
                # is the behaviour the question is asking about.
                with io.open(textFilename, "w", encoding="utf-8") as textFile:
                    # textFile.write(unicode(para.text))
                    x = unicode(para.text)
                    print(x)  # the complete content gets printed by this line
                    textFile.write((x))  # after writing the content to text file only last paragraph is copied.
                    # textFile.write(para.text)


path = "/home/python/resumes/"
convertDocxToText(path)
the following is the solution for the above problem:
from docx import Document
import io
import shutil
import os


def convertDocxToText(path):
    """Write a plain-text copy next to every .docx file found in *path*.

    For each ``name.docx`` in the directory a ``name.txt`` file is created
    (or overwritten) that holds the document's paragraphs, one per line.

    Args:
        path: Directory that contains the .docx files.
    """
    for d in os.listdir(path):
        # splitext keeps dots inside the name ("my.resume.docx" -> "my.resume").
        baseName, fileExtension = os.path.splitext(d)
        if fileExtension != ".docx":
            continue
        docxFilename = os.path.join(path, d)
        print(docxFilename)
        document = Document(docxFilename)
        textFilename = os.path.join(path, baseName + ".txt")
        # Open the output once per document: opening it in "w" mode for
        # every paragraph would leave only the last paragraph in the file.
        with io.open(textFilename, "w", encoding="utf-8") as textFile:
            for para in document.paragraphs:
                # End each paragraph with a newline so they do not run together.
                textFile.write(para.text + u"\n")


path = "/home/python/resumes/"
convertDocxToText(path)
Problem
as your code says in the last for loop:
for para in document.paragraphs:
textFilename = path + d.split(".")[0] + ".txt"
with io.open(textFilename,"w", encoding="utf-8") as textFile:
x=unicode(para.text)
textFile.write((x))
For each paragraph in the document, you open a file named textFilename. Say you have a file named MyFile.docx in /home/python/resumes/: the value of textFilename is then /home/python/resumes/MyFile.txt on every iteration of the for loop. The problem is that you open this same file in w mode (write mode) each time, which truncates it and overwrites the whole file content, so only the last paragraph survives.
Solution:
You must open the file once, outside that for loop, and then add the paragraphs to it one by one.

How to create corpus from multiple docx files in Python

I have a folder that consists of various 10 docx files. I am trying to create a corpus, which should be a list of length 10. Each element of the list should refer to the text of each docx document.
I have following function to extract text from docx files:
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import glob
from docx import *
def getText(filename):
    """Return the paragraphs of a .docx file as a list of UTF-8 byte strings."""
    document = Document(filename)
    newparatextlist = []
    for paragraph in document.paragraphs:
        newparatextlist.append(paragraph.text.strip().encode("utf-8"))
    return newparatextlist


path = 'pat_to_folder/*.docx'
files = glob.glob(path)
corpus_list = []
for f in files:
    # Each call returns a list, so corpus_list becomes a list of lists.
    cur_corpus = getText(f)
    corpus_list.append(cur_corpus)
corpus_list[0]
However, if I have content as follows in my word documents:
http://www.actus-usa.com/sampleresume.doc
https://www.myinterfase.com/sjfc/resources/resource_view.aspx?resource_id=53
The above function creates a list of lists. How can I simply create a corpus out of the files?
TIA!
I tried a different method for a similar problem, which also involved loading several docx files into a corpus. I made some slight changes to your code:
def getText(filename):
    """Return the text of a .docx file as one string.

    Newlines at either end of each paragraph are stripped and the
    paragraphs are joined with single spaces.
    """
    doc = Document(filename)
    return " ".join(para.text.strip("\n") for para in doc.paragraphs)


PATH = "path_to_folder/*.docx"
files = glob.glob(PATH)
# One string per document, so the corpus is a flat list of texts.
corpus_list = [getText(f) for f in files]
hopefully this solves the problem!
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# PlaintextCorpusReader reads plain-text files and treats its second argument
# as a regular expression, not a shell glob, so '*.docx' is not a valid
# pattern, and .docx archives could not be read as text anyway. Save the text
# of each document as a .txt file in ROOT_PATH first, then load those files.
corpus = PlaintextCorpusReader(ROOT_PATH, r'.*\.txt')
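This creates a corpus from the plain-text files in ROOT_PATH. Note that PlaintextCorpusReader cannot read .docx files directly, so the text of each document has to be extracted and saved as a .txt file first.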

Categories