DOCX file to text file conversion using Python

DOCX file to text file conversion using Python - python

I wrote the following code to convert my docx file to text file. The output that I get printed in my text file is the last paragraph/part of the whole file and not the complete content. The code is as follows:
from docx import Document
import io
import shutil
# Convert every .docx file found in `path` to a .txt file written next to it.
# NOTE(review): `os` is used below but is not imported in this snippet.
def convertDocxToText(path):
for d in os.listdir(path):
fileExtension=d.split(".")[-1]
if fileExtension =="docx":
# Plain concatenation: `path` has to end with a path separator.
docxFilename = path + d
print(docxFilename)
document = Document(docxFilename)
# for printing the complete document
print('\nThe whole content of the document:->>>\n')
for para in document.paragraphs:
textFilename = path + d.split(".")[0] + ".txt"
# NOTE(review): the same output file is opened in "w" mode once per paragraph;
# "w" truncates the file, so each paragraph replaces what was written before it.
with io.open(textFilename,"w", encoding="utf-8") as textFile:
#textFile.write(unicode(para.text))
# NOTE(review): `unicode` is a Python 2 builtin; it is not defined on Python 3.
x=unicode(para.text)
# NOTE(review): `//` does not start a comment in Python; the trailing remark needs `#`.
print(x) //the complete content gets printed by this line
textFile.write((x)) #after writing the content to text file only last paragraph is copied.
#textFile.write(para.text)
path= "/home/python/resumes/"
convertDocxToText(path)

the following is the solution for the above problem:
from docx import Document
import io
import shutil
import os
def convertDocxToText(path):
    """Convert every ``.docx`` file directly inside *path* to a UTF-8 ``.txt`` file.

    Each text file is written next to its source document, with the same
    base name, one paragraph per line.

    Args:
        path: Directory to scan (not recursive). A trailing separator is
            optional.
    """
    for d in os.listdir(path):
        # splitext keeps dots inside the base name ("cv.v2.docx" -> "cv.v2"),
        # whereas d.split(".")[0] would truncate it to "cv".
        baseName, fileExtension = os.path.splitext(d)
        if fileExtension.lower() != ".docx":
            continue
        docxFilename = os.path.join(path, d)
        print(docxFilename)
        document = Document(docxFilename)
        textFilename = os.path.join(path, baseName + ".txt")
        # Open the output once per document: re-opening it in "w" mode for
        # every paragraph truncates it each time and keeps only the last one.
        with io.open(textFilename, "w", encoding="utf-8") as textFile:
            for para in document.paragraphs:
                # unicode() only exists on Python 2; para.text needs no cast.
                # The newline keeps paragraphs from running into each other.
                textFile.write(para.text + u"\n")


if __name__ == "__main__":
    # Guarded so importing this module does not start a conversion.
    path = "/home/python/resumes/"
    convertDocxToText(path)

Problem
as your code says in the last for loop:
for para in document.paragraphs:
textFilename = path + d.split(".")[0] + ".txt"
with io.open(textFilename,"w", encoding="utf-8") as textFile:
x=unicode(para.text)
textFile.write((x))
For each paragraph in the document, you open the file named by textFilename. Say you have a file named MyFile.docx in /home/python/resumes/: textFilename will then be /home/python/resumes/MyFile.txt on every iteration of the for loop. The problem is that you reopen that same file in w (write) mode each time, which truncates it and overwrites everything written so far, so only the last paragraph survives.
Solution:
You must open the file once, outside that for loop, and then write the paragraphs to it one by one.

Related

How do I insert a blank page between files using PyPDF2 pdfmerger

I am using this script to merge bill files so I can print 1 file. The bills are 3 pages, so I need to insert a blank page after each file so the first page of the next bill doesn't print on the back of the previous bill. How can I insert a blank page after each iteration of the loop for each bill file?
# If the file errors with "no module PyPDF2" then from command line, run pip install PyPDF2
import os
from os import listdir,mkdir,startfile
from os.path import isfile, join,exists
from PyPDF2 import PdfFileMerger
#Input file path and print the pdf files in that path
path = input("Enter the folder location: ")
# NOTE(review): `'.pdf' in f` matches the substring anywhere in the name, not only as the extension.
pdffiles = [f for f in listdir(path) if isfile(join(path, f)) and '.pdf' in f]
print('\nList of PDF Files:\n')
for file in pdffiles:
print(file)
#Input the name of the result file
resultFile = input("\nEnter the name of the result file : ")
if '.pdf' not in resultFile:
resultFile += '.pdf'
#Append the pdf files
merger = PdfFileMerger()
for pdf in pdffiles:
# Paths are built with a literal backslash separator.
merger.append(path+'\\'+pdf)
# The line below hopefully will add a blank page between
# NOTE(review): `w` and `h` are never assigned anywhere in this script. Presumably
# addBlankPage belongs to PdfFileWriter rather than PdfFileMerger — TODO confirm.
merger.addBlankPage(w,h)
#If the Output directory does not exist then create one
if not exists(path+'\\Output'):
mkdir(path+'\\Output')
#Write the merged result file to the Output directory
merger.write(path+'\\Output\\'+resultFile)
merger.close()
#Launch the result file
print('\n'+resultFile,'Successfully created!!! at ',path+'\\Output\\')
startfile(path+'\\Output\\'+resultFile)

First add a blank page to the end of each file, then merge the resulting files.
import os
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
def add_blank_to_end(files: list) -> list:
    """Write a copy of each PDF with one blank page appended.

    Each copy is saved beside its source with a ``b`` prefix on the file
    name (``dir/bill.pdf`` -> ``dir/bbill.pdf``).

    Args:
        files: Paths of the PDF files to pad.

    Returns:
        The paths of the padded copies, in the same order as *files*.
    """
    names = []
    for f in files:
        # Prefix only the file name; prefixing the whole string would turn
        # "dir/bill.pdf" into the non-existent directory "bdir/bill.pdf".
        directory, basename = os.path.split(f)
        padded = os.path.join(directory, f'b{basename}')
        # The source must stay open until the writer has emitted its pages,
        # so the output is written inside the input's `with` block.
        with open(f, 'rb') as pdf_in:
            output = PdfFileWriter()
            output.appendPagesFromReader(PdfFileReader(pdf_in))
            output.addBlankPage()
            # Closing the output flushes it before the merge step reads it.
            with open(padded, 'wb') as pdf_out:
                output.write(pdf_out)
        names.append(padded)
    return names
def merge_pdfs(files: list, output: str = "document-output.pdf"):
    """Concatenate the given PDF files, in order, into a single PDF.

    Args:
        files: Paths of the PDF files to merge.
        output: Path of the merged file; defaults to the name the original
            snippet hard-coded.
    """
    merger = PdfFileMerger()
    try:
        for f in files:
            merger.append(f)
        merger.write(output)
    finally:
        # Release the merger's file handles even if an append or write failed.
        merger.close()
files = ['file1.pdf', 'file2.pdf']
with_blank = add_blank_to_end(files)
try:
    merge_pdfs(with_blank)
finally:
    # Delete the temporary padded copies even when the merge fails.
    for i in with_blank:
        os.remove(i)

Converting multiple PDF files into txt in Python?

import os
from PyPDF2 import PdfFileReader, PdfFileWriter
# Walk every entry of C:/117 and write the extracted text of its pages to a text file.
for filename in os.listdir("C:/117"):
path = os.path.join("C:/117/", filename)
print(path)
# NOTE(review): the output name 'file.txt' is fixed, so every input PDF targets the same file.
with open('file.txt', 'w', encoding='utf-8') as file:
# NOTE(review): PdfFileReader(path) is constructed again for the page count and for each page.
for page_num in range(PdfFileReader(path).numPages):
print('Page: {0}'.format(page_num))
pageObj = PdfFileReader(path).getPage(page_num)
try:
txt = pageObj.extractText()
# NOTE(review): the bare except silently skips any page whose extraction raises.
except:
pass
else:
# Page labels written to the file are 1-based; the printed counter above is 0-based.
file.write('Page{0}\n'.format(page_num+1))
file.write(txt)
file.close()
I am converting hundreds of pdf files into txt. However, with this code, all the PDFs are merged into a single txt file. Is there a way to create separate txt file for each PDF I convert? Thanks

from pathlib import Path
from pypdf import PdfReader
def convert_pdf_to_text(path: Path) -> str:
    """Return the extracted text of every page of the PDF at *path*.

    The text of each page is followed by a newline; pages appear in
    document order.
    """
    reader = PdfReader(path)
    return "".join(page.extract_text() + "\n" for page in reader.pages)
# Convert every PDF below "Documents" (recursively) to a sibling .txt file.
for path in Path("Documents").glob("**/*.pdf"):
    txt_path = path.with_suffix(".txt")
    # Check before converting so already-converted PDFs are not parsed again.
    if txt_path.exists():
        print(f"Skip {txt_path} as it already exists")
        continue
    text = convert_pdf_to_text(path)
    # Explicit encoding: the platform default may be unable to encode the text.
    with open(txt_path, "wt", encoding="utf-8") as fp:
        fp.write(text)

Deleting pdf files from a folder if the search word is present using python

Hi i am trying to delete the pdf files in a folder which contains the word "Publications périodiques" in the first , so far i am able to search for the word but dont know how to delete the files .
Code used to search for the word in pdf files
import PyPDF2
import re
# Search one PDF, page by page, for the phrase stored in String.
# NOTE(review): the name `object` shadows the Python builtin of the same name.
object = PyPDF2.PdfFileReader("202105192101394-60.pdf")
String = "Publications périodiques"
# NOTE(review): NumPages is never assigned in this snippet.
for i in range(0, NumPages):
PageObj = object.getPage(i)
print("this is page " + str(i))
Text = PageObj.extractText()
# print(Text)
# String is handed to re.search as a regular-expression pattern.
ResSearch = re.search(String, Text)
print(ResSearch)
Also how to loop this in multiple files

You can delete any file using:
import os
os.remove("C://fake/path/to/file.pdf")

In order to delete a file use
import os
os.unlink(file_path)
where file_path is the path to the relevant file

For browsing through files:
# Import the module itself: the loop that follows this snippet calls os.remove().
import os

mypath = "./"
# os.walk yields (dirpath, dirnames, filenames); the first item describes mypath itself.
_, _, filenames = next(os.walk(mypath))
Process each file:
import os

for file in filenames:
    # filenames holds bare names; join them with the directory they came from.
    file_path = os.path.join(mypath, file)
    foundWord = yourFunction(file_path)
    if foundWord:
        os.remove(file_path)  # Delete the file
Write yourFunction() such that it returns true/false.

I suppose your re.search() is already functional? Or is that part of your question?
If functional, you could just use os to get all the files, perhaps filter them through a list comprehension to only get the pdf-files like so:
import os

all_files = os.listdir("C:/../or_whatever_path")
# Match on the extension only: `".pdf" in file` would also accept names such as "notes.pdf.bak".
only_pdf_files = [file for file in all_files if file.lower().endswith(".pdf")]
From that point on, you can iterate through all the PDF files and run the code you've already written on each one; when "ResSearch" is truthy (i.e. re.search() found a match), delete the file with os.remove():
# Delete every PDF in only_pdf_files that contains the phrase on any page.
String = "Publications périodiques"
for file in only_pdf_files:
    reader = PyPDF2.PdfFileReader(file)
    # The page count has to be read per file; it was never defined before.
    NumPages = reader.getNumPages()
    for i in range(0, NumPages):
        print("this is page " + str(i))
        Text = reader.getPage(i).extractText()
        ResSearch = re.search(String, Text)
        if ResSearch:
            os.remove(file)
            # Stop at the first hit: a second os.remove() on the same file
            # would raise FileNotFoundError.
            break
EDIT:
When your PDF files aren't in the current working directory, prepend their directory to the file name passed to os.remove() (and to PdfFileReader), e.g. os.path.join(folder, file).

# Delete every PDF in only_pdf_files that contains the phrase on any page.
String = "Publications périodiques"
for file in only_pdf_files:
    reader = PyPDF2.PdfFileReader(file)
    NumPages = reader.getNumPages()
    for i in range(0, NumPages):
        Text = reader.getPage(i).extractText()
        ResSearch = re.search(String, Text)
        if ResSearch:
            os.remove(file)
            # The file is gone; scanning further pages would call os.remove()
            # on it again and raise FileNotFoundError.
            break

AttributeError: 'str' object has no attribute 'P' when trying to extract text from all .odt files, recursively, using odfpy library

I wrote a script in order to convert all my .odt files, recursively - that is in the CWD and all the subdirectories, to text files. The code in question:
import glob, os
from odf import text, teletype
from odf.opendocument import load
# Collect every .odt file under the current working directory, recursively.
fileList = glob.glob(f"{os.getcwd()}/**/*.odt", recursive=True)
for f in fileList:
textdoc = load(f)
# `text` must still be the module imported from odf for `text.P` to resolve.
allparas = textdoc.getElementsByType(text.P)
print(allparas)
s = len(allparas)
# NOTE(review): this rebinds the name `text` from the imported odf module to a str,
# so any later evaluation of `text.P` sees the str — matching the reported
# "'str' object has no attribute 'P'" error.
text = ""
for i in range(s):
text += teletype.extractText(allparas[i])
text += "\n"
# The output path is the input path with ".odt" removed, so it has no extension.
output_file = f.replace(".odt", "")
with open(output_file, 'w') as textfile:
textfile.write(text)
When I run it I get the following error:
File "./odtR.py", line 12, in
allparas = textdoc.getElementsByType(text.P)
AttributeError: 'str' object has no attribute 'P'
By comparison, all is fine when I run a similar script which is meant to convert just one file of my choosing from CWD . This is the code of this script:
from odf import text, teletype
from odf.opendocument import load
# Convert one .odt file, whose name is read from the user, to plain text.
path_to_your_odt_file = input("What is the name of your odt file?\n")
# The output path is the input path with ".odt" removed, so it has no extension.
output_file = path_to_your_odt_file.replace(".odt", "")
textdoc = load(path_to_your_odt_file)
# `text.P` is evaluated here, before the name `text` is rebound to a str below.
allparas = textdoc.getElementsByType(text.P)
s = len(allparas)
text = ""
for i in range(s):
text += teletype.extractText(allparas[i])
text += "\n"
# Same expression as the earlier assignment to output_file.
output_file = path_to_your_odt_file.replace(".odt", "")
with open(output_file, 'w') as textfile:
textfile.write(text)
What did I do wrong in the former script? How would you rewrite it?

How to read all pdf files in a directory and convert to text file using tesseract python 3?

How to read all pdf files in a directory and convert to text file using tesseract python 3?
The below code is for reading one pdf file and convert to text file.
But i want to read all pdf files in a directory and convert to text file using tesseract python 3
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
pdf_filename = "pdffile_name.pdf"
txt_filename = "text_file_created.txt"
def tesseract(pdf_filename, txt_filename):
    """OCR every page of a PDF and append the recognised text to a text file.

    Each page is rendered at 500 DPI and saved as ``page_<n>.jpg`` in the
    current directory, then recognised with pytesseract.

    Args:
        pdf_filename: Path of the PDF to read.
        txt_filename: Path of the text file; text is appended to it.

    Returns:
        All lines of *txt_filename* after the new text has been appended.
    """
    pages = convert_from_path(pdf_filename, 500)
    image_names = []
    for image_counter, page in enumerate(pages, start=1):
        image_name = "page_" + str(image_counter) + ".jpg"
        page.save(image_name, 'JPEG')
        image_names.append(image_name)
    with open(txt_filename, "a", encoding="utf-8") as f:
        for image_name in image_names:
            # Close each image as soon as it has been recognised.
            with Image.open(image_name) as image:
                text = str(pytesseract.image_to_string(image))
            # Re-join words that were hyphenated across a line break.
            text = text.replace('-\n', '')
            f.write(text)
    with open(txt_filename, "r", encoding="utf-8") as f1:
        return f1.readlines()
tesseract(pdf_filename,txt_filename)
I have code for reading the PDF files in a directory, but I don't know how to combine it with the code above:
def readfiles(directory=None):
    """Change into a directory and return the names of the PDF files in it.

    Args:
        directory: Directory to scan. When omitted, the module-level ``path``
            variable is used, as in the original snippet.

    Returns:
        The PDF file names (relative to the directory), as found by glob.
    """
    import glob  # local import: the snippet never imports glob at file level

    if directory is None:
        directory = path
    # Kept from the original: the returned names are relative to this directory.
    os.chdir(directory)
    pdfs = []
    for file_list in glob.glob("*.pdf"):
        print(file_list)
        pdfs.append(file_list)
    # Return the list so callers can use it; it was previously discarded.
    return pdfs
readfiles()

Simply convert the variable pdf_filename to a list using this code snippet:
import glob

# glob.glob already returns a list of the matching paths.
pdf_filename = glob.glob("your_preferred_path/*.pdf")
which will get you all the pdf files you want and store it into a list.
Or simply use any of the methods posted here:
How do I list all files of a directory?
Once you do that, you now have a list of pdf files.
Now iterate over the list of PDFs, one at a time, converting each one; this will give you a list of text files.
You can use it something like this code snippet:
import os

for one_pdf in pdf_filename:
    # Write each PDF's text next to it: "scan.pdf" -> "scan.txt".
    txt_filename = os.path.splitext(one_pdf)[0] + ".txt"
    tesseract(one_pdf, txt_filename)
Hope this helps.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

DOCX file to text file conversion using Python - python

Related

How do I insert a blank page between files using PyPDF2 pdfmerger

Converting multiple PDF files into txt in Python?

Deleting pdf files from a folder if the search word is present using python

AttributeError: 'str' object has no attribute 'P' when trying to extract text from all .odt files, recursively, using odfpy library

How to read all pdf files in a directory and convert to text file using tesseract python 3?

Categories

Resources