Convert a PDF files to TXT files - python

I need a last touch from an expert !! I want to convert all pdf files in a directory to txt files. I wrote a code to create empty txt files having the same name as pdf files and a code to convert a single pdf to txt but I want to convert all files in the directory. please see the code below:
PS : I Already tried with PDFminer, and every other package and it does not work
import pandas as pd
import os
import PyPDF2
###Create empty txt files Named as pdf files ###########
path = '....\\PDF2Text\\PDF\\'
newpath = '....\\PDF2Text\\Text\\'
files = []
for r, d, f in os.walk(path):
for file in f:
if '.pdf' in file:
files.append(os.path.join(r, file))
for f in files:
ext = f.replace('.pdf','.txt')
extpath = ext.replace(path,newpath)
ft= open(extpath ,"w+")
ft.close()
print(extpath)
##Here we Convert a single pdf file to a txt file providing pdf path and empty txt path #####
import PyPDF2
def getPDFFileContentToTXT(pdfFile):
myPDFFile = PyPDF2.PdfFileReader(pdfFile)
with open('....\\PDF2Text\\Text\\blabla.txt', 'w') as pdf_output:
for page in range (myPDFFile.getNumPages()):
data = myPDFFile.getPage(page).extractText()
pdf_output.write(data)
with open('.....\\PDF2Text\\Text\\blabla.txt', 'r') as myPDFContent:
return myPDFContent.read().replace('\n',' ')
pdfFileContent = getPDFFileContentToTXT('.....\\PDF2Text\\PDF\\blabla.pdf')

import pandas as pd
import os
import PyPDF2
#Create empty txt files Named as pdf files
path = 'C:\\PDF2Text\\PDF\\'
newpath = 'C:\\PDF2Text\\Text\\'
# r=root, d=directories, f = files
files = []
for r, d, f in os.walk(path):
for file in f:
if '.pdf' in file:
files.append(os.path.join(r, file))
for f in files:
txt = f.replace('.pdf','.txt')
txtpath = txt.replace(path,newpath)
print(f)
ft= open(txtpath ,"w+")
ft.close()
print(txtpath)
Vpath = f.replace('.pdf','')
#print(Vpath)
myPDFFile = PyPDF2.PdfFileReader(f)
with open(txtpath, 'w') as pdf_output: #, encoding="utf-8"
for page in range (myPDFFile.getNumPages()):
data = myPDFFile.getPage(page).extractText()
pdf_output.write(data)
with open(txtpath, 'r') as myPDFContent:
myPDFContent.read().replace('\n',' ')

Have you tried Tika? Just do a pip install tika (also need to have Java 7+ installed on your system) and maybe this is the piece of code you want:
import os
from tika import parser
def read_pdf(pdf_file):
text = parser.from_file(pdf_file)['content']
return text.encode('utf-8')
def pdf_to_txt(folder_with_pdf, dest_folder):
"""
folder_with_pdf: path to your pdf's
dest_folder: path where you want .txt files saved
"""
pdf_files = []
for root, dirs, files in os.walk(folder_with_pdf):
for f in files:
if '.pdf' in f:
pdf_files.append(os.path.join(root, f))
#print(pdf_files)
for file_ in pdf_files:
text_file = os.path.splitext(os.path.basename(file_))[0]+'.txt'
with open(os.path.join(dest_folder,text_file), 'wb') as text_f:
text_f.write(read_pdf(file_))
return None
pdf_to_txt('./pdf_folder', './txt_folder') #you should see .txt files being populated in ./txt_folder
Aside: If pdf files in sub-directories of ./pdf_folder happens to have the same name (but different content) by any chance, then you will lose one (or more) .txt files.

Related

How do I insert a blank page between files using PyPDF2 pdfmerger

I am using this script to merge bill files so I can print 1 file. The bills are 3 pages, so I need to insert a blank page after each file so the first page of the next bill doesn't print on the back of the previous bill. How can I insert a blank page after each iteration of the loop for each bill file?
# If the file errors with "no module PyPDF2" then from command line, run pip install PyPDF2
import os
from os import listdir,mkdir,startfile
from os.path import isfile, join,exists
from PyPDF2 import PdfFileMerger
#Input file path and print the pdf files in that path
path = input("Enter the folder location: ")
pdffiles = [f for f in listdir(path) if isfile(join(path, f)) and '.pdf' in f]
print('\nList of PDF Files:\n')
for file in pdffiles:
print(file)
#Input the name of the result file
resultFile = input("\nEnter the name of the result file : ")
if '.pdf' not in resultFile:
resultFile += '.pdf'
#Append the pdf files
merger = PdfFileMerger()
for pdf in pdffiles:
merger.append(path+'\\'+pdf)
# The line below hopefully will add a blank page between
merger.addBlankPage(w,h)
#If the Output directory does not exist then create one
if not exists(path+'\\Output'):
mkdir(path+'\\Output')
#Write the merged result file to the Output directory
merger.write(path+'\\Output\\'+resultFile)
merger.close()
#Launch the result file
print('\n'+resultFile,'Successfully created!!! at ',path+'\\Output\\')
startfile(path+'\\Output\\'+resultFile)
First add blank page to end, then merge them.
import os
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
def add_blank_to_end(files: list) -> list:
names = []
for f in files:
pdf_in = open(f, 'rb')
pdf_file = PdfFileReader(pdf_in)
output = PdfFileWriter()
output.appendPagesFromReader(pdf_file)
output.addBlankPage()
names.append(f'b{f}')
outputStream = open(f'b{f}', 'wb')
output.write(outputStream)
return names
def merge_pdfs(files: list):
merger = PdfFileMerger()
for f in files:
merger.append(f)
merger.write("document-output.pdf")
files = ['file1.pdf', 'file2.pdf']
with_blank = add_blank_to_end(files)
merge_pdfs(with_blank)
# delete extra files
for i in with_blank:
os.remove(i)

How to read all pdf files in a directory and convert to text file using tesseract python 3?

How to read all pdf files in a directory and convert to text file using tesseract python 3?
The below code is for reading one pdf file and convert to text file.
But i want to read all pdf files in a directory and convert to text file using tesseract python 3
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
pdf_filename = "pdffile_name.pdf"
txt_filename = "text_file_created.txt"
def tesseract(pdf_filename,txt_filename):
PDF_file = pdf_filename
pages = convert_from_path(PDF_file, 500)
image_counter = 1
for page in pages:
pdf_filename = "page_"+str(image_counter)+".jpg"
page.save(pdf_filename, 'JPEG')
image_counter = image_counter + 1
filelimit = image_counter-1
outfile = txt_filename
f = open(outfile, "a",encoding = "utf-8")
for i in range(1, filelimit + 1):
pdf_filename = "page_"+str(i)+".jpg"
text = str(((pytesseract.image_to_string(Image.open(pdf_filename)))))
text = text.replace('-\n', '')
f.write(text)
f.close()
f1 = open(outfile, "r",encoding = "utf-8")
text_list = f1.readlines()
return text_list
tesseract(pdf_filename,txt_filename)`enter code here`
i have code for reading pdf files in a directory but i dont know to combine this code with above code
def readfiles():
os.chdir(path)
pdfs = []
for file_list in glob.glob("*.pdf"):
print(file_list)
pdfs.append(file_list)
readfiles()
Simply convert the variable pdf_filename to a list using this code snippet:
import glob
pdf_filename = [f for f in glob.glob("your_preferred_path/*.pdf")]
which will get you all the pdf files you want and store it into a list.
Or simply use any of the methods posted here:
How do I list all files of a directory?
Once you do that, you now have a list of pdf files.
Now iterate over the list of pdfs, one at a time, which will give you a list of test files.
You can use it something like this code snippet:
for one_pdf in pdf_filename:
#* your code to convert the files *#
Hope this helps.

Copy the specific files listed in text file from source folder to Destination folder

I have a list of file names in a text file with different format. I have multiple files found in the Source folder , I like to search files in the source folder with respective file name in text file copy those file and paste it to destination folder.
Example :
Text File : contains only limited (Selected files)
C:/.../abc.doc::1
C:/.../def.doc::1
c:/.../ghu.doc::1
c:/.../zzz.doc::1
Source Folder :
C:/.../abc.doc
C./.../12a.doc
C:/.../def.doc
c:/.../ghu.doc
c:/.../zzz.doc
Destination Folder :
C:/.../abc.doc
C:/.../def.doc
c:/.../ghu.doc
c:/.../zzz.doc
I am new to python , I tried my level best, need some valuable input to finish my home work
Step1: I like to select the text file
Step2: Slice the line only the file name (C:/.../abc.doc::1) to file name(abc)
Step3: Search the file name in the source folder
Step4: Copy and paste it to destination folder.
Code :
import os
from tkinter import filedialog
from tkinter import *
root = Tk()
#FolderA = os.path.normpath(filedialog.askdirectory(initialdir="/", title="Select png source path")) + "\\"
text_file_list = os.path.normpath(filedialog.askopenfilename(initialdir = "/", title="Select Rating text or csv file", filetypes = (("text files","*.txt"), ("all files","*.*"))))
FolderB = os.path.normpath(filedialog.askdirectory(initialdir="/", title="Select png source path")) + "\\"
print (FolderA)
print (FolderB)
os.chdir(text_file_list)
namelist = list()
for f in os.listdir():
file_name,file_ext = os.path.splitext(f)
namelist.append(file_name)
os.chdir(FolderB)
for findex, f in enumerate(os.listdir()):
t = f
strs.startswith('py') and strs.endswith("27")
file_name,file_ext = os.path.splitext(f)
os.rename(f, namelist[findex] + file_ext)
print(file_name)
Copy from comment:
with open(text_file_list, "r") as ins:
array = []
for line in ins:
array.append(line)
print(line)
m = re.search(r"(?<=tinted_combined).*?(?=.jpg::1)", your_text).group(0)
if m:
found = m.group(1)
print(found)
Try this code. Hope it works.
import os
import shutils
files = [os.path.join(SOURCE_PATH, f) for f in os.listdir(SOURCE_PATH)
if os.path.isfile(os.path.join(SOURCE_PATH, f))]
for file in files:
shutil.move(file, DESTINATION_PATH)

How to search through both zipped and unzipped folders for a specific line

I'm trying to implement a Python script that takes a folder from the user (can be zipped or unzipped), and search through all the files in the folder to output the specific lines that my regular expression matches. My code below works for regular unzipped folders, but I can't figure out how to do the same with zipped folders that are inputted to function. Below are my code, thanks in advance!
def myFunction(folder_name):
path = folder_name
for (path, subdirs, files) in os.walk(path):
files = [f for f in os.listdir(path) if f.endswith('.txt') or f.endswith('.log') or f.endswith('-release') or f.endswith('.out') or f.endswith('messages') or f.endswith('.zip')] # Specify here the format of files you hope to search from (ex: ".txt" or ".log")
files.sort() # file is sorted list
files = [os.path.join(path, name) for name in files] # Joins the path and the name, so the files can be opened and scanned by the open() function
# The following for loop searches all files with the selected format
for filename in files:
#print('start parsing... ' + str(datetime.datetime.now()))
matched_line = []
try:
with open(filename, 'r', encoding = 'utf-8') as f:
f = f.readlines()
except:
with open(filename, 'r') as f:
f = f.readlines()
# print('Finished parsing... ' + str(datetime.datetime.now()))
for line in f:
#0strip out \x00 from read content, in case it's encoded differently
line = line.replace('\x00', '')
RE2 = r'^Version: \d.+\d.+\d.\w\d.+'
RE3 = r'^.+version.(\d+.\d+.\d+.\d+)'
pattern2 = re.compile('('+RE2+'|'+RE3+')', re.IGNORECASE)
for match2 in pattern2.finditer(line):
matched_line.append(line)
print(line)
#Calling the function to use it
myFunction(r"SampleZippedFolder.zip")
The try and except block of my code was my attempt to open the zipped folder and read it. I'm still not very clear with how to open the zipped folder or how it works. Please let me know how I can modify my code to make it work, much appreciated!
One possibility is first determine what object type folder_name is using zipfile and os.isdir() and whichever one succeeds, get the list of files and proceed. Maybe something like this:
import zipfile, os, re
def myFunction(folder_name):
files = None # nothing yet
path = folder_name
if zipfile.is_zipfile(path):
print('ZipFile: {}'.format(path))
f = zipfile.ZipFile(path)
files = f.namelist()
# for name in f.namelist(): # debugging
# print('file: {}'.format(name))
elif os.path.isdir(path):
print('Folder: {}'.format(path))
files = os.listdir(path)
# for name in os.listdir(path): # debugging
# print('file: {}'.format(name))
# should now have a list of files
# proceed processing the files
for filename in files:
...

how to get list of files from a directory into text file using python

This question is how to get list of files from a directory into text file using python.
Result in the text file should exactly be like this:
E:\AA\a.jpg
E:\AA\b.jpg
...
How to correct the code below:
WD = "E:\\AA"
import glob
files = glob.glob ('*.jpg')
with open ('infiles.txt', 'w') as in_files:
in_files.write(files +'\n')
glob.glob() returns a list. You have to iterate through it.
WD = "E:\\AA"
import glob
files = glob.glob ('*.jpg')
with open ('infiles.txt', 'w') as in_files:
for eachfile in files: in_files.write(eachfile+'\n')
Input directory path : WD = "E://AA"
You can assign specific file extention that you needed eg: path = WD+'/*.jpg',
if you need all file list then give '' eg: path = WD+'/'
import glob
w_dir = WD + "/*.jpg"
with open("infiles.txt","wb")as fp:
for path in [filepath for filepath in glob.glob(w_dir)]:
fp.write(path+"\n")
Without path, glob.glob returns list of filename (No directory part). To get full path you need to call os.path.abspath(filename) / os.path.realpath(filename) / os.path.join(WD, filename)
>>> glob.glob('*.png')
['gnome-html.png', 'gnome-windows.png', 'gnome-set-time.png', ...]
>>> os.path.abspath('gnome-html.png')
'/usr/share/pixmaps/gnome-html.png'
With path, glob.glob return list of filename with directory part.
>>> glob.glob('/usr/share/pixmaps/*.png')
['/usr/share/pixmaps/gnome-html.png', '/usr/share/pixmaps/gnome-windows.png', '/usr/share/pixmaps/gnome-set-time.png', ...]
import glob
import os
WD = r'E:\AA'
files = glob.glob(os.path.join(WD, '*.jpg'))
with open('infiles.txt', 'w') as in_files:
in_files.writelines(fn + '\n' for fn in files)
or
import glob
import os
WD = r'E:\AA'
os.chdir(WD)
files = glob.glob('*.jpg')
with open('infiles.txt', 'w') as in_files:
in_files.writelines(os.path.join(WD, fn) + '\n' for fn in files)
Here is a two line simple solution:
import os
filee = open('all_names.txt','w')
given_dir = 'the_dierctory'
[filee.write(os.path.join(os.path.dirname(os.path.abspath(__file__)),given_dir,i)+'\n') for i in os.listdir(given_dir)]
where given_dir is the directory name. The output is a text file (all_names.txt) where each line in the file is the full path to all files and directories in the given_dir.

Categories