How do I insert a blank page between files using PyPDF2 pdfmerger - python

I am using this script to merge bill files so I can print 1 file. The bills are 3 pages, so I need to insert a blank page after each file so the first page of the next bill doesn't print on the back of the previous bill. How can I insert a blank page after each iteration of the loop for each bill file?
# If the file errors with "no module PyPDF2" then from command line, run pip install PyPDF2
import os
from os import listdir,mkdir,startfile
from os.path import isfile, join,exists
from PyPDF2 import PdfFileMerger
#Input file path and print the pdf files in that path
path = input("Enter the folder location: ")
pdffiles = [f for f in listdir(path) if isfile(join(path, f)) and '.pdf' in f]
print('\nList of PDF Files:\n')
for file in pdffiles:
print(file)
#Input the name of the result file
resultFile = input("\nEnter the name of the result file : ")
if '.pdf' not in resultFile:
resultFile += '.pdf'
#Append the pdf files
merger = PdfFileMerger()
for pdf in pdffiles:
merger.append(path+'\\'+pdf)
# The line below hopefully will add a blank page between
merger.addBlankPage(w,h)
#If the Output directory does not exist then create one
if not exists(path+'\\Output'):
mkdir(path+'\\Output')
#Write the merged result file to the Output directory
merger.write(path+'\\Output\\'+resultFile)
merger.close()
#Launch the result file
print('\n'+resultFile,'Successfully created!!! at ',path+'\\Output\\')
startfile(path+'\\Output\\'+resultFile)

First add blank page to end, then merge them.
import os
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
def add_blank_to_end(files: list) -> list:
names = []
for f in files:
pdf_in = open(f, 'rb')
pdf_file = PdfFileReader(pdf_in)
output = PdfFileWriter()
output.appendPagesFromReader(pdf_file)
output.addBlankPage()
names.append(f'b{f}')
outputStream = open(f'b{f}', 'wb')
output.write(outputStream)
return names
def merge_pdfs(files: list):
merger = PdfFileMerger()
for f in files:
merger.append(f)
merger.write("document-output.pdf")
files = ['file1.pdf', 'file2.pdf']
with_blank = add_blank_to_end(files)
merge_pdfs(with_blank)
# delete extra files
for i in with_blank:
os.remove(i)

Related

How to I merge multiple .txt files that are in a .zip file into only one .txt file in Python?

I'm trying to merge multiple .txt files that are in a .zip file into only one .txt file in Python.
My code is the following:
firstfile = Path(r'C:\Users\Viniz\Downloads\devkmbe-5511001_05-12-2022_00_20_09.zip\AudioCaptureMemoryUsage_01_12_2022.txt')
secondfile = Path(r'C:\Users\Viniz\Downloads\devkmbe-5511001_05-12-2022_00_20_09.zip\AudioMatchingMemoryUsage_01_12_2022.txt')
newfile = input("Enter the name of the new file: ")
print()
print("The merged content of the 2 files will be in", newfile)
with open(newfile, "wb") as wfd:
for f in [firstfile, secondfile]:
with open(f, "rb") as fd:
shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
print("\nThe content is merged successfully.!")
print("Do you want to view it ? (y / n): ")
check = input()
if check == 'n':
exit()
else:
print()
c = open(newfile, "r")
print(c.read())
c.close()
Thanks.
I tried to merge them in only one file but it doesn't worked.
To merge the files, you'll need to first extract the files from the zip file, then merge them, and then write the merged content to a new file. Here is an example of how you can do this using the zipfile module.
Update: If the .txt files are located inside a folder within the zip file, you'll need to include the folder name in the path when opening the files.
import zipfile
zip_file = r'C:\Users\Viniz\Downloads\devkmbe-5511001_05-12-2022_00_20_09.zip'
folder_name = 'myfolder'
first_file = folder_name + '/AudioCaptureMemoryUsage_01_12_2022.txt'
second_file = folder_name + '/AudioMatchingMemoryUsage_01_12_2022.txt'
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
with zip_ref.open(first_file) as f1, zip_ref.open(second_file) as f2:
first_content = f1.read()
second_content = f2.read()
# Concatenate the two files
merged_content = first_content + second_content
# Write the merged content to a new file
new_file = input("Enter the name of the new file: ")
with open(new_file, 'wb') as new_f:
new_f.write(merged_content)
print("The content is merged successfully.!")
print("Do you want to view it ? (y / n): ")
check = input()
if check == 'n':
exit()
else:
print()
c = open(new_file, "r")
print(c.read())
c.close()
Make sure to replace 'myfolder' with the actual name of the folder containing the .txt files in your zip file.
For multiple files..
import zipfile
zip_file = r'C:\Users\Viniz\Downloads\devkmbe-5511001_05-12-2022_00_20_09.zip'
folder_name = 'myfolder'
file_names = ['AudioCaptureMemoryUsage_01_12_2022.txt',
'AudioMatchingMemoryUsage_01_12_2022.txt',
'File3.txt',
'File4.txt',
...
'File29.txt']
merged_content = b'' # Initialize an empty bytes object
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
for file_name in file_names:
with zip_ref.open(folder_name + '/' + file_name) as f:
merged_content += f.read()
# Write the merged content to a new file
new_file = input("Enter the name of the new file: ")
with open(new_file, 'wb') as new_f:
new_f.write(merged_content)
print("The content is merged successfully.!")
print("Do you want to view it ? (y / n): ")
check = input()
if check == 'n':
exit()
else:
print()
c = open(new_file, "r")
print(c.read())
c.close()
import os
import zipfile
import shutil
def extract_txt_files(zip_path, temp_folder):
"""Extracts all the .txt files from the given zip file to the given temp folder"""
with zipfile.ZipFile(zip_path, "r") as zip_file:
i = len([name for name in os.listdir(temp_folder) if name.endswith(".txt")]) + 1
for member in zip_file.infolist():
if member.filename.endswith(".txt"):
zip_file.extract(member, temp_folder)
os.rename(os.path.join(temp_folder, member.filename), os.path.join(temp_folder, f"{i}.txt"))
i += 1
def merge_txt_files(temp_folder):
"""Merges all the .txt files from the given temp folder into a single file called "merged.txt" """
with open("merged.txt", "w") as outfile:
for filename in os.listdir(temp_folder):
if filename.endswith(".txt"):
with open(os.path.join(temp_folder, filename)) as infile:
outfile.write(infile.read())
def delete_temp_folder(temp_folder):
"""Deletes the given temp folder"""
os.rmdir(temp_folder)
# paths to the zip files
zip1_path = "zip1.zip"
zip2_path = "zip2.zip"
# create a temporary folder to extract the .txt files
temp_folder = "temp"
os.makedirs(temp_folder, exist_ok=True)
# extract the .txt files from the zip files
extract_txt_files(zip1_path, temp_folder)
extract_txt_files(zip2_path, temp_folder)
# merge the .txt files
merge_txt_files(temp_folder)
# delete the temporary folder
shutil.rmtree(temp_folder)
print("The content is merged successfully.!")
print("Do you want to view it ? (y / n): ")
check = input()
if check == 'n':
exit()
else:
print()
c = open(new_file, "r")
print(c.read())
c.close()
The zip path in the script is relative, which means that the zip files "zip1.zip" and "zip2.zip" are expected to be in the same directory as the script.
If the zip files contain multiple .txt files, the script will extract all of them to the temporary folder.
the script renames the extracted .txt files with an incremental index and the .txt extension to ensure that all the extracted files will have unique names and not overwritten.This will maintain the order of txt files as they were in zip file.

Converting multiple PDF files into txt in Python?

import os
from PyPDF2 import PdfFileReader, PdfFileWriter
for filename in os.listdir("C:/117"):
path = os.path.join("C:/117/", filename)
print(path)
with open('file.txt', 'w', encoding='utf-8') as file:
for page_num in range(PdfFileReader(path).numPages):
print('Page: {0}'.format(page_num))
pageObj = PdfFileReader(path).getPage(page_num)
try:
txt = pageObj.extractText()
except:
pass
else:
file.write('Page{0}\n'.format(page_num+1))
file.write(txt)
file.close()
I am converting hundreds of pdf files into txt. However, with this code, all the PDFs are merged into a single txt file. Is there a way to create separate txt file for each PDF I convert? Thanks
from pathlib import Path
from pypdf import PdfReader
def convert_pdf_to_text(path: Path) -> str:
text = ""
for page in PdfReader(path).pages:
text += page.extract_text() + "\n"
return text
for path in Path("Documents").glob("**/*.pdf"):
text = convert_pdf_to_text(path)
txt_path = path.parent / (".".join(path.name.split(".")[:-1]) + ".txt")
if txt_path.exists():
print(f"Skip {txt_path} as it already exists")
continue
with open(txt_path, "wt") as fp:
fp.write(text)

How to split only first page in each pdf file from directory that has muliple files?

I have created two directories with input and output names. Input directory have more than one PDF file and each file has multiple pages. I am trying to get first page of every PDF file and that should be save on output directory.
Below is the code i am tryingimport os
from PyPDF2 import PdfFileWriter, PdfFileReader
in_path = "D:/data/input/"
out_path = "D:/data/output/"
output = PdfFileWriter()
pages_to_keep = [0]
in_files = (f for f in os.listdir(in_path) if os.path.isfile(f) and f.endswith('.pdf'))
for file in in_files:
po = open(file, 'rb')
rd = PdfFileReader(po, strict=False)
for i in pages_to_keep:
page = rd.getPage(i)
output.addPage(page)
with open(out_path+str(file), 'wb') as f:
output.write(f):
The problem is: when i executing the script that is saving output file1 having 1 page, and output file2 having 2 pages, third file having three pages. But i need only first page from all PDF files.
How to solve this.
You need to reset output for each file:
for file in in_files:
output = PdfFileWriter() # clear output
po = open(file, 'rb')
rd = PdfFileReader(po, strict=False)
for i in pages_to_keep:
page = rd.getPage(i)
output.addPage(page)
with open(out_path+str(file), 'wb') as f:
output.write(f):

Convert a PDF files to TXT files

I need a last touch from an expert !! I want to convert all pdf files in a directory to txt files. I wrote a code to create empty txt files having the same name as pdf files and a code to convert a single pdf to txt but I want to convert all files in the directory. please see the code below:
PS : I Already tried with PDFminer, and every other package and it does not work
import pandas as pd
import os
import PyPDF2
###Create empty txt files Named as pdf files ###########
path = '....\\PDF2Text\\PDF\\'
newpath = '....\\PDF2Text\\Text\\'
files = []
for r, d, f in os.walk(path):
for file in f:
if '.pdf' in file:
files.append(os.path.join(r, file))
for f in files:
ext = f.replace('.pdf','.txt')
extpath = ext.replace(path,newpath)
ft= open(extpath ,"w+")
ft.close()
print(extpath)
##Here we Convert a single pdf file to a txt file providing pdf path and empty txt path #####
import PyPDF2
def getPDFFileContentToTXT(pdfFile):
myPDFFile = PyPDF2.PdfFileReader(pdfFile)
with open('....\\PDF2Text\\Text\\blabla.txt', 'w') as pdf_output:
for page in range (myPDFFile.getNumPages()):
data = myPDFFile.getPage(page).extractText()
pdf_output.write(data)
with open('.....\\PDF2Text\\Text\\blabla.txt', 'r') as myPDFContent:
return myPDFContent.read().replace('\n',' ')
pdfFileContent = getPDFFileContentToTXT('.....\\PDF2Text\\PDF\\blabla.pdf')
import pandas as pd
import os
import PyPDF2
#Create empty txt files Named as pdf files
path = 'C:\\PDF2Text\\PDF\\'
newpath = 'C:\\PDF2Text\\Text\\'
# r=root, d=directories, f = files
files = []
for r, d, f in os.walk(path):
for file in f:
if '.pdf' in file:
files.append(os.path.join(r, file))
for f in files:
txt = f.replace('.pdf','.txt')
txtpath = txt.replace(path,newpath)
print(f)
ft= open(txtpath ,"w+")
ft.close()
print(txtpath)
Vpath = f.replace('.pdf','')
#print(Vpath)
myPDFFile = PyPDF2.PdfFileReader(f)
with open(txtpath, 'w') as pdf_output: #, encoding="utf-8"
for page in range (myPDFFile.getNumPages()):
data = myPDFFile.getPage(page).extractText()
pdf_output.write(data)
with open(txtpath, 'r') as myPDFContent:
myPDFContent.read().replace('\n',' ')
Have you tried Tika? Just do a pip install tika (also need to have Java 7+ installed on your system) and maybe this is the piece of code you want:
import os
from tika import parser
def read_pdf(pdf_file):
text = parser.from_file(pdf_file)['content']
return text.encode('utf-8')
def pdf_to_txt(folder_with_pdf, dest_folder):
"""
folder_with_pdf: path to your pdf's
dest_folder: path where you want .txt files saved
"""
pdf_files = []
for root, dirs, files in os.walk(folder_with_pdf):
for f in files:
if '.pdf' in f:
pdf_files.append(os.path.join(root, f))
#print(pdf_files)
for file_ in pdf_files:
text_file = os.path.splitext(os.path.basename(file_))[0]+'.txt'
with open(os.path.join(dest_folder,text_file), 'wb') as text_f:
text_f.write(read_pdf(file_))
return None
pdf_to_txt('./pdf_folder', './txt_folder') #you should see .txt files being populated in ./txt_folder
Aside: If pdf files in sub-directories of ./pdf_folder happens to have the same name (but different content) by any chance, then you will lose one (or more) .txt files.

Copy the specific files listed in text file from source folder to Destination folder

I have a list of file names in a text file with different format. I have multiple files found in the Source folder , I like to search files in the source folder with respective file name in text file copy those file and paste it to destination folder.
Example :
Text File : contains only limited (Selected files)
C:/.../abc.doc::1
C:/.../def.doc::1
c:/.../ghu.doc::1
c:/.../zzz.doc::1
Source Folder :
C:/.../abc.doc
C./.../12a.doc
C:/.../def.doc
c:/.../ghu.doc
c:/.../zzz.doc
Destination Folder :
C:/.../abc.doc
C:/.../def.doc
c:/.../ghu.doc
c:/.../zzz.doc
I am new to python , I tried my level best, need some valuable input to finish my home work
Step1: I like to select the text file
Step2: Slice the line only the file name (C:/.../abc.doc::1) to file name(abc)
Step3: Search the file name in the source folder
Step4: Copy and paste it to destination folder.
Code :
import os
from tkinter import filedialog
from tkinter import *
root = Tk()
#FolderA = os.path.normpath(filedialog.askdirectory(initialdir="/", title="Select png source path")) + "\\"
text_file_list = os.path.normpath(filedialog.askopenfilename(initialdir = "/", title="Select Rating text or csv file", filetypes = (("text files","*.txt"), ("all files","*.*"))))
FolderB = os.path.normpath(filedialog.askdirectory(initialdir="/", title="Select png source path")) + "\\"
print (FolderA)
print (FolderB)
os.chdir(text_file_list)
namelist = list()
for f in os.listdir():
file_name,file_ext = os.path.splitext(f)
namelist.append(file_name)
os.chdir(FolderB)
for findex, f in enumerate(os.listdir()):
t = f
strs.startswith('py') and strs.endswith("27")
file_name,file_ext = os.path.splitext(f)
os.rename(f, namelist[findex] + file_ext)
print(file_name)
Copy from comment:
with open(text_file_list, "r") as ins:
array = []
for line in ins:
array.append(line)
print(line)
m = re.search(r"(?<=tinted_combined).*?(?=.jpg::1)", your_text).group(0)
if m:
found = m.group(1)
print(found)
Try this code. Hope it works.
import os
import shutils
files = [os.path.join(SOURCE_PATH, f) for f in os.listdir(SOURCE_PATH)
if os.path.isfile(os.path.join(SOURCE_PATH, f))]
for file in files:
shutil.move(file, DESTINATION_PATH)

Categories