Split PDF function is only exporting the first page - python

After exporting a PDF file from an xlsx file, the split function only works for the first page.
As a matter of fact, I export 2 PDF files from 2 xlsx files. The first one works just fine, but the second one gets exported into a PDF file and split into the needed files, except it only processes the first page and then the process terminates.
Here is the function:
import os
from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter

def split_pdf(file_name, info_json, save_file_name=None, folder_name="Reports"):
    if save_file_name is None:
        save_file_name = file_name
    output_folder_path = '{}/'.format(folder_name)
    Path(output_folder_path).mkdir(parents=True, exist_ok=True)
    print("The '{}/' folder has been created".format(folder_name))
    pdf = PdfFileReader(os.path.join(os.getcwd(), file_name + ".pdf"))
    location = None
    for page_num in range(pdf.numPages):
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(pdf.getPage(page_num))
        page_content = pdf.getPage(page_num).extractText()
        list_of_names = page_content.encode('utf-8').decode('utf-8').split("\n")
        if location is None:
            for index, name in enumerate(list_of_names):
                if 'name' in name.lower():
                    location = index + 1
                    break
        try:
            Name = list_of_names[location].strip()
        except:
            continue
        if 'name' not in list_of_names[location - 1].lower().strip():
            continue
        if Name == '#N/A':
            continue
        try:
            file_location = "{}{}/{}/{}/".format(output_folder_path, info_json[Name]['Campus'], info_json[Name]['School'], Name)
        except:
            print("\t\tCouldn't find {}".format(Name))
            continue
        print("\t" + file_location + '{}.pdf'.format(save_file_name))
        Path(file_location).mkdir(parents=True, exist_ok=True)
        with open(file_location + '{}.pdf'.format(save_file_name), 'wb') as f:
            pdfWriter.write(f)
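A likely cause, offered as a hedged guess since no answer is attached here: location is computed once, from the first page, and then reused for every later page. If the 'name' line sits at a different index on a later page, list_of_names[location] either raises (swallowed by the bare except) or fails the 'name' check, so every page after the first is silently skipped via continue. A minimal sketch of a fix that re-detects the name row on each page (the helper name find_name_row is hypothetical):

def find_name_row(list_of_names):
    # Search this page for a line containing 'name' and return the
    # index of the line after it, or None if no such line exists.
    for index, name in enumerate(list_of_names):
        if 'name' in name.lower():
            return index + 1
    return None

Calling location = find_name_row(list_of_names) inside the page loop, and printing a message instead of silently continuing when it returns None, would make it visible exactly which pages are being skipped and why.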

Related

How to know the cause of assertion errors in Python?

I am writing a script to add custom properties to PDF files using PdfMerger() in PyPDF2. It works fine for almost all the files, but fails on a few, and the error occurs in some function deep inside PdfMerger. I don't understand what exactly is causing this error or how to rectify it. Here is the entire program - I'm not sure a snippet alone would be helpful.
import os
import pandas as pd
from PyPDF2 import PdfReader, PdfMerger

df = pd.read_excel('S:\\USERS\\VINTON\\F001A - Item Master (Stock and Cost)- 270001.xlsx')
folder_path = "U:\\BMP"
pdf_files = [os.path.splitext(f)[0] for f in os.listdir(folder_path) if f.endswith('.pdf')]
for EachFile in pdf_files:
    search_value = EachFile
    print(EachFile)
    search_result = df[df['Item Number 02'] == search_value]
    # Find the corresponding value in the "Name" column of the same w
    if not search_result.empty:
        print("Found in JDE")
        Revision = search_result['Rev'].values[0]
        Description = search_result['Item Description 01'].values[0]
        FileName = "U:\\BMP\\" + search_value + ".pdf"
        # Get the file from BMP Folder
        file_in = open(FileName, 'rb')
        pdf_reader = PdfReader(file_in)
        if pdf_reader.is_encrypted:
            print("Encrypted")
            continue
        metadata = pdf_reader.metadata
        # Adding entire existing file to the new file created
        pdf_merger = PdfMerger()
        pdf_merger.append(file_in)
        pdf_merger.add_metadata({
            '/Revision': Revision,
            '/Description': Description
        })
        file_out = open("S:\\USERS\\VINTON\\BMP-Rev\\" + search_value ".pdf", 'wb')
        pdf_merger.write(file_out)
        file_in.close()
        file_out.close()
print("All Done!!")
I cannot figure out how to overcome the assertion errors, because they are raised several layers below the code I actually wrote.
There is "+" sign missing in this line before ".pdf"
file_out = open("S:\USERS\VINTON\BMP-Rev\" + search_value ".pdf", 'wb')
try this:
file_out = open("S:\USERS\VINTON\BMP-Rev\" + search_value + ".pdf", 'wb')
hope it works
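More generally, building paths by string concatenation makes this kind of bug easy to reintroduce; a small sketch of the same line using os.path.join, which supplies the separators for you:

import os

# os.path.join inserts the path separator, so only the extension needs concatenating
file_out = open(os.path.join("S:\\USERS\\VINTON\\BMP-Rev", search_value + ".pdf"), 'wb')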
Use try and except statements when reading or merging PDF files, so that the exception message is printed when a file fails. It's always good practice to surface errors and exceptions when working with files or memory during development.
import os
import pandas as pd
from PyPDF2 import PdfReader, PdfMerger

df = pd.read_excel('S:\\USERS\\VINTON\\F001A - Item Master (Stock and Cost)- 270001.xlsx')
folder_path = "U:\\BMP"
pdf_files = [os.path.splitext(f)[0] for f in os.listdir(folder_path) if f.endswith('.pdf')]
for EachFile in pdf_files:
    search_value = EachFile
    print(EachFile)
    search_result = df[df['Item Number 02'] == search_value]
    # Find the corresponding value in the "Name" column of the same w
    if not search_result.empty:
        print("Found in JDE")
        Revision = search_result['Rev'].values[0]
        Description = search_result['Item Description 01'].values[0]
        FileName = "U:\\BMP\\" + search_value + ".pdf"
        # Get the file from BMP Folder
        file_in = open(FileName, 'rb')
        try:
            pdf_reader = PdfReader(file_in)
            if pdf_reader.is_encrypted:
                print("Encrypted")
                continue
            metadata = pdf_reader.metadata
            # Adding entire existing file to the new file created
            pdf_merger = PdfMerger()
            pdf_merger.append(file_in)
            pdf_merger.add_metadata({
                '/Revision': Revision,
                '/Description': Description
            })
        except Exception as e:
            print(e)
            continue  # skip this file; pdf_merger was not built
        file_out = open("S:\\USERS\\VINTON\\BMP-Rev\\" + search_value + ".pdf", 'wb')
        pdf_merger.write(file_out)
        file_in.close()
        file_out.close()
print("All Done!!")

TabError: inconsistent use of tabs and spaces in indentation when adding to a dictionary

I am trying to move selected images out of nested subdirectories. I am matching the SKU from an Excel file to the image name (which is also the SKU number). Any that match are then moved into a new folder.
My challenge: when I try to create a dictionary to store the full path of each file, I am faced with the following error message.
File "c:\printing\python\data_clean.py", line 56
fullpath_filelist = {file: os.path.join(root,dirs, file}
^
TabError: inconsistent use of tabs and spaces in indentation
#! python 3
# Create clean version of data file
import openpyxl, webbrowser, sys, re, os, shutil

print('Opening workbook')
#*********************
Main_Searchterm = 'Find'
Sub_Searchterm = 'Marine'
Data_path = 'C:\Printing\Python\data\datafile.xlsx'
Image_folder = 'C:\Printing\Python\data\images'
Sorted_folder = 'C:\Printing\Python\data\sorted'
#**********************

def find_category():
    wb = openpyxl.load_workbook(Data_path)
    sheet = wb['Sheet1']
    # This looks for the main search term and puts it into column 6
    for rowNum in range(2, sheet.max_row + 1):
        category = sheet['E' + str(rowNum)].value  # This controls which column to search from
        keywordRegex = re.compile(Main_Searchterm)
        mo = keywordRegex.search(category)
        try:
            if mo.group() == Main_Searchterm:
                sheet.cell(row=rowNum, column=6).value = Main_Searchterm  # This controls which column to add the new search term to
        except:
            pass
    # This looks for the sub search term and puts it into column 7
    for rowNum in range(2, sheet.max_row + 1):
        category = sheet['E' + str(rowNum)].value  # This controls which column to search from
        keywordRegex = re.compile(Sub_Searchterm)
        mo = keywordRegex.search(category)
        try:
            if mo.group() == Sub_Searchterm:
                sheet.cell(row=rowNum, column=7).value = Sub_Searchterm  # This controls which column to add the new search term to
        except:
            pass
    wb.save(Data_path)

    wb = openpyxl.load_workbook(Data_path)
    sheet = wb['Sheet1']
    filelist = []  # List of all files in directory and subdirectory
    fullpath_filelist = {}
    for root, dirs, files in os.walk(Image_folder):
        for file in files:
            # append the file name to the list
            filelist.append(file)
            fullpath_filelist = {file: os.path.join(root,dirs, file}
    for filename in filelist:
        for rowNum in range(2, sheet.max_row + 1):
            # for rowNum in range(2, 3):
            image = sheet['H' + str(rowNum)].value  # This controls which column to search from
            final_path = os.path.join(root, Main_Searchterm, Sub_Searchterm, filename)
            if str(image) == str(filename):
                shutil.move(filename, final_path)

find_category()
Depending on the IDE, Ctrl-F for '\t' and replace it with '    ' (4 spaces).
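If you would rather locate the offending lines than replace blindly, the standard library ships a checker for exactly this; it reports lines whose indentation mixes tabs and spaces ambiguously:

python -m tabnanny c:\printing\python\data_clean.py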

Get all PDF file names under the same folder and save in Excel according to PDF file name

I have PDF files in the same folder. How do I get all the PDF file names and save an Excel file named after each PDF?
This is what I have tried:
def get_files(pdf_path):
    import os
    os.chdir(pdf_path)
    files = os.listdir()
    files = [x for x in files if x.endswith(".pdf")]
    return files

files = get_files(pdf_path)
for i in files:
    save_as_excel(pdf_path, i)
As discussed on chat, this is the continuation of your previous question, which I answered. In the previous question I showed how you can extract text from a pdf file that contains multiple data entities. Now you want to extract the text, parse the content, and save the data as csv/xlsx for all pdf files present in the folder.
Please go through all the steps below; the only thing you need to change is the path to your pdf files, path_of_pdf_files.
The assumptions and logic remain the same as in my previous answer.
I have moved the data and methods into a class PdfExtractor.
Please follow the steps below to extract text from the pdfs and save it as xlsx.
Before moving ahead, install the packages pdfplumber and xlsxwriter.
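For example, from a terminal (assuming pip is available on your PATH):

pip install pdfplumber xlsxwriter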
Save the below code with filename PdfExtractor.py
import pdfplumber
import xlsxwriter
import re

# regex pattern for keys in line1 of data entity
my_regex_dict_line1 = {
    'Our Ref' : r'Our Ref :(.*?)Name',
    'Name' : r'Name:(.*?)Ref 1',
    'Ref 1' : r'Ref 1 :(.*?)Ref 2',
    'Ref 2' : r'Ref 2:(.*?)$'
}

# regex pattern for keys in line2 of data entity
my_regex_dict_line2 = {
    'Amount' : r'Amount:(.*?)Total Paid',
    'Total Paid' : r'Total Paid:(.*?)Balance',
    'Balance' : r'Balance:(.*?)Date of A/C',
    'Date of A/C' : r'Date of A/C:(.*?)Date Received',
    'Date Received' : r'Date Received:(.*?)$'
}

# regex pattern for keys in line3 of data entity
my_regex_dict_line3 = {
    'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
    'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
    'A/C Status': r'A/C\s+Status:(.*?)Collector',
    'Collector' : r'Collector :(.*?)$'
}

class PdfExtractor:
    data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'

    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.json_data = {}
        self.pdf_text = ''

    def __preprocess_data(self, data):
        return [el.strip() for el in data.splitlines() if el.strip()]

    def __get_header_data(self, text):
        header_data_list = self.__preprocess_data(text)
        # third line in text of header contains Date Created field
        self.json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
        # fourth line in text contains Number of Pages, Client Code, Client Name
        self.json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
        # fifth line in text contains Client Code and ClientName
        self.json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
        self.json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()

    def __iterate_through_regex_and_populate_dictionaries(self, data_dict, regex_dict, text):
        ''' For the given regex_dict, this function iterates through each regex pattern and adds the key value to the data_dict dictionary '''
        for key, regex in regex_dict.items():
            matched_value = re.search(regex, text)
            if matched_value is not None:
                data_dict[key] = matched_value.group(1).strip()

    def __populate_date_notes(self, data_dict, text):
        ''' This function populates Date and Notes in the data chunk in the form of lists in the data_dict dictionary '''
        data_dict['Date'] = []
        data_dict['Notes'] = []
        iter = 4
        while iter < len(text):
            date_match = re.search(r'(\d{2}/\d{2}/\d{4})', text[iter])
            data_dict['Date'].append(date_match.group(1).strip())
            notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$', text[iter])
            data_dict['Notes'].append(notes_match.group(1).strip())
            iter += 1

    def get_pdf_text(self):
        data_index = 1
        with pdfplumber.open(self.pdf_path) as pdf:
            index = 0
            while index < len(pdf.pages):
                page = pdf.pages[index]
                self.pdf_text += '\n' + page.extract_text()
                index += 1
        split_on_data_entity = re.split(self.data_entity_sep_pattern, self.pdf_text.strip())
        # first item in the split_on_data_entity list will contain the header information
        self.__get_header_data(split_on_data_entity[0])
        while data_index < len(split_on_data_entity):
            data_entity = {}
            data_processed = self.__preprocess_data(split_on_data_entity[data_index])
            self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
            self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
            self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
            if len(data_processed) > 3 and data_processed[3] is not None and 'Date' in data_processed[3] and 'Notes' in data_processed[3]:
                self.__populate_date_notes(data_entity, data_processed)
            self.json_data['data_entity' + str(data_index)] = data_entity
            data_index += 1
        return self.json_data

    def save_as_xlsx(self, file_name):
        if not self.json_data:
            print("Data was not read from PDF")
            return
        workbook = xlsxwriter.Workbook(file_name)
        worksheet = workbook.add_worksheet("Sheet 1")
        row = 0
        col = 0
        # write column headers
        columns = ['Account History Report', 'All Notes'] + [key for key in self.json_data.keys() if 'data_entity' not in key] + list(self.json_data['data_entity1'].keys())
        worksheet.write_row(row, col, tuple(columns))
        row += 1
        column_index_map = {}
        for index, col in enumerate(columns):
            column_index_map[col] = index
        # write the header
        worksheet.write(row, column_index_map['Date Created'], self.json_data['Date Created'])
        worksheet.write(row, column_index_map['Number of Pages'], self.json_data['Number of Pages'])
        worksheet.write(row, column_index_map['Client Code'], self.json_data['Client Code'])
        worksheet.write(row, column_index_map['ClientName'], self.json_data['ClientName'])
        data_entity_index = 1
        # iterate through each data entity and for each key insert the values in the sheet
        while True:
            data_entity_key = 'data_entity' + str(data_entity_index)
            row_size = 1
            if self.json_data.get(data_entity_key) is not None:
                for key, value in self.json_data.get(data_entity_key).items():
                    if type(value) == list:
                        worksheet.write_column(row, column_index_map[key], tuple(value))
                        row_size = len(value)
                    else:
                        worksheet.write(row, column_index_map[key], value)
            else:
                break
            data_entity_index += 1
            row += row_size
        workbook.close()
        print(file_name + " saved successfully")
Execute the code below: it reads all the pdf files inside the folder path_of_pdf_files and saves the data in an xlsx file in the same directory. Also note that the code should be executed in the same folder where you saved the file PdfExtractor.py.
import os
from PdfExtractor import PdfExtractor

path_of_pdf_files = r'C:\Users\hpoddar\Desktop\Temp'  # Directory path for your pdf files
files = os.listdir(path_of_pdf_files)
for file in files:
    if not file.endswith(".pdf"):
        continue
    filename = os.path.splitext(file)[0]
    pdf_obj = PdfExtractor(os.path.join(path_of_pdf_files, file))
    pdf_text = pdf_obj.get_pdf_text()
    pdf_obj.save_as_xlsx(os.path.join(path_of_pdf_files, filename + '.xlsx'))
Output :
C:\Users\hpoddar\Desktop\Temp\sample.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample2.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample3.xlsx saved successfully
Let's say you have the following pdf files in the directory: sample.pdf, sample2.pdf, sample3.pdf. The xlsx files will be created in the same folder with the corresponding filenames sample.xlsx, sample2.xlsx, sample3.xlsx.
Let me know if you have any doubts in the above code.
If you mean saving each filename as an empty Excel file, try this:
import os
import openpyxl

pdf_path = '.'

def get_files(pdf_path):
    os.chdir(pdf_path)
    files = os.listdir()
    files = [x for x in files if x.endswith(".pdf")]
    return files

files = get_files(pdf_path)
# create an empty workbook (excel file)
wb = openpyxl.workbook.Workbook()
for i in files:
    output_path = os.path.join(pdf_path, i).replace('.pdf', '.xlsx')
    # save as an excel file with the pdf's filename
    wb.save(output_path)
    print(output_path)

"Delete files based on Invoice number, date and verison number"

I want to delete multiple files from a certain directory based on file name, date and version number using Python. PS: the file creation date can't be taken into account.
I referred to a Stack Overflow post, but my file names and version numbers are different. How do I compare the date and version number to keep only the latest file?
import os

source = r'C:\Users\XMLFiles'
file_names = os.listdir(source)
latest_files = {}
for file_name in file_names:
    name_parts = file_name.split("_")
    date_stamp = name_parts[2], name_parts[3].split(".")[0]
    if date_stamp not in latest_files or file_name > latest_files[date_stamp]:
        latest_files[date_stamp] = file_name
print(latest_files)
keep_files = latest_files.values()
for file_name in file_names:
    if file_name in keep_files:
        continue
    os.remove(os.path.join(source, file_name))
##################
List of files to process
=============================
Invoice_456879_20180404_2510.xml
Invoice_123876_20171027_17.xml
Invoice_123876_20180404_2513.xml
Invoice_832765_20170309_2.xml
Invoice_832765_20170313_0.xml
Invoice_832765_20170323_5.xml
Invoice_832765_20170330_2.xml
Invoice_832765_20170613_3.xml
Invoice_832765_20171206_18.xml
Invoice_832765_20171206_30.xml
Invoice_832765_20171206_36.xml
Invoice_832765_20180404_3066.xml
Invoice_832765_20180405_9770.xml
Invoice_832765_20180405_9779.xml
Invoice_698325_20170308_0.xml
Invoice_698325_20170309_3.xml
Invoice_698325_20170323_4.xml
Invoice_698325_20170330_5.xml
Invoice_698325_20170613_4.xml
Invoice_698325_20171206_8.xml
Invoice_698325_20171206_24.xml
Invoice_698325_20171206_46.xml
Invoice_698325_20180404_3067.xml
Invoice_698325_20180405_9771.xml
===========================================================
Expected Output
Invoice_456879_20180404_2510.xml
Invoice_123876_20180404_2513.xml
Invoice_832765_20180405_9779.xml
Invoice_698325_20180405_9771.xml
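The sketch below assumes the file names always follow the Invoice_<invoice>_<yyyymmdd>_<version>.xml pattern shown above. It keys the dictionary by invoice number and compares the date first and the version (parsed as an int) second, so that on the same date version 36 correctly beats version 8: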
latest_files = {}
for file_name in file_names:
    name_parts = file_name.split('_')
    invoice = name_parts[1]
    ymd = name_parts[2]
    version = int(name_parts[3].split('.')[0])
    if invoice not in latest_files:
        latest_files[invoice] = (ymd, version, file_name)
        continue
    latest_ymd, latest_version = latest_files[invoice][:2]
    if ymd > latest_ymd or (ymd == latest_ymd and version > latest_version):
        latest_files[invoice] = (ymd, version, file_name)

keep_files = [tup[2] for tup in latest_files.values()]
for file_name in file_names:
    if file_name in keep_files:
        continue
    os.remove(os.path.join(source, file_name))

outputting .zip file in django

I want to upload a zip file containing .csv files and output a zip file containing .vm files.
I use this code:
def csv_archive_to_vm(request):
    response = HttpResponse(content_type='application/force-download')
    work_string = ''
    if request.method == "POST":
        ## reading input zip file
        input_file = request.FILES.get('file')
        zf = zipfile.ZipFile(input_file)
        for info in zf.infolist():
            ## reading files in archive
            path = re.search('(.*\.csv)', info.filename)
            path_name = re.search('(.*/)(.*\.csv)', info.filename)
            for string in zf.open(info.filename):
                quotes_search = re.search('"(.*)",\s*"(.*)",\s*"(.*)"', string)
                if quotes_search:
                    descr = quotes_search.group(1)
                    macro_name = quotes_search.group(2)
                    say = quotes_search.group(3)
                    new_lines_search = re.search('/n', say)
                    if new_lines_search:
                        say = re.sub('/n', '\n\t\t', say)
                    ## making content for new files for new archive
                    work_string = work_string + '##' + descr + '\n#macro(' + macro_name + ')\n\t#random()\n\t\t' + say + '\n\t#end\n#end\n\n'
            ## outputting new archive
            zipdata = StringIO()
            zf_create = zipfile.ZipFile(zipdata, mode='a')
            try:
                if path_name:
                    zf_create.writestr(str(path_name.group(1)) + str(path_name.group(2))[0:-4] + '.vm', work_string)
            finally:
                zf_create.close()
            work_string = ''
        response = HttpResponse(zipdata.read())
        response['Content-Disposition'] = 'attachment; filename=assistant-linguistics_vm.zip'
        response['Content-Type'] = 'application/x-zip'
    return response
but I get an empty zip archive, 0 KB in size. What am I doing wrong? Thanks.
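A hedged diagnosis, since no answer is attached here: after the archive is written, the StringIO position is at the end of the buffer, so zipdata.read() returns an empty string and the response body is 0 bytes; rewinding with zipdata.seek(0) (or reading with zipdata.getvalue()) fixes that. Creating a fresh StringIO on every pass through the infolist loop also throws away previously written entries, so the buffer should be created once, before the loop. A minimal sketch of the output side (on Python 3, use io.BytesIO, since zip data is binary):

import io
import zipfile

zipdata = io.BytesIO()  # one buffer for the whole archive
with zipfile.ZipFile(zipdata, mode='w') as zf_create:
    # write each generated .vm file here, e.g.:
    zf_create.writestr('example.vm', work_string)
zipdata.seek(0)  # rewind before reading
response = HttpResponse(zipdata.read())
response['Content-Disposition'] = 'attachment; filename=assistant-linguistics_vm.zip'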
