I have a script which runs on a single input PDF file. Is there a way this script can run for multiple PDF files in a directory?
The snippet below is where the single input PDF file is specified:
# Select the Master PDF Path. Located in "INPUT" folder
masterPDF_path = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"
master_pdf_document = 'Payment# 79724.pdf'
The complete script that runs on a single PDF file is as follows:
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
import fitz
from datetime import datetime
import os

# Select the master PDF path. Located in "INPUT" folder
masterPDF_path = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"
master_pdf_document = 'Payment# 79724.pdf'
os.chdir(masterPDF_path)

# Choose the path where the split invoices should go. Located in "OUTPUT" folder
docSplit_dumpPath = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\New folder"

#=========================================================================================#
#===================================== GET NUMBER OF PAGES ===============================#
#=========================================================================================#
String1 = "WIRE TRANSFER/ACH RECORD VOUCHER"
page_range = {}
pdfstarter = 0

doc = fitz.open(masterPDF_path + "\\" + master_pdf_document)
docPageCount = doc.page_count

#================= PARSE PDF INTO THE DICTIONARY - LONG COMPUTE TIME ======================#
# A new page range starts wherever the marker string is found
for i in range(docPageCount):
    page = doc.load_page(i)
    totalpage = i + 1
    pageiText = page.get_text('text')
    if String1 in pageiText:
        page_range.update({pdfstarter: totalpage})
        pdfstarter = totalpage

# Build one output file name per occurrence of the marker string
invoiceList = []
for i in range(docPageCount):
    page = doc.load_page(i)
    pageiText = page.get_text('text')
    if String1 in pageiText:
        # Check each line of the page for the marker string
        for line in pageiText.split("\n"):
            if line == String1:
                invoice = "PAYMENT_WIRE_BANK STATEMENT_STEP_1_" + master_pdf_document
                invoiceList.append(invoice)

#========================================= SETUP ==========================================#
### SPLITTING INTO n
n = len(invoiceList)
### CREATING CHUNKS OF THE INVOICE LIST
fourSplit_invoiceList = [invoiceList[i:i + n] for i in range(0, len(invoiceList), n)]
### CONVERTING DICT TO LIST OF TUPLES
page_rangeList = [(k, v) for k, v in page_range.items()]
### CREATING CHUNKS OF THE PAGE RANGES
fourSplit_pageRange = [page_rangeList[i:i + n] for i in range(0, len(page_rangeList), n)]
TotalNumberOfDocs = len(fourSplit_invoiceList[0])

#=========================================================================================#
#==================================== CREATE PDFs ========================================#
#=========================================================================================#
openpdf = PyPDF2.PdfFileReader(masterPDF_path + "\\" + master_pdf_document)
for i in range(len(fourSplit_invoiceList[0])):
    page_numberstart = fourSplit_pageRange[0][i][0]
    page_numberend = fourSplit_pageRange[0][i][1]
    outputfile = fourSplit_invoiceList[0][i]
    outputfile = os.path.join(docSplit_dumpPath, outputfile)
    try:
        assert page_numberstart < openpdf.numPages
        pdf_writer1 = PdfFileWriter()
        for page in range(page_numberstart, page_numberend):
            pdf_writer1.addPage(openpdf.getPage(page))
        with open(outputfile, 'wb') as file0:
            pdf_writer1.write(file0)
    except AssertionError:
        print("Error: The PDF you are cutting has fewer pages than you want to cut!")
If you have a list of file names you can loop over them:
files = ['Payment# 1.pdf', 'Payment# 2.pdf']
for file in files:
    master_pdf_document = file
Or, if you want to loop over your payment numbers and the 'Payment' string remains unchanged:
payment_numbers = [1,2]
for payment_number in payment_numbers:
    master_pdf_document = 'Payment# ' + str(payment_number) + '.pdf'
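Putting it together, here is a minimal sketch, assuming the splitting script above is wrapped into a (hypothetical) function split_pdf(master_pdf_document), that processes every PDF in the input directory:

import glob
import os

masterPDF_path = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"

# Loop over every PDF file in the input folder and run the splitting logic on it
for full_path in glob.glob(os.path.join(masterPDF_path, '*.pdf')):
    master_pdf_document = os.path.basename(full_path)
    split_pdf(master_pdf_document)  # hypothetical wrapper around the script above

This way you do not need to maintain a hard-coded list of file names.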
I would like to get the radio-button / checkbox information from a PDF document.
I had a look at pdfplumber and PyPDF2, but was not able to find a solution with these modules.
I can parse the text using the code below, but for the radio buttons I only get the text, with no information about which button (or checkbox) is selected.
import pdfplumber
import os
import sys
if __name__ == '__main__':
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    fn = os.path.join(path, "input.pdf")
    pdf = pdfplumber.open(fn)
    page = pdf.pages[0]
    text = page.extract_text()
I have also uploaded an example file here:
https://easyupload.io/8y8k2v
Is there any way to get this information from the PDF file using Python?
I think I found a solution using pdfplumber.
(It is probably not elegant, but I can check whether the radio buttons are selected or not.)
Generally:
I read all chars and all curves for all pages.
Then I sort all elements by x and y (to get the chars and elements in the correct order, as in the PDF).
Then I concatenate the chars, and also add blanks when the distance between the chars is longer than within a word.
I check the pts information for the curves, and from that get the information whether the radio button is selected or not.
The final lines and the yes/no information are stored in a list, line by line, for further processing.
import pdfplumber
import os
import sys

# Note: path must be defined before use (it was missing in the snippet above)
path = os.path.abspath(os.path.dirname(sys.argv[0]))
fn = os.path.join(path, "input.pdf")
pdf = pdfplumber.open(fn)

finalContent = []
for idx, page in enumerate(pdf.pages, start=1):
    print(f"Reading page {idx}")

    # Collect all chars and curves with their x/y positions
    contList = []
    for e in page.chars:
        tmpRow = ["char", e["text"], e["x0"], e["y0"]]
        contList.append(tmpRow)
    for e in page.curves:
        tmpRow = ["curve", e["pts"], e["x0"], e["y0"]]
        contList.append(tmpRow)

    # Sort by x, then by y descending, to restore the reading order of the PDF
    contList.sort(key=lambda x: x[2])
    contList.sort(key=lambda x: x[3], reverse=True)

    workContent = []
    workText = ""
    workDistCharX = False
    for e in contList:
        if e[0] == "char":
            # Insert a separator when the distance to the previous char is too large
            if workDistCharX is not False and \
                    (e[2] - workDistCharX > 20 or e[3] - workDistCharY < -2):
                workText += " / "
            workText += e[1]
            workDistCharX = e[2]
            workDistCharY = e[3]
            continue
        if e[0] == "curve":
            if workText != "":
                workContent.append(workText)
                workText = ""
            # The pts information of the curve shows whether the button is selected
            if e[1][0][0] < 100:
                tmpVal = "SELECT-YES"
            else:
                tmpVal = "SELECT-NO"
            workContent.append(f"CURVE {tmpVal}, None, None")
    finalContent.extend(workContent)
    workContent = "\n".join(workContent)
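To inspect the collected result, the gathered lines can simply be printed afterwards, e.g.:

for row in finalContent:
    print(row)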
I would like to create different Word documents using a template and an Excel input file.
So I read the xlsx file, change the content of the template, and want to save several docx files. I use the following code, which generally works fine for the first document (everything gets replaced and stored as expected). But the following documents always contain the same content as the first document. For every row in the Excel sheet I tried to reassign the document with
docWork = doc
but it seems that somehow this initialization is not working.
This is the full code I am using:
from docx import Document
import os, sys
import xlwings as xw
import time
from datetime import datetime

if __name__ == '__main__':
    print(f"Start Program V8...")
    SAVE_INTERVAL = 5
    WAIT = 3
    FN = "dataCreateCover.xlsx"
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    fn = os.path.join(path, FN)
    print(f"Read {fn}...")
    wb = xw.Book(fn)
    ws = wb.sheets[0]
    inpData = ws.range("A2:Z5000").value
    inpData = [x for x in inpData if x[0] is not None]
    tday = str(datetime.today().date())
    print(f"Read DOCX-Files...")
    FNDOC = "template.docx"
    fnDoc = os.path.join(path, FNDOC)
    print(f"Path for DOCX: {fnDoc}...")
    doc = Document(fnDoc)
    for elem in inpData:
        dictWords = {}
        docWork = doc
        elem = [x for x in elem if x is not None]
        for idxElem, valElem in enumerate(elem):
            dictWords[f"[section{idxElem + 1}]"] = valElem
        for idx, para in enumerate(docWork.paragraphs):
            for k, v in dictWords.items():
                if k in para.text:
                    inline = para.runs
                    for item in inline:
                        if k in item.text:
                            item.text = item.text.replace(k, v)
                            print(f"Replaced {k} with {v}...")
                            break
        docFN = f"{tday}_{elem[1]}.docx"
        docWork.save(docFN)
    print(f"Document <{docFN}> created - pls press <enter> to close the window...")
How can I use the docx template and write different Word documents as output?
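The likely culprit is docWork = doc: assignment in Python only copies the reference, so every row keeps editing the same Document object. A minimal sketch of one fix, assuming the template should simply be re-read for every row:

from docx import Document

fnDoc = "template.docx"  # path to the template

for row_index in range(3):  # stand-in for looping over inpData
    docWork = Document(fnDoc)  # re-open the template: a fresh, independent Document per row
    # ... perform the run-level replacements on docWork here ...
    docWork.save(f"output_{row_index}.docx")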
I'm working on a project where the file names contain actual dates, but the data for each date is split across multiple files.
I developed the following program to count the number of files for each date (part of the filename) and also their total size.
Is there a better way to achieve the same?
import os
import glob
import collections

directory_name = "\\SpeicifDir\\"

# Get a list of files (file paths) in the given directory
list_of_files = filter(os.path.isfile,
                       glob.glob(directory_name + '*.txt'))

mapOfDateFileSize = collections.defaultdict(list)

# For all the files
for file_path in list_of_files:
    file_size = os.stat(file_path).st_size
    filename = os.path.basename(file_path)
    # Split the file name using - as a separator and extract the date
    splitFilename = filename.split('-')
    dtForFile = splitFilename[1] + "-" + splitFilename[2] + "-" + splitFilename[3]
    # Update the file size and count
    if dtForFile in mapOfDateFileSize:
        dataFromDictionary = mapOfDateFileSize[dtForFile][0]
        totalCount = dataFromDictionary[0] + 1
        totalSize = dataFromDictionary[1] + file_size
        mapOfDateFileSize[dtForFile] = [(totalCount, totalSize)]
    else:
        mapOfDateFileSize[dtForFile].append((1, file_size))

# For each date print the total file count and total size
for dt, elements in mapOfDateFileSize.items():
    totalCount, totalSize = elements[0]
    print(dt, ",", totalCount, ",", totalSize)
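A somewhat tighter sketch of the same idea, assuming the filenames always contain at least four '-'-separated fields: letting a defaultdict hold a mutable [count, size] pair removes the need for the membership check:

import os
import glob
import collections

directory_name = "\\SpeicifDir\\"
totals = collections.defaultdict(lambda: [0, 0])  # date -> [file count, total size]

for file_path in glob.glob(directory_name + '*.txt'):
    if not os.path.isfile(file_path):
        continue
    parts = os.path.basename(file_path).split('-')
    date_key = '-'.join(parts[1:4])
    totals[date_key][0] += 1
    totals[date_key][1] += os.stat(file_path).st_size

for dt, (count, size) in totals.items():
    print(dt, ",", count, ",", size)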
My goal is to download full metazoan genome sequences from NCBI. I have a list of unique ID numbers for the genome sequences I need. I planned to use the Bio.Entrez module EFetch to download the data but learned today via the Nov 2, 2011 release notes (http://1.usa.gov/1TA5osg) that EFetch does not support the 'Genome' database. Can anyone suggest an alternative package/module or some other way around this? Thank you in advance!
Here is a script for you -- though you may need to tinker with it to make it work. Name the script whatever you prefer, but when you call the script do so as follows:
python name_of_script[with .py extension] your_email_address
You need to add your email to the end of the call, or it will not work. If you have a text file of accession numbers (one per line), choose option 2. If you choose option 1, it will ask you for items like the name of the organism, the strain name, and keywords. Use as many keywords as you like -- just be certain to separate them by commas. If you go with the first option, NCBI will be searched and will return GI numbers [NOTE: NCBI is phasing out GI numbers in 9.2016, so this script may not work after that point], which are then used to snag the accession numbers. Once all the accession numbers are present, a folder is created, with a subfolder for each accession number (named after the accession number). In each subfolder, the corresponding fasta AND GenBank file is downloaded, each carrying the accession number as the file name (e.g. accession_number.fa, accession_number.gb). Edit the script to your purposes.
ALSO... please note the warning (ACHTUNG) portion of the script. Sometimes the rules can be bent... but if you are egregious enough, your IP may be blocked from NCBI. You have been warned.
import os
import os.path
import sys
import re  # regular expressions
from Bio import Entrez
import datetime
import time
import glob

arguments = sys.argv
Entrez.email = arguments[1]  # email
accession_ids = []

print('Select method for obtaining the accession numbers?\n')
action = input('1 -- Input Search Terms\n2 -- Use text file\n')

if action == '1':
    print('\nYou will be asked to enter an organism name, a strain name, and keywords.')
    print('It is not necessary to provide a value to each item (you may just hit [ENTER]), but you must provide at least one item.\n')
    organism = input('Enter the organism you wish to search for (e.g. Escherichia coli [ENTER])\n')
    strain = input('Enter the strain you wish to search for. (e.g., HUSEC2011 [ENTER])\n')
    keywords = input('Enter the keywords separated by a comma (e.g., complete genome, contigs, partial [ENTER])\n')
    search_phrase = ''
    if ',' in keywords:
        keywords = keywords.split(',')
    ncbi_terms = ['organism', 'strain', 'keyword']
    ncbi_values = [organism, strain, keywords]
    # Build the NCBI search phrase from the entered values
    for index, n in enumerate(ncbi_values):
        if index == 0 and n != '':
            search_phrase = '(' + n + '[' + ncbi_terms[index] + '])'
        else:
            if n != '' and index != len(ncbi_values) - 1:
                search_phrase = search_phrase + ' AND (' + n + '[' + ncbi_terms[index] + '])'
        if index == len(ncbi_values) - 1 and n != '' and type(n) is not list:
            search_phrase = search_phrase + ' AND (' + n + '[' + ncbi_terms[index] + '])'
        if index == len(ncbi_values) - 1 and n != '' and type(n) is list:
            for name in n:
                name = name.lstrip()
                search_phrase = search_phrase + ' AND (' + name + '[' + ncbi_terms[index] + '])'
    print('Here is the complete search line that will be used: \n\n', search_phrase)

    handle = Entrez.esearch(db='nuccore', term=search_phrase, retmax=1000, rettype='acc', retmode='text')
    result = Entrez.read(handle)
    handle.close()
    #print(result['Count'])
    gi_numbers = result['IdList']
    fetch_handle = Entrez.efetch(db='nucleotide', id=result['IdList'], rettype='acc', retmode='text')
    accession_ids = [id.strip() for id in fetch_handle]
    fetch_handle.close()

if action == '2':  # use this option if you have a file of accession numbers
    file_name = input('Enter the name of the file\n')
    with open(file_name, 'r') as input_file:
        lines = input_file.readlines()
    for line in lines:
        line = line.replace('\n', '')
        accession_ids.append(line)
#--------------------------------------------------------------------------------------------------------------
#----------------------------------- Make directory to store files --------------------------------------------
new_path = 'Genbank_Files/'
if not os.path.exists(new_path):
    os.makedirs(new_path)

print('You have ' + str(len(accession_ids)) + ' file(s) to download.')  #print(accession_ids)

## CHECK IF FILE HAS ALREADY BEEN DOWNLOADED
ending = '.gb'
files = []
for dirpath, dirnames, filenames in os.walk(new_path):
    for filename in [f for f in filenames if f.endswith(ending)]:  # for zipped files
        files.append(os.path.join(dirpath, filename))
for f in files:
    f = f.rsplit('/')[-1]
    f = f.replace('.gb', '')
    if f in accession_ids:
        ind = accession_ids.index(f)
        accession_ids.pop(ind)
print('')
print('You have ' + str(len(accession_ids)) + ' file(s) to download.')
#--------------------------------------------------------------------------
###############################################################################
#---ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG----#
###############################################################################
# Call Entrez to download files.
# If downloading more than 100 files...
#   Run this script only between 9pm-5am Monday - Friday EST
#   Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov
#   Make no more than 3 requests every 1 second (Biopython takes care of this).
#   Use the URL parameters email & tool for distributed software
#   NCBI's Disclaimer and Copyright notice must be evident to users of your service.
#
# Use this script at your own risk.
# Neither the script author nor the author's employers are responsible for consequences arising from improper usage.
###############################################################################
# CALL ENTREZ: download GenBank AND fasta (nucleotide) files using accession numbers.
###############################################################################
start_day = datetime.date.today().weekday()  # 0 is Monday, 6 is Sunday
start_time = datetime.datetime.now().time()
print(str(start_day), str(start_time))
print('')
if ((start_day < 5 and start_time > datetime.time(hour=21)) or
        (start_day < 5 and start_time < datetime.time(hour=5)) or
        start_day > 5 or len(accession_ids) <= 100):
    print('Calling Entrez...')
    for a in accession_ids:
        if ((datetime.date.today().weekday() < 5 and datetime.datetime.now().time() > datetime.time(hour=21)) or
                (datetime.date.today().weekday() < 5 and datetime.datetime.now().time() < datetime.time(hour=5)) or
                (datetime.date.today().weekday() == start_day + 1 and datetime.datetime.now().time() < datetime.time(hour=5)) or
                (datetime.date.today().weekday() > 5) or len(accession_ids) <= 100):
            print('Downloading ' + a)
            new_path = 'Genbank_Files/' + a + '/'
            if not os.path.exists(new_path):
                os.makedirs(new_path)
            # Download the GenBank file
            handle = Entrez.efetch(db='nucleotide', id=a, rettype='gb', retmode='text', seq_start=0)
            FILENAME = new_path + a + '.gb'
            with open(FILENAME, 'w') as local_file:
                local_file.write(handle.read())
            handle.close()
            # Download the fasta file
            handle = Entrez.efetch(db='nucleotide', id=a, rettype='fasta', retmode='text')
            FILENAME = new_path + a + '.fna'
            with open(FILENAME, 'w') as local_file:
                local_file.write(handle.read())
            handle.close()
else:
    print('You have too many files to download at this time. Try again later.')
#-------
This is what I am doing:
import csv

output = open('output.txt', 'wb')

# this function returns the min for num.txt
def get_min(num):
    return int(open('%s.txt' % num, 'r+').readlines()[0])

# temporary variables
last_line = ''
input_list = []

# iterate over input.txt and sort the input into a list of tuples
for i, line in enumerate(open('input.txt', 'r+').readlines()):
    if i % 2 == 0:
        last_line = line
    else:
        input_list.append((last_line, line))

filtered = [(header, data[:get_min(header[-2])] + '\n') for (header, data) in input_list]
[output.write(''.join(data)) for data in filtered]
output.close()
In this code, input.txt is something like this:
>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA
and num.txt is something like this:
M 4
P 10
I want the script to look at the last column of each header in input.txt, find the matching value in num.txt, and cut the record's characters according to that value.
I think the error in my code is that it only accepts an integer text file, while it should also accept files which contain alphabets.
The totally revised version, after a long chat with the OP:
import os
import re

# Fetch all hashes and counts
file_c = open('num.txt')
file_c = file_c.read()
lines = re.findall(r'\w+\.txt \d+', file_c)

numbers = {}
for line in lines:
    line_split = line.split('.txt ')
    hash_name = line_split[0]
    count = line_split[1]
    numbers[hash_name] = count
#print(numbers)

# The input file
file_i = open('input.txt')
file_i = file_i.read()

for hash_name, count in numbers.items():
    regex = '(' + hash_name.strip() + ')'
    result = re.findall(r'>.*\|(' + regex + ')(.*?)>', file_i, re.S)
    if len(result) > 0:
        data_original = result[0][2]
        stripped_data = result[0][2][int(count):]
        file_i = file_i.replace(data_original, '\n' + stripped_data)
        #print(data_original)
        #print(stripped_data)
#print(file_i)

# Write the result to a new file, input_new.txt
f = open('input_new.txt', 'wt')
f.write(file_i)
f.close()
You can do it like so:
import re

min_count = 4               # the count from where to start removing
str_to_match = 'EOG6CC67M'  # the filename you read
input = ''                  # the file content (input.txt) goes in here
counter = 0

def callback_f(e):
    global min_count
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    # Only replace the value with nothing (remove it) after a certain count
    if counter > min_count:
        return ''  # replace with nothing
    return e.group()  # keep the match unchanged (re.sub requires a string back)

result = re.sub(r'' + str_to_match, callback_f, input)
With this tactic you can keep count with a global counter and there's no need to do hard line-loops with complex structures.
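For example, a small self-contained demo of this callback tactic (with made-up sample values, keeping the first two occurrences and dropping the rest):

import re

counter = 0
min_count = 2

def callback_f(e):
    global counter
    counter += 1
    if counter > min_count:
        return ''        # drop occurrences past the threshold
    return e.group()     # keep earlier occurrences unchanged

text = 'EOG6CC67M EOG6CC67M EOG6CC67M'
print(re.sub('EOG6CC67M', callback_f, text))
# prints: 'EOG6CC67M EOG6CC67M ' -- the third occurrence is removed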
Update
A more detailed version with file access:
import os
import re

counter = 0

def callback_f(e):
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    return e.group()  # return the match unchanged here; adapt this to replace/remove

# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}
for file in num_files:
    if file[0] != '.':
        file_c = open('./num_files/' + file)
        file_c = file_c.read()
        numbers[file.split('.')[0]] = file_c

# Now the CSV files
csv_files = os.listdir('./csv_files')
for file in csv_files:
    if file[0] != '.':
        for hash_name, min_count in numbers.items():
            file_c = open('./csv_files/' + file)
            file_c = file_c.read()
            counter = 0
            result = re.sub(r'' + hash_name, callback_f, file_c)
            # Write the replaced content back to the file here
Considered directory/file structure:
+ Projects
+ Project_folder
+ csv_files
- input1.csv
- input2.csv
~ etc.
+ num_files
- EOG6CC67M.txt
- EOG62JQZP.txt
~ etc.
- python_file.py
The CSV files contain the big chunks of text you state in your original question.
The num files are the hash files, each containing an integer.
What happens in this script:
Collect all hash files (in a dictionary) together with their inner count numbers
Loop through all CSV files
Subloop through the collected numbers for each CSV file
Replace/remove (based on what you do in callback_f()) hashes after a certain count
Write the output back (it's the last comment in the script; a minimal sketch of that step follows below)
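A minimal sketch of that final write-back step, assuming the replaced content in result should simply overwrite the original CSV file:

with open('./csv_files/' + file, 'w') as out_f:
    out_f.write(result)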