I have a folder with more than 1,000 files that is updated constantly. I'm using a script to prefix each file name with a random number based on the total number of files, like this:
Before
file_a
file_b
After
1_file_a
2_file_b
I would like to add leading zeros so that the files are sorted correctly. Like this:
0001_file_a
0010_file_b
0100_file_c
Here's the random number script:
import os
import random
used_random = []
os.chdir('c:/test')
for filename in os.listdir():
    n = random.randint(1, len(os.listdir()))
    while n in used_random:
        n = random.randint(1, len(os.listdir()))
    used_random.append(n)
    os.rename(filename, f"{n}_{filename}")
I would suggest using f-strings to accomplish this.
>>> num = 2
>>> f"{num:04}_file"
'0002_file'
>>> num = 123
>>> f"{num:04}_file"
'0123_file'
I would also replace the following with a list comprehension.
cleaned_files = []
for item in folder_files:
    if item[0] == '.' or item[0] == '_':
        pass
    else:
        cleaned_files.append(item)

cleaned_files = [item for item in folder_files if item[0] not in ('.', '_')]
You should use the first element of the list obtained after split:
def getFiles(files):
    for file in files:
        file_number, file_end = file.split('_', 1)  # split on the first underscore only
        num = file_number.zfill(4)  # num is 4 characters long with leading zeros
        new_file = "{}_{}".format(num, file_end)
        # rename or store the new file name for later rename
Something like this should work; I hope it helps:
import re
import glob
import os
import shutil

os.chdir('/tmp')  # I played in the /tmp directory
for filename in glob.glob('[0-9]*_file_*'):
    m = re.match(r'(^[0-9]+)(_.*)$', filename)
    if m:
        num = f"{int(m.group(1)):04}"  # e.g. 23: convert to int and then format
        name = m.group(2)  # the rest of the name, e.g. _file_a
        new_filename = num + name  # 0023_file_a
        print(filename + " " + new_filename)
        # Not sure if you want to rename the files; if yes:
        # shutil.move(filename, new_filename)
Thanks to user https://stackoverflow.com/users/15261315/chris I updated the random number script to add leading zeros:
import os
import random
used_random = []
os.chdir('c:/Test')
for filename in os.listdir():
    n = random.randint(1, len(os.listdir()))
    while n in used_random:
        n = random.randint(1, len(os.listdir()))
    used_random.append(n)
    os.rename(filename, f"{n:04}_{filename}")
I have a script which runs on a single input PDF file. Is there a way to run this script on multiple PDF files in a directory?
The snippet below is where the single input PDF file is set:
# Select the Master PDF Path. Located in "INPUT" folder
masterPDF_path = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"
master_pdf_document = 'Payment# 79724.pdf'
The complete script that runs on a single PDF file is below:
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
import fitz
from datetime import datetime
import os

# Select the Master PDF Path. Located in "INPUT" folder
masterPDF_path = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"
master_pdf_document = 'Payment# 79724.pdf'
os.chdir(masterPDF_path)

# Choose the Path of Where the Doc Split Invoices should go. Located in "OUTPUT" folder
docSplit_dumpPath = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\New folder"

#=========================================================================================#
#===================================== GET NUMBER OF PAGES ===============================#
#=========================================================================================#
String1 = "WIRE TRANSFER/ACH RECORD VOUCHER"
page_range = {}
pdfstarter = 0
doc = fitz.open(masterPDF_path + "\\" + master_pdf_document)
docPageCount = doc.page_count

#================= PARSE PDF INTO THE DICTIONARY - LONG COMPUTE TIME ======================#
for i in range(0, docPageCount):
    pageText = doc.load_page(i)
    totalpage = i + 1
    pageiText = pageText.get_text('text')
    if String1 in pageiText:
        page_range.update({pdfstarter: totalpage})
        pdfstarter = totalpage
    #print("Current Page: ", i, " out of ", docPageCount)

#================= PARSE PDF INTO THE DICTIONARY - LONG COMPUTE TIME ======================#
invoiceList = []
for i in range(0, docPageCount):
    pageText = doc.load_page(i)
    pageiText = pageText.get_text('text')
    if String1 in pageiText:
        pageiText = pageiText.split("\n")
        test_list = pageiText
        # Checking if exists in list
        for i in test_list:
            if i == String1:
                invoice = "PAYMENT_WIRE_BANK STATEMENT_STEP_1_" + master_pdf_document
                #print(invoice)
                invoiceList.append(invoice)

#========================================= SETUP ==========================================#
### SPLITTING INTO n
n = int(len(invoiceList))
### CREATING FOUR LIST OF Invoice LIST
fourSplit_invoiceList = [invoiceList[i:i + n] for i in range(0, len(invoiceList), n)]
### CONVERTING DIC TO LIST CONTAINING TUPLES
page_rangeList = [(k, v) for k, v in page_range.items()]
### CREATING FOUR LIST OF PAGE RANGE
fourSplit_pageRange = [page_rangeList[i:i + n] for i in range(0, len(page_rangeList), n)]
TotalNumberOfDocs = len(fourSplit_invoiceList[0])

#=========================================================================================#
#=========================================================================================#
#==================================== CREATE PDFs ========================================#
#=========================================================================================#
openpdf = PyPDF2.PdfFileReader(masterPDF_path + "\\" + master_pdf_document)
for i in range(len(fourSplit_invoiceList[0])):
    page_numberstart = fourSplit_pageRange[0][i][0]
    page_numberend = fourSplit_pageRange[0][i][1]
    outputfile = fourSplit_invoiceList[0][i]
    outputfile = os.path.join(docSplit_dumpPath, outputfile)
    try:
        assert page_numberstart < openpdf.numPages
        pdf_writer1 = PdfFileWriter()
        for page in range(page_numberstart, page_numberend):
            pdf_writer1.addPage(openpdf.getPage(page))
        with open("{}".format(outputfile), 'wb') as file0:
            pdf_writer1.write(file0)
    except AssertionError as e:
        print("Error: The PDF you are cutting has less pages than you want to cut!")
If you have a list of file names you can loop over them:
files = ['Payment# 1.pdf', 'Payment# 2.pdf']
for file in files:
    master_pdf_document = file
Or, if you want to loop over your payment numbers and the 'Payment' string remains unchanged:
payment_numbers = [1, 2]
for payment_number in payment_numbers:
    master_pdf_document = 'Payment# ' + str(payment_number) + '.pdf'
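If instead you want to pick up every PDF in the input folder automatically, a sketch along these lines should work, assuming the single-file logic is first wrapped in a function (split_pdf here is a hypothetical name for that wrapper):
import glob
import os

masterPDF_path = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"

# glob returns full paths; basename recovers just the file name
for path in glob.glob(os.path.join(masterPDF_path, '*.pdf')):
    master_pdf_document = os.path.basename(path)
    split_pdf(masterPDF_path, master_pdf_document)  # hypothetical wrapper around the script above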
I wrote some code to version names in Python. The idea is to append v1, v2, ... if a name already exists in a list. I tried the following code:
import pandas as pd
list_names = pd.Series(['name_1', 'name_1_v1'])
name = 'name_1'
new_name = name
i = 1
while list_names.str.contains(new_name).any() == True:
    new_name = f'{name}_v{i}'
    if list_names.str.contains(new_name).any() == False:
        break
    i = i + 1
It works fine when I input 'name_1' (output: 'name_1_v2'); however, when I enter 'name_1_v1', the output is 'name_1_v1_v1' (the correct result would be 'name_1_v2'). I thought of using a regex with the pattern _v[0-9]$, but I wasn't able to make it work.
<<< edit >>>
Output should be new_name = 'name_1_v2'. The idea is to find an adequate versioned name, not change the ones in the list.
Proposed code:
import pandas as pd
import re

basename = 'name_1'

def new_version(lnam, basename):
    i, lat_v = 0, 0
    # looks for latest version
    while i < len(lnam):
        if re.search(r'v\d*', lnam[i]) is not None:
            lat_v = max(int(re.findall(r'v\d*', lnam[i])[0][1:]), lat_v)
        i += 1
    if lat_v == 0:
        return basename + '_v1'
    else:
        return basename + '_v%s' % (lat_v + 1)
lnam = pd.Series(['name_1'])
new_name = new_version(lnam, basename)
print("new_name : ", new_name)
# new_name : name_1_v1
lnam = pd.Series(['name_1', 'name_1_v1'])
new_name = new_version(lnam, basename)
print("new_name : ", new_name)
# new_name : name_1_v2
Result:
new_name : name_1_v2
Now let's try with an unordered list of names (the next version should be 101):
lnam = pd.Series(['name_1', 'name_1_v4', 'name_1_v100', 'name_1_v12', 'name_1_v17'])
new_name = new_version(lnam, basename)
print("new_name : ", new_name)
# new_name : name_1_v101
Automatic basename identification (as #FernandoQuintino suggests):
basename = re.sub(r'_v\d*', '', basename)
# name_1
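To handle the question's failing case ('name_1_v1' should give 'name_1_v2'), you can strip any existing version suffix first and reuse new_version from above; a minimal sketch:
def strip_version(name):
    # remove a trailing _v<digits> suffix, if present
    return re.sub(r'_v\d+$', '', name)

name = 'name_1_v1'
basename = strip_version(name)  # name_1
lnam = pd.Series(['name_1', 'name_1_v1'])
print(new_version(lnam, basename))  # name_1_v2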
I have a list of strings:
fileList = ['YMML.2019.09.10-Run.1-Final.pdf',
'YMML.2019.09.10-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.1-Final.pdf',
'YMML.2019.09.12-Run.2-Initial.pdf',
'YMML.2019.09.13-Run.2-Initial.pdf',
'YMML.2019.09.12-Run.1-Final.pdf',
'YMML.2019.09.13-Run.1-Final.pdf',
'YMML.2019.09.14-Run.1-Final.pdf',]
and I'd like to confirm that there is both a Run.1-Final and Run.2-Initial for each date.
I've tried something like:
for i in range(len(directoryList)):
    if directoryList[i][5:15] != directoryList[i + 1][5:15]:
        print(directoryList[i] + ' is missing.')
        i += 2
and I'd like the output to be
'YMML.2019.09.14-Run.2-Initial.pdf is missing.'
Perhaps something like
dates = [directoryList[i][5:15] for i in range(len(directoryList))]
counter = collections.Counter(dates)
But then having trouble extracting from the dictionary.
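For what it's worth, the Counter idea can be finished by keeping the dates whose count is below 2; a minimal sketch, assuming the fileList from above:
import collections

dates = [name[5:15] for name in fileList]
counter = collections.Counter(dates)

# dates that lack either the Run.1-Final or the Run.2-Initial file
incomplete = [date for date, count in counter.items() if count < 2]
print(incomplete)  # ['2019.09.14']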
To make it more readable, you could create a list of dates first, then loop over those.
file_list = ['YMML.2019.09.10-Run.1-Final.pdf',
'YMML.2019.09.10-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.1-Final.pdf',
'YMML.2019.09.12-Run.2-Initial.pdf',
'YMML.2019.09.13-Run.2-Initial.pdf',
'YMML.2019.09.12-Run.1-Final.pdf',
'YMML.2019.09.13-Run.1-Final.pdf',
'YMML.2019.09.14-Run.1-Final.pdf',]
dates = set([item[5:15] for item in file_list])
for date in dates:
    if 'YMML.' + date + '-Run.1-Final.pdf' not in file_list:
        print('YMML.' + date + '-Run.1-Final.pdf is missing')
    if 'YMML.' + date + '-Run.2-Initial.pdf' not in file_list:
        print('YMML.' + date + '-Run.2-Initial.pdf is missing')
set() takes the unique values in the list to avoid looping through them all twice.
I'm kind of late, but here's what I found to be the simplest way, maybe not the most efficient:
for file in fileList:
    if file[20:27] == "1-Final":
        if (file[0:20] + "2-Initial.pdf") not in fileList:
            print(file)
    elif file[20:29] == "2-Initial":
        if (file[0:20] + "1-Final.pdf") not in fileList:
            print(file)
Here's an O(n) solution which collects items into a defaultdict by date, then filters on quantity seen, restoring original names from the remaining value:
from collections import defaultdict
files = [
'YMML.2019.09.10-Run.1-Final.pdf',
'YMML.2019.09.10-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.1-Final.pdf',
'YMML.2019.09.12-Run.2-Initial.pdf',
'YMML.2019.09.13-Run.2-Initial.pdf',
'YMML.2019.09.12-Run.1-Final.pdf',
'YMML.2019.09.13-Run.1-Final.pdf',
'YMML.2019.09.14-Run.1-Final.pdf',
]
seen = defaultdict(list)
for x in files:
    seen[x[5:15]].append(x)
missing = [v[0] for k, v in seen.items() if len(v) < 2]
print(missing) # => ['YMML.2019.09.14-Run.1-Final.pdf']
Getting names of partners can be done with a conditional:
names = [
    x[:20] + "2-Initial.pdf" if x[20] == "1" else
    x[:20] + "1-Final.pdf" for x in missing
]
print(names) # => ['YMML.2019.09.14-Run.2-Initial.pdf']
This works:
fileList = ['YMML.2019.09.10-Run.1-Final.pdf',
'YMML.2019.09.10-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.1-Final.pdf',
'YMML.2019.09.12-Run.2-Initial.pdf',
'YMML.2019.09.13-Run.2-Initial.pdf',
'YMML.2019.09.12-Run.1-Final.pdf',
'YMML.2019.09.13-Run.1-Final.pdf',
'YMML.2019.09.14-Run.1-Final.pdf',]
initial_set = {filename[:15] for filename in fileList if 'Initial' in filename}
final_set = {filename[:15] for filename in fileList if 'Final' in filename}
for filename in final_set - initial_set:
    print(filename + '-Run.2-Initial.pdf is missing.')
for filename in initial_set - final_set:
    print(filename + '-Run.1-Final.pdf is missing.')
I have a python script for Editorial on iOS that I've modified, and I would like help tweaking it further.
I have .taskpaper files in a Dropbox folder that Editorial is pointed at. When I run this workflow, the script searches all the files and returns a list of lines that include "#hardware". This is working well, but the final list includes #hardware items that I've finished and appended with #done. How can I exclude #hardware lines that also contain #done?
There are seven files that run. These two seem to be the ones that need to be modified:
Generate the list of hashtags
import editor
import console
import os
import re
import sys
import codecs
import workflow
pattern = re.compile(r'\s#{1}(\w+)', re.I|re.U)
p = editor.get_path()
from urllib import quote
dir = os.path.split(p)[0]
valid_extensions = set(['.taskpaper'])
tags = ['#hardware']
for w in os.walk(dir):
    dir_path = w[0]
    filenames = w[2]
    for name in filenames:
        full_path = os.path.join(dir_path, name)
        ext = os.path.splitext(full_path)[1]
        if ext.lower() in valid_extensions:
            try:
                with codecs.open(full_path, 'r', 'utf-8') as f:
                    for line in f:
                        for match in re.finditer(pattern, line):
                            tags.append(match.group(1))
            except UnicodeDecodeError, e:
                pass

workflow.set_output('\n'.join(sorted(set(tags))))
and
Search documents with hashtags
import editor
import console
import os
import re
import sys
import codecs
import workflow
from StringIO import StringIO
theme = editor.get_theme()
workflow.set_variable('CSS', workflow.get_variable('CSS Dark' if theme == 'Dark' else 'CSS Light'))
p = editor.get_path()
searchterm = workflow.get_variable('Search Term')
term = '#' + searchterm
pattern = re.compile(re.escape(term), flags=re.IGNORECASE)
from urllib import quote
dir = os.path.split(p)[0]
valid_extensions = set(['.taskpaper'])
html = StringIO()
match_count = 0
for w in os.walk(dir):
    dir_path = w[0]
    filenames = w[2]
    for name in filenames:
        full_path = os.path.join(dir_path, name)
        ext = os.path.splitext(full_path)[1]
        if ext.lower() not in valid_extensions:
            continue
        found_snippets = []
        i = 0
        try:
            with codecs.open(full_path, 'r', 'utf-8') as f:
                for line in f:
                    for match in re.finditer(pattern, line):
                        start = max(0, match.start(0) - 100)
                        end = min(len(line) - 1, match.end(0) + 100)
                        snippet = (line[start:match.start(0)],
                                   match.group(0),
                                   line[match.end(0):end],
                                   match.start(0) + i,
                                   match.end(0) + i)
                        found_snippets.append(snippet)
                    i += len(line)
        except UnicodeDecodeError, e:
            pass
        if len(found_snippets) > 0:
            match_count += 1
            root, rel_path = editor.to_relative_path(full_path)
            ed_url = 'editorial://open/' + quote(rel_path.encode('utf-8')) + '?root=' + root
            html.write('<h2>' + name + '</h2>')
            for snippet in found_snippets:
                start = snippet[3]
                end = snippet[4]
                select_url = 'editorial://open/' + quote(rel_path.encode('utf-8')) + '?root=' + root
                select_url += '&selection=' + str(start) + '-' + str(end)
                html.write('<a class="result-box" href="' + select_url + '">' + snippet[0] + '<span class="highlight">' + snippet[1] + '</span>' + snippet[2] + '</a>')

if match_count == 0:
    html.write('<p>No matches found.</p>')
workflow.set_output(html.getvalue())
Thank you.
Since the matching lines are stored in a list, you can use a list comprehension to exclude the ones you don't want. Something like this:
>>> l = ['#hardware ttuff', 'stuff #hardware', 'things #hardware sett #done', '#hardware', '#hardware# #done']
>>> print(l)
['#hardware ttuff', 'stuff #hardware', 'things #hardware sett #done', '#hardware', '#hardware# #done']
>>> m = [s for s in l if '#done' not in s]
>>> print(m)
['#hardware ttuff', 'stuff #hardware', '#hardware']
A friend solved it for me.
We added:
if not "#done" in line:
in the "Search documents with hashtags" file after
for line in f:
It works great.
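For anyone else reading, here is a standalone sketch of how that filter behaves inside a line loop, with made-up sample lines:
import re

pattern = re.compile(r'\s#{1}(\w+)', re.I | re.U)
lines = [
    'fix the printer #hardware',
    'replace PSU #hardware #done',
    'order cables #hardware',
]

tags = []
for line in lines:
    if not "#done" in line:  # the added filter: skip finished items
        for match in re.finditer(pattern, line):
            tags.append(match.group(1))

print(tags)  # ['hardware', 'hardware']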
This is what I am doing:
import csv

output = open('output.txt', 'wb')

# this function returns the min for num.txt
def get_min(num):
    return int(open('%s.txt' % num, 'r+').readlines()[0])

# temporary variables
last_line = ''
input_list = []

# iterate over input.txt and collect the input into a list of tuples
for i, line in enumerate(open('input.txt', 'r+').readlines()):
    if i % 2 == 0:
        last_line = line
    else:
        input_list.append((last_line, line))

filtered = [(header, data[:get_min(header[-2])] + '\n') for (header, data) in input_list]
[output.write(''.join(data)) for data in filtered]
output.close()
In this code, input.txt is something like this:
>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA
and num.txt is something like this
M 4
P 10
I want to go through input.txt, look at the last column of each header line (the letter there matches an entry in num.txt), and cut the sequence characters according to the value given in num.txt.
I think the error in my code is that it only accepts text files containing integers, while it should also accept files that contain letters.
The totally revised version, after a long chat with the OP:
import os
import re

# Fetch all hashes and counts
file_c = open('num.txt')
file_c = file_c.read()
lines = re.findall(r'\w+\.txt \d+', file_c)

numbers = {}
for line in lines:
    line_split = line.split('.txt ')
    hash_name = line_split[0]
    count = line_split[1]
    numbers[hash_name] = count
#print(numbers)

# The input file
file_i = open('input.txt')
file_i = file_i.read()

for hash_name, count in numbers.items():
    regex = '(' + hash_name.strip() + ')'
    result = re.findall(r'>.*\|(' + regex + ')(.*?)>', file_i, re.S)
    if len(result) > 0:
        data_original = result[0][2]
        stripped_data = result[0][2][int(count):]
        file_i = file_i.replace(data_original, '\n' + stripped_data)
        #print(data_original)
        #print(stripped_data)
#print(file_i)

# Write the input file to new input_new.txt
f = open('input_new.txt', 'wt')
f.write(file_i)
You can do it like so:
import re

min_count = 4  # this variable will contain that count integer from where to start removing
str_to_match = 'EOG6CC67M'  # this variable will contain the filename you read
input = ''  # The file input (input.txt) will go in here
counter = 0

def callback_f(e):
    global min_count
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    # Only replace the value with nothing (remove it) after a certain count
    if counter > min_count:
        return ''  # replace with nothing
    return e.group()  # otherwise keep the match unchanged (re.sub callbacks must return a string)

result = re.sub(r'' + str_to_match, callback_f, input)
With this tactic you can keep count with a global counter and there's no need to do hard line-loops with complex structures.
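For illustration, a minimal self-contained run of this tactic with made-up input (the hash name and count are placeholders):
import re

min_count = 2
counter = 0

def callback_f(e):
    global counter
    counter += 1
    if counter > min_count:
        return ''      # remove occurrences beyond the count
    return e.group()   # keep the first min_count occurrences

text = 'EOG6CC67M EOG6CC67M EOG6CC67M EOG6CC67M'
print(re.sub(r'EOG6CC67M', callback_f, text))
# EOG6CC67M EOG6CC67M   (the last two matches are removed; their separating spaces remain)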
Update
More detailed version, with file access:
import os
import re

def callback_f(e):
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    # Remove the match once the count from the num file is exceeded
    if counter > int(min_count):
        return ''
    return e.group()

# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}
for file in num_files:
    if file[0] != '.':
        file_c = open('./num_files/' + file)
        file_c = file_c.read()
        numbers[file.split('.')[0]] = file_c

# Now the CSV files
csv_files = os.listdir('./csv_files')
for file in csv_files:
    if file[0] != '.':
        for hash_name, min_count in numbers.items():
            file_c = open('./csv_files/' + file)
            file_c = file_c.read()
            counter = 0
            result = re.sub(r'' + hash_name, callback_f, file_c)
            # Write the replaced content back to the file here
Considered directory/file structure:
+ Projects
  + Project_folder
    + csv_files
      - input1.csv
      - input2.csv
      ~ etc.
    + num_files
      - EOG6CC67M.txt
      - EOG62JQZP.txt
      ~ etc.
    - python_file.py
The CSV files contain the big chunks of text you show in your original question.
The num files are the hash files, each containing an integer.
What happens in this script:
- Collect all hash files (in a dictionary) and their count numbers
- Loop through all CSV files
- Subloop through the collected numbers for each CSV file
- Replace/remove (based on what you do in callback_f()) hashes after a certain count
- Write the output back; it's the last comment in the script and would contain the file.write() functionality (a sketch follows below)
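A minimal sketch of that write-back step, assuming the loop variables from the script above (file holds the CSV file name, result the replaced content):
# Write the replaced content back to the file
with open('./csv_files/' + file, 'w') as f_out:
    f_out.write(result)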