Comparing part of a string within a list - python

I have a list of strings:
fileList = ['YMML.2019.09.10-Run.1-Final.pdf',
'YMML.2019.09.10-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.1-Final.pdf',
'YMML.2019.09.12-Run.2-Initial.pdf',
'YMML.2019.09.13-Run.2-Initial.pdf',
'YMML.2019.09.12-Run.1-Final.pdf',
'YMML.2019.09.13-Run.1-Final.pdf',
'YMML.2019.09.14-Run.1-Final.pdf',]
and I'd like to confirm that there is both a Run.1-Final and Run.2-Initial for each date.
I've tried something like:
# NOTE(review): this attempt is broken as written --
#  - `directoryList[i + 1]` raises IndexError when i is the last index,
#  - `i += 2` has no effect: a `for i in range(...)` loop rebinds i each
#    iteration, so the manual increment is silently discarded,
#  - adjacent-pair comparison only works if the list is sorted by date.
for i in range(len(directoryList)):
    if directoryList[i][5:15] != directoryList[i + 1][5:15]:
        print(directoryList[i] + ' is missing.')
    i += 2
and I'd like the output to be
'YMML.2019.09.14-Run.2-Initial.pdf is missing,
Perhaps something like
dates = [directoryList[i][5:15] for i in range(len(directoryList))]
counter = collections.Counter(dates)
But then having trouble extracting from the dictionary.

To make it more readable, you could create a list of dates first, then loop over those.
# The PDF names to audit: each date should have both a Run.1-Final
# and a Run.2-Initial file.
file_list = ['YMML.2019.09.10-Run.1-Final.pdf',
             'YMML.2019.09.10-Run.2-Initial.pdf',
             'YMML.2019.09.11-Run.2-Initial.pdf',
             'YMML.2019.09.11-Run.1-Final.pdf',
             'YMML.2019.09.12-Run.2-Initial.pdf',
             'YMML.2019.09.13-Run.2-Initial.pdf',
             'YMML.2019.09.12-Run.1-Final.pdf',
             'YMML.2019.09.13-Run.1-Final.pdf',
             'YMML.2019.09.14-Run.1-Final.pdf']
# Unique date substrings (characters 5-14, e.g. '2019.09.10').
# A set comprehension replaces set([...]) -- no throwaway list is built.
dates = {item[5:15] for item in file_list}
for date in dates:
    # Reconstruct both expected names for this date; report any absentee.
    if 'YMML.' + date + '-Run.1-Final.pdf' not in file_list:
        print('YMML.' + date + '-Run.1-Final.pdf is missing')
    if 'YMML.' + date + '-Run.2-Initial.pdf' not in file_list:
        print('YMML.' + date + '-Run.2-Initial.pdf is missing')
set() takes the unique values in the list to avoid looping through them all twice.

I'm kind of late, but here's what I found to be the simplest way — maybe not the most efficient:
# For every file, verify its partner run exists: a '1-Final' needs a
# matching '2-Initial' for the same date, and vice versa. The first 20
# characters ('YMML.yyyy.mm.dd-Run.') identify the date+run prefix.
for file in fileList:
    if file[20:27] == "1-Final":
        if (file[0:20] + "2-Initial.pdf") not in fileList:
            print(file)
    # Fixed from the original `file[19:29] is "2-Initial.pdf"`:
    #  - `is` compares object identity, not string equality; use `==`,
    #  - the 10-char slice could never equal a 13-char literal, so the
    #    branch was unreachable. file[20:29] is exactly "2-Initial".
    elif file[20:29] == "2-Initial":
        if (file[0:20] + "1-Final.pdf") not in fileList:
            print(file)

Here's an O(n) solution which collects items into a defaultdict by date, then filters on quantity seen, restoring original names from the remaining value:
from collections import defaultdict

# Filenames under audit; a complete date contributes two entries
# (one Run.1-Final and one Run.2-Initial).
files = [
    'YMML.2019.09.10-Run.1-Final.pdf',
    'YMML.2019.09.10-Run.2-Initial.pdf',
    'YMML.2019.09.11-Run.2-Initial.pdf',
    'YMML.2019.09.11-Run.1-Final.pdf',
    'YMML.2019.09.12-Run.2-Initial.pdf',
    'YMML.2019.09.13-Run.2-Initial.pdf',
    'YMML.2019.09.12-Run.1-Final.pdf',
    'YMML.2019.09.13-Run.1-Final.pdf',
    'YMML.2019.09.14-Run.1-Final.pdf',
]

# Bucket the names by their date substring (characters 5-14).
seen = defaultdict(list)
for name in files:
    date_key = name[5:15]
    seen[date_key].append(name)

# Any date with fewer than two entries is missing its partner file.
missing = []
for group in seen.values():
    if len(group) < 2:
        missing.append(group[0])
print(missing)  # ['YMML.2019.09.14-Run.1-Final.pdf']

# Name the absent partner: swap Run.1-Final <-> Run.2-Initial on the
# 20-char date+run prefix.
names = []
for entry in missing:
    if entry[20] == "1":
        names.append(entry[:20] + "2-Initial.pdf")
    else:
        names.append(entry[:20] + "1-Final.pdf")
print(names)  # ['YMML.2019.09.14-Run.2-Initial.pdf']

This works:
# Filenames under audit: every date needs a matching Final/Initial pair.
fileList = ['YMML.2019.09.10-Run.1-Final.pdf',
            'YMML.2019.09.10-Run.2-Initial.pdf',
            'YMML.2019.09.11-Run.2-Initial.pdf',
            'YMML.2019.09.11-Run.1-Final.pdf',
            'YMML.2019.09.12-Run.2-Initial.pdf',
            'YMML.2019.09.13-Run.2-Initial.pdf',
            'YMML.2019.09.12-Run.1-Final.pdf',
            'YMML.2019.09.13-Run.1-Final.pdf',
            'YMML.2019.09.14-Run.1-Final.pdf']

# Collect the 15-char date prefix (e.g. 'YMML.2019.09.10') per run type.
initial_set = set()
final_set = set()
for name in fileList:
    if 'Initial' in name:
        initial_set.add(name[:15])
    if 'Final' in name:
        final_set.add(name[:15])

# Set differences expose dates that have one run type but not the other.
for prefix in final_set - initial_set:
    print(prefix + '-Run.2-Initial.pdf is missing.')
for prefix in initial_set - final_set:
    print(prefix + '-Run.1-Final.pdf is missing.')

Related

Version strings in python using patterns

I created a code to version names in python. The idea is to add v1, v2... if a name already exists in a list. I tried the following code:
import pandas as pd

# Existing names; the goal is to find an unused versioned name for `name`.
list_names = pd.Series(['name_1', 'name_1_v1'])
name = 'name_1'
new_name = name
i = 1
# NOTE(review): `Series.str.contains` does substring (regex) matching, so
# when the input itself already ends in '_v1' the whole input (suffix
# included) is used as `name`, producing 'name_1_v1_v1' instead of
# 'name_1_v2' -- this is the bug the question is about.
while list_names.str.contains(new_name).any() == True:
    new_name = f'{name}_v{i}'
    if list_names.str.contains(new_name).any() == False:
        break
    i = i + 1
It works fine when I input 'name_1' (output: 'name_1_v2'), however, when I enter 'name_1_v1', the output is 'name_1_v1_v1' (correct would be 'name_1_v2'). I thought of using a regex with pattern _v[0-9]$, but I wasn't able to make it work.
<<< edit >>>
Output should be new_name = 'name_1_v2'. The idea is to find an adequate versioned name, not change the ones in the list.
Proposed code :
import pandas as pd
import re
basename = 'name_1'
def new_version(lnam, basename):
    """Return *basename* suffixed with the next unused version number.

    lnam     : iterable of existing names (e.g. a pandas Series).
    basename : the unversioned base name to extend.

    Scans every existing name for a trailing '_v<number>' marker and
    returns '<basename>_v<latest + 1>', or '<basename>_v1' when no
    versioned name exists yet.
    """
    latest = 0
    for existing in lnam:
        # Raw string + anchored pattern: the original 'v\d*' (non-raw)
        # matched a bare 'v' anywhere in the name, and with zero digits
        # int('') would raise ValueError. r'_v(\d+)$' only matches a real
        # version suffix at the end of the name.
        match = re.search(r'_v(\d+)$', str(existing))
        if match:
            latest = max(int(match.group(1)), latest)
    if latest == 0:
        return basename + '_v1'
    # Next free version is one past the highest seen (handles unordered lists).
    return basename + '_v%s' % (latest + 1)
lnam = pd.Series(['name_1'])
new_name = new_version(lnam, basename)
print("new_name : ", new_name)
# new_name : name_1_v1
lnam = pd.Series(['name_1', 'name_1_v1'])
new_name = new_version(lnam, basename)
print("new_name : ", new_name)
# new_name : name_1_v2
Result :
new_name : name_1_v2
Let's try now with an unordered list of names (next version is 101) :
lnam = pd.Series(['name_1', 'name_1_v4', 'name_1_v100', 'name_1_v12', 'name_1_v17'])
new_name = new_version(lnam, basename)
print("new_name : ", new_name)
# new_name : name_1_v101
Basename automatic identification (like #FernandoQuintino suggests)
basename = re.sub('_v\d*', '', basename)
# name_1

Rename duplicate pdf name by increasing counter

So I've written something to pull out certain string (beneficiary) from pdf's and rename the file based on the string but the problem is if there are duplicates, is there any way to add a +1 counter behind the name?
My inefficient code as follow, appreciate any help!:
# Walk the input directory and rename each PDF after a string pulled from
# its first page (the 'attention' line, else the beneficiary line).
for filename in os.listdir(input_dir):
    if filename.endswith('.pdf'):
        input_path = os.path.join(input_dir, filename)
        # NOTE(review): re-globbing inside the outer loop re-processes every
        # PDF once per directory entry; the outer loop appears redundant.
        pdf_array = (glob.glob(input_dir + '*.pdf'))
        for current_pdf in pdf_array:
            with pdfplumber.open(current_pdf) as pdf:
                page = pdf.pages[0]
                text = page.extract_text()
                # presumably line 3 carries the 'attention' marker -- TODO confirm
                keyword = text.split('\n')[2]
                try:
                    if 'attention' in keyword:
                        pdf_to_att = text.split('\n')[2]
                        start_to_att = 'For the attention of: '
                        to_att = pdf_to_att.removeprefix(start_to_att)
                        # close before rename so the file handle is released
                        pdf.close()
                        result = to_att
                        # NOTE(review): duplicate names overwrite each other
                        # here -- this is the problem the question asks about.
                        os.rename(current_pdf, result + '.pdf')
                    else:
                        pdf_to_ben = text.split('\n')[1]
                        start_to_ben = 'Beneficiary Name : '
                        end_to_ben = pdf_to_ben.rsplit(' ', 1)[1]
                        to_ben = pdf_to_ben.removeprefix(start_to_ben).removesuffix(end_to_ben).rstrip()
                        pdf.close()
                        result = to_ben
                        os.rename(current_pdf, result + '.pdf')
                except Exception:
                    # NOTE(review): swallowing every exception hides failures.
                    pass
messagebox.showinfo("A Title", "Done!")
edit: the desired output should be
AAA.pdf
AAA_2.pdf
BBB.pdf
CCC.pdf
CCC_2.pdf
I would use a dict to record the occurrence count of each filename.
dict.get() returns the value for key if key is in the dictionary, else default. If default is not given, it defaults to None
# Occurrence count per extracted name, so duplicates get numeric suffixes:
# AAA.pdf, AAA_2.pdf, AAA_3.pdf, ...
pdf_name_count = {}
for current_pdf in pdf_array:
    with pdfplumber.open(current_pdf) as pdf:
        page = pdf.pages[0]
        text = page.extract_text()
        keyword = text.split('\n')[2]
        try:
            if 'attention' in keyword:
                ...
                result = to_att
            else:
                ...
                result = to_ben
            # dict.get(result, 0) -> 0 the first time a name is seen.
            filename_count = pdf_name_count.get(result, 0)
            if filename_count >= 1:
                # Seen before: suffix with count+1 (second copy -> '_2').
                filename = f'{result}_{filename_count+1}.pdf'
            else:
                filename = result + '.pdf'
            os.rename(current_pdf, filename)
            # increase the name occurrence by 1
            pdf_name_count[result] = filename_count + 1
        except Exception:
            pass
What you want is to build a string, for the filename, that includes a counter,
let's call it cnt. Python has the f-string syntax for this exact purpose, it
lets you interpolate a variable into a string.
Initialize your counter before the for loop:
cnt = 0
Replace
os.rename(current_pdf, result + '.pdf')
with
os.rename(current_pdf, f'{result}_{cnt}.pdf')
cnt += 1
The f before the opening quote introduces the f-string, and the curly braces
{} let you include any python expression, in your case we just substitute the
values of the two variables result and cnt. Then we increment the counter,
of course.
os.path.isfile can be your mate here — it meets your needs.
import os

def get_new_name(result):
    """Return a non-clashing PDF path for *result*.

    Gives back '<result>.pdf' when that file does not exist yet;
    otherwise the first free '<result>_<n>.pdf' with n starting at 2.
    """
    plain = result + '.pdf'
    if not os.path.isfile(plain):
        # Base name is still free -- use it unchanged.
        return plain
    # Base exists: probe _2, _3, ... until an unused number turns up.
    file_number = 2
    while os.path.isfile('{}_{}.pdf'.format(result, file_number)):
        file_number += 1
    return '{}_{}.pdf'.format(result, file_number)
my screenshot
I updated the code for your output format; it should work.

Sort dicoms images using the metadata?

I am trying to sort dicoms of multiple subjects into their respective folders based on their PatientID. The current directory has all the dicoms for all subjects without sorting. I am able to go through a dicom directory and group subjects by their PatientID and count how many dicoms each subject has. Is it possible to copy or move the dicoms to another directory and sort them in a folder based on their PatientID.
code:
os.listdir('\\dicoms')
device = torch.device("cuda")
print(device)
input_path = '\\dicoms\\'
ds_columns = ['ID', 'PatientID', 'Modality', 'StudyInstance',
'SeriesInstance', 'PhotoInterpretation', 'Position0',
'Position1', 'Position2', 'Orientation0', 'Orientation1',
'Orientation2', 'Orientation3', 'Orientation4', 'Orientation5',
'PixelSpacing0', 'PixelSpacing1']
def extract_dicom_features(ds):
    """Flatten selected header fields of DICOM dataset *ds* into one row.

    MultiValue fields (position, orientation, pixel spacing) are expanded
    into individual floats so the row lines up with the flat `ds_columns`
    column list; scalar fields are appended unchanged.
    """
    ds_items = [ds.SOPInstanceUID,
                ds.PatientID,
                ds.Modality,
                ds.StudyInstanceUID,
                ds.SeriesInstanceUID,
                ds.PhotometricInterpretation,
                ds.ImagePositionPatient,
                ds.ImageOrientationPatient,
                ds.PixelSpacing]
    line = []
    for item in ds_items:
        # MultiValue is sequence-like; expand each component to a float.
        if type(item) is pydicom.multival.MultiValue:
            line += [float(x) for x in item]
        else:
            line.append(item)
    return line
list_img = os.listdir(input_path + 'imgs')
print(len(list_img))
df_features = []
for img in tqdm.tqdm(list_img):
img_path = input_path + 'imgs/' + img
ds = pydicom.read_file(img_path)
df_features.append(extract_dicom_features(ds))
df_features = pd.DataFrame(df_features, columns=ds_columns)
df_features.head()
df_features.to_csv('\\meta.csv')
print(Counter(df_features['PatientID']))
example of metadata:
,ID,PatientID,Modality,StudyInstance,SeriesInstance,PhotoInterpretation,Position0,Position1,Position2,Orientation0,Orientation1,Orientation2,Orientation3,Orientation4,Orientation5,PixelSpacing0,PixelSpacing1
0,ID_000012eaf,ID_f15c0eee,CT,ID_30ea2b02d4,ID_0ab5820b2a,MONOCHROME2,-125.0,-115.89798,77.970825,1.0,0.0,0.0,0.0,0.927184,-0.374607,0.488281,0.488281
example of Counter output:
Counter({'ID_19702df6': 28, 'ID_b799ed34': 26, 'ID_e3523464': 26, 'ID_cd9169c2': 26, 'ID_e326a8a4': 24, 'ID_45da90cb': 24, 'ID_99e4f787': 24, 'ID_df751e93': 24, 'ID_929a5b39': 20})
I added the following code to try to sort the images into subdirectories but I run into an error:
dest_path = input_path+'imageProcessDir'
counter = 0
for index, rows in df_features.iterrows():
    filename = basename(rows['ID'])
    # NOTE(review): the read result is bound to `image` here ...
    image = cv2.imread(input_path+rows['ID'])
    counter=counter+1
    fold = rows['PatientID']+"/"
    dest_fold = dest_path+fold
    # ... but `img` (undefined) is passed below -- that mismatch is what
    # raises "Expected cv::UMat for argument 'img'". Note also that
    # cv2.imread cannot decode DICOM files, so `image` would be None anyway.
    cv2.imwrite(dest_fold+"/"+filename+ "_" +str(counter)+".dcm", img)
error:
Traceback (most recent call last):
File "ct_move.py", line 77, in <module>
cv2.imwrite(dest_fold+"/"+filename+ "_" +str(counter)+".dcm", img)
TypeError: Expected cv::UMat for argument 'img'
I'd also second ditching CV for this - it is overkill.
Try pydicom instead.
What I'd do for your problem (move all files with same patient ID into their own folder, and count how many for each) is:
get list of dicom files as a list (use glob.glob to search a directory and/or just pass in the full file list via argv)
load all those files into a list of pydicom dicom file objects (DataSets), so something like:
import pydicom
for fname in glob.glob(sys.argv[1], recursive=False):
print("loading: {}".format(fname))
files.append(pydicom.read_file(fname))
go through that list and move (creating new directory if required) that file. So something like (not working code - I can't off the top of my head remember the os module methods I'm just putting the function in <>'s, just showing how conceptually to do it):
from collections import defaultdict
# dict for counting number of files for each patient ID
patient_id_count = defaultdict(lambda: 0)
for f in files:
id = f.PatientID # this gets the patient ID from the current file
if os.<directory doesnt exist>(id):
os.<create directory>(id)
os.<move>(f.file_name, id)
patient_id_count[id] += 1
To address your issue, it seems like overkill to use opencv here at all. If all you want to do is to move the dicom images from one location into another on the filesystem, you could use os.rename or shutil.move if you are on a UNIX-like system. Unless you are modifying image content, these are cleaner and faster solutions.
I noticed two little things in your last code block:
I think I noticed that you want the fold variable to have the "/" prefixed instead of suffixed for the paths to work.
Also, the counter will continue to increment across all dicoms, where I think you want it to increment on a per-subject basis (I am assuming that df_features will be sorted on PatientID here, if it is not, maybe you could use the Counter class).
dest_path = input_path+'imageProcessDir'
counter = 0
# Remember the first PatientID so the counter can restart per subject.
prev_fold = '/' + df_features.loc[0, 'PatientID']
for index, rows in df_features.iterrows():
    filename = basename(rows['ID'])
    counter=counter + 1
    fold = '/' + rows['PatientID']
    dest_fold = dest_path + fold
    out_file = dest_fold + "/" + filename + "_" + str(counter) + ".dcm"
    os.rename(input_path + rows['ID'], out_file)
    # NOTE(review): this reset runs AFTER the rename above, so the first
    # file of each new PatientID still receives the previous subject's
    # running count -- the check belongs before the increment.
    if fold != prev_fold:
        counter = 0 # reset when the PatientID changes
        prev_fold = fold
I would also use os.path.join to handle filesystem paths instead of adding "/" to everything:
fold = rows['PatientID']
dest_fold = os.path.join(dest_path, fold)
as I think that there is also an issue with the input file path: input_path + rows['ID']
edit:
This is to get rid of the use of '/' and put in os.path.join
# Move each DICOM into a per-PatientID folder, numbering files per subject.
dest_path = os.path.join(input_path, 'imageProcessDir')
counter = 0
# PatientID of the previous row; used to detect subject changes.
prev_fold = df_features.loc[0, 'PatientID']
for index, rows in df_features.iterrows():
    filename = basename(rows['ID'])
    fold = rows['PatientID']
    # Reset BEFORE numbering: the original reset after the rename, so the
    # first file of each new PatientID kept the previous subject's count.
    if fold != prev_fold:
        counter = 0  # reset when the PatientID changes
        prev_fold = fold
    counter = counter + 1
    dest_fold = os.path.join(dest_path, fold)
    os.makedirs(dest_fold, exist_ok=True)  # make sure target folder exists
    out_file = os.path.join(dest_fold, filename + "_" + str(counter) + ".dcm")
    os.rename(os.path.join(input_path, rows['ID']), out_file)
Also, note that os.rename(os.path.join(input_path, rows['ID']), out_file) may need to be os.rename(os.path.join(input_path, rows['ID'] + '.dcm'), out_file)
If it's not too much, you may want to make a backup of your files before attempting this, to make sure you get what you want out!
Thank you I solved the problem with your help.
Solution:
os.listdir('directory')
device = torch.device("cuda")
print(device)
input_path = 'directory\\'
ds_columns = ['ID', 'PatientID', 'Modality', 'StudyInstance',
'SeriesInstance', 'PhotoInterpretation', 'Position0',
'Position1', 'Position2', 'Orientation0', 'Orientation1',
'Orientation2', 'Orientation3', 'Orientation4', 'Orientation5',
'PixelSpacing0', 'PixelSpacing1']
def extract_dicom_features(ds):
ds_items = [ds.SOPInstanceUID,
ds.PatientID,
ds.Modality,
ds.StudyInstanceUID,
ds.SeriesInstanceUID,
ds.PhotometricInterpretation,
ds.ImagePositionPatient,
ds.ImageOrientationPatient,
ds.PixelSpacing]
line = []
for item in ds_items:
if type(item) is pydicom.multival.MultiValue:
line += [float(x) for x in item]
else:
line.append(item)
return line
list_img = os.listdir(input_path)
print(len(list_img))
print('***********')
print(list_img)
print('***********')
df_features = []
for img in tqdm.tqdm(list_img):
img_path = input_path + img
ds = pydicom.read_file(img_path)
df_features.append(extract_dicom_features(ds))
df_features = pd.DataFrame(df_features, columns=ds_columns)
print(df_features)
print('***********')
df_features.head()
df_features.to_csv('\\test_meta.csv')
print(Counter(df_features['PatientID']))
print('***********')
df_features['ID'] = df_features['ID'].astype(str) + ".dcm"
print(df_features)
print('***********')
dest_path = '\\sorted'
counter = 0
# Track the previous PatientID so the counter can restart per subject.
prev_fold = '\\' + df_features.loc[0, 'PatientID']
for index, rows in df_features.iterrows():
    filename = basename(rows['ID'])
    counter=counter + 1
    fold = '\\' + rows['PatientID']
    dest_fold = dest_path + fold
    # NOTE(review): `counter` is incremented but never used in the output
    # name here, and the reset below runs only AFTER the first file of a
    # new PatientID has been renamed -- both look like leftovers.
    out_file = os.path.join(dest_fold, filename)
    print(out_file)
    print('-------------')
    if not os.path.exists(dest_fold):
        os.mkdir(dest_fold)
    os.rename(os.path.join(input_path, rows['ID']), out_file)
    if fold != prev_fold:
        counter = 0
        prev_fold = fold

Append to associate array

I have a Python script that iterates through a PDF file (loops over each page), and inside each page does some text manipulation. So basically two loops:
files = {}
#npages is the number of PDF pages in the specific file.
for n in range(npages):
    path = pdf_name + str(n + 1) + '_1.txt'
    files[int(n)] = path
    for i, col in enumerate(COLUMNS):
        path = pdf_name + str(n + 1) + '_' + str(i + 2) + '.txt'
        # NOTE(review): files[n] was just assigned a *string* above, so
        # item-assigning into it raises "TypeError: 'str' object does not
        # support item assignment" -- the error the question reports.
        files[int(n)][int(i)] = path
Basically, I looking on each PDF page, and on each page I then further do some text manipulation.
I am trying to output it like:
- file_page_1.pdf
- file_page_1_col_1.pdf
- file_page_1_col_2.pdf
file_page_2.pdf
- file_page_2_col_1.pdf
- file_page_2_col_2.pdf
However using above coes gives me below error:
files[int(n)][int(i)] = path
TypeError: 'str' object does not support item assignment
I think the structure you're looking for is a dict that has string keys to list values.
# Map each page's main text file (key) to its per-column files (value list).
files = {}
for page in range(npages):
    # Fixed: the original body used `n`, a leftover from the question's
    # loop variable, which is undefined here -- the loop variable is `page`.
    path = pdf_name + str(page + 1) + '_1.txt'
    files[path] = []
    for i, col in enumerate(COLUMNS):
        subpath = pdf_name + str(page + 1) + '_' + str(i + 2) + '.txt'
        files[path].append(subpath)

# For accessing items
for path, subpaths in files.items():
    # path is a string, the key in files dict
    print(path)
    # subpaths is a list of strings, the value in files dict
    for subpath in subpaths:
        print(subpath)
If you want the path/subpath pairs to be returned in the order it was inserted, you can use OrderedDict instead of dict.
from collections import OrderedDict
files = OrderedDict()
# code as above
it is because files[int(n)] returns you str and not a dictionary.
as you can see from your line.
files[int(n)] = path
you are trying to achieve a dictionary behavior from a str object.
to carry out what you are trying to do we can do something like.
from collections import defaultdict
files = {}
for n in range(npages):
    path = pdf_name + str(n + 1) + '_1.txt'
    # NOTE(review): defaultdict() with no default_factory behaves exactly
    # like a plain dict (missing keys still raise KeyError); `{}` would be
    # clearer here.
    files[int(n)] = defaultdict()
    # Page-level path stored under a sentinel key alongside column indices.
    files[int(n)]['path_root'] = path
    for i, col in enumerate(COLUMNS):
        path = pdf_name + str(n + 1) + '_' + str(i + 2) + '.txt'
        files[int(n)][int(i)] = path
this should give you result like:
|-- nth file
| |
| |- path_root
| |- child1 (0)
| |- child2 (1)
..
A quick side note about defaultdict:
somedict = {}
print(somedict[3]) # KeyError
someddict = defaultdict(int) # or str
print(someddict[3]) # print int(), thus 0 (str will return you '')

Python Error: String indices must be integers, not str

OK, I have an obvious problems staring me in the face that I can't figure out. I am getting the output/results I need but I get the TypeError: "string indices must be integers, not str". The following is a sample of my code. It is because of the statement "if f not in GetSquishySource(dirIn)" Basicially I am looking to see if a specific file is in another list so that I don't end up adding it to a zip file I am creating. I just don't see the problem here and how to get around it. Any help would be appreciated.
def compressLists(z, dirIn, dirsIn, filesIn, encrypt=None):
try:
with zipfile.ZipFile(z, 'w', compression=zipfile.ZIP_DEFLATED) as zip:
# Add files
compressFileList(z, dirIn, dirIn, filesIn, zip, encrypt)
# Add directories
for dir in dirsIn:
dirPath = os.path.join(dirIn, dir["name"])
for root, dirs, files in os.walk(dirPath):
# Ignore hidden files and directories
files = [f for f in files if not f[0] == '.']
dirs[:] = [d for d in dirs if not d[0] == '.']
# Replace file entries with structure value entries
for i, f in enumerate(files):
del files[i]
if f not in GetSquishySource(dirIn):
files.insert(i, {'zDir': dir["zDir"], 'name': f})
compressFileList(z, dirIn, root, files, zip, encryptedLua)
if dir["recurse"] == False:
break;
The following is the GetSquishySource function I created and call.
def GetSquishySource(srcDir):
    """Collect the module source filenames referenced by *srcDir*/squishy.

    Parses every line starting with 'Module' in the squishy file and
    returns a list of bare .lua filenames (the caller uses these to skip
    files when zipping). Returns [] when no squishy file exists.
    """
    srcToRemove = []
    # os.path.join handles trailing separators; the original concatenated
    # srcDir + os.path.sep + "squishy" by hand.
    squishyPath = os.path.join(srcDir, "squishy")
    if not os.path.isfile(squishyPath):
        return srcToRemove
    # `with` closes the file; the explicit close() inside the original
    # `with` block was redundant.
    with open(squishyPath) as squishyFile:
        squishyContent = squishyFile.readlines()
    for line in squishyContent:
        # readlines() never yields None, so the original `line is not None`
        # (and the later `s is not None`) guards were dead code.
        if not line.startswith("Module"):
            continue
        s = line.split(' ')
        if len(s) == 3:
            # If the 3rd column in the squishy file contains data, use that.
            path = s[2].replace('Module "', '').replace('"', '').replace("\n", '')
            srcToRemove.append(os.path.basename(path))
        elif len(s) == 2:
            # Otherwise derive the path from the dotted module name.
            path = s[1].replace('Module "', '').replace('"', '').replace("\n", '').replace(".", os.path.sep) + ".lua"
            srcToRemove.append(os.path.basename(path))
    return srcToRemove

Categories