How can I add the suffix _brain to the filename stored in the variable fIn = T1_r.nii.gz
so that I get the following output filename?
fOut = T1_r_brain.nii.gz
When I use the following command line
fIn2, file_extension = os.path.splitext(fIn)
it only removes the .gz extension.
Thank you for your help
Fred
I had to write a utility for this, and here's what I came up with.
from pathlib import Path
def add_str_before_suffixes(filepath, string: str) -> Path:
    """Append a string to a filename immediately before extension(s).

    Parameters
    ----------
    filepath : Path-like
        Path to modify. Can contain multiple extensions like `.bed.gz`.
    string : str
        String to append to filename.

    Returns
    -------
    Instance of `pathlib.Path`.

    Examples
    --------
    >>> add_str_before_suffixes("foo", "_baz")
    PosixPath('foo_baz')
    >>> add_str_before_suffixes("foo.bed", "_baz")
    PosixPath('foo_baz.bed')
    >>> add_str_before_suffixes("foo.bed.gz", "_baz")
    PosixPath('foo_baz.bed.gz')
    >>> add_str_before_suffixes(".a.a", "_baz")
    PosixPath('.a_baz.a')
    """
    filepath = Path(filepath)
    name = filepath.name
    suffix = "".join(filepath.suffixes)
    # Cut the suffix off the END of the name only. A str.replace() would
    # also delete the same text if it appears earlier in the name (for
    # example the hidden file ".a.a", whose only suffix is ".a").
    # Slicing with an explicit length also works when the suffix is empty.
    stem = name[:len(name) - len(suffix)]
    new_name = f"{stem}{string}{suffix}"
    return filepath.with_name(new_name)
Here is an example:
>>> f_in = "T1_r.nii.gz"
>>> add_str_before_suffixes(f_in, "_brain")
PosixPath('T1_r_brain.nii.gz')
# Split on every dot: the first piece is the stem, the rest are extensions.
stem, *extensions = 'T1_r.nii.gz'.split('.')
# Attach the marker to the stem only, then glue the pieces back together.
split_path = [stem + '_brain'] + extensions
final_path = ".".join(split_path)
Related
Sorry for the basic question; I am a junior developer trying to learn Snakemake along with click. Please help me understand, for this example, how I can pass a list of paths as the input of a rule and
then receive that list in a Python script.
Snakemake:
path_1 = 'data/raw/data2process/'
path_2 = 'data/raw/table.xlsx'
rule:
input:
list_of_pathes = "list of all pathes to .xlsx/.csv/.xls files from path_1"
other_table = path_2
output:
{some .xlsx file}
shell:
"script_1.py {input.list_of_pathes} {output}"
"script_2.py {input.other_table} {output}"
script_1.py:
#click.command()
#click.argument(input_list_of_pathes, type=*??*)
#click.argument("out_path", type=click.Path())
def foo(input_list_of_pathes: list, out_path: str):
df = pd.DataFrame()
for path in input_list_of_pathes:
table = pd.read_excel(path)
**do smthng**
df = pd.concat([df, table])
df.to_excel(out_path)
script_2.py:
#click.command()
#click.argument("input_path", type=type=click.Path(exist=True))
#click.argument("output_path", type=click.Path())
def foo_1(input_path: str, output_path: str):
table = pd.read_excel(input_path)
**do smthng**
table.to_excel(output_path)
Using pathlib, and the glob method of a Path object, you could proceed as follows:
from itertools import chain
from pathlib import Path
# Directory that holds the tables to process.
path_1 = Path('data/raw/data2process/')
# Extensions (without the leading dot) of the files to collect.
exts = ["xlsx", "csv", "xls"]
# One list of matching paths per extension, in the same order as `exts`.
path_1_path_lists = [
    list(path_1.glob(f"*.{ext}"))
    for ext in exts]
# Flatten the per-extension lists into a single list of paths.
path_1_all_paths = list(chain.from_iterable(path_1_path_lists))
chain.from_iterable "flattens" the list of lists into a single list, but I'm not sure Snakemake even needs this for the input of its rules.
Then, in your rule:
input:
list_of_paths = path_1_all_paths,
other_table = path_2
I think that Path objects can be used directly. Otherwise, you need to turn them into strings with str:
input:
list_of_paths = [str(p) for p in path_1_all_paths],
other_table = path_2
I am using python to create xml file using element and subelement process.
I have a list of zip files in my folder listed below:
Retirement_participant-plan_info_v1_getPlankeys_rev1_2021_03_09.zip
Retirement_participant-plan_info_resetcache_secretmanager_rev1_2021_03_09.zip
Retirement_participant-plan_info_v1_mypru_plankeys_rev1_2021_03_09.zip
Retirement_participant-plan_info_resetcache_param_value_rev1_2021_03_09.zip
Retirement_participant-plan_info_resetcache_param_v1_balances_rev1_2021_03_09.zip
I want to split those zip files and get the name like this:
Retirement_participant-plan_info_v1_getPlankeys
Retirement_participant-plan_info_resetcache_secretmanager
Retirement_participant-plan_info_v1_mypru_plankeys
Retirement_participant-plan_info_resetcache_param_value
Retirement_participant-plan_info_resetcache_param_v1_balances
PS: I want to remove _rev1_2021_03_09.zip while creating a name from the zip file.
Here is my Python code. It works with Retirement_participant-plan_info_v1_getPlankeys_rev1_2021_03_09.zip, but it does not work when the zip file has a longer name, e.g. Retirement_participant-plan_info_resetcache_param_v1_balances_rev1_2021_03_09.zip
# Build a 'Proxies' element under `proxy` and add one 'Proxy' child (with
# 'name' and 'fileName' sub-elements) for each matching entry of `path`.
Proxies = SubElement(proxy, 'Proxies')
path = "./"
for f in os.listdir(path):
# NOTE(review): substring test, so '.zip' anywhere in the name matches,
# not only as the file extension.
if '.zip' in f:
Proxy = SubElement(Proxies, 'Proxy')
name = SubElement(Proxy, 'name')
fileName = SubElement(Proxy, 'fileName')
a = f.split('_')
# Only the first three underscore-separated pieces are kept, so every
# further piece of the file name is dropped from `name`.
name.text = '_'.join(a[:3])
fileName.text = str(f)
You can str.split by rev1_
>>> filenames
['Retirement_participant-plan_info_v1_getPlankeys_rev1_2021_03_09.zip',
'Retirement_participant-plan_info_resetcache_secretmanager_rev1_2021_03_09.zip',
'Retirement_participant-plan_info_v1_mypru_plankeys_rev1_2021_03_09.zip',
'Retirement_participant-plan_info_resetcache_param_value_rev1_2021_03_09.zip',
'Retirement_participant-plan_info_resetcache_param_v1_balances_rev1_2021_03_09.zip']
>>> names = [fname.split('_rev1_')[0] for fname in filenames]
>>> names
['Retirement_participant-plan_info_v1_getPlankeys',
'Retirement_participant-plan_info_resetcache_secretmanager',
'Retirement_participant-plan_info_v1_mypru_plankeys',
'Retirement_participant-plan_info_resetcache_param_value',
'Retirement_participant-plan_info_resetcache_param_v1_balances']
Same can be achieved with str.rsplit by limiting the maxsplit to 4:
>>> names = [fname.rsplit('_', 4)[0] for fname in filenames]
>>> names
['Retirement_participant-plan_info_v1_getPlankeys',
'Retirement_participant-plan_info_resetcache_secretmanager',
'Retirement_participant-plan_info_v1_mypru_plankeys',
'Retirement_participant-plan_info_resetcache_param_value',
'Retirement_participant-plan_info_resetcache_param_v1_balances']
If the rev and date is always the same (2021_03_09), just replace them with the empty string:
filenames = [f.replace("_rev1_2021_03_09.zip", "") for f in os.listdir(path)]
I have a file that has a special extension. Sometimes it is '.exe', or '.exe.gz', or '.exe.tar.gz'... I want to get the filename only. I am using the code below to get the filename abc, but it does not work for all cases:
import os
filename = 'abc.exe'
base = os.path.basename(filename)
print(os.path.splitext(base)[0])
filename = 'abc.exe.gz'
base = os.path.basename(filename)
print(os.path.splitext(base)[0])
Note that I know the list of possible extensions in advance, e.g. ['.exe', '.exe.gz', '.exe.tar.gz', '.gz']
You can just split with the . char and take the first element:
>>> filename = 'abc.exe'
>>> filename.split('.')[0]
'abc'
>>> filename = 'abc.exe.gz'
>>> filename.split('.')[0]
'abc'
How about a workaround like this?
# Known extensions that get_basename() strips from the end of a file name.
suffixes = ['.exe', '.exe.gz', '.exe.tar.gz', '.gz']


def get_basename(filename):
    """Return `filename` with a known extension removed from its end.

    Parameters
    ----------
    filename : str
        File name to strip.

    Returns
    -------
    str
        `filename` without the longest entry of the module-level
        `suffixes` list that it ends with, or `filename` unchanged when
        no entry matches.
    """
    # Try the longest suffix first so that '.exe.gz' wins over '.gz'
    # whatever the order of the `suffixes` list.
    for suffix in sorted(suffixes, key=len, reverse=True):
        # Skip empty entries: every string ends with '', and
        # filename[:-0] would return an empty string.
        if suffix and filename.endswith(suffix):
            return filename[:-len(suffix)]
    return filename
I am trying to use the elements of a list as names for some files that live in a directory. So far I have created a function that retrieves the name of each file in a directory and returns them in a list:
def retrive(directory_path):
    """Collect the file names of the PDFs located directly in a directory.

    Parameters
    ----------
    directory_path : str
        Directory searched with the pattern ``*.pdf`` (not recursive).

    Returns
    -------
    list of str
        File names without their directory part, in sorted path order.
        The list is also printed.
    """
    pdf_pattern = os.path.join(directory_path, '*.pdf')
    # os.path.basename understands the platform's path separator, whereas
    # splitting on '/' leaves the full path untouched on Windows.
    path_names = [
        os.path.basename(pdf_path)
        for pdf_path in sorted(glob.glob(pdf_pattern))
    ]
    print(path_names)
    return path_names
The above function returns in a list the names of each file, then I am writing the files into another directory as follows:
path = os.path.join(new_dir_path, "list%d.txt" % i)
#This is the path of each new file:
#print(path)
with codecs.open(path, "w", encoding='utf8') as filename:
for item in [a_list]:
filename.write(item+"\n")
Finally, my question is: how can I assign as a name of each file, each element of path_names?, something like this line:
path = os.path.join(new_dir_path, "list%d.txt" % i)
I also tried to use the format() function. However, I still can't assign the correct name to each file.
Here's the full script:
def transform_directoy(input_directory, output_directory):
    """Extract the text of every PDF in a directory into numbered files.

    Each ``*.pdf`` in `input_directory` is parsed with tika, in sorted
    path order. The extracted content of the n-th PDF (counting from 0)
    is written, followed by a newline, to ``list<n>.txt`` in
    `output_directory` as UTF-8.
    """
    import codecs
    import glob
    import os

    from tika import parser

    # First pass: parse every PDF and keep only its extracted content.
    pdf_pattern = os.path.join(input_directory, '*.pdf')
    all_texts = [
        parser.from_file(pdf_path)['content']
        for pdf_path in sorted(glob.glob(pdf_pattern))
    ]

    # Second pass: one output file per parsed PDF, named by its position.
    for index, text in enumerate(all_texts):
        target = os.path.join(output_directory, "list%d.txt" % index)
        with codecs.open(target, "w", encoding='utf8') as handle:
            handle.write(text + "\n")
The desired output will consist of the actual names of each processed file.
You’re almost there:
for path_name in path_names:
path = os.path.join(new_dir_path, "list%s.txt" % path_name)
#This is the path of each new file:
#print(path)
with codecs.open(path, "w", encoding='utf8') as f:
for item in [a_list]:
f.write(item+"\n")
Update based on updated code sample. You are using different loops here, and that is not ideal unless you are doing processing in between the two loops. Since I am going to keep that structure, we are going to have to make sure to associate each block of content with the original filename. The best structure for that is a dict, and in case order is important, we use an OrderedDict. Now, when we’re looping over the filename, content pairs in the OrderedDict we’ll want to change the extension of the file to match the new file type. Luckily, python has some nice utilities for file/path manipulation in the os.path module. os.path.basename can be used to strip off the directory from a file and os.path.splitext will strip off an extension from a filename. We use both of those to get just the filename without the extension and then append .txt to designate the new file type. Putting it all together, we get :
def transform_directoy(input_directory, output_directory):
    """Convert every PDF in a directory to a text file of the same name.

    Each ``*.pdf`` in `input_directory` is parsed with tika, in sorted
    path order. Its extracted content, followed by a newline, is written
    as UTF-8 to `output_directory` under the PDF's file name with the
    extension replaced by ``.txt``. A progress line is printed per file.
    """
    import codecs
    import glob
    import os
    from collections import OrderedDict

    from tika import parser

    # First pass: map each PDF's file name (no directory) to its content,
    # remembering the order in which the files were parsed.
    all_texts = OrderedDict()
    for pdf_path in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
        content = parser.from_file(pdf_path)['content']
        all_texts[os.path.basename(pdf_path)] = content

    # Second pass: write each text under the original name with '.txt'.
    for original_filename, text in all_texts.items():
        stem = os.path.splitext(original_filename)[0]
        path = os.path.join(output_directory, stem + '.txt')
        # Print out the name of the file we are processing
        print('Transforming %s => %s' % (original_filename, path,))
        with codecs.open(path, "w", encoding='utf8') as handle:
            handle.write(text + "\n")
Second update: OP asked how I would write this code if this was all that there was, so here goes:
# move imports to top of file: PEP 8
import codecs, glob, os
from tika import parser
def transform_directoy(input_directory, output_directory):
    """Convert every PDF in a directory to a text file, in a single loop.

    Each ``*.pdf`` in `input_directory` is parsed with tika, in sorted
    path order. Its extracted content and a trailing newline are written
    as UTF-8 to `output_directory` under the PDF's file name with the
    extension replaced by ``.txt``. A progress line is printed per file.
    """
    pdf_pattern = os.path.join(input_directory, '*.pdf')
    for pdf_path in sorted(glob.glob(pdf_pattern)):
        parsed_content = parser.from_file(pdf_path)['content']

        # Same file name as the PDF, with the extension swapped for '.txt'.
        original_filename = os.path.basename(pdf_path)
        stem = os.path.splitext(original_filename)[0]
        path = os.path.join(output_directory, stem + '.txt')

        # Print out the name of the file we are processing
        print('Transforming %s => %s' % (original_filename, path,))

        # Each file is written as soon as it has been parsed.
        with codecs.open(path, "w", encoding='utf8') as handle:
            handle.write(parsed_content)
            handle.write("\n")
My code
# Read a file chosen by the user and write an upper-cased copy of it.
specFileName = input("Enter the file path of the program you would like to capslock: ")
inFile = open(specFileName, 'r')
ified = inFile.read().upper()
# The output path is the input path with "UPPER" appended at the very end,
# i.e. after the file extension.
outFile = open(specFileName + "UPPER", 'w')
outFile.write(ified)
outFile.close()
# NOTE(review): inFile was already read to the end above, so this second
# read() returns an empty string; inFile is also never closed.
print(inFile.read())
This is basically made to take in any file, capitalize everything, and put it into a new file called UPPER"filename". How do I add the "UPPER" bit to the filename without it ending up at the very end or the very beginning of the path? It won't work like that because of the rest of the file path at the beginning and the file extension at the end. For example, C:/users/me/directory/file.txt would become C:/users/me/directory/UPPERfile.txt
Look into the methods os.path.split and os.path.splitext from the os.path module.
Also, quick reminder: don't forget to close your "infile".
Depending on exactly how you're trying to do this, there are several approaches.
First of all you probably want to grab just the filename, not the whole path. Do this with os.path.split.
>>> pathname = r"C:\windows\system32\test.txt"
>>> os.path.split(pathname)
('C:\\windows\\system32', 'test.txt')
Then you can also look at os.path.splitext
>>> filename = "test.old.txt"
>>> os.path.splitext(filename)
('test.old', '.txt')
And finally string formatting would be good
>>> test_string = "Hello, {}"
>>> test_string.format("world") + ".txt"
"Hello, world.txt"
Put 'em together and you've probably got something like:
def make_upper(filename, new_filename):
    """Write an upper-cased copy of a text file.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    new_filename : str
        Path of the file to create; an existing file is overwritten.
    """
    with open(filename) as infile:
        data = infile.read()
    # Mode "w" is required here: open() defaults to read-only, which
    # fails if the file is missing and rejects write() otherwise.
    with open(new_filename, "w") as outfile:
        outfile.write(data.upper())
def main():
    """Prompt for a file path and write an upper-cased copy next to it.

    The copy is placed in the same directory as the input file and is
    named like the input file with "UPPER" put in front of the file name.
    """
    source_path = input("What's the path to your file? ")
    # Separate the directory from the file name, then the name from its
    # extension, so the prefix lands on the file name only.
    directory, basename = os.path.split(source_path)
    stem, extension = os.path.splitext(basename)
    target_path = os.path.join(directory, "UPPER{}{}".format(stem, extension))
    make_upper(source_path, target_path)