I want to ignore the paths that generate the error:
'Path does not exist'
when I read parquet files with pyspark. For example, I have a list of paths:
list_paths = ['path1', 'path2', 'path3']
and read the files like:
dataframe = spark.read.parquet(*list_paths)
but the path path2 does not exist. In general, I do not know in advance which paths do not exist, so I want to ignore path2 automatically. How can I do that and obtain a single dataframe?
You can use the Hadoop FS API to check that the files exist before you pass them to spark.read:
conf = sc._jsc.hadoopConfiguration()
Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
filtered_paths = [p for p in list_paths if Path(p).getFileSystem(conf).exists(Path(p))]
dataframe = spark.read.parquet(*filtered_paths)
Where sc is the SparkContext.
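An alternative sketch (not from the answers above): attempt each path individually and union the results, catching the AnalysisException that PySpark raises for a missing path. It avoids the filesystem API at the cost of one read attempt per path, and assumes all files share the same schema.
from functools import reduce
from pyspark.sql.utils import AnalysisException

dfs = []
for p in list_paths:
    try:
        dfs.append(spark.read.parquet(p))
    except AnalysisException:
        pass  # 'Path does not exist' -- skip this path
# union requires matching schemas across the individual dataframes
dataframe = reduce(lambda a, b: a.union(b), dfs)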
Maybe you can do it with os.path.exists (note that this checks the local filesystem only, so it will not work for HDFS or S3 paths):
import os
existing_paths = [path for path in list_paths if os.path.exists(path)]
dataframe = spark.read.parquet(*existing_paths)
Adding to @blackbishop's answer, you can further use Hadoop glob pattern strings to check for files/objects before loading them.
It's also worth noting that spark.read.load() accepts lists of path strings.
from functools import partial
from typing import Iterator
from pyspark.sql import SparkSession

def iterhadoopfiles(spark: SparkSession, path_pattern: str) -> Iterator[str]:
    """Return iterator of object/file paths that match path_pattern."""
    sc = spark.sparkContext
    FileUtil = sc._gateway.jvm.org.apache.hadoop.fs.FileUtil
    Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
    hadoop_config = sc._jsc.hadoopConfiguration()
    p = Path(path_pattern)
    return (
        str(x)
        for x in FileUtil.stat2Paths(
            p.getFileSystem(hadoop_config).globStatus(p)
        )
    )

def pathnotempty(spark: SparkSession, path_pattern: str) -> bool:
    """Return True if path matches at least one object/file."""
    try:
        next(iterhadoopfiles(spark, path_pattern))
    except StopIteration:
        return False
    return True

paths_to_load = list(filter(partial(pathnotempty, spark), ["file:///*.parquet"]))
spark.read.format('parquet').load(paths_to_load)
I am sorry for the low-level question, I am a junior. I am trying to learn Snakemake along with click. Please help me understand, for this example, how I can pass a list of paths to the input of a rule, and get that list in a Python script.
Snakemake:
path_1 = 'data/raw/data2process/'
path_2 = 'data/raw/table.xlsx'

rule:
    input:
        list_of_paths = "list of all paths to .xlsx/.csv/.xls files from path_1",
        other_table = path_2
    output:
        {some .xlsx file}
    shell:
        "script_1.py {input.list_of_paths} {output}"
        "script_2.py {input.other_table} {output}"
script_1.py:
@click.command()
@click.argument("input_list_of_paths", type=*??*)
@click.argument("out_path", type=click.Path())
def foo(input_list_of_paths: list, out_path: str):
    df = pd.DataFrame()
    for path in input_list_of_paths:
        table = pd.read_excel(path)
        **do smthng**
        df = pd.concat([df, table])
    df.to_excel(out_path)
script_2.py:
@click.command()
@click.argument("input_path", type=click.Path(exists=True))
@click.argument("output_path", type=click.Path())
def foo_1(input_path: str, output_path: str):
    table = pd.read_excel(input_path)
    **do smthng**
    table.to_excel(output_path)
Using pathlib and the glob method of a Path object, you could proceed as follows:
from itertools import chain
from pathlib import Path

path_1 = Path('data/raw/data2process/')
exts = ["xlsx", "csv", "xls"]
path_1_path_lists = [
    list(path_1.glob(f"*.{ext}"))
    for ext in exts]
path_1_all_paths = list(chain.from_iterable(path_1_path_lists))
The chain.from_iterable allows you to "flatten" the list of lists, but I'm not sure Snakemake even needs this for the input of its rules.
Then, in your rule:
    input:
        list_of_paths = path_1_all_paths,
        other_table = path_2
I think that Path objects can be used directly. Otherwise, you need to turn them into strings with str:
    input:
        list_of_paths = [str(p) for p in path_1_all_paths],
        other_table = path_2
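As for the click side of script_1.py (the type=*??* part of the question), a minimal sketch: declare the argument with nargs=-1 so that click collects all remaining command-line arguments into a tuple. This assumes the script is invoked as in the shell directive above, with each path expanded to its own argument.
import click
import pandas as pd

@click.command()
@click.argument("input_list_of_paths", nargs=-1, type=click.Path(exists=True))
@click.argument("out_path", type=click.Path())
def foo(input_list_of_paths, out_path):
    # nargs=-1 gathers every path argument back into one tuple
    df = pd.concat(pd.read_excel(p) for p in input_list_of_paths)
    df.to_excel(out_path)

if __name__ == "__main__":
    foo()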
How could I take the following variable
fIn = T1_r.nii.gz
add the suffix _brain, and create the following output filename?
fOut = T1_r_brain.nii.gz
When I use the following command
fIn2, file_extension = os.path.splitext(fIn)
it only removes the .gz extension.
Thank you for your help
Fred
I had to write a utility for this, and here's what I came up with.
from pathlib import Path

def add_str_before_suffixes(filepath, string: str) -> Path:
    """Append a string to a filename immediately before extension(s).

    Parameters
    ----------
    filepath : Path-like
        Path to modify. Can contain multiple extensions like `.bed.gz`.
    string : str
        String to append to filename.

    Returns
    -------
    Instance of `pathlib.Path`.

    Examples
    --------
    >>> add_str_before_suffixes("foo", "_baz")
    PosixPath('foo_baz')
    >>> add_str_before_suffixes("foo.bed", "_baz")
    PosixPath('foo_baz.bed')
    >>> add_str_before_suffixes("foo.bed.gz", "_baz")
    PosixPath('foo_baz.bed.gz')
    """
    filepath = Path(filepath)
    suffix = "".join(filepath.suffixes)
    orig_name = filepath.name.replace(suffix, "")
    new_name = f"{orig_name}{string}{suffix}"
    return filepath.with_name(new_name)
Here is an example:
>>> f_in = "T1_r.nii.gz"
>>> add_str_before_suffixes(f_in, "_brain")
PosixPath('T1_r_brain.nii.gz')
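For the specific two-extension .nii.gz case in the question, a simpler alternative (my addition, not from the answer above) is to call os.path.splitext twice:
import os

f_in = "T1_r.nii.gz"
stem, gz = os.path.splitext(f_in)   # ('T1_r.nii', '.gz')
stem, nii = os.path.splitext(stem)  # ('T1_r', '.nii')
f_out = stem + "_brain" + nii + gz  # 'T1_r_brain.nii.gz'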
split_path = 'T1_r.nii.gz'.split('.')  # ['T1_r', 'nii', 'gz']
split_path[0] += '_brain'              # ['T1_r_brain', 'nii', 'gz']
final_path = ".".join(split_path)      # 'T1_r_brain.nii.gz'
I'm writing a module that will take in an array of strings among other command line arguments. The array would be something like:
['PUPSFF', 'PCASPE', 'PCASEN']
My module has a method that will search for files matching a possible format in a directory:
def search(self, fundCode, type):
    funds_string = '_'.join(fundCode)
    files = set(os.listdir(self.unmappedDir))
    file_match = 'citco_unmapped_{type}_{funds}_{start}_{end}.csv'.format(type=type, funds=funds_string, start=self.startDate, end=self.endDate)
    if file_match in files:
        filename = os.path.join(self.unmappedDir, file_match)
        return self.read_file(filename)
    else:
        Logger.error('No {type} file/s found for {funds}, between {start} and {end}'.format(type=type, funds=fundCode, start=self.startDate, end=self.endDate))
So if my directory has a file like this one:
citco_unmapped_positions_PUPSFF_PCASPE_PCASEN_2018-07-01_2018-07-11.csv
And I pass this array as the cmd line argument: ['PUPSFF', 'PCASPE', 'PCASEN']
After calling my method (and passing in the rest of the self arguments) like this:
positions = alerter.search(alerter.fundCodes, 'positions')
it will search, find that file, and do whatever it needs to do.
However, I want it to be independent of the order, so that it will still find the file if the command line arguments are written like this:
['PCASPE', 'PCASEN', 'PUPSFF'] or
['PCASEN', 'PUPSFF', 'PCASPE'] or whatever
Any ideas on how to go on about this?
Use the all function to check that each of the needed tags is in the file name. This example should get you going:
files = [
    "citco_unmapped_positions_PUPSFF_PCASPE_PCASEN_2018-07-01_2018-07-11.csv",  # yes
    "citco_unmapped_positions_PUPSFF_NO_WAY_PCASEN_2018-07-01_2018-07-11.csv",  # no
    "citco_unmapped_positions_PCASEN_PCASEN_PUPSFF_2018-07-01_2018-07-11.csv",  # no
    "citco_unmapped_positions_PCASPE_PCASEN_PUPSFF_2018-07-01_2018-07-11.csv",  # yes
]
tags = ['PUPSFF', 'PCASPE', 'PCASEN']

for fname in files:
    if all(tag in fname for tag in tags):
        # the file is a match
        print("Match", fname)
Output:
Match citco_unmapped_positions_PUPSFF_PCASPE_PCASEN_2018-07-01_2018-07-11.csv
Match citco_unmapped_positions_PCASPE_PCASEN_PUPSFF_2018-07-01_2018-07-11.csv
I found a possible solution using permutations from itertools:
from itertools import permutations

def search(self, fundCodes, type):
    perms = self.find_permutations(fundCodes)
    files = set(os.listdir(self.unmappedDir))
    for perm in perms:
        fund_codes = '_'.join(perm)
        file_match = 'citco_unmapped_{type}_{funds}_{start}_{end}.csv'.format(type=type, funds=fund_codes, start=self.startDate, end=self.endDate)
        if file_match in files:
            filename = os.path.join(self.unmappedDir, file_match)
            return self.read_file(filename)
    Logger.error('No {type} file/s found for {funds}, between {start} and {end}'.format(type=type, funds=fundCodes, start=self.startDate, end=self.endDate))

def find_permutations(self, codes):
    return list(permutations(codes))
Probably really slow though.
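A lighter-weight alternative (a sketch, not from the answers above): instead of trying every permutation, parse the fund-code section out of each filename and compare it as a set. It assumes the citco_unmapped_<type>_<codes>_<start>_<end>.csv pattern used in the question.
import os

def find_match(unmapped_dir, fund_codes, type, start_date, end_date):
    wanted = set(fund_codes)
    prefix = 'citco_unmapped_{}_'.format(type)
    suffix = '_{}_{}.csv'.format(start_date, end_date)
    for fname in os.listdir(unmapped_dir):
        if fname.startswith(prefix) and fname.endswith(suffix):
            middle = fname[len(prefix):-len(suffix)]  # the fund-code section
            if set(middle.split('_')) == wanted:
                return os.path.join(unmapped_dir, fname)
    return None
This checks each filename once, so it stays linear in the number of files instead of growing factorially with the number of fund codes.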
I need to translate chunks of matlab code into Python. My code seems to be 'unreachable' though. Any idea why this is happening?
Also: am I doing it right? I'm a real newbie.
Matlab code:
function Dir = getScriptDir()
    fullPath = mfilename('fullpath');
    [Dir, ~, ~] = fileparts(fullPath);
end

function [list, listSize] = getFileList(Dir)
    DirResult = dir(Dir);
    list = DirResult(~[DirResult.isdir]);  % select files
    listSize = size(list);
end
My Python code:
def Dir = getScriptDir():
    return os.path.dirname(os.path.realpath(__file__)

def getFileList(Dir):
    list = os.listdir(Dir)
    listSize = len(list)
    getFileList() = [list, listSize]
Your syntax is incorrect. If I'm reading this correctly, you're trying to get the names of the files in the same directory as the script and print the number of files in that list.
Here's an example of how you might do this (based on the program you gave):
import os

def getFileList(directory=os.path.dirname(os.path.realpath(__file__))):
    list = os.listdir(directory)
    listSize = len(list)
    return [list, listSize]

print(getFileList())
Output example:
[['program.py', 'data', 'syntax.py'], 3]
Your function definitions were incorrect. I have modified the code you provided. You can also consolidate the getScriptDir() functionality into the getFileList() function.
import os

def getFileList():
    dir = os.path.dirname(os.path.realpath(__file__))
    list = os.listdir(dir)
    listSize = len(list)
    fileList = [list, listSize]
    return fileList

print(getFileList())
Returns: (in my environment)
[['test.py', 'test.txt', 'test2.py', 'test2.txt', 'test3.py', 'test4.py', 'testlog.txt', '__pycache__'], 8]
Your script functions, including getScriptDir (modified):
import os

def getScriptDir():
    return os.path.dirname(os.path.realpath(__file__))

def getFileList(dir):
    list = os.listdir(dir)
    listSize = len(list)
    fileList = [list, listSize]
    return fileList

dir = getScriptDir()
print(getFileList(dir))
Remember that you need to return variables from a Python function to get their results.
More on how to define your own functions in Python: https://docs.python.org/3/tutorial/controlflow.html#defining-functions
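One detail none of the versions above carries over: the Matlab code filters out directories (~[DirResult.isdir]), while os.listdir returns files and directories alike. A minimal pathlib sketch (my addition) that mirrors the Matlab behaviour:
from pathlib import Path

def getFileList():
    script_dir = Path(__file__).resolve().parent
    files = [p.name for p in script_dir.iterdir() if p.is_file()]  # files only
    return [files, len(files)]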
import os
import fnmatch

filetypes = ("*.jpg", "*.txt", "*.csv")
filelist = []
for root, dirnames, filenames in os.walk("c:\\"):
    for ft in filetypes:
        for f in fnmatch.filter(filenames, ft):
            filelist.append(os.path.join(root, f))
I have this code, which adds to my list only files with the extensions I provide.
1) I want to do the opposite: add all file extensions "*.*" and filter out some I don't need, for example "*.dat", "*.dll", "*.log", "*.exe".
2) I also do not need files from c:\windows, c:\program files, c:\else; can I filter those too?
3) I need it to be fast. I found this example code from another answer that seems to be faster, but what is the main speed issue in this type of function? Is it os.walk? If so, there is the scandir GitHub project, an improved os.walk function that is 7-20 times faster. Or is it the filtering of file matches by extension? I want to filter 20+ extensions; any suggestions?
import os

extns = ('.jpg', '.jpeg', '.png', '.tif', '.tiff')
matches = []
for root, dirnames, fns in os.walk("C:\\"):
    matches.extend(os.path.join(root, fn) for fn in fns if fn.lower().endswith(extns))
Your help is very much appreciated
#!/usr/bin/python2.7
import os
import sys
import re
import fnmatch

def findit(root, exclude_files=[], exclude_dirs=[]):
    exclude_files = (fnmatch.translate(i) for i in exclude_files)
    exclude_files = '(' + ')|('.join(exclude_files) + ')'
    exclude_files = re.compile(exclude_files)
    exclude_dirs = (os.path.normpath(i) for i in exclude_dirs)
    exclude_dirs = (os.path.normcase(i) for i in exclude_dirs)
    exclude_dirs = set(exclude_dirs)
    return (os.path.join(r, f)
            for r, _, files in os.walk(root)
            if os.path.normpath(os.path.normcase(r)) not in exclude_dirs
            for f in files
            if not exclude_files.match(os.path.normcase(f)))

if __name__ == '__main__':
    # If you need the entire list in memory at once:
    filelist = list(findit('c:/',
        exclude_files=['*.dll', '*.dat', '*.log', '*.exe'],
        exclude_dirs=['c:/windows', 'c:/program files', 'c:/else'],
    ))
    # Or this, if you need the items one at a time (saves memory):
    for filename in findit('c:/',
        exclude_files=['*.dll', '*.dat', '*.log', '*.exe'],
        exclude_dirs=['c:/windows', 'c:/program files', 'c:/else'],
    ):
        print filename  # or stat() or open() the file, or whatever
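One caveat of the above: it only skips files that sit directly in an excluded directory and still descends into that directory's subtree. A sketch of a pruning variant (my addition, reusing the paths and extensions from the question): trimming dirnames in place makes os.walk skip excluded trees entirely, which is usually the bigger win on a large drive.
import os

EXCLUDE_EXTS = ('.dll', '.dat', '.log', '.exe')
EXCLUDE_DIRS = {os.path.normcase(os.path.normpath(p))
                for p in ('c:/windows', 'c:/program files', 'c:/else')}

def findit_pruned(root):
    for r, dirnames, filenames in os.walk(root):
        # prune excluded directories so os.walk never descends into them
        dirnames[:] = [d for d in dirnames
                       if os.path.normcase(os.path.normpath(os.path.join(r, d)))
                       not in EXCLUDE_DIRS]
        for fn in filenames:
            if not fn.lower().endswith(EXCLUDE_EXTS):
                yield os.path.join(r, fn)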
I've tried to write a more flexible os.walk to filter dirs and files with a regex, but using pathlib2 functions.
Here it is.
import re
from pathlib2 import Path
def _recursedir_(self, depth=-1, exclude=None, invert=False):
    '''
    Parameters
    ----------
    depth : int, optional
        Depth to stop at; if less than 0, don't stop. The default is -1.
    exclude : compiled regex expression, or dict of regex expressions with keys 'dir', 'file', optional
        Regex to match the current dir/file name against. The default is None.
    invert : bool, or dict of bools with keys 'dir', 'file', optional
        Invert the sense of the filter; the default is to skip filter matches. The default is False.

    Yields
    ------
    Path
        Current dir path.
    List of dirs
        Subdirs in current path.
    List of files
        Files in current path.
    '''
    if type(exclude) is dict:
        dfilt = exclude['dir']
        ffilt = exclude['file']
    else:
        dfilt = exclude
        ffilt = None  # means show all files in current path
    if type(invert) is dict:
        dsens = invert['dir']
        fsens = invert['file']
    else:
        dsens = invert
        fsens = False  # means skip files that match
    if dfilt is None:
        dfun = lambda x: True
    elif dsens is False:
        dfun = lambda x: not dfilt.match(x.name)  # filter match EXCLUDES
    else:
        dfun = lambda x: dfilt.match(x.name)  # filter match INCLUDES
    if ffilt is None:
        ffun = lambda x: True
    elif fsens is False:
        ffun = lambda x: not ffilt.match(x.name)  # filter match EXCLUDES
    else:
        ffun = lambda x: ffilt.match(x.name)  # filter match INCLUDES
    d = self.resolve()
    dd = [x for x in d.iterdir() if x.is_dir()]
    f = [x for x in d.iterdir() if x.is_file()]
    dd[:] = [x for x in filter(dfun, dd)]
    f[:] = [x for x in filter(ffun, f)]
    yield (d, dd, f)
    if depth > 0 or depth < 0:
        for xd in dd:
            yield from _recursedir_(d / xd, depth=depth - 1, exclude=exclude, invert=invert)

Path.recursedir = _recursedir_
So the call to this method, given a Path variable, is e.g.
for d, dlist, flist in Path.home().recursedir(depth=3, exclude=re.compile(r'\d+')):
    print(f"{d} : {dlist} : {flist}")
This will (should!) limit the depth to 3 and exclude all dirs whose names begin with digits (passing invert=True would instead keep only the matching dirs).
So for matching multiple file extensions, one would use
exclude = {'dir': None, 'file': re.compile(r'.*\.(?:jpg|jpeg|txt)')}
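A hypothetical usage sketch (my addition, assuming the Path.recursedir patch above has been applied): pairing that dict with invert={'file': True} keeps only the matching files rather than skipping them.
import re
from pathlib2 import Path

flt = {'dir': None, 'file': re.compile(r'.*\.(?:jpg|jpeg|txt)$')}
inv = {'dir': False, 'file': True}  # keep files that match, skip the rest
for d, dlist, flist in Path.home().recursedir(depth=3, exclude=flt, invert=inv):
    for f in flist:
        print(f)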