change urlparse.path of a url - python

Here is the python code:
url = http://www.phonebook.com.pk/dynamic/search.aspx
path = urlparse(url)
print (path)
>>>ParseResult(scheme='http', netloc='www.phonebook.com.pk', path='/dynamic/search.aspx', params='', query='searchtype=cat&class_id=4520&page=1', fragment='')
print (path.path)
>>>/dynamic/search.aspx
Now I need to reduce path.path to just its directory part. For example, if the path is "/dynamic/search.aspx", I only need the part between the first slash and the last slash, including both slashes, which is "/dynamic/".
I have tried these two lines, but the end result is not what I expected; that is why I am asking this question, as my knowledge of "urllib.parse" is insufficient.
path = path.path[:path.path.index("/")]
print (path)
>>>Returns nothing.
path = path.path[path.path.index("/"):]
>>>/dynamic/search.aspx (as it was before, no change.)
In short, whatever path.path contains, I need the directory names only. For example, for "dynamic/search/search.aspx" I need "dynamic/search/".

First, the desired part of the path can be obtained using rfind which returns the index of the last occurrence. The + 1 is for keeping the trailing slash.
desired_path = path.path[:path.path.rfind("/") + 1]
Second, use the _replace method to replace the path attribute of the urlparse object as follows:
desired_url = urlunparse(path._replace(path=desired_path))
The full working example:
from urllib.parse import urlparse, urlunparse
url = "http://www.phonebook.com.pk/dynamic/search/search.aspx"
path = urlparse(url)
desired_path = path.path[:path.path.rfind("/") + 1]
desired_url = urlunparse(path._replace(path=desired_path))

I looked through urlparse for a method that could help in your situation but didn't find one (I may have overlooked it). At this level you will probably have to write your own method or hack:
>>> path.path
'/dynamic/search.aspx'
>>> import re
>>> d = re.search(r'/.*/', path.path)
>>> d.group(0)
'/dynamic/'
This is just an example for you; you may also use built-in methods, like so:
>>> i = path.path.index('/', 1)
>>>
>>> path.path[:i+1]
'/dynamic/'
EDIT:
I didn't notice your last example, so here is another way:
>>> import os
>>> path = os.path.dirname(path.path) + os.sep
>>> path
'/dynamic/'
>>> path = os.path.dirname(s) + os.sep
>>> path
'dynamic/search/'
Or with re:
>>> s
'dynamic/search/search.aspx'
>>> d = re.search(r'.*/', s)
>>> d
<_sre.SRE_Match object; span=(0, 15), match='dynamic/search/'>
>>> d.group(0)
'dynamic/search/'
>>>
>>> s = '/dynamic/search.aspx'
>>> d = re.search(r'.*/', s)
>>> d.group(0)
'/dynamic/'

Related

add text to a path by adding text to it with python (FTP Path NCBI)

I'm in a bind. Do you know if I can duplicate the last folder in the path and add "_genomic.fna.gz" to it? For example, how do I change from this
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/316/945/GCA_001316945.3_ASM131694v3
to this :
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/316/945/GCA_001316945.3_ASM131694v3/GCA_001316945.3_ASM131694v3_genomic.fna.gz
Thanks
urllib.parse.urlparse can split your URL into parts we can work with.
posixpath.join can help us build the full path.
urllib.parse.urlunparse can help us get a complete URL back.
>>> import urllib.parse
>>> import posixpath
>>> url = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/316/945/GCA_001316945.3_ASM131694v3"
>>> parts = urllib.parse.urlparse(url)
>>> parts.path
'/genomes/all/GCA/001/316/945/GCA_001316945.3_ASM131694v3'
>>> posixpath.basename(parts.path)
'GCA_001316945.3_ASM131694v3'
>>> suffix = "_genomic.fna.gz"
>>> prefix = posixpath.basename(parts.path)
>>> print(prefix+suffix)
GCA_001316945.3_ASM131694v3_genomic.fna.gz
>>> path = posixpath.join(parts.path, prefix+suffix)
>>> path
'/genomes/all/GCA/001/316/945/GCA_001316945.3_ASM131694v3/GCA_001316945.3_ASM131694v3_genomic.fna.gz'
>>> ret = parts._replace(path=path)
>>> print(urllib.parse.urlunparse(ret))
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/316/945/GCA_001316945.3_ASM131694v3/GCA_001316945.3_ASM131694v3_genomic.fna.gz
>>>

How to get names in string?

Get last name of the directory:
str = "/folderA/folderB/folderC/folderD"
editstr = str.split("/")[-1]
print(editstr)
folderD
How do I get all the directories before folderD (without the last slash)? E.g:
editstr = ???
print(editstr)
/folderA/folderB/folderC
There is a module for this.
>>> import os
>>> s = "/folderA/folderB/folderC/folderD"
>>> os.path.basename(s)
'folderD'
>>> os.path.dirname(s)
'/folderA/folderB/folderC'
You may use str.rsplit():
>>> editstr = str.rsplit('/folderD', 1)[0]
>>> print(editstr)
/folderA/folderB/folderC

How to get rid of extensions from file basename using python

I have got the complete path of files in a list like this:
a = ['home/robert/Documents/Workspace/datafile.xlsx', 'home/robert/Documents/Workspace/datafile2.xls', 'home/robert/Documents/Workspace/datafile3.xlsx']
what I want is to get just the file NAMES without their extensions, like:
b = ['datafile', 'datafile2', 'datafile3']
What I have tried is:
xfn = re.compile(r'(\.xls)+')
for name in a:
fp, fb = os.path.split(fp)
ofn = xfn.sub('', name)
b.append(ofn)
But it results in:
b = ['datafilex', 'datafile2', 'datafile3x']
The regex you've used is wrong. (\.xls)+ matches strings of the form .xls, .xls.xls, etc. This is why there is a remaining x in the .xlsx items. What you want is \.xls.*, i.e. a .xls followed by zero or more of any characters.
You don't really need to use regex. There are specialized functions in os.path that deal with this: basename and splitext.
>>> import os.path
>>> os.path.basename('home/robert/Documents/Workspace/datafile.xlsx')
'datafile.xlsx'
>>> os.path.splitext(os.path.basename('home/robert/Documents/Workspace/datafile.xlsx'))[0]
'datafile'
so, assuming you don't really care about the .xls/.xlsx suffix, your code can be as simple as:
>>> a = ['home/robert/Documents/Workspace/datafile.xlsx', 'home/robert/Documents/Workspace/datafile2.xls', 'home/robert/Documents/Workspace/datafile3.xlsx']
>>> [os.path.splitext(os.path.basename(fn))[0] for fn in a]
['datafile', 'datafile2', 'datafile3']
(also note the list comprehension.)
Oneliner:
>>> filename = 'file.ext'
>>> '.'.join(filename.split('.')[:-1]) if '.' in filename else filename
'file'
This is a repeat of:
How to get the filename without the extension from a path in Python?
https://docs.python.org/3/library/os.path.html
In Python 3 you can use pathlib ("The pathlib module offers high-level path objects."), so:
>>> from pathlib import Path
>>> p = Path("/a/b/c.txt")
>>> print(p.with_suffix(''))
\a\b\c
>>> print(p.stem)
c
Why not just use the split method?
def get_filename(path):
    """Get a filename (without its extension) from a provided path.

    Only the last extension is removed, so ``data.v2.xlsx`` becomes
    ``data.v2``. Names without a dot, and dot-files such as ``.bashrc``,
    are returned unchanged.
    """
    # Everything after the last "/" is the file name (the whole string
    # when the path contains no "/").
    basename = path.rsplit('/', 1)[-1]
    stem, dot, _extension = basename.rpartition('.')
    # No dot means there is no extension to strip; an empty stem means the
    # last dot is the leading one of a hidden file, so keep the name intact.
    return stem if dot and stem else basename
>>> path = '/home/robert/Documents/Workspace/datafile.xlsx'
>>> filename = get_filename(path)
>>> filename
'datafile'

How to get only the last part of a path in Python?

In python, suppose I have a path like this:
/folderA/folderB/folderC/folderD/
How can I get just the folderD part?
Use os.path.normpath, then os.path.basename:
>>> os.path.basename(os.path.normpath('/folderA/folderB/folderC/folderD/'))
'folderD'
The first strips off any trailing slashes, the second gives you the last part of the path. Using only basename gives everything after the last slash, which in this case is ''.
With python 3 you can use the pathlib module (pathlib.PurePath for example):
>>> import pathlib
>>> path = pathlib.PurePath('/folderA/folderB/folderC/folderD/')
>>> path.name
'folderD'
If you want the last folder name where a file is located:
>>> path = pathlib.PurePath('/folderA/folderB/folderC/folderD/file.py')
>>> path.parent.name
'folderD'
You could do
>>> import os
>>> os.path.basename('/folderA/folderB/folderC/folderD')
UPDATE1: Note that if you give this approach /folderA/folderB/folderC/folderD/xx.py, it returns xx.py as the basename, which is probably not what you want. So you could do this:
>>> import os
>>> path = "/folderA/folderB/folderC/folderD"
>>> if os.path.isdir(path):
dirname = os.path.basename(path)
UPDATE2: As lars pointed out, making changes so as to accommodate a trailing '/'.
>>> from os.path import normpath, basename
>>> basename(normpath('/folderA/folderB/folderC/folderD/'))
'folderD'
Here is my approach:
>>> import os
>>> print os.path.basename(
os.path.dirname('/folderA/folderB/folderC/folderD/test.py'))
folderD
>>> print os.path.basename(
os.path.dirname('/folderA/folderB/folderC/folderD/'))
folderD
>>> print os.path.basename(
os.path.dirname('/folderA/folderB/folderC/folderD'))
folderC
I was searching for a solution to get the name of the last folder in which a file is located; I just used split twice to get the right part. It's not exactly the question, but Google brought me here.
pathname = "/folderA/folderB/folderC/folderD/filename.py"
head, tail = os.path.split(os.path.split(pathname)[0])
print(head + " " + tail)
I like the parts method of Path for this:
grandparent_directory, parent_directory, filename = Path(export_filename).parts[-3:]
log.info(f'{t: <30}: {num_rows: >7} Rows exported to {grandparent_directory}/{parent_directory}/{filename}')
If you use the native python package pathlib it's really simple.
>>> from pathlib import Path
>>> your_path = Path("/folderA/folderB/folderC/folderD/")
>>> your_path.stem
'folderD'
Suppose you have the path to a file in folderD.
>>> from pathlib import Path
>>> your_path = Path("/folderA/folderB/folderC/folderD/file.txt")
>>> your_path.name
'file.txt'
>>> your_path.parent.name
'folderD'
During my current projects, I'm often passing rear parts of a path to a function and therefore use the Path module. To get the n-th part in reverse order, I'm using:
from typing import Union
from pathlib import Path
def get_single_subpath_part(base_dir: Union[Path, str], n: int) -> str:
    """Return the name of the n-th path component, counted from the end.

    ``n == 0`` gives the last component, ``n == 1`` the one before it, and
    so on. If ``n`` exceeds the number of named components, the result is
    an empty string.

    Raises:
        ValueError: if ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    # Convert once so that str and Path inputs share the same code path.
    current = Path(base_dir)
    for _ in range(n):
        current = current.parent
    return current.name
path= "/folderA/folderB/folderC/folderD/"
# for getting the last part:
print(get_single_subpath_part(path, 0))
# yields "folderD"
# for the second last
print(get_single_subpath_part(path, 1))
#yields "folderC"
Furthermore, to get the last n + 1 parts of a path (the n-th part from the end together with everything after it), I use:
from typing import Union
from pathlib import Path
def get_n_last_subparts_path(base_dir: Union[Path, str], n: int) -> Path:
    """Return the last ``n + 1`` components of *base_dir* as a new Path.

    ``n == 0`` gives only the final component, ``n == 1`` the final two,
    and so on. If ``n + 1`` is at least the number of components, the
    whole path is returned.

    Raises:
        ValueError: if ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    parts = Path(base_dir).parts
    # Slicing from -(n + 1) keeps the n-th part from the end and all parts
    # that follow it.
    return Path(*parts[-(n + 1):])
path= "/folderA/folderB/folderC/folderD/"
# for getting the last part:
print(get_n_last_subparts_path(path, 0))
# yields a `Path` object of "folderD"
# for second last and last part together
print(get_n_last_subparts_path(path, 1))
# yields a `Path` object of "folderC/folderD"
Note that this function returns a Path object, which can easily be converted to a string (e.g. str(path)).
path = "/folderA/folderB/folderC/folderD/"
# Strip the trailing slash first; otherwise the final element of the split
# is the empty string after the last "/" rather than the folder name.
last = path.rstrip('/').split('/').pop()
str = "/folderA/folderB/folderC/folderD/"
print str.split("/")[-2]

read string backwards and terminate at first '/'

I want to extract just the file name portion of a path. My code below works, but I'd like to know what the better (pythonic) way of doing this is.
filename = ''
tmppath = '/dir1/dir2/dir3/file.exe'
for i in reversed(tmppath):
if i != '/':
filename += str(i)
else:
break
a = filename[::-1]
print a
Try:
#!/usr/bin/python
import os.path
path = '/dir1/dir2/dir3/file.exe'
name = os.path.basename(path)
print name
you'd be better off using standard library for this:
>>> tmppath = '/dir1/dir2/dir3/file.exe'
>>> import os.path
>>> os.path.basename(tmppath)
'file.exe'
Use os.path.basename(..) function.
>>> import os
>>> path = '/dir1/dir2/dir3/file.exe'
>>> path.split(os.sep)
['', 'dir1', 'dir2', 'dir3', 'file.exe']
>>> path.split(os.sep)[-1]
'file.exe'
>>>
The existing answers are correct for your "real underlying question" (path manipulation). For the question in your title (generalizable to other characters of course), what helps there is the rsplit method of strings:
>>> s='some/stuff/with/many/slashes'
>>> s.rsplit('/', 1)
['some/stuff/with/many', 'slashes']
>>> s.rsplit('/', 1)[1]
'slashes'
>>>

Categories