Is there a builtin function to get a URL like this: ../images.html given a base URL like http://www.example.com/faq/index.html and a target URL such as http://www.example.com/images.html?
I checked the urlparse module. What I want is the counterpart of the urljoin() function.
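For reference, urljoin() goes in the opposite direction, so whatever the counterpart produces should round-trip through it:
>>> import urlparse
>>> urlparse.urljoin('http://www.example.com/faq/index.html', '../images.html')
'http://www.example.com/images.html'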
You could use urlparse.urlparse to find the paths, and the posixpath version of os.path.relpath to find the relative path.
(Warning: This works for Linux, but may not for Windows):
import urlparse
import posixpath

def relurl(target, base):
    base = urlparse.urlparse(base)
    target = urlparse.urlparse(target)
    if base.netloc != target.netloc:
        raise ValueError('target and base netlocs do not match')
    base_dir = '.' + posixpath.dirname(base.path)
    target = '.' + target.path
    return posixpath.relpath(target, start=base_dir)

tests = [
    ('http://www.example.com/images.html', 'http://www.example.com/faq/index.html', '../images.html'),
    ('http://google.com', 'http://google.com', '.'),
    ('http://google.com', 'http://google.com/', '.'),
    ('http://google.com/', 'http://google.com', '.'),
    ('http://google.com/', 'http://google.com/', '.'),
    ('http://google.com/index.html', 'http://google.com/', 'index.html'),
    ('http://google.com/index.html', 'http://google.com/index.html', 'index.html'),
]

for target, base, answer in tests:
    try:
        result = relurl(target, base)
    except ValueError as err:
        print('{t!r},{b!r} --> {e}'.format(t=target, b=base, e=err))
    else:
        if result == answer:
            print('{t!r},{b!r} --> PASS'.format(t=target, b=base))
        else:
            print('{t!r},{b!r} --> {r!r} != {a!r}'.format(
                t=target, b=base, r=result, a=answer))
The first solution that comes to mind is:
>>> import os.path
>>> os.path.relpath('/images.html', os.path.dirname('/faq/index.html'))
'../images.html'
Of course, this requires URL parsing -> domain name comparison (!!) -> path rewriting if that's the case -> re-adding query and fragment.
Edit: a more complete version
import urlparse
import posixpath

def relative_url(destination, source):
    u_dest = urlparse.urlsplit(destination)
    u_src = urlparse.urlsplit(source)

    _uc1 = urlparse.urlunsplit(u_dest[:2] + ('', '', ''))
    _uc2 = urlparse.urlunsplit(u_src[:2] + ('', '', ''))

    if _uc1 != _uc2:
        ## This is a different domain
        return destination

    _relpath = posixpath.relpath(u_dest.path, posixpath.dirname(u_src.path))
    return urlparse.urlunsplit(('', '', _relpath, u_dest.query, u_dest.fragment))
Then
>>> relative_url('http://www.example.com/images.html', 'http://www.example.com/faq/index.html')
'../images.html'
>>> relative_url('http://www.example.com/images.html?my=query&string=here#fragment', 'http://www.example.com/faq/index.html')
'../images.html?my=query&string=here#fragment'
>>> relative_url('http://www.example.com/images.html', 'http://www2.example.com/faq/index.html')
'http://www.example.com/images.html'
>>> relative_url('https://www.example.com/images.html', 'http://www.example.com/faq/index.html')
'https://www.example.com/images.html'
Edit: now using the posixpath implementation of os.path to make it work under Windows too.
import itertools
import urlparse

def makeRelativeUrl(sourceUrl, targetUrl):
    '''
    :param sourceUrl: a string
    :param targetUrl: a string
    :return: the path to target url relative to first or targetUrl if at different net location
    '''
    # todo test
    parsedSource = urlparse.urlparse(sourceUrl)
    parsedTarget = urlparse.urlparse(targetUrl)
    if parsedSource.netloc == parsedTarget.netloc:
        # if target is on same path but lower than source url
        if parsedTarget.path.startswith(parsedSource.path):
            return parsedTarget.path.replace(parsedSource.path, '.')
        # on same path
        elif parsedTarget.path.rsplit('/', 1)[0] == parsedSource.path.rsplit('/', 1)[0]:
            return './' + parsedTarget.path.rsplit('/', 1)[1]
        # same netloc, varying paths
        else:
            path = ''
            upCount = 0
            for item in itertools.izip_longest(parsedSource.path.split('/'), parsedTarget.path.split('/')):
                if item[0] == item[1]:
                    pass
                else:
                    if item[0] is not None:
                        upCount += 1
                    if item[1] is not None:
                        path += item[1] + '/'
            return upCount * '../' + path
    else:
        return targetUrl

if __name__ == '__main__':
    '''
    "tests" :p
    '''
    url1 = 'http://coolwebsite.com/questions/bobobo/bo/bo/1663807/how-can-i-iterate-through-two-lists-in-parallel-in-python'
    url2 = 'http://coolwebsite.com/questions/126524/iterate-a-list-with-indexes-in-python'
    print url1
    print url2
    print 'second relative to first:'
    print makeRelativeUrl(url1, url2)

    url1 = 'http://coolwebsite.com/questions/1663807/how-can-i-iterate-through-two-lists-in-parallel-in-python'
    url2 = 'http://coolwebsite.com/questions/1663807/bananas'
    print url1
    print url2
    print 'second relative to first:'
    print makeRelativeUrl(url1, url2)

    url1 = 'http://coolwebsite.com/questions/1663807/fruits'
    url2 = 'http://coolwebsite.com/questions/1663807/fruits/berries/bananas'
    print url1
    print url2
    print 'second relative to first:'
    print makeRelativeUrl(url1, url2)
Run 'tests' to see if it works :P
I'm trying to pass JSON through terminal parameters:
import json
import sys
import requests
import platform
from winreg import *
from pathlib import Path
import datetime

def show_gifs(giffavs, folder):
    value = json.loads(giffavs)["_state"]["favorites"]
    print(f'found {len(value)} faved gifs')
    for count, el in enumerate(value):
        print(f'➡{count}: {el["url"]}')
        if el["url"].split('.')[-1] != 'gif':
            el["url"] += '.gif'
        download_gif(el["url"], folder)
    ...

if __name__ == '__main__':
    # set time value and get JSON from args if they exist
    now = datetime.datetime.now()
    data = str(sys.argv[1])  # here I get my JSON data; it used to be a hard-coded str value but now I want to take it from the given parameters
    # look for the right folder and create it
    if len(sys.argv) <= 3:
        if len(sys.argv) == 3:
            folder = sys.argv[2]
        if platform.system() == 'Windows':
            with OpenKey(HKEY_CURRENT_USER, r'SOFTWARE\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders') as key:
                folder = QueryValueEx(key, '{374DE290-123F-4565-9164-39C4925E467B}')[0]
        else:
            folder = str(Path.home()) + '/Downloads'
        folder += '/DiscordFavoriteGif(' + now.strftime("%d") + '.' + now.strftime("%b") + '.' + now.strftime(
            "%Y") + '_' + now.strftime("%H") + now.strftime("%M") + now.strftime("%S") + ')'
        Path(folder).mkdir(parents=True, exist_ok=True)
        show_gifs(data, folder)
    else:
        print('too many arguments')
When I write my JSON manually into the data variable
data = 'MYJSON'
it works, but when I try to pass the same data from my terminal
python DownloadFavoriteGIF.py MYJSON
I get a json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
which is weird because it's the exact same data. Do you know where it could come from?
EDIT
My JSON looks like this (I just removed the URLs for privacy reasons):
{"_state":{"favorites":[{"url":"url","src":" url","width":1032,"height":1000,"format":"IMAGE"},{"url":" url","src":" url ","width":247,"height":387,"format":"IMAGE"},{"url":" url ","src":" url ","width":640,"height":358,"format":"VIDEO"},{"url":" url ","src":" url ","width":640,"height":516,"format":"VIDEO"},{"width":632,"height":640,"src":" url ","format":"VIDEO"}],"timesFavorited":6},"_version":2}
The problem lies somewhere in how I'm parsing and/or reassembling URLs. I'm losing the ?id=1 and getting ?d=1.
What I am trying to do is have the ability to manipulate the query parameters and reassemble the URL before sending it back out modified. Meaning the dictionary would be modified and then, using urlencode(modified_dict), I would reassemble the url + query.
Can someone give me a pointer on what I'm doing wrong here?
from urlparse import parse_qs, urlparse, urlsplit
from urllib import urlencode
import os
import sys
import mechanize
from collections import OrderedDict
import urllib2

scrape_post_urls = []
get_inj_tests = []

# check multiple values to strip out duplicate and useless checks
def parse_url(url):
    parsed = urlparse(url, allow_fragments=False)
    if parsed.query:
        if url not in get_inj_tests:
            get_inj_tests.append(url)
            #print url
        '''get_inj_tests.append(url)
        print url
        #print 'scheme  :', parsed.scheme
        #print 'netloc  :', parsed.netloc
        print 'path    :', parsed.path
        print 'params  :', parsed.params
        print 'query   :', parsed.query
        print 'fragment:', parsed.fragment
        #print 'hostname:', parsed.hostname, '(netloc in lower case)'
        #print 'port    :', parsed.port
        '''
    else:
        if url not in scrape_post_urls:
            scrape_post_urls.append(url)
            #print url

def main():
    unparsed_urls = open('in.txt', 'r')
    for urls in unparsed_urls:
        try:
            parse_url(urls)
        except:
            pass

    print(len(scrape_post_urls))
    print(len(get_inj_tests))

    clean_list = list(OrderedDict.fromkeys(get_inj_tests))
    reaasembled_url = ""
    #print clean_list
    for query_test in clean_list:
        url_object = urlparse(query_test, allow_fragments=False)
        # parse query parameters
        url = query_test.split("?")[1]
        dicty = {x[0]: x[1] for x in [x.split("=") for x in url[1:].split("&")]}
        query_pairs = [(k, v) for k, vlist in dicty.iteritems() for v in vlist]
        reaasembled_url = "http://" + str(url_object.netloc) + str(url_object.path) + '?'
        reaasembled_query = urlencode(query_pairs)
        full_url = reaasembled_url + reaasembled_query
        print dicty

main()
Can someone give me a pointer on what I'm doing wrong here?
Well, quite simply, you're not using the existing tools:
1/ to parse a query string, use urllib.parse.parse_qsl().
2/ to reassemble the query string, use urllib.parse.urlencode().
And forget about dicts: query strings can have multiple values for the same key, i.e. ?foo=1&foo=2 is perfectly valid.
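A minimal sketch of that round trip, using the Python 2 module names to match the question (these functions live in urllib.parse on Python 3); the URL is made up for illustration:
from urlparse import parse_qsl, urlparse, urlunparse
from urllib import urlencode

url = 'http://example.com/page?foo=1&foo=2&id=1'
parts = urlparse(url)
pairs = parse_qsl(parts.query)  # [('foo', '1'), ('foo', '2'), ('id', '1')], duplicates preserved
pairs.append(('new', 'value'))  # manipulate as a list of pairs, not a dict
print urlunparse(parts._replace(query=urlencode(pairs)))
# http://example.com/page?foo=1&foo=2&id=1&new=value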
First of all, url is a bad name for the params variable; it creates confusion.
>>> url = "https://url.domian.com?id=22&param1=1&param2=2".split("?")[1]
>>> url
'id=22&param1=1&param2=2'
>>> "https://url.domian.com?id=22&param1=1&param2=2".split("?")[1].split("&")
['id=22', 'param1=1', 'param2=2']
The error is in url[1:].split("&"): the [1:] slice drops the first character of the query string, which is why ?id=1 comes back as ?d=1.
Solution:
>>> dicty = {x[0]: x[1] for x in [x.split("=") for x in url.split("&")]}
>>> dicty
{'id': '22', 'param1': '1', 'param2': '2'}
NOTE: There is no fixed URL; I cannot assume the URL always looks the same. I want code that works for all such URLs.
For example, given
http://januapp.com/demo/search.php?search=aaa
http://januapp.com/demo/search.php?other=aaa
I want to change them to
http://januapp.com/demo/search.php?search=bbb
http://januapp.com/demo/search.php?other=bbb
How can I do it?
I tried this:
import optparse
import requests
import urlparse

parser = optparse.OptionParser()
parser.add_option("-t", "--Host", dest="Target", help="Please provide the target", default="true")
options, args = parser.parse_args()
url = options.Target
xss = []
xss.append("bbb")

try:
    url2 = urlparse.urlparse(url)
    print url2
    url3 = urlparse.parse_qs(url2.query)
    parametervalue = [key for key, key in url3.iteritems()]  # [['aaa']]
    parsed = parametervalue.append(xss[0])
    print parsed
    finalurl = urljoin(url, parsed)
    print finalurl
except Exception as e:
    print e
So when I pass this:
xss3.py -t http://januapp.com/demo/search.php?search=aaa
this error shows up on the cmd:
ParseResult(scheme='http', netloc='januapp.com', path='/demo/search.php', params='', query='search=aaa', fragment='')
None
name 'urljoin' is not defined
See the None?
Now that's the problem.
I am using Python 2.7.
Thank you very much. I hope you get the problem.
You can try something with this kind of approach.
url = 'http://januapp.com/demo/search.php?search=aaa'

# First get all your query params
arr = url.split('?')
base_url = arr[0]  # This is your base url, i.e. 'http://januapp.com/demo/search.php'
params = arr[1]    # here is your query string, i.e. 'search=aaa'

# Now separate out all the query parameters and their values
arr2 = params.split("=")  # This will give you something like this: ['search', 'aaa']; the value is next to its key

# This is a dictionary to hold the key value pairs
param_value_dict = {}  # {'search': 'aaa'}
for i, part in enumerate(arr2):
    if i % 2 == 0:
        param_value_dict[part] = arr2[i + 1]

# now if you want to change the value of search from 'aaa' to 'bbb', just change it in the dictionary
param_value_dict['search'] = 'bbb'

# now form the new url from the dictionary
new_url = base_url + '?'
for param_name, param_value in param_value_dict.items():
    new_url = new_url + param_name + "=" + param_value + "&"

# remove the extra '&'
new_url = new_url[:len(new_url) - 1]
print(new_url)
How about:
ext = "bbb"
a = "http://januapp.com/demo/search.php?search="
print a + ext
where ext is what you want to search for, a is the link, and you just add them together.
Or you could replace values like this:
ext = "bbb"
a = "http://januapp.com/demo/search.php?search=aaa"
print a.replace('aaa', ext)
Using regex:
import re

ext = "bbb"
a = "http://januapp.com/demo/search.php?search=aaa"
b = re.compile(r".+search=")
print re.search(b, a).group() + ext
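Since the parameter name varies, here is a sketch (Python 2, to match the question) that rewrites whatever query parameters are present, using urlparse and urlencode; the helper name replace_query_values is made up for illustration. Note that urljoin, which caused the NameError above, lives in the urlparse module and is not needed here:
import urlparse
import urllib

def replace_query_values(url, new_value):
    # Parse the URL, set every query parameter to new_value, then reassemble.
    parts = urlparse.urlparse(url)
    params = urlparse.parse_qs(parts.query)
    query = urllib.urlencode(dict((key, new_value) for key in params))
    return urlparse.urlunparse(parts._replace(query=query))

print replace_query_values('http://januapp.com/demo/search.php?search=aaa', 'bbb')
# http://januapp.com/demo/search.php?search=bbb
print replace_query_values('http://januapp.com/demo/search.php?other=aaa', 'bbb')
# http://januapp.com/demo/search.php?other=bbb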
I would like to write a Python function that is capable of taking a file path, like:
/abs/path/to/my/file/file.txt
And returning three string variables:
/abs - the root directory, plus the "top-most" directory in the path
file - the "bottom-most" directory in the path; the parent of file.txt
path/to/my - everything in between the top- and bottom-most directories in the path
So something with the following pseudo-code:
def extract_path_segments(file):
    absPath = get_abs_path(file)
    top = substring(absPath, 0, str_post(absPath, "/", FIRST))
    bottom = substring(absPath, 0, str_post(absPath, "/", LAST))
    middle = str_diff(absPath, top, bottom)
    return (top, middle, bottom)
Thanks in advance for any help here!
You are looking for os.sep, together with various os.path module functions. Simply split the path by that character, then re-assemble the parts you want to use. Something like:
import os

def extract_path_segments(path, sep=os.sep):
    path, filename = os.path.split(os.path.abspath(path))
    bottom, rest = path[1:].split(sep, 1)
    bottom = sep + bottom
    middle, top = os.path.split(rest)
    return (bottom, middle, top)
This does not deal very well with Windows paths, where both \ and / are legal path separators. In that case you also have a drive letter, so you'd have to special-case that as well anyway.
Output:
>>> extract_path_segments('/abs/path/to/my/file/file.txt')
('/abs', 'path/to/my', 'file')
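A sketch of that drive-letter special-casing (my illustration, not part of the original answer), using the ntpath module so it can be tried on any platform; it assumes an absolute, backslash-separated input:
import ntpath

def extract_windows_segments(path):
    # splitdrive peels off 'C:' (or a UNC share) before we split the rest
    drive, rest = ntpath.splitdrive(path)
    rest, filename = ntpath.split(rest)
    bottom, remainder = rest.lstrip('\\').split('\\', 1)
    middle, top = ntpath.split(remainder)
    return (drive + '\\' + bottom, middle, top)

print(extract_windows_segments(r'C:\abs\path\to\my\file\file.txt'))
# ('C:\\abs', 'path\\to\\my', 'file')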
use os.path.split:
import os.path

def split_path(path):
    """
    Returns a 2-tuple of the form `root, list_of_path_parts`
    """
    head, tail = os.path.split(path)
    out = []
    while tail:
        out.append(tail)
        head, tail = os.path.split(head)
    return head, list(reversed(out))

def get_parts(path):
    root, path_parts = split_path(path)
    head = os.path.join(root, path_parts[0])
    path_to = os.path.join(*path_parts[1:-2])
    parentdir = path_parts[-2]
    return head, path_to, parentdir

head, path_to, parentdir = get_parts('/foo/path/to/bar/baz')
print (head)       # /foo
print (path_to)    # path/to
print (parentdir)  # bar
Using os.path.split() and os.path.join() as we are supposed to
>>> import os
>>> pth = "/abs/path/to/my/file/file.txt"
>>> parts = []
>>> while True:
...     pth, last = os.path.split(pth)
...     if not last:
...         break
...     parts.append(last)
...
>>> pth + parts[-1]
'/abs'
>>> parts[1]
'file'
>>> os.path.join(*parts[-2:1:-1])
'path/to/my'
As a function
import os

def extract_path_segments(pth):
    parts = []
    while True:
        pth, last = os.path.split(pth)
        if not last:
            break
        parts.append(last)
    return pth + parts[-1], parts[1], os.path.join(*parts[-2:1:-1])
>>> p = '/abs/path/to/my/file/file.txt'
>>> r = p.split('/')
>>> r[1],'/'.join(r[2:-2]),r[-2]
('abs', 'path/to/my', 'file')
As of now I am trying to scrape Good.is. The code currently gives me the regular image (turn the if statement to True), but I want the higher-res picture. I was wondering how I would replace certain text so that I could download the high-res picture. I want to change the URL http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flash.html to http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flat.html (the end is different). My code is:
import os, urllib, urllib2
from BeautifulSoup import BeautifulSoup
import HTMLParser

parser = HTMLParser.HTMLParser()

# make folder.
folderName = 'Good.is'
if not os.path.exists(folderName):
    os.makedirs(folderName)

list = []
# Python ranges start from the first argument and iterate up to one
# less than the second argument, so we need 36 + 1 = 37
for i in range(1, 37):
    list.append("http://www.good.is/infographics/page:" + str(i) + "/sort:recent/range:all")

listIterator1 = []
listIterator1[:] = range(0, 37)

counter = 0
for x in listIterator1:
    soup = BeautifulSoup(urllib2.urlopen(list[x]).read())
    body = soup.findAll("ul", attrs={'id': 'gallery_list_elements'})
    number = len(body[0].findAll("p"))
    listIterator = []
    listIterator[:] = range(0, number)
    for i in listIterator:
        paragraphs = body[0].findAll("p")
        nextArticle = body[0].findAll("a")[2]
        text = body[0].findAll("p")[i]
        if len(paragraphs) > 0:
            #print image['src']
            counter += 1
            print counter
            print parser.unescape(text.getText())
            print "http://www.good.is" + nextArticle['href']
            originalArticle = "http://www.good.is" + nextArticle['href']
            article = BeautifulSoup(urllib2.urlopen(originalArticle).read())
            title = article.findAll("div", attrs={'class': 'title_and_image'})
            getTitle = title[0].findAll("h1")
            article1 = article.findAll("div", attrs={'class': 'body'})
            articleImage = article1[0].find("p")
            betterImage = articleImage.find("a")
            articleImage1 = articleImage.find("img")
            paragraphsWithinSection = article1[0].findAll("p")
            print betterImage['href']
            if len(paragraphsWithinSection) > 1:
                articleText = article1[0].findAll("p")[1]
            else:
                articleText = article1[0].findAll("p")[0]
            print articleImage1['src']
            print parser.unescape(getTitle)
            if not articleText is None:
                print parser.unescape(articleText.getText())
            print '\n'
            link = articleImage1['src']
            x += 1
            actually_download = False
            if actually_download:
                filename = link.split('/')[-1]
                urllib.urlretrieve(link, filename)
Have a look at str.replace. If that isn't general enough to get the job done, you'll need to use a regular expression (the re module, probably re.sub).
>>> str1="http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flash.html"
>>> str1.replace("flash","flat")
'http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flat.html'
I think the safest and easiest way is to use a regular expression:
import re

url = 'http://www.google.com/this/is/sample/url/flash.html'
newUrl = re.sub(r'flash\.html$', 'flat.html', url)
The "$" means only match the end of the string. This solution will behave correctly even in the (admittedly unlikely) event that your url includes the substring "flash.html" somewhere other than the end, and also leaves the string unchanged (which I assume is the correct behavior) if it does not end with 'flash.html'.
See: http://docs.python.org/library/re.html#re.sub
@mgilson has a good solution, but the problem is that it will replace all occurrences of the string; so if you have the word "flash" as part of the URL (and not just the trailing file name), you'll have multiple replacements:
>>> s = 'hello there hello'
>>> s.replace('hello', 'world')
'world there world'
An alternate solution is to replace the last part after / with flat.html:
>>> url = 'http://www.google.com/this/is/sample/url/flash.html'
>>> url[:url.rfind('/')+1]+'flat.html'
'http://www.google.com/this/is/sample/url/flat.html'
Using urlparse you can do a few bits and bobs:
from urlparse import urlsplit, urlunsplit
import posixpath

s = 'http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flash.html'
url = urlsplit(s)
head, tail = url.path.rsplit('/', 1)
new_path = posixpath.join(head, 'flat.html')
print urlunsplit(url._replace(path=new_path))