The problem lies somewhere in how I'm parsing and or reassembling urls. I'm losing the ?id=1 and getting ?d=1.
What I am trying to do is have the ability to manipulate and query parameter and reassemble it before sending back out modified. Meaning the dictionaries would be modified than using urlencode(modified_dict) I would reassemble url + query.
Can someone give me a pointer on what I'm doing wrong here.
from urlparse import parse_qs, urlparse , urlsplit
from urllib import urlencode
import os
import sys
import mechanize
from collections import OrderedDict
import urllib2
scrape_post_urls = []
get_inj_tests = []
#check multiple values to strip out duplicate and useless checks
def parse_url(url):
parsed = urlparse(url,allow_fragments=False)
if parsed.query:
if url not in get_inj_tests:
get_inj_tests.append(url)
#print url
'''get_inj_tests.append(url)
print url
#print 'scheme :', parsed.scheme
#print 'netloc :', parsed.netloc
print 'path :', parsed.path
print 'params :', parsed.params
print 'query :', parsed.query
print 'fragment:', parsed.fragment
#print 'hostname:', parsed.hostname, '(netloc in lower case)'
#print 'port :', parsed.port
'''
else:
if url not in scrape_post_urls:
scrape_post_urls.append(url)
#print url
def main():
unparsed_urls = open('in.txt','r')
for urls in unparsed_urls:
try:
parse_url(urls)
except:
pass
print(len(scrape_post_urls))
print(len(get_inj_tests))
clean_list = list(OrderedDict.fromkeys(get_inj_tests))
reaasembled_url = ""
#print clean_list
for query_test in clean_list:
url_object = urlparse(query_test,allow_fragments=False)
#parse query paramaters
url = query_test.split("?")[1]
dicty = {x[0] : x[1] for x in [x.split("=") for x in url[1:].split("&") ]}
query_pairs = [(k,v) for k,vlist in dicty.iteritems() for v in vlist]
reaasembled_url = "http://" + str(url_object.netloc) + str(url_object.path) + '?'
reaasembled_query = urlencode(query_pairs)
full_url = reaasembled_url + reaasembled_query
print dicty
main()
Can someone give me a pointer on what I'm doing wrong here.
Well quite simply you're not using the existing tools:
1/ to parse a query string, use urllib.parse.parse_qsl().
2/ to reassemble the querystring, use urllib.parse.urlencode().
And forget about dicts, querystrings can have multiple values for the same key, ie ?foo=1&foo=2 is perfectly valid.
first of all, your variable url is a bad name for the params variable and this could create confusion.
>>> url = "https://url.domian.com?id=22¶m1=1¶m2=2".split("?")[1]
'id=22¶m1=1¶m2=2'
>>> "https://url.domian.com?id=22¶m1=1¶m2=2".split("?")[1].split("&")
['id=22', 'param1=1', 'param2=2']
The error is in the url[1:].split("&")
Solution:
>>> dicty = {x[0] : x[1] for x in [x.split("=") for x in url.split("&") ]}
{'id': '22', 'param1': '1', 'param2': '2'}
Related
so what i wanna do is basically i have a list of urls with multiple parameters, such as:
https://www.somesite.com/path/path2/path3?param1=value1¶m2=value2
and i would want to get is something like this:
https://www.somesite.com/path/path2/path3?param1=PAYLOAD¶m2=value2
https://www.somesite.com/path/path2/path3?param1=value1¶m2=PAYLOAD
like i wanna iterate through every parameter (basically every match of "=" and "&") and replace each value one per time. Thank you in advance.
from urllib.parse import urlparse
import re
urls = ["https://www.somesite.com/path/path2/path3?param1=value1¶m2=value2¶m3=value3",
"https://www.anothersite.com/path/path2/path3?param1=value1¶m2=value2¶m3=value3"]
parseds = [urlparse(url) for url in urls]
newurls = []
for parsed in parseds:
params = parsed[4].split("&")
for i, param in enumerate(params):
newparam = re.sub("=.+", "=PAYLOAD", param)
newurls.append(
parsed[0] +
"://" +
parsed[1] +
parsed[2] +
"?" +
parsed[4].replace(param, newparam)
)
newurls is
['https://www.somesite.com/path/path2/path3?param1=PAYLOAD¶m2=value2¶m3=value3',
'https://www.somesite.com/path/path2/path3?param1=value1¶m2=PAYLOAD¶m3=value3',
'https://www.somesite.com/path/path2/path3?param1=value1¶m2=value2¶m3=PAYLOAD',
'https://www.anothersite.com/path/path2/path3?param1=PAYLOAD¶m2=value2¶m3=value3',
'https://www.anothersite.com/path/path2/path3?param1=value1¶m2=PAYLOAD¶m3=value3',
'https://www.anothersite.com/path/path2/path3?param1=value1¶m2=value2¶m3=PAYLOAD']
I've solved it:
from urllib.parse import urlparse
url = "https://github.com/search?p=2&q=user&type=Code&name=djalel"
parsed = urlparse(url)
query = parsed.query
params = query.split("&")
new_query = []
for param in params:
l = params.index(param)
param = str(param.split("=")[0]) + "=" + "PAYLOAD"
params[l] = param
new_query.append("&".join(params))
params = query.split("&")
for query in new_query:
print(str(parsed.scheme) + '://' + str(parsed.netloc) + str(parsed.path) + '?' + query)
Output:
https://github.com/search?p=PAYLOAD&q=user&type=Code&name=djalel
https://github.com/search?p=2&q=PAYLOAD&type=Code&name=djalel
https://github.com/search?p=2&q=user&type=PAYLOAD&name=djalel
https://github.com/search?p=2&q=user&type=Code&name=PAYLOAD
I get an error which is TypeError: can only concatenate str (not "list") to str. How do I join all addresses together with a separator using "|" ?
def make_api_url(**kwargs):
data = pd.read_csv('bitcoinaddr.csv')
Wallet_Address = (data.loc[:, "Address"])
BASE_URL = "https://blockchain.info/balance"
print (Wallet_Address)
for address in Wallet_Address:
print (address)
url = BASE_URL + [f"?active={address}"] #error
print(url)
for key, value in kwargs.items():
url += f"&{key}={value}"
return url
get_balance_url = make_api_url()
I think you want:
address = '|'.join(Wallet_Address)
url = BASE_URL + f"?active={addresses}"
That is, build the pipe separated string of addresses first, then append it one time to the URL as a query parameter.
NOTE: There is no fix url for it. Means it is not possible to see this url always. I want code which works for all the urls.
For ex, http://januapp.com/demo/search.php?search=aaa
http://januapp.com/demo/search.php?other=aaa
Now I want to change it to
http://januapp.com/demo/search.php?search=bbb
http://januapp.com/demo/search.php?other=bbb
I don't know how can I do it?
I tried this
import optparse
import requests
import urlparse
parser = optparse.OptionParser()
parser.add_option("-t","--Host", dest="Target", help="Please provide the target", default="true")
options, args = parser.parse_args()
url = options.Target
xss = []
xss.append("bbb")
try:
url2 =urlparse.urlparse(url)
print url2
url3 = urlparse.parse_qs(url2.query)
parametervalue = [key for key, key in url3.iteritems()] #[['aaa']]
parsed = parametervalue.append(xss[0])
print parsed
finalurl = urljoin(url, parsed)
print finalurl
except Exception as e:
print e
So when I pass this
xss3.py -t http://januapp.com/demo/search.php?search=aaa
The Error occurs below on to the cmd
ParseResult(scheme='http', netloc='januapp.com', path='/demo/search.php', params='', query='search=aaa', fragment='')
None
name 'urljoin' is not defined
See the None
Now that's the problem,
I am using Python2.7.
Thank you very much. Hope you get the problem.
You can try something with this kind of approach.
url = 'http://januapp.com/demo/search.php?search=aaa'
# First get all your query params
arr = url.split('?')
base_url = arr[0] # This is your base url i.e. 'http://januapp.com/demo/search.php'
params = arr[1] # here are your query params ['search=aaa']
# Now seprate out all the query parameters and their values
arr2 = params.split("=") # This will give you somrthing like this : ['search', 'aaa'], the the value will be next to the key
# This is a dictonary to hold the key value pairs
param_value_dict = {} # {'search': 'aaa'}
for i, str in enumerate(arr2):
if i % 2 == 0:
param_value_dict[str] = arr2[i + 1]
# now if you want to chnage the value of search from 'aaa' to 'bbb', then just change it in the dictonary
param_value_dict['search'] = 'bbb'
# now form the new url from the dictonary
new_url = base_url + '?'
for param_name, param_value in param_value_dict.items():
new_url = new_url + param_name + "=" + param_value + "&"
# remove the extra '&'
new_url = new_url[:len(new_url) - 1]
print(new_url)
How about:
ext = "bbb"
a = "http://januapp.com/demo/search.php?search="
print a+ext
Where ext is what you want to search for, a is the link and just add them together.
Or you could replace values like this:
ext = "bbb"
a = "http://januapp.com/demo/search.php?search=aaa"
print a.replace('aaa', ext)
Using regex:
import re
ext = "bbb"
a = "http://januapp.com/demo/search.php?search=aaa"
b=re.compile(r".+search=")
print re.search(b,a).group()+ext
As of now I am trying to scrape Good.is.The code as of now gives me the regular image(turn the if statement to True) but I want to higher res picture. I was wondering how I would replace a certain text so that I could download the high res picture. I want to change the html: http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flash.html to http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flat.html (The end is different). My code is:
import os, urllib, urllib2
from BeautifulSoup import BeautifulSoup
import HTMLParser
parser = HTMLParser.HTMLParser()
# make folder.
folderName = 'Good.is'
if not os.path.exists(folderName):
os.makedirs(folderName)
list = []
# Python ranges start from the first argument and iterate up to one
# less than the second argument, so we need 36 + 1 = 37
for i in range(1, 37):
list.append("http://www.good.is/infographics/page:" + str(i) + "/sort:recent/range:all")
listIterator1 = []
listIterator1[:] = range(0,37)
counter = 0
for x in listIterator1:
soup = BeautifulSoup(urllib2.urlopen(list[x]).read())
body = soup.findAll("ul", attrs = {'id': 'gallery_list_elements'})
number = len(body[0].findAll("p"))
listIterator = []
listIterator[:] = range(0,number)
for i in listIterator:
paragraphs = body[0].findAll("p")
nextArticle = body[0].findAll("a")[2]
text = body[0].findAll("p")[i]
if len(paragraphs) > 0:
#print image['src']
counter += 1
print counter
print parser.unescape(text.getText())
print "http://www.good.is" + nextArticle['href']
originalArticle = "http://www.good.is" + nextArticle['href']
article = BeautifulSoup(urllib2.urlopen(originalArticle).read())
title = article.findAll("div", attrs = {'class': 'title_and_image'})
getTitle = title[0].findAll("h1")
article1 = article.findAll("div", attrs = {'class': 'body'})
articleImage = article1[0].find("p")
betterImage = articleImage.find("a")
articleImage1 = articleImage.find("img")
paragraphsWithinSection = article1[0].findAll("p")
print betterImage['href']
if len(paragraphsWithinSection) > 1:
articleText = article1[0].findAll("p")[1]
else:
articleText = article1[0].findAll("p")[0]
print articleImage1['src']
print parser.unescape(getTitle)
if not articleText is None:
print parser.unescape(articleText.getText())
print '\n'
link = articleImage1['src']
x += 1
actually_download = False
if actually_download:
filename = link.split('/')[-1]
urllib.urlretrieve(link, filename)
Have a look at str.replace. If that isn't general enough to get the job done, you'll need to use a regular expression ( re -- probably re.sub ).
>>> str1="http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flash.html"
>>> str1.replace("flash","flat")
'http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flat.html'
I think the safest and easiest way is to use a regular expression:
import re
url = 'http://www.google.com/this/is/sample/url/flash.html'
newUrl = re.sub('flash\.html$','flat.html',url)
The "$" means only match the end of the string. This solution will behave correctly even in the (admittedly unlikely) event that your url includes the substring "flash.html" somewhere other than the end, and also leaves the string unchanged (which I assume is the correct behavior) if it does not end with 'flash.html'.
See: http://docs.python.org/library/re.html#re.sub
#mgilson has a good solution, but the problem is it will replace all occurrences of the string with the replacement; so if you have the word "flash" as part of the URL (and not the just the trailing file name), you'll have multiple replacements:
>>> str = 'hello there hello'
>>> str.replace('hello','world')
'world there world'
An alternate solution is to replace the last part after / with flat.html:
>>> url = 'http://www.google.com/this/is/sample/url/flash.html'
>>> url[:url.rfind('/')+1]+'flat.html'
'http://www.google.com/this/is/sample/url/flat.html'
Using urlparse you can do a few bits and bobs:
from urlparse import urlsplit, urlunsplit, urljoin
s = 'http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flash.html'
url = urlsplit(s)
head, tail = url.path.rsplit('/', 1)
new_path = head, 'flat.html'
print urlunsplit(url._replace(path=urljoin(*new_path)))
Is there a builtin function to get url like this: ../images.html given a base url like this: http://www.example.com/faq/index.html and a target url such as http://www.example.com/images.html
I checked urlparse module. What I want is counterpart of the urljoin() function.
You could use urlparse.urlparse to find the paths, and the posixpath version of os.path.relname to find the relative path.
(Warning: This works for Linux, but may not for Windows):
import urlparse
import sys
import posixpath
def relurl(target,base):
base=urlparse.urlparse(base)
target=urlparse.urlparse(target)
if base.netloc != target.netloc:
raise ValueError('target and base netlocs do not match')
base_dir='.'+posixpath.dirname(base.path)
target='.'+target.path
return posixpath.relpath(target,start=base_dir)
tests=[
('http://www.example.com/images.html','http://www.example.com/faq/index.html','../images.html'),
('http://google.com','http://google.com','.'),
('http://google.com','http://google.com/','.'),
('http://google.com/','http://google.com','.'),
('http://google.com/','http://google.com/','.'),
('http://google.com/index.html','http://google.com/','index.html'),
('http://google.com/index.html','http://google.com/index.html','index.html'),
]
for target,base,answer in tests:
try:
result=relurl(target,base)
except ValueError as err:
print('{t!r},{b!r} --> {e}'.format(t=target,b=base,e=err))
else:
if result==answer:
print('{t!r},{b!r} --> PASS'.format(t=target,b=base))
else:
print('{t!r},{b!r} --> {r!r} != {a!r}'.format(
t=target,b=base,r=result,a=answer))
The first solutions that comes to mind is:
>>> os.path.relpath('/images.html', os.path.dirname('/faq/index.html'))
'../images.html'
Of course, this requires URL parsing -> domain name comparison (!!) -> path rewriting if that's the case -> re-adding query and fragment.
Edit: a more complete version
import urlparse
import posixpath
def relative_url(destination, source):
u_dest = urlparse.urlsplit(destination)
u_src = urlparse.urlsplit(source)
_uc1 = urlparse.urlunsplit(u_dest[:2]+tuple('' for i in range(3)))
_uc2 = urlparse.urlunsplit(u_src[:2]+tuple('' for i in range(3)))
if _uc1 != _uc2:
## This is a different domain
return destination
_relpath = posixpath.relpath(u_dest.path, posixpath.dirname(u_src.path))
return urlparse.urlunsplit(('', '', _relpath, u_dest.query, u_dest.fragment)
Then
>>> relative_url('http://www.example.com/images.html', 'http://www.example.com/faq/index.html')
'../images.html'
>>> relative_url('http://www.example.com/images.html?my=query&string=here#fragment', 'http://www.example.com/faq/index.html')
'../images.html?my=query&string=here#fragment'
>>> relative_url('http://www.example.com/images.html', 'http://www2.example.com/faq/index.html')
'http://www.example.com/images.html'
>>> relative_url('https://www.example.com/images.html', 'http://www.example.com/faq/index.html')
'https://www.example.com/images.html'
Edit: now using the posixpath implementation of os.path to make it work under windows too.
import itertools
import urlparse
def makeRelativeUrl(sourceUrl, targetUrl):
'''
:param sourceUrl: a string
:param targetUrl: a string
:return: the path to target url relative to first or targetUrl if at different net location
'''
# todo test
parsedSource = urlparse.urlparse(sourceUrl)
parsedTarget = urlparse.urlparse(targetUrl)
if parsedSource.netloc == parsedTarget.netloc:
# if target on same path but lower than source url
if parsedTarget.path.startswith(parsedSource.path):
return parsedTarget.path.replace(parsedSource.path, '.')
# on same path
elif parsedTarget.path.rsplit('/', 1)[0] == parsedSource.path.rsplit('/', 1)[0]:
return './' + parsedTarget.path.rsplit('/', 1)[1]
# same netloc, varying paths
else:
path = ''
upCount = 0
for item in list(itertools.izip_longest(parsedSource.path.rsplit('/'), parsedTarget.path.rsplit('/'))):
if item[0] == item[1]:
pass
else:
if item[0] is not None:
upCount += 1
if item[1] is not None:
path += item[1] + '/'
return upCount * '../' + path
else:
return targetUrl
if __name__ == '__main__':
'''
"tests" :p
'''
url1 = 'http://coolwebsite.com/questions/bobobo/bo/bo/1663807/how-can-i-iterate-through-two-lists-in-parallel-in-python'
url2 = 'http://coolwebsite.com/questions/126524/iterate-a-list-with-indexes-in-python'
print url1
print url2
print 'second relative to second:'
print makeRelativeUrl(url1, url2)
url1 = 'http://coolwebsite.com/questions/1663807/how-can-i-iterate-through-two-lists-in-parallel-in-python'
url2 = 'http://coolwebsite.com/questions/1663807/bananas'
print url1
print url2
print 'second relative to first:'
print makeRelativeUrl(url1, url2)
url1 = 'http://coolwebsite.com/questions/1663807/fruits'
url2 = 'http://coolwebsite.com/questions/1663807/fruits/berries/bananas'
print url1
print url2
print 'second relative to first:'
print makeRelativeUrl(url1, url2)
Run 'tests' to see if it works :P