from requests import *
import json
import base64
import urllib
from cmd import Cmd

url = "http://api.response.htb/"
url_digest = "cab532f75001ed2cc94ada92183d2160319a328e67001a9215956a5dbf10c545"

def get(url, url_digest):
    data = {
        "url": url,
        "url_digest": url_digest,
        "method": "GET",
        "session": "5f7bf45b02c832cf5b40c15ab6d365af",
        "session_digest": "a2b9ac69ab85795d13d12857a709a024cd729dcdf2c3fd3bb21ed514bc9990ac"
    }
    headers = {'Content-Type': 'application/json'}
    url_proxy = "http://proxy.response.htb/fetch"
    s = Session()
    res = s.post(url_proxy, json=data, headers=headers)
    body = json.loads(res.text)['body']
    body = base64.b64decode(body)
    if "zip" in url:
        f = open("file.zip", "wb")
    f.write(body)
    f.close()
    print("Done saving file :-");
    else: print body

def url_de(url):
s = Session()
res = s.get('http://www.response.htb/status/main.js.php',
            cookies={'PHPSESSID': url})
x = res.text.find("session_digest':'")
y = res.text.find("'};")
return res.text[x+17:y]

class pr(Cmd):
    prompt = "==> "
    def default(self, url):
        url_digest = url_de(url)
        get(url, url_digest)
    def do_exit(self, a): exit()

pr().cmdloop()
At line 32, VS Code (Pylance) is giving an "expected expression" error message and I am unable to proceed further. I am getting two errors: one on the else and another at the return statement at line 43. Can anyone identify these errors and help me solve them?
Indentation is significant in Python.
You have one line after your if indented, then lines which are not indented. This means the conditional is finished. You then have an else by itself, which is not permitted.
You likely meant:
if "zip" in url:
f = open("file.zip", "wb")
f.write(body)
f.close()
print("Done saving file :-");
else:
print(body)
But this would be improved by using a context manager:
if "zip" in url:
with open("file.zip", "wb") as f:
f.write(body)
print("Done saving file :-");
else:
print(body)
This is your code's scoping problem. Just indent lines 28-30 and 38-43; then parts 1 and 2 will fall into the if scope and parts 3 and 4 into the function scope (see the sketch below).
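A minimal sketch of what that re-indentation looks like, assuming the rest of the script stays as posted (only the two affected regions are shown, and print body is replaced with the Python 3 call print(body)):

def get(url, url_digest):
    # ... request code unchanged ...
    if "zip" in url:
        f = open("file.zip", "wb")
        f.write(body)                      # now inside the if block
        f.close()
        print("Done saving file :-")
    else:
        print(body)                        # print is a function in Python 3

def url_de(url):
    s = Session()                          # body indented so it belongs to url_de
    res = s.get('http://www.response.htb/status/main.js.php',
                cookies={'PHPSESSID': url})
    x = res.text.find("session_digest':'")
    y = res.text.find("'};")
    return res.text[x+17:y]                # return is now inside the function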
I have data that needs to be changed in an imported module. I gave this as an example:
urlchange.py
import requests

Url = "" + ".json"
# json request
json_post = requests.get(Url).json()
# ... code where I pull the JSON data
test.py
import urlchange
urlchange.Url = "https://stackoverflow.com/questions/"
Url= "" <---- how do I change it
I couldn't replace string with an expression
The code you have written in urlchange.py will be executed when you import the file, so change it to:
urlchange.py
import requests

Url = ""

def main():
    global Url
    Url += ".json"
    # json request
    json_post = requests.get(Url).json()
    # rest of your code

test.py
import urlchange

urlchange.Url = "https://stackoverflow.com/questions/"
urlchange.main()
OR
urlchange.py
import requests

def main(Url):
    Url += ".json"
    # json request
    json_post = requests.get(Url).json()
    # rest of your code

test.py
import urlchange

Url = "https://stackoverflow.com/questions/"
urlchange.main(Url)
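In either variant you will probably also want main to hand the parsed data back to the caller instead of leaving it in a local variable. A small sketch of that, assuming the same hypothetical module layout:

# urlchange.py
import requests

def main(url):
    # append the suffix and fetch the JSON payload
    return requests.get(url + ".json").json()

# test.py
import urlchange

json_post = urlchange.main("https://stackoverflow.com/questions/")
# json_post now holds the decoded JSON and can be used here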
I am writing code to parse tracker information in a torrent file using Python.
import bencoder
import sys

target = './' + sys.argv[1]
with open(target, 'rb') as torrent_file:
    torrent = bencoder.decode(torrent_file.read())

i = 0
while True:
    try:
        print(torrent[b'announce-list'][i])
        i += 1
    except:
        break
The output is as follows.
[b'udp://tracker.openbittorrent.com:80/announce']
[b'udp://tracker.opentrackr.org:1337/announce']
I want to parse the values into the form below.
["tracker.openbittorrent.com", 80]
["tracker.opentrackr.org", 1337]
How should I parse it?
You might use urllib.parse.urlparse for this, as follows:
from urllib.parse import urlparse
url1 = b'udp://tracker.openbittorrent.com:80/announce'
url2 = b'udp://tracker.opentrackr.org:1337/announce'
c1 = urlparse(url1)
c2 = urlparse(url2)
hostport1 = c1.netloc.rsplit(b':', 1)
hostport2 = c2.netloc.rsplit(b':', 1)
hostport1[0] = hostport1[0].decode()
hostport1[1] = int(hostport1[1])
hostport2[0] = hostport2[0].decode()
hostport2[1] = int(hostport2[1])
print(hostport1)
print(hostport2)
output
['tracker.openbittorrent.com', 80]
['tracker.opentrackr.org', 1337]
Explanation: I extract netloc, split it at most once from the right on b':', then apply .decode() to the host part to convert bytes into str, and int() to the port part to convert bytes into an int.
EDIT: After more careful reading, I noticed that you can access .hostname and .port, which allow much more concise code for this task:
from urllib.parse import urlparse
url1 = b'udp://tracker.openbittorrent.com:80/announce'
url2 = b'udp://tracker.opentrackr.org:1337/announce'
c1 = urlparse(url1)
c2 = urlparse(url2)
hostport1 = [c1.hostname.decode(), c1.port]
hostport2 = [c2.hostname.decode(), c2.port]
print(hostport1)
print(hostport2)
This gives the same output as the code above.
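Applied directly to the torrent from the question, this could look roughly like the following (a sketch; it assumes torrent was decoded with bencoder as shown there and that every announce-list entry has an explicit port):

from urllib.parse import urlparse

# torrent is the dict produced by bencoder.decode(...) above
for entry in torrent[b'announce-list']:
    parsed = urlparse(entry[0])   # each entry is a one-element list
    print([parsed.hostname.decode(), parsed.port])
# e.g. ['tracker.openbittorrent.com', 80]
#      ['tracker.opentrackr.org', 1337]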
I am trying to generate some links.
NOTE: there is a problem with return vs print.
When I write the code with return, it only returns one link.
Run this code:
import requests
import re

wikiurl = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
state_pat = re.compile(r'title=\"(\w+)\">')

def get_page_content(url):
    response = requests.get(url)
    return response.text

def link_generator(wikiurl):
    content = get_page_content(wikiurl)
    names = state_pat.findall(content)
    for i in names:
        return 'https://www.local.com/business/results/listing.cfm?s=tile+and+grout+cleaning&ar=' + i + '%2CNY&gsp=ZFZWU1RaU09zWGNYdjFEV1l2ZHFLNVZUUFRPT3c3a21lbFVCbERQOU5VS3p6ai9DRXNMa29PcVZ0ZVV0TXZLM01wUVFUUHZYK2lrMnB5VGJyMHZJeUNoK1dXaUoxZ1NKT3AxbVlJOGN1aVBEb1NRMzlCemdDVHh5aGd3eU5DYUpKWDRtNFVQR0llOFJibUhQR3pSV3ppWFR4ekJoRVltL29UdFQ0MW9KUS9IenJrcjVBMUt3bkErRnlSVnFjRnZ0TjhRWEdET0FuZWRVUGNkemdxUlkzOUYyUjZXbHBzQWRMY3hEUTY4WmtnYkRsSkEvazBrVVY5d0NmSVVMaWp0WnNDNmFsZFNzMitWeHZDYTg2YmJwRGQzSisvOUJaYWNBaFdUd21LaWJpNk9veS9OT1N1VE5DV3RUNDIxdkY5NmZ4bWFVcWtLc1BlVkNRNlEvSG4ydER1T1ZkcXk4Um5BWU5kUU9UZnVOUE9BPQ%253D%253D&lwfilter=&wsrt=&wpn='
a = link_generator(wikiurl)
print(a)
And if I run this code, with a print added inside the function, it prints all the links. Why? I need all the links via return.
Run this code and you will see the difference:
import requests
import re

wikiurl = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
state_pat = re.compile(r'title=\"(\w+)\">')

def get_page_content(url):
    response = requests.get(url)
    return response.text

def link_generator(wikiurl):
    content = get_page_content(wikiurl)
    names = state_pat.findall(content)
    for i in names:
        print('https://www.local.com/business/results/listing.cfm?s=tile+and+grout+cleaning&ar=' + i + '%2CNY&gsp=ZFZWU1RaU09zWGNYdjFEV1l2ZHFLNVZUUFRPT3c3a21lbFVCbERQOU5VS3p6ai9DRXNMa29PcVZ0ZVV0TXZLM01wUVFUUHZYK2lrMnB5VGJyMHZJeUNoK1dXaUoxZ1NKT3AxbVlJOGN1aVBEb1NRMzlCemdDVHh5aGd3eU5DYUpKWDRtNFVQR0llOFJibUhQR3pSV3ppWFR4ekJoRVltL29UdFQ0MW9KUS9IenJrcjVBMUt3bkErRnlSVnFjRnZ0TjhRWEdET0FuZWRVUGNkemdxUlkzOUYyUjZXbHBzQWRMY3hEUTY4WmtnYkRsSkEvazBrVVY5d0NmSVVMaWp0WnNDNmFsZFNzMitWeHZDYTg2YmJwRGQzSisvOUJaYWNBaFdUd21LaWJpNk9veS9OT1N1VE5DV3RUNDIxdkY5NmZ4bWFVcWtLc1BlVkNRNlEvSG4ydER1T1ZkcXk4Um5BWU5kUU9UZnVOUE9BPQ%253D%253D&lwfilter=&wsrt=&wpn=')
a = link_generator(wikiurl)
print(a)
When you issue a return statement in a function it doesn't execute any further lines and returns to its caller. If you want to iteratively return items in a generator you can replace return with yield. Alternatively collect the results as a list and return the list.
You then need to change your final line when you're calling this to:
a = list(link_generator(wikiurl))
to unpack your generator
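For reference, a rough sketch of both options, reusing get_page_content and state_pat from the question; BASE and SUFFIX below are placeholders standing in for the long listing URL:

BASE = 'https://www.local.com/business/results/listing.cfm?s=tile+and+grout+cleaning&ar='
SUFFIX = '%2CNY'   # placeholder for the rest of the long query string

# Option 1: a generator -- yield emits one link per state without leaving the function
def link_generator(wikiurl):
    content = get_page_content(wikiurl)
    names = state_pat.findall(content)
    for i in names:
        yield BASE + i + SUFFIX

print(list(link_generator(wikiurl)))   # list() unpacks the generator

# Option 2: build a list and return it in one go
def link_list(wikiurl):
    content = get_page_content(wikiurl)
    names = state_pat.findall(content)
    return [BASE + i + SUFFIX for i in names]

print(link_list(wikiurl))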
I have the following URL:
url = "http://photographs.500px.com/kyle/09-09-201315-47-571378756077.jpg"
I would like to extract the file name in this URL: 09-09-201315-47-571378756077.jpg
Once I get this file name, I'm going to save it with this name to the Desktop.
filename = **extracted file name from the url**
download_photo = urllib.urlretrieve(url, "/home/ubuntu/Desktop/%s.jpg" % (filename))
After this, I'm going to resize the photo; once that is done, I'm going to save the resized version and append the word "_small" to the end of the filename.
downloadedphoto = Image.open("/home/ubuntu/Desktop/%s.jpg" % (filename))
resize_downloadedphoto = downloadedphoto.resize((300, 300), Image.ANTIALIAS)
resize_downloadedphoto.save("/home/ubuntu/Desktop/%s.jpg" % (filename + "_small"))
From this, what I am trying to achieve is to get two files, the original photo with the original name, then the resized photo with the modified name. Like so:
09-09-201315-47-571378756077.jpg
rename to:
09-09-201315-47-571378756077_small.jpg
How can I go about doing this?
You can use urllib.parse.urlparse with os.path.basename:
import os
from urllib.parse import urlparse
url = "http://photographs.500px.com/kyle/09-09-201315-47-571378756077.jpg"
a = urlparse(url)
print(a.path) # Output: /kyle/09-09-201315-47-571378756077.jpg
print(os.path.basename(a.path)) # Output: 09-09-201315-47-571378756077.jpg
Your URL might contain percent-encoded characters like %20 for a space or %E7%89%B9%E8%89%B2 for "特色". If that's the case, you'll need to unquote (or unquote_plus) them. You can also use pathlib.Path().name instead of os.path.basename, which makes it easier to add a suffix to the name (as asked in the original question):
from pathlib import Path
from urllib.parse import urlparse, unquote
url = "http://photographs.500px.com/kyle/09-09-2013%20-%2015-47-571378756077.jpg"
url_parsed = urlparse(url)
print(unquote(url_parsed.path)) # Output: /kyle/09-09-2013 - 15-47-571378756077.jpg
file_path = Path("/home/ubuntu/Desktop/") / unquote(Path(url_parsed.path).name)
print(file_path) # Output: /home/ubuntu/Desktop/09-09-2013 - 15-47-571378756077.jpg
new_file = file_path.with_stem(file_path.stem + "_small")
print(new_file) # Output: /home/ubuntu/Desktop/09-09-2013 - 15-47-571378756077_small.jpg
Also, an alternative is to use unquote(urlparse(url).path.split("/")[-1]).
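For example, applying that alternative to the percent-encoded URL above should give the same result:

from urllib.parse import urlparse, unquote

url = "http://photographs.500px.com/kyle/09-09-2013%20-%2015-47-571378756077.jpg"
print(unquote(urlparse(url).path.split("/")[-1]))
# Output: 09-09-2013 - 15-47-571378756077.jpg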
os.path.basename(url)
Why try harder?
In [1]: os.path.basename("https://example.com/file.html")
Out[1]: 'file.html'
In [2]: os.path.basename("https://example.com/file")
Out[2]: 'file'
In [3]: os.path.basename("https://example.com/")
Out[3]: ''
In [4]: os.path.basename("https://example.com")
Out[4]: 'example.com'
Note 2020-12-20
Nobody has thus far provided a complete solution.
A URL can contain a ?[query-string] and/or a #[fragment Identifier] (but only in that order: ref)
In [1]: from os import path
In [2]: def get_filename(url):
   ...:     fragment_removed = url.split("#")[0]  # keep to left of first #
   ...:     query_string_removed = fragment_removed.split("?")[0]
   ...:     scheme_removed = query_string_removed.split("://")[-1].split(":")[-1]
   ...:     if scheme_removed.find("/") == -1:
   ...:         return ""
   ...:     return path.basename(scheme_removed)
   ...:
In [3]: get_filename("a.com/b")
Out[3]: 'b'
In [4]: get_filename("a.com/")
Out[4]: ''
In [5]: get_filename("https://a.com/")
Out[5]: ''
In [6]: get_filename("https://a.com/b")
Out[6]: 'b'
In [7]: get_filename("https://a.com/b?c=d#e")
Out[7]: 'b'
filename = url[url.rfind("/")+1:]
filename_small = filename.replace(".", "_small.")
maybe use ".jpg" in the last case since a . can also be in the filename.
You could just split the url by "/" and retrieve the last member of the list:
url = "http://photographs.500px.com/kyle/09-09-201315-47-571378756077.jpg"
filename = url.split("/")[-1]
#09-09-201315-47-571378756077.jpg
Then use replace to change the ending:
small_jpg = filename.replace(".jpg", "_small.jpg")
#09-09-201315-47-571378756077_small.jpg
With python3 (from 3.4 upwards) you can abuse the pathlib library in the following way:
from pathlib import Path
p = Path('http://example.com/somefile.html')
print(p.name)
# >>> 'somefile.html'
print(p.stem)
# >>> 'somefile'
print(p.suffix)
# >>> '.html'
print(f'{p.stem}-spamspam{p.suffix}')
# >>> 'somefile-spamspam.html'
❗️ WARNING
The pathlib module is NOT meant for parsing URLs; it is designed to work with POSIX paths, not URLs. Don't use it in production code! It's a quick and dirty hack for non-critical code, provided only as an example of what you can do but probably should not do. If you need to parse URLs, go with urllib.parse or alternatives.
Use urllib.parse.urlparse to get just the path part of the URL, and then use pathlib.Path on that path to get the filename:
from urllib.parse import urlparse
from pathlib import Path
url = "http://example.com/some/long/path/a_filename.jpg?some_query_params=true&some_more=true#and-an-anchor"
a = urlparse(url)
a.path # '/some/long/path/a_filename.jpg'
Path(a.path).name # 'a_filename.jpg'
Sometimes there is a query string:
filename = url.split("/")[-1].split("?")[0]
new_filename = filename.replace(".jpg", "_small.jpg")
A simple version using the os package:
import os

def get_url_file_name(url):
    url = url.split("#")[0]
    url = url.split("?")[0]
    return os.path.basename(url)
Examples:
print(get_url_file_name("example.com/myfile.tar.gz")) # 'myfile.tar.gz'
print(get_url_file_name("example.com/")) # ''
print(get_url_file_name("https://example.com/")) # ''
print(get_url_file_name("https://example.com/hello.zip")) # 'hello.zip'
print(get_url_file_name("https://example.com/args.tar.gz?c=d#e")) # 'args.tar.gz'
Sometimes the link you have involves redirects (that was the case for me). In that case you have to resolve the redirects first:
import requests
url = "http://photographs.500px.com/kyle/09-09-201315-47-571378756077.jpg"
response = requests.head(url)
url = response.url
Then you can continue with the best answer at the moment (Ofir's):
import os
from urllib.parse import urlparse
a = urlparse(url)
print(a.path) # Output: /kyle/09-09-201315-47-571378756077.jpg
print(os.path.basename(a.path)) # Output: 09-09-201315-47-571378756077.jpg
It doesn't work with this particular page, however, as the page isn't available anymore.
Python split url to find image name and extension helps you extract the image name. To append to the name:
imageName = '09-09-201315-47-571378756077'
new_name = '{0}_small.jpg'.format(imageName)
I see people using the pathlib library to parse URLs. This is not a good idea! pathlib is not designed for it; use dedicated libraries like urllib.parse instead.
This is the most stable version I could come up with. It handles params as well as fragments:
from urllib.parse import urlparse, ParseResult

def update_filename(url):
    parsed_url = urlparse(url)
    path = parsed_url.path
    filename = path[path.rfind('/') + 1:]
    if not filename:
        return
    file, extension = filename.rsplit('.', 1)
    new_path = parsed_url.path.replace(filename, f"{file}_small.{extension}")
    parsed_url = ParseResult(**{**parsed_url._asdict(), 'path': new_path})
    return parsed_url.geturl()
Example:
assert update_filename('https://example.com/') is None
assert update_filename('https://example.com/path/to/') is None
assert update_filename('https://example.com/path/to/report.pdf') == 'https://example.com/path/to/report_small.pdf'
assert update_filename('https://example.com/path/to/filename with spaces.pdf') == 'https://example.com/path/to/filename with spaces_small.pdf'
assert update_filename('https://example.com/path/to/report_01.01.2022.pdf') == 'https://example.com/path/to/report_01.01.2022_small.pdf'
assert update_filename('https://example.com/path/to/report.pdf?param=1&param2=2') == 'https://example.com/path/to/report_small.pdf?param=1&param2=2'
assert update_filename('https://example.com/path/to/report.pdf?param=1&param2=2#test') == 'https://example.com/path/to/report_small.pdf?param=1&param2=2#test'
We can extract the filename from a URL by using the ntpath module.
import ntpath
url = 'http://photographs.500px.com/kyle/09-09-201315-47-571378756077.jpg'
name, ext = ntpath.splitext(ntpath.basename(url))
# 09-09-201315-47-571378756077 .jpg
print(name + '_small' + ext)
09-09-201315-47-571378756077_small.jpg