Unshort amazon EU link using Python - python

i'm trying to unshort amazon link using python, from pattern: "https:// amzn.eu/XXXX".
It seems the url is not recognized!
If the url is in the format "https:// amzn.to/XXXXX" it works!
Only with amzn.EU problem appears.
This is my code. Any suggestions?
import os, pathlib, re, requests, time, warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
def formaturl(url):
    """Return *url* with a scheme, prepending ``http://`` if none is present.

    The scheme test is case-insensitive (URI schemes are case-insensitive
    per RFC 3986), so ``HTTP://...`` is also left untouched.
    """
    # re.match anchors at the start of the string, so this only checks
    # for a leading scheme.
    if not re.match(r'(?:http|ftp|https)://', url, re.IGNORECASE):
        return 'http://{}'.format(url)
    return url
def unshort_link(url):
    """Resolve a shortened URL and return the final destination URL.

    Uses GET instead of HEAD: some shorteners (e.g. amzn.eu) answer a
    HEAD request with 404 and only redirect on GET.  ``stream=True``
    avoids downloading the final page body just to read ``resp.url``.
    """
    url = formaturl(url)
    warnings.simplefilter('ignore', InsecureRequestWarning)
    session = requests.Session()
    # GET, not HEAD: amzn.eu short links 404 on HEAD requests.
    resp = session.get(url, allow_redirects=True, verify=False, stream=True)
    unshort_url = resp.url
    resp.close()  # body was never read; release the connection
    return unshort_url
not_working_link = 'https://amzn.eu/d/fb1IYWl'
#working_link = 'https://amzn.to/3A0milQ'
unshorted_url = unshort_link(not_working_link)
print(unshorted_url)

The HEAD request doesn't work on this link, it returns a 404.
However, with a GET it'll work as expected:
resp = requests.get('https://amzn.eu/d/fb1IYWl')
resp.url
# 'https://www.amazon.it/dp/B00HVFQF3I/ref=cm_sw_r_apa_i_9GRWP18TK8S32ZPVJVM7_0?_encoding=UTF8&psc=1'

Related

Unshort link generated by android app with python

I'm trying to unshort an url generated by Banggood android app. The link is
https://banggood.app.link/ifPZZ5jS98
The code works great with all bit.ly urls. But with this url it doesn't work. Any ideas?
My code:
import requests
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
def unshort_link(url):
    """Follow all redirects of *url* and return the final URL.

    A GET request is used instead of HEAD because some redirectors
    (e.g. banggood.app.link / branch.io) do not redirect HEAD requests,
    while bit.ly-style shorteners work with either verb.
    """
    warnings.simplefilter('ignore', InsecureRequestWarning)
    session = requests.Session()
    # stream=True so the final page body is not downloaded needlessly.
    resp = session.get(url, allow_redirects=True, verify=False, stream=True)
    final_url = resp.url
    resp.close()
    return final_url
unshorted_url = unshort_link("https://banggood.app.link/ifPZZ5jS98")
print(unshorted_url)
# expected result: https://www.banggood.com/LANGFEITE-L8S-2019-Version-15Ah-48_1V-800W+2-Dual-Motor-Folding-Electric-Scooter-Color-Display-DC-Brushless-Motor-45km-or-h-Top-Speed-40km-Range-EU-Plug-p-1486764.html?akmClientCountry=IT&channel=googleshopping&utm_source=googleshopping&utm_medium=cpc_bgs&utm_campaign=xibei-ssc-it-all-0303_prdshare_copy&utm_content=xibei&_branch_match_id=762227340915062987

Python - Retrieve and use a cookie to download a file

Trying to download the following file:
https://e4ftl01.cr.usgs.gov/MOLA/MYD14A2.006/2017.10.24/MYD14A2.A2017297.h19v01.006.2017310142443.hdf
I first need to sign into the following site before doing so:
https://urs.earthdata.nasa.gov
After reviewing my browser's web console, I believe it's using a cookie to allow me to download the file. How can I do this using Python? I found out how to retrieve the cookies:
import os, requests
username = 'user'
password = 'pwd'
url = 'https://urs.earthdata.nasa.gov'
r = requests.get(url, auth=(username,password))
cookies = r.cookies
How can I then use this to download the HDF file? I've tried the following but always receive 401 error.
url2 = "https://e4ftl01.cr.usgs.gov/MOLA/MYD14A2.006/2017.10.24/MYD14A2.A2017297.h19v01.006.2017310142443.hdf"
r2 = requests.get(url2, cookies=r.cookies)
Have you tried simple basic authentication:
from requests.auth import HTTPBasicAuth
url2='https://e4ftl01.cr.usgs.gov/MOLA/MYD14A2.006/2017.10.24/MYD14A2.A2017297.h19v01.006.2017310142443.hdf'
requests.get(url2, auth=HTTPBasicAuth('user', 'pass'))
or read this example
To download a file using the Requests library with the browser cookies, you can use the next function:
import browser_cookie3
import requests
import shutil
import os
cj = browser_cookie3.brave()
def download_file(url, root_des_path='./'):
    """Download *url* (sending the browser cookies in ``cj``) into
    *root_des_path* and return the local file path.

    Raises ``requests.HTTPError`` on a non-2xx response instead of
    silently saving an error page to disk.
    """
    local_filename = os.path.join(root_des_path, url.split('/')[-1])
    with requests.get(url, cookies=cj, stream=True) as r:
        r.raise_for_status()  # fail loudly on 4xx/5xx
        # Let urllib3 undo any gzip/deflate transfer-encoding; otherwise
        # shutil.copyfileobj would write the compressed wire bytes.
        r.raw.decode_content = True
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename

# Example usage (the original called download_file(link), but `link`
# was never defined and raised NameError):
a = download_file('https://example.com/some/file.hdf')
In this example, cj is the cookies of Brave browser ( you can use ffox or chrome). then, these cj are passed to Requests to download the file.
Note: you need to install the "browser-cookie3" library first:
pip install browser-cookie3

Login into web site using Python

This question has been addresses in various shapes and flavors but I have not been able to apply any of the solutions I read online.
I would like to use Python to log into the site: https://app.ninchanese.com/login
and then reach the page: https://app.ninchanese.com/leaderboard/global/1
I have tried various stuff but without success...
Using POST method:
import urllib
import requests
oURL = 'https://app.ninchanese.com/login'
oCredentials = dict(email='myemail@hotmail.com', password='mypassword')
oSession = requests.session()
oResponse = oSession.post(oURL, data=oCredentials)
oResponse2 = oSession.get('https://app.ninchanese.com/leaderboard/global/1')
Using the authentication function from requests package
import requests
oSession = requests.session()
oResponse = oSession.get('https://app.ninchanese.com/login', auth=('myemail@hotmail.com', 'mypassword'))
oResponse2 = oSession.get('https://app.ninchanese.com/leaderboard/global/1')
Whenever I print oResponse2, I can see that I'm always on the login page so I am guessing the authentication did not work.
Could you please advise how to achieve this?
You have to send the csrf_token along with your login request:
import urllib
import requests
import bs4
URL = 'https://app.ninchanese.com/login'
credentials = dict(email='myemail@hotmail.com', password='mypassword')
session = requests.session()
response = session.get(URL)
html = bs4.BeautifulSoup(response.text)
credentials['csrf_token'] = html.find('input', {'name':'csrf_token'})['value']
response = session.post(URL, data=credentials)
response2 = session.get('https://app.ninchanese.com/leaderboard/global/1')

Python Requests library redirect new url

I've been looking through the Python Requests documentation but I cannot see any functionality for what I am trying to achieve.
In my script I am setting allow_redirects=True.
I would like to know if the page has been redirected to something else, what is the new URL.
For example, if the start URL was: www.google.com/redirect
And the final URL is www.google.co.uk/redirected
How do I get that URL?
You are looking for the request history.
The response.history attribute is a list of responses that led to the final URL, which can be found in response.url.
response = requests.get(someurl)
if response.history:
print("Request was redirected")
for resp in response.history:
print(resp.status_code, resp.url)
print("Final destination:")
print(response.status_code, response.url)
else:
print("Request was not redirected")
Demo:
>>> import requests
>>> response = requests.get('http://httpbin.org/redirect/3')
>>> response.history
(<Response [302]>, <Response [302]>, <Response [302]>)
>>> for resp in response.history:
... print(resp.status_code, resp.url)
...
302 http://httpbin.org/redirect/3
302 http://httpbin.org/redirect/2
302 http://httpbin.org/redirect/1
>>> print(response.status_code, response.url)
200 http://httpbin.org/get
This is answering a slightly different question, but since I got stuck on this myself, I hope it might be useful for someone else.
If you want to use allow_redirects=False and get directly to the first redirect object, rather than following a chain of them, and you just want to get the redirect location directly out of the 302 response object, then r.url won't work. Instead, it's the "Location" header:
r = requests.get('http://github.com/', allow_redirects=False)
r.status_code # 302
r.url # http://github.com, not https.
r.headers['Location'] # https://github.com/ -- the redirect destination
I think requests.head instead of requests.get will be more safe to call when handling url redirect. Check a GitHub issue here:
r = requests.head(url, allow_redirects=True)
print(r.url)
the documentation has this blurb https://requests.readthedocs.io/en/master/user/quickstart/#redirection-and-history
import requests
r = requests.get('http://www.github.com')
r.url
#returns https://www.github.com instead of the http page you asked for
For python3.5, you can use the following code:
import urllib.request
res = urllib.request.urlopen(starturl)
finalurl = res.geturl()
print(finalurl)
I wrote the following function to get the full URL from a short URL (bit.ly, t.co, ...)
import requests
def expand_short_url(url):
    """Return the target of a short URL (bit.ly, t.co, ...).

    Issues a single HEAD request without following redirects and reads
    the ``Location`` header; the input URL is returned unchanged when
    the response is not a redirect.
    """
    from urllib.parse import urljoin  # stdlib; local import keeps the snippet self-contained
    r = requests.head(url, allow_redirects=False)
    r.raise_for_status()
    # 300-399 covers every redirect status (the original skipped 300).
    if 300 <= r.status_code < 400:
        # Location may be a relative reference (RFC 7231 section 7.1.2);
        # resolve it against the requested URL.
        url = urljoin(url, r.headers.get('Location', url))
    return url
Usage (short URL is this question's url):
short_url = 'https://tinyurl.com/' + '4d4ytpbx'
full_url = expand_short_url(short_url)
print(full_url)
Output:
https://stackoverflow.com/questions/20475552/python-requests-library-redirect-new-url
I wasn't able to use requests library and had to go different way. Here is the code that I post as solution to this post. (To get redirected URL with requests)
This way you actually open the browser, wait for it to log the URL in the history, and then read the last URL from that history. I wrote this code for Google Chrome, but you should be able to follow along if you are using a different browser.
import webbrowser
import sqlite3
import time  # was missing in the original: time.sleep() raised NameError
import pandas as pd
import shutil

# Open the short URL in the default browser so Chrome resolves the
# redirect and records the final URL in its history database.
webbrowser.open("https://twitter.com/i/user/2274951674")

# Chrome's history is an SQLite file that is locked while the browser
# runs, so we work on a copy instead of the original.
source_file = 'C:\\Users\\{your_user_id}\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\History'
destination_file = 'C:\\Users\\{user}\\Downloads\\History'

time.sleep(30)  # give the browser time to flush the visit to disk
shutil.copy(source_file, destination_file)

con = sqlite3.connect(destination_file)  # connect to the copied history DB
cursor = con.execute("SELECT * FROM urls")
names = [description[0] for description in cursor.description]
urls = cursor.fetchall()
con.close()

df_history = pd.DataFrame(urls, columns=names)
# The most recently visited URL is the redirect target we opened above.
last_url = df_history.loc[len(df_history) - 1, 'url']
print(last_url)
>>https://twitter.com/ozanbayram01
All the answers are applicable where the final url exists/working fine.
In case, final URL doesn't seems to work then below is way to capture all redirects.
There was scenario where final URL isn't working anymore and other ways like url history give error.
Code Snippet
# Follow the redirect chain hop by hop, printing every intermediate
# URL; the loop ends when a response no longer carries a Location
# header, i.e. we reached the final (possibly dead) destination.
long_url = ''
url = 'http://example.com/bla-bla'
try:
    while True:
        # headers is case-insensitive, so 'location' matches 'Location'.
        long_url = requests.head(url).headers['location']
        print(long_url)
        url = long_url
except KeyError:
    # Narrowed from a bare `except:`, which also swallowed network
    # errors and KeyboardInterrupt.  Missing header == end of chain.
    print(long_url)

How can I get the final redirect URL, including the path, in Python? (urllib2.urlopen().geturl() isn't doing it) [duplicate]

Python's urllib2 follows 3xx redirects to get the final content. Is there a way to make urllib2 (or some other library such as httplib2) also follow meta refreshes? Or do I need to parse the HTML manually for the refresh meta tags?
Here is a solution using BeautifulSoup and httplib2 (and certificate based authentication):
import BeautifulSoup
import httplib2
def meta_redirect(content):
    """Return the URL from a ``<meta http-equiv="Refresh">`` tag in
    *content*, or ``None`` when the page has no meta refresh.
    """
    soup = BeautifulSoup.BeautifulSoup(content)
    result = soup.find("meta", attrs={"http-equiv": "Refresh"})
    if result:
        # Split only once: the target URL may itself contain ';', and
        # content="5" (no URL part) would crash a plain two-way unpack.
        parts = result["content"].split(";", 1)
        if len(parts) == 2:
            wait, text = parts
            if text.strip().lower().startswith("url="):
                return text.strip()[4:]
    return None
def get_content(url, key, cert):
    """Fetch *url* using client-certificate authentication, following
    any chain of meta-refresh redirects, and return the final content.
    """
    h = httplib2.Http(".cache")
    h.add_certificate(key, cert, "")
    resp, content = h.request(url, "GET")
    # Follow meta-refresh redirects, parsing each page only once
    # (the original called meta_redirect() twice per hop).
    target = meta_redirect(content)
    while target:
        resp, content = h.request(target, "GET")
        target = meta_redirect(content)
    return content
A similar solution using the requests and lxml libraries. Also does a simple check that the thing being tested is actually HTML (a requirement in my implementation). Also is able to capture and use cookies by using the request library's sessions (sometimes necessary if redirection + cookies are being used as an anti-scraping mechanism).
import magic
import mimetypes
import requests
from lxml import html
from urlparse import urljoin
def test_for_meta_redirections(r):
    """Check response *r* for a meta-refresh redirection.

    Returns ``(True, url)`` when the body is HTML and carries a
    ``<meta http-equiv="refresh">`` tag, ``(False, None)`` otherwise.
    """
    mime = magic.from_buffer(r.content, mime=True)
    extension = mimetypes.guess_extension(mime)
    if extension == '.html':
        html_tree = html.fromstring(r.text)
        # translate() makes the http-equiv comparison case-insensitive.
        # (The '#' characters in the original post were mangled '@'s —
        # XPath attributes need the '@' axis.)
        attrs = html_tree.xpath(
            "//meta[translate(@http-equiv, 'REFSH', 'refsh') = 'refresh']/@content")
        if attrs:  # guard: indexing an empty result raised IndexError
            # partition splits at the first ';' only — the target URL
            # may itself contain semicolons.
            wait, _, text = attrs[0].partition(";")
            if text.lower().startswith("url="):
                url = text[4:]
                if not url.startswith('http'):
                    # Relative URL: resolve against the page's own URL.
                    url = urljoin(r.url, url)
                return True, url
    return False, None
def follow_redirections(r, s):
    """Follow meta-refresh redirections, if any, and return the final
    response.  Iterative equivalent of the recursive original.
    """
    redirected, url = test_for_meta_redirections(r)
    while redirected:
        r = s.get(url)
        redirected, url = test_for_meta_redirections(r)
    return r
Usage:
s = requests.session()
r = s.get(url)
# test for and follow meta redirects
r = follow_redirections(r, s)
OK, seems no library supports it so I have been using this code:
import urllib2
import urlparse
import re
def get_hops(url):
    # Matches the URL inside a <meta ... url=...> refresh tag.
    redirect_re = re.compile('<meta[^>]*?url=(.*?)["\']', re.IGNORECASE)
    # hops collects every URL visited, most recent first (index 0).
    hops = []
    while url:
        if url in hops:
            # Redirect loop detected: stop following.
            url = None
        else:
            hops.insert(0, url)
            response = urllib2.urlopen(url)
            if response.geturl() != url:
                # An HTTP-level redirect happened; record the landing URL.
                hops.insert(0, response.geturl())
            # check for redirect meta tag
            match = redirect_re.search(response.read())
            if match:
                # Meta refresh found: resolve the (possibly relative)
                # target against the current URL and keep following.
                url = urlparse.urljoin(url, match.groups()[0].strip())
            else:
                url = None
    return hops
If you dont want to use bs4 ,you can use lxml like this:
from lxml.html import soupparser
def meta_redirect(content):
    """Return the meta-refresh target URL found in *content*, or
    ``None`` when the page has no meta refresh tag.
    """
    root = soupparser.fromstring(content)
    # XPath attribute axis is '@' — the '#' signs in the original post
    # were mangled '@' characters and made the expression invalid.
    result_url = root.xpath('//meta[@http-equiv="refresh"]/@content')
    if not result_url:
        return None
    result_url = str(result_url[0])
    # The attribute may spell the key 'URL=' or 'url='; try both.
    urls = (result_url.split('URL=')
            if len(result_url.split('url=')) < 2
            else result_url.split('url='))
    return urls[1] if len(urls) >= 2 else None
Use BeautifulSoup or lxml to parse the HTML.

Categories