I can't download the file from dextools with urllib - Python

I want to download JSON from a dextools URL. The URL is:
https://www.dextools.io/chain-bsc/api/Pancakeswap/pools?timestampToShow=1669287222&range=1
My code:
from urllib.request import urlopen, Request
import json
download_url = "https://www.dextools.io/chain-bsc/api/Pancakeswap/pools?timestampToShow=1657123280&range=1"
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
req = Request(download_url, headers=header)
webpage = urlopen(req).read()
data = json.loads(webpage)
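
When urlopen is refused, it raises an HTTPError, and the bare traceback hides what the server actually sent back. A minimal debugging sketch (same request, but catching the error to inspect the status code and body):
from urllib.request import urlopen, Request
from urllib.error import HTTPError
import json

download_url = "https://www.dextools.io/chain-bsc/api/Pancakeswap/pools?timestampToShow=1657123280&range=1"
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
req = Request(download_url, headers=header)
try:
    data = json.loads(urlopen(req).read())
except HTTPError as e:
    # e.code is the HTTP status; e.read() is the body the server returned with the error.
    print(e.code, e.read()[:200])
If the status is 403, the endpoint is likely behind bot protection, in which case a browser-like User-Agent alone won't be enough; see the cloudscraper answer further down.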

Related

Extracting the redirected link from a URL

I am trying to extract the link that this URL redirects to. When I click the link in a browser I am taken to the target page, and I want to store that page's URL. I tried fetching it with requests, but it doesn't give a usable response:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)'}
url = 'https://www.forexfactory.com/news/403059-manufacturing-in-us-expands-after-reaching-three-year-low/hit'
response = requests.get(url, headers=headers)
print(response)  # Output: <Response [503]>
So, how can I extract this link?
You can use cloudscraper to get past the Cloudflare check and follow the redirect:
import cloudscraper

scraper = cloudscraper.create_scraper()  # behaves like a requests.Session
url = 'https://www.forexfactory.com/news/403059-manufacturing-in-us-expands-after-reaching-three-year-low/hit'
r = scraper.get(url)
print(r.url)  # the final URL after the redirect
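Note that cloudscraper is a third-party package (pip install cloudscraper), and create_scraper() returns a requests.Session subclass, so the usual get/post API applies.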
Alternatively, you can use the requests library directly:
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)'}
url = 'https://www.forexfactory.com/news/403059-manufacturing-in-us-expands-after-reaching-three-year-low/hit'
response = requests.get(url, headers=headers)
print(response.url)  # requests follows redirects by default
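If you want to see each hop rather than just the final URL, requests records ordinary HTTP redirects in response.history. A minimal sketch:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)'}
url = 'https://www.forexfactory.com/news/403059-manufacturing-in-us-expands-after-reaching-three-year-low/hit'
response = requests.get(url, headers=headers)
# Each entry in response.history is an intermediate redirect response.
for hop in response.history:
    print(hop.status_code, hop.url)
print('final:', response.url)
This only covers server-side (3xx) redirects; JavaScript- or Cloudflare-driven redirects won't appear in the history.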

You don't have permission to access "http://www.carrefour.pk/" on this server. Reference #18.451d2017.1615456534.6b4445

I'm trying to scrape data from the Carrefour website with Python. I've tried Scrapy, Beautiful Soup, and Selenium, but nothing seems to work; I keep getting an error saying I don't have permission to access the site. Is there any way to scrape this website? The code is attached below.
from requests_html import HTMLSession

session = HTMLSession()
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
resp = session.get("https://www.carrefour.pk/", headers=headers)
resp.html.render()  # execute the page's JavaScript
a = resp.html.html
print(a)
I think you are using the wrong headers. These headers work fine for me:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'}
Or, in full:
import requests
from bs4 import BeautifulSoup as bs

# Block cookies
from http import cookiejar  # Python 2: import cookielib as cookiejar

class BlockAll(cookiejar.CookiePolicy):
    return_ok = set_ok = domain_return_ok = path_return_ok = lambda self, *args, **kwargs: False
    netscape = True
    rfc2965 = hide_cookie2 = False

s = requests.Session()
s.cookies.set_policy(BlockAll())

# Get URL
url = "https://www.carrefour.pk"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'}
r = s.get(url, headers=headers)
soup = bs(r.text, 'html.parser')
print(soup)
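For context: the "Reference #..." line in the error message is characteristic of Akamai's bot protection, which fingerprints more than just the User-Agent, so results can vary by network and over time. Blocking cookies, as above, keeps the session from accumulating tracking cookies that such filters can key on.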

How to change the user agent in urllib2

I'm trying to access a page using the following:
page = urllib2.urlopen(full_url)
soup = BeautifulSoup(page, 'html.parser')
li_post_id = "post-" + str(post_id)
li_soup = soup.find('li', attrs={'id':li_post_id})
This works fine on my Ubuntu machine, but when I run it on my Windows server I get a 403 Forbidden error, so I assume the issue is the user agent.
How do I change it to, say, Firefox? I have only seen tutorials for changing the user agent with requests, and I don't want to rewrite all of my code.
You could try picking a random user agent from a pool on each request:
import random
import requests
from bs4 import BeautifulSoup

agents = [
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
]
headers = {"User-Agent": random.choice(agents)}
response = requests.get(full_url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
Changing the header has nothing to do with BeautifulSoup, which only parses HTML. You need to set it on the urllib opener itself:
Python 3
import urllib.request

opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Some user agent')]
response = opener.open('http://www.stackoverflow.com')
Python 2.7
import urllib2

opener = urllib2.build_opener()
opener.addheaders = [('User-Agent', 'Some user agent')]
response = opener.open('http://www.stackoverflow.com')
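If you'd rather set the header per request than on an opener, the standard library also accepts a headers dict on Request; a minimal sketch (the Firefox UA string is just an example):
import urllib.request

req = urllib.request.Request(
    'http://www.stackoverflow.com',
    headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'})
response = urllib.request.urlopen(req)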

I am unable to get all the links present on the page

I am using BeautifulSoup and requests to fetch a webpage. I have set the user agent and the cookie, and yet I fail to receive all the links from the webpage.
Here's my code:
import bs4 as bs
import urllib.request
import requests

# Earlier attempt with plain urllib:
#sauce = urllib.request.urlopen('https://github.com/search?q=javascript&type=Code&utf8=%E2%9C%93').read()
#soup = bs.BeautifulSoup(sauce,'lxml')

# Earlier attempt to inspect the session cookies:
'''
session = requests.Session()
response = session.get(url)
print(session.cookies.get_dict())
'''

url = 'https://github.com/search?q=javascript&type=Code&utf8=%E2%9C%93'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
           'Cookie': '_gh_sess=eyJzZXNzaW9uX2lkIjoiMDNhMGI2NjQxZjY4Mjc1YmQ3ZjAyNmJiODM2YzIzMTUiLCJfY3NyZl90b2tlbiI6IlJJOUtrd3E3WVFOYldVUzkwdmUxZ0Z4MHZLN3M2eE83SzhIdVJTUFVsVVU9In0%3D--4485d36d4c86aec01cde254e34db68005193546e; logged_in=no'}
response = requests.get(url, headers=headers)
print(response.cookies)
soup = bs.BeautifulSoup(response.content, 'lxml')
for link in soup.find_all('a'):
    print(link.get('href'))
Is there something I'm missing? In a browser I get links to all the code results, whereas in the script I get only a few of the links, and none with the code. The page opens perfectly in the browser.
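For what it's worth, GitHub only renders code-search results for authenticated sessions, so a request carrying a logged_in=no cookie will not see them; the official search API is more reliable. A minimal sketch, assuming you have a personal access token (the token value and the repo qualifier below are placeholders):
import requests

TOKEN = 'ghp_your_token_here'  # hypothetical token; code search requires authentication
resp = requests.get(
    'https://api.github.com/search/code',
    params={'q': 'javascript repo:jquery/jquery'},
    headers={'Authorization': 'token ' + TOKEN,
             'Accept': 'application/vnd.github+json'})
for item in resp.json().get('items', []):
    print(item['html_url'])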
