Ending Requests Python - python

I'm using a proxy service to cycle requests through different proxy IPs for web scraping. Do I need to build in functionality to end requests so as not to overload the web server I'm scraping?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import concurrent.futures
list_of_urls = ['https://www.example']
NUM_RETRIES = 3
NUM_THREADS = 5
def scrape_url(url):
    params = {'api_key': 'API_KEY', 'url': url}
    # send request to scraperapi, and automatically retry failed requests
    for _ in range(NUM_RETRIES):
        try:
            response = requests.get('http://api.scraperapi.com/', params=urlencode(params))
            if response.status_code in [200, 404]:
                ## escape for loop if the API returns a successful response
                break
        except requests.exceptions.ConnectionError:
            response = ''
    ## parse data if 200 status code (successful response)
    if response.status_code == 200:
        ## do stuff
        pass

with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    executor.map(scrape_url, list_of_urls)

Hi, if you are using the latest version of requests, then it is most probably keeping the TCP connection alive. What you can do is define a requests session, set it up not to keep connections alive, and then proceed normally with your code:
s = requests.session()
s.config['keep_alive'] = False
As discussed here, there really isn't such a thing as an "HTTP connection"; what httplib refers to as an HTTPConnection is really the underlying TCP connection, which doesn't know much about your requests at all. Requests abstracts that away, and you won't ever see it.
The newest version of Requests does in fact keep the TCP connection alive after your request. If you do want your TCP connections to close, you can just configure the requests session not to use keep-alive.
Alternatively:
s = requests.session(config={'keep_alive': False})
Updated version of your code
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import concurrent.futures
list_of_urls = ['https://www.example']
NUM_RETRIES = 3
NUM_THREADS = 5
def scrape_url(url):
    params = {'api_key': 'API_KEY', 'url': url}
    s = requests.session()
    s.config['keep_alive'] = False
    # send request to scraperapi, and automatically retry failed requests
    for _ in range(NUM_RETRIES):
        try:
            response = s.get('http://api.scraperapi.com/', params=urlencode(params))
            if response.status_code in [200, 404]:
                ## escape for loop if the API returns a successful response
                break
        except requests.exceptions.ConnectionError:
            response = ''
    ## parse data if 200 status code (successful response)
    if response.status_code == 200:
        ## do stuff
        pass

with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    executor.map(scrape_url, list_of_urls)
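Note: session.config only existed in very old versions of requests; current releases no longer have it. A rough modern equivalent, if you want connections torn down rather than kept alive, is to send a Connection: close header on the session, or simply to close the session when you are done. A minimal sketch of that idea (the ScraperAPI endpoint and parameters are the ones from the question):
import requests
from urllib.parse import urlencode

params = {'api_key': 'API_KEY', 'url': 'https://www.example'}

# ask the server not to keep the TCP connection alive after each request
with requests.Session() as s:
    s.headers['Connection'] = 'close'
    response = s.get('http://api.scraperapi.com/', params=urlencode(params))
    print(response.status_code)
# leaving the with-block also closes the session and releases any pooled connections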

Related

urllib3 and requests: status code 403 in urllib3, while requests works fine

When I run the code, only the requests version works and returns status code 200; the urllib3 version fails and returns status code 403. I want urllib3 to return the same result as requests. Here are the two snippets:
requests code:
import requests
proxies = {"http":"http://username:password#host:port"} #proxy protocol
r = requests.get('https://www.examples.com', proxies=proxies) #make the request
print(r.status_code) #return status code 200, successful request
urllib3 code:
import urllib3
auth_headers = urllib3.make_headers(proxy_basic_auth='username:password') #proxy authorization build
Proxy = urllib3.ProxyManager(proxy_url="http://host:port", proxy_headers=auth_headers)
url = 'https://www.example.com/'
r = Proxy.request('GET', url) #make the request
print(r.status)  # returns status code 403; I want it to return 200 like the requests code does
Edit: it also works fine with PoolManager().
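One thing worth ruling out (this is an assumption on my part, not something confirmed in the question) is that requests and urllib3 send different default headers, for example User-Agent, and some servers answer 403 based on those. You can pass explicit headers to the urllib3 request so that both clients look the same on the wire:
import urllib3

auth_headers = urllib3.make_headers(proxy_basic_auth='username:password')  # proxy authorization
proxy = urllib3.ProxyManager(proxy_url="http://host:port", proxy_headers=auth_headers)

# headers that roughly mimic a default requests call; adjust them to whatever your working requests code sends
request_headers = {
    "User-Agent": "python-requests/2.31.0",
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}

r = proxy.request('GET', 'https://www.example.com/', headers=request_headers)
print(r.status)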

Using proxies with playwright in python

I'm using playwright to extract data from a website, and I want to use proxies which I get from this website: https://www.proxy-list.download/HTTPS. It doesn't work, and I'm wondering if this is because the proxies are free? If so, does anyone know where I can find proxies that will work?
This is my code :
from playwright.sync_api import sync_playwright
import time
url = "https://www.momox-shop.fr/livres-romans-et-litterature-C055/"
with sync_playwright() as p:
    browser = p.firefox.launch(
        headless=False,
        proxy={
            'server': '209.166.175.201:3128'
        })
    page = browser.new_page()
    page.goto(url)
    time.sleep(5)
Thank you !
Yes, according to your link, all of those proxies are "dead". Before using proxies, check them first; here is one possible solution:
import json
import requests
from pythonping import ping
from concurrent.futures import ThreadPoolExecutor

check_proxies_url = "https://httpbin.org/ip"
good_proxy = set()

# proxy_lst = requests.get("https://www.proxy-list.download/api/v1/get", params={"type": "https"})
# proxies = [proxy for proxy in proxy_lst.text.split('\r\n') if proxy]
proxy_lst = requests.get("http://proxylist.fatezero.org/proxy.list")
proxies = (f"{json.loads(data)['host']}:{json.loads(data)['port']}" for data in proxy_lst.text.split('\n') if data)

def get_proxies(proxy):
    proxies = {
        "https": proxy,
        "http": proxy
    }
    try:
        response = requests.get(url=check_proxies_url, proxies=proxies, timeout=2)
        response.raise_for_status()
        if ping(target=proxies["https"].split(':')[0], count=1, timeout=2).rtt_avg_ms < 150:
            good_proxy.add(proxies["https"])
            print(f"Good proxies: {proxies['https']}")
    except Exception:
        print(f"Bad proxy: {proxies['https']}")

with ThreadPoolExecutor() as executor:
    executor.map(get_proxies, proxies)

print(good_proxy)
This gets a list of active proxies with a ping of up to 150 ms.
Output:
{'209.166.175.201:8080', '170.39.194.156:3128', '20.111.54.16:80', '20.111.54.16:8123'}
But in any case, these are shared proxies and their performance is not guaranteed. If you want to be sure that your parser will work, it is better to buy a proxy.
I ran your code with the proxy '170.39.194.156:3128' obtained this way, and for now it works.
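If you do end up buying a private (authenticated) proxy, Playwright's launch option also accepts credentials. A minimal sketch, with placeholder server, username and password values:
from playwright.sync_api import sync_playwright

url = "https://www.momox-shop.fr/livres-romans-et-litterature-C055/"

with sync_playwright() as p:
    browser = p.firefox.launch(
        headless=False,
        proxy={
            'server': 'http://proxy-host:port',  # placeholder proxy address
            'username': 'proxy-user',            # placeholder credentials
            'password': 'proxy-pass',
        })
    page = browser.new_page()
    page.goto(url)
    print(page.title())
    browser.close()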

Python urllib request always results in Error 400: Bad Request

Thanks for reading. For a small research project, I'm trying to gather some data from KBB (www.kbb.com). However, I always get a "urllib.error.HTTPError: HTTP Error 400: Bad Request" error. I can access other websites with this simple piece of code, so I'm not sure whether this is an issue with the code or with this specific website.
Maybe someone can point me in the right direction.
from urllib import request as urlrequest
proxy_host = '23.107.176.36:32180'
url = "https://www.kbb.com/gmc/canyon-extended-cab/2018/"
req = urlrequest.Request(url)
req.set_proxy(proxy_host, 'https')
page = urlrequest.urlopen(req)
print(page)
There are two issues here, but one solution, as I found below:
1. The proxy server is refusing the connection.
2. Without the proxy you would need some form of authentication for the server; in every case it responds with 403 Forbidden.
Using urllib
from urllib import request as urlrequest
proxy_host = '23.107.176.36:32180'
url = "https://www.kbb.com/gmc/canyon-extended-cab/2018/"
req = urlrequest.Request(url)
# req.set_proxy(proxy_host, 'https')
page = urlrequest.urlopen(req)
print(req)
> urllib.error.HTTPError: HTTP Error 403: Forbidden
Using Requests
import requests
url = "https://www.kbb.com/gmc/canyon-extended-cab/2018/"
res = requests.get(url)
print(res)
# >>> <Response [403]>
Using Postman
Edit: solution
Setting the timeout a little longer makes it work. However, I had to retry several times, because the proxy sometimes just doesn't respond.
import urllib.request
proxy_host = '23.107.176.36:32180'
url = "https://www.kbb.com/gmc/canyon-extended-cab/2018/"
proxy_support = urllib.request.ProxyHandler({'https' : proxy_host})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
res = urllib.request.urlopen(url, timeout=1000)  # set a generous timeout so the slow proxy has time to respond
print(res.read())
Result
b'<!doctype html><html lang="en"><head><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5,minimum-scale=1"><meta http-equiv="x-dns-prefetch-control" content="on"><link rel="dns-prefetch preconnect" href="//securepubads.g.doubleclick.net" crossorigin><link rel="dns-prefetch preconnect" href="//c.amazon-adsystem.com" crossorigin><link .........
Using Requests
import requests
proxy_host = '23.107.176.36:32180'
url = "https://www.kbb.com/gmc/canyon-extended-cab/2018/"
# NOTE: we need a longer timeout so the proxy has time to respond, and verify=False to skip an SSL certificate error
r = requests.get(url, proxies={"https": proxy_host}, timeout=90, verify=False)  # timeout is in seconds
print(r.text)
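Since the proxy sometimes simply doesn't answer, the manual retrying mentioned above can also be automated. A minimal sketch using requests' built-in retry support via urllib3's Retry (the retry count and back-off are illustrative assumptions, not values from the answer):
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

proxy_host = '23.107.176.36:32180'
url = "https://www.kbb.com/gmc/canyon-extended-cab/2018/"

session = requests.Session()
# retry failed attempts a few times with an increasing back-off between them
retries = Retry(total=3, backoff_factor=2, status_forcelist=[502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

r = session.get(url, proxies={"https": proxy_host}, timeout=90, verify=False)
print(r.status_code)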
Your code appears to work fine without the set_proxy statement; I think it is most likely that your proxy server is rejecting the request, rather than KBB.

problem with python requests while using proxies

I am trying to scrape a website using Python requests. We can only scrape the website through proxies, so I implemented the code for that. However, it is banning all my requests even when I am using proxies, so I used the website https://api.ipify.org/?format=json to check whether the proxies were working properly. I found it showing my original IP even while using proxies. The code is below:
from concurrent.futures import ThreadPoolExecutor
import string, random
import requests
import sys
http = []
#loading http into the list
with open(sys.argv[1], "r", encoding="utf-8") as data:
    for i in data:
        http.append(i[:-1])
    data.close()

url = "https://api.ipify.org/?format=json"

def fetch(session, url):
    for i in range(5):
        proxy = {'http': 'http://' + random.choice(http)}
        try:
            with session.get(url, proxies=proxy, allow_redirects=False) as response:
                print("Proxy : ", proxy, " | Response : ", response.text)
                break
        except:
            pass
    # #timer(1, 5)

if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=1) as executor:
        with requests.Session() as session:
            executor.map(fetch, [session] * 100, [url] * 100)
            executor.shutdown(wait=True)
I have tried a lot but don't understand why my own IP address is shown instead of the proxy's IPv4 address. You will find the output of the code here: https://imgur.com/a/z02uSvi
The problem is that you have set a proxy only for http while sending the request to a website that uses https. The solution is simple:
proxies = dict.fromkeys(('http', 'https', 'ftp'), 'http://' + random.choice(http))
# You can set proxy for session
session.proxies.update(proxies)
response = session.get(url)
# Or you can pass proxy as argument
response = session.get(url, proxies=proxies)
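Applied to the fetch function from the question, the fix might look roughly like this (a sketch; the http proxy list and the session are the ones defined in the question's code, and the timeout value is an illustrative assumption):
import random
import requests

def fetch(session, url):
    for _ in range(5):
        # use the same proxy for both plain-HTTP and HTTPS traffic
        proxies = dict.fromkeys(('http', 'https'), 'http://' + random.choice(http))
        try:
            response = session.get(url, proxies=proxies, allow_redirects=False, timeout=5)
            print("Proxy:", proxies['https'], "| Response:", response.text)
            break
        except requests.RequestException:
            pass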

Webscraping: HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop

I am new at this, but I am trying to scrape data from a website that requires a log-in, and I am getting an error when trying to open it. It appears that the problem is with the cookies, which are not being properly stored?
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from http.cookiejar import CookieJar
import urllib
username = 'xxx'
password = 'xxx'
values = {'email': username, 'password': password}
session = requests.session()
login_url = 'https://login.aripaev.ee/Account/Login?ReturnUrl=%2fOAuth%2fAuthorize%3fclient_id%3dinfopank%26redirect_uri%3dhttps%253A%252F%252Finfopank.ee%252FAccount%252FLogin%253FreturnUrl%253D%25252F%2526returnAsRedirect%253DFalse%26state%3dLjNuwARtELJnVPcF8ka2Jg%26scope%3d%252FUserDataService%252Fjson%252FProfile%2520%252FUserDataService%252Fjson%252FPermissions%2520%252FUserDataService%252Fjson%252FOrders%2520%252FUserDataService%252Fv2%252Fjson%252FProfile%2520%252FUserDataService%252Fv2%252Fjson%252FPermissions%2520%252FUserDataService%252Fv2%252Fjson%252FOrders%26response_type%3dcode&client_id=infopank&redirect_uri=https%3A%2F%2Finfopank.ee%2FAccount%2FLogin%3FreturnUrl%3D%252F%26returnAsRedirect%3DFalse&state=LjNuwARtELJnVPcF8ka2Jg&scope=%2FUserDataService%2Fjson%2FProfile%20%2FUserDataService%2Fjson%2FPermissions%20%2FUserDataService%2Fjson%2FOrders%20%2FUserDataService%2Fv2%2Fjson%2FProfile%20%2FUserDataService%2Fv2%2Fjson%2FPermissions%20%2FUserDataService%2Fv2%2Fjson%2FOrders&response_type=code'
url = 'https://infopank.ee/ettevote/1/'
result = session.get(login_url)
result = session.post(login_url, data = values, headers = dict(referer=login_url))
cookieProcessor = urllib.request.HTTPCookieProcessor()
opener = urllib.request.build_opener(cookieProcessor)
page = urlopen(url)
Error message:
HTTPError: HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Found
Any suggestions are welcome - thanks!
Don't mix urllib.request with requests. If you are going to use requests, it will just work fine.
Remove these lines from your program:
from urllib.request import urlopen
from http.cookiejar import CookieJar
import urllib
cookieProcessor = urllib.request.HTTPCookieProcessor()
opener = urllib.request.build_opener(cookieProcessor)
page = urlopen(url)
This code has two issues: it doesn't have the cookies that were stored in the requests session, and the call to urlopen uses the default opener, which has no cookie support at all (opener.open should have been used instead).
Replace this with:
page = session.get(url)
Then the requests.session keeps track of the cookies for you.
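Putting it together, the whole flow using only requests might look like this (a sketch; the URLs and form field names are taken from the question, and I can't verify that the site accepts this particular login form):
import requests
from bs4 import BeautifulSoup

username = 'xxx'
password = 'xxx'
values = {'email': username, 'password': password}

login_url = 'https://login.aripaev.ee/Account/Login?ReturnUrl=%2fOAuth%2fAuthorize%3fclient_id%3dinfopank%26redirect_uri%3dhttps%253A%252F%252Finfopank.ee%252FAccount%252FLogin%253FreturnUrl%253D%25252F%2526returnAsRedirect%253DFalse%26state%3dLjNuwARtELJnVPcF8ka2Jg%26scope%3d%252FUserDataService%252Fjson%252FProfile%2520%252FUserDataService%252Fjson%252FPermissions%2520%252FUserDataService%252Fjson%252FOrders%2520%252FUserDataService%252Fv2%252Fjson%252FProfile%2520%252FUserDataService%252Fv2%252Fjson%252FPermissions%2520%252FUserDataService%252Fv2%252Fjson%252FOrders%26response_type%3dcode&client_id=infopank&redirect_uri=https%3A%2F%2Finfopank.ee%2FAccount%2FLogin%3FreturnUrl%3D%252F%26returnAsRedirect%3DFalse&state=LjNuwARtELJnVPcF8ka2Jg&scope=%2FUserDataService%2Fjson%2FProfile%20%2FUserDataService%2Fjson%2FPermissions%20%2FUserDataService%2Fjson%2FOrders%20%2FUserDataService%2Fv2%2Fjson%2FProfile%20%2FUserDataService%2Fv2%2Fjson%2FPermissions%20%2FUserDataService%2Fv2%2Fjson%2FOrders&response_type=code'
url = 'https://infopank.ee/ettevote/1/'

with requests.Session() as session:
    # the first GET picks up the initial cookies, the POST performs the login,
    # and the same session then sends those cookies with every later request
    session.get(login_url)
    session.post(login_url, data=values, headers=dict(referer=login_url))
    page = session.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    print(page.status_code, soup.title)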
