How to conditionally disable proxies for some requests in Playwright Python

try:
    with sync_playwright() as p:
        driver = p.firefox.launch(headless=headless, proxy={
            "server": 'fa****y.com:10000',
            'username': 'iu***53cpeuytpna0cc7bd7',
            "password": '**W7**Fm5',
        })
        context = driver.new_context()
        page = context.new_page()
        page.route("**/*", lambda route: route.abort()
                   if route.request.resource_type == "image"
                   or route.request.resource_type == "stylesheet"
                   or route.request.resource_type == "svg"
                   else route.continue_()
                   )
        # page.set_default_timeout(100000)
        try:
            url = 'https://accounts.google.com/v3/signin/identifier?dsh=S-536847501%3A1663770960047319&continue=https%3A%2F%2Fplay.google.com%2Fconsole%2Fsignup&followup=https%3A%2F%2Fplay.google.com%2Fconsole%2Fsignup&passive=1209600&service=androiddeveloper&flowName=GlifWebSignIn&flowEntry=ServiceLogin&ifkv=AQDHYWoj7hmeMm5YT3PrA0sojYcd3nnuAx2JkCLnedM0A9sCEUG9nrlRYD-grtVE1CcBagVSvXOG'
            page.goto(url)
        except Exception:
            print(" [+] Time out Error ")
        print(Fore.LIGHTBLUE_EX + " [+] Start >>> " + self.gmail)
        page.fill('id=identifierId', self.gmail)
        btn = '#identifierNext > div > button'
        page.click(btn)
        page.wait_for_timeout(3000)
        inbpass = '#password > div.aCsJod.oJeWuf > div > div.Xb9hP > input'
        page.fill(inbpass, self.password)
        btnpass = '#passwordNext > div > button'
        page.click(btnpass)
        time.sleep(3)
        page.wait_for_timeout(3000)
        try:
            page.locator("text=I understand").click(timeout=10000)
            page.wait_for_timeout(1500)
            sleep(1)
        except:
            pass
        try:
            page.locator("div[role=\"link\"]:has-text(\"Confirm your recovery email\")").click()
            page.wait_for_timeout(3000)
            page.locator("[aria-label=\"Enter recovery email address\"]").fill(self.recvery)
            page.wait_for_timeout(3000)
            time.sleep(3)
            # Click button:has-text("Next")
            page.locator("button:has-text(\"Next\")").click()
            time.sleep(10)
        except:
            pass
        page.locator("text=YourselfChoose if your account is for personal >> button").click()
        time.sleep(3)
        page.wait_for_timeout(1500)
**AT THIS LINE:** from this point on I want to stop using the proxy, because it is metered (I pay per GB) and I want to use only my local internet connection from this line to the end. Is there any way to do this?
I want to stop proxy usage once the script reaches this click, for two reasons:
One: proxy usage is already high and I don't need the proxy anymore for the next steps.
Two: I want the remaining steps to be faster, and the proxy is slow once it reaches this page.
Please help me with code or any other approach; I don't know how to solve this.

There is no direct way to achieve this, but there is a workaround using routing. Basically, once you no longer require the proxy, you route all requests Playwright makes to a handler, which bypasses the proxy, fetches the resource directly, and forwards the response back to Playwright. However, this approach only works with the async version of Playwright (otherwise executing a blocking call from a route handler would block further network traffic too). Consider the code below:
import aiohttp, asyncio
from playwright.async_api import Route


def no_proxy_route(session):
    """
    Bypasses any proxy and routes the request directly. It's a factory function so we can store the
    session and don't need to create a new one every time.
    """
    async def route_handler(route: Route):
        request = route.request
        try:
            body, status, headers = await fetch(session, request)
        # Abort the route in case of a connection error or timeout
        except (aiohttp.ClientConnectionError, asyncio.TimeoutError):
            await route.abort()
            return
        # If no error, fulfill the route with the given body and headers
        await route.fulfill(status=status, headers=headers, body=body)
    return route_handler


async def fetch(session, request, timeout=5000):
    """
    Fetch the given request using aiohttp
    """
    assert timeout > 0, "timeout must be positive"
    async with session.request(request.method, request.url, headers=request.headers,
                               data=request.post_data, timeout=timeout) as response:
        body = await response.read()
        status = response.status
        headers = response.headers
    return body, status, headers
Now if you want to conditionally disable proxies for some requests, you simply create a matching pattern and route all those requests through no_proxy_route. An example is given below:
### All your previous code
# .
# .
# .
# .
### which required a proxy

# Create an aiohttp session with a dummy cookie jar. This is because we will be passing the cookies
# explicitly and don't want previous requests'/responses' cookies to persist and interfere
session = aiohttp.ClientSession(cookie_jar=aiohttp.DummyCookieJar())

# Create a route to our handler with an appropriate pattern. The pattern below will route ALL subsequent
# requests to the handler. Remember that you can use regex for patterns as well.
await page.route('**/*', no_proxy_route(session))
After this is done, all requests that match the pattern, with which you created the route, will not use a proxy, even if the context has one set. But again, this is only useful if you are using the async API for playwright.
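For context, here is a minimal sketch (not a drop-in replacement for the original script) of how this might fit together in an async rewrite: launch the browser with the proxy, run the steps that still need it, then install the route so everything afterwards goes over the local connection. The proxy details are placeholders, and no_proxy_route is the handler defined above.

import asyncio
import aiohttp
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True, proxy={
            "server": "proxy.example.com:10000",  # placeholder proxy details
            "username": "user",
            "password": "pass",
        })
        context = await browser.new_context()
        page = await context.new_page()

        # ... steps that still need the proxy (sign-in flow, etc.) ...

        # From this point on, bypass the proxy for every request:
        session = aiohttp.ClientSession(cookie_jar=aiohttp.DummyCookieJar())
        await page.route('**/*', no_proxy_route(session))

        # ... remaining steps now run over the local connection ...

        await session.close()
        await browser.close()

asyncio.run(main())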

Related

Is having a concurrent.futures.ThreadPoolExecutor call dangerous in a FastAPI endpoint?

I have the following test code:
import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
I need to use the concurrent.futures.ThreadPoolExecutor part of the code in a FastAPI endpoint.
My concern is the impact of the number of API calls combined with the threads each call spawns: creating too many threads could starve the host, crash the application, and/or crash the host.
Any thoughts or gotchas on this approach?
You should rather use the HTTPX library, which provides an async API. As described in this answer, you spawn a Client and reuse it every time you need it. To make asynchronous requests with HTTPX, you'll need an AsyncClient.
You could control the connection pool size as well, using the limits keyword argument on the Client, which takes an instance of httpx.Limits. For example:
limits = httpx.Limits(max_keepalive_connections=5, max_connections=10)
client = httpx.AsyncClient(limits=limits)
You can adjust the above per your needs. As per the documentation on Pool limit configuration:
max_keepalive_connections, number of allowable keep-alive connections, or None to always allow. (Defaults 20)
max_connections, maximum number of allowable connections, or None for no limits. (Default 100)
keepalive_expiry, time limit on idle keep-alive connections in seconds, or None for no limits. (Default 5)
If you would like to adjust the timeout as well, you can use the timeout parameter to set a timeout on an individual request, or on a Client/AsyncClient instance, which results in the given timeout being used as the default for requests made with this client (see the implementation of the Timeout class as well). You can specify the timeout behaviour in fine-grained detail; for example, setting the read timeout parameter specifies the maximum duration to wait for a chunk of data to be received (i.e., a chunk of the response body). If HTTPX is unable to receive data within this time frame, a ReadTimeout exception is raised. If set to None instead of some positive numerical value, there will be no timeout on read. The default is a 5 second timeout on all operations.
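As a small illustration of a per-request override (the URL and timeout values here are arbitrary), a single call can pass its own Timeout, while the client-level default applies everywhere else:

import asyncio
import httpx

async def fetch_with_override():
    async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client:
        # One-off override: allow up to 30s for reads on this request only
        response = await client.get('https://www.example.com', timeout=httpx.Timeout(5.0, read=30.0))
        return response.status_code

print(asyncio.run(fetch_with_override()))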
You can use await client.aclose() to explicitly close the AsyncClient when you are done with it (this could be done inside a shutdown event handler, for instance).
To run multiple asynchronous operations—as you need to request five different URLs, when your API endpoint is called—you can use the awaitable asyncio.gather(). It will execute the async operations and return a list of results in the same order the awaitables (tasks) were passed to that function.
Working Example:
from fastapi import FastAPI
import httpx
import asyncio

URLS = ['https://www.foxnews.com/',
        'https://edition.cnn.com/',
        'https://www.nbcnews.com/',
        'https://www.bbc.co.uk/',
        'https://www.reuters.com/']

limits = httpx.Limits(max_keepalive_connections=5, max_connections=10)
timeout = httpx.Timeout(5.0, read=15.0)  # 15s timeout on read. 5s timeout elsewhere.
client = httpx.AsyncClient(limits=limits, timeout=timeout)
app = FastAPI()

@app.on_event('shutdown')
async def shutdown_event():
    await client.aclose()

async def send(url, client):
    return await client.get(url)

@app.get('/')
async def main():
    tasks = [send(url, client) for url in URLS]
    responses = await asyncio.gather(*tasks)
    return [r.text[:50] for r in responses]  # return the first 50 chars of each response
If you would like to avoid reading the entire response body into RAM, you could use Streaming responses, as described in this answer and demonstrated below:
# ... rest of the code is the same as above
from fastapi.responses import StreamingResponse

async def send(url, client):
    req = client.build_request('GET', url)
    return await client.send(req, stream=True)

async def iter_content(responses):
    for r in responses:
        async for chunk in r.aiter_text():
            yield chunk[:50]  # return the first 50 chars of each response
            yield '\n'
            break
        await r.aclose()

@app.get('/')
async def main():
    tasks = [send(url, client) for url in URLS]
    responses = await asyncio.gather(*tasks)
    return StreamingResponse(iter_content(responses), media_type='text/plain')

How to check first proxy and start connection and terminate if successful?

I want to use proxy lists but I'm not sure how to do this:
Check if the first connection is established (it does not matter whether it is the first proxy IP or not), run something, then close the program and exit; otherwise go to the next proxy.
This is my code that loops over all proxies:
import requests
from Proxy_List_Scrapper import Scrapper, Proxy, ScrapperException

ALL = 'ALL'
scrapper = Scrapper(category=ALL, print_err_trace=False)

# Get ALL Proxies According to your Choice
data = scrapper.getProxies()

n = 0
while True:
    n += 1
    try:
        for item in data.proxies:
            # Enter a proxy IP address and port.
            url = 'http://api.ipify.org'
            # Send a GET request to the url and pass the proxy as a parameter.
            page = requests.get(url, proxies={"http": f'{item.ip}:{item.port}', "https": f'{item.ip}:{item.port}'})
            # Prints the content of the requested url.
            print(f'proxy is {item.ip}:{item.port} and content is {page.text}')
    except requests.exceptions.ProxyError:
        print(f'error #{n}')
Maybe you should put the try/except inside the for-loop, to catch a problem with one proxy server and continue with the next proxy on the list. And you could use break to finish the loop when you find a working proxy.
Something like this:
import requests
from Proxy_List_Scrapper import Scrapper, Proxy, ScrapperException

scrapper = Scrapper(category='ALL', print_err_trace=False)
data = scrapper.getProxies()

url = 'http://api.ipify.org'

for n, item in enumerate(data.proxies):
    try:
        page = requests.get(url, proxies={"http": f'{item.ip}:{item.port}', "https": f'{item.ip}:{item.port}'})
        print(f'proxy is {item.ip}:{item.port} and content is {page.text}')
        # exit `for`-loop when connection finished without error and it gives a different IP
        if page.text != my_original_ip:
            break
    except requests.exceptions.ProxyError:
        print(f'error #{n} {item.ip}:{item.port}')
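The snippet assumes my_original_ip has been set beforehand; one way to do that (a guess at the intent, reusing the same ipify endpoint) is a plain request without any proxy before the loop:

# Fetch your real public IP once, without a proxy, so the loop can detect when a proxy actually changes it
my_original_ip = requests.get('http://api.ipify.org').text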

Long running requests with asyncio and aiohttp

Apologies for asking with what may be considered redundant, but I'm finding it extremely difficult to figure out what are the current recommended best practices for using asyncio and aiohttp.
I'm working with an API that ultimately returns a link to a generated CSV file. There are two steps in using the API.
Submit a request that triggers a long-running process and returns a status URL.
Poll the status URL until the status_code is 201 and then get the URL of the CSV file from the headers.
Here's a stripped down example of how I can successfully do this synchronously with requests.
import time
import requests

def submit_request(id):
    """Submit request to create CSV for specified id"""
    body = {'id': id}
    response = requests.get(
        url='https://www.example.com/endpoint',
        json=body
    )
    response.raise_for_status()
    return response

def get_status(request_response):
    """Check whether the CSV has been created."""
    status_response = requests.get(
        url=request_response.headers['Location']
    )
    status_response.raise_for_status()
    return status_response

def get_data_url(id, poll_interval=10):
    """Submit request to create CSV for specified ID, wait for it to finish,
    and return the URL of the CSV.
    Wait between status requests based on poll_interval.
    """
    response = submit_request(id)
    while True:
        status_response = get_status(response)
        if status_response.status_code == 201:
            break
        time.sleep(poll_interval)
    data_url = status_response.headers['Location']
    return data_url
What I'd like to do is be able to submit a group of requests at once, and then wait on all of them to be finished. But I'm not clear on how to structure this with asyncio and aiohttp.
One option would be to first submit all of the requests and then use asyncio.gather (or something) to get all of the status URLs. Then start another event loop where I continuously poll the status URLs until they have all completed and I end up with a list of data URLs.
Alternatively, I suppose I could create a single function that submits the request, gets the status URL, and then polls that until it completes. In that case I would just have a single event loop where I submit each of the IDs that I want processed.
If some pseudo code for those options would be useful I can try to provide it. I've looked at a lot of different examples where you submit requests for a bunch of URLs asynchronously -- this for example -- but I'm finding that I get a bit lost when trying to translate them to this slightly more complicated scenario where I submit the request and then get back a new URL to poll.
FYI based on the comments above my current solution is something like this.
import asyncio
import aiohttp

async def get_data_url(session, id):
    url = 'https://www.example.com/endpoint'
    body = {'id': id}
    async with session.post(url=url, json=body) as response:
        response.raise_for_status()
        status_url = response.headers['Location']
    while True:
        async with session.get(url=status_url) as status_response:
            status_response.raise_for_status()
            if status_response.status == 201:
                return status_response.headers['Location']
        await asyncio.sleep(10)

async def main(access_token, id):
    headers = {'token': access_token}
    async with aiohttp.ClientSession(headers=headers) as session:
        data_url = await get_data_url(session, id)
        return data_url
This works though I'm still not sure on best practices for submitting a set of IDs. I think asyncio.gather would work but it looks like it's deprecated. Ideally I would have a queue of say 100 IDs and only have 5 requests running at any given time. I've found some examples like this but they depend on asyncio.Queue which is also deprecated.
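For what it's worth, asyncio.gather and asyncio.Queue themselves are not deprecated (only their explicit loop argument was, and it has since been removed), so one common way to cap concurrency is a shared asyncio.Semaphore. Below is a minimal sketch building on the get_data_url coroutine above; the limit of 5 and the idea of passing a list of IDs are assumptions about the intended usage.

import asyncio
import aiohttp

MAX_CONCURRENT = 5  # arbitrary cap on simultaneous requests

async def get_data_url_limited(semaphore, session, id):
    # Only MAX_CONCURRENT of these bodies can be running at any given time
    async with semaphore:
        return await get_data_url(session, id)

async def main(access_token, ids):
    semaphore = asyncio.Semaphore(MAX_CONCURRENT)
    headers = {'token': access_token}
    async with aiohttp.ClientSession(headers=headers) as session:
        tasks = [get_data_url_limited(semaphore, session, id) for id in ids]
        # gather returns the data URLs in the same order as the ids
        return await asyncio.gather(*tasks)

# data_urls = asyncio.run(main(access_token, ids))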

Can't use https proxies along with reusing the same session within a script built upon asyncio

I'm trying to use an https proxy within async requests, making use of the asyncio library. When it comes to using an http proxy, there is a clear instruction here, but I get stuck in the case of an https proxy. Moreover, I would like to reuse the same session, not create a new session every time I send a request.
I've tried so far (proxies used within the script are directly taken from a free proxy site, so consider them as placeholders):
import asyncio
import aiohttp
from bs4 import BeautifulSoup

proxies = [
    'http://89.22.210.191:41258',
    'http://91.187.75.48:39405',
    'http://103.81.104.66:34717',
    'http://124.41.213.211:41828',
    'http://93.191.100.231:3128'
]

async def get_text(url):
    global proxies, proxy_url
    while True:
        check_url = proxy_url
        proxy = f'http://{proxy_url}'
        print("trying using:", check_url)
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, proxy=proxy, ssl=False) as resp:
                    return await resp.text()
            except Exception:
                if check_url == proxy_url:
                    proxy_url = proxies.pop()

async def field_info(field_link):
    text = await get_text(field_link)
    soup = BeautifulSoup(text, 'lxml')
    for item in soup.select(".summary .question-hyperlink"):
        print(item.get_text(strip=True))

if __name__ == '__main__':
    proxy_url = proxies.pop()
    links = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=50".format(page) for page in range(2,5)]
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(asyncio.gather(*(field_info(url) for url in links)))
    loop.run_until_complete(future)
    loop.close()
How can I use https proxies within the script along with reusing the same session?
This script creates a dictionary proxy_session_map, where keys are proxies and values are sessions. That way we know which session belongs to which proxy.
If there's an error while using a proxy, I add that proxy to the disabled_proxies set so I won't use it again:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from random import choice

proxies = [
    'http://89.22.210.191:41258',
    'http://91.187.75.48:39405',
    'http://103.81.104.66:34717',
    'http://124.41.213.211:41828',
    'http://93.191.100.231:3128'
]

disabled_proxies = set()
proxy_session_map = {}

async def get_text(url):
    while True:
        try:
            available_proxies = [p for p in proxies if p not in disabled_proxies]
            if available_proxies:
                proxy = choice(available_proxies)
            else:
                proxy = None

            if proxy not in proxy_session_map:
                proxy_session_map[proxy] = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=5))

            print("trying using:", proxy)

            async with proxy_session_map[proxy].get(url, proxy=proxy, ssl=False) as resp:
                return await resp.text()
        except Exception as e:
            if proxy:
                print("error, disabling:", proxy)
                disabled_proxies.add(proxy)
            else:
                # we haven't used a proxy, so return an empty string
                return ''

async def field_info(field_link):
    text = await get_text(field_link)
    soup = BeautifulSoup(text, 'lxml')
    for item in soup.select(".summary .question-hyperlink"):
        print(item.get_text(strip=True))

async def main():
    links = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=50".format(page) for page in range(2,5)]
    tasks = [field_info(url) for url in links]
    await asyncio.gather(*tasks)

    # close all sessions:
    for s in proxy_session_map.values():
        await s.close()

if __name__ == '__main__':
    asyncio.run(main())
Prints (for example):
trying using: http://89.22.210.191:41258
trying using: http://124.41.213.211:41828
trying using: http://124.41.213.211:41828
error, disabling: http://124.41.213.211:41828
trying using: http://93.191.100.231:3128
error, disabling: http://124.41.213.211:41828
trying using: http://103.81.104.66:34717
BeautifulSoup to get image name from P class picture tag in Python
Scrape instagram public information from google cloud functions [duplicate]
Webscraping using R - the full website data is not loading
Facebook Public Data Scraping
How it is encode in javascript?
... and so on.

aiohttp: How to efficiently check HTTP headers before downloading response body?

I am writing a web crawler using asyncio/aiohttp. I want the crawler to only download HTML content and skip everything else. I wrote a simple function to filter URLs based on extensions, but this is not reliable because many download links do not include a filename/extension in them.
I could use aiohttp.ClientSession.head() to send a HEAD request, check the Content-Type field to make sure it's HTML, and then send a separate GET request. But this will increase the latency by requiring two separate requests per page (one HEAD, one GET), and I'd like to avoid that if possible.
Is it possible to just send a regular GET request, and set aiohttp into "streaming" mode to download just the header, and then proceed with the body download only if the MIME type is correct? Or is there some (fast) alternative method for filtering out non-HTML content that I should consider?
UPDATE
As requested in the comments, I've included some example code of what I mean by making two separate HTTP requests (one HEAD request and one GET request):
import asyncio
import aiohttp

urls = ['http://www.google.com', 'http://www.yahoo.com']
results = []

async def get_urls_async(urls):
    loop = asyncio.get_running_loop()
    async with aiohttp.ClientSession() as session:
        tasks = []
        for u in urls:
            print(f"This is the first (HEAD) request we send for {u}")
            tasks.append(loop.create_task(session.get(u)))

        results = []
        for t in asyncio.as_completed(tasks):
            response = await t
            url = response.url
            if "text/html" in response.headers["Content-Type"]:
                print("Sending the 2nd (GET) request to retrieve the body")
                r = await session.get(url)
                results.append((url, await r.read()))
            else:
                print(f"Not HTML, rejecting: {url}")
        return results

results = asyncio.run(get_urls_async(urls))
This is a protocol problem: if you do a GET, the server wants to send the body. If you don't retrieve the body, you have to discard the connection (which is in fact what aiohttp does if you don't call read() before __aexit__ on the response).
So the above code should do more or less what you want. NOTE: the server may already send more than just the headers in the first chunk.
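In other words, the second request isn't needed: you can issue a single GET per URL, inspect the headers, and only read() the body when the Content-Type is HTML. A minimal sketch of that single-request approach (sequential here for brevity, same placeholder URLs as above):

import asyncio
import aiohttp

urls = ['http://www.google.com', 'http://www.yahoo.com']

async def get_html_only(urls):
    results = []
    async with aiohttp.ClientSession() as session:
        for u in urls:
            async with session.get(u) as response:
                # Headers are available as soon as the response starts;
                # the body is only consumed when we read() it.
                if "text/html" in response.headers.get("Content-Type", ""):
                    results.append((response.url, await response.read()))
                else:
                    # Skipping read() means the connection is discarded when
                    # the context manager exits, as described above.
                    print(f"Not HTML, rejecting: {u}")
    return results

results = asyncio.run(get_html_only(urls))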
