I want to use proxy lists but I'm not sure how to do this:
Check whether a connection is established (it doesn't matter whether it is the first proxy IP in the list or not), run something, then close the program and exit; otherwise move on to the next proxy.
This is my code that loops over all proxies:
import requests
from Proxy_List_Scrapper import Scrapper, Proxy, ScrapperException

ALL = 'ALL'
scrapper = Scrapper(category=ALL, print_err_trace=False)

# Get ALL Proxies According to your Choice
data = scrapper.getProxies()

n = 0
while True:
    n += 1
    try:
        for item in data.proxies:
            # Enter a proxy IP address and port.
            url = 'http://api.ipify.org'
            # Send a GET request to the url and pass the proxy as a parameter.
            page = requests.get(url, proxies={"http": f'{item.ip}:{item.port}', "https": f'{item.ip}:{item.port}'})
            # Prints the content of the requested url.
            print(f'proxy is {item.ip}:{item.port} and content is {page.text}')
    except requests.exceptions.ProxyError:
        print(f'error #{n}')
Maybe you should run the try/except inside the for loop, to catch a problem with one proxy server and continue with the next proxy on the list. You could then use break to finish the loop when you find a working proxy.
Something like this:
import requests
from Proxy_List_Scrapper import Scrapper, Proxy, ScrapperException

scrapper = Scrapper(category='ALL', print_err_trace=False)
data = scrapper.getProxies()

url = 'http://api.ipify.org'

# your real IP, fetched without a proxy, so you can tell when a proxy actually worked
my_original_ip = requests.get(url).text

for n, item in enumerate(data.proxies):
    try:
        page = requests.get(url, proxies={"http": f'{item.ip}:{item.port}', "https": f'{item.ip}:{item.port}'})
        print(f'proxy is {item.ip}:{item.port} and content is {page.text}')
        # exit `for` loop when the connection finished without error and it gives a different IP
        if page.text != my_original_ip:
            break
    except requests.exceptions.ProxyError:
        print(f'error #{n} {item.ip}:{item.port}')
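Once the loop breaks, item still refers to the proxy that worked, so you can reuse it for the rest of the program. A minimal sketch, assuming the loop above did find a working proxy:

# reuse the proxy the loop above settled on
working_proxies = {"http": f'{item.ip}:{item.port}', "https": f'{item.ip}:{item.port}'}
page = requests.get('http://api.ipify.org', proxies=working_proxies, timeout=10)
print(f'current IP through the working proxy: {page.text}')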
try:
    with sync_playwright() as p:
        driver = p.firefox.launch(headless=headless, proxy={
            "server": 'fa****y.com:10000',
            'username': 'iu***53cpeuytpna0cc7bd7',
            "password": '**W7**Fm5',
        })
        context = driver.new_context()
        page = context.new_page()
        # block images, stylesheets and svg to save proxy bandwidth
        page.route("**/*", lambda route: route.abort()
                   if route.request.resource_type == "image"
                   or route.request.resource_type == "stylesheet"
                   or route.request.resource_type == "svg"
                   else route.continue_())
        #page.set_default_timeout(100000)
        try:
            url = 'https://accounts.google.com/v3/signin/identifier?dsh=S-536847501%3A1663770960047319&continue=https%3A%2F%2Fplay.google.com%2Fconsole%2Fsignup&followup=https%3A%2F%2Fplay.google.com%2Fconsole%2Fsignup&passive=1209600&service=androiddeveloper&flowName=GlifWebSignIn&flowEntry=ServiceLogin&ifkv=AQDHYWoj7hmeMm5YT3PrA0sojYcd3nnuAx2JkCLnedM0A9sCEUG9nrlRYD-grtVE1CcBagVSvXOG'
            page.goto(url)
        except Exception:
            print(" [+] Time out Error ")

        print(Fore.LIGHTBLUE_EX + " [+] Start >>> " + self.gmail)
        page.fill('id=identifierId', self.gmail)
        btn = '#identifierNext > div > button'
        page.click(btn)
        page.wait_for_timeout(3000)

        inbpass = '#password > div.aCsJod.oJeWuf > div > div.Xb9hP > input'
        page.fill(inbpass, self.password)
        btnpass = '#passwordNext > div > button'
        page.click(btnpass)
        time.sleep(3)
        page.wait_for_timeout(3000)

        try:
            page.locator("text=I understand").click(timeout=10000)
            page.wait_for_timeout(1500)
            sleep(1)
        except:
            pass

        try:
            page.locator("div[role=\"link\"]:has-text(\"Confirm your recovery email\")").click()
            page.wait_for_timeout(3000)
            page.locator("[aria-label=\"Enter recovery email address\"]").fill(self.recvery)
            page.wait_for_timeout(3000)
            time.sleep(3)
            # Click button:has-text("Next")
            page.locator("button:has-text(\"Next\")").click()
            time.sleep(10)
        except:
            pass

        page.locator("text=YourselfChoose if your account is for personal >> button").click()
        time.sleep(3)
        page.wait_for_timeout(1500)
AT THIS POINT: from this line to the end I want to stop using the proxy, because it isn't unlimited, I am paying per GB, and I want to use only my local internet connection for the rest of the script. Is there any help?
I want to stop proxy usage once the script reaches that last click, for two reasons:
One: proxy usage is high and I don't need the proxy anymore for the next steps.
Two: I want to make it faster, because the proxy is slow by the time it reaches that page.
Please help me with code or anything; I don't know how to solve this.
There is no direct way to achieve this, but there is a workaround that gives the same result using routing. Basically, once you no longer require the proxy, you route all requests Playwright makes to a handler, which bypasses the proxy, performs the request itself, and hands the response back to Playwright. However, this approach only works with the async version of Playwright (otherwise executing a blocking call from a route handler would block further network traffic too). Consider the code below:
import asyncio

import aiohttp
from playwright.async_api import Route


def no_proxy_route(session):
    """
    Bypasses any proxy and routes the request directly. It's a factory function that stores the value
    of session so we don't need to create a new one every time.
    """
    async def route_handler(route: Route):
        request = route.request
        try:
            body, status, headers = await fetch(session, request)
        # Abort the route in case of an error
        except (aiohttp.ClientConnectionError, asyncio.TimeoutError):
            await route.abort()
            return
        # If no error, fulfill the route with the given body and headers
        await route.fulfill(status=status, headers=headers, body=body)
    return route_handler


async def fetch(session, request, timeout=5):
    """
    Fetch the given request using aiohttp (note: aiohttp timeouts are in seconds, not milliseconds)
    """
    assert timeout > 0, "timeout must be positive"
    async with session.request(request.method, request.url, headers=request.headers,
                               data=request.post_data, timeout=timeout) as response:
        body = await response.read()
        status = response.status
        headers = response.headers
    return body, status, headers
Now if you want to conditionally disable proxies for some requests, you simply create a matching pattern and route all those requests through no_proxy_route. An example is given below:
### All your previous code
# .
# .
# .
# .
### which required a proxy
# Create an aiohttp session with a dummy cookie jar. This is because we will be passing the cookies
# explicitly and don't want previous requests/responses cookies to persist and interfere
session = aiohttp.ClientSession(cookie_jar=aiohttp.DummyCookieJar())
# Create a route to our handler with an appropriate pattern. The pattern below will route ALL subsequent
# requests to the handler. Remember that you can use regex for patterns as well.
await page.route('**/*', no_proxy_route(session))
After this is done, all requests that match the pattern, with which you created the route, will not use a proxy, even if the context has one set. But again, this is only useful if you are using the async API for playwright.
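When you are completely done with the direct connection (e.g. at the end of the script), you can remove the route and close the aiohttp session again. A minimal cleanup sketch, assuming the same page and session objects as above:

# stop routing requests through the handler and release the aiohttp session
await page.unroute('**/*')
await session.close()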
[Screenshots: examples of the 522, 525, and 504 errors seen when visiting the webpage manually]
I am running the following for loop, which goes through a dictionary of subreddits (keys) and URLs (values). The URLs return a dictionary with all posts from 2022 for a given subreddit. Sometimes the for loop stops and produces an 'HTTP error 525' or other errors.
I'm wondering how I can check for these errors when reading the URL and retry until the error no longer occurs before moving on to the next subreddit.
for subredd, url in dict_last_subreddit_posts.items():
    print(subredd)
    page = urllib.request.urlopen(url).read()
    dict_last_posts[subredd] = page
I haven't been able to figure it out.
You can put this code in a try/except block like this:
for subredd, url in dict_last_subreddit_posts.items():
    print(subredd)
    while True:
        try:
            page = urllib.request.urlopen(url).read()
            dict_last_posts[subredd] = page
            break  # exit the while loop if the request succeeded
        except urllib.error.HTTPError as e:
            if e.code == 525 or e.code == 522 or e.code == 504:
                print("Encountered HTTP error while reading URL. Retrying...")
            else:
                raise  # re-raise the exception if it's a different error
This code will catch any HTTPError that occurs while reading the URL and check whether the error code is 525, 522, or 504. If it is, it will print a message and try reading the URL again. If it's a different error, it will re-raise the exception so that you can handle it appropriately.
NOTE: This code will retry reading the URL indefinitely until it succeeds or a different error occurs. You may want to add a counter or a timeout to prevent the loop from going on forever in case the error persists.
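For example, a minimal sketch of such a retry counter, assuming the same imports and dictionaries as above (the limit of 5 attempts is an arbitrary choice):

MAX_RETRIES = 5  # arbitrary limit; tune to taste

for subredd, url in dict_last_subreddit_posts.items():
    print(subredd)
    for attempt in range(MAX_RETRIES):
        try:
            dict_last_posts[subredd] = urllib.request.urlopen(url).read()
            break  # success, stop retrying
        except urllib.error.HTTPError as e:
            if e.code in (504, 522, 525):
                print(f"HTTP error {e.code} on attempt {attempt + 1}, retrying...")
            else:
                raise
    else:
        print(f"Giving up on {subredd} after {MAX_RETRIES} failed attempts")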
It's unwise to indefinitely retry a request. Set a limit even if it's very high, but don't set it so high that it causes you to be rate limited (HTTP status 429). The backoff_factor will also have an impact on rate limiting.
Use the requests package for this. This makes it very easy to set a custom adapter for all of your requests via Session, and it includes Retry from urllib3 which takes care of retry behavior in an object you can pass to your adapter.
import requests
from requests.adapters import HTTPAdapter, Retry

s = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=0.1,
    status_forcelist=[504, 522, 525]
)
s.mount('https://', HTTPAdapter(max_retries=retries))

for subredd, url in dict_last_subreddit_posts.items():
    response = s.get(url)
    dict_last_posts[subredd] = response.content
You can play around with total (maximum number of retries) and backoff_factor (adjusts wait time between retries) to get the behavior you want.
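Note that s.mount('https://', ...) only applies the retry behaviour to URLs starting with that prefix; if some of your URLs use plain http, mount the adapter for that prefix as well:

s.mount('http://', HTTPAdapter(max_retries=retries))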
Try something like this:
for subredd, url in dict_last_subreddit_posts.items():
    print(subredd)
    http_response = urllib.request.urlopen(url)
    while http_response.status != 200:
        if http_response.status == 503:
            http_response = urllib.request.urlopen(url)
        elif http_response.status == 523:
            pass  # enter code here
        else:
            pass  # enter code here
    dict_last_posts[subredd] = http_response.read()
But Michael Ruth's answer is better.
I am writing a small Python app which uses requests to get and post data to an HTML page.
Now the problem I am having is that if I can't reach the HTML page, the code stops with a 'max retries exceeded' error. I want to be able to do something when I can't reach the server.
Is such a thing possible?
Here is sample code:
import requests

url = "http://127.0.0.1/"
req = requests.get(url)

if req.status_code == 304:
    pass  # do something
elif req.status_code == 404:
    pass  # do something else
# etc etc

# code here if the server can't be reached for whatever reason
You want to handle the exception requests.exceptions.ConnectionError, like so:
try:
    req = requests.get(url)
except requests.exceptions.ConnectionError as e:
    pass  # Do stuff here
You may want to set a suitable timeout when catching ConnectionError:
url = "http://www.stackoverflow.com"
try:
req = requests.get(url, timeout=2) #2 seconds timeout
except requests.exceptions.ConnectionError as e:
# Couldn't connect
See this answer if you want to change the number of retries.
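Putting the pieces together, here is a minimal sketch of retrying a fixed number of times before giving up; the 3-attempt limit and 2-second timeout are arbitrary choices:

import requests

url = "http://127.0.0.1/"

for attempt in range(3):
    try:
        req = requests.get(url, timeout=2)
        break  # reached the server, stop retrying
    except requests.exceptions.ConnectionError:
        print(f"Attempt {attempt + 1} failed, server unreachable")
else:
    print("Giving up: server could not be reached")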
I have this program that checks a website, and I want to know how I can check it via a proxy in Python...
This is the code, just as an example:
while True:
    try:
        h = urllib.urlopen(website)
        break
    except:
        print '['+time.strftime('%Y/%m/%d %H:%M:%S')+'] '+'ERROR. Trying again in a few seconds...'
        time.sleep(5)
By default, urlopen uses the environment variable http_proxy to determine which HTTP proxy to use:
$ export http_proxy='http://myproxy.example.com:1234'
$ python myscript.py # Using http://myproxy.example.com:1234 as a proxy
If you instead want to specify a proxy inside your application, you can give a proxies argument to urlopen:
proxies = {'http': 'http://myproxy.example.com:1234'}
print("Using HTTP proxy %s" % proxies['http'])
urllib.urlopen("http://www.google.com", proxies=proxies)
Edit: If I understand your comments correctly, you want to try several proxies and print each proxy as you try it. How about something like this?
candidate_proxies = ['http://proxy1.example.com:1234',
                     'http://proxy2.example.com:1234',
                     'http://proxy3.example.com:1234']
for proxy in candidate_proxies:
    print("Trying HTTP proxy %s" % proxy)
    try:
        result = urllib.urlopen("http://www.google.com", proxies={'http': proxy})
        print("Got URL using proxy %s" % proxy)
        break
    except:
        print("Trying next proxy in 5 seconds")
        time.sleep(5)
Python 3 is slightly different here. It will try to auto detect proxy settings but if you need specific or manual proxy settings, think about this kind of code:
#!/usr/bin/env python3
import urllib.request

proxy_support = urllib.request.ProxyHandler({'http' : 'http://user:pass@server:port',
                                             'https': 'https://...'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

with urllib.request.urlopen(url) as response:
    # ... implement things such as 'html = response.read()'
    pass
Refer also to the relevant section in the Python 3 docs
Here is example code showing how to use urllib to connect via a proxy:
authinfo = urllib.request.HTTPBasicAuthHandler()
proxy_support = urllib.request.ProxyHandler({"http": "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)
# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.google.com/')
"""
For http and https use:
proxies = {'http':'http://proxy-source-ip:proxy-port',
'https':'https://proxy-source-ip:proxy-port'}
more proxies can be added similarly
proxies = {'http':'http://proxy1-source-ip:proxy-port',
'http':'http://proxy2-source-ip:proxy-port'
...
}
usage
filehandle = urllib.urlopen( external_url , proxies=proxies)
Don't use any proxies (in case of links within network)
filehandle = urllib.urlopen(external_url, proxies={})
Use proxies authentication via username and password
proxies = {'http':'http://username:password#proxy-source-ip:proxy-port',
'https':'https://username:password#proxy-source-ip:proxy-port'}
Note: avoid using special characters such as :,# in username and passwords
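If your credentials do contain such characters, one option is to percent-encode them first. A minimal sketch using the standard library (Python 3 shown; the username and password values are hypothetical):

from urllib.parse import quote  # in Python 2 use urllib.quote

username = quote('user@example.com', safe='')  # hypothetical username containing '@'
password = quote('p:ss@word', safe='')         # hypothetical password containing ':' and '@'
proxies = {'http': 'http://%s:%s@proxy-source-ip:proxy-port' % (username, password)}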
I'm playing around trying to write a client for a site which provides data as an HTTP stream (aka HTTP server push). However, urllib2.urlopen() grabs the stream in its current state and then closes the connection. I tried skipping urllib2 and using httplib directly, but this seems to have the same behaviour.
The request is a POST request with a set of five parameters. There are no cookies or authentication required, however.
Is there a way to get the stream to stay open, so it can be checked on each program loop for new content, rather than waiting for the whole thing to be redownloaded every few seconds, introducing lag?
You could try the requests lib.
import requests

r = requests.get('http://httpbin.org/stream/20', stream=True)

for line in r.iter_lines():
    # filter out keep-alive new lines
    if line:
        print line
You also could add parameters:
import requests

settings = {'interval': '1000', 'count': '50'}
url = 'http://agent.mtconnect.org/sample'

r = requests.get(url, params=settings, stream=True)

for line in r.iter_lines():
    if line:
        print line
Do you need to actually parse the response headers, or are you mainly interested in the content? And is your HTTP request complex, making you set cookies and other headers, or will a very simple request suffice?
If you only care about the body of the HTTP response and don't have a very fancy request, you should consider simply using a socket connection:
import socket

SERVER_ADDR = ("example.com", 80)

sock = socket.create_connection(SERVER_ADDR)
f = sock.makefile("r+", bufsize=0)

f.write("GET / HTTP/1.0\r\n"
        + "Host: example.com\r\n"   # you can put other headers here too
        + "\r\n")

# skip headers
while f.readline() != "\r\n":
    pass

# keep reading forever
while True:
    line = f.readline()  # blocks until more data is available
    if not line:
        break  # we ran out of data!
    print line

sock.close()
One way to do it using urllib2 is (assuming this site also requires Basic Auth):
import urllib2

p_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
url = 'http://streamingsite.com'
p_mgr.add_password(None, url, 'login', 'password')

auth = urllib2.HTTPBasicAuthHandler(p_mgr)
opener = urllib2.build_opener(auth)
urllib2.install_opener(opener)
f = opener.open('http://streamingsite.com')

while True:
    data = f.readline()
    if not data:
        break  # stream closed, no more data
    print data