Python Requests - Connection Refused

I'm trying to get the status_code for various URLs listed in a CSV file using the requests Python module.
It works for some websites, but for most of them it shows 'Connection Refused', even though the same sites load just fine when I visit them in a browser.
The code looks like this:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from fake_useragent import UserAgent
import time
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

df = pd.read_csv('Websites.csv')
output_data = pd.DataFrame(columns=['url', 'status'])
number_urls = df.shape[0]
i = 0

for url in df['urls']:
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=3)
    adapter.max_retries.respect_retry_after_header = False
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    print(url)
    ua = UserAgent()
    header = {'User-Agent': str(ua.chrome)}
    try:
        # Status
        start = time.time()
        response = session.get(url, headers=header, verify=False, timeout=0.5)
        request_time = time.time() - start
        info = "Request completed in {0:.0f}ms".format(request_time * 1000)  # time.time() is in seconds
        print(info)
        status = response.status_code
        if (status == 200):
            status = "Connection Successful"
        if (status == 404):
            status = "404 Error"
        if (status == 403):
            status = "403 Error"
        if (status == 503):
            status = "503 Error"
        print(status)
        output_data.loc[i] = [df.iloc[i, 0], status]
        i += 1
    except requests.exceptions.Timeout:
        status = "Connection Timed Out"
        print(status)
        request_time = time.time() - start
        info = "TimeOut in {0:.0f}ms".format(request_time * 1000)
        print(info)
        output_data.loc[i] = [df.iloc[i, 0], status]
        i += 1
    except requests.exceptions.ConnectionError:
        status = "Connection Refused"
        print(status)
        request_time = time.time() - start
        info = "Connection Error in {0:.0f}ms".format(request_time * 1000)
        print(info)
        output_data.loc[i] = [df.iloc[i, 0], status]
        i += 1

output_data.to_csv('dead_blocked2.csv', index=False)
print('CSV file created!')
Here's an example of one website that shows Connection Refused, even though it loads fine in a browser: https://www.dytt8.net
I've tried forcing different TLS versions with the following adapter and updating my session, but it still doesn't work:
import ssl
from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager

class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_TLSv1)
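I mount it on the session before making the request, roughly like this (a sketch; MyAdapter is the class above, and the URL is just the example site from earlier):

session = requests.Session()
# mount the adapter on both schemes so every request in this session uses it
session.mount('https://', MyAdapter())
session.mount('http://', MyAdapter())
r = session.get('https://www.dytt8.net', verify=False, timeout=5)
print(r.status_code)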
Can anyone help?
Thanks!
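One thing worth checking (not a guaranteed fix): some sites refuse clients that only send a User-Agent, so a fuller, browser-like header set can change the result. A rough sketch against the example site, where the header values are just typical browser defaults, not anything specific to these sites:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
}

r = requests.get('https://www.dytt8.net', headers=headers, verify=False, timeout=10)
print(r.status_code)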

Related

How do I use a different session and proxy after every 5 requests in Python?

Below is my code, but currently it sometimes takes a different proxy on every new request, and for some reason the requests are taking much more time than usual to return a response.
I also have no way of knowing whether the session stays the same for all five requests or changes on every new request.
I'm rotating the proxy using next(proxy_cycle), and I'm hitting the endpoint from Postman like this: http://127.0.0.1:8000/test/http://ipinfo.io/json
I don't know what exactly I'm doing wrong.
from fastapi import FastAPI
import requests
from bs4 import BeautifulSoup
import lxml
import asyncio
import uvicorn
from itertools import cycle

app = FastAPI()

counter_lock = asyncio.Lock()
counter = 0


def make_5_req_per_session(url, proxy, session):
    try:
        response = session.get(url, proxies=proxy, timeout=10)
        if response.status_code == 200:
            return response.json()
        else:
            return None
    except Exception as e:
        # print(e)
        print("dead proxy")
        return None


def load_proxies():
    # read proxies from a text file
    with open("proxies.txt", "r") as proxies_file:
        list_of_proxies = []
        for proxy in proxies_file:
            list_of_proxies.append(proxy.strip())
        return list_of_proxies


list_of_proxies = load_proxies()
proxy_cycle = cycle(list_of_proxies)

# first time proxy and session
proxy = {"http": next(proxy_cycle)}
session = requests.Session()


@app.get("/test/{url:path}")
async def hello_world(
    url,
    proxy=proxy,
    session=session,
):
    global counter
    async with counter_lock:
        counter += 1
        if counter == 5:
            counter = 0
            # make new proxy and session after 5 requests
            proxy = {"http": next(proxy_cycle)}
            session = requests.Session()

    data = make_5_req_per_session(url, proxy, session)

    # if the response is None, then try again with new proxy
    while data is None:
        proxy = {"http": next(proxy_cycle)}
        session = requests.Session()
        data = make_5_req_per_session(url, proxy, session)

    return {"url": f" {counter} {proxy} {data} "}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
These are my proxies in "proxies.txt":
111.21.183.58:9091
112.105.59.218:80
110.164.3.7:8888
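One way to check whether the rotation actually sticks is to keep the current proxy and session in module-level state rather than in function defaults, since assigning to a parameter inside the endpoint only rebinds the local name and never changes the default used by later calls. Below is a sketch (assuming the same proxies.txt format; the /rotate-test route is hypothetical) that also logs id() of the session, so you can see whether the same object is reused:

from itertools import cycle
import requests
from fastapi import FastAPI

app = FastAPI()

proxy_cycle = cycle(open("proxies.txt").read().split())

# shared state that survives between requests (module level, not function defaults)
state = {
    "proxy": {"http": next(proxy_cycle)},
    "session": requests.Session(),
    "counter": 0,
}


@app.get("/rotate-test/{url:path}")  # hypothetical route, separate from the original /test
async def rotate_test(url):
    # no lock here for brevity; add one if you expect concurrent requests
    state["counter"] += 1
    if state["counter"] % 5 == 0:
        # rotate proxy and session every 5th request
        state["proxy"] = {"http": next(proxy_cycle)}
        state["session"] = requests.Session()
    # id() shows whether the same Session object is being reused between calls
    print("session id:", id(state["session"]), "proxy:", state["proxy"])
    resp = state["session"].get(url, proxies=state["proxy"], timeout=10)
    return {"status": resp.status_code, "proxy": state["proxy"]}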

Redirect instead of HTTP403

I'm trying to make a proxy with a blocklist. I initially returned 403 when a user goes to a blocked page. However, that doesn't work for HTTPS and the browser shows ERR_TUNNEL_CONNECTION_FAILED, as explained in Respond with 403 in an HTTPS proxy.
Thus, I want to redirect the user to an HTML page instead.
This is my code:
import socket
import threading
import signal
import sys
import fnmatch
import errno
import time
import pdb
import re
from time import gmtime, strftime, localtime
import logging

import config
import rule
import tag as tag_store
from Ignite import flame

core = flame()
p = re.compile('(http:\/\/)?([\w\.-]*)(\:(\d*))?(\/.*)?')
thread_logger = logging.getLogger('thread')
access_logger = logging.getLogger('access')
csv_logger = logging.getLogger('csv')


def proxy(browser_conn, client_addr):
    print("hi")

    def ishostAllowed(host):
        print("\n\nHost:")
        print(str(host))
        access_logger.info(str(host))
        if host.split('.')[-1].isdigit():
            thread_logger.warn("Invalid host:".format(host), extra=req)
            return core.check_allow(host)
        #pdb.set_trace()
        tags = tag_store.get(host)
        if not tags:
            thread_logger.warn("{0} isn't allowed: empty tags".format(host), extra=req)
            return core.check_allow(host)
        for tag in tag_store.get(host):
            if not rule.isTagAllowed(tag):
                thread_logger.warn("{0}:{1} isn't allowed".format(host, tag), extra=req)
                return core.check_allow(host)
        return core.check(host)

    def proxy_http(request):
        try:
            # create a socket to connect to the web server
            #pdb.set_trace()
            server_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            server_conn.settimeout(config.connection_timeout)
            server_conn.connect((request['host'], request['port']))
            server_conn.sendall(request['raw_data'])  # send request to webserver
            while 1:
                data = server_conn.recv(config.max_request_len)  # receive data from web server
                if (len(data) > 0):
                    browser_conn.send(data)  # send to browser
                else:
                    break
        except socket.error as error_msg:
            thread_logger.error(str(error_msg)+":"+str(request), extra=req);
        finally:
            if server_conn:
                server_conn.close()
            if browser_conn:
                browser_conn.close()
            return

    def response(status, message):
        reply = "HTTP/1.1 {0} {1}\r\n"
        reply += "Proxy-agent: Sinktrap\r\n"
        reply += "\r\n"
        reply = reply.format(status, message);
        #pdb.set_trace()
        browser_conn.sendall( reply.encode() )

    def proxy_https(request):
        #pdb.set_trace()
        try:
            server_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            # If successful, send 200 code response
            server_conn.connect((req['host'], req['port']))
            response(200, 'Connection established')
        except socket.error as err:
            # If the connection could not be established, exit
            # Should properly handle the exit with http error code here
            thread_logger.error("Cannot establish https connection:"+err, extra=req);
            if server_conn:
                server_conn.close()
            if browser_conn:
                browser_conn.close()
            return

        # Indiscriminately forward bytes
        browser_conn.setblocking(0)
        server_conn.setblocking(0)
        timeout = time.time()+60  # 1 minute
        while timeout-time.time() > 0:
            request_done = False
            replied_done = False
            try:
                request = browser_conn.recv(config.max_request_len)  # receive data from browser
                if (len(request) > 0):
                    server_conn.sendall(request)  # send to web server
                else:
                    request_done = True
                    #hread_logger.info("REQUEST len: " + str(len(request)),extra=req);
            except socket.error as e:
                if e.errno == errno.EWOULDBLOCK:
                    time.sleep(0.1)
                    pass
                else:
                    thread_logger.error("pipe error:"+str(e), extra=req);
                    break
            try:
                reply = server_conn.recv(config.max_request_len)  # receive data from web server
                if (len(reply) > 0):
                    browser_conn.sendall(reply)  # send to browser
                else:
                    replied_done = True
                    #thread_logger.info("reply len: " + str(len(reply)),extra=req);
            except socket.error as e:
                if e.errno == errno.EWOULDBLOCK:
                    time.sleep(0.1)
                    pass
                else:
                    thread_logger.error("pipe error:"+str(e), extra=req);
                    break
            if request_done and replied_done:
                break
        server_conn.close()
        browser_conn.close()

    raw_data = browser_conn.recv(config.max_request_len)  # get the request from browser
    req = {'raw_data': raw_data,
           'tname': threading.currentThread().getName(),
           'client_ip': client_addr[0],
           'client_port': client_addr[1]
           }
    thread_logger.info("REQUEST: {0}".format(raw_data), extra=req);
    #pdb.set_trace()
    try:
        # request_line is the first one. https://www.w3.org/Protocols/rfc2616/rfc2616-sec5.html
        msg_body_pos = len(raw_data)
        for i in range(4, len(raw_data)):
            if raw_data[i-4:i].decode() == '\r\n\r\n':
                msg_body_pos = i
                break
        lines = raw_data[:msg_body_pos-4].decode('utf-8').split('\r\n')
        if len(lines[0]) < 16:
            thread_logger.warn("INVALU REQUEST:{0}".format(raw_data), extra=req);
            return
        headers = {k: v for k, v in (x.split(':', 1) for x in lines[1:])}
        if 'Referer' in headers:
            req['Referer'] = headers['Referer']
        else:
            req['Referer'] = ''
        if 'User-Agent' in headers:
            req['User-Agent'] = headers['User-Agent']
        else:
            req['User-Agent'] = ''
        req['request_line'] = lines[0]
        req['method'], req['request_uri'], req['http_version'] = lines[0].split(' ')
        # check if the first line is valid request. request_line might be empty
        if not req['method'] or not req['request_uri'] or not req['http_version']:
            thread_logger.warn("INVALU REQUEST:{0}".format(raw_data), extra=req);
            return
    except Exception as e:
        thread_logger.error("INVALU REQUEST:{0} {1}".format(e, raw_data), extra=req);
        logging.exception("INVALU REQUEST")
        return
    access_logger.info("", extra=req)
    #pdb.set_trace()

    m = p.match(req['request_uri'])
    req['host'] = m.group(2)
    req['port'] = int(m.group(4)) if m.group(4) else 80

    # Check if request is allowed or not
    if not ishostAllowed(req['host']):
        csv_logger.info("blocked", extra=req);
        thread_logger.warn("Block REQUEST:{0}".format(raw_data), extra=req);
        response(403, "The website has been blocked by Ignite's proxy.")
        #Breq = req
        #Breq['host'] = "azlancoding.github.io/Blocked"
        #proxy_https(Breq)
        #response(307,"https://azlancoding.github.io/BLOCKED")
        return
    csv_logger.info("allowed", extra=req);
    #pdb.set_trace()
    if req['method'] == 'CONNECT':
        proxy_https(req)
    else:
        proxy_http(req)
The original proxy is pcxy
See my github project here
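One thing worth noting about the commented-out response(307, ...) idea: the response() helper above only sends a status line, so the browser has no Location header to follow. For plain-HTTP requests a redirect reply needs that header. A minimal sketch of such a helper, written to sit next to response() inside proxy() so it can reuse browser_conn (for CONNECT tunnels the same limitation described in the question applies, since browsers treat any non-200 reply to CONNECT as a tunnel failure):

    def redirect(location):
        # like response(), but with a Location header so the browser can follow it
        reply = ("HTTP/1.1 307 Temporary Redirect\r\n"
                 "Proxy-agent: Sinktrap\r\n"
                 "Location: {0}\r\n"
                 "Content-Length: 0\r\n"
                 "Connection: close\r\n"
                 "\r\n").format(location)
        browser_conn.sendall(reply.encode())

    # in the blocklist branch, for non-CONNECT requests:
    #     redirect("https://azlancoding.github.io/BLOCKED")
    #     return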

Multiple requests over single connection with Python

When I download a file using single request I do the following:
session = requests.Session()
params = {'fd': 1, 'count': 1024, 'auth': 'auth_token'}
r = session.get('https://httpbin.org/bytes/9', params=params)
print(r.content)
# b'\xb3_\\l\xe2\xbf/:\x07'
How can I do multiple requests without waiting for an answer?
The server API docs say:
You can push multiple requests over single connection without waiting
for answer, to improve performance. The server will process the
requests in the order they are received and you are guaranteed to
receive answers in the same order. It is important however to send all
requests with "Connection: keep-alive", otherwise the API server will
close the connection without processing the pending requests.
They are talking about one thread and multiple requests without waiting for an answer. I suppose it is called HTTP pipelining.
How can I do this with Python Requests library?
A similar answer suggests using parallel calls, which is not what my question is about. It also says: "requests does pool connections, keeping the TCP connection open". How can I implement this?
Can I use any other synchronous library, if it's not possible for requests?
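For the connection-reuse part mentioned above ("requests does pool connections"), a single Session is enough; note that this is persistent-connection reuse, not pipelining, since each request still waits for its response before the next one is sent. A minimal sketch:

import requests

session = requests.Session()
params = {'fd': 1, 'count': 1024, 'auth': 'auth_token'}

# all three calls reuse the same pooled keep-alive TCP connection,
# but requests are still strictly sequential (no pipelining)
for _ in range(3):
    r = session.get('https://httpbin.org/bytes/9', params=params)
    print(r.status_code, r.content)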
You can get several pages in parallel, without threads. It exploits HTTP pipelining by resetting the state (a private variable!) of the HTTPSConnection to trick it into sending the next request ahead of time.
from http.client import HTTPSConnection, _CS_IDLE
from urllib.parse import urlparse, urlunparse


def pipeline(host, pages, max_out_bound=4, debuglevel=0):
    page_count = len(pages)
    conn = HTTPSConnection(host)
    conn.set_debuglevel(debuglevel)
    responses = [None] * page_count
    finished = [False] * page_count
    content = [None] * page_count
    headers = {'Host': host, 'Content-Length': 0, 'Connection': 'Keep-Alive'}

    while not all(finished):
        # Send
        out_bound = 0
        for i, page in enumerate(pages):
            if out_bound >= max_out_bound:
                break
            elif page and not finished[i] and responses[i] is None:
                if debuglevel > 0:
                    print('Sending request for %r...' % (page,))
                conn._HTTPConnection__state = _CS_IDLE  # private variable!
                conn.request("GET", page, None, headers)
                responses[i] = conn.response_class(conn.sock, method=conn._method)
                out_bound += 1
        # Try to read a response
        for i, resp in enumerate(responses):
            if resp is None:
                continue
            if debuglevel > 0:
                print('Retrieving %r...' % (pages[i],))
            out_bound -= 1
            skip_read = False
            resp.begin()
            if debuglevel > 0:
                print(' %d %s' % (resp.status, resp.reason))
            if 200 <= resp.status < 300:
                # Ok
                content[i] = resp.read()
                cookie = resp.getheader('Set-Cookie')
                if cookie is not None:
                    headers['Cookie'] = cookie
                skip_read = True
                finished[i] = True
                responses[i] = None
            elif 300 <= resp.status < 400:
                # Redirect
                loc = resp.getheader('Location')
                responses[i] = None
                parsed = loc and urlparse(loc)
                if not parsed:
                    # Missing or empty location header
                    content[i] = (resp.status, resp.reason)
                    finished[i] = True
                elif parsed.netloc != '' and parsed.netloc != host:
                    # Redirect to another host
                    content[i] = (resp.status, resp.reason, loc)
                    finished[i] = True
                else:
                    path = urlunparse(parsed._replace(scheme='', netloc='', fragment=''))
                    if debuglevel > 0:
                        print(' Updated %r to %r' % (pages[i], path))
                    pages[i] = path
            elif resp.status >= 400:
                # Failed
                content[i] = (resp.status, resp.reason)
                finished[i] = True
                responses[i] = None
            if resp.will_close:
                # Connection (will be) closed, need to resend
                conn.close()
                if debuglevel > 0:
                    print(' Connection closed')
                for j, f in enumerate(finished):
                    if not f and responses[j] is not None:
                        if debuglevel > 0:
                            print(' Discarding out-bound request for %r' % (pages[j],))
                        responses[j] = None
                break
            elif not skip_read:
                resp.read()  # read any data
            if any(not f and responses[j] is None for j, f in enumerate(finished)):
                # Send another pending request
                break
        else:
            break  # All responses are None?
    return content


if __name__ == '__main__':
    domain = 'en.wikipedia.org'
    pages = ['/wiki/HTTP_pipelining', '/wiki/HTTP', '/wiki/HTTP_persistent_connection']
    data = pipeline(domain, pages, max_out_bound=3, debuglevel=1)
    for i, page in enumerate(data):
        print()
        print('==== Page %r ====' % (pages[i],))
        print(page[:512])

Download and handle errors

I've been using a function that I took from the book Web Scraping with Python from O'Reilly by Ryan Mitchell:
import sys
import os.path
import socket
import random
import urllib2
import contextlib
import diskCache
import logging as logger
from bs4 import BeautifulSoup

DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60

socket.setdefaulttimeout(DEFAULT_TIMEOUT)


def download(url, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None,
             cache=None, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, data=None):
    result = None
    if cache:
        try:
            result = cache[url]
        except KeyError:
            # url is not available in cache
            pass
    if result is not None and result['code'] is not None \
            and num_retries > 0 and 500 <= result['code'] < 600:
        # server error so ignore result from cache and re-download
        result = None
    if result is None:
        proxy = random.choice(proxies) if proxies else None
        headers = {'User-agent': user_agent}
        result = call(url, headers, proxy=proxy, num_retries=num_retries, cache=cache)
        if cache:
            # save result to cache
            cache[url] = result
    return result['html']


def call(url, headers, proxy, num_retries, cache=None, data=None):
    request = urllib2.Request(url, data, headers or {})
    with contextlib.closing(urllib2.urlopen(request)) as connection:
        try:
            logger.info('Downloading: %s', url)
            html = connection.read()
            code = connection.getcode()
        except Exception as e:
            logger.exception('Download error:', str(e))
            if cache:
                del cache['url']
            html = None
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    return download(url, headers, num_retries-1, data)  # retry server errors
            else:
                code = None
    return {'html': html, 'code': code}
I wanted to know if there is a simpler way of handling errors when downloading URLs. I've seen that the requests library is higher level and easier to use, and maybe it could simplify this. At the very least, how would this code look in Python 3?
It would be something like
"""Functions used by the fetch module"""
# Standard library imports
import time
import socket
import logging as logger
from typing import Dict, Optional
# Third party imports
import requests
from requests.exceptions import HTTPError, Timeout
from bs4 import BeautifulSoup
# Constants
DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60
socket.setdefaulttimeout(DEFAULT_TIMEOUT)
def fetch(url: str, retries: Optional[int] = DEFAULT_RETRIES) -> Dict:
"""Download an url"""
code = None
try:
logger.info('Downloading: %s', url)
resp = requests.get(url)
resp.raise_for_status()
code = resp.status_code
except (HTTPError, Timeout) as ex:
logger.exception("Couldn't download %s", ex)
return None
if code is not None and retries > 0 and \
500 <= code < 600: # Server error
logger.info('Retrying download')
time.sleep(DEFAULT_DELAY)
return fetch(url, retries-1)
return {'html': resp, 'code': code}
As you said, this is a lot easier with requests:
resp = requests.get(url, headers=headers, timeout=timeout)
print(resp.status_code)
print(resp.text)
# for an API use resp.json()
There is no exception raised by default. You can call resp.raise_for_status() if you do want to raise an exception.
See http://docs.python-requests.org/en/master/user/quickstart/ for details
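If you also want the retry-on-5xx behaviour of the original download() without writing the loop yourself, one option (a sketch, not the only way) is to mount urllib3's Retry helper on a Session:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry up to 3 times on common server errors, with an increasing back-off delay
retry = Retry(total=3, status_forcelist=[500, 502, 503, 504], backoff_factor=1)
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

resp = session.get('https://httpbin.org/get', timeout=10)  # example URL
resp.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx
print(resp.status_code)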

Switching proxies in Python: Pseudo code

Let's say I have a website that I want to scrape, e.g. cheapoair.com.
I want to use a normal requests call in Python to scrape the data on the first, hypothetical page. If I end up being blocked by the server, I want to switch to a proxy. I have a list of proxy servers and a method to pick one, and I also have a list of user-agent strings. However, I think I need help thinking through the problem.
For reference
uagen() will return a user agent string
proxit() will return a proxy
Here is what I have so far:
import requests
from proxy_def import *
from http import cookiejar
import time
from socket import error as SocketError
import sys

start_time = time.time()


class BlockAll(cookiejar.CookiePolicy):
    return_ok = set_ok = domain_return_ok = path_return_ok = lambda self, *args, **kwargs: False
    netscape = True
    rfc2965 = hide_cookie2 = False


headers = {'User-Agent': uagen()}
print(headers)

s = requests.Session()
s.cookies.set_policy(BlockAll)
cookies = {'SetCurrency': 'USD'}

sp = proxit()

for i in range(100000000000):
    while True:
        try:
            print('trying on ', sp)
            print('with user agent headers', headers)
            s.proxies = {"http": sp}
            r = s.get("http://www.cheapoair.com", headers=headers, timeout=15, cookies=cookies)
            print(i, sp, 'success')
            print("--- %s seconds ---" % (time.time() - start_time))
        except SocketError as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except requests.ConnectionError as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except requests.Timeout as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except KeyboardInterrupt:
            print("The program has been terminated")
            sys.exit(1)
        break
    #print(r.text)
    print('all done',
          '\n')
What I am looking for is an idea of how to say: start with a normal request (not through a proxy), and if you end up with an error (such as being rejected by the server), switch to a proxy and try again.
I can almost picture it, but can't quite see it.
I'm thinking that if I place a variable after
for i in range(1000000000000):
but before while True: that updates sp, then it might work. Another possibility is to declare s.proxies = {"http": ""} and then, if I run into an error, switch to s.proxies = {"http": proxit()} or s.proxies = {"http": sp}.
Thanks!
I figured it out.
while True:
    try:
        # do this thing
        # but remove the variable from here and declare it before "while True"
    except SocketError as e:
        # switch headers, switch user agent string
        s.proxies = {"http": proxit()}
        continue
That will refresh the variable after it gets an error from the server
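Fleshing that out, a sketch of the full "start direct, fall back to a proxy on error" flow could look like this (uagen() and proxit() are the helper functions described above; the set of caught exceptions is just an illustration):

import requests
from socket import error as SocketError

s = requests.Session()
s.proxies = {}                       # start with a direct connection (no proxy)
headers = {'User-Agent': uagen()}

while True:
    try:
        r = s.get("http://www.cheapoair.com", headers=headers, timeout=15)
        break                        # success: stop retrying
    except (SocketError, requests.ConnectionError, requests.Timeout):
        # blocked or unreachable: switch to a proxy and a new user agent, then retry
        s.proxies = {"http": proxit()}
        headers = {'User-Agent': uagen()}
        continue

print(r.status_code)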
