Python Scraper - Socket Error breaks script if target is 404'd - python

Encountered an error while building a web scraper to compile data and output into XLS format; when testing again a list of domains in which I wish to scrape from, the program faulters when it recieves a socket error. Hoping to find an 'if' statement that would null parsing a broken website and continue through my while-loop. Any ideas?
workingList = xlrd.open_workbook(listSelection)
workingSheet = workingList.sheet_by_index(0)
destinationList = xlwt.Workbook()
destinationSheet = destinationList.add_sheet('Gathered')
startX = 1
startY = 0
while startX != 21:
workingCell = workingSheet.cell(startX,startY).value
print ''
print ''
print ''
print workingCell
#Setup
preSite = 'http://www.'+workingCell
theSite = urlopen(preSite).read()
currentSite = BeautifulSoup(theSite)
destinationSheet.write(startX,0,workingCell)
And here's the error:
Traceback (most recent call last):
File "<pyshell#2>", line 1, in <module>
homeMenu()
File "C:\Python27\farming.py", line 31, in homeMenu
openList()
File "C:\Python27\farming.py", line 79, in openList
openList()
File "C:\Python27\farming.py", line 83, in openList
openList()
File "C:\Python27\farming.py", line 86, in openList
homeMenu()
File "C:\Python27\farming.py", line 34, in homeMenu
startScrape()
File "C:\Python27\farming.py", line 112, in startScrape
theSite = urlopen(preSite).read()
File "C:\Python27\lib\urllib.py", line 84, in urlopen
return opener.open(url)
File "C:\Python27\lib\urllib.py", line 205, in open
return getattr(self, name)(url)
File "C:\Python27\lib\urllib.py", line 342, in open_http
h.endheaders(data)
File "C:\Python27\lib\httplib.py", line 951, in endheaders
self._send_output(message_body)
File "C:\Python27\lib\httplib.py", line 811, in _send_output
self.send(msg)
File "C:\Python27\lib\httplib.py", line 773, in send
self.connect()
File "C:\Python27\lib\httplib.py", line 754, in connect
self.timeout, self.source_address)
File "C:\Python27\lib\socket.py", line 553, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno 11004] getaddrinfo failed

Ummm that looks like the error I get when my internet connection is down. HTTP 404 errors are what you get when you do have a connection but the URL that you specify can't be found.
There's no if statement to handle exceptions; you need to "catch" them using the try/except construct.
Update: Here's a demonstration:
import urllib
def getconn(url):
try:
conn = urllib.urlopen(url)
return conn, None
except IOError as e:
return None, e
urls = """
qwerty
http://www.foo.bar.net
http://www.google.com
http://www.google.com/nonesuch
"""
for url in urls.split():
print
print url
conn, exc = getconn(url)
if conn:
print "connected; HTTP response is", conn.getcode()
else:
print "failed"
print exc.__class__.__name__
print str(exc)
print exc.args
Output:
qwerty
failed
IOError
[Errno 2] The system cannot find the file specified: 'qwerty'
(2, 'The system cannot find the file specified')
http://www.foo.bar.net
failed
IOError
[Errno socket error] [Errno 11004] getaddrinfo failed
('socket error', gaierror(11004, 'getaddrinfo failed'))
http://www.google.com
connected; HTTP response is 200
http://www.google.com/nonesuch
connected; HTTP response is 404
Note that so far we have just opened the connection. Now what you need to do is check the HTTP response code and decide whether there is anything worth retrieving using conn.read()

Related

Failed to establish a new connection: [Errno 11001] getaddrinfo failed' in Python 3.9

This script is used to send data to a 3rd party. It seems to run fine when connected to any network other than my work network. When connected to my work network, it will run for a few minutes, PUT/POSTing as it should, and then suddenly the following error will appear:
File "C:\Users\user\AppData\Local\RPackages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connection.py", line 169, in _new_conn
conn = connection.create_connection(
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\util\connection.py", line 73, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.2032.0_x64__qbz5n2kfra8p0\lib\socket.py", line 954, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connectionpool.py", line 382, in _make_request
self._validate_conn(conn)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connectionpool.py", line 1010, in _validate_conn
conn.connect()
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connection.py", line 353, in connect
conn = self._new_conn()
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connection.py", line 181, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x0000022FD9B44FA0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\connectionpool.py", line 755, in urlopen
retries = retries.increment(
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\urllib3\util\retry.py", line 574, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='vaccine-verify-v1.services.school.edu', port=443): Max retries exceeded with url: /api/verify (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000022FD9B44FA0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\scripture\bali_vax.py", line 71, in <module>
post_vax_record(row)
File "C:\scripture\bali_vax.py", line 38, in post_vax_record
response = requests.post(url,json=vaxrecord, headers=headers, timeout=5)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\api.py", line 117, in post
return request('post', url, data=data, json=json, **kwargs)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='vaccine-verify-v1.services.school.edu', port=443): Max retries exceeded with url: /api/verify (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000022FD9B44FA0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Script
import timeit
start = timeit.default_timer()
import requests
url = 'https://vaccine-verify-v1.services.school.edu/api/verify'
client_id = "abc"
client_secret = "123"
def post_vax_record(record):
manuallyVerified = record[7].lower() == "true";
vaxrecord = { "firstName":record[0], "lastName": record[1], "dob": record[2], "phoneNumber": (record[3] or '4444444444'), "state": record[4], "zip": record[5][:5], "campusName": "Baltimore", "instituitionId": 0, "campusSuCd": 28, "sunyLanId": record[6], "sunyStudId": 0, "ssoId": "BEL-"+record[6], "manuallyVerified": manuallyVerified }
headers = {'Client_Id':client_id, 'Client_Secret':client_secret,'Content-Type':'application/json'}
response = requests.post(url,json=vaxrecord, headers=headers, timeout=5)
if response:
json_response = response.json()
if ('message' in json_response and json_response['message'] == 'The Student record already exists. Please use put operation to insert the record.'): #changed from update to insert
print ('Post failed. Trying put instead')
response = requests.put(url,json=vaxrecord, headers=headers)
if response:
print('Success on put for '+record[6])
else:
print('Put failed for '+record[6])
else:
print('Success on post for '+record[6])
print(response.json())
else:
print('Post failed for '+record[6]+' with error code: ')
print(response.status_code)
import csv
with open('data_file.csv') as csvDataFile:
csvReader = csv.reader(csvDataFile)
for row in csvReader:
if row[6].upper().split('#')[1] in ("LIVE.LODI.EDU", "LODI.EDU"):
if row[0].upper() != "FIRSTNAME":
row[6] = row[6].upper().split('#')[0]
post_vax_record(row)
else:
print('NON-LODI EMAIL FOUND FOR '+row[1]+', '+row[0]+'. WILL NOT PROCESS RECORD!')
stop = timeit.default_timer()
print('Run Time: ', stop - start)
While testing this script, and using smaller CSVs (with less records to read in), I never had an issue. Just recently has this been occurring. Any help is greatly appreciated.
Thank you.
Please wrap response = requests.post(url,json=vaxrecord, headers=headers, timeout=5) line with try-catch block
while retry_counter < 5:
try:
response = requests.post(url,json=vaxrecord, headers=headers, timeout=5)
except socket.error as error:
print("Connection Failed due to socket - {}").format(error)
print("Attempting {} of 5").format(retry_counter)
time.sleep(3)
retry_counter += 1
NOTE: Generally retrying code after some time in socket error case will resolve the issue(happened in my case) hence the below solution
while doing request.post, if socket.error occurs then retry the code 3-5 time with a delay of your 1-3 seconds.

How to check if there is internet connection

I Have to check if my computer is connected to internet (so if i open the broswer i can or i cannot visit an url) , for do this task i tried with some codes:
The first is
import socket
def internet_on():
try:
print("checking internet connection..")
socket.setdefaulttimeout(5)
host = socket.gethostbyname("www.google.com")
s = socket.create_connection((host, 80), 2)
s.close()
print('internet on.')
return True
except Exception as e:
print(e)
print("internet off.")
return False
internet_on()
code that i took from this answer Checking internet connection with Python
After i tried with this:
from urllib.request import urlopen
def internet_on():
try:
urlopen("https://www.instagram.com/", timeout=5)
return True
except Exception as err:
print(str(err))
return False
internet_on()
code that i took from this answer Checking network connection
And this
import socket
REMOTE_SERVER = "www.google.com"
def internet_on(hostname):
try:
# see if we can resolve the host name -- tells us if there is
# a DNS listening
host = socket.gethostbyname(hostname)
# connect to the host -- tells us if the host is actually
# reachable
s = socket.create_connection((host, 80), 2)
return True
except Exception:
return False
internet_on(REMOTE_SERVER)
code that i took from this answer Test if an internet connection is present in python
If the connection is active the codes work fines
but when there is no connection all this codes raise the same errors:
Traceback (most recent call last):
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\util\connection.py", line 57, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Program Files\Python37-32\lib\socket.py", line 748, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
chunked=chunked)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connectionpool.py", line 343, in _make_request
self._validate_conn(conn)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connectionpool.py", line 839, in _validate_conn
conn.connect()
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connection.py", line 301, in connect
conn = self._new_conn()
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connection.py", line 168, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x03B39E90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\urllib3\util\retry.py", line 398, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.instagram.com', port=443): Max retries exceeded with url: /_exploreurself (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x03B39E90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/mcara/OneDrive/Desktop/file InstaBot da editare/v0.4/bot.py", line 835, in <module>
bot.unfollow_process(RANDOM, sense=UP_DOWN, following_target=0, sleep_time=60)
File "C:/Users/mcara/OneDrive/Desktop/file InstaBot da editare/v0.4/bot.py", line 651, in unfollow_process
current_following = self.user_following_num(self._username)
File "C:/Users/mcara/OneDrive/Desktop/file InstaBot da editare/v0.4/bot.py", line 387, in user_following_num
r = requests.get(url).text
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "C:\Users\mcara\PycharmProjects\1\venv\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='www.instagram.com', port=443): Max retries exceeded with url: /_exploreurself (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x03B39E90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Try this one here IP address 216.58.192.142 is one of Google's IP address. Due to static IP address, this code is not robust and may not work always so, replace this IP address with some other website which you believe respond more faster.
The reason why the code uses a fixed IP address instead of a fully qualified domain name (FQDN) is because a FQDN would require a DNS lookup. When the machine does not have a working internet connection, the DNS lookup itself may block the call to urllib_request.urlopen for more than a second.
import urllib2
def internet_on():
try:
urllib2.urlopen('http://216.58.192.142', timeout=1)
return True
except urllib2.URLError as err:
return False
Try to run a ping to www.google.com with the library 'subprocess', an example of code below. This solution is an indirect one, but sometimes is better to delegate in system commands to do certain jobs like this one.
import subprocess
def my_dir(my_path):
output=''
try:
p = subprocess.Popen('ping '+my_path, stdout = subprocess.PIPE, stderr = subprocess.STDOUT,
#close_fds = True, #not valid for Windows platforms
shell = True
)
output, err = p.communicate()
#print(output)
finally:
if p is not None:
try: p.kill()
except: pass
return output
print(my_dir('www.google.com'))
Then, you can parse its output to know if you have reached Google's servers or not.

When I tried to pass data to a function from another it shows me a lot of errors

This code worked successfully..
import urllib.request
def profanity():
connection = urllib.request.urlopen('http://www.wdylike.appspot.com/?q='+'bal')
output = connection.read()
print(output)
connection.close()
profanity()
But I want run the code like the below it caused problem to me. But I want to pass data which is being read from a local txt file and pass this data to profanity() function. What is to do?
import urllib.request
def read_file():
qoutes=open(r"C:\Python34\profanity.txt")
a=qoutes.read()
profanity(a)
qoutes.close()
def profanity(b):
connection = urllib.request.urlopen('http://www.wdylike.appspot.com/?q='+b)
output = connection.read()
print(output)
connection.close()
##profanity()
read_file()
The error log:
Traceback (most recent call last):
File "C:\Python34\check_profanity.py", line 18, in <module>
read_file()
File "C:\Python34\check_profanity.py", line 8, in read_file
profanity(a)
File "C:\Python34\check_profanity.py", line 12, in profanity
connection = urllib.request.urlopen('http://www.wdylike.appspot.com/?q='+b)
File "C:\Python34\lib\urllib\request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "C:\Python34\lib\urllib\request.py", line 462, in open
req = meth(req)
File "C:\Python34\lib\urllib\request.py", line 1106, in do_request_
raise URLError('no host given')
urllib.error.URLError: <urlopen error no host given>

python greenlet SSL EOF error

I keep getting the following error but i have no idea how to try..except it or handle it. Any ideas?
The error happens when i load test the server or if I send a request on port 443 that is not SSL, such as http://server:443/call
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/gevent/greenlet.py", line 327, in run
result = self._run(*self.args, **self.kwargs)
File "/usr/local/lib/python2.7/dist-packages/gevent/server.py", line 102, in wrap_socket_and_handle
ssl_socket = self.wrap_socket(client_socket, **self.ssl_args)
File "/usr/local/lib/python2.7/dist-packages/gevent/ssl.py", line 383, in wrap_socket
ciphers=ciphers)
File "/usr/local/lib/python2.7/dist-packages/gevent/ssl.py", line 94, in __init__
self.do_handshake()
File "/usr/local/lib/python2.7/dist-packages/gevent/ssl.py", line 305, in do_handshake
return self._sslobj.do_handshake()
SSLError: [Errno 8] _ssl.c:510: EOF occurred in violation of protocol
<Greenlet at 0x7f74b804eb90: <bound method WSGIServer.wrap_socket_and_handle of <WSGIServer at 0x7f74b9d16810 fileno=7 address=0.0.0.0:443>>(<socket at 0x7f74b2442590 fileno=198 sock=46.101.4, ('54.198.34.85', 26771))> failed with SSLError
The code on a basic level is as follows
from gevent.pywsgi import WSGIServer
try:
class HttpsHandler:
def application(self, env, start_response):
start_response("200 OK", "Content-Length: 0")
return []
WSGIServer(
('', 443),
application=HttpsHandler().application,
keyfile=cfg.ssl_keyfile,
certfile=cfg.ssl_crtfile,
ca_certs=cfg.ssl_cacerts,
log=open('logs/access.log', 'a')
).serve_forever()
except Exception, e:
print e
except KeyboardInterrupt, e:
pass

python paramiko module error with callback

I'm trying to use the paramiko module to copy a (big) file in my local network, and get the output to display a GtkProgressBar.
A part of my code is:
...
NetworkCopy.pbar.set_text("Copy of the file in the Pi...")
while gtk.events_pending(): # refresh the progress bar
gtk.main_iteration()
self.connection(transferred, toBeTransferred)
def connection(self, transferred, toBeTransferred):
sftp = self.sftp
fichier_pc = self.fichier_pc
chemin_pi = self.chemin_pi # var names are in french !
fichier = self.fichier
transferred = self.transferred
toBeTransferred = self.toBeTransferred
print "Transferred: {0}\tStill to send: {1}".format(transferred, toBeTransferred)
sftp.put(fichier_pc, chemin_pi + fichier, callback=self.connection)
In the terminal, I can see
Transferred: 0 Still to send: 3762398252
for a while, but after 10s I have this error:
File "network_copier.py", line 158, in connection
sftp.put(fichier_pc, chemin_pi + fichier, callback=self.connection)
File "/usr/lib/python2.7/dist-packages/paramiko/sftp_client.py", line 615, in put
return self.putfo(fl, remotepath, os.stat(localpath).st_size, callback, confirm)
File "/usr/lib/python2.7/dist-packages/paramiko/sftp_client.py", line 577, in putfo
fr.close()
File "/usr/lib/python2.7/dist-packages/paramiko/sftp_file.py", line 67, in close
self._close(async=False)
File "/usr/lib/python2.7/dist-packages/paramiko/sftp_file.py", line 88, in _close
self.sftp._request(CMD_CLOSE, self.handle)
File "/usr/lib/python2.7/dist-packages/paramiko/sftp_client.py", line 689, in _request
return self._read_response(num)
File "/usr/lib/python2.7/dist-packages/paramiko/sftp_client.py", line 721, in _read_response
raise SSHException('Server connection dropped: %s' % (str(e),))
paramiko.SSHException: Server connection dropped:
I have the 1.12.2 version of paramiko, from this ppa
Thanks for your help
Edit: The solution is to use pexpect instead of paramiko. It's working with big files.
See here

Categories