Python 3 get HTML content - python

I'm using this code to get the web site html content,
import urllib.request
import lxml.html as lh
req = urllib.request.Request("http://www.ip-adress.com/ip_tracer/157.123.22.11",
                             headers={'User-Agent': "Magic Browser"})
html = urllib.request.urlopen(req).read()
doc = lh.fromstring(html)
print(''.join(doc.xpath('.//*[@class="odd"]')[-1].text_content().split()))
I want to get the Organization: Zenith Data Systems, but it shows some errors:
Traceback (most recent call last):
File "/usr/local/python3.2.3/lib/python3.2/urllib/request.py", line 1135, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/local/python3.2.3/lib/python3.2/http/client.py", line 967, in request
self._send_request(method, url, body, headers)
File "/usr/local/python3.2.3/lib/python3.2/http/client.py", line 1005, in _send_request
self.endheaders(body)
File "/usr/local/python3.2.3/lib/python3.2/http/client.py", line 963, in endheaders
self._send_output(message_body)
File "/usr/local/python3.2.3/lib/python3.2/http/client.py", line 808, in _send_output
self.send(msg)
File "/usr/local/python3.2.3/lib/python3.2/http/client.py", line 746, in send
self.connect()
File "/usr/local/python3.2.3/lib/python3.2/http/client.py", line 724, in connect
self.timeout, self.source_address)
File "/usr/local/python3.2.3/lib/python3.2/socket.py", line 404, in create_connection
raise err
File "/usr/local/python3.2.3/lib/python3.2/socket.py", line 395, in create_connection
sock.connect(sa)
socket.error: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "ext.py", line 4, in <module>
html = urllib.request.urlopen(req).read()
File "/usr/local/python3.2.3/lib/python3.2/urllib/request.py", line 138, in urlopen
return opener.open(url, data, timeout)
File "/usr/local/python3.2.3/lib/python3.2/urllib/request.py", line 369, in open
response = self._open(req, data)
File "/usr/local/python3.2.3/lib/python3.2/urllib/request.py", line 387, in _open
'_open', req)
File "/usr/local/python3.2.3/lib/python3.2/urllib/request.py", line 347, in _call_chain
result = func(*args)
File "/usr/local/python3.2.3/lib/python3.2/urllib/request.py", line 1155, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/local/python3.2.3/lib/python3.2/urllib/request.py", line 1138, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 111] Connection refused>
How can I solve it? Thanks.

Basically, Connection Refused means the server actively rejected your connection: for example, only registered users are allowed to access the page, or the server is under heavy maintenance, or similar reasons.
From your code above, if you want to handle errors you can use try and except, like the code below:
try:
    req = urllib.request.Request("http://www.ip-adress.com/ip_tracer/157.123.22.11",
                                 headers={'User-Agent': "Magic Browser"})
    html = urllib.request.urlopen(req).read()
    doc = lh.fromstring(html)
    print(''.join(doc.xpath('.//*[@class="odd"]')[-1].text_content().split()))
except urllib.error.URLError as e:
    print(e.reason)
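If you also want to tell HTTP error responses (403, 503, ...) apart from connection failures, urllib raises HTTPError (a subclass of URLError) for the former, so it has to be caught first. A small sketch of that split, using the same request as above:

import urllib.request
import urllib.error
import lxml.html as lh

try:
    req = urllib.request.Request("http://www.ip-adress.com/ip_tracer/157.123.22.11",
                                 headers={'User-Agent': "Magic Browser"})
    html = urllib.request.urlopen(req, timeout=10).read()
    doc = lh.fromstring(html)
except urllib.error.HTTPError as e:
    # the server answered, but with an error status code
    print("HTTP error:", e.code)
except urllib.error.URLError as e:
    # the connection itself failed (refused, DNS error, timeout, ...)
    print("connection failed:", e.reason)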

Related

Web Scraping in HTML using py-script

<body class="white-vertion black-bg">
<!-- Start Loader -->
<p>
<py-script>
import ssl
from urllib.request import urlopen
from bs4 import BeautifulSoup
context = ssl._create_unverified_context()
result = urlopen("https://blog.naver.com/PostList.naver?blogId=woong3164&categoryNo=0&from=postList", context=context)
bsObj = BeautifulSoup(result.read(), "html.parser")
</py-script>
</p>
I used py-script in HTML to do web scraping. However, this error occurred.
'JsException(PythonError: Traceback (most recent call last):
File "/lib/python3.10/urllib/request.py", line 1348, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "/lib/python3.10/http/client.py", line 1282, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/lib/python3.10/http/client.py", line 1328, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/lib/python3.10/http/client.py", line 1277, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/lib/python3.10/http/client.py", line 1037, in _send_output
self.send(msg)
File "/lib/python3.10/http/client.py", line 975, in send
self.connect()
File "/lib/python3.10/http/client.py", line 1447, in connect
super().connect()
File "/lib/python3.10/http/client.py", line 941, in connect
self.sock = self._create_connection(
File "/lib/python3.10/socket.py", line 845, in create_connection
raise err
File "/lib/python3.10/socket.py", line 833, in create_connection
sock.connect(sa)
BlockingIOError: [Errno 26] Operation in progress
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/lib/python3.10/site-packages/_pyodide/_base.py", line 429, in eval_code
.run(globals, locals)
File "/lib/python3.10/site-packages/_pyodide/_base.py", line 300, in run
coroutine = eval(self.code, globals, locals)
File "", line 6, in
File "/lib/python3.10/urllib/request.py", line 216, in urlopen
return opener.open(url, data, timeout)
File "/lib/python3.10/urllib/request.py", line 519, in open
response = self._open(req, data)
File "/lib/python3.10/urllib/request.py", line 536, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/lib/python3.10/urllib/request.py", line 496, in _call_chain
result = func(*args)
File "/lib/python3.10/urllib/request.py", line 1391, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/lib/python3.10/urllib/request.py", line 1351, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 26] Operation in progress>)'
I think this error was caused by ssl.
How can I solve this error?
Your problem is caused by using unsupported Python packages. The package urllib uses APIs (TCP sockets) that do not exist in the browser. This is not a limitation of PyScript; no browser application can use socket-based APIs.
The solution is to use supported APIs such as fetch or pyfetch.
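For example, a minimal sketch with pyfetch (the URL here is only a placeholder; the target site must send CORS headers that allow the request from the browser, which blog.naver.com may not, so a CORS proxy could be needed):

<py-script>
import asyncio
from pyodide.http import pyfetch
from bs4 import BeautifulSoup

async def main():
    # pyfetch wraps the browser's fetch() API, so it works where raw sockets do not
    response = await pyfetch("https://example.com/")
    html = await response.string()
    bsObj = BeautifulSoup(html, "html.parser")
    print(bsObj.title)

# schedule the coroutine on the browser's event loop
asyncio.ensure_future(main())
</py-script>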

Python script crashes when it fails to connect to server

I'm testing my Python script, and when the server is turned off, my script crashes. How can I change this so that it tries the connection again? Right now my script crashes when it fails to connect to the server. Here is my script:
import urllib.request
import json

def connectToServer():
    with urllib.request.urlopen("http://localhost:5000/user/connect") as url:
        data = json.loads(url.read().decode())
Here is the error:
Traceback (most recent call last):
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 1346, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1301, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1250, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1010, in _send_output
self.send(msg)
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 950, in send
self.connect()
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 921, in connect
self.sock = self._create_connection(
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\socket.py", line 843, in create_connection
raise err
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\socket.py", line 831, in create_connection
sock.connect(sa)
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "e:\Projects\Hacker_Pro Python\main.py", line 4, in <module>
server.connectToServer()
File "e:\Projects\Hacker_Pro Python\server.py", line 8, in connectToServer
with urllib.request.urlopen("http://localhost:5000/user/connect") as url:
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 517, in open
response = self._open(req, data)
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 494, in _call_chain
result = func(*args)
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 1375, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\Rostik\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 1349, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [WinError 10061] No connection could be made because the target machine actively refused it>
The Python try statement may help you.
def connectToServer():
    try:
        with urllib.request.urlopen("http://localhost:5000/user/connect") as url:
            data = json.loads(url.read().decode())
    except ConnectionRefusedError as error:
        print('connection was refused:\n{}'.format(error))
    except:
        print('could not connect to server')
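If the goal is to reconnect automatically once the server is reachable again, note that urlopen wraps the low-level ConnectionRefusedError in urllib.error.URLError, so catching URLError in a small retry loop is one option. A rough sketch (the retry count and delay values are arbitrary):

import json
import time
import urllib.error
import urllib.request

def connectToServer(retries=5, delay=3):
    for attempt in range(1, retries + 1):
        try:
            with urllib.request.urlopen("http://localhost:5000/user/connect") as url:
                return json.loads(url.read().decode())
        except urllib.error.URLError as error:
            # server unreachable; wait and try again
            print('attempt {} failed: {}'.format(attempt, error.reason))
            time.sleep(delay)
    return None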

Exception handling results in "unresolved reference" python

Currently I'm trying to read a webpage and then process the content but I also want to implement exception handling for when the client is offline. This is the code I have so far:
from urllib.request import Request, urlopen
from lxml import html

def get_webSite(link):  # returns a HTML page which can be read by the xPath
    req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
    return html.fromstring(urlopen(req).read())

try:
    htmlPage = get_webSite("www.example.com")
    print(htmlPage)
except urllib.error.URLError as e:
    print("You're offline, try again later")
The problem is that Python does not seem to recognize urllib.error.URLError, which gets thrown if there is no internet connection. IntelliJ always tells me "Unresolved reference urllib".
This is the output of the console without any internet connection:
"C:\Program Files\Python39\py
thon.exe" C:/test/test.py
Traceback (most recent call last):
File "C:\Program Files\Python39\lib\urllib\request.py", line 1342, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Program Files\Python39\lib\http\client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Program Files\Python39\lib\http\client.py", line 1301, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Program Files\Python39\lib\http\client.py", line 1250, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Program Files\Python39\lib\http\client.py", line 1010, in _send_output
self.send(msg)
File "C:\Program Files\Python39\lib\http\client.py", line 950, in send
self.connect()
File "C:\Program Files\Python39\lib\http\client.py", line 1417, in connect
super().connect()
File "C:\Program Files\Python39\lib\http\client.py", line 921, in connect
self.sock = self._create_connection(
File "C:\Program Files\Python39\lib\socket.py", line 822, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
File "C:\Program Files\Python39\lib\socket.py", line 953, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/test/test.py", line 17, in <module>
htmlPage = get_webSite("https://www.google.com")
File "C:/test/test.py", line 10, in get_webSite
return html.fromstring(urlopen(req).read())
File "C:\Program Files\Python39\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\Python39\lib\urllib\request.py", line 517, in open
response = self._open(req, data)
File "C:\Program Files\Python39\lib\urllib\request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Program Files\Python39\lib\urllib\request.py", line 494, in _call_chain
result = func(*args)
File "C:\Program Files\Python39\lib\urllib\request.py", line 1385, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Program Files\Python39\lib\urllib\request.py", line 1345, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 11001] getaddrinfo failed>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/test/test.py", line 19, in <module>
except urllib.error.URLError as e:
NameError: name 'urllib' is not defined
Process finished with exit code 1
I'm only able to catch this error with "except:" but not with "except urllib.error.URLError:". How can I solve that?
You only imported names from urllib.request, so the name urllib itself is never bound in your module and urllib.error is unknown; that is why the except line raises NameError.
Adding import urllib.error (or importing the exception directly with from urllib.error import URLError) should do the trick.
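A minimal corrected version of the script (same logic; only the extra import is added, and a full URL with a scheme is used as a placeholder):

import urllib.error
from urllib.request import Request, urlopen
from lxml import html

def get_webSite(link):  # returns an HTML page which can be read by xPath
    req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
    return html.fromstring(urlopen(req).read())

try:
    htmlPage = get_webSite("https://www.example.com")
    print(htmlPage)
except urllib.error.URLError as e:
    print("You're offline, try again later")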

Getting URLError while running Python code for extracting URLs in Ubuntu

Below is the stack trace from the terminal in Ubuntu.
Even my Anaconda is taking too much time to open (around 20 minutes).
Traceback (most recent call last):
File "/home/narendra/anaconda3/lib/python3.7/urllib/request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/home/narendra/anaconda3/lib/python3.7/http/client.py", line 1244, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/home/narendra/anaconda3/lib/python3.7/http/client.py", line 1290, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/home/narendra/anaconda3/lib/python3.7/http/client.py", line 1239, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/home/narendra/anaconda3/lib/python3.7/http/client.py", line 1026, in _send_output
self.send(msg)
File "/home/narendra/anaconda3/lib/python3.7/http/client.py", line 966, in send
self.connect()
File "/home/narendra/anaconda3/lib/python3.7/http/client.py", line 1406, in connect
super().connect()
File "/home/narendra/anaconda3/lib/python3.7/http/client.py", line 938, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/home/narendra/anaconda3/lib/python3.7/socket.py", line 727, in create_connection
raise err
File "/home/narendra/anaconda3/lib/python3.7/socket.py", line 716, in create_connection
sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "TASK_1.py", line 23, in <module>
response = urllib.request.urlopen(line,context=gcontext)
File "/home/narendra/anaconda3/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/home/narendra/anaconda3/lib/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/home/narendra/anaconda3/lib/python3.7/urllib/request.py", line 543, in _open
'_open', req)
File "/home/narendra/anaconda3/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/home/narendra/anaconda3/lib/python3.7/urllib/request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/home/narendra/anaconda3/lib/python3.7/urllib/request.py", line 1319, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 110] Connection timed out>
Below is my code.
It extracts the URL data into files: it picks the URLs one by one from url.txt, then extracts all the page data from each URL.
import urllib.request, urllib.error, urllib.parse
import io
import ssl

#localhost, 127.0.0.0/8, ::1, 10.0.0.0/8
# using readline() that reads file line by line.
file1 = open("url.txt", "r")
count = 0
gcontext = ssl.SSLContext()
for i in range(18):
    count += 1
    # Getting the next line from file
    line = file1.readline()
    # if line is empty
    # end of file is reached
    if not line:
        break
    response = urllib.request.urlopen(line, context=gcontext)
    webContent = response.read()
    with io.open("file_" + str(i) + ".txt", 'w', encoding='utf-8') as f:
        f.write(webContent)
        f.close()
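A minimal sketch of the same loop with urlopen wrapped in try/except, assuming the goal is to skip URLs that time out instead of aborting the whole run (urlopen wraps the connection timeout in urllib.error.URLError):

import io
import ssl
import urllib.request, urllib.error

gcontext = ssl.SSLContext()
file1 = open("url.txt", "r")
for i in range(18):
    line = file1.readline()
    if not line:
        break
    try:
        # a short timeout so an unreachable host fails quickly instead of hanging
        response = urllib.request.urlopen(line, context=gcontext, timeout=10)
        webContent = response.read()
    except urllib.error.URLError as err:
        print("skipping", line.strip(), "->", err.reason)
        continue
    # response.read() returns bytes, so write the file in binary mode
    with io.open("file_" + str(i) + ".txt", 'wb') as f:
        f.write(webContent)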

urlopen error [Errno 11001] getaddrinfo failed

I'm new to Python and I'm following a video tutorial.
So here's the code snippet:
from urllib.request import urlopen

with urlopen('http://sixty-north/c/t.txt') as story:
    story_words = []
    for line in story:
        line_words = line.decode('utf-8').split()
        for word in line_words:
            story_words.append(word)
I'm able to access http://sixty-north.com/c/t.txt in my browser.
However when I type this into command prompt: python words.py I get this error:
C:\New folder>python words.py
Traceback (most recent call last):
File "C:\Python33\lib\urllib\request.py", line 1248, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "C:\Python33\lib\http\client.py", line 1065, in request
self._send_request(method, url, body, headers)
File "C:\Python33\lib\http\client.py", line 1103, in _send_request
self.endheaders(body)
File "C:\Python33\lib\http\client.py", line 1061, in endheaders
self._send_output(message_body)
File "C:\Python33\lib\http\client.py", line 906, in _send_output
self.send(msg)
File "C:\Python33\lib\http\client.py", line 844, in send
self.connect()
File "C:\Python33\lib\http\client.py", line 822, in connect
self.timeout, self.source_address)
File "C:\Python33\lib\socket.py", line 417, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "words.py", line 2, in <module>
with urlopen('http://sixty-north/c/t.txt') as story:
File "C:\Python33\lib\urllib\request.py", line 156, in urlopen
return opener.open(url, data, timeout)
File "C:\Python33\lib\urllib\request.py", line 469, in open
response = self._open(req, data)
File "C:\Python33\lib\urllib\request.py", line 487, in _open
'_open', req)
File "C:\Python33\lib\urllib\request.py", line 447, in _call_chain
result = func(*args)
File "C:\Python33\lib\urllib\request.py", line 1274, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Python33\lib\urllib\request.py", line 1251, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 11001] getaddrinfo failed>
There is no such host: sixty-north. Replace sixty-north with sixty-north.com (notice: .com at the end)
Change the host sixty-north to sixty-north.com.
Also, restart your internet connection; it may not be working properly.
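For reference, here is the snippet from the question with only the host name corrected (everything else unchanged):

from urllib.request import urlopen

# note the .com in the host name
with urlopen('http://sixty-north.com/c/t.txt') as story:
    story_words = []
    for line in story:
        line_words = line.decode('utf-8').split()
        for word in line_words:
            story_words.append(word)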
