Download images in Python with BeautifulSoup

I want to download photos from an Iranian website. When I put the code in Google Colab I get a timeout error and a URLError.
from bs4 import BeautifulSoup
import urllib.request

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    # req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # thepage = urlopen(req).read()
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

i = 1
soup = make_soup("https://www.banikhodro.com/car/pride/")
for img in soup.find_all('img'):
    temp = img.get('src')
    # print(temp)
    if temp[0] == "/":
        image = "https://www.banikhodro.com/car/pride/" + temp
    else:
        image = temp
    # print(image)
    nametemp = img.get('alt')
    nametemp = str(nametemp)
    if len(nametemp) == 0:
        i = i + 1
    else:
        filename = nametemp
        imagefile = open(filename + ".jpeg", 'wb')
        imagefile.write(urllib.request.urlopen(image).read())
        imagefile.close()
TimeoutError                              Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/urllib3/connection.py in _new_conn(self)
    158             conn = connection.create_connection(
--> 159                 (self._dns_host, self.port), self.timeout, **extra_kw)
    160

15 frames

TimeoutError: [Errno 110] Connection timed out

During handling of the above exception, another exception occurred:

NewConnectionError                        Traceback (most recent call last)
NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x7f079e4cdcf8>: Failed to establish a new connection: [Errno 110] Connection timed out

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
MaxRetryError: HTTPSConnectionPool(host='www.banikhodro.com', port=443): Max retries exceeded with url: /car/pride/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f079e4cdcf8>: Failed to establish a new connection: [Errno 110] Connection timed out',))

During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    514                 raise SSLError(e, request=request)
    515
--> 516             raise ConnectionError(e, request=request)
    517
    518         except ClosedPoolError as e:

ConnectionError: HTTPSConnectionPool(host='www.banikhodro.com', port=443): Max retries exceeded with url: /car/pride/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f079e4cdcf8>: Failed to establish a new connection: [Errno 110] Connection timed out',))
I get a timeout error and a connection error. Google Colab gives me these errors when I use the Iranian website to download the images.
Thanks in advance to those who answer my questions.

One way of doing this would be:
import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.banikhodro.com/car/pride/").content
soup = BeautifulSoup(page, "html5lib").find_all("span", {"class": "photo"})

images = [
    f"https://www.banikhodro.com{img.find('img')['src']}" for img in soup
    if "Adv" in img.find("img")["src"]
]

for image in images:
    print(f"Fetching {image}")
    with open(image.rsplit("/")[-1], "wb") as img:
        img.write(requests.get(image).content)
This fetches all non-generic images of car offers to your local folder.
183093_1-m.jpg
183098_1-m.jpg
183194_1-m.jpg
183208_1-m.jpg
183209_1-m.jpg
183272_1-m.jpg
183279_1-m.jpg
183286_1-m.jpg
183384_1-m.jpg
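Since the original error was a connection timeout, it can also help to give requests an explicit timeout, a retry policy and a browser-like User-Agent instead of relying on the defaults. A minimal sketch of that idea (the retry counts, timeout values and header are illustrative assumptions, not something the site is known to require):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry a few times on connection problems and transient server errors.
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retry))
session.headers.update({"User-Agent": "Mozilla/5.0"})  # some sites reject the default UA

# timeout=(connect, read) in seconds; if the host stays unreachable this still
# ends in requests.exceptions.ConnectionError, but it fails fast and predictably.
page = session.get("https://www.banikhodro.com/car/pride/", timeout=(10, 30)).content

If the host is simply not reachable from Colab's network, no client-side setting will fix that, and the script would have to run from a network that can reach the site.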

import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.banikhodro.com/car/pride/").content
soup = BeautifulSoup(page, "html5lib")

images = [
    f"https://www.banikhodro.com{img['src']}" for img in soup.find_all('img')
    # filter accordingly based on class or id inside the find_all method
]

for image in images:
    print(f"Fetching {image}")
    with open(image.split("/")[-1], "wb") as img:
        img.write(requests.get(image).content)

pip install requests  # to install the requests module, if you don't have it yet

This code downloads all kinds of images, including footers and other generic graphics.
You can filter the image tags in the find_all method, which has a parameter called attrs.
For more information, see the BeautifulSoup documentation for find_all.
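For example, a minimal sketch of filtering with attrs (the span/photo selector is borrowed from the first answer above and is an assumption about the page's markup):

import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.banikhodro.com/car/pride/").content
soup = BeautifulSoup(page, "html5lib")

# Keep only <img> tags that sit inside <span class="photo"> elements,
# i.e. the advert photos rather than logos and footer graphics.
photo_spans = soup.find_all("span", attrs={"class": "photo"})
images = [span.find("img")["src"] for span in photo_spans if span.find("img")]
print(images)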

Related

Python Max retries exceeded with url

The scanner works until it finds an external address that is no longer reachable, and then it crashes.
I just want to scan only herold.at and extract the email addresses.
I want it to stop scanning outside addresses. I tried
r = requests.get('http://github.com', allow_redirects=False) but that does not work.
import csv
import requests
import re
import time
from bs4 import BeautifulSoup

# Number of pages plus one
allLinks = []
mails = []
url = 'https://www.herold.at/gelbe-seiten/wien/was_installateur/?page='

for page in range(3):
    time.sleep(5)
    print('---', page, '---')
    response = requests.get(url + str(page), timeout=1.001)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a.attrs.get('href') for a in soup.select('a[href]')]
    for i in links:
        # time.sleep(15)
        if ("Kontakt" in i or "Porträt"):
            allLinks.append(i)
allLinks = set(allLinks)

def findMails(soup):
    # time.sleep(15)
    for name in soup.find_all("a", "ellipsis"):
        if name is not None:
            emailText = name.text
            match = bool(re.match('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', emailText))
            if '@' in emailText and match == True:
                emailText = emailText.replace(" ", '').replace('\r', '')
                emailText = emailText.replace('\n', '').replace('\t', '')
                if (len(mails) == 0) or (emailText not in mails):
                    print(emailText)
                    mails.append(emailText)

for link in allLinks:
    if link.startswith("http") or link.startswith("www"):
        r = requests.get(link)
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        findMails(soup)
    else:
        newurl = url + link
        r = requests.get(newurl)
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        findMails(soup)

mails = set(mails)
if len(mails) == 0:
    print("NO MAILS FOUND")
Error:
requests.exceptions.ConnectionError: HTTPConnectionPool(host='www.gebrueder-lamberger.at', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000021A24AA7308>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
The error is in this line: if(link.startswith("http") or link.startswith("www")). Change the http into https and it should work. I tried it and it fetched all the emails.
--- 0 ---
--- 1 ---
--- 2 ---
office@smutny-installationen.at
office@offnerwien.at
office@remes-gmbh.at
wien13@lugar.at
office@rossbacher-at.com
office@weiner-gmbh.at
office@wojtek-installateur.at
office@b-gas.at
office@blasl-gmbh.at
gsht@aon.at
office@ertl-installationen.at
office@jakubek.co.at
office@peham-installateur.at
office@installateur-weber.co.at
office@gebrueder-lamberger.at
office@ar-allround-installationen.at
Also, you can try urllib3 to set up your own connection pool with explicit timeouts and retries.
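A minimal sketch of that idea (the timeout and retry numbers are just examples):

import urllib3

# A shared pool with per-request timeouts and a small retry budget,
# so one dead host does not hang or crash the whole scan.
http = urllib3.PoolManager(
    timeout=urllib3.Timeout(connect=3.0, read=10.0),
    retries=urllib3.Retry(total=2, raise_on_status=False),
)

try:
    resp = http.request("GET", "https://www.herold.at/gelbe-seiten/wien/was_installateur/?page=0")
    print(resp.status)
except urllib3.exceptions.MaxRetryError as err:
    # Unreachable hosts end up here instead of aborting the scanner.
    print("skipping unreachable host:", err)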

How to catch the original exception

I'm using the requests module with the max_retries option. I would like to catch only the exceptions related to timeouts and slow replies:
import requests
from requests.exceptions import ConnectTimeout, Timeout

URL = 'http://example.com/sleep'  # sleeps for 5 seconds before replying

with requests.Session() as s:
    try:
        a = requests.adapters.HTTPAdapter(max_retries=2)
        s.mount('http://', a)
        r = s.get(URL, timeout=1)
    except (ConnectTimeout, Timeout) as err:
        print('# {} - timeout'.format(URL))
But it looks like the underlying urllib3 library throws ReadTimeoutError and requests doesn't catch it and throws ConnectionError instead:
requests.exceptions.ConnectionError: HTTPConnectionPool(host='example.com', port=80): Max retries exceeded with url: /sleep (Caused by ReadTimeoutError("HTTPConnectionPool(host='example.com', port=80): Read timed out. (read timeout=1)"))
I don't want to add ConnectionError to the list because there are other exceptions that inherit from it so it would also catch those.
Is there a way to catch the original exception, or perhaps all exceptions in the chain, using the traceback module?
Ideally, you should catch those other exceptions above ConnectionError and raise them if you want your program to throw an error.
class OtherException(requests.exceptions.ConnectionError):
    pass

try:
    raise OtherException('This is other exception.')
except OtherException as oe:
    raise oe
except requests.exceptions.ConnectionError:
    print('The error you want to catch')
You can use a similar construct:
import traceback
import logging

try:
    whatever()
except Exception as e:
    logging.error(traceback.format_exc())
    # Your actions here
This will catch almost everything except, for example, KeyboardInterrupt and SystemExit.
Catching those would make the script quite hard to quit.
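If you specifically need to know whether the wrapped cause was a read timeout, another option is to inspect what requests attaches to the ConnectionError; a minimal sketch that relies on how requests currently wraps the urllib3 MaxRetryError (the URL is the same placeholder as in the question):

import requests
from requests.adapters import HTTPAdapter
from urllib3.exceptions import MaxRetryError, ReadTimeoutError

URL = 'http://example.com/sleep'  # placeholder that replies slowly

with requests.Session() as s:
    s.mount('http://', HTTPAdapter(max_retries=2))
    try:
        r = s.get(URL, timeout=1)
    except requests.exceptions.ConnectionError as err:
        # requests passes the MaxRetryError as the first argument,
        # and its .reason is the original urllib3 exception.
        wrapped = err.args[0] if err.args else None
        if isinstance(wrapped, MaxRetryError) and isinstance(wrapped.reason, ReadTimeoutError):
            print('# {} - timeout'.format(URL))
        else:
            raise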

Python requests detailed ConnectionError handling

I just wrote this:
import requests

try:
    r = requests.get('http://example.com')
except requests.exceptions.ConnectionError as e:
    print(e)
And I got this output:
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
Does anyone know how I could get the different types of connection errors, like 'connection aborted', 'connection refused' and 'connection reset', from this exception and handle them?
If your goal is to get the response message and then handle it, you can try this code:
import requests
response = requests.get("http://www.example.com")
print(response.status_code)
print(response.reason)
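If you do need to tell the low-level failures apart (refused, reset, aborted), one approach is to walk the chain of wrapped exceptions looking for the built-in socket-level errors; a minimal sketch (the port-81 URL is just a placeholder that is likely to fail):

import requests

def chain(exc):
    # Yield exc and every exception it was raised from or during.
    while exc is not None:
        yield exc
        exc = exc.__cause__ or exc.__context__

try:
    r = requests.get('http://example.com:81', timeout=3)
except requests.exceptions.ConnectionError as err:
    if any(isinstance(e, ConnectionRefusedError) for e in chain(err)):
        print('connection refused')
    elif any(isinstance(e, ConnectionResetError) for e in chain(err)):
        # http.client.RemoteDisconnected is a ConnectionResetError subclass
        print('connection reset')
    elif any(isinstance(e, ConnectionAbortedError) for e in chain(err)):
        print('connection aborted')
    else:
        print('other connection problem:', err)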

python capture URLError code

I want to use Python to monitor a website that uses HTTPS.
The problem is that the certificate on the website is invalid.
I don't care about that, I just want to know that the website is running.
My working code looks like this:
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request("https://somedomain.com")
try:
    response = urlopen(req)
except HTTPError as e:
    print('server couldn\'t fulfill the request')
    print('error code: ', e.code)
except URLError as e:
    print(e.args)
else:
    print('website ok')
That ends in URLError being raised. The error code is 645.
C:\python>python monitor443.py
(SSLError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:645)'),)
So, I'm trying to treat error code 645 as OK. I've tried this:
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request("https://somedomain.com")
try:
    response = urlopen(req)
except HTTPError as e:
    print('server couldn\'t fulfill the request')
    print('error code: ', e.code)
except URLError as e:
    if e.code == 645:
        print("ok")
    print(e.args)
else:
    print('website ok')
but get this error:
Traceback (most recent call last):
File "monitor443.py", line 11, in <module>
if e.code == 645:
AttributeError: 'URLError' object has no attribute 'code'
How do I add this exception?
Please have a look at the great requests package. It will simplify your life when doing http communication. See http://requests.readthedocs.io/en/master/.
pip install requests
To skip the certificate check, you would do something like this (note the verify parameter!):
requests.get('https://kennethreitz.com', verify=False)
<Response [200]>
See the full documentation here.
HTH
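If you go the verify=False route in a monitoring script, requests will emit an InsecureRequestWarning on every call; you can silence it like this (acceptable here only because the check is deliberately skipped):

import requests
import urllib3

# We knowingly skip certificate verification, so mute the warning it triggers.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

r = requests.get('https://somedomain.com', verify=False, timeout=10)
print('website ok' if r.ok else 'server returned {}'.format(r.status_code))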
I couldn't install the SSL library (egg_info error).
This is what I ended up doing:
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

def sendEmail(r):
    # send notification
    print('send notify')

req = Request("https://somedomain.com")
try:
    response = urlopen(req)
except HTTPError as e:
    print('server couldn\'t fulfill the request')
    print('error code: ', e.code)
    sendEmail('server couldn\'t fulfill the request')
except URLError as e:
    theReason = str(e.reason)
    # [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:645)
    if theReason.find('CERTIFICATE_VERIFY_FAILED') == -1:
        sendEmail(theReason)
    else:
        print('website ok')
else:
    print('website ok')
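If installing extra packages is not an option, the standard library can also skip certificate verification directly, which avoids matching on the error text; a sketch using an unverified ssl context (again, only reasonable for a monitoring check):

import ssl
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

# A context that still speaks TLS but skips certificate and hostname checks.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

req = Request("https://somedomain.com")
try:
    response = urlopen(req, context=ctx, timeout=10)
except HTTPError as e:
    print("server couldn't fulfill the request, error code:", e.code)
except URLError as e:
    print('website unreachable:', e.reason)
else:
    print('website ok')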

Python Requests package: lost connection while streaming

I'd like to use the Requests package to connect to the streaming API of a web service. Suppose I use the following code to send a request, receive the response and iterate through the lines of the response as they arrive:
import requests

r = requests.get('http://httpbin.org/stream/20', stream=True)
for line in r.iter_lines():
    if line:
        print(line)
While waiting to receive new data, we are basically waiting for r.iter_lines() to generate a new piece of data. But what if I lose internet connection while waiting? How can I find out so I can attempt to reconnect?
You can disconnect from your network to try it out. Requests raises an error like this:
requests.exceptions.ConnectionError: HTTPConnectionPool(host='httpbin.org', port=80): Max retries exceeded with url: /stream/20 (Caused by : [Errno -3] Temporary failure in name resolution)
The error message shows that Requests already retries on network errors. You can refer to this answer for setting max_retries. If you want more customization (e.g. waiting between retries), do it in a loop:
import socket
import requests
import time

MAX_RETRIES = 2
WAIT_SECONDS = 5

for i in range(MAX_RETRIES):
    try:
        r = requests.get('http://releases.ubuntu.com/14.04.1/ubuntu-14.04.1-desktop-amd64.iso',
                         stream=True, timeout=10)
        idx = 1
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                print('Chunk %d received' % idx)
                idx += 1
        break
    except requests.exceptions.ConnectionError:
        print('build http connection failed')
    except socket.timeout:
        print('download failed')
    time.sleep(WAIT_SECONDS)
else:
    print('all tries failed')
EDIT: I tested this with a large file. I used iter_content instead, because it's a binary file. iter_lines is based on iter_content (source code), so I believe the behaviour is the same. Procedure: run the code with the network connected. After receiving some chunks, disconnect. Wait 2-3 seconds, reconnect, and the download continues. So the requests package DOES retry when the connection is lost during the iteration.
Note: if there is no network when the connection is being built (requests.get()), ConnectionError is raised; if the network is lost during iter_lines / iter_content, socket.timeout is raised.
