Skipping errors in a loop when web scraping - Python

I have taken links from one page (159 in total) and now want to run them through a loop and get information from each of their pages.
When I do this I get an error, as one of the links comes back with an error (I think that is why it is failing).
Any advice, direction, or help would be greatly appreciated.
Thank you.
See code below:
import requests
from bs4 import BeautifulSoup
import pandas as pd

baseurl = "https://www.auveco.com"

productlinks = []
r = requests.get('https://www.auveco.com/products')
soup = BeautifulSoup(r.content, 'lxml')

productlist = soup.find_all('li', class_='opened')
for item in productlist:
    for link in item.find_all('a', href=True):
        productlinks.append(baseurl + link['href'])
#print(productlinks)

#part#2
partno = []
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        name = soup.find_all('li', class_='product-code').text
    except:
        name = 'nopage'
    print(name)
Here is the error I am getting:
Traceback (most recent call last):
File "C:\Program Files\Python39\lib\site-packages\requests\models.py", line 382, in prepare_url
scheme, auth, host, port, path, query, fragment = parse_url(url)
File "C:\Program Files\Python39\lib\site-packages\urllib3\util\url.py", line 392, in parse_url
return six.raise_from(LocationParseError(source_url), None)
File "<string>", line 3, in raise_from
urllib3.exceptions.LocationParseError: Failed to parse: https://www.auveco.comjavascript:void(0);
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\ppluc\PycharmProjects\pythonProject\auv22.py", line 30, in <module>
r = requests.get(link, headers=headers)
File "C:\Program Files\Python39\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Program Files\Python39\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Program Files\Python39\lib\site-packages\requests\sessions.py", line 528, in request
prep = self.prepare_request(req)
File "C:\Program Files\Python39\lib\site-packages\requests\sessions.py", line 456, in prepare_request
p.prepare(
File "C:\Program Files\Python39\lib\site-packages\requests\models.py", line 316, in prepare
self.prepare_url(url, params)
File "C:\Program Files\Python39\lib\site-packages\requests\models.py", line 384, in prepare_url
raise InvalidURL(*e.args)
requests.exceptions.InvalidURL: Failed to parse: https://www.auveco.comjavascript:void(0);
nopage

You can also wrap r = requests.get(link, headers=headers) in a try/except block, like:
try:
    r = requests.get(link, headers=headers)
except requests.exceptions.InvalidURL as e:
    print(str(e))
Then we can also skip the other parsing steps if the get fails:
#part#2
partno = []
for link in productlinks:
    try:
        r = requests.get(link, headers=headers)
        soup = BeautifulSoup(r.content, 'lxml')
        name = soup.find_all('li', class_='product-code').text
    except requests.exceptions.InvalidURL as e:
        print(str(e))
        name = 'url invalid'
    except:
        name = 'nopage'
    print(name)
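
Another option is to avoid building the bad URLs in the first place. The traceback shows the invalid link is a javascript:void(0); anchor, so filtering those hrefs out while collecting productlinks keeps the request loop clean. A minimal sketch, assuming the javascript: anchors are the only problem links:

import requests
from bs4 import BeautifulSoup

baseurl = "https://www.auveco.com"
productlinks = []

r = requests.get('https://www.auveco.com/products')
soup = BeautifulSoup(r.content, 'lxml')

for item in soup.find_all('li', class_='opened'):
    for link in item.find_all('a', href=True):
        href = link['href']
        # javascript: pseudo-links are not real pages, so skip them
        # instead of concatenating them onto the base URL.
        if href.startswith('javascript'):
            continue
        productlinks.append(baseurl + href)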

Related

I am scraping LinkedIn profiles but got an error

I want to scrape LinkedIn profiles based on specific keywords but got an error. Here is my code:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
from tkinter import *


class Linkedin():
    def getData(self):
        driver = webdriver.Chrome('/home/danish-khan/scrapers/researchgate/chromedriver')
        driver.get('https://www.linkedin.com/login')
        driver.find_element_by_id('username').send_keys('danishkhankd237@gmail.com')  # Enter username of linkedin account here
        driver.find_element_by_id('password').send_keys('dankhanish446')  # Enter Password of linkedin account here
        driver.find_element_by_xpath("//button[@type='submit']").click()

        #*********** Search Result ***************#
        search_key = "data analyst"  # Enter your Search key here to find people
        key = search_key.split()
        print('\nkeyword:', key)
        keyword = ""
        for key1 in key:
            keyword = keyword + str(key1).capitalize() + "%20"
        keyword = keyword.rstrip("%20")
        print('\nkeyword2 :', keyword)

        #global data
        data = []
        profile_links = []
        for no in range(1, 3):
            start = "&page={}".format(no)
            search_url = "https://www.linkedin.com/search/results/people/?keywords={}&origin=SUGGESTION{}".format(keyword, start)
            driver.get(search_url)
            # driver.maximize_window()
            for scroll in range(2):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
            search = BeautifulSoup(driver.page_source, 'lxml')
            for people in search.findAll('span', class_='t-16'):
                profiles = people.find_all('a', attrs={'class': 'app-aware-link'})
                count = 0
                for i in profiles:
                    profiles2 = i['href']
                    print(profiles2)
                    profile_links.append(profiles2)
                print("Going to scrape Page {} data".format(no))
                print('\nprofile_links :', profile_links)

                lent = 0
                for people in profile_links:
                    #count = 0
                    # if count%2==0:
                    #     lent+=1
                    print('Profile :', people)
                    driver.get(people)
                    print('\ngetting\n')

                    # #********** Profile Details **************#
                    card = BeautifulSoup(driver.page_source, 'lxml')
                    try:
                        Name = card.find('h1', attrs={'class': 'text-heading-xlarge inline t-24 v-align-middle break-words'}).text
                    except:
                        Name = 'None'
                    try:
                        Work_at = (card.find('div', attrs={'class': 'text-body-medium break-words'}).text).strip()
                    except:
                        Work_at = "None"
                    try:
                        Image = card.find("img", attrs={'loading': 'lazy'})['src']
                    except:
                        Image = 'None'
                    try:
                        Education = card.find('h3', attrs={'class': 'pv-entity__school-name t-16 t-black t-bold'}).text
                    except:
                        Education = 'None'
                    try:
                        Location = soup.find('span', attrs={'class': 'text-body-small inline t-black--light break-words'}).text.strip()
                    except:
                        Location = 'None'

                    details = {
                        'Name': 'hgf',  # card.find('h1', attrs={'class': 'text-heading-xlarge inline t-24 v-align-middle break-words'}).text,
                        'Location': '',
                        'Work_at': '',
                        'Education': '',
                        'Profile_image': '',
                        'Website': '',
                        'Email': ''
                    }
                    details['Name'] = Name
                    print(details)
                    time.sleep(15)
                    driver.quit()
        driver.quit()

    def start(self):
        self.getData()


if __name__ == "__main__":
    obJH = Linkedin()
    obJH.start()
Firstly, I want to collect all the URLs of the user profiles for a specific keyword (here, data analyst) and then go through all the profile URLs to scrape specific data from those profiles, but it only scrapes two URLs and not all of them. Secondly, when going through the list of URLs I get the error:
python linkdn2.py
keyword: ['data', 'analyst']
keyword2 : Data%20Analyst
https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
Going to scrape Page 1 data
profile_links : ['https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck']
Profile : https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
getting
{'Name': 'Roshaan Khan', 'Location': '', 'Work_at': '', 'Education': '', 'Profile_image': '', 'Website': '', 'Email': ''}
https://www.linkedin.com/in/sabanasimbutt?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAB7iVNAB_l8blfjWUwqgsV-bkjV3X_3ODdk
Going to scrape Page 1 data
profile_links : ['https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck', 'https://www.linkedin.com/in/sabanasimbutt?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAB7iVNAB_l8blfjWUwqgsV-bkjV3X_3ODdk']
Profile : https://www.linkedin.com/in/roshaankhan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACL58nQBKUordklUHOqNKThOLHNSLnirIck
Traceback (most recent call last):
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 159, in _new_conn
conn = connection.create_connection(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/connection.py", line 84, in create_connection
raise err
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/connection.py", line 74, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 387, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/usr/lib/python3.8/http/client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1301, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1250, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/lib/python3.8/http/client.py", line 1010, in _send_output
self.send(msg)
File "/usr/lib/python3.8/http/client.py", line 950, in send
self.connect()
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 187, in connect
conn = self._new_conn()
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connection.py", line 171, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f431515f610>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "linkdn2.py", line 108, in <module>
obJH.start()
File "linkdn2.py", line 104, in start
self.getData()
File "linkdn2.py", line 55, in getData
driver.get(people)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
self.execute(Command.GET, {'url': url})
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/selenium/webdriver/remote/remote_connection.py", line 397, in _request
resp = self._conn.request(method, url, body=body, headers=headers)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/request.py", line 79, in request
return self.request_encode_body(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/request.py", line 171, in request_encode_body
return self.urlopen(method, url, **extra_kw)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/poolmanager.py", line 330, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 747, in urlopen
return self.urlopen(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/home/danish-khan/scrapers/scrpers/lib/python3.8/site-packages/urllib3/util/retry.py", line 436, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=56707): Max retries exceeded with url: /session/b7431e8051979e6a9a308bdfd59bf60a/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f431515f610>: Failed to establish a new connection: [Errno 111] Connection refused'))
I have tried many ways to solve this but can't find a solution.
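
One thing worth checking, judging from the code and the traceback: the ConnectionRefusedError is raised against the local chromedriver port, which is what happens when driver.quit() has already been called before the next driver.get(people). In the code above, driver.quit() sits inside the profile loop, so the browser session is closed right after the first profile is scraped. A minimal sketch of that part of the loop with quit() moved to the very end (the extraction logic is assumed to stay exactly as it is):

for people in profile_links:
    print('Profile :', people)
    driver.get(people)
    card = BeautifulSoup(driver.page_source, 'lxml')
    # ... extract Name, Work_at, Image, Education, Location and build
    # the details dict exactly as before ...
    time.sleep(15)

# Quit only once, after every profile has been visited.
driver.quit()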

I am getting an error using urllib and bs4: "http.client.BadStatusLine"

I have this file called 'recognized.txt' which has some text in it.
Link to recognized.txt: https://drive.google.com/file/d/1yCQz6cQPDmcCOuXBOCAX4nvNoUqewE0y/view?usp=sharing
My code:
f = open('recognized.txt', 'r')
message = f.read()
message.replace(" ", "")
print(message)
f.close()

import bs4 as bs
import urllib.request

url = ('https://html.duckduckgo.com/html?q=' + message)  # no javascript
sauce = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(sauce, 'lxml')
a = soup.body.b
print(a)
for i in soup.find_all('a', class_='result__snippet'):
    print(i.get_text(separator=' - ', strip=True))
When I run the above code it gives me this error:
Traceback (most recent call last):
File "D:\ocr\webparse.py", line 26, in <module>
sauce = urllib.request.urlopen(url).read()
File "C:\Users\Praveen\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Praveen\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 526, in open
response = self._open(req, data)
File "C:\Users\Praveen\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 544, in _open
'_open', req)
File "C:\Users\Praveen\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\Praveen\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Users\Praveen\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 1321, in do_open
r = h.getresponse()
File "C:\Users\Praveen\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Users\Praveen\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Users\Praveen\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 279, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
What does the error mean?
Why am I getting that error?
After running your code with your txt file I managed to replicate the issue. This is what I did:
Removed all the newlines and extra spaces from your message with strip().
Removed 'lxml' from BeautifulSoup().
This seems to produce a decent result.
import bs4 as bs
import urllib.request

with open('Downloads/recognized.txt') as f:
    message = f.read().strip()

url = ('https://html.duckduckgo.com/html?q=' + message)
sauce = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(sauce)
a = soup.body.b
print(a)
for i in soup.find_all('a', class_='result__snippet'):
    print(i.get_text(separator=' - ', strip=True))
The printout looks like this:
<b>Dinosaur</b>
Dinosaurs - are a diverse group of reptiles of the clade Dinosauria. They first appeared during the Triassic period, between 243 and 233.23 million years ago...
🎦 - Dinosaur - . Quite the same Wikipedia. Just better. - Dinosaur - . From Wikipedia, the free encyclopedia.
Мультфильм, триллер, приключения. Режиссер: Эрик Лейтон, Ральф Зондаг. В ролях: Элфри Вудард, Осси Дэвис, Макс Казелла и др. Путешествие трехтонного игуанодонта по имени Аладар...
Перевод слова - dinosaur - , американское и британское произношение, транскрипция, словосочетания, примеры использования.
The problem seems to be with your message variable. I cleaned it up so it's a simple string without newlines, and now it works fine.
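
If recognized.txt can contain spaces or other special characters, it may be more robust to URL-encode the query instead of relying on the file being clean. A minimal sketch using urllib.parse.quote_plus (same file and endpoint as above; this is an alternative, not the approach in the answer):

import urllib.parse
import urllib.request
import bs4 as bs

with open('recognized.txt') as f:
    message = f.read().strip()

# Percent-encode the query so spaces or newlines cannot break the request line.
url = 'https://html.duckduckgo.com/html?q=' + urllib.parse.quote_plus(message)
sauce = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(sauce, 'html.parser')
for i in soup.find_all('a', class_='result__snippet'):
    print(i.get_text(separator=' - ', strip=True))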

How to solve 'RecursionError: maximum recursion depth exceeded' with Eventlet and Requests in Python

I am trying to implement the Amazon Web Scraper mentioned here. However, I get the output shown below. The output repeats until it stops with RecursionError: maximum recursion depth exceeded.
I have already tried downgrading eventlet to version 0.17.4 as mentioned here.
Also, the requests module is getting patched, as you can see in helpers.py.
helpers.py
import os
import random
from datetime import datetime
from urllib.parse import urlparse

import eventlet
requests = eventlet.import_patched('requests.__init__')
time = eventlet.import_patched('time')

import redis
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

import settings

num_requests = 0

redis = redis.StrictRedis(host=settings.redis_host, port=settings.redis_port, db=settings.redis_db)


def make_request(url, return_soup=True):
    # global request building and response handling
    url = format_url(url)
    if "picassoRedirect" in url:
        return None  # skip the redirect URLs

    global num_requests
    if num_requests >= settings.max_requests:
        raise Exception("Reached the max number of requests: {}".format(settings.max_requests))

    proxies = get_proxy()
    try:
        r = requests.get(url, headers=settings.headers, proxies=proxies)
    except RequestException as e:
        log("WARNING: Request for {} failed, trying again.".format(url))

    num_requests += 1
    if r.status_code != 200:
        os.system('say "Got non-200 Response"')
        log("WARNING: Got a {} status code for URL: {}".format(r.status_code, url))
        return None

    if return_soup:
        return BeautifulSoup(r.text), r.text
    return r


def format_url(url):
    # make sure URLs aren't relative, and strip unnecessary query args
    u = urlparse(url)
    scheme = u.scheme or "https"
    host = u.netloc or "www.amazon.de"
    path = u.path

    if not u.query:
        query = ""
    else:
        query = "?"
        for piece in u.query.split("&"):
            k, v = piece.split("=")
            if k in settings.allowed_params:
                query += "{k}={v}&".format(**locals())
        query = query[:-1]

    return "{scheme}://{host}{path}{query}".format(**locals())


def log(msg):
    # global logging function
    if settings.log_stdout:
        try:
            print("{}: {}".format(datetime.now(), msg))
        except UnicodeEncodeError:
            pass  # squash logging errors in case of non-ascii text


def get_proxy():
    # choose a proxy server to use for this request, if we need one
    if not settings.proxies or len(settings.proxies) == 0:
        return None

    proxy = random.choice(settings.proxies)
    proxy_url = "socks5://{user}:{passwd}@{ip}:{port}/".format(
        user=settings.proxy_user,
        passwd=settings.proxy_pass,
        ip=proxy,
        port=settings.proxy_port,
    )
    return {
        "http": proxy_url,
        "https": proxy_url
    }


if __name__ == '__main__':
    # test proxy server IP masking
    r = make_request('https://api.ipify.org?format=json', return_soup=False)
    print(r.text)
output
Traceback (most recent call last):
File "helpers.py", line 112, in <module>
r = make_request('https://api.ipify.org?format=json', return_soup=False)
File "helpers.py", line 36, in make_request
r = requests.get(url, headers=settings.headers, proxies=proxies)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
chunked=chunked,
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 994, in _validate_conn
conn.connect()
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connection.py", line 300, in connect
conn = self._new_conn()
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/contrib/socks.py", line 99, in _new_conn
**extra_kw
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 199, in create_connection
sock.connect((remote_host, remote_port))
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 47, in wrapper
return function(*args, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 774, in connect
super(socksocket, self).settimeout(self._timeout)
File "/home/ec2-user/env/lib64/python3.7/site-packages/eventlet/greenio/base.py", line 395, in settimeout
self.setblocking(True)
What might be the problem here?
Turns out removing eventlet.monkey_patch() and import eventlet solved the problem.
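
For reference, a minimal sketch of what the top of helpers.py looks like without any eventlet patching (assuming plain requests and the standard time module are acceptable for your use case; the rest of the module stays unchanged):

import os
import random
from datetime import datetime
from urllib.parse import urlparse

# Plain, unpatched imports instead of eventlet.import_patched(...)
import requests
import time

import redis
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

import settings

# make_request, format_url, log and get_proxy below can stay exactly as
# they are; only the patched imports change.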

requests' MissingSchema exception for Invalid URL

I am trying to scrape content from a website but I am getting the error mentioned below.
The method:
def scrape_newtimes():
    """Scrapes content from the NewTimes"""
    url = 'https://www.newtimes.co.rw/'
    r = requests.get(url, headers=HEADERS)
    tree = fromstring(r.content)
    links = tree.xpath('//div[@class="x-small-push clearfix"]/a/@href')
    for link in links:
        r = requests.get(link, headers=HEADERS)
        blog_tree = fromstring(r.content)
        paras = blog_tree.xpath('//div[@class="article-content"]/p')
        para = extract_paratext(paras)
        text = extract_text(para)
        if not text:
            continue
        yield '"%s" %s' % (text, link)
The error I am getting:
>>> sc = scrape_newtimes()
>>> string_1 = next(sc)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "D:\Projects\bird\bird-env\bot.py", line 58, in scrape_newtimes
r = requests.get(link, headers=HEADERS)
File "D:\Projects\bird\venv\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "D:\Projects\bird\venv\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "D:\Projects\bird\venv\lib\site-packages\requests\sessions.py", line 519, in request
prep = self.prepare_request(req)
File "D:\Projects\bird\venv\lib\site-packages\requests\sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "D:\Projects\bird\venv\lib\site-packages\requests\models.py", line 313, in prepare
self.prepare_url(url, params)
File "D:\Projects\bird\venv\lib\site-packages\requests\models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/news/londons-kings-college-launch-civil-service-programme-rwanda': No schema supplied. Perhaps you meant http:///news/londons-kings-college-launch-civil-service-programme-rwanda?
>>>
The exception basically tells you what is wrong:
requests.exceptions.MissingSchema: Invalid URL '/news/londons-kings-college-launch-civil-service-programme-rwanda': No schema supplied. Perhaps you meant http:///news/londons-kings-college-launch-civil-service-programme-rwanda?
Or, with the line wrapped:
Invalid URL '/news/londons-kings-college-launch-civil-service-programme-rwanda':
No schema supplied. Perhaps you meant
http:///news/londons-kings-college-launch-civil-service-programme-rwanda?
Your link does not contain a complete URL: the hrefs scraped from the page are relative paths, so you need to join them with the site's base URL before requesting them.
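
A minimal sketch of one way to do that, using urllib.parse.urljoin to resolve relative hrefs against the site root (the HEADERS dict here is a placeholder; reuse your own):

from urllib.parse import urljoin

import requests
from lxml.html import fromstring

BASE_URL = 'https://www.newtimes.co.rw/'
HEADERS = {'User-Agent': 'Mozilla/5.0'}  # placeholder; use your own HEADERS

def absolute_article_links():
    """Yield absolute URLs for the article links on the front page."""
    r = requests.get(BASE_URL, headers=HEADERS)
    tree = fromstring(r.content)
    for href in tree.xpath('//div[@class="x-small-push clearfix"]/a/@href'):
        # urljoin resolves relative paths like '/news/x' against BASE_URL
        # and leaves already-absolute URLs unchanged.
        yield urljoin(BASE_URL, href)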

Python 3 script that passes each member of an array to a function

I'm trying to write a small Python 3 utility script that checks to see if a file exists on my server.
So I have the code below, which has a big array of string values that I pass to a simple function that returns the URL and the response code.
However, when I run it I get all these errors, and I don't even know where to start:
$ python ReturnPath.py
Traceback (most recent call last):
File "ReturnPath.py", line 86, in <module>
checkResponse(u)
File "ReturnPath.py", line 5, in checkResponse
code = urllib.request.urlopen(url).getcode()
File "C:\Program Files\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\Python37\lib\urllib\request.py", line 510, in open
req = Request(fullurl, data)
File "C:\Program Files\Python37\lib\urllib\request.py", line 328, in __init__
self.full_url = url
File "C:\Program Files\Python37\lib\urllib\request.py", line 354, in full_url
self._parse()
File "C:\Program Files\Python37\lib\urllib\request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '"https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg"'
Here is my code:
import urllib.request

def checkResponse(url):
    code = urllib.request.urlopen(url).getcode()
    print(url + " = " + code)
    return

arrCases = []
arrCases.extend([
    "https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/3ea92fa3-1ef0-4358-b38d-bb04e653aa53.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/7958a0e3-171b-46b5-875e-970368389bdf.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/e9a6cb00-6811-4b47-9aac-88480578dd44.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/73df88c3-b829-4519-9523-2bbe1f2c8549.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/61aa614b-5c95-487c-b4e3-783231b43677.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8be7811f-18dc-4a81-a557-8b81605e3452.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/56539acb-2b1b-4410-a4bc-ac2eb0dc00fa.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8bcf93fc-b435-4fd4-9c82-4aba78c58529.jpg",
])

for u in arrCases:
    checkResponse(u)
What am I doing wrong?
You have to catch errors from broken URLs. I also increased speed through multiprocessing.Pool.
import urllib.request
from urllib.error import HTTPError, URLError
import multiprocessing

def checkResponse(url):
    try:
        code = urllib.request.urlopen(url, timeout=1).getcode()
    except (HTTPError, URLError) as error:
        print(url, " = ", error)
    else:
        print(url, " = ", code)
    return

arrCases = []
arrCases.extend([
    "https://i.stack.imgur.com/DsNOB.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/3ea92fa3-1ef0-4358-b38d-bb04e653aa53.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/7958a0e3-171b-46b5-875e-970368389bdf.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/e9a6cb00-6811-4b47-9aac-88480578dd44.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/73df88c3-b829-4519-9523-2bbe1f2c8549.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/61aa614b-5c95-487c-b4e3-783231b43677.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8be7811f-18dc-4a81-a557-8b81605e3452.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/56539acb-2b1b-4410-a4bc-ac2eb0dc00fa.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8bcf93fc-b435-4fd4-9c82-4aba78c58529.jpg",
])

with multiprocessing.Pool(processes=4) as pool:
    pool.map(checkResponse, arrCases)
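
Separately, the ValueError in your traceback shows a URL that still has literal double quotes inside the string ('"https://..."'), which suggests the entries in your real array are quoted twice. If that is the case, a small sketch that strips stray quotes before opening the URL (an assumption based only on that traceback):

import urllib.request
from urllib.error import HTTPError, URLError

def checkResponse(url):
    # Strip surrounding whitespace and stray quote characters so urlopen
    # sees https://... rather than "https://...".
    cleaned = url.strip().strip('"').strip("'")
    try:
        code = urllib.request.urlopen(cleaned, timeout=1).getcode()
    except (HTTPError, URLError) as error:
        print(cleaned, " = ", error)
    else:
        print(cleaned, " = ", code)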
