I'm trying to scrape some URLs with BeautifulSoup. The URLs I'm scraping come from a Google Analytics API call; some of them aren't working properly, so I need to find a way to skip them.
I tried to add this:
except urllib2.HTTPError:
    continue
But I got the following syntax error:
except urllib2.HTTPError:
^
SyntaxError: invalid syntax
Here is my full code:
rawdata = []
urllist = []
sharelist = []
mystring = 'http://www.konbini.com'
def print_results(results):
    # Print data nicely for the user.
    if results:
        for row in results.get('rows'):
            rawdata.append(row[0])
    else:
        print 'No results found'

    urllist = [mystring + x for x in rawdata]

    for row in urllist:
        # query the website and return the html to the variable 'page'
        page = urllib2.urlopen(row)
        except urllib2.HTTPError:
            continue

        soup = BeautifulSoup(page, 'html.parser')

        # Take out the <div> of name and get its value
        name_box = soup.find(attrs={'class': 'nb-shares'})
        if name_box is None:
            continue
        share = name_box.text.strip()  # strip() is used to remove starting and trailing

        # save the data in tuple
        sharelist.append((row, share))

    print(sharelist)
Your except statement is not preceded by a try statement. You should use the following pattern:
try:
    page = urllib2.urlopen(row)
except urllib2.HTTPError:
    continue
Also note the indentation levels. Code executed under the try clause must be indented, as well as the except clause.
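For instance, a minimal sketch of how the loop body could be laid out, assuming the rest of the processing from your question stays the same:

for row in urllist:
    try:
        # query the website and return the html to the variable 'page'
        page = urllib2.urlopen(row)
    except urllib2.HTTPError:
        # skip this URL and move on to the next one
        continue
    # back at the loop's indentation level: this only runs if urlopen succeeded
    soup = BeautifulSoup(page, 'html.parser')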
Two errors:
1. No try statement
2. No indentation
Use this:
for row in urllist:
    # query the website and return the html to the variable 'page'
    try:
        page = urllib2.urlopen(row)
    except urllib2.HTTPError:
        continue
If you just want to catch a 404, you need to check the returned code or re-raise the error; otherwise you will catch and ignore more than just the 404:
import urllib2
from bs4 import BeautifulSoup
from urlparse import urljoin

def print_results(results):
    base = 'http://www.konbini.com'
    rawdata = []
    sharelist = []
    # Print data nicely for the user.
    if results:
        for row in results.get('rows'):
            rawdata.append(row[0])
    else:
        print 'No results found'

    # use urljoin to join to the base url
    urllist = [urljoin(base, h) for h in rawdata]

    for url in urllist:
        # query the website and return the html to the variable 'page'
        try:  # need to open with try
            page = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            if e.getcode() == 404:  # check the return code
                continue
            raise  # if other than 404, raise the error

        soup = BeautifulSoup(page, 'html.parser')

        # Take out the <div> of name and get its value
        name_box = soup.find(attrs={'class': 'nb-shares'})
        if name_box is None:
            continue
        share = name_box.text.strip()  # strip() is used to remove starting and trailing

        # save the data in tuple
        sharelist.append((url, share))

    print(sharelist)
As already mentioned by others:
1. The try statement is missing.
2. Proper indentation is missing.
You should use an IDE or editor so that you won't face such problems. Some good IDEs and editors are:
IDE - Eclipse with the PyDev plugin
Editor - Visual Studio Code
Anyway, here is the code with the try added and the indentation fixed:
rawdata = []
urllist = []
sharelist = []
mystring = 'http://www.konbini.com'
def print_results(results):
    # Print data nicely for the user.
    if results:
        for row in results.get('rows'):
            rawdata.append(row[0])
    else:
        print 'No results found'

    urllist = [mystring + x for x in rawdata]

    for row in urllist:
        # query the website and return the html to the variable 'page'
        try:
            page = urllib2.urlopen(row)
        except urllib2.HTTPError:
            continue

        soup = BeautifulSoup(page, 'html.parser')

        # Take out the <div> of name and get its value
        name_box = soup.find(attrs={'class': 'nb-shares'})
        if name_box is None:
            continue
        share = name_box.text.strip()  # strip() is used to remove starting and trailing

        # save the data in tuple
        sharelist.append((row, share))

    print(sharelist)
Your syntax error is due to the fact that your except statement is not paired with a try statement.
try:
    # code that might throw HTTPError
    page = urllib2.urlopen(row)
except urllib2.HTTPError:
    continue
Related
While web scraping with BeautifulSoup, I have to write try/except multiple times. See the code below:
try:
    addr1 = soup.find('span', {'class' : 'addr1'}).text
except:
    addr1 = ''

try:
    addr2 = soup.find('span', {'class' : 'addr2'}).text
except:
    addr2 = ''

try:
    city = soup.find('strong', {'class' : 'city'}).text
except:
    city = ''
The problem is that I have to write try/except multiple times, and that is very annoying. I want to write a function to handle the exceptions.
I tried to use the following function, but it still shows an error:
def datascraping(var):
    try:
        return var
    except:
        return None

addr1 = datascraping(soup.find('span', {'class' : 'addr1'}).text)
addr2 = datascraping(soup.find('span', {'class' : 'addr2'}).text)
Can anyone help me to solve the issue?
Use a for loop that iterates through a sequence containing your arguments. Then use a conditional statement that checks if the return value is None, prior to attempting to get the text attribute. Then store the results in a dictionary. This way there is no need to use try/except at all.
seq = [('span', 'addr1'), ('span', 'addr2'), ('strong', 'city')]
results = {}

for tag, value in seq:
    var = soup.find(tag, {'class': value})
    if var is not None:
        results[value] = var.text
    else:
        results[value] = ''
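If you still prefer a helper function, as in your attempt, note that datascraping(soup.find(...).text) fails because the .text lookup raises before the function is ever entered; the find call has to happen inside the helper. A minimal sketch (get_text is a hypothetical name):

def get_text(soup, tag, cls, default=''):
    # do the lookup inside the helper so a missing tag can be handled here
    node = soup.find(tag, {'class': cls})
    if node is None:
        return default
    return node.text.strip()

addr1 = get_text(soup, 'span', 'addr1')
addr2 = get_text(soup, 'span', 'addr2')
city = get_text(soup, 'strong', 'city')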
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import re
import csv

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# text = input('Enter Text - ')  # in case the user wants to manually put in some text to evaluate
# print('\n')
# print(len(lst))

# Take 'Content' input from a csv file
file = open("Test_1.CSV", "r", encoding='utf-8')
reader = csv.reader(file)

for line in reader:
    text = line[5]
    lst = re.findall('(http.?://[^\s]+)', text)
    if not lst: print(line[0], 'Empty List')
    else:
        try:
            for url in lst:
                try:
                    try:
                        html = urllib.request.urlopen(url, context=ctx).read()
                        # html = urllib.request.urlopen(urllib.parse.quote(url, errors='ignore'), context=ctx).read()
                        soup = BeautifulSoup(html, 'html.parser')
                        title = soup.title.string
                        str_title = str(title)
                        if 'Twitter' in str_title:
                            if len(lst) > 1: break
                            else: continue
                        else:
                            print(line[0], str_title, ',', url)
                    except UnicodeEncodeError as e:
                        # print("Incorrect URL {}".format(url.encode('ascii', errors='ignore')))
                        b_url = url.encode('ascii', errors='ignore')
                        n_url = b_url.decode("utf-8")
                        # print(n_url)
                        html = urllib.request.urlopen(n_url, context=ctx).read()
                        # html = urllib.request.urlopen(urllib.parse.quote(url, errors='ignore'), context=ctx).read()
                        soup = BeautifulSoup(html, 'html.parser')
                        title = soup.title.string
                        str_title = str(title)
                        if 'Twitter' in str_title:
                            if len(lst) > 1: break
                            else: continue
                        else:
                            print(line[0], str_title, ',', url)
                except urllib.error.URLError:
                    print('Invalid DNS Link')
        except urllib.error.HTTPError as err:
            if err.code == 404:
                print(line[0], 'Invalid Twitter Link')
The above code reads a CSV file, selects a column, and parses it with a regex to get all the hyperlinks in a single row; I then use BeautifulSoup to fetch each hyperlink and get the page's title string.
While running this code, I first encountered UnicodeEncodeError and addressed it; I then encountered urllib.error.URLError and addressed that too. Now I've run into another one:
"Traceback (most recent call last): File "C:\Users\asaxena\Desktop\py4e\Gartner\crawler_new.py", line 32, in <module> title = soup.title.string AttributeError: 'NoneType' object has no attribute 'string'".
Is there really any way for me to bypass any type of error that appears, even the unforeseen ones? I know BeautifulSoup has a tendency to throw unexpected errors, partly due to the varied kinds of content on the web.
I finally solved it by placing the entire code under a try/except block, like so:
try:
    # Put all my code here
except Exception as e:
    print ('Error Ignored')
The code will be able to handle all types of exceptions.
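For reference, a narrower variant is to wrap only the per-URL work, so one bad link is skipped instead of silently abandoning the rest of the row. A minimal, self-contained sketch (the sample URLs are placeholders):

import ssl
import urllib.request
from bs4 import BeautifulSoup

# Ignore SSL certificate errors, as in the original script
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

urls = ['https://example.com', 'http://does-not-exist.invalid/page']  # placeholder links

for url in urls:
    try:
        html = urllib.request.urlopen(url, context=ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        # soup.title can be None, so guard before reading .string
        title = soup.title.string if soup.title is not None else ''
        print(title, ',', url)
    except Exception as e:
        # any failure (HTTP error, DNS error, encoding issue) skips only this URL
        print('Error ignored for', url, ':', e)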
I'm trying to scrape some URLs with BeautifulSoup. The URLs I'm scraping come from a Google Analytics API call; some of them aren't working properly, so I need to find a way to skip them.
Here is my initial script, which works properly when I don't have any wrong URLs:
rawdata = []
urllist = []
sharelist = []
mystring = 'http://www.blablabla.com'
def print_results(results):
    # Print data nicely for the user.
    if results:
        for row in results.get('rows'):
            rawdata.append(row[0])
    else:
        print 'No results found'

    urllist = [mystring + x for x in rawdata]

    for row in urllist:
        # query the website and return the html to the variable 'page'
        page = urllib2.urlopen(row)
        soup = BeautifulSoup(page, 'html.parser')

        # Take out the <div> of name and get its value
        name_box = soup.find(attrs={'class': 'nb-shares'})
        share = name_box.text.strip()  # strip() is used to remove starting and trailing

        # save the data in tuple
        sharelist.append((row, share))

    print(sharelist)
Following an answer from Stack Overflow, I added these lines to deal with the wrong URLs:
if name_box is None:
    continue
Then I added these lines:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
at the top of my script, to deal with this error: 'ascii' codec can't encode character u'\u200b' in position 22: ordinal not in range(128)
But now my script returns an empty object.
Here is my final script :
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
{...my api call here...}
rawdata = []
urllist = []
sharelist = []
mystring = 'http://www.blablabla.com'
def print_results(results):
    # Print data nicely for the user.
    if results:
        for row in results.get('rows'):
            rawdata.append(row[0])
    else:
        print 'No results found'

    urllist = [mystring + x for x in rawdata]

    for row in urllist:
        # query the website and return the html to the variable 'page'
        page = urllib2.urlopen(row)
        soup = BeautifulSoup(page, 'html.parser')

        # Take out the <div> of name and get its value
        name_box = soup.find(attrs={'class': 'nb-shares'})
        if name_box is None:
            continue
        share = name_box.text.strip()  # strip() is used to remove starting and trailing

        # save the data in tuple
        sharelist.append((row, share))

    print(sharelist)
I'm a total noob to Python, so please forgive my mistakes and lack of vocabulary. I'm trying to scrape some URLs with BeautifulSoup. My URLs come from a GA API call, and some of them don't respond.
How do I build my script so that BeautifulSoup ignores the URLs that don't return anything?
Here is my code:
if results:
    for row in results.get('rows'):
        rawdata.append(row[0])
else:
    print 'No results found'

urllist = [mystring + x for x in rawdata]

for row in urllist[4:8]:
    page = urllib2.urlopen(row)
    soup = BeautifulSoup(page, 'html.parser')
    name_box = soup.find(attrs={'class': 'nb-shares'})
    share = name_box.text.strip()

    # save the data in tuple
    sharelist.append((row, share))

print(sharelist)
I tried to use this:
except Exception:
    pass
but I don't know where to put it, and I got a syntax error. I've looked at other questions but cannot find an answer that works for me.
You may check the value of the name_box variable; it will be None if nothing is found:
for row in urllist[4:8]:
    page = urllib2.urlopen(row)
    soup = BeautifulSoup(page, 'html.parser')
    name_box = soup.find(attrs={'class': 'nb-shares'})
    if name_box is None:
        continue
    # ...
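If some of the URLs also fail to open at all, you can combine this check with a try/except around urlopen. A sketch, assuming the same urllist and sharelist as in your code:

for row in urllist[4:8]:
    try:
        page = urllib2.urlopen(row)
    except (urllib2.HTTPError, urllib2.URLError):
        # skip URLs that return an error or don't resolve
        continue
    soup = BeautifulSoup(page, 'html.parser')
    name_box = soup.find(attrs={'class': 'nb-shares'})
    if name_box is None:
        continue
    sharelist.append((row, name_box.text.strip()))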
I am learning to build web crawlers and am currently working on getting all the URLs from a site. I have been playing around and don't have the same code as I did before, but I have been able to get all the links. My issue is the recursion: what it is doing is right for the code I have written, just not what I want. My code is below:
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def getAllUrl(url):
    page = urllib2.urlopen( url ).read()
    urlList = []
    try:
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin('http://bobthemac.com', anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin('http://bobthemac.com', anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])
        length = len(urlList)
        for url in urlList:
            getAllUrl(url)
        return urlList
    except urllib2.HTTPError, e:
        print e

if __name__ == "__main__":
    urls = getAllUrl('http://bobthemac.com')
    for x in urls:
        print x
What I am trying to achieve is to get all the URLs for a site. With the current set-up the program runs until it runs out of memory; all I want is to get the URLs from a site. Does anyone have any idea how to do this? I think I have the right idea and just need some small changes to the code.
EDIT
For those of you who are interested, below is my working code that gets all the URLs for the site; someone might find it useful. It's not the best code and does need some work, but with some work it could be quite good.
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def getAllUrl(url):
    urlList = []
    try:
        page = urllib2.urlopen( url ).read()
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin('http://bobthemac.com', anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin('http://bobthemac.com', anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])
        return urlList
    except urllib2.HTTPError, e:
        urlList.append( e )

if __name__ == "__main__":
    urls = getAllUrl('http://bobthemac.com')
    fullList = []
    for x in urls:
        listUrls = list
        listUrls = getAllUrl(x)
        try:
            for i in listUrls:
                if not i in fullList:
                    fullList.append(i)
        except TypeError, e:
            print 'Woops wrong content passed'
    for i in fullList:
        print i
I think this works:
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def getAllUrl(url):
    try:
        page = urllib2.urlopen( url ).read()
    except:
        return []
    urlList = []
    try:
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin(url, anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin(url, anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])
        length = len(urlList)
        return urlList
    except urllib2.HTTPError, e:
        print e

def listAllUrl(urls):
    for x in urls:
        print x
        urls.remove(x)
        urls_tmp = getAllUrl(x)
        for y in urls_tmp:
            urls.append(y)

if __name__ == "__main__":
    urls = ['http://bobthemac.com']
    while(urls.count>0):
        urls = getAllUrl('http://bobthemac.com')
        listAllUrl(urls)
In your function getAllUrl, you call getAllUrl again inside a for loop, which makes it recursive.
Elements are never removed from urlList once they are added, so urlList will never be empty, and therefore the recursion will never terminate.
That's why your program never ends until it runs out of memory.
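A common way to avoid both the unbounded recursion and revisiting pages is to keep a set of already-visited URLs and crawl iteratively with a queue. A minimal sketch, reusing the getAllUrl function from the answer above (which may return an empty list or None on failure):

def crawlSite(start):
    visited = set()
    queue = [start]
    while queue:
        url = queue.pop(0)
        if url in visited:
            continue
        visited.add(url)
        # getAllUrl may return None after an HTTPError, so fall back to an empty list
        for link in (getAllUrl(url) or []):
            if link not in visited:
                queue.append(link)
    return visited

if __name__ == "__main__":
    for u in crawlSite('http://bobthemac.com'):
        print u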