I am web scraping eBay for an item's information. The listings are not very consistent with some of the info I need, so I am using a try/except statement to let the code continue when an IndexError is raised, but for some reason the except clause is not being triggered when the condition is met. Any ideas why this is happening and how to fix it? I have debugged the code but can't find the issue. Thanks
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0'}
my_url = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1311&_nkw=sm-r800&_sacat=0&LH_TitleDesc=0' \
         '&_osacat=0&_odkw=samsung+watch '

def get_data(url):
    r = requests.get(url, headers=headers)
    soup = bs(r.content, features='html.parser')
    return soup
def parse_data(soup):
    product_list = []
    results = soup.find_all('div', {'class': 's-item__info clearfix'})
    for item in results:
        try:
            products = {'Title': item.find_all('a', {'class': 's-item__link'})[0].h3.text,
                        'Price': float(item.find('span', {'class': 's-item__price'}).text[1:]),
                        'Product Rating': float(item.find('div', {'class': 's-item__reviews'}).a.div.find('span', {'class': 'clipped'}).text.strip(' ')[0]),
                        'Watchers': float(item.find('div', {'class': 's-item__details clearfix'}).find('span', {'class': 's-item__hotness s-item__itemHotness'}).text.split(' ')[0])
                        }
            product_list.append(products)
        except IndexError:
            continue
        return product_list
def output(product_list):
    df = pd.DataFrame(product_list)
    df.to_csv('Samsung Watch Data.csv', index=False)
    print('Saved to CSV')
    return

my_soup = get_data(my_url)
data = parse_data(my_soup)
output(data)
The problem is your return statement, which causes your loop to end early, because return finishes the function's execution. To make this easier to see:
def f():
    for i in range(3):
        print(i)
        return

f()
0
# Nothing else happens here
To get all of the numbers, I need the return to be at the end of the loop:
def f():
    for i in range(3):
        print(i)
    return

f()
0
1
2
# Now I get all of the numbers
So move your return to the end of your loop, and unindent it:
def parse_data(soup):
    product_list = []
    results = soup.find_all('div', {'class': 's-item__info clearfix'})
    for item in results:
        try:
            products = {'Title': item.find_all('a', {'class': 's-item__link'})[0].h3.text,
                        'Price': float(item.find('span', {'class': 's-item__price'}).text[1:]),
                        'Product Rating': float(item.find('div', {'class': 's-item__reviews'}).a.div.find('span', {'class': 'clipped'}).text.strip(' ')[0]),
                        'Watchers': float(item.find('div', {'class': 's-item__details clearfix'}).find('span', {'class': 's-item__hotness s-item__itemHotness'}).text.split(' ')[0])
                        }
            product_list.append(products)
        except IndexError:
            continue
    return product_list  # <---- Here
To ignore any exceptions
Instead of except IndexError use except Exception. This will catch any kind of exception your code might throw, though I'd definitely print what kind of error occurred. Catching specific errors is usually better practice:
try:
    ...  # some code
except Exception as e:
    print(f"Caught an exception: {e}")
    continue
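For this particular loop, the specific errors you're likely to see are IndexError (from the [0] index), AttributeError (when a find() call returns None), and ValueError (when float() gets text it can't parse). Here is a sketch of the same loop catching those explicitly; the exception list is my guess at what this markup can raise:

for item in results:
    try:
        products = {'Title': item.find_all('a', {'class': 's-item__link'})[0].h3.text}
        product_list.append(products)
    except (IndexError, AttributeError, ValueError) as e:
        print(f'Skipping item: {e!r}')
        continue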
Following this tutorial to create an eBay price tracker with Python, I am encountering an AttributeError: 'NoneType' object has no attribute 'text' when trying to get the title of a product from a search results page on eBay.
The class is the right one, as you can see here:
'title': item.find('h3', {'class': 's-item__title s-item__title--has-tags'}).text,
Any idea why I am getting this error and how to bypass it?
Here is the entire code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

searchterm = 'screen'

def get_data(searchterm):
    url = f'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw={searchterm}&_sacat=0&LH_PrefLoc=1&LH_Auction=1&rt=nc&LH_Sold=1&LH_Complete=1'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def parse(soup):
    productslist = []
    results = soup.find_all('div', {'class': 's-item__info clearfix'})
    for item in results:
        product = {
            'title': item.find('h3', {'class': 's-item__title s-item__title--has-tags'}).text,
            'soldprice': float(item.find('span', {'class': 's-item__price'}).text.replace('$','').replace(',','').strip()),
            'solddate': item.find('span', {'class': 's-item__title--tagblock__COMPLETED'}).find('span',{'class': 'POSITIVE'}.text),
            'bids': item.find('span', {'class': 's-item__bids'}).text,
            'link': item.find('a', {'class': 's-item__link'})['href'],
        }
        productslist.append(product)
    return productslist

def output(productslist, searchterm):
    productsdf = pd.DataFrame(productslist)
    productsdf.to_csv(searchterm + 'ebaytrackeroutput.csv', index=False)
    print('Saved to CSV')
    return

soup = get_data(searchterm)
productslist = parse(soup)
output(productslist, searchterm)
Thank you for your help!
Some items have no title or soldprice; find returns None for them, and that is where your error comes from, so you need to skip those items.
Another thing: the line item.find('span', {'class': 's-item__title--tagblock__COMPLETED'}) is always returning None, so you need to check why.
To learn: How to debug small programs
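A quick way to see why a selector returns None is to print what find() actually matches on the first few items. This is an illustrative snippet, not part of the fix below:

for item in results[:3]:
    tag = item.find('span', {'class': 's-item__title--tagblock__COMPLETED'})
    print(tag)  # None means this class doesn't occur in the page's markup
    # print(item.prettify())  # uncomment to inspect the item's raw HTML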
import requests
from bs4 import BeautifulSoup
import pandas as pd

searchterm = 'screen'

def get_data(searchterm):
    url = f'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw={searchterm}&_sacat=0&LH_PrefLoc=1&LH_Auction=1&rt=nc&LH_Sold=1&LH_Complete=1'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def parse(soup):
    productslist = []
    results = soup.find_all('div', {'class': 's-item__info clearfix'})
    for item in results:
        title = item.find('h3', {'class': 's-item__title s-item__title--has-tags'})
        soldprice = item.find('span', {'class': 's-item__price'})
        if title is None or soldprice is None:  # if these are None, just skip the item
            continue
        # solddate is always returning None; you need to check why
        product = {
            'title': title.text,
            'soldprice': float(soldprice.text.replace('$','').replace(',','').strip()),
            # 'solddate': item.find('span', {'class': 's-item__title--tagblock__COMPLETED'}).find('span',{'class': 'POSITIVE'}.text),
            'bids': item.find('span', {'class': 's-item__bids'}).text,
            'link': item.find('a', {'class': 's-item__link'})['href'],
        }
        productslist.append(product)
    return productslist

def output(productslist, searchterm):
    productsdf = pd.DataFrame(productslist)
    productsdf.to_csv(searchterm + 'ebaytrackeroutput.csv', index=False)
    print('Saved to CSV')
    return

soup = get_data(searchterm)
productslist = parse(soup)
output(productslist, searchterm)
So I've completed my first web scraper, and everything is working except one thing I can't figure out. My first scrape, for x in range(1,6): getQuestions('bygg', x), works fine, but when I add a second call, getQuestions('advokat', x), it just fails with TypeError: 'NoneType' object is not subscriptable. The problem seems to come from 'nummer': item.find('a', {'class': 'link-body'})['href'], since the traceback points at that line.
Here is the full code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}
questionlist = []

def getQuestions(tag, page):
    url = f'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    questions = soup.find_all('div', {'class': 'box-white p-0 mb-4'})
    for item in questions:
        question = {
            'tag': tag,
            'title': item.find('a', {'class': 'link-primary'}).text,
            'link': item.find('a', {'class': 'link-primary'})['href'],
            'nummer': item.find('a', {'class': 'link-body'})['href'],
            'address': item.find('address', {'class': 'mt-2 mb-0'}).text,
            'RegÅr': item.find('div', {'class': 'col text-center'}).text,
        }
        questionlist.append(question)
    return

for x in range(1,6):
    getQuestions('bygg', x)
    getQuestions('advokat', x)

df = pd.DataFrame(questionlist)
df.to_excel('merinfo skrapare för bygg.xlsx')
print('LBC Marketing TM')
Last note: if I comment out the 'nummer': item.find('a', {'class': 'link-body'})['href'] line, it works fine, but that is kind of the most important part.
Thankful for any help, best regards!
As @AndyKnight mentioned, you are attempting to access ['href'] on an item that is None. You could add some sanity checks for None to help out. Something like:
def get_href_item(src_item, tag, class_name):
    href_item = src_item.find(tag, {"class": class_name})
    if href_item is not None:
        href = href_item.get('href')
        if href is not None:
            return href
    return "HREF_NOT_FOUND"
Then you can use that method to get the 'nummer' values:
question = {
    'tag': tag,
    'title': item.find('a', {'class': 'link-primary'}).text,
    'link': item.find('a', {'class': 'link-primary'})['href'],
    'nummer': get_href_item(item, 'a', 'link-body'),
    'address': item.find('address', {'class': 'mt-2 mb-0'}).text,
    'RegÅr': item.find('div', {'class': 'col text-center'}).text,
}
You will probably want to add similar sanity checks for None for all of the values you are searching for.
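If you'd rather not repeat that pattern for every field, a more general helper could cover text and attribute lookups alike. This is only a sketch; safe_find and its defaults are my own naming, not from the code above:

def safe_find(src_item, tag, class_name, attr=None, default=None):
    # Return the matched tag's text, a named attribute's value,
    # or `default` when the element (or the attribute) is missing.
    found = src_item.find(tag, {'class': class_name})
    if found is None:
        return default
    if attr is not None:
        return found.get(attr, default)
    return found.text

With that, 'nummer': safe_find(item, 'a', 'link-body', attr='href', default='HREF_NOT_FOUND') and 'title': safe_find(item, 'a', 'link-primary', default='') follow the same pattern.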
I am trying to scrape multiple pages using grequests and BeautifulSoup. I am able to scrape a single page, but when I change the code to iterate over multiple pages I get the error listed in the title.
CODE:
import grequests
from bs4 import BeautifulSoup
import pandas as pd

_city = input('Enter the name of the City and State, example format(miami-fl): ')
headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'}

def get_urls():
    urls = []
    for x in range(1,2):
        urls.append(f'https://www.apartments.com/miami-fl/{x}/')
    return urls

def get_data(urls):
    reqs = [grequests.get(link) for link in urls]
    resp = grequests.map(reqs)
    return resp

def parse(resp):
    apartments = []
    for r in resp:
        soup = BeautifulSoup(r.text, 'lxml')
        results = soup.find_all('li', {'class': 'mortar-wrapper'})
        for item in results:
            apartment = {
                'Property_name': item.find('span', {'class': 'js-placardTitle title'}).text,
                'Unit_name': item.find(''),
                'Formatted_address': item.find('div', {'class': 'property-address js-url'}).text,
                'City&State': _city,
                'Bedrooms': item.find('div', {'class': 'bed-range'}).text,
                'Price_Range': item.find('div', {'class': 'price-range'}).text,
                'Availability': item.find('div', {'class': 'availability'}).text,
                'Property_Amenities': item.find('div', {'class': 'property-amenities'}).text.strip(),
                'Phone_Number': item.find('a', {'class': 'phone-link js-phone'}).attrs['href'],
            }
            apartments.append(apartment)
    print(apartments)
    return apartments

# def output(apartments):
#     aptdf = pd.DataFrame(apartments)
#     aptdf.to_csv('apts.csv', index=False)
#     print('Saved to CSV')
#     return

if __name__ == '__main__':
    urls = get_urls()
    resp = get_data(urls)
    df = pd.DataFrame(parse(resp))
    df.to_csv('apts.csv', index=False)
    # output(apartments)
Edited the code to correct the format, but it still won't run or debug.
I have been trying to improve my knowledge of Python, and I think the code is pretty straightforward. However, I dislike the coding style a bit: I use too many try/excepts in contexts where they might not be needed in the first place, and I want to avoid silenced exceptions.
My goal is basically to have a ready payload before scraping, as you will see at the top of the code. Those keys should always be declared before scraping. What I'm trying to do is scrape those different pieces of data; if we don't find the data, it should be skipped or the value set to [], None or False (depending on what we are trying to do).
I have read a bit about the getattr and isinstance functions, but I'm not sure if there might be a better way than using lots of try/excepts as a cover for when an element isn't found on the webpage.
import requests
from bs4 import BeautifulSoup

payload = {
    "name": "Untitled",
    "view": None,
    "image": None,
    "hyperlinks": []
}

site_url = "https://stackoverflow.com/questions/743806/how-to-split-a-string-into-a-list"
response = requests.get(site_url)
bs4 = BeautifulSoup(response.text, "html.parser")

try:
    payload['name'] = "{} {}".format(
        bs4.find('meta', {'property': 'og:site_name'})["content"],
        bs4.find('meta', {'name': 'twitter:domain'})["content"]
    )
except Exception:  # noqa
    pass

try:
    payload['view'] = "{} in total".format(
        bs4.find('div', {'class': 'grid--cell ws-nowrap mb8'}).text.strip().replace("\r\n", "").replace("    ", ""))
except Exception:
    pass

try:
    payload['image'] = bs4.find('meta', {'itemprop': 'image primaryImageOfPage'})["content"]
except Exception:
    pass

try:
    payload['hyperlinks'] = [hyperlinks['href'] for hyperlinks in bs4.find_all('a', {'class': 'question-hyperlink'})]
except Exception:  # noqa
    pass

print(payload)
EDIT:
An example of getting an incorrect value is to change any of the bs4 find calls to something that doesn't exist:
site_url = "https://stackoverflow.com/questions/743806/how-to-split-a-string-into-a-list"
response = requests.get(site_url)
bs4 = BeautifulSoup(response.text, "html.parser")
print(bs4.find('meta', {'property': 'og:site_name'})["content"]) # Should be found
print(bs4.find('meta', {'property': 'og:site_name_TEST'})["content"]) # Should give us an error due to not found
From the documentation, find returns None when it doesn't find anything, while find_all returns an empty list []. You can check that the results are not None before trying to index them.
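A minimal demonstration of that difference, on a throwaway document:

from bs4 import BeautifulSoup

demo = BeautifulSoup('<p>hello</p>', 'html.parser')
print(demo.find('meta'))      # None -> demo.find('meta')["content"] raises TypeError
print(demo.find_all('meta'))  # [] -> looping over it simply does nothing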
import requests
from bs4 import BeautifulSoup

payload = {
    "name": "Untitled",
    "view": None,
    "image": None,
    "hyperlinks": []
}

site_url = "https://stackoverflow.com/questions/743806/how-to-split-a-string-into-a-list"
response = requests.get(site_url)
bs4 = BeautifulSoup(response.text, "html.parser")

try:
    prop = bs4.find('meta', {'property': 'og:site_name'})
    name = bs4.find('meta', {'name': 'twitter:domain'})
    if prop is not None and name is not None:
        payload['name'] = "{} {}".format(prop["content"], name["content"])

    div = bs4.find('div', {'class': 'grid--cell ws-nowrap mb8'})
    if div is not None:
        payload['view'] = "{} in total".format(div.text.strip().replace("\r\n", "").replace("    ", ""))

    itemprop = bs4.find('meta', {'itemprop': 'image primaryImageOfPage'})
    if itemprop is not None:
        payload['image'] = itemprop["content"]

    payload['hyperlinks'] = [hyperlinks['href'] for hyperlinks in bs4.find_all('a', {'class': 'question-hyperlink'})]
except Exception:  # noqa
    pass

print(payload)
So you can use one try/except. If you want to handle exceptions differently you can have different except blocks for them.
try:
    ...
except ValueError:
    value_error_handler()
except TypeError:
    type_error_handler()
except Exception:
    catch_all()
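Applied to this scraper, the two failure modes even map to different exceptions. A sketch, using the og:site_name lookup from the code above:

try:
    payload['name'] = bs4.find('meta', {'property': 'og:site_name'})["content"]
except TypeError:
    # find() returned None, and None["content"] is not subscriptable
    pass
except KeyError:
    # the <meta> tag exists but has no "content" attribute
    pass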
def parse():
    html = get_html(URL)
    if html.status_code == 200:
        phones = []
        pages_count = pages(html.text)
        for page in range(1, pages_count + 1):
            print(f'Parsing a page {page} from {pages_count}...')
            html = get_html(URL, params={'p': page})
            phones.extend(get_content(html.text))
        print(phones)
    else:
        print('Error')
Hi, I want to list items, but I get an error:

File "C:/Users/User/PycharmProjects/Parser/parser.py", line 52, in <module>
    parse()
File "C:/Users/User/PycharmProjects/Parser/parser.py", line 46, in parse
    phones.extend(get_content(html.text))
TypeError: 'NoneType' object is not iterable
This is all the code:
import requests
from bs4 import BeautifulSoup

URL = 'https://comfy.ua/smartfon/'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
           'accept': '*/*'}

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('li', class_='pager__number')
    if pagination:
        return int(pagination[-2].get_text())
    else:
        return 1

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="product-item__i")
    phone = []
    for item in items:
        phone.append({
            'title': item.find('p', class_="product-item__name").get_text(strip=True),
            'link': item.find('a', class_="product-item__name-link js-gtm-product-title").get('href'),
            'price': item.find('div', class_="price-box__content-i").get_text(strip=True).replace(u'\xa0', u' ')
        })
    print(phone)

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        phones = []
        pages_count = pages(html.text)
        for page in range(1, pages_count + 1):
            print(f'Parsing a page {page} from {pages_count}...')
            html = get_html(URL, params={'p': page})
            phones.extend(get_content(html.text))
        print(phones)
    else:
        print('Error')

parse()
I get an empty list, but I should get the phones. I also get an error:

phones.extend(get_content(html.text))
TypeError: 'NoneType' object is not iterable
This error is telling you that you're trying to iterate over None. Since extend() takes an iterable, this is therefore telling you that get_content() is returning None. This often happens when a function returns nothing at all: no return statement is equivalent to return None in Python.
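You can see this with a trivial function:

def no_return():
    pass

print(no_return())  # prints None: falling off the end is the same as `return None`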
Sure enough, your code for get_content() doesn't have a return statement. You need to add it:
def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="product-item__i")
    phone = []
    for item in items:
        phone.append({
            'title': item.find('p', class_="product-item__name").get_text(strip=True),
            'link': item.find('a', class_="product-item__name-link js-gtm-product-title").get('href'),
            'price': item.find('div', class_="price-box__content-i").get_text(strip=True).replace(u'\xa0', u' ')
        })
    print(phone)
    return phone  # <--- add this