I've completed my first web scraper, and everything works except one thing that I can't figure out. My first scrape, with the code for x in range(1,6): getQuestions('bygg', x), works fine. But when I change the loop to call both getQuestions('bygg', x) and getQuestions('advokat', x), it returns nothing and raises TypeError: 'NoneType' object is not subscriptable. The problem seems to come from my 'nummer': item.find('a', {'class': 'link-body'})['href'], line, since the traceback says:
'nummer': item.find('a', {'class': 'link-body'})['href'],
TypeError: 'NoneType' object is not subscriptable
Here is the full code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}

questionlist = []

def getQuestions(tag, page):
    url = f'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    questions = soup.find_all('div', {'class': 'box-white p-0 mb-4'})
    for item in questions:
        question = {
            'tag': tag,
            'title': item.find('a', {'class': 'link-primary'}).text,
            'link': item.find('a', {'class': 'link-primary'})['href'],
            'nummer': item.find('a', {'class': 'link-body'})['href'],
            'address': item.find('address', {'class': 'mt-2 mb-0'}).text,
            'RegÅr': item.find('div', {'class': 'col text-center'}).text,
        }
        questionlist.append(question)
    return

for x in range(1,6):
    getQuestions('bygg', x)
    getQuestions('advokat', x)

df = pd.DataFrame(questionlist)
df.to_excel('merinfo skrapare för bygg.xlsx')
print('LBC Marketing TM')
One last note: if I comment out the 'nummer': item.find('a', {'class': 'link-body'})['href'], line, it works fine, but that is kind of the most important part.
Thankful for any help, best regards!
As @AndyKnight mentioned, you are attempting to access ['href'] on an item that is None. You could add some sanity checks for None to help out. Something like:
def get_href_item(src_item, tag, class_name):
    href_item = src_item.find(tag, {"class": f"{class_name}"})
    if href_item is not None:
        href = href_item['href']
        if href is not None:
            return href
    # fall through to a placeholder whenever the tag or its href is missing
    return "HREF_NOT_FOUND"
Then you can use that method to get the 'nummer' values:
question = {
    'tag': tag,
    'title': item.find('a', {'class': 'link-primary'}).text,
    'link': item.find('a', {'class': 'link-primary'})['href'],
    'nummer': get_href_item(item, 'a', 'link-body'),
    'address': item.find('address', {'class': 'mt-2 mb-0'}).text,
    'RegÅr': item.find('div', {'class': 'col text-center'}).text,
}
You will probably want to add similar sanity checks for None for all of the values you are searching for.
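For example, a similar helper for the .text lookups might look like this (get_text_item and the "TEXT_NOT_FOUND" placeholder are illustrative names, not anything from your original code):

def get_text_item(src_item, tag, class_name):
    # Returns the tag's text, or a placeholder string when the tag is missing.
    text_item = src_item.find(tag, {"class": class_name})
    if text_item is not None:
        return text_item.text
    return "TEXT_NOT_FOUND"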
Related
I'm trying to scrape an ecommerce store, but I'm getting AttributeError: 'NoneType' object has no attribute 'get_text'. This happens whenever I try to iterate over the products through each product link. I'm not sure whether I'm running into JavaScript, a captcha, or something else. Here's my code:
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.jumia.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}

productlinks = []

for x in range(1,51):
    r = requests.get(f'https://www.jumia.com.ng/ios-phones/?page={x}#catalog-listing/')
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('article', class_='prd _fb col c-prd')
    for product in productlist:
        for link in product.find_all('a', href=True):
            productlinks.append(baseurl + link['href'])

for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    name = soup.find('h1', class_='-fs20 -pts -pbxs').get_text(strip=True)
    amount = soup.find('span', class_='-b -ltr -tal -fs24').get_text(strip=True)
    review = soup.find('div', class_='stars _s _al').get_text(strip=True)
    rating = soup.find('a', class_='-plxs _more').get_text(strip=True)
    features = soup.find_all('li', attrs={'style': 'box-sizing: border-box; padding: 0px; margin: 0px;'})
    a = features[0].get_text(strip=True)
    b = features[1].get_text(strip=True)
    c = features[2].get_text(strip=True)
    d = features[3].get_text(strip=True)
    e = features[4].get_text(strip=True)
    f = features[5].get_text(strip=True)
    print(f"Name: {name}")
    print(f"Amount: {amount}")
    print(f"Review: {review}")
    print(f"Rating: {rating}")
    print('Key Features')
    print(f"a: {a}")
    print(f"b: {b}")
    print(f"c: {c}")
    print(f"d: {d}")
    print(f"e: {e}")
    print(f"f: {f}")
    print('')
Here's the error message:
Traceback (most recent call last):
File "c:\Users\LP\Documents\jumia\jumia.py", line 32, in <module>
name = soup.find('h1', class_='-fs20 -pts -pbxs').get_text(strip=True)
AttributeError: 'NoneType' object has no attribute 'get_text'
PS C:\Users\LP\Documents\jumia>
Change the variable baseurl to https://www.jumia.com.ng and change the features variable to features = soup.find('article', class_='col8 -pvs').find_all('li'). After fixing those two issues, you'll probably get an IndexError because not every page has six features listed. You can use something like the following code to iterate through the features and print them:
for i, feature in enumerate(features):
    print(chr(ord("a")+i) + ":", feature.get_text(strip=True))
With this for loop, you don't need the a to f variables. The chr(ord("a") + i) part gets the letter corresponding to index i. However, if there are more than 26 features, it will print punctuation characters or garbage; that is easily fixed by breaking out of the loop when i > 25. (This trick only works on ASCII systems, not EBCDIC ones.)
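For illustration, the same labelling can also be written with string.ascii_lowercase, which makes the 26-letter limit explicit because zip stops at the shorter sequence (the feature strings below are made-up placeholders, not scraped data):

import string

features = ['6.1-inch display', '128 GB storage', 'A15 chip']  # placeholder values
for letter, feature in zip(string.ascii_lowercase, features):
    print(f"{letter}: {feature}")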
Even after making these three changes, there was an AttributeError when the script tried to scrape a link to a product unrelated to iPhones, which showed up on page 5 of the results. I don't know how the script picked up that link; it was a medicinal cream. To fix that, either wrap the body of the second for loop in a try/except like the following, or put the last line of the first for loop under an if 'iphone' in link check.
for link in productlinks:
    try:
        # body of for loop goes here
    except AttributeError:
        continue
With these changes, the script would look like this:
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.jumia.com.ng'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}

productlinks = []

for x in range(1,51):
    r = requests.get(f'https://www.jumia.com.ng/ios-phones/?page={x}#catalog-listing/')
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('article', class_='prd _fb col c-prd')
    for product in productlist:
        for link in product.find_all('a', href=True):
            if 'iphone' in link['href']:
                productlinks.append(baseurl + link['href'])

for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        name = soup.find('h1', class_='-fs20 -pts -pbxs').get_text(strip=True)
        amount = soup.find('span', class_='-b -ltr -tal -fs24').get_text(strip=True)
        review = soup.find('div', class_='stars _s _al').get_text(strip=True)
        rating = soup.find('a', class_='-plxs _more').get_text(strip=True)
        features = soup.find('article', class_='col8 -pvs').find_all('li')
        print(f"Name: {name}")
        print(f"Amount: {amount}")
        print(f"Review: {review}")
        print(f"Rating: {rating}")
        print('Key Features')
        for i, feature in enumerate(features):
            if i > 25:  # we ran out of letters
                break
            print(chr(ord("a")+i) + ":", feature.get_text(strip=True))
        print('')
    except AttributeError:
        continue
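As an aside, instead of one broad try/except AttributeError around the whole block, you could check each find() result individually, similar in spirit to the get_href_item helper from the first answer above. A rough sketch (safe_text is an illustrative name, not part of the original answer):

def safe_text(soup, tag, class_name, default='N/A'):
    # Returns the tag's stripped text, or a default when the tag is missing.
    found = soup.find(tag, class_=class_name)
    return found.get_text(strip=True) if found is not None else default

name = safe_text(soup, 'h1', '-fs20 -pts -pbxs')
amount = safe_text(soup, 'span', '-b -ltr -tal -fs24')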
This might be hard to explain, so stay with me. I've created a web scraper that scrapes a specific site for information; the code looks like this:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}

questionlist = []

def getQuestions(tag, page):
    url = f'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    questions = soup.find_all('div', {'class': 'box-white p-0 mb-4'})
    for item in questions:
        question = {
            'tag': tag,
            'title': item.find('a', {'class': 'link-primary'}).text,
            'link': item.find('a', {'class': 'link-primary'})['href'],
            'nummer': item.find('a', {'class': 'link-body'})['href'],
            'address': item.find('address', {'class': 'mt-2 mb-0'}).text,
            'RegÅr': item.find('div', {'class': 'col text-center'}).text,
        }
        questionlist.append(question)
    return

for x in range(1,6):
    getQuestions('bygg', x)
    #getQuestions('advokat', x)

df = pd.DataFrame(questionlist)
df.to_excel('merinfo skrapare för bygg.xlsx')
print('LBC Marketing TM')
What I would like to do is create a simple application that can change this part:
for x in range(1,6):
    getQuestions('bygg', x)
    #getQuestions('advokat', x)
mainly the 'bygg' and range(1,6) parts. It doesn't have to be pretty, just something that works and that I can make pretty in the future. I'm not asking for a full-on tutorial (although that would be nice), just some pointers in the right direction so I know what to search for, since I am really new to all this.
Cheers!
Just use a function and pass the parts you want to change as parameters. There is no need for "self modifying" or even generated code just for this:
def get_results(field, start, stop):
    results = []
    for x in range(start, stop):
        results.append(getQuestions(field, x))
    return results

get_results('bygg', 1, 6)
get_results('advokat', 1, 10)
The values that get passed to get_results can come from a CSV file or anywhere else; you just have to write that code.
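For example, a minimal sketch of reading those parameters from a CSV file (the searches.csv name and the field,start,stop column layout are made up for illustration):

import csv

# searches.csv rows look like: bygg,1,6
with open('searches.csv', newline='') as f:
    for field, start, stop in csv.reader(f):
        get_results(field, int(start), int(stop))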
I am trying to extract the hrefs from the Premier League website; however, I cannot seem to get any of the links except those on the first page:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.premierleague.com/players/')
soup = BeautifulSoup(r.content, 'lxml')

# get the player index
table = soup.find('div', {'class': 'table playerIndex'})

# <a> is where the href is stored
href_names = [link.get('href') for link in table.findAll('a')]

football_string = 'https://www.premierleague.com'

# concatenate to get the full html link
[football_string + str(x) for x in href_names]
This only returns the first page. I have tried using Selenium, but the Premier League website shows an ad every time it loads that prevents the script from working. Any ideas on how to get all the links?
If I understood your question right, the following approach should do it:
import requests

base = 'https://www.premierleague.com/players/{}/'
link = 'https://footballapi.pulselive.com/football/players'

payload = {
    'pageSize': '30',
    'compSeasons': '418',
    'altIds': 'true',
    'page': 0,
    'type': 'player',
    'id': '-1',
    'compSeasonId': '418'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    s.headers['referer'] = 'https://www.premierleague.com/'
    s.headers['origin'] = 'https://www.premierleague.com'
    while True:
        res = s.get(link, params=payload)
        if not res.json()['content']:
            break
        for item in res.json()['content']:
            print(base.format(int(item['id'])))
        payload['page'] += 1
Results are like (truncated):
https://www.premierleague.com/players/19970/
https://www.premierleague.com/players/13279/
https://www.premierleague.com/players/13286/
https://www.premierleague.com/players/10905/
https://www.premierleague.com/players/4852/
https://www.premierleague.com/players/4328/
https://www.premierleague.com/players/90665/
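If you'd rather collect the URLs in a list (to request each player page afterwards, for example) than print them, a small variation of the loop could look like the sketch below, assuming it runs inside the same with requests.Session() as s: block (player_links is just an illustrative name):

player_links = []
payload['page'] = 0  # reset paging before re-running the loop
while True:
    res = s.get(link, params=payload)
    content = res.json()['content']
    if not content:
        break
    player_links.extend(base.format(int(item['id'])) for item in content)
    payload['page'] += 1

print(len(player_links), 'player pages found')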
I am trying to scrape multiple pages using grequests and BeautifulSoup. I am able to scrape one single page, but when I change the code to iterate over multiple pages, I get the error listed in the title.
CODE:
import grequests
from bs4 import BeautifulSoup
import pandas as pd

_city = input('Enter the name of the City and State, example format(miami-fl): ')
headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'}

def get_urls():
    urls = []
    for x in range(1,2):
        urls.append(f'https://www.apartments.com/miami-fl/{x}/')
    return urls

def get_data(urls):
    reqs = [grequests.get(link) for link in urls]
    resp = grequests.map(reqs)
    return resp

def parse(resp):
    apartments = []
    for r in resp:
        soup = BeautifulSoup(r.text, 'lxml')
        results = soup.find_all('li', {'class': 'mortar-wrapper'})
        for item in results:
            apartment = {
                'Property_name': item.find('span', {'class': 'js-placardTitle title'}).text,
                'Unit_name': item.find(''),
                'Formatted_address': item.find('div', {'class': 'property-address js-url'}).text,
                'City&State': _city,
                'Bedrooms': item.find('div', {'class': 'bed-range'}).text,
                'Price_Range': item.find('div', {'class': 'price-range'}).text,
                'Availability': item.find('div', {'class': 'availability'}).text,
                'Property_Amenities': item.find('div', {'class': 'property-amenities'}).text.strip(),
                'Phone_Number': item.find('a', {'class': 'phone-link js-phone'}).attrs['href'],
            }
            apartments.append(apartment)
    print(apartments)
    return apartments

#def output(apartments):
#    aptdf = pd.DataFrame(apartments)
#    aptdf.to_csv('apts.csv', index=False)
#    print('Saved to CSV')
#    return

if __name__ == '__main__':
    urls = get_urls()
    resp = get_data(urls)
    df = pd.DataFrame(parse(resp))
    df.to_csv('apts.csv', index=False)
    #output(apartments)
I edited the code to correct the format, but it still won't run or debug:
def parse():
    html = get_html(URL)
    if html.status_code == 200:
        phones = []
        pages_count = pages(html.text)
        for page in range(1, pages_count + 1):
            print(f'Parsing a page {page} from {pages_count}...')
            html = get_html(URL, params={'p': page})
            phones.extend(get_content(html.text))
        print(phones)
    else:
        print('Error')
Hi, I want to list items, but I get an error
File "C:/Users/User/PycharmProjects/Parser/parser.py", line 52, in <module>
parse()
File "C:/Users/User/PycharmProjects/Parser/parser.py", line 46, in parse
phones.extend(get_content(html.text))
TypeError: 'NoneType' object is not iterable
This is all the code:
import requests
from bs4 import BeautifulSoup

URL = 'https://comfy.ua/smartfon/'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
           'accept': '*/*'}

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('li', class_='pager__number')
    if pagination:
        return int(pagination[-2].get_text())
    else:
        return 1

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="product-item__i")
    phone = []
    for item in items:
        phone.append({
            'title': item.find('p', class_="product-item__name").get_text(strip=True),
            'link': item.find('a', class_="product-item__name-link js-gtm-product-title").get('href'),
            'price': item.find('div', class_="price-box__content-i").get_text(strip=True).replace(u'\xa0', u' ')
        })
    print(phone)

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        phones = []
        pages_count = pages(html.text)
        for page in range(1, pages_count + 1):
            print(f'Parsing a page {page} from {pages_count}...')
            html = get_html(URL, params={'p': page})
            phones.extend(get_content(html.text))
        print(phones)
    else:
        print('Error')

parse()
I get an empty list but should get the phones. I also get an error:
phones.extend(get_content(html.text))
TypeError: 'NoneType' object is not iterable
This error is telling you that you're trying to iterate over None. Since extend() takes an iterable, this is therefore telling you that get_content() is returning None. This often happens when a function returns nothing at all: no return statement is equivalent to return None in Python.
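A minimal illustration of that behaviour:

def no_return():
    x = 1 + 1  # does some work, but never returns anything

print(no_return())  # prints None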
Sure enough, your code for get_content() doesn't have a return statement. You need to add it:
def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="product-item__i")
    phone = []
    for item in items:
        phone.append({
            'title': item.find('p', class_="product-item__name").get_text(strip=True),
            'link': item.find('a', class_="product-item__name-link js-gtm-product-title").get('href'),
            'price': item.find('div', class_="price-box__content-i").get_text(strip=True).replace(u'\xa0', u' ')
        })
    print(phone)
    return phone  # <--- add this