The battle to finish my first scraping script continues. I think I'm almost done, but I've hit a new roadblock.
The problem is that when I reach the last pagination page I get this error:
Traceback (most recent call last):
File "C:/Users/Andre/Desktop/scripts python/scrape_learn/ttc_quase.py", line 50, in <module>
url_tag = soup.find('li', {"id": "next-page-link"}).find('a')
AttributeError: 'NoneType' object has no attribute 'find'
I think the error is related to the way I'm finding url_tag, but I don't see any other way to grab the "next page" link. I tried using try/except, but when I apply it I only get the listings from the first page.
So I'm not sure what my next step should be. Any help would be appreciated.
My full code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://timetochoose.co.ao/?ct_keyword&ct_ct_status&ct_property_type&ct_beds&search-listings=true&ct_country=portugal&ct_state&ct_city&ct_price_to&ct_mls&lat&lng"
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

anuncios_ttc = {}
anuncios_nr = 0

while True:
    response = requests.get(url, headers=headers)
    print(response)
    data = response.text
    print(data)
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    anuncios = soup.find_all("div", {"class": "grid-listing-info"})
    for anuncios in anuncios:
        titles = anuncios.find("a", {"class": "listing-link"}).text
        location = anuncios.find("p", {"class": "location muted marB0"}).text
        link = anuncios.find("a", {"class": "listing-link"}).get("href")
        anuncios_response = requests.get(link, headers=headers)
        anuncios_data = anuncios_response.text
        anuncios_soup = BeautifulSoup(anuncios_data, 'html.parser')
        conteudo = anuncios_soup.find("div", {"id": "listing-content"}).text
        preco = anuncios_soup.find("span", {"class": "listing-price"})
        preco_imo = preco.text if preco else "N/A"
        quartos = anuncios_soup.find("li", {"class": "row beds"})
        nr_quartos = quartos.text if quartos else "N/A"
        wcs = anuncios_soup.find("li", {"class": "row baths"})
        nr_wcs = wcs.text if wcs else "N/A"
        tipo = anuncios_soup.find("li", {"class": "row property-type"})
        tipo_imo = tipo.text if tipo else "N/A"
        bairro = anuncios_soup.find("li", {"class": "row community"})
        bairro1 = bairro.text if bairro else "N/A"
        ref = anuncios_soup.find("li", {"class": "row propid"}).text
        anuncios_nr += 1
        anuncios_ttc[anuncios_nr] = [titles, location, bairro1, preco_imo, tipo_imo, nr_quartos, nr_wcs, conteudo, ref, link]
        print("Título", titles, "\nLocalização", location, "\nPreço", preco_imo, "\nLink", link, "\nReferencia", ref, "\nTipo", tipo_imo, "\nQuartos", nr_quartos, "\nWC", nr_wcs, "\nBairro", bairro1, "\nConteudo", conteudo)
    url_tag = soup.find('li', {"id": "next-page-link"}).find('a')
    if url_tag.get('href'):
        url = url_tag.get('href')
        print(url)
    else:
        break

print("Nr Total de Anuncios: ", anuncios_nr)
anuncios_ttc_df = pd.DataFrame.from_dict(anuncios_ttc, orient='index', columns=['Titulo', 'Localização', 'Bairro', 'Preço', 'Tipo', 'Quartos', 'WCs', 'Descrição', 'Referência', 'Ligação'])
anuncios_ttc_df.head()
anuncios_ttc_df.to_csv('ttc_python.csv')
The answer to this question ended up being provided in another thread, where I was trying to identify the url_tag element more precisely.
With the help of Andrej Kesely I was able to solve the problem with:
url_tag = soup.find('li', {"id": "next-page-link"})
if not url_tag:
    break
url = url_tag.find('a')['href']
Now the script runs through to the end and generates the CSV file as intended.
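For completeness, here is a minimal sketch (assuming the same element names as above, untested against the live site) of how that guard fits at the end of the pagination loop:

import requests
from bs4 import BeautifulSoup

url = "https://timetochoose.co.ao/?search-listings=true"   # hypothetical starting URL
headers = {'User-Agent': 'Mozilla/5.0'}

while True:
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    # ... scrape the listings on the current page here ...
    url_tag = soup.find('li', {"id": "next-page-link"})
    if not url_tag:                      # last page: the "next" element is missing entirely
        break
    url = url_tag.find('a')['href']      # otherwise follow the pagination link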
Related
I'm trying to scrape an ecommerce store but I'm getting AttributeError: 'NoneType' object has no attribute 'get_text'. This happens whenever I try to iterate over the products through their product links. I'm not sure whether I'm running into JavaScript rendering, a captcha, or something else. Here's my code:
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.jumia.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}

productlinks = []
for x in range(1, 51):
    r = requests.get(f'https://www.jumia.com.ng/ios-phones/?page={x}#catalog-listing/')
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('article', class_='prd _fb col c-prd')
    for product in productlist:
        for link in product.find_all('a', href=True):
            productlinks.append(baseurl + link['href'])

for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    name = soup.find('h1', class_='-fs20 -pts -pbxs').get_text(strip=True)
    amount = soup.find('span', class_='-b -ltr -tal -fs24').get_text(strip=True)
    review = soup.find('div', class_='stars _s _al').get_text(strip=True)
    rating = soup.find('a', class_='-plxs _more').get_text(strip=True)
    features = soup.find_all('li', attrs={'style': 'box-sizing: border-box; padding: 0px; margin: 0px;'})
    a = features[0].get_text(strip=True)
    b = features[1].get_text(strip=True)
    c = features[2].get_text(strip=True)
    d = features[3].get_text(strip=True)
    e = features[4].get_text(strip=True)
    f = features[5].get_text(strip=True)
    print(f"Name: {name}")
    print(f"Amount: {amount}")
    print(f"Review: {review}")
    print(f"Rating: {rating}")
    print('Key Features')
    print(f"a: {a}")
    print(f"b: {b}")
    print(f"c: {c}")
    print(f"d: {d}")
    print(f"e: {e}")
    print(f"f: {f}")
    print('')
Here's the error message:
Traceback (most recent call last):
File "c:\Users\LP\Documents\jumia\jumia.py", line 32, in <module>
name = soup.find('h1', class_='-fs20 -pts -pbxs').get_text(strip=True)
AttributeError: 'NoneType' object has no attribute 'get_text'
PS C:\Users\LP\Documents\jumia>
Change the variable baseurl to https://www.jumia.com.ng and change the features variable to features = soup.find('article', class_='col8 -pvs').find_all('li'). After fixing those two issues, you'll probably get an IndexError because not every page has six features listed. You can use something like the following code to iterate through the features and print them:
for i, feature in enumerate(features):
    print(chr(ord("a")+i) + ":", feature.get_text(strip=True))
With this for loop, you don't need the a to f variables. The chr(ord("a")+i) part gets the letter corresponding to index i. However, if there are more than 26 features this will print punctuation characters or garbage. This can be trivially fixed by breaking out of the loop when i > 25. This trick won't work on EBCDIC systems, only ASCII ones.
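As a minimal standalone illustration of the letter-indexing trick (the feature list here is made up):

features = ["6.1-inch display", "128 GB storage", "12 MP camera"]   # hypothetical data

for i, feature in enumerate(features):
    if i > 25:      # past 'z'; stop labelling to avoid printing non-letter characters
        break
    print(chr(ord("a") + i) + ":", feature)
# a: 6.1-inch display
# b: 128 GB storage
# c: 12 MP camera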
Even after making these three changes, there was an AttributeError when it tried to scrape a link to a product unrelated to iPhones, which showed up on page 5 of the results. I don't know how the script got that link; it was a medicinal cream. To fix that, either wrap the body of the second for loop in a try/except like the following, or put the last line of the first for loop under an if 'iphone' in link check.
for link in productlinks:
    try:
        # body of for loop goes here
    except AttributeError:
        continue
With these changes, the script would look like this:
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.jumia.com.ng'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}

productlinks = []
for x in range(1, 51):
    r = requests.get(f'https://www.jumia.com.ng/ios-phones/?page={x}#catalog-listing/')
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('article', class_='prd _fb col c-prd')
    for product in productlist:
        for link in product.find_all('a', href=True):
            if 'iphone' in link['href']:
                productlinks.append(baseurl + link['href'])

for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        name = soup.find('h1', class_='-fs20 -pts -pbxs').get_text(strip=True)
        amount = soup.find('span', class_='-b -ltr -tal -fs24').get_text(strip=True)
        review = soup.find('div', class_='stars _s _al').get_text(strip=True)
        rating = soup.find('a', class_='-plxs _more').get_text(strip=True)
        features = soup.find('article', class_='col8 -pvs').find_all('li')
        print(f"Name: {name}")
        print(f"Amount: {amount}")
        print(f"Review: {review}")
        print(f"Rating: {rating}")
        print('Key Features')
        for i, feature in enumerate(features):
            if i > 25:  # we ran out of letters
                break
            print(chr(ord("a")+i) + ":", feature.get_text(strip=True))
        print('')
    except AttributeError:
        continue
For some days I have been trying to crawl all vessel data from vesselfinder, together with each vessel's description page: from the description page I want information like vessel type, IMO number, etc. in table form. I have tried different approaches but still get a lot of errors. First I worked out how to follow the links to the description pages, how to collect those links from all listing pages, and how to get specific table data from a description page (still not complete, but I get some of it).
But today, when I combined the code and tried to get the data from all links and their description pages at the same time, I got a lot of errors that left me confused.
I attached my code; it is not great, but it works up to the point #print(len(vessellist)) and after that... errors.
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'user-agent': 'Mozilla/5.0',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
baseurl = 'https://www.vesselfinder.com/vessels'

vessellist = []
for x in range(1, 6):
    response = requests.get(
        f'https://www.vesselfinder.com/vessels?page={x}',
        headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    contents = soup.find_all('td', class_='v2')
    for property in contents:
        for item in property.find_all('a', href=True):
            vessellist.append(baseurl + item['href'])

for link in vessellist:
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='tparams')
    head = []
    for i in table.find_all('td', class_='n3'):
        title = i.text
        head.append(title)
    values = []
    for row in table.find_all('td', class_='v3'):
        data = row.text
        values.append(data)
    df = pd.DataFrame(values)
    print(df)
Two steps: first get the summary data (which includes the href), then get the detailed data. These two steps are implemented in two functions. Here I fetch only the first 10 pages; 200 are available.
import requests as rq
from bs4 import BeautifulSoup as bs
from requests.api import head

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}

def getSummaryData():
    data = []
    url = "https://www.vesselfinder.com/vessels"
    for page in range(1, 10+1, 1):  # only 200 first pages autorized ?
        print("Page : %d/10" % page)
        resp = rq.get(url + "?page=%s" % page, headers=headers)
        soup = bs(resp.content, "lxml")
        section = soup.find_all('section', {'class', 'listing'})[0]
        tbody = section.find_all('tbody')[0]
        trs = tbody.find_all('tr')
        for tr in trs:
            tds = tr.find_all('td')
            # column 1 data
            sub = tds[1].find('a')
            href = sub['href']
            divs = sub.find_all('div')
            country = divs[0]['title']
            sub_divs = divs[1].find_all('div')
            vessel_name = sub_divs[0].text
            vessel_type = sub_divs[1].text
            # column 2 data
            build_year = tds[2].text
            # column 3 data
            gt = tds[3].text
            # column 4 data
            dwt = tds[4].text
            # column 5 data
            size = tds[5].text
            # save data
            tr_data = {'country': country,
                       'vessel_name': vessel_name,
                       'vessel_type': vessel_type,
                       'build_year': build_year,
                       'gt': gt,
                       'dwt': dwt,
                       'size': size,
                       'href': href}
            data.append(tr_data)
    return data

def getDetailledData(data):
    for (iel, el) in enumerate(data):
        print("%d/%d" % (iel+1, len(data)))
        url = "https://www.vesselfinder.com" + el['href']
        # make get call
        resp = rq.get(url, headers=headers)
        soup = bs(resp.content, "lxml")
        # position and voyage data
        table = soup.find_all('table', {'class', 'aparams'})[0]
        trs = table.find_all('tr')
        labels = ["course_speed", "current_draught", "navigation_status",
                  "position_received", "IMO_MMSI", "callsign", "flag", "length_beam"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        # vessel particulars
        table = soup.find_all('table', {'class', 'tparams'})[0]
        trs = table.find_all('tr')
        labels = ["IMO_number", "vessel_name", "ship_type", "flag",
                  "homeport", "gross_tonnage", "summer_deadweight_t",
                  "length_overall_m", "beam_m", "draught_m", "year_of_built",
                  "builder", "place_of_built", "yard", "TEU", "crude", "grain",
                  "bale", "classification_society", "registered_owner", "manager"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        #break
    return data
Call these functions:
data = getSummaryData() # href include
data = getDetailledData(data)
Don't rely on the 'class' attribute to target the data. Generally, you should go through table -> tbody and then take the trs and tds by position, to be sure you have the correct ones.
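As an illustration, here is a hedged sketch of that positional approach on the vessels listing page. It assumes the listing sits in the first section element with a single tbody and at least six columns per row, which should be verified against the live HTML:

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get("https://www.vesselfinder.com/vessels", headers=headers)
soup = BeautifulSoup(resp.content, "html.parser")

# Navigate structurally (section -> tbody -> tr -> td) instead of matching CSS classes.
section = soup.find("section")                 # assumed: the first <section> holds the listing
tbody = section.find("tbody")
for tr in tbody.find_all("tr"):
    tds = tr.find_all("td")
    if len(tds) < 6:                           # skip rows without the expected columns
        continue
    name_cell = tds[1].get_text(" ", strip=True)   # column 1: flag / name / type
    built, gt, dwt, size = (td.get_text(strip=True) for td in tds[2:6])
    print(name_cell, built, gt, dwt, size)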
I am trying to scrape multiple pages using grequests and BeautifulSoup. I am able to scrape a single page, but when I change the code to iterate over multiple pages I get the error listed in the title.
CODE:
import grequests  # used by get_data() below
from bs4 import BeautifulSoup
import pandas as pd

_city = input('Enter the name of the City and State, example format(miami-fl): ')
headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'}

def get_urls():
    urls = []
    for x in range(1, 2):
        urls.append(f'https://www.apartments.com/miami-fl/{x}/')
    return urls

def get_data(urls):
    reqs = [grequests.get(link) for link in urls]
    resp = grequests.map(reqs)
    return resp

def parse(resp):
    apartments = []
    for r in resp:
        soup = BeautifulSoup(r.text, 'lxml')
        results = soup.find_all('li', {'class': 'mortar-wrapper'})
        for item in results:
            apartment = {
                'Property_name': item.find('span', {'class': 'js-placardTitle title'}).text,
                'Unit_name': item.find(''),
                'Formatted_address': item.find('div', {'class': 'property-address js-url'}).text,
                'City&State': _city,
                'Bedrooms': item.find('div', {'class': 'bed-range'}).text,
                'Price_Range': item.find('div', {'class': 'price-range'}).text,
                'Availability': item.find('div', {'class': 'availability'}).text,
                'Property_Amenities': item.find('div', {'class': 'property-amenities'}).text.strip(),
                'Phone_Number': item.find('a', {'class': 'phone-link js-phone'}).attrs['href'],
            }
            apartments.append(apartment)
    print(apartments)
    return apartments

#def output(apartments):
#    aptdf = pd.DataFrame(apartments)
#    aptdf.to_csv('apts.csv', index=False)
#    print('Saved to CSV')
#    return

if __name__ == '__main__':
    urls = get_urls()
    resp = get_data(urls)
    df = pd.DataFrame(parse(resp))
    df.to_csv('apts.csv', index=False)
    #output(apartments)
Edited the code to correct the format, but it still won't run or debug.
The code below gives me the required product data as a table. It works fine for most links; however, for some it stops midway with the error 'NoneType' object has no attribute 'find_all' at table.find_all('tr').
I believe this is because the table does not exist for some products, so I tried adding an if condition on the existence of the table, but that also doesn't seem to help. What changes should I make in the code below?
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

data = []
for i in range(1, 3):
    print(i)
    res = requests.get(url + str(i), headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    p_links = [i["data-link"] for i in soup.find("ul", {"id": "products-list"}).find_all("li", class_=["openlink", "item"])]
    for prod_url in p_links:
        print(prod_url)
        temp = {"Product URL": prod_url}
        prod_res = requests.get(prod_url, headers=headers)
        prod_soup = BeautifulSoup(prod_res.text, "html.parser")
        for p in prod_soup.find("div", class_="basic-information").find_all("p"):
            if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
            elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
            elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
        table = prod_soup.find("table", {"class": "specifications"})
        for tr in table.find_all("tr"):
            temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
        data.append(temp)

pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
You can use try and except (see the Python documentation on errors and exceptions):
try:
    for tr in table.find_all("tr"):
        temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
except:
    pass
You can use this:
table = soup.select_one('table.specifications')      # None if the product has no specifications table
rows = table.findChildren('tr') if table else []
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

data = []
for i in range(1, 3):
    print(i)
    res = requests.get(url + str(i), headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    p_links = [i["data-link"] for i in soup.find("ul", {"id": "products-list"}).find_all("li", class_=["openlink", "item"])]
    for prod_url in p_links:
        print(prod_url)
        temp = {"Product URL": prod_url}
        prod_res = requests.get(prod_url, headers=headers)
        prod_soup = BeautifulSoup(prod_res.text, "html.parser")
        try:
            for p in prod_soup.find("div", class_="basic-information").find_all("p"):
                if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
                elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
                elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
            table = prod_soup.find("table", {"class": "specifications"})
            for tr in table.find_all("tr"):
                temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
        except:
            print("Failed for URL {}".format(prod_url))
        data.append(temp)
        time.sleep(2)

pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
Put a try/except not only around extracting the product specifications but also around extracting the item/brand/sku. In the except block, print which URLs failed so that you can try them again.
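If you want to actually retry those pages, one option is to collect the failed URLs and make a second pass. The sketch below builds on the loop above; the failed_urls list and the retry pass are illustrative additions of mine, not part of the original answer:

failed_urls = []

# in the except block above, remember the URL instead of only printing it:
#     failed_urls.append(prod_url)

# after the main loop, give each failed product page one more try
for prod_url in failed_urls:
    prod_res = requests.get(prod_url, headers=headers)
    prod_soup = BeautifulSoup(prod_res.text, "html.parser")
    table = prod_soup.find("table", {"class": "specifications"})
    if table is None:                  # the page really has no specifications table
        continue
    temp = {"Product URL": prod_url}
    for tr in table.find_all("tr"):
        temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
    data.append(temp)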
There are quite a few similar scenarios to this one, and I've been comparing mine with others, such as getting data from clustered nodes. But somehow I can't work out why my for loop isn't iterating and grabbing the text from the other elements, only from the first element of the node.
from requests import get
from bs4 import BeautifulSoup

url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}

response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')

def findDiv():
    try:
        for container in html_soup.find_all('div', {'class': 'section-trending-search-list'}):
            topic = container.select_one('div._1waRmo')
            if topic:
                print(1)
                d = {'Titles': topic.text.replace("\n", "")}
                print(2)
                l.append(d)
                return d
    except:
        d = None

findDiv()
print(l)
from requests import get
from bs4 import BeautifulSoup

url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}

response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')

def findDiv():
    try:
        for container in html_soup.find_all('div', {'class': '_25qBG5'}):
            topic = container.select_one('div._1waRmo')
            if topic:
                d = {'Titles': topic.text.replace("\n", "")}
                l.append(d)
        return d
    except:
        d = None

findDiv()
print(l)
Output:
[{'Titles': 'school backpack'}, {'Titles': 'oppo case'}, {'Titles': 'baby chair'}, {'Titles': 'car holder'}, {'Titles': 'sling beg'}]
Again, I suggest you use Selenium. If you run this again you will see that you get a different set of 5 dictionaries in the list. Every time you make a request they serve 5 random trending items. But they do have a 'change' button; with Selenium, you might be able to just click it and keep scraping all trending items.
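A rough Selenium sketch of that idea: the selector for the trending-item titles is taken from the answer above, while the selector for the 'change' button is a placeholder that would need to be checked against the live page:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://shopee.com.my/')
time.sleep(5)                                   # let the JavaScript render the page

titles = set()
for _ in range(10):                             # click "change" a few times, collecting titles
    for el in driver.find_elements(By.CSS_SELECTOR, 'div._1waRmo'):
        titles.add(el.text.strip())
    try:
        driver.find_element(By.CSS_SELECTOR, 'button.change-btn').click()   # hypothetical selector
    except Exception:
        break                                   # button not found; stop clicking
    time.sleep(2)

driver.quit()
print(titles)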
Try this: toplevel finds the root of the options, then we find all divs under that. I hope this is what you want.
from requests import get
from bs4 import BeautifulSoup

url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}

response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')

def findDiv():
    try:
        toplevel = html_soup.select_one('._25qBG5')
        for container in toplevel.find_all('div'):
            topic = container.select_one('._1waRmo')
            if topic:
                print(1)
                d = {'Titles': topic.text.replace("\n", "")}
                print(2)
                l.append(d)
                return d
    except:
        d = None

findDiv()
print(l)
This enumerates fine with a local file. When I tried the given URL, the website wasn't returning the HTML you show.
from requests import get
from bs4 import BeautifulSoup

url = 'path_in_here\\test.html'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}

example = open(url, "r")
text = example.read()

#response = get(url, headers=headers)
#html_soup = BeautifulSoup(response.text, 'html.parser')
html_soup = BeautifulSoup(text, 'html.parser')
print(text)

def findDiv():
    #try:
    print("finding toplevel")
    toplevel = html_soup.find("div", {"class": "_25qBG5"})
    print("found toplevel")
    divs = toplevel.findChildren("div", recursive=True)
    print("found divs")
    for container in divs:
        print("loop")
        topic = container.select_one('._1waRmo')
        if topic:
            print(1)
            d = {'Titles': topic.text.replace("\n", "")}
            print(2)
            l.append(d)
            return d
    #except:
    #    d = None
    #    print("error")

findDiv()
print(l)