I am very new to Python and web scraping. I am scraping http://books.toscrape.com/index.html for a project, but I am stuck on the pagination logic. So far I managed to get every category, the book links and the information I needed within them, but I am struggling to scrape the next-page URL for every category. The first problem is that the next-page URL is incomplete (but that I can manage); the second problem is that the base URL I have to use changes for every category.
Here is my code:
import requests
from bs4 import BeautifulSoup

project = []
url = 'http://books.toscrape.com'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

links = []
categories = soup.findAll("ul", class_="nav nav-list")
for category in categories:
    hrefs = category.find_all('a', href=True)
    for href in hrefs:
        links.append(href['href'])

new_links = [element.replace("catalogue", "http://books.toscrape.com/catalogue") for element in links]
del new_links[0]

page = 0
books = []
for link in new_links:
    r2 = requests.get(link).text
    book_soup = BeautifulSoup(r2, "html.parser")
    print("category: " + link)
    nextpage = True
    while nextpage:
        book_link = book_soup.find_all(class_="product_pod")
        for product in book_link:
            a = product.find('a')
            full_link = a['href'].replace("../../..", "")
            print("book: " + full_link)
            books.append("http://books.toscrape.com/catalogue" + full_link)
        if book_soup.find('li', class_='next') is None:
            nextpage = False
            page += 1
            print("end of pagination")
        else:
            next_page = book_soup.select_one('li.next>a')
            print(next_page)
The part I am struggling with is the while loop inside for link in new_links.
I am mostly looking for any example that can help me. Thank you!
If you do not want to page through all the results from http://books.toscrape.com/index.html directly, you could reach your goal via the categories like this:
from bs4 import BeautifulSoup
import requests

base_url = 'http://books.toscrape.com/'
soup = BeautifulSoup(requests.get(base_url).text, 'html.parser')

books = []
for cat in soup.select('.nav-list ul a'):
    cat_url = base_url + cat.get('href').rsplit('/', 1)[0]
    url = cat_url
    while True:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        # hrefs look like '../../../<book>/index.html'; drop the relative prefix
        books.extend(['http://books.toscrape.com/catalogue/' + a.get('href').replace('../', '')
                      for a in soup.select('article h3 a')])
        if soup.select_one('li.next a'):
            url = f"{cat_url}/{soup.select_one('li.next a').get('href')}"
        else:
            break
books
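As an aside on the "incomplete next-page URL" problem from the question: instead of hand-building the absolute URL, urllib.parse.urljoin from the standard library resolves a relative href against the page you are currently on. A small standalone sketch (not wired into the code above, values are examples of what the site serves):

from urllib.parse import urljoin

current = 'http://books.toscrape.com/catalogue/category/books/travel_2/index.html'
next_href = 'page-2.html'  # what li.next > a typically contains

# urljoin resolves the relative href against the current page URL
print(urljoin(current, next_href))
# http://books.toscrape.com/catalogue/category/books/travel_2/page-2.html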
Since the result would be the same, I would recommend skipping the detour over the categories:
from bs4 import BeautifulSoup
import requests

base_url = 'http://books.toscrape.com/'
url = 'https://books.toscrape.com/catalogue/page-1.html'

books = []
while True:
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for a in soup.select('article h3 a'):
        book_url = base_url + 'catalogue/' + a.get('href')
        bsoup = BeautifulSoup(requests.get(book_url).content, 'html.parser')
        print(book_url)
        data = {
            'title': bsoup.h1.text.strip(),
            'category': bsoup.select('.breadcrumb li')[-2].text.strip(),
            'url': book_url
            ### add whatever is needed
        }
        # each row of the product-information table has exactly two text cells
        data.update(dict(row.stripped_strings for row in bsoup.select('table tr')))
        books.append(data)
    if soup.select_one('li.next a'):
        url = f"{url.rsplit('/', 1)[0]}/{soup.select_one('li.next a').get('href')}"
    else:
        break
books
Output
[{'title': 'A Light in the Attic',
'category': 'Poetry',
'url': 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
'UPC': 'a897fe39b1053632',
'Product Type': 'Books',
'Price (excl. tax)': '£51.77',
'Price (incl. tax)': '£51.77',
'Tax': '£0.00',
'Availability': 'In stock (22 available)',
'Number of reviews': '0'},
{'title': 'Tipping the Velvet',
'category': 'Historical Fiction',
'url': 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
'UPC': '90fa61229261140a',
'Product Type': 'Books',
'Price (excl. tax)': '£53.74',
'Price (incl. tax)': '£53.74',
'Tax': '£0.00',
'Availability': 'In stock (20 available)',
'Number of reviews': '0'},
{'title': 'Soumission',
'category': 'Fiction',
'url': 'http://books.toscrape.com/catalogue/soumission_998/index.html',
'UPC': '6957f44c3847a760',
'Product Type': 'Books',
'Price (excl. tax)': '£50.10',
'Price (incl. tax)': '£50.10',
'Tax': '£0.00',
'Availability': 'In stock (20 available)',
'Number of reviews': '0'},...]
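A note on the data.update(dict(row.stripped_strings ...)) line above: it relies on each row of the product-information table having exactly two text cells, a label and a value, which dict() then pairs up. A tiny standalone demo of the idea:

from bs4 import BeautifulSoup

# minimal demo of the stripped_strings -> dict trick used above
row_html = '<table><tr><td>UPC</td><td>a897fe39b1053632</td></tr></table>'
demo = BeautifulSoup(row_html, 'html.parser')
print(dict(row.stripped_strings for row in demo.select('tr')))
# {'UPC': 'a897fe39b1053632'}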
Related
I am trying to webscrape the link below. As an example, I just want to scrape the first page. I would like to collect titles and authors for each of the 10 links you find on the first page.
To gather titles and authors, I wrote the following lines of code:
from bs4 import BeautifulSoup
import requests
import numpy as np
url = 'https://www.bis.org/cbspeeches/index.htm?m=1123'
r = BeautifulSoup(requests.get(url).content, features = "lxml")
r.select('#cbspeeches_list a') # '#cbspeeches_list a' got via SelectorGadget
However, I get an empty list. What am I doing wrong?
Thanks!
The data is loaded from an external source via an API POST request. You just have to use the API URL.
from bs4 import BeautifulSoup
import requests

payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url = 'https://www.bis.org/doclist/cbspeeches.htm'
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "X-Requested-With": "XMLHttpRequest"
}

req = requests.post(url, headers=headers, data=payload)
print(req)
soup = BeautifulSoup(req.content, "lxml")

data = []
for card in soup.select('.documentList tbody tr'):
    title = card.select_one('.title a').get_text()
    author = card.select_one('.authorlnk.dashed').get_text().strip()
    data.append({
        'title': title,
        'author': author
    })
print(data)
Output
[{'title': 'Pablo Hernández de Cos: Closing ceremony of the academic year 2021-2022', 'author': '\nPablo Hernández de Cos'}, {'title': 'Klaas Knot: Keti Koti 2022 marks turning point for the Netherlands Bank ', 'author': '\nKlaas Knot'}, {'title': 'Luis de Guindos: Challenges for monetary policy', 'author': '\nLuis de Guindos'}, {'title': 'Fabio Panetta: Europe as a common
shield - protecting the euro area economy from global shocks', 'author': '\nFabio Panetta'},
{'title': 'Victoria Cleland: Rowing in unison to enhance cross-border payments', 'author': '\nVictoria Cleland'}, {'title': 'Yaron Amir: A look at the future world of payments - trends, the market, and regulation', 'author': '\nYaron Amir'}, {'title': 'Ásgeir Jónsson: Speech – 61st Annual Meeting of the Central Bank of Iceland', 'author': '\nÁsgeir Jónsson'}, {'title': 'Lesetja Kganyago: Project Khokha 2 report launch', 'author': '\nLesetja Kganyago'}, {'title': 'Huw Pill: What did the monetarists ever do for us?', 'author': '\nHuw Pill'}, {'title': 'Shaktikanta Das: Inaugural address - Statistics Day Conference ', 'author': '\nShaktikanta Das'}]
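If you need more than the first 10 results, the payload already contains page and paging_length fields; here is a sketch that walks the first few pages by filling in page (an assumption on my part, I have not verified how the endpoint validates this parameter):

from bs4 import BeautifulSoup
import requests

url = 'https://www.bis.org/doclist/cbspeeches.htm'
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "X-Requested-With": "XMLHttpRequest"
}

data = []
for page in range(1, 4):  # first three pages as a demo
    payload = f'from=&till=&objid=cbspeeches&page={page}&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
    soup = BeautifulSoup(requests.post(url, headers=headers, data=payload).content, "lxml")
    for card in soup.select('.documentList tbody tr'):
        data.append({
            'title': card.select_one('.title a').get_text(),
            'author': card.select_one('.authorlnk.dashed').get_text().strip()
        })
print(len(data))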
Try this:
import requests
from bs4 import BeautifulSoup

data = {
    'from': '',
    'till': '',
    'objid': 'cbspeeches',
    'page': '',
    'paging_length': '25',
    'sort_list': 'date_desc',
    'theme': 'cbspeeches',
    'ml': 'false',
    'mlurl': '',
    'emptylisttext': ''
}

response = requests.post('https://www.bis.org/doclist/cbspeeches.htm', data=data)
soup = BeautifulSoup(response.content, 'lxml')

for elem in soup.find_all("tr"):
    # the title
    print(elem.find("a").text)
    # the author
    print(elem.find("a", class_="authorlnk dashed").text)
    print()
Prints out:
Pablo Hernández de Cos: Closing ceremony of the academic year 2021-2022
Pablo Hernández de Cos
Klaas Knot: Keti Koti 2022 marks turning point for the Netherlands Bank
Klaas Knot
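One caveat (an assumption on my part, since the rows shown above all carry links): if the table ever contains a row without the expected <a> tags, e.g. a header row, elem.find("a") returns None and .text raises an AttributeError. A defensive variant of the loop:

for elem in soup.find_all("tr"):
    title_link = elem.find("a")
    author_link = elem.find("a", class_="authorlnk dashed")
    if title_link is None or author_link is None:
        continue  # skip rows that lack the expected links
    print(title_link.text)
    print(author_link.text.strip())
    print()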
Whenever I try to extract the data, it returns an output of "None", and I am not sure whether it is my code (I followed the rules of using bs4) or whether it is just the website that is different to scrape.
My code:
import requests
import bs4 as bs
url = 'https://www.zomato.com/jakarta/pondok-indah-restaurants'
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
html = req.text
soup = bs.BeautifulSoup(html, "html.parser")
listings = soup.find('div', class_='sc-gAmQfK fKxEbD')
rest_name = listings.find('h4', class_='sc-1hp8d8a-0 sc-eTyWNx gKsZcT').text
##Output: AttributeError: 'NoneType' object has no attribute 'find'
print(listings)
##returns None
Here is the inspected tag of the website from which I try to get the h4 element showing the restaurant's name:
inspected element
What happens?
Classes are generated dynamically and may differ from what you see in your inspections via the developer tools, so you won't find what you are looking for.
How to fix?
It would be a better approach to select your targets via tag or id where available, because these are more static than CSS classes.
listings = soup.select('a:has(h4)')
Example
Iterating the listings and scraping several pieces of information:
import requests
import bs4 as bs

url = 'https://www.zomato.com/jakarta/pondok-indah-restaurants'
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
html = req.text
soup = bs.BeautifulSoup(html, "html.parser")

data = []
for item in soup.select('a:has(h4)'):
    data.append({
        'title': item.h4.text,
        'url': item['href'],
        'etc': '...'
    })
print(data)
Output
[{'title': 'Radio Dalam Diner', 'url': '/jakarta/radio-dalam-diner-pondok-indah/info', 'etc': '...'}, {'title': 'Aneka Bubur 786', 'url': '/jakarta/aneka-bubur-786-pondok-indah/info', 'etc': '...'}, {'title': "McDonald's", 'url': '/jakarta/mcdonalds-pondok-indah/info', 'etc': '...'}, {'title': 'KOPIKOBOY', 'url': '/jakarta/kopikoboy-pondok-indah/info', 'etc': '...'}, {'title': 'Kopitelu', 'url': '/jakarta/kopitelu-pondok-indah/info', 'etc': '...'}, {'title': 'KFC', 'url': '/jakarta/kfc-pondok-indah/info', 'etc': '...'}, {'title': 'HokBen Delivery', 'url': '/jakarta/hokben-delivery-pondok-indah/info', 'etc': '...'}, {'title': 'PHD', 'url': '/jakarta/phd-pondok-indah/info', 'etc': '...'}, {'title': 'Casa De Jose', 'url': '/jakarta/casa-de-jose-pondok-indah/info', 'etc': '...'}]
I am trying to scrape data. Somehow the loop doesn't work correctly; it loops just once. I want to scrape all the names of the goods and the prices.
The goods are inside "td" tags, e.g. "Sendok Semen 7 Bulat", and the prices are inside "div" tags, e.g. "8.500".
Here is my code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.ralali.com/search/semen'
res = requests.get(url)
html = BeautifulSoup(res.content, "html.parser")

#divs = html.find_all('div', class_ = "col-md-12 col-xs-12")
divs = html.findAll('div', class_ = "row d-block")
cnt = 0
for div in divs:
    cnt += 1
    #print(div, end="\n"*2)
    price = div.find('span', class_ = 'float-right')
    print(price.text.strip())
print(cnt)
Any help will be appreciated.
Thanks
What happens?
Somehow the loop doesn't work correctly. It loops just once.
It is not the loop that won't work correctly, it is rather the way you are selecting things: html.findAll('div', class_ = "row d-block") finds only one <div> that matches your criteria.
How to fix?
Make your selection more specific, because what you really want to iterate are the <tr> elements in the table. I often use CSS selectors, and the following will get the correct selection, so just replace your html.findAll('div', class_ = "row d-block"). Note: in new code use find_all() instead of findAll(); it is the newer syntax:
html.select('.d-block tbody tr')
Example
This will give you a well-structured list of dicts:
import requests
from bs4 import BeautifulSoup

url = 'https://www.ralali.com/search/semen'
res = requests.get(url)
html = BeautifulSoup(res.content, "html.parser")

data = []
for row in html.select('.d-block tbody tr'):
    data.append(
        dict(
            zip(['pos', 'name', 'currency', 'price'], list(row.stripped_strings))
        )
    )
data
Output
[{'pos': '1',
'name': 'Sendok Semen 7 Bulat',
'currency': 'Rp',
'price': '8.500'},
{'pos': '2',
'name': 'Sendok Semen 8 Bulat Gagang Kayu',
'currency': 'Rp',
'price': '10.000'},
{'pos': '3', 'name': 'SEMEN', 'currency': 'Rp', 'price': '10.000'},
{'pos': '4',
'name': 'Sendok Semen 8 Gagang Kayu SWARDFISH',
'currency': 'Rp',
'price': '10.000'},...]
But Be Aware
It will only help you to get the "Top 10 - List Of Popular Semen Prices In Ralali", not all goods and prices on the page --> that is something you should clarify in your question.
Getting more data from all products
Option#1
Use the API that is provided by the website and iterate over its page parameter:
import requests

url = 'https://rarasearch.ralali.com/v2/search/item?q=semen'
res = requests.get(url)

data = []
for p in range(1, round(res.json()['total_item']/20)):
    url = f'https://rarasearch.ralali.com/v2/search/item?q=semen&p={p}'
    res = requests.get(url)
    data.extend(res.json()['items'])
print(data)
Output:
[{'id': 114797,
'name': 'TIGA RODA Semen NON semen putih',
'image': 'assets/img/Libraries/114797_TIGA_RODA_Semen_NON_semen_putih_1_UrwztohXHo9u1yRY_1625473149.png',
'alias': 'tiga-roda-semen-non-semen-putih-157561001',
'vendor_id': 21156,
'vendor_alias': 'prokonstruksi',
'rating': '5.00',
'vendor_status': 'A',
'vendor_name': 'Pro Konstruksi',
'vendor_location': 'Palembang',
'price': '101500.00',
'discount': 0,
'discount_percentage': 0,
'free_ongkir_lokal': 0,
'free_ongkir_nusantara': 1,
'is_stock_available': 1,
'minimum_order': 1,
'maximum_order': 999999999,
'unit_type': 'unit',
'ss_type': 0,
'is_open': 'Y',
'wholesale_price': []},
{'id': 268711,
'name': 'Sendok Semen Ukuran 6',
'image': 'assets/img/Libraries/268711_Sendok-Semen-Ukuran-6_HCLcQq6TUh5IiEPZ_1553521818.jpeg',
'alias': 'Sendok-Semen-Ukuran-6',
'vendor_id': 305459,
'vendor_alias': 'distributorbangunan',
'rating': None,
'vendor_status': 'A',
'vendor_name': 'Distributor Bangunan',
'vendor_location': 'Bandung',
'price': '11000.00',
'discount': 0,
'discount_percentage': 0,
'free_ongkir_lokal': 0,
'free_ongkir_nusantara': 0,
'is_stock_available': 1,
'minimum_order': 1,
'maximum_order': 999999999,
'unit_type': 'Unit',
'ss_type': 0,
'is_open': 'Y',
'wholesale_price': []},...]
Option#2
Use selenium, scroll to the bottom of the page to load all products, push the driver.page_source to your soup and start selecting, ...
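A rough, untested sketch of that Option#2 (the waiting strategy is an assumption, and the site may require a real User-Agent or block automated browsers): scroll until the page height stops growing, then hand the rendered HTML to BeautifulSoup:

import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.ralali.com/search/semen')

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # crude fixed wait; WebDriverWait would be more robust
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # page height stopped growing -> all products loaded
    last_height = new_height

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
# ...start selecting from soup as in the examples above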
I am trying to extract some data from two tables in the same HTML with BeautifulSoup. Actually, I already extracted part of both tables but not all. This is the code that I have:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html_content = urlopen('https://www.icewarehouse.com/Bauer_Vapor_X25_Ice_Hockey_Skates/descpage-V25XS.html')
soup = BeautifulSoup(html_content, "lxml")
tables = soup.find_all('table', attrs={'class' : 'orderingtable fl'})

for table_skates in tables:
    t_headers = []
    t_data = []
    t_row = {}
    for tr in table_skates.find_all('th'):
        t_headers.append(tr.text.replace('\n', '').strip())
    for td in table_skates.find_all('td'):
        t_data.append(td.text.replace('\n', '').strip())
    t_row = dict(zip(t_headers, t_data))
    print(t_row)
Here is the output that I get:
{'Size': '1.0', 'Price': '$109.99', 'Stock': '1', 'Qty': ''}
{'Size': '7.0', 'Price': '$159.99', 'Stock': '2+', 'Qty': ''}
You can easily get it by using read_html in pandas. Note that read_html returns a list of DataFrames, one per matching table:

import pandas as pd

dfs = pd.read_html(html_content, attrs={'class' : 'orderingtable fl'})
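A self-contained variant (a sketch; pd.read_html also accepts a URL directly and needs lxml installed), concatenating the per-table frames into one:

import pandas as pd

# read_html returns one DataFrame per matching table
dfs = pd.read_html('https://www.icewarehouse.com/Bauer_Vapor_X25_Ice_Hockey_Skates/descpage-V25XS.html',
                   attrs={'class': 'orderingtable fl'})
df = pd.concat(dfs, ignore_index=True)
print(df)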
I want to scrape the FirstName and the LastName from this website to use them in an automated browser input.
from lxml import html
import requests

page = requests.get('https://www.getnewidentity.com/uk-identity-generator.php')
tree = html.fromstring(page.content)
firstname = tree.xpath('//*[@id="reslist"]/tbody/tr[3]/td[2]/text()')
lastname = tree.xpath('//*[@id="reslist"]/tbody/tr[4]/td[2]/text()')
print('FirstName: ', firstname)
print('LastName: ', lastname)
input("close")
The website is this https://www.getnewidentity.com/uk-identity-generator.php
<table class="table table-bordered table-striped" id="reslist"><thead><tr><th colspan="2" class="bg-primary">General Information</th></tr></thead><tbody><tr><td style="width:150px;">Name</td><td><b>Kamila Harmon</b></td></tr>
<tr><td>Gender</td><td>Female</td></tr>
<tr><td>First Name</td><td>Kamila</td></tr>
<tr><td>Last Name</td><td>Harmon</td></tr>
<tr><td>Birthday</td><td>12/26/1989</td></tr>
find_all() returns a collection of elements.
strip() is a built-in Python function used to remove all the leading and trailing whitespace from a string.
Ex.
from bs4 import BeautifulSoup
import requests

request = requests.post('https://www.getnewidentity.com/data/uk-identity-generator.php',
                        data={"num": "undefine", "add": "address", "unique": "true"})
soup = BeautifulSoup(request.content, 'lxml')
td = soup.find_all("td")

data = {}
for x in range(0, len(td) - 1, 2):
    data[td[x].text.strip()] = td[x + 1].text.strip()
print(data)
O/P:
{'Name': 'Jayda Key', 'Gender': 'Female', 'First Name': 'Jayda', 'Last Name': 'Key',
'Birthday': '55', 'NINO': 'EB 29 38 84 B', 'Address': 'Flat 31l\nMartin Walk, Leoberg, S81
0HT', 'Street Address': 'Flat 31l\nMartin Walk', 'State': 'Leoberg', 'Zip Code': 'S81 0HT',
'Phone': '+44(0)9487 957056', 'Credit Card Type': 'MasterCard', 'Credit Card Number':
'5246585772859818', 'CVV': '899', 'Expires': '02/2022', 'Username': 'twinhero', 'Email':
'Gamestomper@gmail.com', 'Password': 'Go7ByznZ', 'User Agent': 'Mozilla/5.0 (Macintosh;
Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2
Safari/601.7.7', 'Height': '1.85m (6.17ft)', 'Weight': '75.22kg (158.31pounds)',
'Blood type': 'O−'}
You say you want the first name and last name; with bs4 4.7.1+ you can use :contains to target them appropriately. As already detailed in the other answer, the content is dynamically retrieved via a POST XHR request.
from bs4 import BeautifulSoup as bs
import requests
r = requests.post('https://www.getnewidentity.com/data/uk-identity-generator.php',data={"num":"undefine","add":"address","unique":"true"})
soup = bs(r.content,'lxml')
first_name = soup.select_one('td:contains("First Name") + td').text
last_name = soup.select_one('td:contains("Last Name") + td').text
full_name = soup.select_one('td:contains("Name") + td').text
print(first_name, last_name, full_name)
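Note: in newer versions of bs4/Soup Sieve the :contains() pseudo-class has been deprecated in favour of :-soup-contains(); if the selectors above stop matching, try e.g. soup.select_one('td:-soup-contains("First Name") + td').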