I am trying to extract data from two tables in the same HTML page with BeautifulSoup. I have already extracted part of both tables, but not all of the rows. This is the code that I have:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html_content = urlopen('https://www.icewarehouse.com/Bauer_Vapor_X25_Ice_Hockey_Skates/descpage-V25XS.html')
soup = BeautifulSoup(html_content, "lxml")
tables = soup.find_all('table', attrs={'class' : 'orderingtable fl'})
for table_skates in tables:
    t_headers = []
    t_data = []
    t_row = {}
    for tr in table_skates.find_all('th'):
        t_headers.append(tr.text.replace('\n', '').strip())
    for td in table_skates.find_all('td'):
        t_data.append(td.text.replace('\n', '').strip())
    t_row = dict(zip(t_headers, t_data))
    print(t_row)
Here is the output that I get:
{'Size': '1.0', 'Price': '$109.99', 'Stock': '1', 'Qty': ''}
{'Size': '7.0', 'Price': '$159.99', 'Stock': '2+', 'Qty': ''}
You can easily get it by using read_html in pandas:
df = pd.read_html(html_content, attrs={'class' : 'orderingtable fl'})
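For completeness, a minimal sketch of that approach (assuming the page still serves those tables and accepts plain requests from pandas; read_html needs lxml or html5lib installed):
import pandas as pd

url = 'https://www.icewarehouse.com/Bauer_Vapor_X25_Ice_Hockey_Skates/descpage-V25XS.html'
# read_html accepts a URL (or an HTML string) and returns one DataFrame per matching table
dfs = pd.read_html(url, attrs={'class': 'orderingtable fl'})
for df in dfs:
    print(df)
# or combine both tables into a single frame:
# all_sizes = pd.concat(dfs, ignore_index=True)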
I'm trying to scrape the "Biggest Gainers" list of coins on https://coinmarketcap.com/
How do I access the nth child (Biggest Gainers) in the div with class_ = 'sc-1rmt1nr-0 sc-1rmt1nr-2 iMyvIy'?
I managed to get the data from the "Trending" section, but I'm having trouble targeting the "Biggest Gainers" top 3 text items.
I get: AttributeError: 'NoneType' object has no attribute 'p'
from bs4 import BeautifulSoup
import requests
source = requests.get('https://coinmarketcap.com/').text
soup = BeautifulSoup(source, 'lxml')
section = soup.find(class_='sc-1rmt1nr-0 sc-1rmt1nr-2 iMyvIy')
#List the top 3 Gainers
for top_gainers in section.find_all(class_='sc-16r8icm-0 sc-1uagfi2-0 bdEGog sc-1rmt1nr-1 eCWTbV')[1]:
    top_gainers = top_gainers.find(class_='sc-1eb5slv-0 iworPT')
    top_coins = top_gainers.p.text
    print(top_coins)
I would avoid those dynamic classes and instead use :-soup-contains and CSS combinators: first locate the desired block via its text, then use the combinators to specify the relationship of the final elements to extract info from.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
soup = bs(requests.get("https://coinmarketcap.com/").text, "lxml")
biggest_gainers = []
for i in soup.select(
    'div[color=text]:has(span:-soup-contains("Biggest Gainers")) > div ~ div'
):
    biggest_gainers.append(
        {
            "rank": int(i.select_one(".rank").text),
            "currency": i.select_one(".alias").text,
            "% change": f"{i.select_one('.icon-Caret-up').next_sibling}",
        }
    )
gainers = pd.DataFrame(biggest_gainers)
gainers
As mentioned by @QHarr, you should avoid dynamic identifiers. Similar to his approach, the selection here comes via :-soup-contains() and the known text of the element:
soup.select('div:has(>div>span:-soup-contains("Biggest Gainers")) ~ div')
To extract the texts I used stripped_strings and zipped it with the keys to a dict:
dict(zip(['rank','name','alias','change'],e.stripped_strings))
Example
from bs4 import BeautifulSoup
import requests
url = 'https://coinmarketcap.com/'
soup = BeautifulSoup(requests.get(url).content)
data = []
for e in soup.select('div:has(>div>span:-soup-contains("Biggest Gainers")) ~ div'):
    data.append(dict(zip(['rank','name','alias','change'], e.stripped_strings)))
Output
[{'rank': '1', 'name': 'Tenset', 'alias': '10SET', 'change': '1406.99'},
{'rank': '2', 'name': 'Burn To Earn', 'alias': 'BTE', 'change': '348.89'},
{'rank': '3', 'name': 'MetaCars', 'alias': 'MTC', 'change': '332.05'}]
You can use :nth-of-type to locate the "Biggest Gainers" parent div:
import requests
from bs4 import BeautifulSoup as soup
d = soup(requests.get('https://coinmarketcap.com/').text, 'html.parser')
bg = d.select_one('div:nth-of-type(2).sc-16r8icm-0.sc-1uagfi2-0.bdEGog.sc-1rmt1nr-1.eCWTbV')
data = [{'rank': i.select_one('span.rank').text,
         'name': i.select_one('p.sc-1eb5slv-0.iworPT').text,
         'change': i.select_one('span.sc-27sy12-0.gLZJFn').text}
        for i in bg.select('div.sc-1rmt1nr-0.sc-1rmt1nr-4.eQRTPY')]
Output:
[{'rank': '1', 'name': 'Tenset', 'change': '1308.72%'}, {'rank': '2', 'name': 'Burn To Earn', 'change': '421.82%'}, {'rank': '3', 'name': 'Aigang', 'change': '329.63%'}]
I am very new to Python and web scraping. I am scraping http://books.toscrape.com/index.html for a project, but I am stuck on the pagination logic. So far I managed to get every category, the book links and the information I needed within them, but I am struggling to scrape the next-page URL for every category. The first problem is that the next-page URL is incomplete (but that I can manage); the second problem is that the base URL I have to use changes for every category.
Here is my code:
import requests
from bs4 import BeautifulSoup
project = []
url = 'http://books.toscrape.com'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
links = []
categories = soup.findAll("ul", class_="nav nav-list")
for category in categories:
    hrefs = category.find_all('a', href=True)
    for href in hrefs:
        links.append(href['href'])

new_links = [element.replace("catalogue", "http://books.toscrape.com/catalogue") for element in links]
del new_links[0]

page = 0
books = []
for link in new_links:
    r2 = requests.get(link).text
    book_soup = BeautifulSoup(r2, "html.parser")
    print("category: " + link)
    nextpage = True
    while nextpage:
        book_link = book_soup.find_all(class_="product_pod")
        for product in book_link:
            a = product.find('a')
            full_link = a['href'].replace("../../..", "")
            print("book: " + full_link)
            books.append("http://books.toscrape.com/catalogue" + full_link)
        if book_soup.find('li', class_='next') is None:
            nextpage = False
            page += 1
            print("end of pagination")
        else:
            next_page = book_soup.select_one('li.next>a')
            print(next_page)
The part I am struggling with is the while loop inside "for link in new_links".
I am mostly looking for any example that can help me. Thank you!
If you do not want to scrape the links via http://books.toscrape.com/index.html directly while paging through all the results, you could reach your goal like this:
from bs4 import BeautifulSoup
import requests
base_url = 'http://books.toscrape.com/'
soup = BeautifulSoup(requests.get(base_url).text)
books = []
for cat in soup.select('.nav-list ul a'):
    cat_url = base_url + cat.get('href').rsplit('/',1)[0]
    url = cat_url
    while True:
        soup = BeautifulSoup(requests.get(url).text)
        ##print(url)
        books.extend(['http://books.toscrape.com/catalogue/' + a.get('href').strip('../../../') for a in soup.select('article h3 a')])
        if soup.select_one('li.next a'):
            url = f"{cat_url}/{soup.select_one('li.next a').get('href')}"
        else:
            break
books
Because the result would be the same, I would recommend skipping the detour over the categories:
from bs4 import BeautifulSoup
import requests
base_url = 'http://books.toscrape.com/'
url = 'https://books.toscrape.com/catalogue/page-1.html'
soup = BeautifulSoup(requests.get(base_url).text)
books = []
while True:
    soup = BeautifulSoup(requests.get(url).text)
    for a in soup.select('article h3 a'):
        bsoup = BeautifulSoup(requests.get(base_url + 'catalogue/' + a.get('href')).content)
        print(base_url + 'catalogue/' + a.get('href'))
        data = {
            'title': bsoup.h1.text.strip(),
            'category': bsoup.select('.breadcrumb li')[-2].text.strip(),
            'url': base_url + 'catalogue/' + a.get('href')
            ### add whatever is needed
        }
        data.update(dict(row.stripped_strings for row in bsoup.select('table tr')))
        books.append(data)
    if soup.select_one('li.next a'):
        url = f"{url.rsplit('/',1)[0]}/{soup.select_one('li.next a').get('href')}"
    else:
        break
books
Output
[{'title': 'A Light in the Attic',
'category': 'Poetry',
'url': 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
'UPC': 'a897fe39b1053632',
'Product Type': 'Books',
'Price (excl. tax)': '£51.77',
'Price (incl. tax)': '£51.77',
'Tax': '£0.00',
'Availability': 'In stock (22 available)',
'Number of reviews': '0'},
{'title': 'Tipping the Velvet',
'category': 'Historical Fiction',
'url': 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
'UPC': '90fa61229261140a',
'Product Type': 'Books',
'Price (excl. tax)': '£53.74',
'Price (incl. tax)': '£53.74',
'Tax': '£0.00',
'Availability': 'In stock (20 available)',
'Number of reviews': '0'},
{'title': 'Soumission',
'category': 'Fiction',
'url': 'http://books.toscrape.com/catalogue/soumission_998/index.html',
'UPC': '6957f44c3847a760',
'Product Type': 'Books',
'Price (excl. tax)': '£50.10',
'Price (incl. tax)': '£50.10',
'Tax': '£0.00',
'Availability': 'In stock (20 available)',
'Number of reviews': '0'},...]
I am trying to scrape data. Somehow the loop doesn't work correctly; it runs just once. I want to scrape all the names of the goods and their prices.
The goods are inside a "td", e.g. "Sendok Semen 7 Bulat", and the prices are inside a "div", e.g. "8.500".
Here is my code:
import requests
from bs4 import BeautifulSoup
url = 'https://www.ralali.com/search/semen'
res = requests.get(url)
html = BeautifulSoup(res.content,"html.parser")
#divs = html.find_all('div', class_ = "col-md-12 col-xs-12")
divs = html.findAll('div', class_ = "row d-block")
cnt = 0
for div in divs:
    cnt += 1
    #print(div, end="\n"*2)
    price = div.find('span', class_ = 'float-right')
    print(price.text.strip())
print(cnt)
Any help will be appreciated.
Thanks
What happens?
Somehow the loop doesn't work correctly. It loops just once.
It is not the loop that isn't working correctly; it is rather the way you are selecting things. html.findAll('div', class_ = "row d-block") will find only one <div> that matches your criteria.
How to fix?
Make your selection more specific, because what you really want to iterate over are the <tr> in the table. I often use CSS selectors, and the following will get the correct selection, so just replace your html.findAll('div', class_ = "row d-block") with it. Note: in new code use find_all() instead of findAll(); it is the newer syntax:
html.select('.d-block tbody tr')
Example
Will give you a well structured list of dicts:
import requests
from bs4 import BeautifulSoup
url = 'https://www.ralali.com/search/semen'
res = requests.get(url)
html = BeautifulSoup(res.content,"html.parser")
data = []
for row in html.select('.d-block tbody tr'):
    data.append(
        dict(
            zip(['pos','name','currency','price'], list(row.stripped_strings))
        )
    )
data
Output
[{'pos': '1',
'name': 'Sendok Semen 7 Bulat',
'currency': 'Rp',
'price': '8.500'},
{'pos': '2',
'name': 'Sendok Semen 8 Bulat Gagang Kayu',
'currency': 'Rp',
'price': '10.000'},
{'pos': '3', 'name': 'SEMEN', 'currency': 'Rp', 'price': '10.000'},
{'pos': '4',
'name': 'Sendok Semen 8 Gagang Kayu SWARDFISH',
'currency': 'Rp',
'price': '10.000'},...]
But Be Aware
It will only get you the "Top 10 - List Of Popular Semen Prices In Ralali" and not all goods and prices on the page; that is something you should clarify in your question.
Getting more data from all products
Option#1
Use the API that is provided by the website and iterate over the page parameter:
import requests
url = 'https://rarasearch.ralali.com/v2/search/item?q=semen'
res = requests.get(url)
data = []
for p in range(1, round(res.json()['total_item']/20)):
    url = f'https://rarasearch.ralali.com/v2/search/item?q=semen&p={p}'
    res = requests.get(url)
    data.extend(res.json()['items'])
print(data)
Output:
[{'id': 114797,
'name': 'TIGA RODA Semen NON semen putih',
'image': 'assets/img/Libraries/114797_TIGA_RODA_Semen_NON_semen_putih_1_UrwztohXHo9u1yRY_1625473149.png',
'alias': 'tiga-roda-semen-non-semen-putih-157561001',
'vendor_id': 21156,
'vendor_alias': 'prokonstruksi',
'rating': '5.00',
'vendor_status': 'A',
'vendor_name': 'Pro Konstruksi',
'vendor_location': 'Palembang',
'price': '101500.00',
'discount': 0,
'discount_percentage': 0,
'free_ongkir_lokal': 0,
'free_ongkir_nusantara': 1,
'is_stock_available': 1,
'minimum_order': 1,
'maximum_order': 999999999,
'unit_type': 'unit',
'ss_type': 0,
'is_open': 'Y',
'wholesale_price': []},
{'id': 268711,
'name': 'Sendok Semen Ukuran 6',
'image': 'assets/img/Libraries/268711_Sendok-Semen-Ukuran-6_HCLcQq6TUh5IiEPZ_1553521818.jpeg',
'alias': 'Sendok-Semen-Ukuran-6',
'vendor_id': 305459,
'vendor_alias': 'distributorbangunan',
'rating': None,
'vendor_status': 'A',
'vendor_name': 'Distributor Bangunan',
'vendor_location': 'Bandung',
'price': '11000.00',
'discount': 0,
'discount_percentage': 0,
'free_ongkir_lokal': 0,
'free_ongkir_nusantara': 0,
'is_stock_available': 1,
'minimum_order': 1,
'maximum_order': 999999999,
'unit_type': 'Unit',
'ss_type': 0,
'is_open': 'Y',
'wholesale_price': []},...]
Option#2
Use selenium, scroll to the bottom of the page to load all products, push the driver.page_source into your soup and start selecting, ...
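A rough sketch of that second option (untested; it assumes the product rows still sit in the '.d-block tbody tr' table used above and that the page loads more items as you scroll):
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get('https://www.ralali.com/search/semen')

# scroll until the page height stops growing, i.e. no more products get loaded
last_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)  # give the site time to load the next batch
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    last_height = new_height

# hand the rendered HTML over to BeautifulSoup and select as before
html = BeautifulSoup(driver.page_source, 'html.parser')
data = [dict(zip(['pos', 'name', 'currency', 'price'], row.stripped_strings))
        for row in html.select('.d-block tbody tr')]
driver.quit()
print(data)
Whether the infinite scroll actually loads everything depends on the page; if it paginates instead, clicking the "next" button in a loop would be the equivalent approach.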
I want to scrape the FirstName and the LastName from this website to use them in an automated browser input.
from lxml import html
import requests
page = requests.get('https://www.getnewidentity.com/uk-identity-generator.php')
tree = html.fromstring(page.content)
firstname = tree.xpath('//*[@id="reslist"]/tbody/tr[3]/td[2]/text()')
lastname = tree.xpath('//*[@id="reslist"]/tbody/tr[4]/td[2]/text()')
print ('FirstName: ', firstname)
print ('LastName: ', lastname)
input("close")
The website is this https://www.getnewidentity.com/uk-identity-generator.php
<table class="table table-bordered table-striped" id="reslist"><thead><tr><th colspan="2" class="bg-primary">General Information</th></tr></thead><tbody><tr><td style="width:150px;">Name</td><td><b>Kamila Harmon</b></td></tr>
<tr><td>Gender</td><td>Female</td></tr>
<tr><td>First Name</td><td>Kamila</td></tr>
<tr><td>Last Name</td><td>Harmon</td></tr>
<tr><td>Birthday</td><td>12/26/1989</td></tr>
find_all() returns a collection of elements.
strip() is a built-in string method used to remove all the leading and trailing whitespace from a string.
Ex.
from bs4 import BeautifulSoup
import requests
request = requests.post('https://www.getnewidentity.com/data/uk-identity-generator.php',
                        data={"num": "undefine", "add": "address", "unique": "true"})
soup = BeautifulSoup(request.content, 'lxml')
td = soup.find_all("td")
data = {}
for x in range(0, len(td)-1, 2):
    data[td[x].text.strip()] = td[x+1].text.strip()
print(data)
O/P:
{'Name': 'Jayda Key', 'Gender': 'Female', 'First Name': 'Jayda', 'Last Name': 'Key',
'Birthday': '55', 'NINO': 'EB 29 38 84 B', 'Address': 'Flat 31l\nMartin Walk, Leoberg, S81
0HT', 'Street Address': 'Flat 31l\nMartin Walk', 'State': 'Leoberg', 'Zip Code': 'S81 0HT',
'Phone': '+44(0)9487 957056', 'Credit Card Type': 'MasterCard', 'Credit Card Number':
'5246585772859818', 'CVV': '899', 'Expires': '02/2022', 'Username': 'twinhero', 'Email':
'Gamestomper@gmail.com', 'Password': 'Go7ByznZ', 'User Agent': 'Mozilla/5.0 (Macintosh;
Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2
Safari/601.7.7', 'Height': '1.85m (6.17ft)', 'Weight': '75.22kg (158.31pounds)',
'Blood type': 'O−'}
You say you want first name and last name; with bs4 4.7.1+ you can use :contains to target them appropriately. As already detailed in the other answer, the content is dynamically retrieved via a POST XHR request.
from bs4 import BeautifulSoup as bs
import requests
r = requests.post('https://www.getnewidentity.com/data/uk-identity-generator.php',data={"num":"undefine","add":"address","unique":"true"})
soup = bs(r.content,'lxml')
first_name = soup.select_one('td:contains("First Name") + td').text
last_name = soup.select_one('td:contains("Last Name") + td').text
full_name = soup.select_one('td:contains("Name") + td').text
print(first_name, last_name, full_name)
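Note that newer soupsieve/bs4 releases deprecate :contains() in favour of :-soup-contains(); if you see a deprecation warning, the same selectors would presumably look like this:
first_name = soup.select_one('td:-soup-contains("First Name") + td').text
last_name = soup.select_one('td:-soup-contains("Last Name") + td').text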
From this Tag:
<div class="matchDate renderMatchDateContainer" data-kickoff="1313244000000">Sat 13 Aug 2011</div>
I want to extract the "Sat 13 Aug 2011" using bs4 Beautiful Soup.
My current Code:
import requests
from bs4 import BeautifulSoup
url = 'https://www.premierleague.com/match/7468'
j = requests.get(url)
soup = BeautifulSoup(j.content, "lxml")
containedDateTag_string = soup.find_all('div', class_="matchDate renderMatchDateContainer")
print (containedDateTag_string)
When I run it, the printed output does not contain "Sat 13 Aug 2011"; it is simply stored and printed as:
[<div class="matchDate renderMatchDateContainer" data-kickoff="1313244000000"></div>]
Is there a way that I can get this string to be displayed? I have also tried parsing further into the tag with ".next_sibling" and ".text", but both display "[]" rather than the desired string, which is why I reverted to trying just 'div' to see if I could at least get the text to display.
Scraping the content from .page_source with selenium/ChromeDriver is the way to go here, since the date text is generated by JavaScript:
from selenium import webdriver
from bs4 import BeautifulSoup
url = "https://www.premierleague.com/match/7468"
driver = webdriver.Chrome()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'lxml')
Then you can do your .find the way you were doing:
>>> soup.find('div', {'class':"matchDate renderMatchDateContainer"}).text
'Sat 13 Aug 2011'
A batteries included solution with selenium itself:
>>> driver.find_element_by_css_selector("div.matchDate.renderMatchDateContainer").text
'Sat 13 Aug 2011'
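If you are on Selenium 4+, the find_element_by_* helpers have been removed, so the equivalent call would be:
from selenium.webdriver.common.by import By
driver.find_element(By.CSS_SELECTOR, "div.matchDate.renderMatchDateContainer").text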
Without Selenium, but using requests and the site's own API, it would look something like this (sure, you'd grab a bunch of other data about each game, but here's just the code for the date part):
import requests
from time import sleep
def scraper(match_id):
    headers = {
        "Origin": "https://www.premierleague.com",
        "Referer": "https://www.premierleague.com/match/%d" % match_id
    }
    api_endpoint = "https://footballapi.pulselive.com/football/broadcasting-schedule/fixtures/%d" % match_id
    r = requests.get(api_endpoint, headers=headers)
    if not r.status_code == 200:
        return None
    else:
        data = r.json()
        # this will return something like this:
        # {'broadcasters': [],
        #  'fixture': {'attendance': 25700,
        #              'clock': {'label': "90 +4'00", 'secs': 5640},
        #              'gameweek': {'gameweek': 1, 'id': 744},
        #              'ground': {'city': 'London', 'id': 16, 'name': 'Craven Cottage'},
        #              'id': 7468,
        #              'kickoff': {'completeness': 3,
        #                          'gmtOffset': 1.0,
        #                          'label': 'Sat 13 Aug 2011, 15:00 BST',
        #                          'millis': 1313244000000},
        #              'neutralGround': False,
        #              'outcome': 'D',
        #              'phase': 'F',
        #              'replay': False,
        #              'status': 'C',
        #              'teams': [{'score': 0,
        #                         'team': {'club': {'abbr': 'FUL',
        #                                           'id': 34,
        #                                           'name': 'Fulham'},
        #                                  'id': 34,
        #                                  'name': 'Fulham',
        #                                  'shortName': 'Fulham',
        #                                  'teamType': 'FIRST'}},
        #                        {'score': 0,
        #                         'team': {'club': {'abbr': 'AVL',
        #                                           'id': 2,
        #                                           'name': 'Aston Villa'},
        #                                  'id': 2,
        #                                  'name': 'Aston Villa',
        #                                  'shortName': 'Aston Villa',
        #                                  'teamType': 'FIRST'}}]}}
        return data

match_id = 7468
json_blob = scraper(match_id)
if json_blob is not None:
    date = json_blob['fixture']['kickoff']['label']
    print(date)
You need the headers with those two parameters to get the data.
So if you had a bunch of match_ids, you could just loop through them with this function:
for match_id in range(7000, 8000, 1):
    json_blob = scraper(match_id)
    if json_blob is not None:
        date = json_blob['fixture']['kickoff']['label']
        print(date)
    sleep(1)