I'm trying to scrape the "Biggest Gainers" list of coins on https://coinmarketcap.com/
How do I access the nth child ("Biggest Gainers") inside the div with class 'sc-1rmt1nr-0 sc-1rmt1nr-2 iMyvIy'?
I managed to get the data from the "Trending" section, but I'm having trouble targeting the top 3 text items of the "Biggest Gainers" section.
I get AttributeError: 'NoneType' object has no attribute 'p'
from bs4 import BeautifulSoup
import requests
source = requests.get('https://coinmarketcap.com/').text
soup = BeautifulSoup(source, 'lxml')
section = soup.find(class_='sc-1rmt1nr-0 sc-1rmt1nr-2 iMyvIy')
#List the top 3 Gainers
for top_gainers in section.find_all(class_='sc-16r8icm-0 sc-1uagfi2-0 bdEGog sc-1rmt1nr-1 eCWTbV')[1]:
    top_gainers = top_gainers.find(class_='sc-1eb5slv-0 iworPT')
    top_coins = top_gainers.p.text
    print(top_coins)
I would avoid those dynamic classes and instead use :-soup-contains() and combinators: first locate the desired block via its text, then use the combinators to specify the relationship of the final elements to extract the info from.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
soup = bs(requests.get("https://coinmarketcap.com/").text, "lxml")
biggest_gainers = []
for i in soup.select(
    'div[color=text]:has(span:-soup-contains("Biggest Gainers")) > div ~ div'
):
    biggest_gainers.append(
        {
            "rank": int(i.select_one(".rank").text),
            "currency": i.select_one(".alias").text,
            "% change": f"{i.select_one('.icon-Caret-up').next_sibling}",
        }
    )
gainers = pd.DataFrame(biggest_gainers)
gainers
As mentioned by @QHarr, you should avoid dynamic identifiers. Similar to his approach, the selection here comes via :-soup-contains() and the known text of the element:
soup.select('div:has(>div>span:-soup-contains("Biggest Gainers")) ~ div')
To extract the texts I used stripped_strings and zipped them with the keys into a dict:
dict(zip(['rank','name','alias','change'],e.stripped_strings))
Example
from bs4 import BeautifulSoup
import requests
url = 'https://coinmarketcap.com/'
soup = BeautifulSoup(requests.get(url).content, 'lxml')
data = []
for e in soup.select('div:has(>div>span:-soup-contains("Biggest Gainers")) ~ div'):
    data.append(dict(zip(['rank','name','alias','change'], e.stripped_strings)))
Output
[{'rank': '1', 'name': 'Tenset', 'alias': '10SET', 'change': '1406.99'},
{'rank': '2', 'name': 'Burn To Earn', 'alias': 'BTE', 'change': '348.89'},
{'rank': '3', 'name': 'MetaCars', 'alias': 'MTC', 'change': '332.05'}]
You can use :nth-of-type to locate the "Biggest Gainers" parent div:
import requests
from bs4 import BeautifulSoup as soup
d = soup(requests.get('https://coinmarketcap.com/').text, 'html.parser')
bg = d.select_one('div:nth-of-type(2).sc-16r8icm-0.sc-1uagfi2-0.bdEGog.sc-1rmt1nr-1.eCWTbV')
data = [{'rank': i.select_one('span.rank').text,
         'name': i.select_one('p.sc-1eb5slv-0.iworPT').text,
         'change': i.select_one('span.sc-27sy12-0.gLZJFn').text}
        for i in bg.select('div.sc-1rmt1nr-0.sc-1rmt1nr-4.eQRTPY')]
Output:
[{'rank': '1', 'name': 'Tenset', 'change': '1308.72%'}, {'rank': '2', 'name': 'Burn To Earn', 'change': '421.82%'}, {'rank': '3', 'name': 'Aigang', 'change': '329.63%'}]
Related
I want to fetch all the list tags under the ul tag with id="demoFour" from https://www.parliament.lk/en/members-of-parliament/directory-of-members/?cletter=A.
Below is the code:
print(soup.find('ul',id='demoFour'))
But the output being displayed is:
<ul id="demoFour"></ul>
Content is served dynamically, based on the data of an additional XHR request, so you have to call that endpoint instead. You can inspect it by taking a look at the XHR tab in your browser's devtools.
Example
Instead of appending only the obvious fields to a list of dicts, you could also iterate over all detail pages while requesting them (see the sketch after the output).
from bs4 import BeautifulSoup
import requests, string
data = []
for letter in string.ascii_uppercase:
    result = requests.post(f'https://www.parliament.lk/members-of-parliament/directory-of-members/index2.php?option=com_members&task=all&tmpl=component&letter={letter}&wordfilter=&search_district=')
    for e in result.json():
        #result = requests.get(f"https://www.parliament.lk/en/members-of-parliament/directory-of-members/viewMember/{e['mem_intranet_id']}")
        data.append({
            'url': f"https://www.parliament.lk/en/members-of-parliament/directory-of-members/viewMember/{e['mem_intranet_id']}",
            'id': e['mem_intranet_id'],
            'name': e['member_sname_eng']
        })
data
Output
[{'url': 'https://www.parliament.lk/en/members-of-parliament/directory-of-members/viewMember/3266',
'id': '3266',
'name': 'A. Aravindh Kumar'},
{'url': 'https://www.parliament.lk/en/members-of-parliament/directory-of-members/viewMember/50',
'id': '50',
'name': 'Abdul Haleem'},
{'url': 'https://www.parliament.lk/en/members-of-parliament/directory-of-members/viewMember/3325',
'id': '3325',
'name': 'Ajith Rajapakse'},
{'url': 'https://www.parliament.lk/en/members-of-parliament/directory-of-members/viewMember/3296',
'id': '3296',
'name': 'Akila Ellawala'},
{'url': 'https://www.parliament.lk/en/members-of-parliament/directory-of-members/viewMember/3355',
'id': '3355',
'name': 'Ali Sabri Raheem'},...]
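A minimal sketch of that detail-page idea, building on the data list above. The page_title field is just a hypothetical stand-in: the real selectors depend on the detail page's markup, so treat everything beyond the request itself as an assumption.
from bs4 import BeautifulSoup
import requests

# hypothetical follow-up: visit each collected member URL
for member in data[:3]:  # limit to a few pages while testing
    detail = BeautifulSoup(requests.get(member['url']).text, 'html.parser')
    # <title> exists on any HTML page; swap in real selectors for the fields you need
    member['page_title'] = detail.title.get_text(strip=True) if detail.title else None

print(data[:3])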
Whenever I try to extract the data, I get an output of "None", and I am not sure whether it is my code (I followed the rules of using bs4) or whether the website is just different to scrape.
My code:
import requests
import bs4 as bs
url = 'https://www.zomato.com/jakarta/pondok-indah-restaurants'
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
html = req.text
soup = bs.BeautifulSoup(html, "html.parser")
listings = soup.find('div', class_='sc-gAmQfK fKxEbD')
rest_name = listings.find('h4', class_='sc-1hp8d8a-0 sc-eTyWNx gKsZcT').text
##Output: AttributeError: 'NoneType' object has no attribute 'find'
print(listings)
##returns None
Here is the inspected tag of the website, from which I try to get the h4 showing the restaurant's name: (screenshot of the inspected element)
What happens?
Classes are generated dynamically and may differ from what you inspected via the developer tools, so you won't find what you are looking for.
How to fix?
It would be a better approach to select your targets via tag or id, if available, because these are more static than CSS classes.
listings = soup.select('a:has(h4)')
Example
Iterate the listings and scrape several pieces of information:
import requests
import bs4 as bs
url = 'https://www.zomato.com/jakarta/pondok-indah-restaurants'
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
html = req.text
soup = bs.BeautifulSoup(html, "html.parser")
data = []
for item in soup.select('a:has(h4)'):
    data.append({
        'title': item.h4.text,
        'url': item['href'],
        'etc': '...'
    })
print(data)
Output
[{'title': 'Radio Dalam Diner', 'url': '/jakarta/radio-dalam-diner-pondok-indah/info', 'etc': '...'}, {'title': 'Aneka Bubur 786', 'url': '/jakarta/aneka-bubur-786-pondok-indah/info', 'etc': '...'}, {'title': "McDonald's", 'url': '/jakarta/mcdonalds-pondok-indah/info', 'etc': '...'}, {'title': 'KOPIKOBOY', 'url': '/jakarta/kopikoboy-pondok-indah/info', 'etc': '...'}, {'title': 'Kopitelu', 'url': '/jakarta/kopitelu-pondok-indah/info', 'etc': '...'}, {'title': 'KFC', 'url': '/jakarta/kfc-pondok-indah/info', 'etc': '...'}, {'title': 'HokBen Delivery', 'url': '/jakarta/hokben-delivery-pondok-indah/info', 'etc': '...'}, {'title': 'PHD', 'url': '/jakarta/phd-pondok-indah/info', 'etc': '...'}, {'title': 'Casa De Jose', 'url': '/jakarta/casa-de-jose-pondok-indah/info', 'etc': '...'}]
I am trying to scrape data. Somehow the loop doesn't work correctly; it loops just once. I want to scrape all the names of the goods and the prices.
The goods are inside "td", e.g. "Sendok Semen 7 Bulat", and the prices are inside "div", e.g. "8.500".
Here is my code :
import requests
from bs4 import BeautifulSoup
url = 'https://www.ralali.com/search/semen'
res = requests.get(url)
html = BeautifulSoup(res.content,"html.parser")
#divs = html.find_all('div', class_ = "col-md-12 col-xs-12")
divs = html.findAll('div', class_ = "row d-block")
cnt = 0
for div in divs:
    cnt += 1
    #print(div, end="\n"*2)
    price = div.find('span', class_='float-right')
    print(price.text.strip())
print(cnt)
Any help will be appreciated.
Thanks
What happens?
Somehow the loop doesn't work correctly. It loops just once.
It is not the loop that is working incorrectly; it is rather the way you are selecting things: html.findAll('div', class_="row d-block") finds only one <div> that matches your criteria.
How to fix?
Make your selection more specific, because what you really want to iterate over are the <tr> elements in the table. I often use CSS selectors, and the following will get the correct selection, so just replace your html.findAll('div', class_="row d-block"). Note: in new code, use find_all() instead of findAll(); it is the newer syntax:
html.select('.d-block tbody tr')
Example
This will give you a well-structured list of dicts:
import requests
from bs4 import BeautifulSoup
url = 'https://www.ralali.com/search/semen'
res = requests.get(url)
html = BeautifulSoup(res.content,"html.parser")
data = []
for row in html.select('.d-block tbody tr'):
    data.append(
        dict(
            zip(['pos','name','currency','price'], list(row.stripped_strings))
        )
    )
data
Output
[{'pos': '1',
'name': 'Sendok Semen 7 Bulat',
'currency': 'Rp',
'price': '8.500'},
{'pos': '2',
'name': 'Sendok Semen 8 Bulat Gagang Kayu',
'currency': 'Rp',
'price': '10.000'},
{'pos': '3', 'name': 'SEMEN', 'currency': 'Rp', 'price': '10.000'},
{'pos': '4',
'name': 'Sendok Semen 8 Gagang Kayu SWARDFISH',
'currency': 'Rp',
'price': '10.000'},...]
But Be Aware
It will just get you the "Top 10 - List Of Popular Semen Prices In Ralali", not all goods and prices on the page - that is something you should clarify in your question.
Getting more data from all products
Option#1
Use the API that is provided by the website and iterate over its page parameter:
import math
import requests

url = 'https://rarasearch.ralali.com/v2/search/item?q=semen'
res = requests.get(url)
data = []

# use ceil and include the final page, so no trailing items are missed
for p in range(1, math.ceil(res.json()['total_item'] / 20) + 1):
    url = f'https://rarasearch.ralali.com/v2/search/item?q=semen&p={p}'
    res = requests.get(url)
    data.extend(res.json()['items'])

print(data)
Output:
[{'id': 114797,
'name': 'TIGA RODA Semen NON semen putih',
'image': 'assets/img/Libraries/114797_TIGA_RODA_Semen_NON_semen_putih_1_UrwztohXHo9u1yRY_1625473149.png',
'alias': 'tiga-roda-semen-non-semen-putih-157561001',
'vendor_id': 21156,
'vendor_alias': 'prokonstruksi',
'rating': '5.00',
'vendor_status': 'A',
'vendor_name': 'Pro Konstruksi',
'vendor_location': 'Palembang',
'price': '101500.00',
'discount': 0,
'discount_percentage': 0,
'free_ongkir_lokal': 0,
'free_ongkir_nusantara': 1,
'is_stock_available': 1,
'minimum_order': 1,
'maximum_order': 999999999,
'unit_type': 'unit',
'ss_type': 0,
'is_open': 'Y',
'wholesale_price': []},
{'id': 268711,
'name': 'Sendok Semen Ukuran 6',
'image': 'assets/img/Libraries/268711_Sendok-Semen-Ukuran-6_HCLcQq6TUh5IiEPZ_1553521818.jpeg',
'alias': 'Sendok-Semen-Ukuran-6',
'vendor_id': 305459,
'vendor_alias': 'distributorbangunan',
'rating': None,
'vendor_status': 'A',
'vendor_name': 'Distributor Bangunan',
'vendor_location': 'Bandung',
'price': '11000.00',
'discount': 0,
'discount_percentage': 0,
'free_ongkir_lokal': 0,
'free_ongkir_nusantara': 0,
'is_stock_available': 1,
'minimum_order': 1,
'maximum_order': 999999999,
'unit_type': 'Unit',
'ss_type': 0,
'is_open': 'Y',
'wholesale_price': []},...]
Option#2
Use selenium, scroll to the bottom of the page to load all products, push the driver.page_source into your soup and start selecting, ...
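A minimal sketch of that option, assuming the page keeps loading products as you scroll; the scroll-until-the-height-stops-growing loop is a generic pattern, not something tested against Ralali's current markup:
from bs4 import BeautifulSoup
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get('https://www.ralali.com/search/semen')

# scroll down until the page height stops growing
last_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)  # give the site time to load more products
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    last_height = new_height

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
# ...then select from soup as shown above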
I am trying to extract some data from two tables in the same HTML with BeautifulSoup. Actually, I have already extracted part of both tables, but not all. This is the code that I have:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html_content = urlopen('https://www.icewarehouse.com/Bauer_Vapor_X25_Ice_Hockey_Skates/descpage-V25XS.html')
soup = BeautifulSoup(html_content, "lxml")
tables = soup.find_all('table', attrs={'class' : 'orderingtable fl'})
for table_skates in tables:
    t_headers = []
    t_data = []
    t_row = {}
    for tr in table_skates.find_all('th'):
        t_headers.append(tr.text.replace('\n', '').strip())
    for td in table_skates.find_all('td'):
        t_data.append(td.text.replace('\n', '').strip())
    t_row = dict(zip(t_headers, t_data))
    print(t_row)
Here is the output that I get:
{'Size': '1.0', 'Price': '$109.99', 'Stock': '1', 'Qty': ''}
{'Size': '7.0', 'Price': '$159.99', 'Stock': '2+', 'Qty': ''}
You can easily get it by using read_html in pandas. Note that pd.read_html returns a list of DataFrames, one per matching table:
import pandas as pd

dfs = pd.read_html(html_content, attrs={'class': 'orderingtable fl'})
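A self-contained sketch of that route, fetching the page with requests (assumed interchangeable with the urlopen call above) and concatenating the per-table DataFrames:
import pandas as pd
import requests

html = requests.get('https://www.icewarehouse.com/Bauer_Vapor_X25_Ice_Hockey_Skates/descpage-V25XS.html').text

# read_html returns one DataFrame per <table class="orderingtable fl">
dfs = pd.read_html(html, attrs={'class': 'orderingtable fl'})
skates = pd.concat(dfs, ignore_index=True)
print(skates)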
From this Tag:
<div class="matchDate renderMatchDateContainer" data-kickoff="1313244000000">Sat 13 Aug 2011</div>
I want to extract the "Sat 13 Aug 2011" using bs4 Beautiful Soup.
My current Code:
import requests
from bs4 import BeautifulSoup
url = 'https://www.premierleague.com/match/7468'
j = requests.get(url)
soup = BeautifulSoup(j.content, "lxml")
containedDateTag_string = soup.find_all('div', class_="matchDate renderMatchDateContainer")
print (containedDateTag_string)
When I run it, the printed output does not contain "Sat 13 Aug 2011"; it is simply stored and printed as:
[<div class="matchDate renderMatchDateContainer" data-kickoff="1313244000000"></div>]
Is there a way that I can get this string to be displayed? I have also tried parsing further through the tag with ".next_sibling" and ".text", with both displaying "[]" rather than the desired string, which is why I reverted to trying just 'div' to see if I could at least get the text to display.
Scraping the content via selenium/ChromeDriver's .page_source is the way to go here, since the date text is generated by JavaScript:
from selenium import webdriver
from bs4 import BeautifulSoup
url = "https://www.premierleague.com/match/7468"
driver = webdriver.Chrome()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'lxml')
Then you can do your .find the way you were doing:
>>> soup.find('div', {'class':"matchDate renderMatchDateContainer"}).text
'Sat 13 Aug 2011'
A batteries-included solution with selenium itself:
>>> driver.find_element_by_css_selector("div.matchDate.renderMatchDateContainer").text
'Sat 13 Aug 2011'
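Note that in Selenium 4 the find_element_by_* helpers were removed; the equivalent call, assuming the same driver as above, is:
from selenium.webdriver.common.by import By

driver.find_element(By.CSS_SELECTOR, "div.matchDate.renderMatchDateContainer").text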
Without Selenium, but using requests and the site's own API, it would look something like this (sure, you'd grab a bunch of other data about each game, but here's just the code for the date part):
import requests
from time import sleep
def scraper(match_id):
    headers = {
        "Origin": "https://www.premierleague.com",
        "Referer": "https://www.premierleague.com/match/%d" % match_id
    }
    api_endpoint = "https://footballapi.pulselive.com/football/broadcasting-schedule/fixtures/%d" % match_id
    r = requests.get(api_endpoint, headers=headers)
    if not r.status_code == 200:
        return None
    else:
        data = r.json()
        # this will return something like this:
        # {'broadcasters': [],
        #  'fixture': {'attendance': 25700,
        #              'clock': {'label': "90 +4'00", 'secs': 5640},
        #              'gameweek': {'gameweek': 1, 'id': 744},
        #              'ground': {'city': 'London', 'id': 16, 'name': 'Craven Cottage'},
        #              'id': 7468,
        #              'kickoff': {'completeness': 3,
        #                          'gmtOffset': 1.0,
        #                          'label': 'Sat 13 Aug 2011, 15:00 BST',
        #                          'millis': 1313244000000},
        #              'neutralGround': False,
        #              'outcome': 'D',
        #              'phase': 'F',
        #              'replay': False,
        #              'status': 'C',
        #              'teams': [{'score': 0,
        #                         'team': {'club': {'abbr': 'FUL',
        #                                           'id': 34,
        #                                           'name': 'Fulham'},
        #                                  'id': 34,
        #                                  'name': 'Fulham',
        #                                  'shortName': 'Fulham',
        #                                  'teamType': 'FIRST'}},
        #                        {'score': 0,
        #                         'team': {'club': {'abbr': 'AVL',
        #                                           'id': 2,
        #                                           'name': 'Aston Villa'},
        #                                  'id': 2,
        #                                  'name': 'Aston Villa',
        #                                  'shortName': 'Aston Villa',
        #                                  'teamType': 'FIRST'}}]}}
        return data

match_id = 7468
json_blob = scraper(match_id)
if json_blob is not None:
    date = json_blob['fixture']['kickoff']['label']
    print(date)
You need the header with those two parameters to get the data.
So if you had a bunch of match_id's you could just loop through them with this function going:
for match_id in range(7000, 8000, 1):
    json_blob = scraper(match_id)
    if json_blob is not None:
        date = json_blob['fixture']['kickoff']['label']
        print(date)
    sleep(1)  # be polite to the API between requests