print text inside parent div beautifulsoup - python

I'm trying to fetch each product's name and price from
https://www.daraz.pk/catalog/?q=risk but nothing shows up.
containers = page_soup.find_all("div", {"class": "c2p6A5"})
for container in containers:
    pname = container.findAll("div", {"class": "c29Vt5"})
    name = pname[0].text
    price1 = container.findAll("span", {"class": "c29VZV"})
    price = price1[0].text
    print(name)
    print(price)

There is JSON data in the page. You can get it from the <script> tag using BeautifulSoup, but I don't think that is needed, because you can get it directly with json and re:
import requests, json, re

html = requests.get('https://.......').text
jsonStr = re.search(r'window.pageData=(.*?)</script>', html).group(1)
jsonObject = json.loads(jsonStr)

for item in jsonObject['mods']['listItems']:
    print(item['name'])
    print(item['price'])

If the page is dynamic, Selenium should take care of that:
from bs4 import BeautifulSoup
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.daraz.pk/catalog/?q=risk')
r = browser.page_source
page_soup = BeautifulSoup(r, 'html.parser')
containers = page_soup.find_all("div", {"class": "c2p6A5"})
for container in containers:
    pname = container.findAll("div", {"class": "c29Vt5"})
    name = pname[0].text
    price1 = container.findAll("span", {"class": "c29VZV"})
    price = price1[0].text
    print(name)
    print(price)
browser.close()
output:
Risk Strategy Game
Rs. 5,900
Risk Classic Board Game
Rs. 945
RISK - The Game of Global Domination
Rs. 1,295
Risk Board Game
Rs. 1,950
Risk Board Game - Yellow
Rs. 3,184
Risk Board Game - Yellow
Rs. 1,814
Risk Board Game - Yellow
Rs. 2,086
Risk Board Game - The Game of Global Domination
Rs. 975
...

I was wrong. The info needed to calculate the page count is present in the JSON, so you can get all results. No regex is needed, as you can extract the relevant script tag directly. Also, you can build each page URL in a loop.
import requests
from bs4 import BeautifulSoup
import json
import math

def getNameAndPrice(url):
    # these are set on the first call and read by the paging loop below
    global resultCount, resultsPerPage, numPages
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    # note: str.strip removes a *set* of characters from both ends; it works
    # here only because the JSON payload starts with '{' and ends with '}'
    data = json.loads(soup.select('script')[2].text.strip('window.pageData='))
    if url == startingPage:
        resultCount = int(data['mainInfo']['totalResults'])
        resultsPerPage = int(data['mainInfo']['pageSize'])
        numPages = math.ceil(resultCount / resultsPerPage)
    result = [[item['name'], item['price']] for item in data['mods']['listItems']]
    return result

resultCount = 0
resultsPerPage = 0
numPages = 0
link = "https://www.daraz.pk/catalog/?page={}&q=risk"
startingPage = "https://www.daraz.pk/catalog/?page=1&q=risk"

results = []
results.append(getNameAndPrice(startingPage))
for links in [link.format(page) for page in range(2, numPages + 1)]:
    results.append(getNameAndPrice(links))
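Note that results ends up as a list of per-page lists. If you want a single flat list of [name, price] pairs, a one-line flatten works (all_items is just an illustrative name):

all_items = [pair for page in results for pair in page]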

Referring to the JSON answer above, for someone who is very new like me: you can use Selenium to navigate to the search result page like this.
PS: Thanks very much to @ewwink. You saved my day!
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time  # time delay when the page loads
import json, re

keyword = 'fan'
opt = webdriver.ChromeOptions()
opt.add_argument('headless')
driver = webdriver.Chrome(options=opt)
# driver = webdriver.Chrome()
url = 'https://www.lazada.co.th/'
driver.get(url)
search = driver.find_element_by_name('q')
search.send_keys(keyword)
search.send_keys(Keys.RETURN)
time.sleep(3)  # wait 3 seconds for the page to load
page_html = driver.page_source  # Selenium way of page_html = webopen.read() for BS
driver.close()

jsonStr = re.search(r'window.pageData=(.*?)</script>', page_html).group(1)
jsonObject = json.loads(jsonStr)
for item in jsonObject['mods']['listItems']:
    print(item['name'])
    print(item['sellerName'])
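A side note: the fixed time.sleep(3) can be flaky on slow connections. Below is a minimal sketch of an explicit wait instead; the div.c2p6A5 selector is borrowed from the catalog markup shown earlier and is an assumption, so adjust it to an element that actually appears on your result page.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for at least one product container to render
# (the CSS selector is an assumption; replace with one from the live page)
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.c2p6A5'))
)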


Grabbing dynamic data returns wrong value and a None

The snippet tries to grab two data items from a URL; however, one returns the wrong value and the other returns None.
import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome('chromedriver.exe')
url = "https://poocoin.app/tokens/0xb081cbaaa86959fc4033fde02bc539c4e897f0a1"
driver.get(url)
time.sleep(8)
soup = BeautifulSoup(driver.page_source, 'lxml')
ts = soup.find('div', class_='px-3')
mc = soup.find('span', class_='text-success')
print(ts)
print(mc.text)
driver.quit()
Current Output:
Total Supply:100,000,000,000Market Cap: (Includes locked, excludes burned)$335,743Pc v2 | BUNDL/BNB LP Holdings:187.41 BNB ($94,329) | Chart | HoldersBUNDL TransactionsBUNDL ContractBUNDL HoldersDev Wallet Checker Bitquery Explorer PooCoin Visits chart
$2.22
Wanted Output:
Total Supply:
100,000,000,000
Market Cap: (Includes locked, excludes burned)
$238,945
You can use .stripped_strings and select the required data from the list of strings.
import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome('chromedriver.exe')
url = "https://poocoin.app/tokens/0xb081cbaaa86959fc4033fde02bc539c4e897f0a1"
driver.get(url)
time.sleep(8)
soup = BeautifulSoup(driver.page_source, 'lxml')
ts = soup.find('div', class_='px-3')
x = list(ts.stripped_strings)[:5]
print(f'{x[0]}\n{x[1]}\n\n{x[2]}{x[3]}\n{x[4]}')
driver.quit()
Total Supply:
100,000,000,000
Market Cap:(Includes locked, excludes burned)
$331,584
You can access the data using the .contents attribute. This way you can get the required info:
ts = soup.find('div', class_='px-3')
print(ts.contents[0])
print(ts.contents[2])
Which will output:
Total Supply:
100,000,000,000
Using the same procedure you can extract the remaining data.
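For example, pairing labels with values from the same stripped_strings list; this sketch assumes the first five strings are exactly the supply and market-cap fields shown in the output above.

strings = list(ts.stripped_strings)[:5]
total_supply = strings[1]  # '100,000,000,000'
market_cap = strings[4]    # e.g. '$331,584'
print('Total Supply:', total_supply)
print('Market Cap:', market_cap)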
Try this; hope it will help you. It uses a more accurate selection:
ts = soup.select_one('div.text-small > div.px-3')
mc = soup.select_one('div.text-small > div.px-3 > span.text-success')
print(ts.text)
print(mc.text)
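Note that select_one returns None when nothing matches, so if these class names ever change, .text would raise an AttributeError. A small guard (just a sketch) avoids that:

if ts is not None and mc is not None:
    print(ts.text)
    print(mc.text)
else:
    print('Selectors did not match; the page markup may have changed.')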

Crawling data from a div

Good morning, I have a little issue when I try to crawl data from a div. For example, I have a structure on the website like:
<div class="score-result">
    Player1Name
    Player1Surname
    <div>score</div>
</div>
I would like to get the names, surnames and scores of the players. I've written something like this, but it doesn't print anything.
def trade_spider(max_hall, max_period):
    hall = 2
    period = 1
    while hall <= max_hall:
        url = 'https://tabletennis.setkacup.com/en/schedule?date=2021-08-27&hall=' + str(hall) + '&' + 'period=' + str(period)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('table', {'class': 'score-result'}):
            score = link.get('score-result')
            print(score)
        hall=+1
        period=+1
Please check this code on your side.
import os
import time
from selenium import webdriver

service = webdriver.chrome.service.Service(os.path.abspath('chromedriver'))
service.start()
option = webdriver.ChromeOptions()
driver = webdriver.Chrome(os.path.abspath('chromedriver'), options=option)

hall = 2
period = 1
while hall <= 5:
    url = 'https://tabletennis.setkacup.com/en/schedule?date=2021-08-27&hall=' + \
        str(hall) + '&' + 'period=' + str(period)
    driver.get(url)
    time.sleep(5)
    divs = driver.find_elements_by_css_selector("div.score-result")
    for div in divs:
        # you can add this code to grab the inner score div separately
        try:
            fund = div.find_element_by_tag_name("div").text
            print(fund)
        except:
            pass
        print(div.text)
    hall = hall + 1
Hope this is helpful for you.
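Note that the loop above only advances hall while period stays at 1. If you also need several periods, as the max_period argument in the question suggests, here is a nested-loop sketch (max_hall and max_period are assumed limits):

for hall in range(2, max_hall + 1):
    for period in range(1, max_period + 1):
        url = ('https://tabletennis.setkacup.com/en/schedule'
               f'?date=2021-08-27&hall={hall}&period={period}')
        driver.get(url)
        time.sleep(5)
        for div in driver.find_elements_by_css_selector("div.score-result"):
            print(div.text)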

beautifulsoup can't get 'tr' in table

I'm trying to get a list of company names (e.g. 01 Ventures) and types (e.g. GENERAL PARTNER) from this website https://www.bvca.co.uk/Member-Directory. I'm using the code below:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.bvca.co.uk/Member-Directory'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())
table = soup.find('table', attrs={'id':'searchresults'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')
print(rows)
And I got an empty list.
Use the selenium package; you will need to install chromedriver.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

URL = 'https://www.bvca.co.uk/Member-Directory'
BrowserOptions = Options()
BrowserOptions.add_argument("--headless")
Browser = webdriver.Chrome(executable_path=r'chromedriver.exe', options=BrowserOptions)
Browser.get(URL)

while True:
    if Browser.find_elements_by_class_name('companyName'):
        break

html_source_code = Browser.execute_script("return document.body.innerHTML;")
soup = BeautifulSoup(html_source_code, 'html.parser')
x = [r.text for r in soup.find_all('h5', class_='companyName')]
print(x)
>>> ['01 Ventures', '01 Ventures', '17Capital LLP', '17Capital LLP', '1818 Venture Capital', ..., 'Zouk Capital LLP', 'Zouk Capital LLP']
The while loop waits until the company names are loaded before the HTML is saved.
The output was too large to put into the answer, so I could only show some of it.
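As an alternative to polling in a tight loop, Selenium's explicit waits can block until the companyName elements exist and give up after a timeout; a minimal sketch:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait at most 30 seconds for the first company name to render
WebDriverWait(Browser, 30).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'companyName'))
)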

bs4 scraping python get contents until specific class name

I want to scrape this site:
https://www.eduvision.edu.pk/institutions-detail.php?city=51I&institute=5_allama-iqbal-open-university-islamabad
I want only the bachelor data in this URL, which is under the class name academicsList; I don't want the MS (MASTERS) data below it.
I want my scraper to stop before the MS data. My logic is that we can set a temporary counter on class academicsHead and stop when we reach the second academicsHead.
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
header = {'user-agent': ua.chrome}
response = requests.get('https://www.eduvision.edu.pk/institutions-detail.php?city=51I&institute=5_allama-iqbal-open-university-islamabad', headers=header)
soup = BeautifulSoup(response.content, 'html.parser')
disciplines = soup.findAll("ul", {"class": "academicsList"})
# temp = soup.findAll("ul", {"class": "academicsHead"})
# stop at second academicsHead
for d in disciplines:
    print(d.findAll('li')[0].text)
We can check whether the class is 'academicsHead'; if it is, check whether the text is BACHELOR, and break the loop if it isn't.
Something like this would work:
import re

disciplines = soup.findAll('ul', attrs={'class': re.compile(r'academics+(.)+')})
for i in disciplines:
    if i['class'][0] == 'academicsHead':
        if i.find('li').text.strip() != 'BACHELOR':
            break
    else:
        print(i.find('li').text.strip())
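If you'd rather avoid the regex, find_all also accepts a plain list of class names and returns both kinds of ul in document order, so the loop body above stays the same:

disciplines = soup.find_all('ul', class_=['academicsHead', 'academicsList'])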

ESPN.com Python web scraping issue

I am trying to pull the roster data for all college football teams because I want to run some analysis on team performance based on the composition of their rosters.
My script works on the first page: it iterates over each team and can open the roster link for each team, but then the BeautifulSoup commands I run on a team's roster page keep throwing IndexErrors. When I look at the HTML, it seems as if the commands I am writing should work, yet when I print the page source from BeautifulSoup I don't see what I see in Developer Tools in Chrome. Is this an instance of JS being used to serve up the content? If so, I thought Selenium got around that?
My code...
import requests
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

teams_driver = webdriver.Firefox()
teams_driver.get("http://www.espn.com/college-football/teams")
teams_html = teams_driver.page_source
teams_soup = BeautifulSoup(teams_html, "html5lib")

i = 0
for link_html in teams_soup.find_all('a'):
    if link_html.text == 'Roster':
        roster_link = 'https://www.espn.com' + link_html['href']
        roster_driver = webdriver.Firefox()
        roster_driver.get(roster_link)
        roster_html = teams_driver.page_source
        roster_soup = BeautifulSoup(roster_html, "html5lib")
        team_name_html = roster_soup.find_all('a', class_='sub-brand-title')[0]
        team_name = team_name_html.find_all('b')[0].text
        for player_html in roster_soup.find_all('tr', class_='oddrow'):
            player_name = player_html.find_all('a')[0].text
            player_pos = player_html.find_all('td')[2].text
            player_height = player_html.find_all('td')[3].text
            player_weight = player_html.find_all('td')[4].text
            player_year = player_html.find_all('td')[5].text
            player_hometown = player_html.find_all('td')[6].text
            print(team_name)
            print('\t', player_name)
        roster_driver.close()
teams_driver.close()
In your for loop you're using the HTML of the first page (roster_html = teams_driver.page_source), so you get an IndexError when you try to select the first item of team_name_html, because find_all returns an empty list.
Also, you don't need to have all those instances of Firefox open; you can close the driver as soon as you have the HTML.
teams_driver = webdriver.Firefox()
teams_driver.get("http://www.espn.com/college-football/teams")
teams_html = teams_driver.page_source
teams_driver.quit()
But you don't have to use Selenium for this task; you can get all the data with requests and bs4.
import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.espn.com/college-football/teams")
teams_soup = BeautifulSoup(r.text, "html5lib")

for link_html in teams_soup.find_all('a'):
    if link_html.text == 'Roster':
        roster_link = 'https://www.espn.com' + link_html['href']
        r = requests.get(roster_link)
        roster_soup = BeautifulSoup(r.text, "html5lib")
        team_name = roster_soup.find('a', class_='sub-brand-title').find('b').text
        for player_html in roster_soup.find_all('tr', class_='oddrow'):
            player_name = player_html.find_all('a')[0].text
            player_pos = player_html.find_all('td')[2].text
            player_height = player_html.find_all('td')[3].text
            player_weight = player_html.find_all('td')[4].text
            player_year = player_html.find_all('td')[5].text
            player_hometown = player_html.find_all('td')[6].text
            print(team_name, player_name, player_pos, player_height, player_weight, player_year, player_hometown)
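Since the question already imports csv and the goal is analysis, here is a sketch of saving the rows instead of printing them; rosters.csv is just an illustrative filename, and the append replaces the print inside the player loop above.

import csv

rows = []
# inside the player loop, instead of print(...):
#     rows.append([team_name, player_name, player_pos, player_height,
#                  player_weight, player_year, player_hometown])

with open('rosters.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['team', 'name', 'position', 'height', 'weight', 'year', 'hometown'])
    writer.writerows(rows)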
