Crawling data from a div - python

Good morning, I have a little issue when I try to crawl data from a div. For example, the website has a structure like:
<div class="score-result">
  Player1Name
  Player1Surname
  <div>score</div>
</div>
I would like to get the names, surnames, and scores of the players. I've written something like this, but it doesn't print anything.
def trade_spider(max_hall, max_period):
    hall = 2
    period = 1
    while hall <= max_hall:
        url = 'https://tabletennis.setkacup.com/en/schedule?date=2021-08-27&hall=' + str(hall) + '&' + 'period=' + str(period)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('table', {'class': 'score-result'}):
            score = link.get('score-result')
            print(score)
        hall=+1
        period=+1

Please check this code on your side (the schedule appears to be rendered by JavaScript, so this uses Selenium):
import os
import time
from selenium import webdriver

option = webdriver.ChromeOptions()
driver = webdriver.Chrome(os.path.abspath('chromedriver'), options=option)

hall = 2
period = 1
while hall <= 5:
    url = 'https://tabletennis.setkacup.com/en/schedule?date=2021-08-27&hall=' + \
        str(hall) + '&' + 'period=' + str(period)
    driver.get(url)
    time.sleep(5)  # give the JavaScript-rendered schedule time to load
    divs = driver.find_elements_by_css_selector("div.score-result")
    for div in divs:
        # you can add this to print the nested score <div> separately
        try:
            fund = div.find_element_by_tag_name("div").text
            print(fund)
        except Exception:
            pass
        print(div.text)
    hall = hall + 1
Hope this helps.
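If you also want the name, surname and score as separate fields, one option (a sketch, assuming exactly the markup shown in the question, where the two name lines are bare text nodes and the score sits in a nested div) is to split the rendered text:

# sketch: separate the name lines from the score, assuming the question's markup
for div in divs:
    score = div.find_element_by_tag_name("div").text
    lines = [ln for ln in div.text.splitlines() if ln.strip()]
    # div.text includes the nested div's text, so drop the score line
    name_parts = [ln for ln in lines if ln != score]
    print(name_parts, score)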

Related

BeautifulSoup can't find element class in HTML

I'm trying to scrape this page, which has 10 elements with class='name main-name', like this: sample source
But when I run this code:
import requests
from bs4 import BeautifulSoup
result = requests.get("https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong")
c = result.text
soup = BeautifulSoup(c, "html.parser")
comment_items = soup.find_all('div', class_="name main-name")
print(len(comment_items))
it returns 0 instead of 10. I have searched and tried many solutions on Stack Overflow but can't fix it.
Because the div name main-name doesn't appear in the HTML that requests downloads; it is generated by JavaScript afterwards. In this case Selenium is more powerful than BeautifulSoup:
from selenium import webdriver
driver_path = r'Your Chrome driver path'
browser = webdriver.Chrome(executable_path=driver_path)
browser.get("https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong")
get_element = browser.find_elements_by_css_selector("div[class='name main-name']")
print(len(get_element))
browser.close()
OUTPUT :
10
And you can also get names like:
for users in get_element:
    print(users.text)
OUTPUT :
Phạm Thị Kim Chi
My Linh Nguyen
Mr Vinh Bảo Hiểm Sức Khoẻ Sắc Đẹp
Ngô Thị Tuyết
Huỳnh Thị Bích Trâm
Linh Trúc Diêm
Nguyen Tu
Nguyen Thom
Hồ Thu Trang
Trầnthịtrắng
As I stated in the comments, it's generated dynamically. So here's an implementation with Selenium:
from selenium import webdriver
from bs4 import BeautifulSoup
url = "https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong"
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
driver.get(url)
c = driver.page_source
soup = BeautifulSoup(c, "html.parser")
comment_items = soup.find_all('div', {'class':"name main-name"})
print (len(comment_items))
driver.close()
Output:
10
You can use beautifulsoup4's select function with a CSS selector:
import requests
from bs4 import BeautifulSoup
result = requests.get("https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong")
c = result.text
soup = BeautifulSoup(c, "html.parser")
comment_items = soup.select("div.name.main-name")
print(len(comment_items))
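Note that select only changes how you match elements; against the plain requests response it will still return 0, because (as the other answers point out) the divs are rendered by JavaScript. A minimal sketch combining select with the Selenium page source:

from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get("https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong")
soup = BeautifulSoup(driver.page_source, "html.parser")  # parse the rendered DOM
print(len(soup.select("div.name.main-name")))  # should print 10 once rendered
driver.quit()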

print text inside parent div beautifulsoup

I'm trying to fetch each product's name and price from
https://www.daraz.pk/catalog/?q=risk but nothing shows up.
containers = page_soup.find_all("div", {"class": "c2p6A5"})
for container in containers:
    pname = container.findAll("div", {"class": "c29Vt5"})
    name = pname[0].text
    price1 = container.findAll("span", {"class": "c29VZV"})
    price = price1[0].text
    print(name)
    print(price)
There is JSON data in the page; you could get it from the <script> tag using BeautifulSoup, but I don't think that's needed, because you can get it directly with json and re:
import requests, json, re

html = requests.get('https://.......').text
jsonStr = re.search(r'window.pageData=(.*?)</script>', html).group(1)
jsonObject = json.loads(jsonStr)
for item in jsonObject['mods']['listItems']:
    print(item['name'])
    print(item['price'])
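If the script tag spans multiple lines, the dot in the pattern won't cross them by default; a slightly more defensive variant (an assumption, since the exact markup isn't shown in the thread) adds re.DOTALL and guards against a missing match:

# sketch: tolerate newlines inside the script tag and a missing pageData blob
m = re.search(r'window.pageData=(.*?)</script>', html, re.DOTALL)
if m:
    jsonObject = json.loads(m.group(1))
else:
    print('window.pageData not found; the page layout may have changed')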
If the page is dynamic, Selenium should take care of that:
from bs4 import BeautifulSoup
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.daraz.pk/catalog/?q=risk')
r = browser.page_source
page_soup = BeautifulSoup(r, 'html.parser')
containers = page_soup.find_all("div", {"class": "c2p6A5"})
for container in containers:
    pname = container.findAll("div", {"class": "c29Vt5"})
    name = pname[0].text
    price1 = container.findAll("span", {"class": "c29VZV"})
    price = price1[0].text
    print(name)
    print(price)
browser.close()
output:
Risk Strategy Game
Rs. 5,900
Risk Classic Board Game
Rs. 945
RISK - The Game of Global Domination
Rs. 1,295
Risk Board Game
Rs. 1,950
Risk Board Game - Yellow
Rs. 3,184
Risk Board Game - Yellow
Rs. 1,814
Risk Board Game - Yellow
Rs. 2,086
Risk Board Game - The Game of Global Domination
Rs. 975
...
I was wrong. The info needed to calculate the page count is present in the JSON, so you can get all the results. No regex is needed, as you can extract the relevant script tag directly. Also, you can build each page URL in a loop.
import requests
from bs4 import BeautifulSoup
import json
import math

def getNameAndPrice(url):
    # declare these global, otherwise the page count computed here would be
    # local to the function and the loop at the bottom would never run
    global resultCount, resultsPerPage, numPages
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    # strip the leading 'window.pageData=' characters to leave bare JSON
    data = json.loads(soup.select('script')[2].text.strip('window.pageData='))
    if url == startingPage:
        resultCount = int(data['mainInfo']['totalResults'])
        resultsPerPage = int(data['mainInfo']['pageSize'])
        numPages = math.ceil(resultCount / resultsPerPage)
    result = [[item['name'], item['price']] for item in data['mods']['listItems']]
    return result

resultCount = 0
resultsPerPage = 0
numPages = 0
link = "https://www.daraz.pk/catalog/?page={}&q=risk"
startingPage = "https://www.daraz.pk/catalog/?page=1&q=risk"
results = []
results.append(getNameAndPrice(startingPage))
for links in [link.format(page) for page in range(2, numPages + 1)]:
    results.append(getNameAndPrice(links))
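results ends up as a list of per-page lists; to get one flat list of [name, price] pairs (a small illustrative addition, not part of the original answer):

# flatten the per-page lists into a single list of [name, price] pairs
allItems = [pair for page in results for pair in page]
print(len(allItems), 'items scraped')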
Referring to the JSON answer, for someone who is very new like me:
you can use Selenium to navigate to the search results page like this.
PS: Many thanks to @ewwink. You saved my day!
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time  # time delay while the page loads
import json, re

keyword = 'fan'
opt = webdriver.ChromeOptions()
opt.add_argument('headless')
driver = webdriver.Chrome(options=opt)
# driver = webdriver.Chrome()
url = 'https://www.lazada.co.th/'
driver.get(url)
search = driver.find_element_by_name('q')
search.send_keys(keyword)
search.send_keys(Keys.RETURN)
time.sleep(3)  # wait 3 secs for the page to load
page_html = driver.page_source  # Selenium equivalent of page_html = webopen.read() for BS
driver.close()
jsonStr = re.search(r'window.pageData=(.*?)</script>', page_html).group(1)
jsonObject = json.loads(jsonStr)
for item in jsonObject['mods']['listItems']:
    print(item['name'])
    print(item['sellerName'])
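time.sleep(3) waits a fixed time regardless of how fast the page actually loads; an explicit wait is usually more robust. A sketch to use in its place (the CSS selector for a result card is a hypothetical placeholder, since Lazada's class names change):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for at least one result card to appear;
# "div.some-result-card" is a placeholder: inspect the page for the real selector
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.some-result-card"))
)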

ESPN.com Python web scraping issue

I am trying to pull the roster data for all college football teams because I want to run some analysis on team performance based on the composition of each roster.
My script works on the first page: it iterates over each team and can open each team's roster link, but then the BeautifulSoup commands I run on a team's roster page keep throwing IndexErrors. When I look at the HTML, it seems as if the commands I am writing should work, yet when I print the page source from BeautifulSoup I don't see what I see in Developer Tools in Chrome. Is this an instance of JS being used to serve up the content? If so, I thought Selenium got around that?
My code...
import requests
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

teams_driver = webdriver.Firefox()
teams_driver.get("http://www.espn.com/college-football/teams")
teams_html = teams_driver.page_source
teams_soup = BeautifulSoup(teams_html, "html5lib")

i = 0
for link_html in teams_soup.find_all('a'):
    if link_html.text == 'Roster':
        roster_link = 'https://www.espn.com' + link_html['href']
        roster_driver = webdriver.Firefox()
        roster_driver.get(roster_link)
        roster_html = teams_driver.page_source
        roster_soup = BeautifulSoup(roster_html, "html5lib")
        team_name_html = roster_soup.find_all('a', class_='sub-brand-title')[0]
        team_name = team_name_html.find_all('b')[0].text
        for player_html in roster_soup.find_all('tr', class_='oddrow'):
            player_name = player_html.find_all('a')[0].text
            player_pos = player_html.find_all('td')[2].text
            player_height = player_html.find_all('td')[3].text
            player_weight = player_html.find_all('td')[4].text
            player_year = player_html.find_all('td')[5].text
            player_hometown = player_html.find_all('td')[6].text
            print(team_name)
            print('\t', player_name)
        roster_driver.close()
teams_driver.close()
In your for loop you're using the html of the 1st page (roster_html = teams_driver.page_source — note teams_driver, not roster_driver), so you get an IndexError when you try to select the 1st item of team_name_html, because find_all returns an empty list.
Also you don't need to have all those instances of Firefox open, you can close the driver when you have the html.
teams_driver = webdriver.Firefox()
teams_driver.get("http://www.espn.com/college-football/teams")
teams_html = teams_driver.page_source
teams_driver.quit()
But you don't have to use selenium for this task, you can get all the data with requests and bs4.
import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.espn.com/college-football/teams")
teams_soup = BeautifulSoup(r.text, "html5lib")

for link_html in teams_soup.find_all('a'):
    if link_html.text == 'Roster':
        roster_link = 'https://www.espn.com' + link_html['href']
        r = requests.get(roster_link)
        roster_soup = BeautifulSoup(r.text, "html5lib")
        team_name = roster_soup.find('a', class_='sub-brand-title').find('b').text
        for player_html in roster_soup.find_all('tr', class_='oddrow'):
            player_name = player_html.find_all('a')[0].text
            player_pos = player_html.find_all('td')[2].text
            player_height = player_html.find_all('td')[3].text
            player_weight = player_html.find_all('td')[4].text
            player_year = player_html.find_all('td')[5].text
            player_hometown = player_html.find_all('td')[6].text
            print(team_name, player_name, player_pos, player_height, player_weight, player_year, player_hometown)
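Since the question imports csv with analysis in mind, one way to persist the rows instead of printing them (a sketch; the file name, column order, and sample row are assumptions):

import csv

# collect rows while scraping, e.g. rows.append([team_name, player_name, ...]) in the loop above
rows = [["Example Team", "Example Player", "QB", "6-2", "210", "SO", "Anytown, ST"]]  # placeholder row

with open("rosters.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["team", "name", "pos", "height", "weight", "year", "hometown"])
    writer.writerows(rows)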

python web scraping - tags not found

I'm trying to scrape some info from a website that requires a login.
Here is my code:
from selenium import webdriver
from bs4 import BeautifulSoup
from pandas import DataFrame
import requests
import pandas as pd
import numpy as np
import time
from lxml import html

driver = webdriver.Chrome(executable_path='/Applications/chromedriver')
driver.get("website/login")

# Login
username = driver.find_element_by_name('email')
username.send_keys('my email')
password = driver.find_element_by_name('password')
password.send_keys('my password')
form = driver.find_element_by_id('login-button')
form.submit()

# Search activities
driver.get('website/activities/search')
city = driver.find_element_by_name('location')
city.send_keys('London, England, United Kingdom')
key = driver.find_element_by_name('keywords')
key.send_keys('run')
activity = driver.find_element_by_name('activity_type')
activity.send_keys('Run')
search = driver.find_element_by_id('ride-search-button')
search.submit()

# Webscrape
pg = 15
for pg in range(1, pg):
    print('page = ', pg)
    list_links = [link.get_attribute('href') for link in driver.find_elements_by_xpath("//a[contains(@href, 'activities')]")][1:10]
    print(list_links)
    for i, link in enumerate(list_links):
        driver.get(link)
        page = requests.get(link)
        htmlpg = BeautifulSoup(page.content, 'html.parser')
        try:
            item = htmlpg.find_all('strong')
            # Distance = ?? to be completed, as item does not find anything with 7.1 in it
        except:
            driver.back()
            continue
        driver.back()
driver.quit()
The problem is that when I try to scrape the distance (7.1) using find_all, bs4 does not appear to find anything. It almost seems that it is not parsing the info on the current page (see the html below).
Can someone tell me what I'm doing wrong? Thank you,
Fede
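A likely cause, though the thread doesn't confirm it: requests.get(link) opens a fresh, unauthenticated session, so the logged-in page never reaches BeautifulSoup. A sketch (reusing driver and link from the loop above) that parses the source Selenium already loaded with the login session:

# reuse the authenticated Selenium session instead of re-fetching with requests
driver.get(link)
htmlpg = BeautifulSoup(driver.page_source, 'html.parser')
items = [s.get_text(strip=True) for s in htmlpg.find_all('strong')]
print(items)  # the distance (e.g. 7.1) should show up here if it lives in a <strong>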

How to click one of the href links from output that doesn't have a particular word in it?

I've parsed a list of href links and their titles from a webpage. I want to click all the links whose titles don't contain "[$]". Here is my code.
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import webbrowser
from selenium import webdriver
import urllib.request
import time
from bs4 import BeautifulSoup
import re

browser = webdriver.Chrome(r"C:\Users\vasanth\Downloads\Compressed\chromedriver.exe")
browser.get("http://englishworldwide.ning.com/events/event/listUpcoming")
tip = browser.page_source
soup = BeautifulSoup(tip, 'html.parser')
link = soup.find_all('div', {'class': "wrap xg_lightborder"})
for dept in link:
    lilly = dept.find_all('ul', {'class': 'clist'})
    for h3 in lilly:
        sill = h3.find_all('li')
        for sec in sill:
            tap = sec.find_all('div', {'class': 'tb'})
            for lip in tap:
                tappy = lip.find_all('h3')
                for lips in tappy:
                    tom = lips.find_all('a')
                    for pos, lee in enumerate(tom):
                        sappy = lee.get('href')
                        result = re.sub(r'<.*?>', "", str(lee))
                        print(result)
                        print(sappy)
Here is my output. I want to click all the links whose titles don't contain "[$]".
C:\Users\vasanth\AppData\Local\Programs\Python\Python35-32\python.exe C:/Users/vasanth/PycharmProjects/Youtube/jill.py
LEWWWP's round the clock Google+ Hangout Club!
http://englishworldwide.ning.com/events/lewwwp-s-24-7-google-hangout-club
Weekly Wednesday LEWWWP Site Text Chat
http://englishworldwide.ning.com/events/weekly-wednesday-lewwwp-site-text-chat-952
Improve your speaking fluency [$] faster-paced
http://englishworldwide.ning.com/events/improve-your-speaking-fluency-faster-paced-45
Exam Prep speaking practice [$] Answer, Discuss, Repeat
http://englishworldwide.ning.com/events/exam-prep-speaking-practice-answer-discuss-repeat-29
Transcription / Pronunciation class [SLOWER-paced / Novice level]
http://englishworldwide.ning.com/events/transcription-pronunciation-class-395
Process finished with exit code 0
EDIT 1:
I have gotten a step further: I can now find the links that don't have "[$]" in them, but I can't open those links by their positions; the following method doesn't open the specific links.
Here is the rest of my code...
tricky = BeautifulSoup(str(tom), 'html.parser')
href_links = lambda tag: (getattr(tag, 'name', None) == 'a' and not '$' in tag.get_text())
for pos, final in enumerate(tricky.find_all(href_links)):
    simmpy = final.get('href')
    print(simmpy)
    if pos == 2:
        webbrowser.open(simmpy)
    else:
        break
Just check whether the title contains a dollar sign:
s = "This be a string"
if s.find("$") == -1:
    print("$ not found")
else:
    print("Found $ in string")
