I am trying to scrape this webpage: https://www.oddsportal.com/moving-margins
But the code only works intermittently, and even when it works it doesn't scrape all the data I need per match.
u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.maximize_window()
driver.get(u)
#Use Explicit time wait for fast execution
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#moving_margins_content_overall")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
table_data = driver.find_elements_by_xpath("//div[@id='moving_margins_content_overall']//tr[@class='odd' or @class='dark']")
table = []
# Creating a list of lists, where each inner list holds all the data in a row with class dark or odd
for data in table_data:
    row = []
    dark_row = data.find_elements_by_xpath(".//th//a")
    for col in dark_row:
        row.append(col.text.replace("\n", " "))
    odd_row = data.find_elements_by_xpath(".//following-sibling::tr[@class='odd']//td")
    for col in odd_row:
        row.append(col.text.replace("\n", " "))
    table.append(row)
My goal is to store the data in a CSV file with these columns:
sport country competition handicap match_date match hdp_open hdp_close bookmaker
Tennis Czech Ostrava.. AH 0 Games Today 12:00 Karatsev A. - Otte O. 0.5 -1.5 Nordicbet
I think the problem in your code is that the page sometimes has a single "dark" row followed by many "odd" rows, so when you loop over the elements you create a single record for a table that actually holds several records.
This code should fit your needs, but keep in mind that it isn't optimal, since it doesn't handle possible exceptions; still, it's a starting point:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By
u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"chromedriver.exe")
driver.maximize_window()
driver.get(u)
#Use Explicit time wait for fast execution
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#moving_margins_content_overall")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
tables = driver.find_elements_by_xpath("//div[@id='moving_margins_content_overall']//table")
tableData = []
for table in tables:
    trDark = table.find_element_by_xpath('.//tr[@class="dark"]')
    trOdds = table.find_elements_by_xpath('.//tr[@class="odd"]')
    row = [trDark.text.strip().replace("\n", " ")]
    for odd in trOdds:
        tds = [
            td.text.strip().replace("\n", " ")
            for td in odd.find_elements_by_xpath('.//td')
        ]
        row = row + tds
    tableData.append(row)
print(tableData)
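Since your stated goal is a CSV file, you can dump tableData with the standard csv module once the loop finishes. This is a minimal sketch; the filename is just an example, and the scraped rows may need re-splitting before they line up with your target columns exactly:
import csv

with open("moving_margins.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header taken from the question.
    writer.writerow(["sport", "country", "competition", "handicap",
                     "match_date", "match", "hdp_open", "hdp_close", "bookmaker"])
    writer.writerows(tableData)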
I'm pretty new to web scraping and would appreciate any advice for the scenarios below:
I'm trying to produce a home loans listing table using data from https://www.canstar.com.au/home-loans/
I'm mainly trying to get listings values like the ones below:
Homestar Finance | Star Essentials P&I 80% | Variable
Unloan | Home Loan LVR <80% | Variable
TicToc Home Loans | Live-in Variable P&I | Variable
ubank | Neat Home Loan Owner Occupied P&I 70-80% | Variable
and push them into a nested list:
results = [['Homestar Finance', 'Star Essentials P&I 80%', 'Variable'], etc, etc]
For my first attempt, I used BeautifulSoup entirely and practiced on an offline version of the site.
import pandas as pd
from bs4 import BeautifulSoup
with open('/local/path/canstar.html', 'r') as canstar_offline:
    content = canstar_offline.read()
results = [['Affiliate', 'Product Name', 'Product Type']]
soup = BeautifulSoup(content, 'lxml')
for listing in soup.find_all('div', class_='table-cards-container'):
    for listing1 in listing.find_all('a'):
        if listing1.text.strip() != 'More details' and listing1.text.strip() != '':
            results.append(listing1.text.strip().split(' | '))
df = pd.DataFrame(results[1:], columns=results[0]).to_dict('list')
df2 = pd.DataFrame(df)
print(df2)
I got very close to what I wanted, but unfortunately it doesn't work on the live site because it looks like I'm getting blocked for repeated requests.
So I tried this again with Selenium, but now I'm stuck.
I tried to reuse as much of the transferable filtering logic from BeautifulSoup as I could, but I can't get anywhere close to what I had.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
url = 'https://www.canstar.com.au/home-loans'
results = []
driver = webdriver.Chrome()
driver.get(url)
# content = driver.page_source
# soup = BeautifulSoup(content)
time.sleep(3)
tables = driver.find_elements(By.CLASS_NAME, 'table-cards-container')
for table in tables:
    listing = table.find_element(By.TAG_NAME, 'a')
    print(listing.text)
This version (above) only returns one listing (I'm trying to get the entire table through iteration)
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
url = 'https://www.canstar.com.au/home-loans'
results = []
driver = webdriver.Chrome()
driver.get(url)
# content = driver.page_source
# soup = BeautifulSoup(content)
time.sleep(3)
tables = driver.find_elements(By.CLASS_NAME, 'table-cards-container')
for table in tables:
    # listing = table.find_element(By.TAG_NAME, 'a')
    print(table.text)
This version (above) looks like it gets all the text from the 'table-cards-container' class, but I'm unable to filter through it to just get the listings.
I think you can try something like this; I hope the comments in the code explain what it's doing.
# Needed libs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Initiate the driver and navigate
driver = webdriver.Chrome()
url = 'https://www.canstar.com.au/home-loans'
driver.get(url)
# We save the loans list
loans = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//cnslib-table-card")))
# We loop once per loan (XPath positions are 1-based, so we go up to len(loans) inclusive)
for i in range(1, len(loans) + 1):
    # With this XPath we get the title of the loan
    loan_title = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//a)[1]"))).text
    print(loan_title)
    # With this XPath we get the first percentage shown for the loan
    loan_first_percentage = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[1]"))).text
    print(loan_first_percentage)
    # With this XPath we get the second percentage shown for the loan
    loan_second_percentage = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[3]"))).text
    print(loan_second_percentage)
    # With this XPath we get the amount shown for the loan
    loan_amount = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//span)[5]"))).text
    print(loan_amount)
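If you want the nested list from your first attempt, you can split each title the same way the BeautifulSoup version did. A minimal sketch that reuses the imports and the loans list from above, assuming the link text still contains the ' | ' separator:
# Collect rows as [affiliate, product name, product type].
results = [['Affiliate', 'Product Name', 'Product Type']]
for i in range(1, len(loans) + 1):
    title = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, f"((//cnslib-table-card)[{i}]//a)[1]"))).text
    results.append(title.split(' | '))
print(results)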
I would like to scrape all the pages of the website below using Selenium but, until now, I could only do it for the first page. I also put the data in a Pandas DataFrame. How can I do this operation on all pages of the website? For now, I have:
from selenium import webdriver
import pandas as pd
driver = webdriver.Chrome(executable_path=r"C:/Users/Usuario/.spyder-py3/chromedriver.exe")
driver.get("https://www.mercadolivre.com.br/ofertas")
driver.implicitly_wait(3)
tituloProduto = driver.find_elements_by_class_name('promotion-item__title')
precoProduto = driver.find_elements_by_class_name('promotion-item__price')
df = pd.DataFrame()
produtos = []
for x in tituloProduto:
    produtos.append(x.text)
preco = []
for x in precoProduto:
    preco.append(x.text)
df['produto'] = produtos
df['preco'] = preco
df.head()
produto preco
Furadeira Parafusadeira Com Impacto 20v 2 Bate... R$ 34232
Sony Playstation 4 Slim 1tb Mega Pack: Ghost O... R$ 2.549
Tablet Galaxy A7 Lite T225 4g Ram 64gb Grafite... R$ 1.199
Smart Tv Philco Ptv55q20snbl Dled 4k 55 110v/220v R$ 2.799
Nintendo Switch 32gb Standard Cor Vermelho-néo... R$ 2.349
I found that the website you want to scrape has 209 pages in total, and each page can be accessed directly by its page number, e.g. https://www.mercadolivre.com.br/ofertas?page=2, so it shouldn't be too difficult.
One thing you can do is loop 209 times, getting the data from each page. A better approach would be to identify the "next page" button and loop until it's unavailable (see the sketch after the code below), but simply using the known page count (209) is easier, so we'll use that.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
driver = webdriver.Chrome(executable_path=r".../chromedriver.exe")
...
# Initialize outside the loop
preco = []
produtos = []
for i in range(1, 210):  # pages are numbered 1..209
    # Parse each page with the code you already have.
    driver.get('https://www.mercadolivre.com.br/ofertas?page=' + str(i))
    # You may have to wait for each page to load
    wait = WebDriverWait(driver, 10)
    wait.until(ec.visibility_of_element_located((By.CSS_SELECTOR, "a.sc-2vbwj7-22.blyzsR")))
    # If you want to speed things up, you can process pages in parallel,
    # but only do this if it's worth the development time.
    # Get the elements you want
    tituloProduto = driver.find_elements_by_class_name('promotion-item__title')
    precoProduto = driver.find_elements_by_class_name('promotion-item__price')
    for x in tituloProduto:
        produtos.append(x.text)
    for x in precoProduto:
        preco.append(x.text)
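For reference, the "next page" approach mentioned above would look roughly like this; "a.pagination-next" is a placeholder selector, so the real class name has to be taken from the page:
from selenium.common.exceptions import NoSuchElementException

while True:
    # ... scrape the current page here, as in the loop above ...
    try:
        next_button = driver.find_element_by_css_selector("a.pagination-next")  # placeholder selector
        next_button.click()
    except NoSuchElementException:
        # No "next page" button left, so we reached the last page.
        break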
Store the lists in a DataFrame and do what you want with them.
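For completeness, a minimal sketch of that last step; the output filename is just an example:
import pandas as pd

df = pd.DataFrame({'produto': produtos, 'preco': preco})
df.to_csv('ofertas.csv', index=False)  # example filename
print(df.head())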
You can use this code.
from selenium import webdriver
import pandas as pd
driver = webdriver.Chrome(
    executable_path=r"C:/Users/Usuario/.spyder-py3/chromedriver.exe")
url = "https://www.mercadolivre.com.br/ofertas?page="
df = pd.DataFrame()
produtos = []
preco = []
for i in range(1, 210):  # 209 pages, numbered 1 to 209
    driver.get(url + str(i))
    driver.implicitly_wait(3)
    tituloProduto = driver.find_elements_by_class_name('promotion-item__title')
    precoProduto = driver.find_elements_by_class_name('promotion-item__price')
    for x in tituloProduto:
        produtos.append(x.text)
    for x in precoProduto:
        preco.append(x.text)
df['produto'] = produtos
df['preco'] = preco
print(df)
Hope this helps. Thanks.
If you are using Scrapy instead, you could find the pagination link and set it to a next_page variable like so:
next_page = response.xpath('XPATH HERE').css('a::attr(href)').extract_first()
and then request it like so:
yield scrapy.Request(next_page, callback=self.parse)
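For context, here is a minimal sketch of how those two lines fit into a full Scrapy spider. The spider name and the CSS selectors are hypothetical placeholders, not the site's real markup:
import scrapy

class OfertasSpider(scrapy.Spider):
    # Hypothetical spider for the offers site from the question.
    name = 'ofertas'
    start_urls = ['https://www.mercadolivre.com.br/ofertas']

    def parse(self, response):
        # Placeholder selectors: adjust them to the page's real markup.
        for item in response.css('.promotion-item'):
            yield {
                'produto': item.css('.promotion-item__title::text').get(),
                'preco': item.css('.promotion-item__price::text').get(),
            }
        # Follow the pagination link until there is none left.
        next_page = response.css('a.pagination-next::attr(href)').get()  # placeholder selector
        if next_page:
            # urljoin handles relative hrefs.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)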
I want to scrape the table from this website:
https://www.oddsportal.com/moving-margins/
I need data inside the table #moving_margins_content_overall
I tried this code, but some games contain many class="odd" rows and I don't know how to associate the class="odd" data with the class="dark" data.
import requests
from bs4 import BeautifulSoup
import time
import json
import csv
from selenium import webdriver
u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.get(u)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
driver.implicitly_wait(60) # seconds
time.sleep(2)
elem = driver.find_element_by_xpath("//*")
source_code = elem.get_attribute("innerHTML")
soup = BeautifulSoup(source_code, 'html.parser')
for k in soup.select('#moving_margins_content_overall .table-main tbody tr'):
    sport = k.select_one('tr.dark th > a').get_text(strip=True)  # sport
    country = soup.select_one('tr.dark th a:nth-child(3) span').get_text(strip=True)  # country
    competition = soup.select_one('tr.dark th a:nth-child(5)').get_text(strip=True)  # competition
You can use the code below to store all the data in a list, where each row on the page is stored as a list.
u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.maximize_window()
driver.get(u)
#Use Explicit time wait for fast execution
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#moving_margins_content_overall")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
table_data = driver.find_elements_by_xpath("//div[@id='moving_margins_content_overall']//tr[@class='odd' or @class='dark']")
table = []
# Creating a list of lists, where each inner list holds all the data in a row with class dark or odd
for data in table_data:
    row = []
    dark_row = data.find_elements_by_xpath(".//th//a")
    for col in dark_row:
        row.append(col.text.replace("\n", " "))
    row.append(data.find_element_by_xpath(".//following-sibling::tr//th[@class='first2']").text)  # Add data in first2 th
    odd_row = data.find_elements_by_xpath(".//following-sibling::tr[@class='odd']//td")
    for col in odd_row:
        row.append(col.text.replace("\n", " "))
    row.append(odd_row[-1].find_element_by_xpath('.//a').get_attribute("title"))  # Add bookmaker name
    table.append(row)
for t in table:
    print(t)
As you can see in the output, the rugby union match has two "odd" rows, so the list for that game is longer.
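Because of that, the rows in table have different lengths, which matters if you want a rectangular CSV. A minimal sketch that pads every row to the same width before writing; the filename is just an example:
import csv

# Pad every row to the width of the longest one so the CSV stays rectangular.
width = max(len(row) for row in table)
padded = [row + [''] * (width - len(row)) for row in table]
with open('moving_margins.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(padded)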
I want to get all the results from a race. The website shows 50 rows per page.
I navigate to the next page (same URL with suffix #page-x) using Selenium, but I get a StaleElementReferenceException whenever I try to find elements (the td cells of the table) on the next page.
I tried to close the driver between the steps to get just one list of elements at a time. I've also tried to load the pages separately with the URL+suffix, but it doesn't load correctly. I've tried building separate lists (at first I wanted one big list with all the results).
from selenium import webdriver
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
# The block below works well and I get a list of cells as intended.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)
elements = driver.find_elements_by_tag_name("td")
course = []
for i in range(len(elements)):
    course.append(elements[i].text)
to_2 = driver.find_element_by_link_text("2")
to_2.click()
print(driver.current_url)
#I'm trying similar code for the next chunk, but it doesn't work.
elements2 = driver.find_elements_by_tag_name("td")
print(len(elements2))
print(elements2[5].text)
course2 = []
for i in range(len(elements2)):
    course2.append(elements2[i].text)
driver.close()
I would expect a new list (course2) with the results of the second page, but I get a stale element error. When I print the current URL, the result is as expected. When I print len(elements2), it's also OK. It looks like the problem occurs when I try to get the text of an element.
Solution 1:
Using BeautifulSoup together with Selenium; WebDriverWait waits for a certain condition to occur before proceeding further in the code.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)
data = []
while True:
    course = []
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
    page_soup = BeautifulSoup(driver.page_source, 'lxml')
    # get table data
    tbody = page_soup.find("tbody", {"id": "searchResultBoxParticipants"})
    rows = tbody.find_all("tr")
    for row in rows:
        rowData = []
        for td in row.find_all("td"):
            rowData.append(td.text)
        course.append(rowData)
    data.append(course)
    try:
        pagination = driver.find_element_by_class_name("simple-pagination")
        next_page = pagination.find_element_by_link_text("Suivant")
        # go to the next page
        next_page.click()
    except Exception as e:
        break
print(data)
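A side note on the original StaleElementReferenceException: it happens because the td elements located before the click are detached when the table re-renders. Solution 1 sidesteps this by re-reading driver.page_source on every iteration; if you prefer to stay in Selenium, you can also wait explicitly for the old table to go stale after clicking, roughly like this:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# After locating the "Suivant" link as above:
old_table = driver.find_element_by_class_name("tableJustrun")
next_page.click()
# Wait until the old table element is detached from the DOM;
# after that it is safe to locate the new page's elements.
WebDriverWait(driver, 10).until(EC.staleness_of(old_table))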
Solution 2:
Using the pandas library and pd.read_html.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)
data = []
while True:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
    tables = pd.read_html(driver.page_source)
    # append Participants table data
    data.append(tables[0])
    try:
        pagination = driver.find_element_by_class_name("simple-pagination")
        next_page = pagination.find_element_by_link_text("Suivant")
        # go to the next page
        next_page.click()
    except Exception as e:
        break
# Concatenate the per-page DataFrames
result = pd.concat(data)
print(result)
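If the goal is one flat results file, the concatenated DataFrame can be written out directly; "race_results.csv" is just an example name:
# Persist the combined results to disk.
result.to_csv("race_results.csv", index=False)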
I am using Selenium to run a search on Agoda and scrape all the hotel names on the page, but the output only returns 2 names.
Then I tried adding a line to scroll to the bottom; now the output gives me the first 2 names and the last 2 names (two from the beginning, two from the bottom).
I don't understand the problem. I added time.sleep() for each step, so the whole page should have loaded completely. Does Selenium only see the elements in the current viewport, so it can only scrape those?
My code is below:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(30)
def scrape():
    r = requests.get(current_page)
    if r.status_code == requests.codes.ok:
        print('start scraping!')
        hotel = driver.find_elements_by_class_name('hotel-name')
        hotels = []
        for h in hotel:
            if hotel:
                hotels.append(h.text)
        print(hotels, file=open("output.txt", 'a', encoding="utf-8"))
scrape()
Here is the page I want to scrape.
Try the script below to scroll the page down until no more results appear, and then scrape all available names:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.agoda.com/pages/agoda/default/DestinationSearchResult.aspx?asq=8wUBc629jr0%2B3O%2BxycijdcaVIGtokeWrEO7ShJumN8xsNvkFkEV9bUgNnbx6%2Bx22ncbzTLOPBjT84OgAAKXmu6quf8aEKRA%2FQH%2BGoyXgowLt%2BXyB8OpN1h2WP%2BnBM%2FwNPzD%2BpaeII93w%2Bs4dMWI4QPJNbZJ8DWvRiPsrPVVBJY7ilpMPlUermwV1UKIKfuyeis3BqRkJh9FzJOs0E98zXQ%3D%3D&city=9590&cid=-142&tick=636818018163&languageId=20&userId=3c2c4cb9-ba6d-4519-8ef4-c85dfd280b8f&sessionId=d4qzq2tgymjrwsf22lnadxpc&pageTypeId=1&origin=HK&locale=zh-TW&aid=130589&currencyCode=HKD&htmlLanguage=zh-tw&cultureInfoName=zh-TW&ckuid=3c2c4cb9-ba6d-4519-8ef4-c85dfd280b8f&prid=0&checkIn=2019-01-16&checkOut=2019-01-17&rooms=1&adults=2&children=0&priceCur=HKD&los=1&textToSearch=%E5%A4%A7%E9%98%AA&productType=-1&travellerType=1')
# Get initial list of names
hotels = wait(driver, 15).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'hotel-name')))
while True:
    # Scroll down to the last name in the list
    driver.execute_script('arguments[0].scrollIntoView();', hotels[-1])
    try:
        # Wait for more names to be loaded than we currently have
        wait(driver, 15).until(lambda d: len(d.find_elements_by_class_name('hotel-name')) > len(hotels))
        # Update the names list
        hotels = wait(driver, 15).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'hotel-name')))
    except:
        # Break the loop in case no new names loaded after the page scrolled down
        break
# Print the names list
print([hotel.text for hotel in hotels])
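If you still want the names appended to output.txt as in the original code, a minimal follow-up:
# Append one hotel name per line, matching the original output file.
with open('output.txt', 'a', encoding='utf-8') as f:
    for hotel in hotels:
        f.write(hotel.text + '\n')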