scrape table where rows need to be associated with previous elements - python

I want to scrape the table from this website:
https://www.oddsportal.com/moving-margins/
I need data inside the table #moving_margins_content_overall
I tried the code below, but some games contain many class="odd" rows and I don't know how to associate the class="odd" data with the class="dark" data.
import requests
from bs4 import BeautifulSoup
import time
import json
import csv
from selenium import webdriver
u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.get(u)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
driver.implicitly_wait(60) # seconds
time.sleep(2)
elem = driver.find_element_by_xpath("//*")
source_code = elem.get_attribute("innerHTML")
soup = BeautifulSoup(source_code, 'html.parser')
for k in soup.select('#moving_margins_content_overall .table-main tbody tr'):
    sport = k.select_one('tr.dark th > a').get_text(strip=True)  # sport
    country = soup.select_one('tr.dark th a:nth-child(3) span').get_text(strip=True)  # country
    competition = soup.select_one('tr.dark th a:nth-child(5)').get_text(strip=True)  # competition

You can use the code below to store all the data in a list, in which each row on the page is stored as a list.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.maximize_window()
driver.get(u)
# Use an explicit wait for fast execution
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#moving_margins_content_overall")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
table_data = driver.find_elements_by_xpath("//div[@id='moving_margins_content_overall']//tr[@class='odd' or @class='dark']")
table = []
# Create a list of lists, where each inner list holds all the data in one row (class dark or odd)
for data in table_data:
    row = []
    dark_row = data.find_elements_by_xpath(".//th//a")
    for col in dark_row:
        row.append(col.text.replace("\n", " "))
    row.append(data.find_element_by_xpath(".//following-sibling::tr//th[@class='first2']").text)  # add data from the first2 th
    odd_row = data.find_elements_by_xpath(".//following-sibling::tr[@class='odd']//td")
    for col in odd_row:
        row.append(col.text.replace("\n", " "))
    row.append(odd_row[-1].find_element_by_xpath('.//a').get_attribute("title"))  # add bookmaker name
    table.append(row)
for t in table:
    print(t)
Output: as you can see, for the rugby union match there are two odds rows, so the list for that game is longer.
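If you want those rows in a file rather than printed, a minimal sketch with the built-in csv module (assuming table is the list of lists built above, and using a file name of your choice) could be:
import csv

# write each scraped row (a list of strings) as one CSV row
with open("moving_margins.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(table)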

Related

scraping sports table from oddsportal

I am trying to scrape this webpage: https://www.oddsportal.com/moving-margins
But the code sometimes works and sometimes doesn't, and even when it works it doesn't scrape all the data I need per match.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.maximize_window()
driver.get(u)
# Use an explicit wait for fast execution
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#moving_margins_content_overall")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
table_data = driver.find_elements_by_xpath("//div[@id='moving_margins_content_overall']//tr[@class='odd' or @class='dark']")
table = []
# Create a list of lists, where each inner list holds all the data in one row (class dark or odd)
for data in table_data:
    row = []
    dark_row = data.find_elements_by_xpath(".//th//a")
    for col in dark_row:
        row.append(col.text.replace("\n", " "))
    odd_row = data.find_elements_by_xpath(".//following-sibling::tr[@class='odd']//td")
    for col in odd_row:
        row.append(col.text.replace("\n", " "))
    table.append(row)
My goal is to store the data in a CSV file with these columns:
sport    country  competition  handicap    match_date   match                  hdp_open  hdp_close  bookmaker
Tennis   Czech    Ostrava..    AH 0 Games  Today12:00   Karatsev A. - Otte O.  0.5       -1.5       Nordicbet
I think the problem in your code is that the page sometimes has a single "dark" row followed by many "odd" rows. So when you loop over the elements, you create a single record for a table that actually contains several records.
This code should fit your needs, but keep in mind that it's not optimal since it doesn't handle possible exceptions; it's just a starting point:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By

u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"chromedriver.exe")
driver.maximize_window()
driver.get(u)
# Use an explicit wait for fast execution
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#moving_margins_content_overall")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
tables = driver.find_elements_by_xpath("//div[@id='moving_margins_content_overall']//table")
tableData = []
for table in tables:
    trDark = table.find_element_by_xpath('.//tr[@class="dark"]')
    trOdds = table.find_elements_by_xpath('.//tr[@class="odd"]')
    row = [trDark.text.strip().replace("\n", " ")]
    for odd in trOdds:
        tds = [
            td.text.strip().replace("\n", " ")
            for td in odd.find_elements_by_xpath('.//td')
        ]
        row = row + tds
    tableData.append(row)
print(tableData)
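If you then want those rows in the CSV layout described above, a minimal first step with the built-in csv module (assuming tableData is the list built by the code above) could be:
import csv

# Note: rows can have a variable number of cells, because a single "dark" header
# may be followed by several "odd" rows, so the columns won't line up perfectly yet.
with open("moving_margins.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(tableData)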

how do i webscrape using python and beautifulsoup?

I'm very new to this, but I have an idea for a website and I want to give it a good go. My aim is to scrape the Asda website for prices and products, more specifically in this case whiskey. I want to grab the name and price of all the whiskey on the Asda website and put it into a nice table on my website, however I am having problems doing so; my code so far is giving a syntax error. Can anyone help?
The code so far is:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
res = driver.execute_script('return document.documentElement.outerHTML')
html_soup = BeautifulSoup(res, 'html.parser')
type(html_soup)
driver.quit
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'
whiskey_container = html_soup.find('div', {'class': 'co-product-lazy-container'})
for whiskey in whiskey_container:
    name = whiskey.find('a', {'class': 'co-product__anchor'})
    price = whiskey.find('div', {'class': 'co-product__price'})
    print(name, price)
Try this:
# WebDriverWait is better for waiting than time.sleep()
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import time  # or WebDriverWait
import csv  # for saving data in a table


# save csv file
def save_csv(dct):
    '''
    dct - dictionary with our data:
        "cap",
        "title",
        "price"
    '''
    name = "file.csv"  # file name, choose whatever you want
    print("[INFO] saving...")  # so we can see that the function runs
    with open(name, 'a', encoding="utf-8") as f:  # open the file in append mode "a"
        # write one row of data to the table
        writer = csv.writer(f)
        writer.writerow((dct['cap'],
                         dct['title'],
                         dct['price'],
                         ))


def scroll(driver):
    # scroll down so that all the data we are interested in gets loaded
    for i in range(1, 6):
        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(7)


driver = webdriver.Firefox()
driver.get("https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650?facets=shelf%3A1579926650%3A0000&nutrition=&sortBy=&page=0")

for i in range(2):  # 2 because there are only two pages of data
    element = WebDriverWait(driver, 30)  # or time.sleep(30)
    scroll(driver)  # load all the data we are interested in
    # get all product elements on the current page
    data = driver.find_elements_by_css_selector(".co-lazy-product-container .co-item")
    # iterate over the products and build a dictionary for each one
    for d in data:
        items = {}
        body = d.text.split("\n")
        items["cap"] = body[0]
        items["title"] = body[1]
        items["price"] = body[-2]
        save_csv(items)
    # pagination
    driver.find_element_by_css_selector(".co-pagination__last-page").click()

# close driver
driver.quit()
You have a syntax error; a ")" is missing:
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'
It should be:
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
--
By the way, your code won't work even with that fixed; you have a couple of logical errors,
and I doubt you can scrape that page with your current code.
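As a rough sketch of what the corrected loop logic could look like once the page has been rendered by Selenium (the class names co-product-lazy-container, co-item, co-product__anchor and co-product__price are taken from the question and the answer above, and may differ on the live site):
# html_soup is the BeautifulSoup object built from the Selenium page source above
container = html_soup.find('div', {'class': 'co-product-lazy-container'})
if container is not None:
    # iterate over the individual product tiles, not the raw children of the container
    for whiskey in container.find_all('div', {'class': 'co-item'}):
        name = whiskey.find('a', {'class': 'co-product__anchor'})
        price = whiskey.find('div', {'class': 'co-product__price'})
        if name and price:  # skip tiles that are missing either field
            print(name.get_text(strip=True), price.get_text(strip=True))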

Python Selenium iterating through table and click correspondingly each row

I am trying to iterate through a table and download XML files, but I am only downloading the contents of the first element in the table. How can I iterate correctly to download the content from each row?
Where should I include row after for row in table: to iterate correctly?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

options = Options()
options.add_argument("--incognito")
driver = webdriver.Chrome(options=options)
driver.get('https://fnet.bmfbovespa.com.br/fnet/publico/abrirGerenciadorDocumentosCVM?cnpjFundo=30983020000190')

driver.find_element_by_css_selector('input[type="search"]').click()
driver.find_element_by_css_selector('input[type="search"]').send_keys('rendimentos')
time.sleep(1)

table = driver.find_elements_by_xpath("//table[@id='tblDocumentosEnviados']//tr")
x = 0
for row in table:
    try:
        WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, "//table[@id='tblDocumentosEnviados']//td[text()='Rendimentos e Amortizações']/following-sibling::td[.//span[text()='Ativo']]/following-sibling::td//a[@title='Download do Documento']"))).click()
        x = x + 1
        print(x)
    except:
        print('except')
EDIT
I need to add the row iteration in this line for it to work:
try:
    WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH,
        "//table[@id='tblDocumentosEnviados']//td[text()='Rendimentos e Amortizações']/following-sibling::td[.//span[text()='Ativo']]/following-sibling::td//a[@title='Download do Documento']"))).click()
Instead of using Selenium to download the files, I prefer to use BeautifulSoup.
Change your table lookup to the one below to get the HTML:
from bs4 import BeautifulSoup

table = driver.find_elements_by_xpath("//table[@id='tblDocumentosEnviados']")
table_html = table[0].get_attribute('outerHTML')
table_html = BeautifulSoup(table_html, 'lxml')
list_url = []
for tr in table_html.find_all('tr'):
    for td in tr.find_all('td'):
        file_anchor = td.find('a', {'title': 'Download do Documento'})
        if file_anchor:
            complete_url = 'https://fnet.bmfbovespa.com.br/fnet/publico/{}'.format(file_anchor.get('href'))
            list_url.append(complete_url)
Now you can use requests.get to download each file. Hope this helps!
File download reference - https://www.tutorialspoint.com/downloading-files-from-web-using-python
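A minimal sketch of that download step (assuming the documents are publicly downloadable; the output file names here are just an illustration):
import requests

for i, url in enumerate(list_url):
    resp = requests.get(url)
    resp.raise_for_status()
    # save each document body to its own file
    with open('documento_{}.xml'.format(i), 'wb') as f:
        f.write(resp.content)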
Try the code below; it targets the row you are after.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

options = Options()
options.add_argument("--incognito")
driver = webdriver.Chrome(options=options)
driver.get('https://fnet.bmfbovespa.com.br/fnet/publico/abrirGerenciadorDocumentosCVM?cnpjFundo=30983020000190')

driver.find_element_by_css_selector('input[type="search"]').click()
driver.find_element_by_css_selector('input[type="search"]').send_keys('rendimentos')
time.sleep(1)

table = driver.find_elements_by_xpath("//table[@id='tblDocumentosEnviados']//tr")
print(len(table))
# XPath row indices are 1-based
for row in range(1, len(table) + 1):
    try:
        WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, "//table[@id='tblDocumentosEnviados']//tr[" + str(row) + "]//td[text()='Rendimentos e Amortizações']/following-sibling::td[.//span[text()='Ativo']]/following-sibling::td//a[@title='Download do Documento']"))).click()
        print(row)
    except:
        print('except')

Python: Find condition to break out of loop

The purpose of my code is to web scrape a table that has multiple pages.
So far, with the use of selenium & bs4, I've managed to do just that. However, I am having trouble breaking out of my loop, since the last page still has the 'next' button; as a result, the program keeps scraping the last page over and over.
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import csv
import datetime as dt
# website url
url = "https://poit.bolagsverket.se/poit/PublikSokKungorelse.do?method=redirect&forward=main.no.sidebar.sokresultat"
# website
driver = webdriver.Chrome()
driver.get(url)
# click sök kungörelse
driver.find_element_by_xpath('//*[#id="nav1-2"]').click()
# click avancerad sökning
driver.find_element_by_xpath('//*[#id="content"]/form/div[2]/a').click()
# select "annan period"
select = Select(driver.find_element_by_id('tidsperiod'))
select.select_by_value('6')
# select "skuldsanering"
select = Select(driver.find_element_by_id('amnesomrade'))
select.select_by_value('5')
# select "inledande av skuldsanering"
select = Select(driver.find_element_by_id('kungorelserubrik'))
select.select_by_value('29')
#calculate date
today = dt.date.today()
last_monday = str(today - dt.timedelta(days=7))
last_friday = str(today - dt.timedelta(days=3))
# insert search date
inputElement = driver.find_element_by_id("from")
inputElement.send_keys(last_monday)
inputElement = driver.find_element_by_id("tom")
inputElement.send_keys(last_friday)
# click on "sök"
driver.find_element_by_xpath('//*[#id="SokKungorelse"]').click()
#get updated url
html = driver.page_source
#scrape table
with open('skuldsanering.txt', 'w', encoding='utf-8') as r:
    while True:
        html = driver.page_source
        soup = bs(html, 'html.parser')
        table = soup.find('tbody')
        table_rows = table.find_all('tr')
        for tr in table_rows:
            td = tr.find_all('td')
            row = [i.get_text(strip=True) for i in td]
            csv_writer = csv.writer(r)
            csv_writer.writerows([row])
        try:
            driver.find_element_by_xpath('//*[@id="movenextTop"]').click()
            soup = bs(html, 'html.parser')
        except:
            # insert condition to break out of loop
            break
I was thinking it might be possible to include a click counter and break out of the loop when the number of clicks (x) equals y in "Page x of y"? If that's a good solution, how do I move forward? If not, what would be a better solution?
Thank you very much in advance!
The results page shows Page x of y; you can check whether x == y each time and break the loop when it is true.
Here's the tag I'm talking about.
<em class="gotopagebuttons">Sida 17 av 17</em>
You can split the string or try regex to get both the page numbers and then compare them.
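For example, a minimal sketch of that check, to be placed inside your while loop after you build soup (assuming the label text always has the form "Sida x av y"):
import re

label = soup.find("em", {"class": "gotopagebuttons"}).get_text(strip=True)  # e.g. "Sida 17 av 17"
current, total = re.match(r"Sida (\d+) av (\d+)", label).groups()
if current == total:
    break  # last page reached, stop paginating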
Hmm yeaaah, not really a fan of extracting the page number from raw text - but it seems to be the most convenient option - can't really think of another way of doing it. Try this:
def main():

    from selenium import webdriver
    from selenium.webdriver.support.ui import Select
    from bs4 import BeautifulSoup
    import datetime as dt
    import re

    url = "https://poit.bolagsverket.se/poit/PublikSokKungorelse.do"

    driver = webdriver.Chrome()
    driver.get(url)

    driver.find_element_by_xpath('//*[@id="nav1-2"]').click()
    driver.find_element_by_xpath('//*[@id="content"]/form/div[2]/a').click()

    select = Select(driver.find_element_by_id('tidsperiod'))
    select.select_by_value('6')

    select = Select(driver.find_element_by_id('amnesomrade'))
    select.select_by_value('5')

    select = Select(driver.find_element_by_id('kungorelserubrik'))
    select.select_by_value('29')

    today = dt.date.today()
    last_monday = str(today - dt.timedelta(days=7))
    last_friday = str(today - dt.timedelta(days=3))

    inputElement = driver.find_element_by_id("from")
    inputElement.send_keys(last_monday)
    inputElement = driver.find_element_by_id("tom")
    inputElement.send_keys(last_friday)

    driver.find_element_by_xpath('//*[@id="SokKungorelse"]').click()

    while True:
        page = driver.page_source
        soup = BeautifulSoup(page, "html.parser")

        label = soup.find("em", {"class": "gotopagebuttons"}).get_text(strip=True)
        pattern = r"Sida (\d+) av (\d+)"
        match = re.match(pattern, label)
        assert match is not None
        print(match.group())

        for row in soup.find("tbody").find_all("tr"):
            for td in row.find_all("td"):
                text = td.get_text(strip=True)
                print(" " * 4 + text)
            print(end="\n\n")

        if match.group(1) == match.group(2):
            # No more pages
            break

        driver.find_element_by_xpath('//*[@id="movenextTop"]').click()

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())

How to scrape multiple divs (and put them in a csv)?

I have this code to scrape tagged user IDs from media posts on Twitter:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import csv
import re
# Create a new instance of the Firefox driver
driver = webdriver.Firefox()
# go to page
driver.get("http://twitter.com/RussiaUN/media")
#You can adjust it but this works fine
SCROLL_PAUSE_TIME = 2
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
# Now that the page is fully scrolled, grab the source code.
src = driver.page_source
#Past it into BS
soup = BeautifulSoup(src, 'html.parser')
#divs = soup.find_all('div',class_='account')
divs = soup.find_all('div', {"data-user-id" : re.compile(r".*")})
#PRINT RESULT
#print('printing results')
#for div in divs:
# print(div['data-user-id'])
#SAVE IN FILE
print('Saving results')
#with open('file2.csv','w') as f:
# for div in divs:
# f.write(div['data-user-id']+'\n')
with open('file.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for div in divs:
        writer.writerow([div['data-user-id']])
- But I would also like to scrape the usernames and then organise all this data in a CSV with an IDS column and a USERNAMES column.
So my guess is that I have to modify this piece of code first:
divs = soup.find_all('div', {"data-user-id" : re.compile(r".*")})
But I can't find a way to achieve that...
- Then I also have a problem with duplicates. As you can see in the code, there are two ways to scrape the data:
1 #divs = soup.find_all('div',class_='account')
2 divs = soup.find_all('div', {"data-user-id" : re.compile(r".*")})
The first line seemed to work but was not efficient enough. Number 2 works fine but seems to give me duplicates at the end, as it goes through all the divs and not only those with class_='account'.
I'm sorry if some feel that I'm a bit spammy here, as I posted 3 questions in 24h... And thanks to those who helped and will be helping.
Python has a built-in csv module for writing CSV files.
Also, the scroll script you used did not seem to work: it was not scrolling all the way down and stopped after a certain amount of time, so I only got about 1,400 records in the CSV file with your script. I have replaced it with the Page Down key. You may want to tweak no_of_pagedowns to control how far you scroll down. Even with 200 page-downs I got about 2,200 records. Note that this number is without removing the duplicates.
I have added some additional modifications to write only the unique data to the file.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv

driver = webdriver.Firefox()
driver.get("http://twitter.com/RussiaUN/media")
time.sleep(1)

elem = driver.find_element_by_tag_name("html")
no_of_pagedowns = 200
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(2)
    no_of_pagedowns -= 1

src = driver.page_source
soup = BeautifulSoup(src, 'html.parser')
divs = soup.find_all('div', class_='account')

all_data = []
# keep only unique data
for div in divs:
    single = [div['data-user-id'], div['data-screen-name']]
    if single not in all_data:
        all_data.append(single)

with open('file.csv', 'w') as f:
    writer = csv.writer(f, delimiter=",")
    # headers
    writer.writerow(["ID", "USERNAME"])
    writer.writerows(all_data)
Output
ID,USERNAME
255493944,MID_RF
2230446228,Rus_Emb_Sudan
1024596885661802496,ambrus_drc
2905424987,Russie_au_Congo
2174261359,RusEmbUganda
285532415,tass_agency
34200559,rianru
40807205,kpru
177502586,nezavisimaya_g
23936177,vzglyad
255471924,mfa_russia
453639812,pass_blue
...
If you want the duplicates, just remove the if condition:
for div in divs:
    single = [div['data-user-id'], div['data-screen-name']]
    all_data.append(single)
