Using selenium to scrape paginated table data (Python) - python

I have this table: https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table?page=1. It's paginated, and I want to scrape all of the table's content from page 1 through to the last page. I am trying to use an XPath but can't seem to get it to work.
Here is my code, any help welcome!
from selenium import webdriver
from selenium.webdriver.common.by import By
import os
co = webdriver.ChromeOptions()
co.add_argument('--headless')
# co.add_argument('--ignore-certificate-errors')
# co.add_argument('--no-proxy-server')
# co.add_argument("--proxy-server='direct://'")
# co.add_argument("--proxy-bypass-list=*")
driver = webdriver.Chrome(executable_path="C:/Users/user/Desktop/IG Trading/chromedriver.exe", chrome_options=co)
driver.get('https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table?page=1')
stock_names = driver.find_elements(By.XPATH, '/html/body/app-root/app-handshake/div/app-page-content/app-filter-toggle/app-ftse-index-table/section/table')
print(stock_names)
# for stock_name in stock_names:
#     print(stock_name)
#     text = stock_name.text
#     print(text)

This is one way you can obtain that information:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options as Firefox_Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd
from tqdm import tqdm
firefox_options = Firefox_Options()
# firefox_options.add_argument("--width=1500")
# firefox_options.add_argument("--height=500")
# firefox_options.headless = True
driverService = Service('chromedriver/geckodriver')
browser = webdriver.Firefox(service=driverService, options=firefox_options)
big_df = pd.DataFrame()
browser.get('https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table')
try:
    WebDriverWait(browser, 3).until(EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))).click()
    print('accepted cookies')
except Exception as e:
    print('no cookie button!')
t.sleep(2)
for i in tqdm(range(1, 40)):
    browser.get(f'https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table?page={i}')
    t.sleep(1)
    df = pd.read_html(WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "table[class='full-width ftse-index-table-table']"))).get_attribute('outerHTML'))[0]
    big_df = pd.concat([big_df, df], axis=0, ignore_index=True)
print(big_df)
big_df.to_csv('lse_companies.csv')
print('all done')
browser.quit()
This will print the combined dataframe in the terminal once all pages are scraped, and also save it as a CSV file on disk (in the folder you run the script from). My setup is Firefox/geckodriver on Linux, but you can adapt it to your own; just keep the imports and the logic that follows the browser/driver definition.
Selenium docs: https://www.selenium.dev/documentation/
TQDM: https://pypi.org/project/tqdm/
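
If you would rather stay with Chrome on Windows, as in your original snippet, the same loop can be adapted roughly as below. This is only a minimal sketch, not a drop-in replacement: the chromedriver path is the one from your own code and the upper page bound of 40 is taken from the loop above, so adjust both to your setup.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

options = Options()
options.add_argument('--headless')
# the path below is the one from your own snippet -- point it at your chromedriver.exe
service = Service("C:/Users/user/Desktop/IG Trading/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)

big_df = pd.DataFrame()
for page in range(1, 40):  # adjust the upper bound to the real number of pages
    driver.get(f'https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table?page={page}')
    # wait for the table, then hand its HTML to pandas
    table_html = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table[class='full-width ftse-index-table-table']"))
    ).get_attribute('outerHTML')
    big_df = pd.concat([big_df, pd.read_html(table_html)[0]], ignore_index=True)

driver.quit()
big_df.to_csv('lse_companies.csv', index=False)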


Retrieving specific matches from a list in python

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from time import sleep
from datetime import datetime
import pandas as pd
import warnings
import os
os.chdir('C:/Users/paulc/Documents/Medium Football')
warnings.filterwarnings('ignore')
base_url = 'https://www.sportingindex.com/spread-betting/football/international-world-cup'
option = Options()
option.headless = False
driver = webdriver.Chrome("C:/Users/paulc/Documents/Medium Football/chromedriver.exe",options=option)
driver.get(base_url)
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME,"a")]
This code retrieves all the href links on the page. I want to search the links list and return only the matches that contain 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'.
However, I get AttributeError: 'NoneType' object has no attribute 'startswith' when using
import re
[x for x in links if x.startswith('https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]
Help is appreciated.
Instead of collecting all a elements on the page, which gives a lot of irrelevant results, you can use a more precise locator.
So, instead of
driver.find_elements(By.TAG_NAME, "a")
Use this:
driver.find_elements(By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")
This will give you the desired elements only.
And this
links = [elem.get_attribute("href") for elem in driver.find_elements(By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")]
will directly give you the wanted links only.
Update:
In case this gives you an empty list, you are probably missing a delay. You can simply add a pause before that line, like time.sleep(2), but it's better to use a WebDriverWait with expected_conditions (an explicit wait) instead.
I can't check it myself since my computer blocks that link (company policy, as it's a gambling site), but normally something like this should work:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
links = [elem.get_attribute("href") for elem in wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")))]
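As an aside, the AttributeError from your original list comprehension happens because get_attribute("href") returns None for anchors that have no href, and None has no startswith. If you do want to keep the collect-everything approach, a minimal guard (a sketch, reusing the driver from your snippet) is to skip the None values first:

from selenium.webdriver.common.by import By

prefix = 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME, "a")]
# get_attribute("href") can be None, so test for that before calling startswith
group_a_links = [x for x in links if x and x.startswith(prefix)]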
The following code filters the links to grab only the ones you want:
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get('https://www.sportingindex.com/spread-betting/football/international-world-cup')
driver.maximize_window()
time.sleep(8)
soup = BeautifulSoup(driver.page_source,"lxml")
for u in soup.select('a[class="gatracking"]'):
    link = 'https://www.sportingindex.com' + u.get('href')
    if '-v-' in link:
        print(link)
Output:
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.dd7a995d-7478-45f8-af27-9f234d37cc76/ecuador-v-senegal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.92232207-0f1e-4bb1-bacd-1332ef6b9007/netherlands-v-qatar
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.b913620e-69c7-4606-a153-7b48589b7c94/iran-v-usa
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7a4a18fb-d4ee-4880-849f-f1afdea33cd5/wales-v-england
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.20c098b4-4e97-4fd1-97b0-f42d84424361/australia-v-denmark
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5a7476e2-8d35-4a8e-8065-b4339e79f395/tunisia-v-france
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.8a869f02-9dd0-49c5-91bd-209ee224fc2a/poland-v-argentina
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.6379b787-f246-4ba4-a896-28a97396d02f/saudi-arabia-v-mexico
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.52737cfd-da19-42dd-b15b-c16c3e8e9a86/canada-v-morocco
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.168fab1f-8360-4e87-ba84-bfbd11a4a207/croatia-v-belgium
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.9fb541f0-43a4-409c-8e54-e34a43965714/costa-rica-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7379c8a7-ab5d-4653-b487-22bf7ff8eefe/japan-v-spain
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e7e4c6be-98b7-4258-ba40-74c54a790fe1/ghana-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e4c18c81-565e-47ce-b08d-9aed62c88a5d/south-korea-v-portugal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.18f44028-e23d-48d4-970b-e75c164589bd/cameroon-v-brazil
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.526f9b1b-6d95-4f44-abce-e0a6a30acfd4/serbia-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay

Selenium not clearing website search bar with .clear()

I'm scraping a website where the search bar auto-fills with the last thing I searched. If I use .clear(), the search bar still appends the last-searched item.
for sku in skus_to_find:
    search_bar = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, "//input[contains(@accesskey, 'S')]")))
    search_bar.clear()
    search_bar.send_keys(sku)
    search_bar.send_keys(Keys.RETURN)
How can I better clear out the search bar on a web page prior to .send_keys()?
I added logic to click the cross (clear) icon, so the text box is cleared before you enter a new SKU.
from bs4 import BeautifulSoup
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# PATH = "C:\Program Files (x86)\chromedriver_win32\chromedriver.exe"
baseurl = "http://www.zoro.com"
driver = webdriver.Chrome()
driver.get(baseurl)

def use_driver_current_html(driver):
    soup = BeautifulSoup(driver.page_source, 'lxml')
    return soup

file = open('zoro_skus.txt')
skus_to_find = []
product_list = []
for line in file:
    line = line.replace('\n', '')
    skus_to_find.append(line)

for sku in skus_to_find:
    search_bar = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, "//input[contains(@accesskey, 'S')]")))
    time.sleep(2)
    search_bar.send_keys(sku)
    search_bar.send_keys(Keys.RETURN)
    # below try and except added
    try:
        cross_click = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "(//*[@class='svg-icon zcl-icon zcl-icon--small'])[2]")))
        cross_click.click()
    except Exception as e:
        print(e)
        pass
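
If the cross icon is not always present, another common workaround is to select whatever is already in the field and delete it before typing the new SKU. A minimal sketch, reusing the same search_bar element (Keys.CONTROL assumes Windows/Linux; use Keys.COMMAND on macOS):

from selenium.webdriver.common.keys import Keys

# select all existing text in the field and delete it, then type the new SKU
search_bar.send_keys(Keys.CONTROL, "a")
search_bar.send_keys(Keys.DELETE)
search_bar.send_keys(sku)
search_bar.send_keys(Keys.RETURN)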

Scraping a dynamically loaded table with BeautifulSoup

My code returns the values of the first two td tags in each row, but the ones after that come back empty.
My code:
import bs4 as bs
import requests
resp = requests.get('https://q.stock.sohu.com/cn/bk_4401.shtml')
resp.encoding = 'gb2312'
soup = bs.BeautifulSoup(resp.text, 'lxml')
tab_sgtsc_list = soup.find('table').find('tbody').find_all('tr')
for tab_sgtsc in tab_sgtsc_list:
    print('**************************************')
    print(tab_sgtsc.find_all('td')[0].text)
    print(tab_sgtsc.find_all('td')[1].text)
    print(tab_sgtsc.find_all('td')[2].text)
    print(tab_sgtsc.find_all('td')[3].text)
    print('**************************************')
The table is rendered dynamically by JavaScript so you won't get much from pure HTML.
However, selenium and pandas come to the rescue!
Required:
- Chrome driver
- selenium
- pandas (pip install pandas)
Here's how:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get("https://q.stock.sohu.com/cn/bk_4401.shtml")
wait = WebDriverWait(driver, 10)
element = wait.until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, 'table.tableMSB'))
).text.replace("点击按代码排序查询", "").split()
table = [element[i:i + 12] for i in range(0, len(element), 12)]
pd.DataFrame(table[1:], columns=table[0]).to_csv("your_table_data.csv", index=False)

Condition to check if Selenium is done scrolling based on web element?

Currently I have a script that goes to TripAdvisor and tries to scrape every image in a particular filter. I would like to know what condition to put in my if statement so that it breaks out of the while loop, after which I parse the list of URLs into clean links to each image. I am just not sure how I can tell that I have reached the end once I have processed the last web element. The if statement is right at the end, before the final printing loop. Any help is greatly appreciated!
# import dependencies
import io
import re
import time
import urllib.parse
import urllib.request
from datetime import datetime

import pandas as pd
import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
options = webdriver.ChromeOptions()
options.headless = False
# set the notification preference before the driver is created and pass the options in,
# otherwise the preference is never applied
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3", options=options)
driver.maximize_window()
#open up website
driver.get(
"https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
image_url = []
end = False
while not end:
    # wait until element is found and then store all webelements into list
    images = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
    # iterate through visible images and acquire their url based on background image style
    for index, image in enumerate(images):
        image_url.append(images[index].value_of_css_property("background-image"))
    # if you are at the end of the page then leave loop
    # if(length == end_length):
    #     end = True
    # move to next visible images in the array
    driver.execute_script("arguments[0].scrollIntoView();", images[-1])
    # wait one second
    time.sleep(1)
    if ():
        end = True
#clean the list to provide clear links
for i in range(len(image_url)):
    start = image_url[i].find("url(\"") + len("url(\"")
    end = image_url[i].find("\")")
    print(image_url[i][start:end])
#print(image_url)
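
One common way to decide when to break out of a scroll loop like this (a sketch, reusing the driver, waits, and locator from the snippet above rather than anything TripAdvisor-specific) is to compare how many elements the locator returns before and after each scroll; when the count stops growing, nothing new is being lazy-loaded and you have reached the end:

previous_count = 0
image_url = []
while True:
    images = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
    if len(images) == previous_count:
        break  # nothing new appeared since the last scroll, so we are at the end
    # only collect the newly loaded images, assuming new ones are appended after the old ones
    for image in images[previous_count:]:
        image_url.append(image.value_of_css_property("background-image"))
    previous_count = len(images)
    driver.execute_script("arguments[0].scrollIntoView();", images[-1])
    time.sleep(1)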

Python and scraping data. Getting a specific tab that has no link and going through multiple pages

I am new to scraping and also new to Python, but I have a script that goes to WhoScored.com, pulls some data for a specific league, and exports it to a .csv file. Here is the code:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
import time
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(executable_path ="C:\Program Files (x86)\Google\Chrome\chromedriver.exe")
#Choose any league and click on player statistics now copy that url here. for eg we want premier league data so we'll need the following url
website_URL ="https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/7811/Stages/17590/PlayerStatistics/England-Premier-League-2019-2020"
driver.get(website_URL)
page = 1
#Check the number of pages of data available of that league. For premier league it's 32 so we set max_page as 32
max_page=32
while True:
    try:
        if page > max_page:
            print("Last page reached")
            break
        page += 1
        for i in driver.find_elements_by_xpath("""//*[@id="player-table-statistics-body"]"""):
            p_db = i.get_attribute('innerHTML')
            p_db = '<table>' + p_db + '</table>'
            df = pd.read_html(p_db)[0]
            df.drop(df.columns[1], axis=1)
            df.to_csv('premier_league_ws.csv', mode='a', header=False, index=False)
        driver.find_element_by_link_text("next").click()
        time.sleep(5)
        print("Navigating to Next Page")
    except (TimeoutException, WebDriverException) as e:
        print("Last page reached")
        break
driver.quit()
So the chrome driver goes to https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/7811/Stages/17590/PlayerStatistics/England-Premier-League-2019-2020
Then, it pulls out all the data from the Summary Tab. And it only pulls the first page of players.
Can you help me get to the sub-tabs (Defensive/Offensive/Detailed) next to the Summary? There is no direct link to them. I need to pull all that information as well.
The script stops after the first 10 players and it does not go to the next page. How do I fix this?
Thank you!
This should fix your problem. But remember: every time you run this script it will accept the cookie banner on the site.
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
import time
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
# Finding the path to chromedriver.exe and loading the options
driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\chromedriver.exe", options=options)
# Choose any league and click on player statistics now copy that url here. for eg we want premier league data so we'll need the following url
website_URL = "https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/7811/Stages/17590/PlayerStatistics/England-Premier-League-2019-2020"
driver.get(website_URL)
page = 1
# Check the number of pages of data available of that league. For premier league it's 32 so we set max_page as 32
max_page = 32
# Finding the Cookie button and accepting it
time.sleep(2)
driver.find_element_by_xpath("""//*[@id="qcCmpButtons"]/button[2]""").click()
# Scrolling down 500 px
time.sleep(3)
driver.execute_script("window.scrollTo(0, 500)")
while True:
    try:
        if page > max_page:
            print("Last page reached")
            break
        page += 1
        for i in driver.find_elements_by_xpath("""//*[@id="player-table-statistics-body"]"""):
            p_db = i.get_attribute('innerHTML')
            p_db = '<table>' + p_db + '</table>'
            df = pd.read_html(p_db)[0]
            df.drop(df.columns[1], axis=1)
            df.to_csv('premier_league_ws.csv', mode='a', header=False, index=False)
        time.sleep(5)
        driver.find_element_by_link_text("next").click()
        print("Navigating to Next Page")
    except (TimeoutException, WebDriverException) as e:
        print("Last page reached")
        break
driver.quit()
Things I changed or added:
# Finding the path to chromedriver.exe and loading the options
driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\chromedriver.exe", options=options)
# Finding the Cookie button and accepting it
time.sleep(2)
driver.find_element_by_xpath("""//*[@id="qcCmpButtons"]/button[2]""").click()
# Scrolling down 500 px
time.sleep(3)
driver.execute_script("window.scrollTo(0, 500)")
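
The sub-tabs (Defensive/Offensive/Detailed) are not covered above. One thing to try is clicking each tab by its link text before re-reading the table with the same read_html logic as in the loop. This is only a sketch: the tab labels and the fixed sleep are assumptions, so check them against the actual page markup and adjust as needed.

# Sketch: switch to each sub-tab and append its table to a separate CSV.
# The link texts below are assumptions -- verify the exact labels on the page.
for tab_name in ["Defensive", "Offensive", "Detailed"]:
    try:
        driver.find_element_by_link_text(tab_name).click()
        time.sleep(5)  # crude wait for the table to reload; an explicit wait would be more robust
        for i in driver.find_elements_by_xpath("""//*[@id="player-table-statistics-body"]"""):
            p_db = '<table>' + i.get_attribute('innerHTML') + '</table>'
            pd.read_html(p_db)[0].to_csv(f'premier_league_{tab_name.lower()}.csv', mode='a', header=False, index=False)
    except (TimeoutException, WebDriverException):
        print(f"Could not open the {tab_name} tab")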
