Handling pagination in Python Playwright when the URL doesn't change

I am trying to scrape this site https://franchisedisclosure.gov.au/Register with Playwright, and the URL doesn't change after you click on the next button. How do I solve this pagination problem?
Here's my code
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=50)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')
    page.is_visible('tbody')
    html = page.inner_html('table.table.table-hover')
    soup = bs(html, 'html.parser')
    table = soup.find('tbody')
    rows = table.findAll('tr')

    names = []
    industry = []
    Locations = []

    for row in rows:
        info = row.findAll('td')
        name = info[0].text.strip()
        industry = info[1].text.strip()
        Locations = info[2].text.strip()
I've checked online and every solution I see involves the URL changing. And for some reason, you can't seem to make requests to the site's API directly; Postman said something about the parameters not being sent.

With some small adjustments you can get it. Let's try this:
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    names = []
    industry = []
    Locations = []

    # When you click to the next page, an element with the text "Loading..." appears, so we save that locator
    loading_icon = "//strong[text()='Loading...']"
    # This is the "next page" button
    next_page_locator = "//ul[@class='pagination']/li[3]"

    # Select the option of 50 elements per page
    page.select_option("#perPageCount", value="50")

    # Wait for the loading icon to become visible and then hidden, which means the new list is fully loaded
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    time.sleep(1)

    # Loop until the "Next page" button is disabled, which means there are no more pages to paginate
    while "disabled" not in page.get_attribute(selector=next_page_locator, name="class"):
        # Get the info you wanted
        page.is_visible('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        table = soup.find('tbody')
        rows = table.findAll('tr')
        for row in rows:
            info = row.findAll('td')
            names.append(info[0].text.strip())
            industry.append(info[1].text.strip())
            Locations.append(info[2].text.strip())
        # Once we have the info, click next page and wait for the loading element to be visible and then hidden
        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)

Thanks for the great question... and answer. In addition to (or instead of) using the loading_icon, you could also wait for "networkidle", expanding on @Jaky Ruby's answer by adding page.wait_for_load_state(state="networkidle"). I often use the networkidle option to check that the next page has finished loading; I've read somewhere it's not necessarily best practice, but it works quite often.
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    names = []
    industry = []
    Locations = []

    # When you click to the next page, an element with the text "Loading..." appears, so we save that locator
    loading_icon = "//strong[text()='Loading...']"
    # This is the "next page" button
    next_page_locator = "//ul[@class='pagination']/li[3]"

    # Select the option of 50 elements per page
    page.select_option("#perPageCount", value="50")

    # Wait for the loading icon to become visible and then hidden, which means the new list is fully loaded
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    page.wait_for_load_state(state="networkidle")
    time.sleep(1)

    # Loop until the "Next page" button is disabled, which means there are no more pages to paginate
    while "disabled" not in page.get_attribute(selector=next_page_locator, name="class"):
        # Get the info you wanted
        page.is_visible('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        table = soup.find('tbody')
        rows = table.findAll('tr')
        for row in rows:
            info = row.findAll('td')
            names.append(info[0].text.strip())
            industry.append(info[1].text.strip())
            Locations.append(info[2].text.strip())
        # Once we have the info, click next page and wait for the loading element to be visible and then hidden
        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)
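Another option, if the table turns out to be filled from an XHR rather than rendered server-side, is to capture the response Playwright sees when you click the next-page button and read the data from there instead of parsing the HTML. This is only a sketch under that assumption; the "Register" URL filter and the JSON idea are guesses, so check the request the page actually fires in your browser's network tab:

from playwright.sync_api import sync_playwright

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    next_page_locator = "//ul[@class='pagination']/li[3]"

    # Wait for the request the click triggers and read its body directly.
    # The URL filter below is hypothetical; inspect the network tab for the real endpoint.
    with page.expect_response(lambda r: "Register" in r.url) as resp_info:
        page.click(next_page_locator)
    response = resp_info.value
    print(response.url, response.status)
    # If the endpoint returns JSON, response.json() gives you the rows without any HTML parsing.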

Related

Problem: Python scrape with requests + Selenium

Hi guys,
I have a problem scraping this dynamic site (https://kvartiry-bolgarii.ru/). I need to get all the links to the home sale ads.
I used Selenium to load the page and get the links to the ads, then I scroll the page down to load new ads. After the new ads are loaded, I parse all the links on the page again and write them to the list.
But the data in the list is not updated, and the script keeps working with the links that were on the page before scrolling down.
By the way, I added a check so that the script runs until the last announcement on the site (whose link I found out in advance) appears in the list.
How can this problem be corrected? My code is below.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

options = webdriver.ChromeOptions()  # the question's `options` object, assumed to be ChromeOptions

def get_link_info():
    try:
        url = "https://kvartiry-bolgarii.ru/"
        driver = webdriver.Chrome(
            executable_path=r'C:\Users\kk\Desktop\scrape_house\drivers\chromedriver.exe',
            options=options
        )
        driver.get(url)
        req = requests.get(url)
        req.encoding = 'utf8'
        soup = BeautifulSoup(req.text, "lxml")
        articles = soup.find_all("div", class_="content")
        links_urls = []
        for article in articles:
            house_url = article.find("a").get("href")
            links_urls.append(house_url)
        # print(links_urls)
        first_link_number = links_urls[-2].split("-")[-1]
        first_link_number = first_link_number[1:]
        # print(first_link_number)
        last_link_number = links_urls[-1].split("-")[-1]
        last_link_number = last_link_number[1:]
        # print(last_link_number)
        html = driver.find_element_by_tag_name('html')
        html.send_keys(Keys.END)
        check = "https://kvartiry-bolgarii.ru/kvartira-v-elitnom-komplekse-s-unikalynym-sadom-o21751"
        for a in links_urls:
            if a != check:
                for article in articles:
                    house_url = article.find("a").get("href")
                    links_urls.append(house_url)
                html = driver.find_element_by_tag_name('html')
                html.send_keys(Keys.END)
                print(links_urls[-1])
            else:
                print(links_urls[0], links_urls[-1])
                print("all links are ready")
    except Exception as e:
        raise e
Some pointers. You don't need to mix Selenium, requests, and BeautifulSoup; Selenium alone is enough. When you are scrolling infinitely, you need to remove duplicate elements before adding them to your list.
You can try this. This should work.
from selenium import webdriver
import time

def get_link_info():
    all_links = []
    try:
        driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
        driver.get('https://kvartiry-bolgarii.ru/')
        time.sleep(3)
        old_links = set()  # Empty set
        while True:
            # Scroll to get more ads
            driver.execute_script("window.scrollBy(0,3825)", "")
            # Wait for new ads to load
            time.sleep(8)
            links_divs = driver.find_elements_by_xpath('//div[@class="content"]//a')  # Find elements
            ans = set(links_divs) - set(old_links)  # Keep only the elements we haven't seen yet
            for link in ans:
                # Scroll to the link
                driver.execute_script("arguments[0].scrollIntoView();", link)
                fir = link.get_attribute('href')
                all_links.append(fir)
            # Remember what we've already collected so duplicates are skipped next time
            old_links = links_divs
    except Exception as e:
        raise e

get_link_info()
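An alternative to scrolling by a fixed pixel step, if you prefer not to guess offsets, is to keep scrolling to the bottom until document.body.scrollHeight stops growing, then collect all the links once. A sketch under that assumption (the 5-second pause is arbitrary and may need tuning for this site):

from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
driver.get('https://kvartiry-bolgarii.ru/')

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll to the bottom and give the lazy loader time to append new ads
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # nothing new was loaded, assume we reached the end
    last_height = new_height

# Collect every ad link once the full list is in the DOM
links = [a.get_attribute('href')
         for a in driver.find_elements_by_xpath('//div[@class="content"]//a')]
print(len(links))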

Why Does My Code Scrape The First Record Only?

My code goes into a website and clicks on records, which causes drop-downs.
My current code only prints the first drop-down record, and not the others.
For example, the first record of the webpage, when clicked, drops down one record (shown in the attached screenshot). This is also the first and only drop-down record that gets printed as my output.
How do I get it to pull all drop-down titles?
from selenium import webdriver
import time

driver = webdriver.Chrome()
for x in range(1, 2):
    driver.get(f'https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page={x}')
    time.sleep(4)
    productlist_length = len(driver.find_elements_by_xpath("//div[@class='accordin_title']"))
    for i in range(1, productlist_length + 1):
        product = driver.find_element_by_xpath("(//div[@class='accordin_title'])[" + str(i) + "]")
        title = product.find_element_by_xpath('.//h4').text.strip()
        print(title)
        buttonToClick = product.find_element_by_xpath('.//div[@class="sign"]')
        buttonToClick.click()
        time.sleep(5)
        subProduct = driver.find_element_by_xpath(".//li[@class='sub_accordin_presentation']")
        otherTitle = subProduct.find_element_by_xpath('.//h4').text.strip()
        print(otherTitle)
You don't need selenium at all. I'm not sure exactly which info you are after, but the following shows that the content, including what's inside those expand blocks, is available in the response from a simple requests.get():
import requests
from bs4 import BeautifulSoup as bs
import re

r = requests.get('https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1')
soup = bs(r.text, 'lxml')
sessions = soup.select('#accordin > ul > li')

for session in sessions:
    print(session.select_one('h4').text)
    sub_session = session.select('.sub_accordin_presentation')
    if sub_session:
        print([re.sub(r'[\n\s]+', ' ', i.text) for i in sub_session])
    print()
    print()
Try:
products = driver.find_elements_by_xpath('//*[@class="jscroll-inner"]/ul/li')
for product in products:
    title = product.find_element_by_xpath('(.//*[@class="accordin_title"]/div)[3]/h4').text
    print(title)
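If you'd rather keep the structure of the original code, the immediate issue is that find_element_by_xpath returns only the first match, while find_elements_by_xpath returns them all. A sketch of the inner part of the question's loop with that one change (the selectors are copied from the question and whether the sub items sit under the accordin_title element depends on the page's markup, so the relative XPath may need adjusting):

# Inside the question's loop, after buttonToClick.click() and the sleep:
# use find_elements (plural) so every expanded row is returned, not just the first.
sub_products = product.find_elements_by_xpath(".//li[@class='sub_accordin_presentation']")
for sub_product in sub_products:
    other_title = sub_product.find_element_by_xpath('.//h4').text.strip()
    print(other_title)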

Scraping after selecting all in a scrolling menu

I am trying to retrieve information from a table at this link: https://ski-resort-stats.com/ski-resorts-in-europe/
The page has a drop-down menu, which I must act on first so that all the entries are on the page and can be selected.
But when I retrieve the info afterwards, it does not do it for the whole table... I tried adding a sleep between the two actions in case it was linked to that, but nothing changed. Could someone help me with that?
Here is my code below:
driver = webdriver.Chrome("path/chromedriver")
driver.get("https://ski-resort-stats.com/ski-resorts-in-europe/")
content = driver.page_source
soup = BeautifulSoup(content)
#Select "All" in the drop down menu to select all the ski resorts
menu=driver.find_element_by_id("table_1_length")
for option in menu.find_elements_by_tag_name('option'):
if option.text == 'All':
option.click()
break
import time
time.sleep(10)
mydivs = soup.find_all("td",{"class":"column-resort-name"})
print(mydivs)
So the last element printed of mydivs is not the last element of the table...
All data is already in the page in the <table>:
import requests
from bs4 import BeautifulSoup
url = "https://ski-resort-stats.com/ski-resorts-in-europe/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
# print some data from rows
for row in soup.select("#table_1 tbody tr"):
r = [td.get_text(strip=True) for td in row.select("td")]
print(r[1])
Prints:
Hemsedal
Geilosiden Geilo
Golm
Hafjell
Voss
Hochschwarzeck
Rossfeld - Berchtesgaden - Oberau
...
Puigmal
Kranzberg-Mittenwald
Wetterstein lifts-Wettersteinbahnen-– Ehrwald
Stuhleck-Spital am Semmering
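As an aside, the reason the Selenium attempt in the question misses rows is that the soup was built from driver.page_source before "All" was selected, so it can only ever contain the first page of the table. If you want to stay with Selenium, re-read the page source after the click and the wait; a sketch along those lines, reusing the question's selectors:

from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome("path/chromedriver")
driver.get("https://ski-resort-stats.com/ski-resorts-in-europe/")

# Select "All" first, so every resort is rendered in the table
menu = driver.find_element_by_id("table_1_length")
for option in menu.find_elements_by_tag_name('option'):
    if option.text == 'All':
        option.click()
        break

time.sleep(10)

# Only now grab the page source, after the full table has been rendered
soup = BeautifulSoup(driver.page_source, "html.parser")
names = [td.get_text(strip=True) for td in soup.find_all("td", {"class": "column-resort-name"})]
print(len(names), names[-1])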

Selecting multiple options on an unchanging URL

I need to scrape content from the website by selecting state, district, and block from the drop-down menus.
I tried using Python requests and POSTs, but I'm not able to scrape the content properly, as the URL of the website never changes for the options I choose.
This is the code I've tried so far:
# importing all necessary packages
import urllib3
import requests
from bs4 import BeautifulSoup
import csv

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = "http://swachhbharatmission.gov.in/tsc/Report_NBA/Panchayat/Rpt_SarpanchDetail.aspx"
session = requests.Session()
html = session.get(url, verify=False).content
soup = BeautifulSoup(html, "lxml")
option = soup.find("select", {"name": "ctl00$ContentPlaceHolder1$ddlState"}).findAll("option")

# create dictionary 'states' mapping each state to its code
states = {}
for elem in option[1:]:
    key = elem['value']
    value = elem.text
    states[key] = value

for state in states.keys():
    payload_ano = {'ctl00$ContentPlaceHolder1$ddlState': str(state)}
    r = requests.post(url, data=payload_ano, verify=False)
    break

soup = BeautifulSoup(r.text, "html.parser")
option = soup.find("select", {"name": "ctl00$ContentPlaceHolder1$ddlDistrict"}).findAll("option")
option  # only gives [<option selected="selected" value="%">All District</option>] from the home page, not the districts of the chosen state
I have used a break statement so the code terminates early. Now the problem is that the variable option in the final line should contain the contents of the district drop-down for the chosen state, but it only shows the content of the home page.
Any help or suggestions would be really appreciated.
You can use selenium to select an option from the drop downs.
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()
driver.get('http://swachhbharatmission.gov.in/tsc/Report_NBA/Panchayat/Rpt_SarpanchDetail.aspx')

# get state options
state_element = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_ddlState"]')
state_select = Select(state_element)
state_options = [state_option.text for state_option in state_select.options]

# choose state option number
print('\nselect state:')
for i, state in enumerate(state_options):
    print(f'{i+1} - {state.strip()}')
state = input(':- ')

# select state option
state_selected = driver.find_element_by_xpath(f'//*[@id="ctl00_ContentPlaceHolder1_ddlState"]/option[{state}]')
state_selected.click()

# get district options
district_element = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_ddlDistrict"]')
district_select = Select(district_element)
district_options = [district_option.text for district_option in district_select.options]

# choose district option number
print('\nselect district:')
for i, district in enumerate(district_options):
    print(f'{i+1} - {district.strip()}')
district = input(':- ')

# select district option
district_selected = driver.find_element_by_xpath(f'//*[@id="ctl00_ContentPlaceHolder1_ddlDistrict"]/option[{district}]')
district_selected.click()

# get block options
block_element = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_ddlBlock"]')
block_select = Select(block_element)
block_options = [block_option.text for block_option in block_select.options]

# choose block option number
print('\nselect block:')
for i, block in enumerate(block_options):
    print(f'{i+1} - {block.strip()}')
block = input(':- ')

# select block option
block_selected = driver.find_element_by_xpath(f'//*[@id="ctl00_ContentPlaceHolder1_ddlBlock"]/option[{block}]')
block_selected.click()

# get data of each record
try:
    table_element = driver.find_element_by_css_selector('table.Table')
except NoSuchElementException:
    print('\nRecord not found')
else:
    table_rows = table_element.find_elements_by_css_selector('table.Table tr')
    print('\nGrampanchayat Sarpanch Details')
    for table_row in table_rows[2:]:
        table_cols = table_row.find_elements_by_css_selector('table.Table tr td')
        for table_col in table_cols:
            print(table_col.text, end=',\t')
        print()
Note:
You need to download Chrome Driver into your project folder.
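If you'd rather stay with requests, the reason the original POST returned the home page is most likely that this is an ASP.NET WebForms page: a postback is generally only honoured when the hidden __VIEWSTATE and __EVENTVALIDATION fields, plus an __EVENTTARGET naming the drop-down, are sent back with it. A rough sketch of that idea, not verified against this particular site (the exact hidden fields present can vary, and the state code value is hypothetical):

import requests
from bs4 import BeautifulSoup

url = "http://swachhbharatmission.gov.in/tsc/Report_NBA/Panchayat/Rpt_SarpanchDetail.aspx"
session = requests.Session()
soup = BeautifulSoup(session.get(url, verify=False).content, "lxml")

def hidden_fields(soup):
    # Collect the ASP.NET state fields (__VIEWSTATE, __EVENTVALIDATION, ...) from the current page
    return {inp.get("name"): inp.get("value", "")
            for inp in soup.select("input[type=hidden]") if inp.get("name")}

state_code = "1"  # hypothetical value; take it from the ddlState <option> values
payload = hidden_fields(soup)
payload.update({
    "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ddlState",  # tells WebForms which control fired the postback
    "ctl00$ContentPlaceHolder1$ddlState": state_code,
})
r = session.post(url, data=payload, verify=False)
soup = BeautifulSoup(r.text, "lxml")

districts = soup.find("select", {"name": "ctl00$ContentPlaceHolder1$ddlDistrict"}).find_all("option")
print([d.text for d in districts])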

How to loop scrape data inside span (arrows) and loop it all in consecutive pages?

This is my first time trying to use Python with Selenium and bs4.
I'm trying to scrape data from this website.
To begin, I select GE from the cantone drop-down menu, tick the checkbox "Conffermo" and click the button "Ricerca". Then I can see the data. I have to click each arrow to expand the data and scrape it for every person (this is a loop, isn't it?). And then do the same on the next page (by clicking "Affiggere le seguenti entrate" at the bottom of the page).
I'd like to use relative XPaths for the data since not all persons have all the data (I'd like to put an empty cell in Excel when data is missing).
This is my code so far:
import time
import urllib2
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

browser = webdriver.Firefox()
URL = 'http://www.asca.ch/Partners.aspx?lang=it'
browser.get(URL)
time.sleep(10)

# query the website and return the html to the variable 'page'
page = urllib2.urlopen(URL)
soup = BeautifulSoup(page, 'html.parser')

inputElementCantone = browser.find_element_by_xpath('//*[@id="ctl00_MainContent_ddl_cantons_Input"]').click()
browser.find_element_by_xpath('/html/body/form/div[1]/div/div/ul/li[9]').click()
browser.find_element_by_xpath('//INPUT[@id="MainContent__chkDisclaimer"]').click()
browser.find_element_by_xpath('//INPUT[@id="MainContent_btn_submit"]').click()
arrow = browser.find_element_by_class_name("footable-toggle")
I'm stuck after this. The data I'd like to scrape (into Excel columns) is: Discipline(s) thérapeutique(s), Cognome, Cellulare, and email.
Any help is appreciated.
# To find the table
table = soup.find('table', {'class': 'footable'})
# To get all rows in that table
rows = table.find_all('tr')

# A function to process each row
def processRow(row):
    # All cells with hidden data
    dataFields = row.find_all('td', {'style': True})
    output = {}
    # Fixed index numbers are not ideal but in this case will work
    output['Discipline'] = dataFields[0].text
    output['Cognome'] = dataFields[2].text
    output['Cellulare'] = dataFields[8].text
    output['email'] = dataFields[10].text
    return output

# Declaring a list to store all results
results = []
# Iterating over all the rows and storing the processed result in a list
for row in rows:
    results.append(processRow(row))

print(results)
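Since the goal is Excel columns, the results list of dicts built above can be written straight to a CSV file, which Excel opens directly. A small follow-up sketch (the output filename is arbitrary):

import csv

# `results` is the list of dicts built above
fieldnames = ['Discipline', 'Cognome', 'Cellulare', 'email']
with open('asca_partners.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)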
