I need to scrape content from a website by selecting state, district and block from the drop-down menus.
I tried using Python requests with GET and POST, but I'm not able to scrape the content properly, as the URL of the website never changes for the options I choose.
This is the code I've tried so far:
# importing all necessary packages
import urllib3
import requests
from bs4 import BeautifulSoup
import csv
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
url = "http://swachhbharatmission.gov.in/tsc/Report_NBA/Panchayat/Rpt_SarpanchDetail.aspx"
session = requests.Session()
html = session.get(url, verify=False).content
soup = BeautifulSoup(html, "lxml")
option = soup.find("select",{"name":"ctl00$ContentPlaceHolder1$ddlState"}).findAll("option")
# create dictionary 'states' mapping each state code to its name
states = {}
for elem in option[1:]:
    key = elem['value']
    value = elem.text
    states[key] = value

for state in states.keys():
    payload_ano = {'ctl00$ContentPlaceHolder1$ddlState': str(state)}
    r = requests.post(url, data=payload_ano, verify=False)
    break
soup = BeautifulSoup(r.text,"html.parser")
option = soup.find("select",{"name":"ctl00$ContentPlaceHolder1$ddlDistrict"}).findAll("option")
option # only gives [<option selected="selected" value="%">All District</option>] from the home page and not the districts inside the state chosen
I have used a break statement so the code terminates early. Now the problem is that the variable option in the final line should contain the contents of the district drop-down for the chosen state, but it only shows the content of the home page.
Any help or suggestions would be really appreciated.
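For context, this page is an ASP.NET WebForms app: choosing a state triggers a post-back that also has to resend the page's hidden form fields (__VIEWSTATE, __EVENTVALIDATION, ...) together with an __EVENTTARGET naming the dropdown, which is most likely why a POST carrying only the dropdown value comes back with the home page. A rough, untested sketch of that idea with requests (the exact set of hidden fields on this page, and whether the server accepts this post-back, are assumptions):
import requests
from bs4 import BeautifulSoup

url = "http://swachhbharatmission.gov.in/tsc/Report_NBA/Panchayat/Rpt_SarpanchDetail.aspx"

def districts_for_state(state_code):
    # sketch: post back the state dropdown and return the district <option> tags
    session = requests.Session()
    soup = BeautifulSoup(session.get(url, verify=False).content, "lxml")
    # carry over every hidden ASP.NET field (__VIEWSTATE, __EVENTVALIDATION, ...)
    payload = {inp["name"]: inp.get("value", "")
               for inp in soup.find_all("input", {"type": "hidden"}) if inp.get("name")}
    # name the control that triggered the post-back and the value that was chosen
    payload["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$ddlState"
    payload["ctl00$ContentPlaceHolder1$ddlState"] = state_code
    r = session.post(url, data=payload, verify=False)
    new_soup = BeautifulSoup(r.text, "html.parser")
    return new_soup.find("select", {"name": "ctl00$ContentPlaceHolder1$ddlDistrict"}).find_all("option")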
You can use Selenium to select an option from the drop-downs.
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException
driver = webdriver.Chrome()
driver.get('http://swachhbharatmission.gov.in/tsc/Report_NBA/Panchayat/Rpt_SarpanchDetail.aspx')
# get state options
state_element = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_ddlState"]')
state_select = Select(state_element)
state_options = [state_option.text for state_option in state_select.options]
# choose state option number
print('\nselect state:')
for i, state in enumerate(state_options):
    print(f'{i+1} - {state.strip()}')
state = input(':- ')
# select state option
state_selected = driver.find_element_by_xpath(f'//*[@id="ctl00_ContentPlaceHolder1_ddlState"]/option[{state}]')
state_selected.click()
# get district options
district_element = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_ddlDistrict"]')
district_select = Select(district_element)
district_options = [district_option.text for district_option in district_select.options]
# choose district option number
print('\nselect district:')
for i, district in enumerate(district_options):
    print(f'{i+1} - {district.strip()}')
district = input(':- ')
# select district option
district_selected = driver.find_element_by_xpath(f'//*[@id="ctl00_ContentPlaceHolder1_ddlDistrict"]/option[{district}]')
district_selected.click()
# get block options
block_element = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_ddlBlock"]')
block_select = Select(block_element)
block_options = [block_option.text for block_option in block_select.options]
# choose block option number
print('\nselect block:')
for i, block in enumerate(block_options):
    print(f'{i+1} - {block.strip()}')
block = input(':- ')
# select block option
block_selected = driver.find_element_by_xpath(f'//*[@id="ctl00_ContentPlaceHolder1_ddlBlock"]/option[{block}]')
block_selected.click()
# get data of each record
try:
    table_element = driver.find_element_by_css_selector('table.Table')
except NoSuchElementException:
    print('\nRecord not found')
else:
    table_rows = table_element.find_elements_by_css_selector('table.Table tr')
    print('\nGrampanchayat Sarpanch Details')
    for table_row in table_rows[2:]:
        table_cols = table_row.find_elements_by_css_selector('table.Table tr td')
        for table_col in table_cols:
            print(table_col.text, end=',\t')
        print()
Note:
You need to download ChromeDriver into your project folder.
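If Selenium can't find the driver on your PATH, a minimal sketch of pointing it at the downloaded binary with the older Selenium API used above (the './chromedriver' path is an assumption; adjust it to wherever you saved the executable):
from selenium import webdriver

# assumption: the ChromeDriver executable was downloaded into the project folder next to this script
driver = webdriver.Chrome(executable_path='./chromedriver')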
I am trying to scrape this site https://franchisedisclosure.gov.au/Register with Playwright, and the URL doesn't change after you click on the next button. How do I solve this pagination problem?
Here's my code
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=50)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')
    page.is_visible('tbody')
    html = page.inner_html('table.table.table-hover')
    soup = bs(html, 'html.parser')
    table = soup.find('tbody')
    rows = table.findAll('tr')
    names = []
    industry = []
    Locations = []
    for row in rows:
        info = row.findAll('td')
        name = info[0].text.strip()
        industry = info[1].text.strip()
        Locations = info[2].text.strip()
I've checked online, and every solution I see involves the URL changing. And for some reason, you can't make requests to the site's API; Postman said something about the parameters not being sent.
With some small adjustments you can get it. Let's try this:
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time
url = 'https://franchisedisclosure.gov.au/Register'
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')
    names = []
    industry = []
    Locations = []
    # When you click to the next page, an element with the text "Loading..." appears on the screen, so we save that locator
    loading_icon = "//strong[text()='Loading...']"
    # This is the "next page" button
    next_page_locator = "//ul[@class='pagination']/li[3]"
    # We select the option of 50 elements per page
    page.select_option("#perPageCount", value="50")
    # We wait for the loading icon to be visible and then hidden, which means the new list is fully loaded
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    time.sleep(1)
    # We loop until the "Next page" button is disabled, which means there are no more pages to paginate
    while "disabled" not in page.get_attribute(selector=next_page_locator, name="class"):
        # We get the info you wanted
        page.is_visible('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        table = soup.find('tbody')
        rows = table.findAll('tr')
        for row in rows:
            info = row.findAll('td')
            # append to the lists instead of overwriting them on every row
            names.append(info[0].text.strip())
            industry.append(info[1].text.strip())
            Locations.append(info[2].text.strip())
        # Once we have the info, we click next page and wait for the loading element to be visible and then hidden
        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)
Thanks for the great question... and answer. In addition to (or instead of) waiting on the loading_icon, you could also use a "networkidle" wait, expanding on @Jaky Ruby's answer by adding page.wait_for_load_state(state="networkidle"). I often use the networkidle option to check that the next page has finished loading; I've read somewhere it's not necessarily best practice, but it works quite often.
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time
url = 'https://franchisedisclosure.gov.au/Register'
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')
    names = []
    industry = []
    Locations = []
    # When you click to the next page, an element with the text "Loading..." appears on the screen, so we save that locator
    loading_icon = "//strong[text()='Loading...']"
    # This is the "next page" button
    next_page_locator = "//ul[@class='pagination']/li[3]"
    # We select the option of 50 elements per page
    page.select_option("#perPageCount", value="50")
    # We wait for the loading icon to be visible and then hidden, which means the new list is fully loaded
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    page.wait_for_load_state(state="networkidle")
    time.sleep(1)
    # We loop until the "Next page" button is disabled, which means there are no more pages to paginate
    while "disabled" not in page.get_attribute(selector=next_page_locator, name="class"):
        # We get the info you wanted
        page.is_visible('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        table = soup.find('tbody')
        rows = table.findAll('tr')
        for row in rows:
            info = row.findAll('td')
            # append to the lists instead of overwriting them on every row
            names.append(info[0].text.strip())
            industry.append(info[1].text.strip())
            Locations.append(info[2].text.strip())
        # Once we have the info, we click next page and wait for the loading element to be visible and then hidden
        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)
My code goes to a website and clicks on records, which causes drop-downs to appear.
My current code only prints the first drop-down record, and not the others.
For example, when the first record of the webpage is clicked, it drops down one record (shown in the attached screenshot). This is also the first and only drop-down record that gets printed as my output.
The code prints this:
How do I get it to pull all drop-down titles?
from selenium import webdriver
import time
driver = webdriver.Chrome()
for x in range(1, 2):
    driver.get(f'https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page={x}')
    time.sleep(4)
    productlist_length = len(driver.find_elements_by_xpath("//div[@class='accordin_title']"))
    for i in range(1, productlist_length + 1):
        product = driver.find_element_by_xpath("(//div[@class='accordin_title'])[" + str(i) + "]")
        title = product.find_element_by_xpath('.//h4').text.strip()
        print(title)
        buttonToClick = product.find_element_by_xpath('.//div[@class="sign"]')
        buttonToClick.click()
        time.sleep(5)
        subProduct = driver.find_element_by_xpath(".//li[@class='sub_accordin_presentation']")
        otherTitle = subProduct.find_element_by_xpath('.//h4').text.strip()
        print(otherTitle)
You don't need Selenium at all. I'm not sure what all the info is that you are after, but the following shows you that the content from within those expand blocks is available in the response from a simple requests.get():
import requests
from bs4 import BeautifulSoup as bs
import re
r = requests.get('https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1')
soup = bs(r.text, 'lxml')
sessions = soup.select('#accordin > ul > li')
for session in sessions:
    print(session.select_one('h4').text)
    sub_session = session.select('.sub_accordin_presentation')
    if sub_session:
        print([re.sub(r'[\n\s]+', ' ', i.text) for i in sub_session])
    print()
    print()
Try:
products = driver.find_elements_by_xpath('//*[@class="jscroll-inner"]/ul/li')
for product in products:
    title = product.find_element_by_xpath('(.//*[@class="accordin_title"]/div)[3]/h4').text
    print(title)
My code goes to a webpage and takes certain data from each row.
However, I also want to get the "topics" from each row, for example the one listed as "Presidential Session and Community Psychiatry" in row 1, above the "Speakers" text.
My code is currently able to scrape the Titles and Chairs of each row (denoted as Role and Name), but not the Topic.
from selenium import webdriver
import time
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
import pandas as pd
driver = webdriver.Chrome()
driver.get('https://s7.goeshow.com/apa/annual/2021/session_search.cfm?_ga=2.259773066.1015449088.1617295032-97934194.1617037074')
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
tables = soup.select('#datatable')
for table in tables:
    for title in table.select('tr td.title'):
        print(title.text.strip())
        title_row = title.parent
        speaker_row = title_row.next_sibling
        for speaker in speaker_row.select('span.session-speaker'):
            role = speaker.select_one('span.session-speaker-role').text.strip()
            name = speaker.select_one('span.session-speaker-name').text.strip()
            topic = speaker.select_one('span.session-track-label').text.strip()
            print(role, name, topic)
        print()
driver.quit()
Inside the speaker for loop you are searching for elements under the "span.session-speaker" element, but there is no "span.session-track-label" element under it.
Use:
tables = soup.select('#datatable')
for table in tables:
    for title in table.select('tr td.title'):
        print(title.text.strip())
        title_row = title.parent
        speaker_row = title_row.next_sibling
        for speaker in speaker_row.select('span.session-divider-line'):
            role = speaker.select_one('span.session-speaker-role').text.strip()
            name = speaker.select_one('span.session-speaker-name').text.strip()
            topic = speaker.select_one('span.session-track-label').text.strip()
            print(role, name, topic)
        print()
I have several URLs which link to hotel pages, and I would like to scrape some data from them.
I'm using the following script, but I would like to update it:
data=[]
for i in range(0, 10):
    url = final_list[i]
    driver2 = webdriver.Chrome()
    driver2.get(url)
    sleep(randint(10, 20))
    soup = BeautifulSoup(driver2.page_source, 'html.parser')
    my_table2 = soup.find_all(class_=['title-2', 'rating-score body-3'])
    review = soup.find_all(class_='reviews')[-1]
    try:
        price = soup.find_all('span', attrs={'class': 'price'})[-1]
    except:
        price = soup.find_all('span', attrs={'class': 'price'})
    for tag in my_table2:
        data.append(tag.text.strip())
    for p in price:
        data.append(p)
    for r in review:
        data.append(r)
But here's the problem: tag.text.strip() scrapes rating numbers like here:
It will strip each rating number into its own value, but not all hotels have the same amount of ratings. Here's a hotel with 7 ratings, while the default number is 8. Some have seven ratings, others six, and so on. So in the end my dataframe is quite screwed up: if the hotel doesn't have 8 ratings, the values are shifted.
My question is: how do I tell the script "if there is a value for this tag.text.strip(i), put the value, but if there isn't, put None"? And of course do that for all eight values.
I tried several things like :
for tag in my_table2:
    for i in tag.text.strip()[i]:
        if i:
            data.append(i)
        else:
            data.append(None)
But unfortunately, that goes nowhere, so if you could help to figure out the answer, it would be awesome :)
In case it helps, here's a link to one of the hotels I'm scraping:
https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1
The rating numbers are at the end.
Thank you.
A few suggestions:
Put your data in a dictionary. You don't have to assume that all tags are present and the order of the tags doesn't matter. You can get the labels and the corresponding ratings with
rating_labels = soup.find_all(class_=['rating-label body-3'])
rating_scores = soup.find_all(class_=['rating-score body-3'])
and then iterate over both lists with zip
Move your driver outside of the loop; opening it once is enough.
Don't use a fixed sleep; use Selenium's wait functions instead. You can wait for a particular element to be present or populated with WebDriverWait(driver, 10).until(EC.presence_of_element_located(your_element)) (see the sketch after this list).
https://selenium-python.readthedocs.io/waits.html
Cache your scraped HTML code to a file. It's faster for you and more polite to the website you are scraping.
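As a minimal sketch of the explicit-wait idea above (the '.rating-score' CSS selector is an assumption about the hostel page; adjust it to the element you actually need):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1')
# wait up to 10 seconds for the rating scores to be rendered before reading page_source
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.rating-score'))
)
source = driver.page_source
Putting the other suggestions together: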
import selenium
import selenium.webdriver
import time
import random
import os
from bs4 import BeautifulSoup
data = []
final_list = [
    'https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1',
    'https://www.hostelworld.com/pwa/hosteldetails.php/Be-Ramblas-Hostel/Barcelona/435?from=2020-11-27&to=2020-11-28&guests=1'
]
# load your driver only once to save time
driver = selenium.webdriver.Chrome()
for url in final_list:
    data.append({})
    # cache the HTML code to the filesystem
    # generate a filename from the URL where all non-alphanumeric characters (e.g. :/) are replaced with underscores _
    filename = ''.join([s if s.isalnum() else '_' for s in url])
    if not os.path.isfile(filename):
        driver.get(url)
        # better use selenium's wait functions here
        time.sleep(random.randint(10, 20))
        source = driver.page_source
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(source)
    else:
        with open(filename, 'r', encoding='utf-8') as f:
            source = f.read()
    soup = BeautifulSoup(source, 'html.parser')
    review = soup.find_all(class_='reviews')[-1]
    try:
        price = soup.find_all('span', attrs={'class': 'price'})[-1]
    except:
        price = soup.find_all('span', attrs={'class': 'price'})
    data[-1]['name'] = soup.find_all(class_=['title-2'])[0].text.strip()
    rating_labels = soup.find_all(class_=['rating-label body-3'])
    rating_scores = soup.find_all(class_=['rating-score body-3'])
    assert len(rating_labels) == len(rating_scores)
    for label, score in zip(rating_labels, rating_scores):
        data[-1][label.text.strip()] = score.text.strip()
    data[-1]['price'] = price.text.strip()
    data[-1]['review'] = review.text.strip()
The data can then be easily put in a nicely formatted table using Pandas
import pandas as pd
df = pd.DataFrame(data)
df
If some data is missing/incomplete, Pandas will replace it with 'NaN'
data.append(data[0].copy())
del(data[-1]['Staff'])
data[-1]['name'] = 'Incomplete Hostel'
pd.DataFrame(data)
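If you want to keep the resulting table, a minimal follow-up sketch (the 'hostels.csv' filename is just an assumption):
pd.DataFrame(data).to_csv('hostels.csv', index=False)  # missing ratings become empty cells in the file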
I am trying to pull data for the rosters for all college football teams because I want to run some analysis on team performance based on composition of their roster.
My script is working on the first page; it iterates over each team and can open the roster link for each team, but then the BeautifulSoup commands I am running on a team's roster page keep throwing IndexErrors. When I look at the HTML, it seems as if the commands I am writing should work, yet when I print the page source from BeautifulSoup I don't see what I see in Developer Tools in Chrome. Is this some instance of JS being used to serve up the content? If so, I thought Selenium got around this?
My code...
import requests
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
teams_driver = webdriver.Firefox()
teams_driver.get("http://www.espn.com/college-football/teams")
teams_html = teams_driver.page_source
teams_soup = BeautifulSoup(teams_html, "html5lib")
i = 0
for link_html in teams_soup.find_all('a'):
    if link_html.text == 'Roster':
        roster_link = 'https://www.espn.com' + link_html['href']
        roster_driver = webdriver.Firefox()
        roster_driver.get(roster_link)
        roster_html = teams_driver.page_source
        roster_soup = BeautifulSoup(roster_html, "html5lib")
        team_name_html = roster_soup.find_all('a', class_='sub-brand-title')[0]
        team_name = team_name_html.find_all('b')[0].text
        for player_html in roster_soup.find_all('tr', class_='oddrow'):
            player_name = player_html.find_all('a')[0].text
            player_pos = player_html.find_all('td')[2].text
            player_height = player_html.find_all('td')[3].text
            player_weight = player_html.find_all('td')[4].text
            player_year = player_html.find_all('td')[5].text
            player_hometown = player_html.find_all('td')[6].text
            print(team_name)
            print('\t', player_name)
        roster_driver.close()
teams_driver.close()
In your for loop you're using the HTML of the first page (roster_html = teams_driver.page_source), so you get an IndexError when you try to select the first item of team_name_html, because find_all returns an empty list.
Also, you don't need to have all those instances of Firefox open; you can close the driver once you have the HTML.
teams_driver = webdriver.Firefox()
teams_driver.get("http://www.espn.com/college-football/teams")
teams_html = teams_driver.page_source
teams_driver.quit()
But you don't have to use selenium for this task, you can get all the data with requests and bs4.
import requests
from bs4 import BeautifulSoup
r = requests.get("http://www.espn.com/college-football/teams")
teams_soup = BeautifulSoup(r.text, "html5lib")
for link_html in teams_soup.find_all('a'):
    if link_html.text == 'Roster':
        roster_link = 'https://www.espn.com' + link_html['href']
        r = requests.get(roster_link)
        roster_soup = BeautifulSoup(r.text, "html5lib")
        team_name = roster_soup.find('a', class_='sub-brand-title').find('b').text
        for player_html in roster_soup.find_all('tr', class_='oddrow'):
            player_name = player_html.find_all('a')[0].text
            player_pos = player_html.find_all('td')[2].text
            player_height = player_html.find_all('td')[3].text
            player_weight = player_html.find_all('td')[4].text
            player_year = player_html.find_all('td')[5].text
            player_hometown = player_html.find_all('td')[6].text
            print(team_name, player_name, player_pos, player_height, player_weight, player_year, player_hometown)
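If you want to keep the rows for your analysis instead of printing them, a minimal sketch using the csv module (which the question already imports); the 'rosters.csv' filename and column names are assumptions:
import csv

def save_rosters(rows, path='rosters.csv'):
    # rows is assumed to be a list of (team, name, pos, height, weight, year, hometown)
    # tuples collected inside the player loop above instead of the print() call
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['team', 'name', 'pos', 'height', 'weight', 'year', 'hometown'])
        writer.writerows(rows)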