How To Scrape This Field From Webpage - python

My code goes into a webpage and takes certain data from each row.
However, I also want to get the "topic" of each row, listed for example as "Presidential Session and Community Psychiatry" in row 1, above the "Speakers" text.
My code currently scrapes the Titles and Chairs of each row (denoted as Role and Name), but not the topic.
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get('https://s7.goeshow.com/apa/annual/2021/session_search.cfm?_ga=2.259773066.1015449088.1617295032-97934194.1617037074')
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
tables = soup.select('#datatable')
for table in tables:
    for title in table.select('tr td.title'):
        print(title.text.strip())
        title_row = title.parent
        speaker_row = title_row.next_sibling
        for speaker in speaker_row.select('span.session-speaker'):
            role = speaker.select_one('span.session-speaker-role').text.strip()
            name = speaker.select_one('span.session-speaker-name').text.strip()
            topic = speaker.select_one('span.session-track-label').text.strip()
            print(role, name, topic)
        print()
driver.quit()

Inside the speaker for loop you are searching under the "span.session-speaker" element, and there is no "span.session-track-label" element under it.
Use:
tables = soup.select('#datatable')
for table in tables:
    for title in table.select('tr td.title'):
        print(title.text.strip())
        title_row = title.parent
        speaker_row = title_row.next_sibling
        for speaker in speaker_row.select('span.session-divider-line'):
            role = speaker.select_one('span.session-speaker-role').text.strip()
            name = speaker.select_one('span.session-speaker-name').text.strip()
            topic = speaker.select_one('span.session-track-label').text.strip()
            print(role, name, topic)
        print()
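If the track label actually appears once per row rather than inside every session-divider-line span (an assumption about the page markup, not something verified here), the topic could instead be read once from the speaker row and reused for each speaker:

topic_el = speaker_row.select_one('span.session-track-label')  # assumed to be found at the row level
topic = topic_el.text.strip() if topic_el else ''
for speaker in speaker_row.select('span.session-speaker'):
    role = speaker.select_one('span.session-speaker-role').text.strip()
    name = speaker.select_one('span.session-speaker-name').text.strip()
    print(role, name, topic)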


Why Does My Code Scrape The First Record Only?

My code goes into a website and clicks on records, which causes drop-downs to expand.
My current code only prints the first drop-down record, and not the others.
For example, when the first record on the webpage is clicked, it drops down one record, and that is the first and only drop-down record that gets printed as my output.
How do I get it to pull all drop-down titles?
from selenium import webdriver
import time

driver = webdriver.Chrome()
for x in range(1, 2):
    driver.get(f'https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page={x}')
    time.sleep(4)
    productlist_length = len(driver.find_elements_by_xpath("//div[@class='accordin_title']"))
    for i in range(1, productlist_length + 1):
        product = driver.find_element_by_xpath("(//div[@class='accordin_title'])[" + str(i) + "]")
        title = product.find_element_by_xpath('.//h4').text.strip()
        print(title)
        buttonToClick = product.find_element_by_xpath('.//div[@class="sign"]')
        buttonToClick.click()
        time.sleep(5)
        subProduct = driver.find_element_by_xpath(".//li[@class='sub_accordin_presentation']")
        otherTitle = subProduct.find_element_by_xpath('.//h4').text.strip()
        print(otherTitle)
You don't need selenium at all. I'm not sure exactly what info you are after, but the following shows that the content inside those expand blocks is available in the response from a simple requests.get():
import requests
from bs4 import BeautifulSoup as bs
import re

r = requests.get('https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1')
soup = bs(r.text, 'lxml')
sessions = soup.select('#accordin > ul > li')
for session in sessions:
    print(session.select_one('h4').text)
    sub_session = session.select('.sub_accordin_presentation')
    if sub_session:
        print([re.sub(r'[\n\s]+', ' ', i.text) for i in sub_session])
        print()
    print()
Try:
products = driver.find_elements_by_xpath('//*[@class="jscroll-inner"]/ul/li')
for product in products:
    title = product.find_element_by_xpath('(.//*[@class="accordin_title"]/div)[3]/h4').text

web scraping multiple pages past a login using python

I'm trying to scrape two tables on separate pages after accessing the site through a login. I've tried a few different ways and can't figure it out.
The last attempt showed some promise, but only the first data frame was appended to the list of data frames. The code looks something like the following:
from selenium import webdriver
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup as BS

def text_to_chart(url, table):
    df_list = []
    driver = webdriver.Chrome(path)
    driver.get(login)
    driver.find_element_by_xpath(password_block).send_keys(password)
    driver.find_element_by_xpath(username_block).send_keys(username)
    driver.find_element_by_xpath(submit).click()
    time.sleep(10)
    df = pd.DataFrame()
    for url, table in zip(urls, tables):
        driver.get(url)
        time.sleep(10)
        soup = BS(driver.page_source, 'html')
        new_table = soup.find_all('table', attrs={'class': table})
        results_list = pd.read_html(str(new_table[0]))
        df = df.append(pd.DataFrame(results_list[0]))
    return df

def scrape(url, table):
    df_list = []
    df_list = df_list.append(text_to_chart(url, table))

scrape(url_list, table_list)
So, what should I do to scrape multiple pages?
I suggest you store the values in a list of dictionaries and then convert it to a DataFrame. That will be simple and easy.
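A minimal sketch of that pattern (the URLs and field names here are placeholders, not taken from the site):

import pandas as pd

records = []
for url in ['page1', 'page2']:                       # placeholder URLs
    # ... scrape the page, then keep one dictionary per row ...
    records.append({'url': url, 'value': len(url)})  # hypothetical fields
df = pd.DataFrame(records)                           # one row per dict, columns from the keys
print(df)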
Solved! I made a few changes, which resulted in one function that creates my list of DataFrames. Then I started the session, logged in, and called the function, saving the output to my variable df_list.
from selenium import webdriver
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup as BS

def text_to_chart(urls, tables):
    df = []
    for url, table in zip(urls, tables):
        driver.get(url)
        time.sleep(10)
        soup = BS(driver.page_source, 'html')
        new_table = soup.find_all('table', attrs={'class': table})
        results_list = pd.read_html(str(new_table[0]))
        df.append(pd.DataFrame(results_list[0]))
    return df

driver = webdriver.Chrome(path)
driver.get(login)
driver.find_element_by_xpath(password_block).send_keys(password)
driver.find_element_by_xpath(username_block).send_keys(username)
driver.find_element_by_xpath(submit).click()
time.sleep(10)
df_list = text_to_chart(url_list, table_list)
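If a single frame is needed afterwards, the list returned by text_to_chart can be stacked into one DataFrame, for example:

combined = pd.concat(df_list, ignore_index=True)  # concatenate all scraped tables into one DataFrame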

Selecting multiple options in unchanging url

I need to scrape content from the website by selecting state, district and blocks from the drop down menus.
I tried using Python requests and POSTs, but I'm not able to scrape the content properly, as the URL of the website never changes for the options I choose.
This is the code I've tried so far:
# importing all necessary packages
import urllib3
import requests
from bs4 import BeautifulSoup
import csv

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
url = "http://swachhbharatmission.gov.in/tsc/Report_NBA/Panchayat/Rpt_SarpanchDetail.aspx"
session = requests.Session()
html = session.get(url, verify=False).content
soup = BeautifulSoup(html, "lxml")
option = soup.find("select", {"name": "ctl00$ContentPlaceHolder1$ddlState"}).findAll("option")
# create dictionary 'states' mapping each state to its code
states = {}
for elem in option[1:]:
    key = elem['value']
    value = elem.text
    states[key] = value
for state in states.keys():
    payload_ano = {'ctl00$ContentPlaceHolder1$ddlState': str(state)}
    r = requests.post(url, data=payload_ano, verify=False)
    break
soup = BeautifulSoup(r.text, "html.parser")
option = soup.find("select", {"name": "ctl00$ContentPlaceHolder1$ddlDistrict"}).findAll("option")
option  # only gives [<option selected="selected" value="%">All District</option>] from the home page, not the districts of the chosen state
I have used a break statement so the code terminates early. The problem is that the variable option in the final line should contain the contents of the district drop-down after a state is chosen, but it only shows the contents of the home page.
Any help or suggestions would be really appreciated.
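For context on why the requests attempt only returns the home page: the site appears to be an ASP.NET WebForms page, so each drop-down change is a postback that must carry the hidden state fields (__VIEWSTATE and friends) along with the selection. A hedged sketch of what such a POST could look like (the exact fields and the state code value are assumptions about this particular page):

import requests
from bs4 import BeautifulSoup

url = "http://swachhbharatmission.gov.in/tsc/Report_NBA/Panchayat/Rpt_SarpanchDetail.aspx"
session = requests.Session()
soup = BeautifulSoup(session.get(url, verify=False).content, "lxml")

# copy the hidden ASP.NET state fields into the payload
payload = {inp['name']: inp.get('value', '') for inp in soup.select('input[type=hidden]')}
payload['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$ddlState'  # simulate the drop-down postback
payload['ctl00$ContentPlaceHolder1$ddlState'] = '1'              # hypothetical state code

r = session.post(url, data=payload, verify=False)
soup = BeautifulSoup(r.text, "lxml")
districts = soup.find("select", {"name": "ctl00$ContentPlaceHolder1$ddlDistrict"}).findAll("option")
print(districts)

Driving a real browser with selenium, as below, avoids having to manage this form state by hand.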
You can use selenium to select an option from the drop downs.
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()
driver.get('http://swachhbharatmission.gov.in/tsc/Report_NBA/Panchayat/Rpt_SarpanchDetail.aspx')

# get state options
state_element = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_ddlState"]')
state_select = Select(state_element)
state_options = [state_option.text for state_option in state_select.options]

# choose state option number
print('\nselect state:')
for i, state in enumerate(state_options):
    print(f'{i+1} - {state.strip()}')
state = input(':- ')

# select state option
state_selected = driver.find_element_by_xpath(f'//*[@id="ctl00_ContentPlaceHolder1_ddlState"]/option[{state}]')
state_selected.click()

# get district options
district_element = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_ddlDistrict"]')
district_select = Select(district_element)
district_options = [district_option.text for district_option in district_select.options]

# choose district option number
print('\nselect district:')
for i, district in enumerate(district_options):
    print(f'{i+1} - {district.strip()}')
district = input(':- ')

# select district option
district_selected = driver.find_element_by_xpath(f'//*[@id="ctl00_ContentPlaceHolder1_ddlDistrict"]/option[{district}]')
district_selected.click()

# get block options
block_element = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_ddlBlock"]')
block_select = Select(block_element)
block_options = [block_option.text for block_option in block_select.options]

# choose block option number
print('\nselect block:')
for i, block in enumerate(block_options):
    print(f'{i+1} - {block.strip()}')
block = input(':- ')

# select block option
block_selected = driver.find_element_by_xpath(f'//*[@id="ctl00_ContentPlaceHolder1_ddlBlock"]/option[{block}]')
block_selected.click()

# get data of each record
try:
    table_element = driver.find_element_by_css_selector('table.Table')
except NoSuchElementException:
    print('\nRecord not found')
else:
    table_rows = table_element.find_elements_by_css_selector('table.Table tr')
    print('\nGrampanchayat Sarpanch Details')
    for table_row in table_rows[2:]:
        table_cols = table_row.find_elements_by_css_selector('table.Table tr td')
        for table_col in table_cols:
            print(table_col.text, end=',\t')
        print()
Note:
You need to download Chrome Driver into your project folder.
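For example, with an older Selenium 3 style call (the path is an assumption, adjust it to wherever chromedriver was saved):

driver = webdriver.Chrome(executable_path='./chromedriver')  # chromedriver placed next to the script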

extract title from a link using BeautifulSoup

I am using BeautifulSoup to scrape a website but need help with this, as I am new to Python and BeautifulSoup.
How do I get VET out of the following?
"[[VET]]"
This is my code so far:
import bs4 as bs
import urllib.request
import pandas as pd
#This is the Home page of the website
source = urllib.request.urlopen('file:///C:/Users/Aiden/Downloads/stocks/Stock%20Premarket%20Trading%20Activity%20_%20Biggest%20Movers%20Before%20the%20Market%20Opens.html').read().decode('utf-8')
soup = bs.BeautifulSoup(source,'lxml')
#find the Div and put all info into varTable
table = soup.find('table',{"id":"decliners_tbl"}).tbody
#find all Rows in table and puts into varTableRows
tableRows = table.find_all('tr')
print ("There is ",len(tableRows),"Rows in the Table")
print(tableRows)
columns = [tableRows[1].find_all('td')]
print(columns)
a = [tableRows[1].find_all("a")]
print(a)
So my output from print(a) is "[[<a class="mplink popup_link" href="https://marketchameleon.com/Overview/VET/">VET</a>]]" and I want to extract just VET.
You can use a.text or a.get_text().
If you have multiple elements, you'd need a list comprehension calling that function on each element.
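For example, using the row already selected in the question (a minimal sketch):

tickers = [link.get_text() for link in tableRows[1].find_all("a")]  # e.g. ['VET']
print(tickers)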
Thank you for all the replies, I was able to work it out using the following code:
source = urllib.request.urlopen('file:///C:/Users/Aiden/Downloads/stocks/Stock%20Premarket%20Trading%20Activity%20_%20Biggest%20Movers%20Before%20the%20Market%20Opens.html').read().decode('utf-8')
soup = bs.BeautifulSoup(source, 'html.parser')
table = soup.find("table", id="decliners_tbl")
for decliners in table.find_all("tbody"):
    rows = decliners.find_all("tr")
    for row in rows:
        ticker = row.find("a").text
        volume = row.findAll("td", class_="rightcell")[3].text
        print(ticker, volume)
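Since pandas is already imported, the whole decliners table could also be read in one call, assuming it parses cleanly (a sketch, not verified against this page):

df = pd.read_html(str(table))[0]  # parse the <table> markup straight into a DataFrame
print(df.head())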

How to loop scrape data inside span (arrows) and loop it all in consecutive pages?

This is my first time trying to use python with selenium and bs4.
I'm trying to scrape data from this website
To begin, I select GE from the cantone dropdown menu, click the checkbox "Conffermo" and the button "Ricerca". Then I can see the data. I have to click each arrow to expand the data and scrape it for every person (this is a loop, isn't it?), and then do the same on the next page (by clicking on "Affiggere le seguenti entrate" at the bottom of the page).
I'd like to use relative XPaths for the data, since not all persons have all the data (I'd like to put an empty cell in Excel when data is missing).
This is my code so far:
import urllib2
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

browser = webdriver.Firefox()
URL = 'http://www.asca.ch/Partners.aspx?lang=it'
time.sleep(10)
page = urllib2.urlopen(URL)  # query the website and return the html to the variable 'page'
soup = BeautifulSoup(page, 'html.parser')
inputElementCantone = browser.find_element_by_xpath('//*[@id="ctl00_MainContent_ddl_cantons_Input"]').click()
browser.find_element_by_xpath('/html/body/form/div[1]/div/div/ul/li[9]').click()
browser.find_element_by_xpath("//INPUT[@id='MainContent__chkDisclaimer']").click()
browser.find_element_by_xpath("//INPUT[@id='MainContent_btn_submit']").click()
arrow = browser.find_element_by_class_name("footable-toggle")
I'm stuck after this. The data I'd like to scrape (as Excel columns) are: Discipline(s) thérapeutique(s), Cognome, Cellulare and email.
Any help is appreciated.
# Find the table
table = soup.find('table', {'class': 'footable'})
# Get all rows in that table
rows = table.find_all('tr')

# A function to process each row
def processRow(row):
    # All cells with hidden data
    dataFields = row.find_all('td', {'style': True})
    output = {}
    # Fixed index numbers are not ideal but in this case will work
    output['Discipline'] = dataFields[0].text
    output['Cognome'] = dataFields[2].text
    output['Cellulare'] = dataFields[8].text
    output['email'] = dataFields[10].text
    return output

# Declaring a list to store all results
results = []
# Iterating over all the rows and storing the processed result in a list
for row in rows:
    results.append(processRow(row))
print(results)
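Since not every person has all the data (as noted in the question), rows with fewer hidden cells would make the fixed indices raise an IndexError, and missing cells can also shift positions. A defensive variant of processRow that at least falls back to empty strings (the index positions themselves remain an assumption about this table):

def processRow(row):
    dataFields = row.find_all('td', {'style': True})
    def cell(i):
        # empty string when the cell does not exist
        return dataFields[i].text.strip() if i < len(dataFields) else ''
    return {'Discipline': cell(0), 'Cognome': cell(2), 'Cellulare': cell(8), 'email': cell(10)}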
