I'm developing a Python scraper using Beautiful Soup 4 and I'm having difficulty scraping the information in a collapsible section on this page: https://www.redfin.com/CA/Los-Angeles/1366-W-22nd-St-90007/home/6896268.
The collapsible section I want to scrape is "Property History for 1366 West 22nd St". The information I'm trying to get is the "date" column and the "price" column.
url = "https://www.redfin.com/CA/Los-Angeles/1366-W-22nd-St-90007/home/6896268"
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
req = urllib.request.Request(url, headers = headers)
res = urllib.request.urlopen(req, context=ssl.SSLContext())
soup = BeautifulSoup(res, 'html.parser')
dates = [td.text for td in soup.find_all('td', {"class": "date-col nowrap"})]
However, the dates I scraped from the date column are only Oct 29, 2018, Aug 24, 2018 and Aug 24, 2018, because soup.find_all('td', {"class": "date-col nowrap"}) cannot find the rest of the dates. The remaining dates are collapsed and require clicking the "See all property history" button to unfold them. Is there any way to scrape the collapsed dates using Selenium?
Here's code that should work; it returns the table as a dictionary of tuples.
import selenium
from selenium import webdriver
import time

url = "https://www.redfin.com/CA/Los-Angeles/1366-W-22nd-St-90007/home/6896268"

def browser():
    driver = webdriver.Chrome()
    driver.get(url)
    return driver

def main():
    driver = browser()
    el = driver.find_element_by_xpath('//span[contains(text(), "See all property history")]')
    el.click()
    # should expand quite quickly, otherwise might need to wait, e.g. time.sleep(5)
    row_arg = "//tr[@class=' PropertyHistoryEventRow']"  # take note of the space before 'Property'
    rows = driver.find_elements_by_xpath(row_arg)
    tbl = {}
    for i, row in enumerate(rows):
        date = row.find_element_by_xpath('.//td[@class="date-col nowrap"]').text
        event = row.find_element_by_xpath('.//td[@class="event-col"]').text
        price = row.find_element_by_xpath('.//td[@class="price-col number"]').text
        appre = row.find_element_by_xpath('.//td[@class="appreciation-col number empty"]').text
        tbl[i] = (date, event, price, appre)
    for k, v in tbl.items():
        print(k, v)
    return tbl
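Note that the find_element_by_* methods used above were removed in recent Selenium 4 releases; on a current Selenium the same lookups can be written with By.XPATH. A minimal sketch (the XPaths themselves are unchanged):

from selenium import webdriver
from selenium.webdriver.common.by import By

url = "https://www.redfin.com/CA/Los-Angeles/1366-W-22nd-St-90007/home/6896268"
driver = webdriver.Chrome()
driver.get(url)

driver.find_element(By.XPATH, '//span[contains(text(), "See all property history")]').click()
rows = driver.find_elements(By.XPATH, "//tr[@class=' PropertyHistoryEventRow']")
for row in rows:
    # same per-row lookup as above, just the Selenium 4 spelling
    print(row.find_element(By.XPATH, './/td[@class="date-col nowrap"]').text)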
For some time I have been trying to crawl all vessel data from vesselfinder, including each vessel's description page; from the description page I want information like vessel type, IMO number, etc. in table form. I have tried different ways to do this but still get a lot of errors. First, I worked out how to follow the links to each description page, how to collect all of these links from all the listing pages, and how to get specific table data from a description page (which is still not complete, but I get some of it).
But today, when I tried to get the data from all the links and their description pages at the same time (by combining the code), I got a lot of errors, which confused me.
I've attached my code, which is not good, but it works up to the point #print(len(vessellist)); after that… errors.
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'user-agent': 'Mozilla/5.0',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
baseurl = 'https://www.vesselfinder.com/vessels'
vessellist = []
for x in range(1, 6):
    response = requests.get(
        f'https://www.vesselfinder.com/vessels?page={x}',
        headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    contents = soup.find_all('td', class_='v2')
    for property in contents:
        for item in property.find_all('a', href=True):
            vessellist.append(baseurl + item['href'])

for link in vessellist:
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='tparams')
    head = []
    for i in table.find_all('td', class_='n3'):
        title = i.text
        head.append(title)
    values = []
    for row in table.find_all('td', class_='v3'):
        data = row.text
        values.append(data)
    df = pd.DataFrame(values)
    print(df)
Two steps: first get the summary data (which includes the href), then get the detailed data. These two steps are implemented in two functions. Here I only fetch the first 10 pages; 200 are available.
import requests as rq
from bs4 import BeautifulSoup as bs

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}

def getSummaryData():
    data = []
    url = "https://www.vesselfinder.com/vessels"
    for page in range(1, 10 + 1, 1):  # only the first 200 pages are authorized?
        print("Page : %d/10" % page)
        resp = rq.get(url + "?page=%s" % page, headers=headers)
        soup = bs(resp.content, "lxml")
        section = soup.find_all('section', {'class': 'listing'})[0]
        tbody = section.find_all('tbody')[0]
        trs = tbody.find_all('tr')
        for tr in trs:
            tds = tr.find_all('td')
            # column 1 data
            sub = tds[1].find('a')
            href = sub['href']
            divs = sub.find_all('div')
            country = divs[0]['title']
            sub_divs = divs[1].find_all('div')
            vessel_name = sub_divs[0].text
            vessel_type = sub_divs[1].text
            # column 2 data
            build_year = tds[2].text
            # column 3 data
            gt = tds[3].text
            # column 4 data
            dwt = tds[4].text
            # column 5 data
            size = tds[5].text
            # save data
            tr_data = {'country': country,
                       'vessel_name': vessel_name,
                       'vessel_type': vessel_type,
                       'build_year': build_year,
                       'gt': gt,
                       'dwt': dwt,
                       'size': size,
                       'href': href}
            data.append(tr_data)
    return data

def getDetailledData(data):
    for (iel, el) in enumerate(data):
        print("%d/%d" % (iel + 1, len(data)))
        url = "https://www.vesselfinder.com" + el['href']
        # make get call
        resp = rq.get(url, headers=headers)
        soup = bs(resp.content, "lxml")
        # position and voyage data
        table = soup.find_all('table', {'class': 'aparams'})[0]
        trs = table.find_all('tr')
        labels = ["course_speed", "current_draught", "navigation_status",
                  "position_received", "IMO_MMSI", "callsign", "flag", "length_beam"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        # vessel particulars
        table = soup.find_all('table', {'class': 'tparams'})[0]
        trs = table.find_all('tr')
        labels = ["IMO_number", "vessel_name", "ship_type", "flag",
                  "homeport", "gross_tonnage", "summer_deadweight_t",
                  "length_overall_m", "beam_m", "draught_m", "year_of_built",
                  "builder", "place_of_built", "yard", "TEU", "crude", "grain",
                  "bale", "classification_society", "registered_owner", "manager"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        #break
    return data
Call these functions:
data = getSummaryData() # href include
data = getDetailledData(data)
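Since you wanted the result in table form, the list of dicts can be handed straight to pandas; a small sketch (assuming pandas is installed):

import pandas as pd

df = pd.DataFrame(data)          # one row per vessel, one column per key
df.to_csv("vessels.csv", index=False)
print(df.head())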
Don't rely on the 'class' attribute to target the data. Generally, you should go through table -> tbody and then pick the trs or tds by position to be sure you have the correct ones.
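For example, something along these lines walks the listing structurally and indexes the cells by position instead of matching the td class names (a sketch of the idea; the assumed column order matches the code above):

import requests as rq
from bs4 import BeautifulSoup as bs

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}
soup = bs(rq.get("https://www.vesselfinder.com/vessels?page=1", headers=headers).content, "lxml")

# Walk the structure (section -> tbody -> tr -> td) and pick cells by index,
# rather than matching td class names that may change.
tbody = soup.find("section", {"class": "listing"}).find("tbody")
for tr in tbody.find_all("tr"):
    tds = tr.find_all("td")
    if len(tds) >= 6:
        # assumed order: name cell, built year, GT, DWT, size
        print(tds[1].get_text(" ", strip=True), tds[2].text, tds[3].text, tds[4].text, tds[5].text)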
I wrote a script to scrape the website Vivino, using the Beautiful Soup and Selenium libraries.
From this website, I want to store information about a certain wine's reviews.
I have to use Selenium for dynamic scraping, since the reviews can only be accessed by pressing the "Show more reviews" button on the webpage, which appears after scrolling down to the bottom of the page.
I adapted the code for just one wine so you can see, if needed, how long it takes:
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
def scroll_to_bottom_wine_page(driver):
    #driver = self.browser
    scroll_pause_time = 0.01  # Change time?
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(scroll_pause_time)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
def scroll_to_bottom_review_page(driver, rating_count):
    stuck_counter = 0
    current_reviews_now = 0
    current_reviews_previous = 0
    scroll_review_pause_time = 0.8  # Change time?
    stop_indicator = rating_count
    time.sleep(scroll_review_pause_time)
    element_inside_popup = driver.find_element_by_xpath('//*[@id="baseModal"]/div/div[2]/div[3]//a')  # Reviews path
    while True:
        time.sleep(scroll_review_pause_time)
        element_inside_popup.send_keys(Keys.END)
        results_temp = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(results_temp, 'lxml')
        reviews = soup.findAll("div", {"class": "card__card--2R5Wh reviewCard__reviewCard--pAEnA"})
        current_reviews_now = len(reviews)
        # In case there actually are fewer reviews than the rating_count states, we avoid scrolling down forever
        if current_reviews_now == current_reviews_previous:
            stuck_counter += 1
        if (current_reviews_now > stop_indicator) or (stuck_counter > 2):
            break
        current_reviews_previous = current_reviews_now
    return reviews
def get_reviews(wine_ids, wine_urls, rating_counts):
    # Create a dataframe
    review_info = pd.DataFrame()
    # Create a driver
    driver = webdriver.Chrome()
    for wine_url in wine_urls:
        # Pass URL to driver
        driver.get(wine_url)
        # We scroll down to the bottom of the wine webpage
        scroll_to_bottom_wine_page(driver)
        # Search for the "Show more reviews" button and click it
        wait = WebDriverWait(driver, 40)
        wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Show more reviews')))
        more_reviews_button = driver.find_element_by_link_text('Show more reviews')
        more_reviews_button.click()
        # Scroll till we reach the number of reviews
        reviews = scroll_to_bottom_review_page(driver, rating_counts)
        length = len(reviews)
        wine_ids_list = [wine_ids] * length
        review_user_links = []
        review_ratings = []
        review_usernames = []
        review_dates = []
        review_texts = []
        review_likes_count = []
        review_comments_count = []
        for review in reviews:
            review_user_links.append([a['href'] for a in review.find_all('a', href=True)][0])
            review_ratings.append(float((review.find("div", class_="rating__rating--ZZb_x")["aria-label"]).split()[1]))
            review_usernames.append(str((review.find('a', {"class": 'anchor__anchor--3DOSm reviewCard__userName--2KnRl'})).string))
            review_dates.append("".join(((review.find('div', {"class": 'reviewCard__ratingsText--1LU2T'})).text).rsplit((str(review_usernames[-1])))))
            if (review.find('p', {"class": 'reviewCard__reviewNote--fbIdd'})) is not None:
                review_texts.append(str((review.find('p', {"class": 'reviewCard__reviewNote--fbIdd'})).string))
                review_texts = [item.strip() for item in review_texts]
            else:
                review_texts.append('None')
            if (review.find("div", class_="likeButton__likeCount--82au4")) is not None:
                review_likes_count.append(int(review.find("div", class_="likeButton__likeCount--82au4").text))
            else:
                review_likes_count.append(int(0))
            if (review.find("div", class_="commentsButton__commentsCount--1_Ugm")) is not None:
                review_comments_count.append(int(review.find("div", class_="commentsButton__commentsCount--1_Ugm").text))
            else:
                review_comments_count.append(int(0))
        # We put the information in a dataframe
        review_info_temp = pd.DataFrame()
        review_info_temp.loc[:, 'wine_id'] = wine_ids_list
        review_info_temp.loc[:, 'review_user_links'] = review_user_links
        review_info_temp.loc[:, 'review_ratings'] = review_ratings
        review_info_temp.loc[:, 'review_usernames'] = review_usernames
        review_info_temp.loc[:, 'review_dates'] = review_dates
        review_info_temp.loc[:, 'review_texts'] = review_texts
        review_info_temp.loc[:, 'review_likes_count'] = review_likes_count
        review_info_temp.loc[:, 'review_comments_count'] = review_comments_count
        # We update the total dataframe
        review_info = pd.concat([review_info, review_info_temp], axis=0, ignore_index=True)
    # We close the driver
    driver.quit()
    return review_info
wine_id = ['123']
wine_url = ['https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015&price_id=21118981']
wine_rating_count = 186
start_time = time.time()
reviews_info = get_reviews(wine_id, wine_url, wine_rating_count)
elapsed_time = time.time() - start_time
print('The scrape took: ', elapsed_time) #For this particular wine, the code took 38 seconds to run
The script I wrote does the following steps:
1. With a certain wine link (e.g. https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015&price_id=21118981), I access that webpage with the Selenium driver.
2. Then I scroll down to the bottom of the web page.
3. I find and click the "Show more reviews" button.
4. After pressing this button, a pop-up appears with the wine reviews.
5. I scroll down in this pop-up window until it reaches a certain number of reviews.
6. I extract the information I need from the reviews (each review is a Beautiful Soup soup object).
The problem is that, if I want to scrape the review information of thousands of wines, it would take forever. For a single wine with 99 reviews, it takes 35 seconds to do this.
Is there any way I can speed up this process?
My advice is: don't use Selenium. Selenium should be your last option for scraping a web page. Instead, learn to understand how a web page makes requests by using your web browser's developer tools. For example, for the web page you posted, this is the URL where you can retrieve the comments: https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=10&page=1
They have an API! It's very easy to scrape something like that.
You only need requests and maybe BeautifulSoup.
headers = {"pragma": "no-cache",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
"x-requested-with": "XMLHttpRequest"}
url = "https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=10&page=1"
resp = requests.get(url, headers=headers)
resp.json()
The answer looks like:
{'reviews': [{'id': 118841527,
'rating': 5.0,
'note': 'You need to taste it!! ',
'language': 'en',
'created_at': '2019-02-16T15:33:49.000Z',
'aggregated': True,
'user': {'id': 10310349,
'seo_name': 'miguellourenco0',
'alias': 'Miguel Lourenço',
'is_featured': False,
'visibility': 'all',
'image': {'location': '//images.vivino.com/avatars/0064zilphklf01a4dd1d69f.jpg',
'variations': {'large': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_300x300.jpg',
'small_square': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_50x50.jpg'}},
'statistics': {'followers_count': 14,
'followings_count': 21,
'ratings_count': 113,
'ratings_sum': 0,
'reviews_count': 90},
'background_image': None},
Those reviews come from their API:
import requests
agent = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'}
response = requests.get('https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=100', headers=agent)
reviews = response.json()["reviews"]
print(reviews)
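If a wine has more reviews than fit in one page, the same endpoint can be paged by incrementing the page parameter; a small sketch (that the API eventually returns an empty reviews list is an assumption):

import requests

agent = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"}
all_reviews = []
page = 1
while True:
    resp = requests.get(
        "https://www.vivino.com/api/wines/5154081/reviews",
        params={"year": 2015, "per_page": 50, "page": page},
        headers=agent,
    )
    batch = resp.json().get("reviews", [])
    if not batch:          # assumed stop condition: no more reviews returned
        break
    all_reviews.extend(batch)
    page += 1

print(len(all_reviews))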
I want to replace duplicated title, price and link values with empty column values.
import requests
import csv
from bs4 import BeautifulSoup
requests.packages.urllib3.disable_warnings()
import pandas as pd

url = 'http://shop.kvgems-preciousstones.com/'
while True:
    session = requests.Session()
    session.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    content = session.get(url, verify=False).content
    soup = BeautifulSoup(content, "html.parser")
    posts = soup.find_all('li', {'class': 'item'})
    data = []
    for url in posts:
        title = url.find('h2', {'product-name'}).text
        price = url.find('span', {'price'}).text
        link = url.find('a').get('href')
        url_response = requests.get(link)
        url_data = url_response.text
        url_soup = BeautifulSoup(url_data, 'html.parser')
        desciption = url_soup.find('tr')
        for tr in url_soup.find_all('tr'):
            planet_data = dict()
            values = [td.text for td in tr.find_all('td')]
            planet_data['name'] = tr.find('td').text.strip()
            planet_data['info'] = tr.find_all('td')[1].text.strip()
            data.append((title, price, planet_data, link))
            #data_new = data +","+ data_desciption
    #urls = soup.find('a',{'class': 'next i-next'}).get('href')
    #url = urls
    #print(url)
    with open('ineryrge5szdqzrt.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['title', 'price', 'name', 'info', 'link'])
        # The for loop
        for title, price, planet_data, link in data:
            writer.writerow([title, price, planet_data['name'], planet_data['info'], link])
When I write the CSV I get duplicated title, price and link values, but I want only one row to have the title, price, info and link, while the rest are left empty.
The first for loop extracts the common values (title, price and link). The second for loop then extracts all the data attributes for each item.
However, you are then writing title, price and link fields to the CSV file for every row of data. You only need to do it for the first row of data.
To detect if your second for loop is on the first row or not, you can change it to use the enumerate function, which gives you an extra index variable. You can then use this value to only write the title, price and link when the index is 0:
for index, tr in enumerate(url_soup.find_all('tr')):
    planet_data = dict()
    values = [td.text for td in tr.find_all('td')]
    planet_data['name'] = tr.find('td').text.strip()
    planet_data['info'] = tr.find_all('td')[1].text.strip()
    if index == 0:
        data.append((title, price, planet_data, link))
    else:
        data.append((None, None, planet_data, None))
(Also I don't think you need the initial while True: part.)
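If you do want to walk every page, the commented-out 'next i-next' link in your code is the natural way to do it; a sketch, assuming the last page simply has no such link:

import requests
from bs4 import BeautifulSoup

session = requests.Session()
url = 'http://shop.kvgems-preciousstones.com/'
while url:
    soup = BeautifulSoup(session.get(url, verify=False).content, "html.parser")
    # ... collect the title/price/link and planet_data rows here, exactly as above ...
    next_link = soup.find('a', {'class': 'next i-next'})
    url = next_link.get('href') if next_link else None  # stop when there is no "next" link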
I am trying to scrape this website to get the reviews, but I am facing an issue:
The page loads only 50 reviews.
To load more, you have to click "Show More Reviews", and I don't know how to get all the data, as there is no page link; "Show More Reviews" doesn't have a URL to explore either, the address remains the same.
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"

a = []
url = requests.get(url)
html = url.text
soup = BeautifulSoup(html, "html.parser")
table = soup.findAll("div", {"class": "review-comments"})
#print(table)
for x in table:
    a.append(x.text)
df = pd.DataFrame(a)
df.to_csv("review.csv", sep='\t')
I know this is not pretty code, but I am just trying to get the review text first.
Kindly help, as I am a little new to this.
Looking at the website, the "Show more reviews" button makes an AJAX call and returns the additional info; all you have to do is find its link and send a GET request to it (which I've done with some simple regex):
import requests
import re
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/74.0.3729.169 Chrome/74.0.3729.169 Safari/537.36"
}
url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"
Data = []

# Each page is equivalent to 50 comments:
MaximumCommentPages = 3

with requests.Session() as session:
    info = session.get(url)
    # Get product ID, needed for getting more comments
    productID = re.search(r'"product_id":(\w*)', info.text).group(1)
    # Extract info from main data
    soup = BeautifulSoup(info.content, "html.parser")
    table = soup.findAll("div", {"class": "review-comments"})
    for x in table:
        Data.append(x)
    # Number of pages to get:
    # Get additional data:
    params = {
        "page": "",
        "product_id": productID
    }
    while MaximumCommentPages > 1:  # number 1 because one of them was the main page data which we already extracted!
        MaximumCommentPages -= 1
        params["page"] = str(MaximumCommentPages)
        additionalInfo = session.get("https://www.capterra.com/gdm_reviews", params=params)
        print(additionalInfo.url)
        #print(additionalInfo.text)
        # Extract info from additional pages:
        soup = BeautifulSoup(additionalInfo.content, "html.parser")
        table = soup.findAll("div", {"class": "review-comments"})
        for x in table:
            Data.append(x)

# Extract data the old fashioned way:
counter = 1
with open('review.csv', 'w') as f:
    for one in Data:
        f.write(str(counter))
        f.write(one.text)
        f.write('\n')
        counter += 1
Notice how I'm using a session to preserve cookies for the ajax call.
Edit 1: You can reload the webpage multiple times and call the ajax again to get even more data.
Edit 2: Save data using your own method.
Edit 3: Changed some stuff; it now gets any number of pages for you and saves to a file with good ol' open().
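If you don't know up front how many pages exist, one option is to keep requesting pages until one comes back with no review blocks; a sketch built on the same gdm_reviews call (that an out-of-range page returns an empty result is an assumption):

import requests
import re
from bs4 import BeautifulSoup

url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"
reviews = []

with requests.Session() as session:          # session keeps the cookies for the AJAX endpoint
    info = session.get(url)
    productID = re.search(r'"product_id":(\w*)', info.text).group(1)
    page = 1
    while True:
        resp = session.get("https://www.capterra.com/gdm_reviews",
                           params={"page": str(page), "product_id": productID})
        chunk = BeautifulSoup(resp.content, "html.parser").findAll("div", {"class": "review-comments"})
        if not chunk:                        # assumed stop condition: empty page means no more reviews
            break
        reviews.extend(chunk)
        page += 1

print(len(reviews))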
I am trying to check the date/time availability for an exam using Python mechanize, and to send someone an email if a particular date/time becomes available in the result (result page screenshot attached).
import mechanize
from BeautifulSoup import BeautifulSoup

URL = "http://secure.dre.ca.gov/PublicASP/CurrentExams.asp"
br = mechanize.Browser()
response = br.open(URL)
# there are some errors in doctype and hence filtering the page content a bit
response.set_data(response.get_data()[200:])
br.set_response(response)
br.select_form(name="entry_form")
# select Oakland for the 1st set of checkboxes
for i in range(0, len(br.find_control(type="checkbox", name="cb_examSites").items)):
    if i == 2:
        br.find_control(type="checkbox", name="cb_examSites").items[i].selected = True
# select salesperson for the 2nd set of checkboxes
for i in range(0, len(br.find_control(type="checkbox", name="cb_examTypes").items)):
    if i == 1:
        br.find_control(type="checkbox", name="cb_examTypes").items[i].selected = True
response = br.submit()
print response.read()
I am able to get the response but for some reason the data within my table is missing
Here are the buttons from the initial HTML page:
<input type="submit" value="Get Exam List" name="B1">
<input type="button" value="Clear" name="B2" onclick="clear_entries()">
<input type="hidden" name="action" value="GO">
Here is the part of the output (the submit response) where the actual data should be:
<table summary="California Exams Scheduling" class="General_list" width="100%" cellspacing="0"> <EVERYTHING IN BETWEEN IS MISSING HERE>
</table>
All the data within the table is missing. I have provided a screenshot of the table element from chrome browser.
Can someone please tell me what could be wrong?
Also, can someone please tell me how to get the date/time out of the response (assuming I have to use BeautifulSoup), so it has to be something along these lines. I am trying to find out whether a particular date I have in mind (say March 8th) shows up in the response with a Begin Time of 1:30 pm. Screenshot attached.
soup = BeautifulSoup(response.read())
print soup.find(name="table")
Update: it looks like my issue might be related to this question and I am trying my options. I tried the below as per one of the answers, but cannot see any tr elements in the data (though I can see them in the page source when I check it manually):
soup.findAll('table')[0].findAll('tr')
Update: modified this to use Selenium; I will try to take it further at some point soon.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib3

myURL = "http://secure.dre.ca.gov/PublicASP/CurrentExams.asp"
browser = webdriver.Firefox() # Get local session of firefox
browser.get(myURL) # Load page
element = browser.find_element_by_id("Checkbox5")
element.click()
element = browser.find_element_by_id("Checkbox13")
element.click()
element = browser.find_element_by_name("B1")
element.click()
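Once the results page has loaded after that last click, the rendered HTML can be handed to BeautifulSoup to scan the table rows; a rough sketch (the Date and Begin Time column positions are assumptions based on the screenshot):

import time
from bs4 import BeautifulSoup

time.sleep(5)  # give the results page a moment to render after the submit
soup = BeautifulSoup(browser.page_source, "html.parser")  # browser is the Firefox driver from above
table = soup.find("table", {"class": "General_list"})
if table:
    for row in table.find_all("tr"):
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        # assumed layout: cells[0] = date, cells[1] = begin time
        if len(cells) >= 2 and "3/8" in cells[0] and "1:30" in cells[1]:
            print("Match found:", cells)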
Five years later, maybe this can help someone. I took your problem as a training exercise and completed it using the Requests package (I use Python 3.9).
The code below is in two parts:
1. the request to retrieve the data injected into the table by the POST request;
## the request part
import requests as rq
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString, Tag  # used below in beetweenBr

url = "https://secure.dre.ca.gov/PublicASP/CurrentExams.asp"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0"}

params = {
    "cb_examSites": [
        "'Fresno'",
        "'Los+Angeles'",
        "'SF/Oakland'",
        "'Sacramento'",
        "'San+Diego'"
    ],
    "cb_examTypes": [
        "'Broker'",
        "'Salesperson'"
    ],
    "B1": "Get+Exam+List",
    "action": "GO"
}

s = rq.Session()
r = s.get(url, headers=headers)
s.headers.update({"Cookie": "%s=%s" % (r.cookies.keys()[0], r.cookies.values()[0])})
r2 = s.post(url=url, data=params)
soup = bs(r2.content, "lxml")  # contains the data you want
2. parsing the response (there are a lot of ways to do it; mine is maybe a bit clunky):
table = soup.find_all("table", class_="General_list")[0]
titles = [el.text for el in table.find_all("strong")]
def beetweenBr(soupx):
final_str = []
for br in soupx.findAll('br'):
next_s = br.nextSibling
if not (next_s and isinstance(next_s,NavigableString)):
continue
next2_s = next_s.nextSibling
if next2_s and isinstance(next2_s,Tag) and next2_s.name == 'br':
text = str(next_s).strip()
if text:
final_str.append(next_s.strip())
return "\n".join(final_str)
d = {}
trs = table.find_all("tr")
for tr in trs:
    tr_text = tr.text
    if tr_text in titles:
        curr_title = tr_text
        splitx = curr_title.split(" - ")
        area, job = splitx[0].split(" ")[0], splitx[1].split(" ")[0]
        if not job in d.keys():
            d[job] = {}
        if not area in d[job].keys():
            d[job][area] = []
        continue
    if (not tr_text in titles) & (tr_text != "DateBegin TimeLocationScheduledCapacity"):
        tds = tr.find_all("td")
        sub = []
        for itd, td in enumerate(tds):
            if itd == 2:
                sub.append(beetweenBr(td))
            else:
                sub.append(td.text)
        d[job][area].append(sub)
"d" contain data u want. I didn't go as far as sending an email yet.