Find span element based on text written inside li - bs4 scraping - Python

I want to find the text located in the <li>; if it exists, I want to scrape the <span> text, but if it does not exist, I will raise an exception. For example:
if 'Floor' is found, then scrape the span.
This is my code and it works fine, but it scrapes everything without any condition:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
# Set base url & headers :
baseurl = 'https://aqarmap.com.eg'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}
test_link = 'https://aqarmap.com.eg/en/listing/3138984-for-rent-cairo-new-cairo-el-narges-el-narges-omarat'
r = requests.get(test_link , headers=headers)
soup = bs(r.content,'lxml')
title = soup.find('h1').text.replace('\n','')
loc = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
sub_loc = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
floor = soup.find('span' , class_='badge badge-default').text.replace('\n','')
room = soup.find('span' , class_='badge badge-default').find_next('span').text.replace('\n','')
baths = soup.find('span' , class_='badge badge-default').find_next('span').text.replace('\n','')
finish = soup.find('span' , class_='badge badge-default').find_next('span').find_next('span').find_next('span').text.replace('\n','')
view = soup.find('span' , class_='badge badge-default').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
area = soup.find('span' , class_='badge badge-default').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
date = soup.find('span' , class_='badge badge-default').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
price = soup.find('div' , class_='listing-price-content').find_next('span').text
print(title,loc,sub_loc,floor,room,baths,finish,view,area,date,price)

In general, it would be good to check if the tag you are looking for exists before applying the text method:
title = title.text.strip() if (title := soup.find('h1')) else None
To select a tag by its text and check whether it exists, you can use CSS selectors with :-soup-contains():
floor = tag.text.strip() if (tag := soup.select_one('ul.list-group li:-soup-contains("Floor") span')) else None
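As a side note, the walrus operator (:=) used here needs Python 3.8+. If you need this guard in several places, a tiny helper keeps it readable; this is just a sketch (the safe_text name is my own, not from the original code):

def safe_text(soup, selector):
    # Return stripped text for the first match of a CSS selector, or None if the tag is absent.
    tag = soup.select_one(selector)
    return tag.get_text(strip=True) if tag else None

floor = safe_text(soup, 'ul.list-group li:-soup-contains("Floor") span')
title = safe_text(soup, 'h1')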
The above works well for individual tags, but to go generic and get rid of these confusing property selections, I would suggest the following: use a dict to store the information for each listing and collect them in a list of dicts. That way you are safe if you create a DataFrame based on it and a property is missing; pandas will fill it automatically with NaN.
data = {}
data['title'] = soup.find('h1').text.strip()
data['loc'] = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
data['sub_loc'] = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
data.update(dict([li.stripped_strings for li in soup.select('ul.list-group li')]))
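The last line works because every <li> in the list group contains exactly two pieces of text, a label and a value, so stripped_strings yields a ready-made key/value pair. A tiny illustration with hypothetical markup:

from bs4 import BeautifulSoup

# Hypothetical li markup, only to show what stripped_strings produces:
li = BeautifulSoup('<li>Floor <span>3</span></li>', 'html.parser').li
print(list(li.stripped_strings))    # ['Floor', '3']
print(dict([li.stripped_strings]))  # {'Floor': '3'}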
Benefits - you can make adjustments simply, filter if you like, and export the results in a structured way.
Example
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

baseurl = 'https://aqarmap.com.eg'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}

data = []

def scrape(test_link):
    r = requests.get(test_link, headers=headers)
    soup = bs(r.content, 'lxml')
    data = {}
    data['title'] = soup.find('h1').text.strip()
    data['loc'] = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
    data['sub_loc'] = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
    data.update(dict([li.stripped_strings for li in soup.select('ul.list-group li')]))
    return data

urlList = ['https://aqarmap.com.eg/en/listing/3138984-for-rent-cairo-new-cairo-el-narges-el-narges-omarat',
           'https://aqarmap.com.eg/en/listing/3124476-for-rent-cairo-new-cairo-el-narges-el-narges-omarat?source=related-listing-source']

for url in urlList:
    data.append(scrape(url))

pd.DataFrame(data)
Output
| title | loc | sub_loc | Floor | Room | Baths | Finish Type | Size (in meters) | Listing ID | Publish Date | Price | Seller Role | Payment Method | Price Per Meter | View |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Furnished Apartment For rent in El Narges Omarat | El Narges | El Narges Omarat | 3 | 3 | 2 | SUPER_LUX | 180 M² | EG-3138984 | 09/01/2022 | 19,000 EGP | Agent | Cash | 106 EGP/M² | nan |
| Furnished Apartment For rent in El Narges Omarat | El Narges | El Narges Omarat | 2 | 2 | 2 | SUPER_LUX | 180 M² | EG-3124476 | 30/12/2021 | 19,000 EGP | Agent | Cash | 106 EGP/M² | Garden |

You might find the following approach useful, particularly an alternative way to extract items from the breadcrumb list at the top:
import requests
from bs4 import BeautifulSoup as bs
# Set base url & headers :
baseurl = 'https://aqarmap.com.eg'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}
test_link = 'https://aqarmap.com.eg/en/listing/3138984-for-rent-cairo-new-cairo-el-narges-el-narges-omarat'
r = requests.get(test_link , headers=headers)
soup = bs(r.content,'lxml')
# Store all available items in a dictionary
ul_list_group = soup.find('ul', class_='list-group')
data = {}
for li_item in ul_list_group.find_all('li'):
    items = list(li_item.stripped_strings)
    if len(items) == 2:
        data[items[0]] = items[1]

req_elements = [
    'Floor', 'Room', 'Baths',
    'Finish Type', 'Size (in meters)', 'Listing ID',
    'Publish Date', 'Price', 'Seller Role',
    'Payment Method', 'Price Per Meter',
    'Swimming pool',  # test case to show missing item
]
# Store all parts of the breadcrumb in a list (last 3 required)
ul_breadcrumb = soup.find('ul', class_='breadcrumb')
breadcrumb_data = [li.get_text(strip=True) for li in ul_breadcrumb.find_all('li')]
title = breadcrumb_data[-1]
sub_loc = breadcrumb_data[-2]
loc = breadcrumb_data[-3]
# Build a suitable row of data (give missing entries 'N/A')
row = [title, sub_loc, loc] + [data.get(element, 'N/A') for element in req_elements]
print(row)
Giving you:
['Furnished Apartment For rent in El Narges Omarat', 'El Narges Omarat', 'El Narges', '3', '3', '2', 'SUPER_LUX', '180 M²', 'EG-3138984', '09/01/2022', '19,000 EGP', 'Agent', 'Cash', '106 EGP/M²', 'N/A']
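If you scrape several listings this way, the rows slot straight into a DataFrame. A small follow-up sketch (assuming you collect every row built above into a list named rows, which is my own name for it):

import pandas as pd

rows = [row]  # in practice, one entry per scraped listing
columns = ['title', 'sub_loc', 'loc'] + req_elements
df = pd.DataFrame(rows, columns=columns)
print(df)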

Related

How to separate data per column when writing data to excel from web scraping results

I know how to separate the data when it looks like:
x, y, z
But I can't figure out how to do it when the data format is like:
Doe, John, BookName, Year, abstract with commas, links.
This is what the data looks like in Excel after the scrape, and this is what I wanted it to look like.
This is my code:
from unittest import result
import requests
from bs4 import BeautifulSoup
import csv
import urllib3.request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep,offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1,2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    result = requests.get(url+str(page), verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li','ds-artifact-item')
    for it in items:
        author = it.find('span','author h4').text
        title = ''.join(it.find('a',href=True).text.strip().split('\n'))
        year = it.find('span','date').text
        abstract = ''.join(it.find('div','artifact-abstract').text.strip().split('\n'))
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
thewriter = csv.writer(open('results/{}_{}.csv'.format(fakdep,offset), 'w', newline=''))
thewriter.writerow(kepala)
for d in datas: thewriter.writerow(d)
This is my suggestion. I will need to know an offset to be able to test it.
A CSV separated by semi-colons will be far easier to separate in Excel.
import requests
from bs4 import BeautifulSoup
import csv
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep,offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1,2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    result = requests.get(url+str(page), verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li','ds-artifact-item')
    for it in items:
        author = it.find('span','author h4').text
        # strip newlines so each field stays on one line in the CSV
        title = it.find('a',href=True).text.strip().replace('\n', '')
        year = it.find('span','date').text
        abstract = it.find('div','artifact-abstract').text.strip().replace('\n', '')
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
# use a semicolon delimiter so Excel splits the columns cleanly
thewriter = csv.writer(open('results/{}_{}.csv'.format(fakdep,offset), 'w', newline=''), delimiter=";")
thewriter.writerow(kepala)
for d in datas: thewriter.writerow(d)
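If you want to double-check the output, you can read the file back with the matching separator. A quick sketch, assuming pandas is installed (it is not used elsewhere in this answer):

import pandas as pd

# Read the semicolon-delimited CSV back in to confirm the columns split correctly.
check = pd.read_csv('results/{}_{}.csv'.format(fakdep, offset), sep=';')
print(check.head())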

Scrape data using beautifulsoup

I am extracting the data, but it gives the same name and surname for every entry, even though the name and surname are different for each entry. This is the page link: https://www.aeafa.es/asociados.php
import requests
import pandas as pd
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
temp = []
wev = {}
for page in range(1, 5):
    r = requests.get(
        "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    details = soup.find('table', class_="table")
    for detail in details.find_all('tbody'):
        link = [up.text for up in detail.find_all("td")]
        name = link[0]
        wev['Nombre'] = name
        surname = link[1]
        wev["Apellidos"] = surname

    tag = soup.find_all("div", class_="col-md-8 col-sm-8")
    for pro in tag:
        data = [tup.text for tup in pro.find_all("p")]
        Dirección = data[2]
        Dirección = Dirección[12:]
        wev[" Dirección"] = Dirección
        Población = data[3]
        Población = Población[14:]
        wev[" Población"] = Población
        Provincia = data[4]
        Provincia = Provincia[14:]
        wev["Provincia "] = Provincia
        Teléfono = data[5]
        Teléfono = "+" + Teléfono[11:].replace(".", "")
        Teléfono = Teléfono.replace("-", '')
        wev[" Teléfono"] = Teléfono
        Email = data[6]
        Email = Email[10:]
        wev["Email"] = Email
        temp.append(wev)

df = pd.DataFrame(temp)
print(df)
It prints the same name and surname for every entry. How can I correct it? This is the output:
Nombre Apellidos
0 JUAN ARIAS BARTOLOMÉ
1 JUAN ARIAS BARTOLOM
One approach would be to merge the separate name and surname details into the data from the about information. A test could also be added for when the last page is reached:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from unicodedata import normalize
import re

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

page = 1
data1 = []
data2 = []

while True:
    print(f"Page {page}")
    r = requests.get(f"https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}", headers=headers)
    page += 1
    soup = BeautifulSoup(r.content, "lxml")

    for pro in soup.find_all("div", class_="col-md-8 col-sm-8"):
        values = [re.sub(r'\s+', ' ', normalize('NFKD', p.get_text(strip=True))) for p in pro.find_all("p")]
        row = {'Sobre' : values[0][6:]}   # skip over the word Sobre
        for item in values[2:]:
            key, value = item.split(':', 1)
            row[key.strip()] = value.strip()
        row['Teléfono'] = row['Teléfono'].replace(".", "")
        data1.append(row)

    details = soup.find("table", class_="table").tbody
    for tr in details.find_all("tr"):
        data2.append([re.sub(r'\s+', ' ', normalize('NFKD', td.get_text(strip=True))) for td in tr.find_all("td")[:-1]])

    # Any more?
    ul = soup.find("ul", class_="pagination")
    last_li = ul.find_all("li")[-1]
    if last_li.text != "»":
        break

# Merge the name and surname from the second table
data = []
for d1, d2 in zip(data1, data2):
    data.append({'Nombre' : d2[0], 'Apellidos' : d2[1]} | d1)

df = pd.DataFrame(data)
print(df)
Giving you a dataframe starting:
Nombre Apellidos Sobre Dirección Población Provincia Teléfono E-mail Web
0 JUAN MARIANO MERCADO Juan Mariano Mercado Juan de Toledo, no 16, 1o B 30800 LORCA Murcia 968-471716 periagomer#hotmail.com
1 Ma. BELEN ABAD GARCIA Ma. Belen Abad Garcia Calle Constantino 33, 1o N 4700 EL EJIDO Almería 950487533 - 647936929 mariabelenabadgarcia#hotmail.com
2 JESÚS ABAD MUÑIZ Jesús Abad Muñiz Santiago, 15, 1o.- ctro. 47001 Valladolid 98.320.20.11 jabad#carlosgallegoabogados.es
3 Ma PALOMA ABAD TEJERINA Ma Paloma Abad Tejerina Poniente, 40 28036 Madrid 91.383.11.45 paloma#abadsaezabogados.com
4 GEMA ÁBALOS MUÑOZ Gema ábalos Muñoz Solarillo de Gracia, 4, 1o.- D 18002 Granada 639.317.297 3004#icagr.es
You could then use pandas to make any further changes to the data structure. Note: the Python dictionary merge operator (|) requires Python 3.9 onwards.
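For reference, the merge above builds a new dict containing the keys of both operands; on older Python versions you can unpack instead. A minimal illustration, using values shaped like the ones in this answer:

d1 = {'Sobre': 'Juan Mariano Mercado', 'Provincia': 'Murcia'}
d2 = ['JUAN MARIANO', 'MERCADO']

merged = {'Nombre': d2[0], 'Apellidos': d2[1]} | d1   # Python 3.9+
legacy = {'Nombre': d2[0], 'Apellidos': d2[1], **d1}  # equivalent on older versions
print(merged == legacy)  # True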

Scrape URL loop with BeautifulSoup

I want to scrape information from different pages of the same site, societe.com, and I have several questions.
First of all, here is the code that I managed to put together; I admit I am a bit of a novice.
I only put in 2 URLs and a few fields to see if the loop worked; I can add more once everything works.
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

tableau = []
urls = ["https://www.societe.com/societe/decathlon-france-500569405.html","https://www.societe.com/societe/go-sport-312193899.html"]

for url in urls:
    response = requests.get(url, headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'})
    soup = BeautifulSoup(response.text, "html.parser")

    numrcs = soup.find("td", class_="numdisplay")
    nomcommercial = soup.find("td", class_="break-word")
    print(nomcommercial.text)
    print(numrcs.text.strip())

    numsiret = soup.select('div[id^=siret_number]')
    for div in numsiret:
        print(div.text.strip())

    formejuri = soup.select('div[id^=catjur-histo-description]')
    for div in formejuri:
        print(div.text.strip())

    infosend = {
        'numrcs': numrcs,
        'nomcommercial': nomcommercial,
        'numsiret': numsiret,
        'formejuri': formejuri
    }
    tableau.append(infosend)

print(tableau)

my_infos = ['Numéro RCS', 'Numéro Siret ', 'Forme Juridique']
my_columns = [
    np.tile(np.array(my_infos), len(nomcommercial))
]
df = pd.DataFrame(tableau, index=nomcommercial, columns=my_columns)
df
When I run the loop, the right information comes out, for example:
DECATHLON FRANCE
Lille Metropole B 500569405
50056940503239
SASU Société par actions simplifiée à associé unique
But I would like to put all this information in a table, and I can't quite manage it: only the last company appears and the data makes no sense. I tried to follow a tutorial without success.
If you can help me I would be really happy.
To get data about the companies you can use the following example:
import requests
import pandas as pd
from bs4 import BeautifulSoup

urls = [
    "https://www.societe.com/societe/decathlon-france-500569405.html",
    "https://www.societe.com/societe/go-sport-312193899.html",
]

headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}

data = []
for url in urls:
    soup = BeautifulSoup(
        requests.get(url, headers=headers).content, "html.parser"
    )

    title = soup.select_one("#identite_deno").get_text(strip=True)
    rcs = soup.select_one('td:-soup-contains("Numéro RCS") + td').get_text(
        strip=True
    )
    siret_number = soup.select_one("#siret_number").get_text(strip=True)
    form = soup.select_one("#catjur-histo-description").get_text(strip=True)
    data.append([title, url, rcs, siret_number, form])

df = pd.DataFrame(
    data,
    columns=["Title", "URL", "Numéro RCS", "Numéro Siret", "Forme Juridique"],
)
print(df.to_markdown())
Prints:
|    | Title | URL | Numéro RCS | Numéro Siret | Forme Juridique |
|---|---|---|---|---|---|
| 0 | DECATHLON FRANCE (DECATHLON DIRECTION GENERALE FRANCE) | https://www.societe.com/societe/decathlon-france-500569405.html | Lille Metropole B 500569405 | 50056940503239 | SASU Société par actions simplifiée à associé unique |
| 1 | GO SPORT | https://www.societe.com/societe/go-sport-312193899.html | Grenoble B 312193899 | 31219389900191 | Société par actions simplifiée |

Scraping elements with the same tag and without class and id attributes

I want to scrape the number of bedrooms and bathrooms and the land area for each property separately from a real estate webpage. However, I found that they all use the same tag, <strong>, and there are no class or id attributes either. So when I write the following code:
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
url = "https://www.realestate.co.nz/residential/sale/auckland?oad=true&pm=1"
response = requests.get(url, headers=headers)
content = BeautifulSoup(response.content, "lxml")

rooms = content.findAll('strong', class_=False, id=False)
for room in rooms:
    print(room.text)
I get the following:
Sign up
2
2
2
2
3
2
4
3
2.4ha
2
1
2
2
4
3
465m2
1
1
3
2
1
1
5
3
10.1ha
3
2
5
5
600m2
600m2
4
2
138m2
2
1
2
1
2
2
3
2
675m2
2
1
You can see that I got them all lumped together because they share the same tag. Can someone help me get them separately for each property? Thanks!
Find the main tile, i.e. the div tag which contains the info for the property. In some of them data is missing (area, bathrooms, etc.), so you can try this approach:
from bs4 import BeautifulSoup
import requests

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
url = "https://www.realestate.co.nz/residential/sale/auckland?oad=true&pm=1"
response = requests.get(url, headers=headers)
content = BeautifulSoup(response.content, "lxml")

rooms = content.find_all('div', attrs={'data-test': "tile"})
dict1 = {}
for room in rooms:
    apart = room.find_all('strong', class_=False)
    if len(apart) == 3:
        for apa in apart:
            dict1['bedroom'] = apart[0].text
            dict1['bathroom'] = apart[1].text
            dict1['area'] = apart[2].text
    elif len(apart) == 2:
        for apa in apart:
            dict1['bedroom'] = apart[0].text
            dict1['bathroom'] = apart[1].text
            dict1['area'] = "NA"
    else:
        for apa in apart:
            dict1['bedroom'] = "NA"
            dict1['bathroom'] = "NA"
            dict1['area'] = apart[0].text
    print(dict1)
Output:
{'bedroom': '2', 'bathroom': '2', 'area': 'NA'}
{'bedroom': '2', 'bathroom': '2', 'area': 'NA'}
{'bedroom': '3', 'bathroom': '2', 'area': 'NA'}
{'bedroom': '4', 'bathroom': '3', 'area': '2.4ha'}
{'bedroom': '2', 'bathroom': '1', 'area': 'NA'}
...
I would loop over the main tiles and attempt to select each target node within the html for that tile, e.g. by its unique class. You can use if/else with a not-None test to add a default value where a node is missing. To handle a different sort order, I also added a try/except. I went with sort by latest, but also tested with your sort order.
I added a few more items to give context. It would be easy to extend this to loop over pages, but that is beyond the scope of your question and would be a candidate for a new question once you have tried extending it yourself if required.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np

#'https://www.realestate.co.nz/residential/sale/auckland?oad=true&pm=1'
r = requests.get('https://www.realestate.co.nz/residential/sale/auckland?by=latest&oad=true&pm=1',
                 headers={'User-Agent': 'Mozilla/5.0'}).text
soup = bs(r, 'lxml')
main_listings = soup.select('.listing-tile')
base = 'https://www.realestate.co.nz/4016546/residential/sale/'
results = {}

for listing in main_listings:
    try:
        date = listing.select_one('.listed-date > span').next_sibling.strip()
    except:
        date = listing.select_one('.listed-date').text.strip()
    title = listing.select_one('h3').text.strip()
    listing_id = listing.select_one('a')['id']
    url = base + listing_id

    bedrooms = listing.select_one('.icon-bedroom + strong')
    if bedrooms is not None:
        bedrooms = int(bedrooms.text)
    else:
        bedrooms = np.nan

    bathrooms = listing.select_one('.icon-bathroom + strong')
    if bathrooms is not None:
        bathrooms = int(bathrooms.text)
    else:
        bathrooms = np.nan

    # class selector needs the leading dot
    land_area = listing.select_one('.icon-land-area + strong')
    if land_area is not None:
        land_area = land_area.text
    else:
        land_area = "Not specified"

    price = listing.select_one('.text-right').text
    results[listing_id] = [date, title, url, bedrooms, bathrooms, land_area, price]

df = pd.DataFrame(results).T
df.columns = ['Listing Date', 'Title', 'Url', '#Bedroom', '#Bathrooms', 'Land Area', 'Price']
print(df)

Only scrape a portion of the page

I am using Python/requests to gather data from a website. Ideally I only want the latest 'banking' information, which is always at the top of the page.
The code I have currently does that, but then it attempts to keep going and hits an index out of range error. I am not very good with aspx pages, but is it possible to only gather the data under the 'banking' heading?
Here's what I have so far:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

print('Scraping South Dakota Banking Activity Actions...')
url2 = 'https://dlr.sd.gov/banking/monthly_activity_reports/monthly_activity_reports.aspx'
r2 = requests.get(url2, headers=headers)
soup = BeautifulSoup(r2.text, 'html.parser')

mylist5 = []
for tr in soup.find_all('tr')[2:]:
    tds = tr.find_all('td')
    print(tds[0].text, tds[1].text)
Ideally I'd be able to slice the information as well so I can only show the activity or approval status, etc.
With bs4 4.7.1+ you can use :contains to isolate the latest month by filtering out the rows belonging to the months that follow it on the page. I explain the principle of filtering out later general siblings using :not in this SO answer. In short, find the row containing "August 2019" (this month is determined dynamically) and grab it and all its siblings, then find the row containing "July 2019" and all its general siblings and remove the latter from the former.
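Broken down, the selector built in the code below has three parts; here it is written out with the two months hard-coded purely for illustration (in the actual code they are determined dynamically):

latest_month, next_month = 'August 2019', 'July 2019'  # hard-coded for illustration only
selector = (
    f'tr:contains("{latest_month}"), '      # the row holding the latest month heading
    f'tr:contains("{latest_month}") ~ tr'   # ...plus every row after it
    f':not(:contains("{next_month}"), '     # minus the next month's row
    f':contains("{next_month}") ~ tr)'      # ...and everything after that
)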
import requests, re
from bs4 import BeautifulSoup as bs
import pandas as pd

r = requests.get('https://dlr.sd.gov/banking/monthly_activity_reports/monthly_activity_reports.aspx')
soup = bs(r.content, 'lxml')

months = [i.text for i in soup.select('[colspan="2"]:has(a)')][0::2]
latest_month = months[0]
next_month = months[1]

rows_of_interest = soup.select(f'tr:contains("{latest_month}"), tr:contains("{latest_month}") ~ tr:not(:contains("{next_month}"), :contains("{next_month}") ~ tr)')

results = []
for row in rows_of_interest:
    data = [re.sub('\xa0|\s{2,}',' ',td.text) for td in row.select('td')]
    if len(data) == 1:
        data.extend([''])
    results.append(data)

df = pd.DataFrame(results)
print(df)
The setup is the same as before:
import requests
from bs4 import BeautifulSoup, Tag
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
url = 'https://dlr.sd.gov/banking/monthly_activity_reports/monthly_activity_reports.aspx'
print('Scraping South Dakota Banking Activity Actions...')
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
Inspecting the data source, we can find the id of the element you need (the table of values).
banking = soup.find(id='secondarycontent')
After this, we filter out elements of soup that aren't tags (like NavigableString or others). You can see how to get texts too (for other options, check Tag doc).
blocks = [b for b in banking.table.contents if type(b) is Tag] # filter out NavigableString
texts = [b.text for b in blocks]
Now, if that is what you mean by latest, we must determine which month is the latest and which is the month before.
current_month_idx, last_month_idx = None, None
current_month, last_month = 'August 2019', 'July 2019' # can parse with datetime too
for i, b in enumerate(blocks):
    if current_month in b.text:
        current_month_idx = i
    elif last_month in b.text:
        last_month_idx = i
    if all(idx is not None for idx in (current_month_idx, last_month_idx)):
        break  # break when both indices are not None

assert current_month_idx < last_month_idx
curr_month_blocks = [b for i, b in enumerate(blocks) if current_month_idx < i < last_month_idx]
curr_month_texts = [b.text for b in curr_month_blocks]
