BeautifulSoup 4 - Scraping element (h2) outside of div

BeautifulSoup 4 - Scraping element (h2) outside of div - python

I am attempting to scrape some football match data from the following site below:
https://liveonsat.com/uk-england-all-football.php
Looking at the source code of the site I was able to identify that most of the information (team names, start time and channels) is contained within an outer div ( div class="blockfix"). I am able to scrape this data successfully using the code below:
import requests
import time
import csv
import sys
from bs4 import BeautifulSoup
import tkinter as tk
from tkinter import messagebox
from tkinter import *
from PIL import ImageTk, Image
def makesoup(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    The page is parsed with the lxml parser. A 4xx/5xx response raises
    ``requests.HTTPError`` instead of being parsed as if it were the
    real page.
    """
    # requests has no default timeout; without one a silent server would
    # block the caller indefinitely.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    return BeautifulSoup(page.text, "lxml")
def matchscrape(g_data):
    """Print the fixture, start time and channels of every match block.

    g_data: iterable of parsed elements (the ``div.blockfix`` tags), each
    offering ``find_all``. When a block has no fixture or no start time,
    that field is simply skipped for the block.
    """
    # (tag, class) pairs of which only the first hit per block is printed.
    first_hit_targets = (
        ("div", "fix"),
        ("div", "fLeft_time_live"),
    )
    for item in g_data:
        for tag, css_class in first_hit_targets:
            found = item.find_all(tag, {"class": css_class})
            # An empty result means the element is absent; check for it
            # explicitly instead of swallowing every exception.
            if found:
                print(found[0].text)
        # One output line per broadcasting channel.
        for cell in item.find_all("td", {"class": "chan_col"}):
            print(cell.get_text().strip())
def start():
    """Button callback: download the fixtures page and print every match."""
    soup = makesoup(url="https://liveonsat.com/uk-england-all-football.php")
    # Each match is selected through its <div class="blockfix"> container.
    matchscrape(g_data=soup.findAll("div", {"class": "blockfix"}))
# Build the single-window Tkinter GUI for the scraper.
root = tk.Tk()
# Disable resizing in both directions.
root.resizable(False, False)
root.geometry("600x600")
root.wm_title("liveonsat scraper")
# NOTE(review): this rebinds the name `Label`, which `from tkinter import *`
# presumably also bound to the Label class - confirm nothing relies on that.
Label = tk.Label(root, text = 'liveonsat scraper', font = ('Comic Sans MS',18))
# Clicking this button calls start().
button = tk.Button(root, text="Scrape Matches", command=start)
# NOTE(review): `quit` is not defined in this snippet - presumably the
# interpreter's site builtin; `root.destroy` would be the Tk-native choice.
button3 = tk.Button(root, text = "Quit Program", command=quit)
Label.pack()
button.pack()
button3.pack()
# Empty label; unlike the widgets above it is created without an explicit parent.
status_label = tk.Label(text="")
status_label.pack()
# Enter the Tk event loop.
root.mainloop()
I receive the following output for example :
The issue I am having is that one element (date of the matches) is contained outside of the div ( div class="blockfix"). I am unsure as to how I am able to retrieve this data. I tried to change the following code below:
def start():
    """Scrape the fixtures page, selecting each match by its div.blockfix container."""
    soup = makesoup(url="https://liveonsat.com/uk-england-all-football.php")
    matchscrape(g_data=soup.findAll("div", {"class": "blockfix"}))
to
def start():
    """Scrape the fixtures page, selecting <td height="50"> cells instead of div.blockfix."""
    soup = makesoup(url="https://liveonsat.com/uk-england-all-football.php")
    matchscrape(g_data=soup.findAll("td", {"height": "50"}))
as this element contained the h2 tag for date of the matches ( h2 class="time_head"), but when I attempt this I get a completely different output which is incorrect (see code below)
def matchscrape(g_data):
    """Print fixture, match date, start time and channels for each element.

    g_data: iterable of parsed elements offering ``find_all``. For the
    fixture, date and start time only the first matching descendant is
    printed, and a missing one is skipped; every channel cell found
    inside the element is printed.
    """
    # (tag, class) pairs of which only the first hit per element is printed.
    first_hit_targets = (
        ("div", "fix"),
        ("h2", "time_head"),
        ("div", "fLeft_time_live"),
    )
    for item in g_data:
        for tag, css_class in first_hit_targets:
            found = item.find_all(tag, {"class": css_class})
            # An empty result means the element is absent; check for it
            # explicitly instead of swallowing every exception.
            if found:
                print(found[0].text)
        # One output line per broadcasting channel.
        for cell in item.find_all("td", {"class": "chan_col"}):
            print(cell.get_text().strip())
def start():
    """Scrape the fixtures page, selecting <td height="50"> cells instead of div.blockfix."""
    soup = makesoup(url="https://liveonsat.com/uk-england-all-football.php")
    matchscrape(g_data=soup.findAll("td", {"height": "50"}))
Incorrect Output: (due to only one match name, time and date being outputted with 100's of channel names)
To further clarify. The end result I am trying to achieve is each match, time of each match, channels showing each match and date match is showing to be scraped and outputted (printed).
Thank you to anyone who can provide guidance or assistance to me with this issue. If further clarification or anything else is required I will be more than happy to provide.
Update: Below is HTML code as requested in the comments for one match as an example. The element I am having issue with displaying is h2 class="time_head"
<div style="clear:right"> <div class=floatAndClearL><h2 class = sport_head >Football</h2></div> <!-- sport_head -->
<div class=floatAndClearL><h2 class = time_head>Friday, 10th July</h2></div> <!-- time_head --> <div><span class = comp_head>English Championship - Week 43</span></div>
<div class = blockfix > <!-- block 1-->
<div class=fix> <!-- around fixture and notes 2-->
<div class=fix_text> <!-- around fixture text 3-->
<div class = imgCenter><span><img src="../img/team/england.gif"></span></div>
<div class = fLeft style="width:270px;text-align:center;background-color:#ffd379;color:#800000;font-size:10pt;font-family:Tahoma, Geneva, sans-serif">Huddersfield v Luton Town</div>
<div class = imgCenter><img src="../img/team/england.gif"></div>
</div> <!-- around fixture text 3 ENDS-->
<div class=notes></div>
</div> <!-- around fixture and notes 2 ENDS-->
<div class = fLeft> <!-- around all of channel types 2--> <div> <!-- around channel type group 3-->
<div class=fLeft_icon_live_l> <!-- around icon 4-->
<img src="../img/icon/live3.png"/>
</div>
<div class=fLeft_time_live> <!-- around icon 4-->
ST: 18:00
</div> <!-- around icon 4 ENDS--> <div class = fLeft_live> <!-- around all tables of a channel type 4--> <table border="0" cellspacing="0" cellpadding="0"><tr><td class=chan_col> <a href="https://connect.bein.net/" target="_blank" class = chan_live_iptvcable> beIN Connect MENA 📺</a></td><td width = 0></td>
</tr></table> <table border="0" cellspacing="0" cellpadding="0"><tr><td class=chan_col> <a href="https://tr.beinsports.com/kullanici/giris?ReturnUrl=" target="_blank" class = chan_live_iptvcable> beIN Connect TURKEY 📺</a></td><td width = 0></td>
</tr></table>

Here is how you could achieve it:
import requests
import re
import unidecode
from bs4 import BeautifulSoup
# Get page source; browser-like headers are sent along with the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}
response = requests.get('https://liveonsat.com/uk-england-all-football.php', headers=headers)
# Name the parser explicitly so the result does not depend on which
# parsers happen to be installed (the question's code uses lxml as well)
soup = BeautifulSoup(response.content, 'lxml')

# The patterns never change, so compile them once outside the loops
fixture_re = re.compile(r'(.*) v (.*)')
caption_re = re.compile(r"CAPTION, '(.*)'\)")

# Process results: one <div class="blockfix"> per match
for match in soup.find_all('div', class_='blockfix'):
    # Competitors list. Using regex, we look for a div containing two
    # competitors' names separated by a ' v '
    competitors = match.find('div', string=fixture_re).text
    # The match date is NOT inside the blockfix div: it is the closest
    # preceding h2 tag with time_head as class attribute
    match_date = match.find_previous('h2', class_='time_head').text
    # Match time
    fLeft_time_live = match.find('div', class_='fLeft_time_live').text.strip()
    channels = match.find('div', class_='fLeft_live')
    print("Competitors ", competitors)
    print("Match date", match_date)
    print("Match time", fLeft_time_live)
    # Grab tv transmission times
    for channel in channels.find_all('a'):
        # If the show time is available, it is contained in the
        # "onmouseover" attribute; otherwise we just display the channel name
        tooltip = channel.get('onmouseover')
        caption = None
        if tooltip is not None:
            show_date = unidecode.unidecode(BeautifulSoup(tooltip, 'lxml').text)
            # Some regex logic to extract the show date
            caption = caption_re.search(show_date)
        if caption is None:
            print(" ", channel.text.strip().replace('📺', ''), "- no time displayed - ")
            continue
        print(" ", caption.group(1))
    print()
Output
Competitors Huddersfield v Luton Town
Match date Friday, 10th July
Match time ST: 19:00
beIN Connect MENA - no time displayed -
beIN Connect TURKEY - no time displayed -
beIN Sports MENA 12 HD - 2020-07-10 17:00:00
beIN Sports MENA 2 HD - 2020-07-10 17:00:00
beIN Sports Turkey 4 HD - 2020-07-10 17:00:00
Eleven Sports 2 Portugal HD - 2020-07-10 17:00:00
....
EDIT : corrected the match date extraction ...

Related

Can't print text inside 'p' tag using BeautifulSoup

Pretty simple code
import requests
from bs4 import BeautifulSoup
link = 'https://www.birdsnest.com.au/brands/boho-bird/73067-amore-wrap-dress'
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
# Collect every paragraph inside <div class="model-info clearfix">
page_new = soup.find('div', class_='model-info clearfix')
results = page_new.find_all('p')
for result in results:
    print(result.text)
Output
usually wears a size .
She is wearing a size in this style.
Her height is .
Show ’s body measurements
The problem is that the model's name is inside <strong> tags and a span inside the <strong> tag.
Like so.
<div class="model-info-header">
<p>
<strong><span class="model-info__name">Marnee</span></strong> usually wears a size <strong><span class="model-info__standard-size">8</span></strong>.
She is wearing a size <strong><span class="model-info__wears-size">10</span></strong> in this style.
</p>
<p class="model-info-header__height">Her height is <strong><span class="model-info__height">178 cm</span></strong>.</p>
<p>
<span class="js-model-info-more model-info__link model-info-header__more">Show <span class="model-info__name">Marnee</span>’s body measurements</span>
</p>
</div>
How to get the BOLD elements inside the <p> tags.

The model name is generated dynamically. Try this:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
link = 'https://www.birdsnest.com.au/brands/boho-bird/73067-amore-wrap-dress'
driver = webdriver.Chrome()
try:
    driver.get(link)
    # Fixed pause so the page's scripts can fill in the model details
    # before the source is captured
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # quit() ends the whole browser session (close() only closes the
    # current window), and the finally clause runs even if loading fails
    driver.quit()
page_new = soup.find('div', class_='model-info clearfix')
results = page_new.find_all('p')
for result in results:
    print(result.text)
Output:
Marnee usually wears a size 8.
She is wearing a size 10 in this style.
Her height is 178 cm.
Show Marnee’s body measurements
Marnee’s body measurements are:
Bust 81 cm
Waist 64 cm
Hips 89 cm

How to extract Date using Selenium Webdriver

I have been trying hours to sort this out but unable to do so.
Here is my script using Selenium Webdriver in Python, trying to extract title, date, and link. I am able to extract the title and link. However, I am stuck at extracting the date. Could someone please help me with this. Much appreciated your response.
import selenium.webdriver
import pandas as pd
frame = []
url = "https://www.oric.gov.au/publications/media-releases"
driver = selenium.webdriver.Chrome("C:/Users/[Computer_Name]/Downloads/chromedriver.exe")
driver.get(url)
# XPath attribute tests are written with '@' (@class)
all_div = driver.find_elements_by_xpath('//div[contains(@class, "ui-accordion-content")]')
for div in all_div:
    all_items = div.find_elements_by_tag_name("a")
    for item in all_items:
        title = item.get_attribute('textContent')
        link = item.get_attribute('href')
        # Placeholder: how to extract this value is what the question asks
        date = None
        frame.append({
            'title': title,
            'date': date,
            'link': link,
        })
dfs = pd.DataFrame(frame)
dfs.to_csv('myscraper.csv', index=False, encoding='utf-8-sig')
Here is the html I am interested in:
<div id="ui-accordion-1-panel-0" ...>
<div class="views-field views-field-title">
<span class="field-content">
<a href="/publications/media-release/ngadju-corporation-emerges-special-administration-stronger">
Ngadju corporation emerges from special administration stronger
</a>
</span>
</div>
<div class="views-field views-field-field-document-media-release-no">
<div class="field-content"><span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2020-07-31T00:00:00+10:00">
31 July 2020
</span> (MR2021-06)</div>
</div>
</div>
...

I'd get all rows first.
from pprint import pprint
import selenium.webdriver
# Locator constants for find_element(s); the find_element(s)_by_* helper
# methods are no longer available in current Selenium releases
from selenium.webdriver.common.by import By

frame = []
url = "https://www.oric.gov.au/publications/media-releases"
driver = selenium.webdriver.Chrome()
try:
    driver.get(url)
    for div in driver.find_elements(By.CSS_SELECTOR, 'div.ui-accordion-content'):
        # Work row by row so each title/link is paired with the date that
        # belongs to the same row
        for row in div.find_elements(By.CSS_SELECTOR, 'div.views-row'):
            item = row.find_element(By.TAG_NAME, 'a')
            title = item.get_attribute('textContent')
            link = item.get_attribute('href')
            date = row.find_element(
                By.CSS_SELECTOR, 'span.date-display-single'
            ).get_attribute('textContent')
            frame.append({
                'title': title,
                'date': date,
                'link': link,
            })
finally:
    # Always end the browser session, even when a lookup above fails
    driver.quit()
pprint(frame)
print(len(frame))

Ok just search for the <span> with the property dc:date, save it in a WebElement dateElement and take its text dateElement.text. That's your date as string.

Beautifulsoup4 - not selecting all instances of span class

I am attempting to scrape data from a website that uses non-specific span classes to format/display content. The pages present information about chemical products and each product is described within a single div class.
I first parsed by that div class and am working to pull the data I need from there. I have been able to get many things but the parts I cant seem to pull are within the span class "ppisreportspan"
If you look at the code, you will note that it appears multiple times within each chemical description.
<tr>
<td><h4 id='stateprod'>MAINE STATE PRODUCT REPORT</h4><hr class='report'><span style="color:Maroon;" Class="subtitle">Company Number: </span><span style='color:black;' Class="subtitle">38</span><br /><span Class="subtitle">MONSANTO COMPANY <br/>800 N. LINDBERGH BOULEVARD <br/>MAIL STOP FF4B <br/>ST LOUIS MO 63167-0001<br/></span><br/><span style="color:Maroon;" Class="subtitle">Number of Currently Registered Products: </span><span style='color:black; font-size:14px' class="subtitle">80</span><br /><br/><p class='noprint'><img alt='' src='images/epalogo.png' /> View the label in the US EPA Pesticide Product Label System (PPLS).<br /><img alt='' src='images/alstar.png' /> View the label in the Accepted Labels State Tracking and Repository (ALSTAR).<br /></p>
<hr class='report'>
<div class='nopgbrk'>
<span class='ppisreportspanprodname'>PRECEPT INSECTICIDE </span>
<br/>EPA Registration Number: <a href = "http://iaspub.epa.gov/apex/pesticides/f?p=PPLS:102:::NO::P102_REG_NUM:100-1075" target='blank'>100-1075-524 <img alt='EPA PPLS Link' src='images/pplslink.png'/></a>
<span class='line-break'></span>
<span class=ppisProd>ME Product Number: </span>
<span class="ppisreportspan">2014000996</span>
<br />Registration Year: <span class="ppisreportspan">2019</span>
Type: <span class="ppisreportspan">RESTRICTED</span><br/><br/>
<table width='100%'>
<tr>
<td width='13%'>Percent</td>
<td style='width:87%;align:left'>Active Ingredient</td>
</tr>
<tr>
<td><span class="ppisreportspan">3.0000</span></td>
<td><span class="ppisreportspan">Tefluthrin (128912)</span></td>
</tr>
</table><hr />
</div>
<div class='nopgbrk'>
<span class='ppisreportspanprodname' >ACCELERON IC-609 INSECTICIDE SEED TREATMENT FOR CORN</span>
<br/>EPA Registration Number: <a href = "http://iaspub.epa.gov/apex/pesticides/f?p=PPLS:102:::NO::P102_REG_NUM:264-789" target='blank'>264-789-524 <img alt='EPA PPLS Link' src='images/pplslink.png'/>
</a><span class='line-break'></span>
<span class=ppisProd>ME Product Number: <a href = "alstar_label.aspx?LabelId=116671" target = 'blank'>2009005053</span>
<img alt='ALSTAR Link' src='images/alstar.png'/></a>
<br />Registration Year: <span class="ppisreportspan">2019</span>
<br/>
<table width='100%'>
<tr>
<td width='13%'>Percent</td>
<td style='width:87%;align:left'>Active Ingredient</td>
</tr>
<tr>
<td><span class="ppisreportspan">48.0000</span></td>
<td><span class="ppisreportspan">Clothianidin (44309)</span></td>
</tr>
</table><hr />
</div>
This sample includes two chemicals. One has an "alstar" ID and link and one does not. Both have registration years. Those are the data points that are hanging me up.
You may also note that there is a 10 digit code stored in "ppisreportspan" in the first example. I was able to extract that as part of the "ppisProd" span for any record that doesn't have the Alstar link. I don't understand why, but it reinforces the point that it seems my parsing process ignores that span class.
I have tried various methods for the last 2 days based on all kinds of different answers on SO, so I can't possibly list them all. I seem to be able to either get anything from the first "span" to the end on the last span, or I get "nonetype" errors or empty lists.
This one gets the closest:
It returns the correct spans for many div chunks but it still skips (returns blank tuple []) for any of the ones that have alstar links like the second one in the example.
picture showing data and then a series of three sets of empty brackets where the data should be
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import re
url = input('Enter URL:')
# Despite the prompt text, the value is opened as a local file; the with
# block closes it once parsing is done
with open(url) as hand:
    soup = BeautifulSoup(hand, 'html.parser')
# create a list of chunks by product (div)
products = soup.find_all('div', class_='nopgbrk')
print(type(products))
print(len(products))
tempalstars = []
rptspanclasses = []
regyears = []
alstarIDs = []
asltrlinks = []
# read the span tags
for product in products:
    tempalstar = product.find_all('span', class_="ppisreportspan")
    tempalstars.append(tempalstar)
    print(tempalstar)
Ultimately, I want to be able to select the text for the year as well as the Alstar link out of these span statements for each div chunk, but I will cross that bridge when I can get the code finding all the instances of that class.
Alternately - Is there some easier way I can get the Registration year and the Alstar link (eg. <a href = "alstar_label.aspx?LabelId=116671" target = 'blank'>2009005053</span> <img alt='ALSTAR Link' src='images/alstar.png'/></a>) rather than what I am trying to do?
I am using Python 3.7.2 and Thank you!

I managed to get some data from this site. All you need to know is the company number; in the case of Monsanto, the number is 38 (this number is shown after selecting Maine and typing monsanto in the search box):
import re
import requests
from bs4 import BeautifulSoup
url_1 = 'http://npirspublic.ceris.purdue.edu/state/state_menu.aspx?state=ME'
url_2 = 'http://npirspublic.ceris.purdue.edu/state/company.aspx'
company_name = 'monsanto'
company_number = '38'


def _form_fields(page):
    """Return the <input> fields of *page* as {name: value} ('' when no value)."""
    fields = {i['name']: '' for i in page.select('input[name]')}
    for i in page.select('input[value]'):
        fields[i['name']] = i['value']
    return fields


with requests.session() as session:
    # Step 1: load the state menu and re-submit its form as a company search
    r = session.get(url_1)
    data = _form_fields(BeautifulSoup(r.text, 'lxml'))
    data['ctl00$ContentPlaceHolder1$search'] = 'company'
    data['ctl00$ContentPlaceHolder1$TextBoxInput1'] = company_name
    r = session.post(url_1, data=data)
    # Step 2: keep the returned form fields except the ContentPlaceHolder1
    # ones, then add the field that requests this company's products
    data = _form_fields(BeautifulSoup(r.text, 'lxml'))
    data = {k: v for k, v in data.items() if not k.startswith('ctl00$ContentPlaceHolder1$')}
    data['ctl00$ContentPlaceHolder1${}'.format(company_number)] = 'Display+Products'
    r = session.post(url_2, data=data)
    soup = BeautifulSoup(r.text, 'lxml')

for div in soup.select('.nopgbrk'):
    # extract name
    print(div.select_one('.ppisreportspanprodname').text)
    # extract ME product number: the 10-digit run(s) in the block's text
    product_number = ''.join(re.findall(r'\d{10}', div.text))
    print(product_number)
    # extract alstar link
    alstar_link = div.select_one('a[href*="alstar_label.aspx"]')
    if alstar_link:
        print(alstar_link['href'])
    else:
        print('No ALSTAR link')
    # extract Registration year: the span that follows the label text
    year_label = div.find(string=lambda t: 'Registration Year:' in t)
    year_span = year_label.find_next('span') if year_label else None
    if year_span:
        print(year_span.text)
    else:
        print('No registration year.')
    print('-' * 80)
Prints:
PRECEPT INSECTICIDE
2014000996
No ALSTAR link
2019
--------------------------------------------------------------------------------
ACCELERON IC-609 INSECTICIDE SEED TREATMENT FOR CORN
2009005053
alstar_label.aspx?LabelId=117531
2019
--------------------------------------------------------------------------------
ACCELERON D-342 FUNGICIDE SEED TREATMENT
2015000498
alstar_label.aspx?LabelId=117538
2019
--------------------------------------------------------------------------------
ACCELERON DX-309
2009005026
alstar_label.aspx?LabelId=117559
2019
--------------------------------------------------------------------------------
... and so on.

how to .find() from the actual div in my for

I'm parsing a huge file; the following HTML code is only a small part of it. The first div appears many times. Within each of these divs I want to get the different values of the <a> tag; I don't mind if I also get the elements nested inside the <a>.
I'm doing this but It doesn't work :
from bs4 import BeautifulSoup
import requests
import re
page_url = 'https://paris-sportifs.pmu.fr/'
page = requests.get(page_url)
soup = BeautifulSoup(page.text, 'html.parser')
# NOTE(review): 'pmu.html' is opened for appending, but nothing in this
# snippet writes to it
with open('pmu.html', 'a+') as file:
    for div in soup.find_all('div', class_ = 'time_group', attrs={ 'data-time_group' : re.compile("group[1-9]") }):
        # div.find() searches only inside the current div, not the whole soup
        event_information = div.find('a', class_ = 'trow--event tc-track-element-events')
        # Collapse every run of whitespace into a single space before printing
        print(re.sub(r'\s+', ' ', event_information.text))
An example of HTML :
<div class="time_group" data-time_group="group0">
<div class="row">
<div class="col-sm-12">
<a class="trow--event tc-track-element-events" href="/event/522788/football/football/maroc-botola-pro-1/rsb-berkane-rapide-oued-zem" data-event_id="rsb_berkane__rapide_oued_zem" data-compet_id="maroc_-_botola_pro_1" data-sport_id="football" data-name="sportif.clic.paris_live.details" data-toggle="tooltip" data-placement="bottom" title="Football - Maroc - Botola Pro 1 - RSB Berkane // Rapide Oued Zem - 29 mars 2018 - 19h00">
<em class="trow--event--name">
<span>RSB Berkane // Rapide Oued Zem</span>
</em>
</a>
</div>
</div>
</div>
With the for loop i get into the different div which interest me but I don't know how I can use this div to do the next : div.find I want to do the find in the element on this div not outside (in the soup).
What I expect :
<a class="trow--event tc-track-element-events" href="/event/522788/football/football/maroc-botola-pro-1/rsb-berkane-rapide-oued-zem" data-event_id="rsb_berkane__rapide_oued_zem" data-compet_id="maroc_-_botola_pro_1" data-sport_id="football" data-name="sportif.clic.paris_live.details" data-toggle="tooltip" data-placement="bottom" title="Football - Maroc - Botola Pro 1 - RSB Berkane // Rapide Oued Zem - 29 mars 2018 - 19h00">
<em class="trow--event--name">
<span>RSB Berkane // Rapide Oued Zem</span>
</em>
</a>
Then I just have to find the different tag values in my var.
I hope my english isn't horrible.
Thank you, in advance for your valuable assistance
EDIT 1 :
Let's take an exemple of source code : https://pastebin.com/KZBp9c3y
in this file when i do for div in soup.find_all('div', class_ = 'time_group', attrs={ 'data-time_group' : re.compile("group[1-9]") }): I find the first div but imagine we have multiple match in the for loop.
Then I want to find in this div the element with tag a and class trow--event... div.find('a', class_ = 'trow--event tc-track-element-events')
An exemple of possible result is:
data-event_id="brescia__pescara"
data-compet_id="italie_-_serie_b"
data-sport_id="football"
score-both :
Anyway the problem is that I don't know how to do a find from the div where I am. I'm in <div class="time_group" data-time_group="group1"> and I want to get different information. I want to parse the div from the top to the bottom.
concretely :
for div in soup:
if current_div is:
do this.....
else if:
do this...
How can I get the current_div ?
Tell me if you don't understand what I want.
Thanks you

I've found something; it's not exactly what I wanted, but it works:
from bs4 import BeautifulSoup
import requests
import re
page_url = 'https://paris-sportifs.pmu.fr/'
page = requests.get(page_url)
soup = BeautifulSoup(page.text, 'html.parser')
soupdiv = soup.find_all('div', class_ = 'time_group', attrs={ 'data-time_group' : re.compile("group[1-9]") })
for div in soupdiv:
    # The search is limited to the descendants of the current div
    test = div.find("a", {"class":"trow--event tc-track-element-events"})
    if test is None:
        # This group has no event link; nothing to print for it
        continue
    print(test.text)
I'm doing my find from the current div inside the for loop.
thanks you.

Python (soup): get nested data and get last item in a tag

So I have an html document that looks something like this:
<title>Speaker Name: Title of Talk | Subtitle | website.com</title>
... [Other Stuff]
<div class='meta'><span class='meta__item'>
Posted
<span class='meta__val'>
Jun 2006
</span></span><span class='meta__row'>
Rated
<span class='meta__val'>
Funny, Informative
</span></span></div>
<div class='talk-article__body talk-transcript__body'> TEXT
<data class='talk-transcript__para__time'>15:57</data>
I have 2200 files like this, and I am hoping to put them all into a CSV file with columns of AUTHOR, TITLE, DATE, LENGTH, and TEXT. Right now, what I have is not the prettiest code, but it works:
from bs4 import BeautifulSoup as soup
# `file` is presumably the path of one talk page, set by surrounding code
# that is not shown; the with block closes the handle after reading
with open(file) as handle:
    soup = soup(handle.read(), "lxml")
# The <title> reads "Speaker Name: Title of Talk | Subtitle | website.com"
at = soup.find("title").text
author = at[0:at.find(':')]
title = at[at.find(":")+1 : at.find("|") ]
text = soup.find("div", attrs={ "class" : "talk-article__body"}) # still needs cleaning
# Placeholders: how to fill these two values is what the question asks
date = None
length = None
I cannot for the life of me figure out how to get at the date: I suspect it's a combination of soup and re, but I confess that I can't wrap my head around the combination.
The trick with the length is that what I want to find is the LAST time <data class='talk-transcript__para__time'> occurs in the file and grab THAT value.

You can try this
# Date: the first span.meta__val whose text looks like "Mon YYYY"
date_spans = soup.find_all('span', {'class' : 'meta__val'})
date_matches = []
for span in date_spans:
    candidate = span.get_text().strip("\n\r")
    if re.search(r"(?s)[A-Z][a-z]{2}\s+\d{4}", candidate):
        date_matches.append(candidate)
date = date_matches[0]
print(date)
#date = re.findall(r"(?s)<span class=.*?>\s*([A-Z][a-z]{2}\s+\d{4})", str(soup))
# Length: the LAST talk-transcript__para__time value that looks like "MM:SS"
length_data = soup.find_all('data', {'class' : 'talk-transcript__para__time'})
time_matches = []
for stamp in length_data:
    candidate = stamp.get_text().strip("\n\r")
    if re.search(r"(?s)\d{2}:\d{2}", candidate):
        time_matches.append(candidate)
length = time_matches[-1]
print(length)
#length = re.findall(r"(?s).*<data class=.*?>(.*)</data>", str(soup))
Output
Jun 2006
15:57

You don't need a regex for the date if the first meta__val is the date, you definitely don't need it for the time as you can just use the class name talk-transcript__para__time:
from bs4 import BeautifulSoup
# Sample document reproducing the structure shown in the question.
h = """<title>Speaker Name: Title of Talk | Subtitle | website.com</title>
<div class='meta'><span class='meta__item'>
Posted
<span class='meta__val'>
Jun 2006
</span></span><span class='meta__row'>
Rated
<span class='meta__val'>
Funny, Informative
</span></span></div>
<div class='talk-article__body talk-transcript__body'> TEXT
<data class='talk-transcript__para__time'>15:57</data>"""
soup = BeautifulSoup(h,"html.parser")
# select_one returns only the first match, so this picks the first
# span.meta__val, which holds the date in the sample above.
date = soup.select_one("span.meta__val").text
# The time is selected directly by its class name; no regex is involved.
time = soup.select_one("data.talk-transcript__para__time").text
print(date, time)
Output:
(u'\nJun 2006\n', u'15:57')
If you were using a regex you would pass it to find or find_all:
# Abbreviated month name, whitespace, then four digits - e.g. "Jun 2006".
r = re.compile(r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}")
soup = BeautifulSoup(h, "html.parser")
# The compiled pattern is passed to find() as a text filter alongside the
# class filter; strip() removes the whitespace around the value.
date = soup.find("span", {"class": "meta__val"}, text=r).text.strip()
Which would give you:
'Jun 2006'

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

BeautifulSoup 4 - Scraping element (h2) outside of div - python

Related

Can't print text inside 'p' tag using BeautifulSoup

How to extract Date using Selenium Webdriver

Beautifulsoup4 - not selecting all instances of span class

how to .find() from the actual div in my for

Python (soup): get nested data and get last item in a tag

Categories

Resources