Python: BeautifulSoup extract all the heading text from div class - python

import requests
from bs4 import BeautifulSoup

# Fetch the events calendar and parse it with the lxml backend.
response = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(response.text, "lxml")
# Every event card sits in a grid-column div with this class combination.
event_containers = soup.find_all('div', class_="col-xs-12 col-sm-6 col-md-8")
# The event title is the first <h3> inside the first card.
print(event_containers[0].h3.text)
Using this code I'm able to extract the first event's name. How can I loop over the containers to extract all the event names and dates? I'm also trying to extract the location information, which is only visible after clicking the "Read more" link.

event_containers is a bs4.element.ResultSet object, which is basically a list of Tag objects.
Just loop over the tags in event_containers and select h3 for the title, div.date for the date and a for the URL, example:
# Loop over each event card: <h3> holds the title, div.date the date,
# and the first <a> the link to the event's detail page.
for tag in event_containers:
    print(tag.h3.text)
    print(tag.select_one('div.date').text)
    print(tag.a['href'])
Now, for the location information you'll have to visit each URL and collect the text in div.event-add.
Full code:
import requests
from bs4 import BeautifulSoup

# Collect all event cards from the calendar page.
res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text, "lxml")
event_containers = soup.find_all('div', class_="col-xs-12 col-sm-6 col-md-8")
base_url = 'http://aicd.companydirectors.com.au'
for tag in event_containers:
    # Card links are relative, so prefix the site root.
    link = base_url + tag.a['href']
    # The location is only on the detail page, inside div.event-add;
    # use a separate soup so the outer listing soup is not clobbered.
    detail_soup = BeautifulSoup(requests.get(link).text, "lxml")
    # Drop the first and last stripped strings (label/footer text around the address).
    location = ', '.join(list(detail_soup.select_one('div.event-add').stripped_strings)[1:-1])
    print('Title:', tag.h3.text)
    print('Date:', tag.select_one('div.date').text)
    print('Link:', link)
    print('Location:', location)

Try this to get all the events and dates you are after:
import requests
from bs4 import BeautifulSoup

res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text, "lxml")
# Each event title carries class "lead"; the immediately preceding sibling
# holds "date | time" text, so keep only the part before " |".
for item in soup.find_all(class_='lead'):
    date = item.find_previous_sibling().text.split(" |")[0]
    print(item.text, date)

Related

How to get multiple class to BeautifulSoup?

Trying to get torrent links from skidrowreloaded.
On the post detail page there is a div like the one below. I tried to get this div by id, but the id appears to be dynamic, so I tried getting it by class instead — that did not work either.
<div id="tabs-105235-0-0" aria-labelledby="ui-id-1" class="ui-tabs-panel ui-widget-content ui-corner-bottom" role="tabpanel" aria-hidden="false">
the following code is returning none
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
err:
AttributeError: 'NoneType' object has no attribute 'find_all'
full code:
import os
import webbrowser

import requests
from bs4 import BeautifulSoup

# Clear the console (Windows-only: 'cls').
os.system('cls')

# Grab the front page; each <h2> wraps a link to a post.
r = requests.get('https://www.skidrowreloaded.com/')
source = BeautifulSoup(r.content, "lxml")
games = [h2.a.get("href") for h2 in source.find_all("h2")]
lastgame = games[0]  # most recent post

# Fetch the post detail page and locate the download-links tab panel.
# NOTE(review): this find() can return None when the class names differ
# (the tab ids/classes on this site are dynamic), which then raises
# AttributeError on .find_all — match on a stable attribute instead.
r = requests.get(lastgame)
source = BeautifulSoup(r.content, "lxml")
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
source3 = source2.find_all("a")

# Skip the first anchor (Steam link), then open every real mirror link.
for anchor in source3[1:]:
    href = anchor.get("href")
    if href == "https://www.skidrowreloaded.com":  # hide null links
        continue
    print(href)
    webbrowser.open(href)
To get all the links try this:
import requests
from bs4 import BeautifulSoup

url = "https://www.skidrowreloaded.com/projection-first-light-goldberg/"
# Anchors that open in a new tab are the download mirrors on this site.
anchors = BeautifulSoup(requests.get(url).text, "html.parser").find_all("a", {"target": "_blank"})
skip = 'https://www.skidrowreloaded.com'
# Keep only real https links, dropping the placeholder self-link.
links = []
for a in anchors:
    href = a['href']
    if href.startswith('https') and href != skip:
        links.append(href)
print(links)
Output:
['https://store.steampowered.com/app/726490/Projection_First_Light/', 'https://mega.nz/file/geogAATS#-0U0PklF-Q5i5l_SELzYx3klh5FZob9HaD4QKcFH_8M', 'https://uptobox.com/rqnlpcp7yb3v', 'https://1fichier.com/?0syphwpyndpo38af04ky', 'https://yadi.sk/d/KAmlsBmGaI1f2A', 'https://pixeldra.in/u/wmcsjuhv', 'https://dropapk.to/v6r7mjfgxjq6', 'https://gofile.io/?c=FRWL1o', 'https://racaty.net/dkvdyjqvg02e', 'https://bayfiles.com/L0k7Qea2pb', 'https://tusfiles.com/2q00y4huuv15', 'https://megaup.net/2f0pv/Projection.First.Light-GoldBerg.zip', 'https://letsupload.org/88t5', 'https://filesupload.org/0d7771dfef54d055', 'https://dl.bdupload.in/17ykjrifizrb', 'https://clicknupload.co/o0k9dnd3iwoy', 'https://dailyuploads.net/n1jihwjwdmjp', 'https://userscloud.com/nircdd4q1t5w', 'https://rapidgator.net/file/b6b8f5782c7c2bdb534214342b58ef18', 'https://turbobit.net/m308zh1hdpba.html', 'https://hitfile.net/5OhkcqZ', 'https://filerio.in/0wbvn4md4i91', 'https://mirrorace.org/m/1Fiic', 'https://go4up.com/dl/0ee9f4866312b5/Projection.First.Light-GoldBerg.zip', 'https://katfile.com/w74l823vuyw5/Projection.First.Light-GoldBerg.zip.html', 'https://multiup.org/download/3d355ba18d58234c792da7a872ab4998/Projection.First.Light-GoldBerg.zip', 'https://dl1.indishare.in/hs55pkx4ex82']
You can use find_all as noted in the BeautifulSoup documentation
import requests
from bs4 import BeautifulSoup

response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')
# find_all returns a ResultSet of every div carrying all three classes.
raw_data = soup.find_all(
    "div",
    class_="ui-tabs-panel ui-widget-content ui-corner-bottom",
)
# do something with the data
edit -
looking at the response.text, the div exists, but does not have the class you're looking for, hence it returns empty. You can search by using regex like so
import re

import requests
from bs4 import BeautifulSoup

response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')
# The div ids are dynamic ("tabs-<number>-0-0"), so match on the stable
# "tabs" prefix instead of an exact id or the class list.
raw_data = soup.find_all("div", id=re.compile("^tabs"))
for ele in raw_data:
    a_tag = ele.find("a")
    # do something with the a_tag

Scraping websites with BS4

I have this code
import requests
from bs4 import BeautifulSoup

result = requests.get("http://www.cvbankas.lt/")
src = result.content
soup = BeautifulSoup(src, 'lxml')

# Collect, per <article>, the job link href and the class list of its
# first <span> (the original code appended both to one flat list).
urls = []
for article_tag in soup.find_all("article"):
    a_tag = article_tag.find('a')
    urls.append(a_tag.attrs['href'])
    div_tag = article_tag.find('span')
    urls.append(div_tag.attrs['class'])
print(urls)
Can anyone explain how to get the data marked in red?
You can get span with the class label "salary_amount"
salary_object = article_tag.find("span", class_= "salary_amount")
and then extract the text with the .text attribute of the created object.

get_text doesn't work for a div tag

I am scraping this website: https://icodrops.com/quarkchain/
I want to get the date the token sale ended, which is "14 February". This is stored in a div tag with the class "sale-date". However, when I call ".get_text" on this div tag, I get this:
<bound method Tag.get_text of <div class="sale-date">14 February</div>>
Here is my code:
from bs4 import BeautifulSoup
import requests
page = requests.get("https://icodrops.com/quarkchain/")
soup = BeautifulSoup(page.content, 'html.parser')
pt1 = soup.find(class_ = "white-desk ico-desk")
# BUG (the subject of this question): missing parentheses, so this binds the
# method object itself rather than calling it — printing it shows
# "<bound method Tag.get_text of ...>". It should be .get_text().
date = pt1.find(class_= "sale-date").get_text
print(date)
Are there any other ways I can extract the text inside the div tag?
Try this. You forgot to use () at the end of .get_text which should be .get_text():
import requests
from bs4 import BeautifulSoup

# Calling get_text() — with parentheses — returns the tag's inner text.
page = requests.get("https://icodrops.com/quarkchain/")
soup = BeautifulSoup(page.content, 'html.parser')
sale_date = soup.find(class_="sale-date").get_text()
print(sale_date)
Change:
date = pt1.find(class_= "sale-date").get_text
To:
date = pt1.find(class_= "sale-date").get_text()

How to extract href links from anchor tags using BeautifulSoup?

I've been trying to extract just the links corresponding to the jobs on each page, but for some reason they don't print when I execute the script. No errors occur.
for the inputs I put engineering, toronto respectively. Here is my code.
import requests
from bs4 import BeautifulSoup
import webbrowser

jobsearch = input("What type of job?: ")
location = input("What is your location: ")
# NOTE(review): raw user input is interpolated into the query string;
# consider urllib.parse.quote_plus for inputs containing spaces/symbols.
url = "https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
all_job_url = []
# Job titles are wrapped in divs tagged data-tn-element="jobTitle";
# the anchors inside them carry the job links.
for tag in prettify.find_all('div', {'data-tn-element': "jobTitle"}):
    for links in tag.find_all('a'):
        print(links['href'])
You should be looking for the anchor a tag. It looks like this:
<a class="turnstileLink" data-tn-element="jobTitle" href="/rc/clk?jk=3611ac98c0167102&fccid=459dce363200e1be" ...>Project <b>Engineer</b></a>
Call soup.find_all and iterate over the result set, extracting the links through the href attribute.
import requests
from bs4 import BeautifulSoup

# valid query, replace with something else
url = "https://ca.indeed.com/jobs?q=engineer&l=Calgary%2C+AB"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
all_job_url = []
# The data-tn-element="jobTitle" attribute sits on the <a> itself,
# so search anchors directly and collect each href.
for tag in soup.find_all('a', {'data-tn-element': "jobTitle"}):
    all_job_url.append(tag['href'])

How to get next page link in python beautifulsoup?

I have this link:
http://www.brothersoft.com/windows/categories.html
I am trying to to get the link for the item inside the div.
Example:
http://www.brothersoft.com/windows/mp3_audio/midi_tools/
I have tried this code:
# Modernized from Python 2: urllib.urlopen is now urllib.request.urlopen,
# print is a function, and BeautifulSoup should be given an explicit parser.
from urllib.request import urlopen

from bs4 import BeautifulSoup

url = 'http://www.brothersoft.com/windows/categories.html'
pageHtml = urlopen(url).read()
soup = BeautifulSoup(pageHtml, 'html.parser')
# div.brLeft holds the category links (left column only).
sAll = [div.find('a') for div in soup.findAll('div', attrs={'class': 'brLeft'})]
for i in sAll:
    print("http://www.brothersoft.com" + i['href'])
But I only get output:
http://www.brothersoft.com/windows/mp3_audio/
How can I get output that I needed?
Url http://www.brothersoft.com/windows/mp3_audio/midi_tools/ is not in tag <div class='brLeft'>, so if output is http://www.brothersoft.com/windows/mp3_audio/, that's correct.
If you want to get the url you want, change
sAll = [div.find('a') for div in soup.findAll('div', attrs={'class':'brLeft'})]
to
sAll = [div.find('a') for div in soup.findAll('div', attrs={'class':'brRight'})]
UPDATE:
an example to get info inside 'midi_tools'
# Modernized from Python 2 (urllib.request.urlopen, print function,
# explicit parser) and re-indented.
from urllib.request import urlopen

from bs4 import BeautifulSoup

url = 'http://www.brothersoft.com/windows/categories.html'
pageHtml = urlopen(url).read()
soup = BeautifulSoup(pageHtml, 'html.parser')
# div.brRight holds the sub-category links (e.g. 'midi_tools').
sAll = [div.find('a') for div in soup.findAll('div', attrs={'class': 'brRight'})]
for i in sAll:
    suburl = "http://www.brothersoft.com" + i['href']  # sub-category page URL
    content = urlopen(suburl).read()
    anosoup = BeautifulSoup(content, 'html.parser')
    ablock = anosoup.find('table', {'id': 'courseTab'})
    # Each row lists one program: its name and a download link.
    for atr in ablock.findAll('tr', {'class': 'border_bot '}):
        print(atr.find('dt').a.string)  # name
        print("http://www.brothersoft.com" + atr.find('a', {'class': 'tabDownload'})['href'])  # link

Categories