Web Scraping with BeautifulSoup and Python : failed to extract text - python

i try to scrape a website. But i failed to extract the description of each item. Here is my code:
from bs4 import BeautifulSoup
import requests
url = "http://engine.ddtc.co.id/putusan-pengadilan-pajak"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
puts =soup.find_all("div",{"class":"p3-search-item"})
for put in puts:
title = put.find("div", {"class":"p3-title"}).text
cat = put.find("div", {"class":"p3-category"}).text
date = put.find("div", {"class":"search-result-item-meta"}).text
link = put.find("a").get("href")
put_response = requests.get(link)
put_data = put_response.text
put_soup = BeautifulSoup(put_data, "html.parser")
put_description = put_soup.find("div",{"id": "modal-contents-pp"}).text
print("Judul Putusan:", title, "\nKategori:", cat, "\nTanggal:", date, "\nLink:", link, "\nDescription:", put_description)
So i failed to extract the description.
The description only show blank and few words. The full description can be shown if we click each item's link.
Really appreciate any help.

I think you need to change the put_description field:
from bs4 import BeautifulSoup
import requests
url = "http://engine.ddtc.co.id/putusan-pengadilan-pajak"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
puts =soup.find_all("div",{"class":"p3-search-item"})
for put in puts:
title = put.find("div", {"class":"p3-title"}).text
cat = put.find("div", {"class":"p3-category"}).text
date = put.find("div", {"class":"search-result-item-meta"}).text
link = put.find("a").get("href")
put_response = requests.get(link)
put_data = put_response.text
put_soup = BeautifulSoup(put_data, "html.parser")
put_description = put_soup.find("div",{"class": "p3-desc"}).text
print("Judul Putusan:", title, "\nKategori:", cat, "\nTanggal:", date, "\nLink:", link, "\nDescription:", put_description)

Related

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- a code that (somewhat) succesfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
links = []
website = requests.get(url)
website_text = website.text
soup = BeautifulSoup(website_text)
for link in soup.find_all('a'):
links.append(link.get('href'))
for link in links:
print(link)
print(len(links))
get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
profiles = []
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
for profile in container.find_all('a'):
profiles.append(profile.get('p'))
for profile in profiles:
print(profile)
get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!
You need to combine your two scripts together and make requests for each player. Try the following approach. This searches for <td> tags that have the data-td=Player attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
data = []
req_url = requests.get(url)
soup = BeautifulSoup(req_url.content, "html.parser")
for td in soup.find_all('td', {'data-th' : 'Player'}):
a_tag = td.a
name = a_tag.text
player_url = a_tag['href']
print(f"Getting {name}")
req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
soup_player = BeautifulSoup(req_player_url.content, "html.parser")
div_profile_box = soup_player.find("div", class_="profile-box")
row = {"Name" : name, "URL" : player_url}
for p in div_profile_box.find_all("p"):
try:
key, value = p.get_text(strip=True).split(':', 1)
row[key.strip()] = value.strip()
except: # not all entries have values
pass
data.append(row)
return data
urls = [
'https://basketball.realgm.com/dleague/players/2022',
'https://basketball.realgm.com/dleague/players/2021',
'https://basketball.realgm.com/dleague/players/2020',
]
for url in urls:
print(f"Getting: {url}")
data = get_links(url)
for entry in data:
print(entry)

How can I change the code to make it such that the html tags do not appear

from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt)
with the above code, i get the output:
[<*h1 class="celeb-name">Ayden Sng</h1*>] #asterisks added to show h1 tags
What do i need to change in my code or how can i make it such that i only get 'Ayden Sng' as my output?
Iterate over each entry of the txt list and extract its txt property:
txt = [element.text for element in txt] # ['Ayden Sng']
Repl.it
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt[0].text)
if there are more than one reuslt you can use this code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
for i in txt:
print(i.text)

Struggling to scrape a telegram public channel with BeautifulSoup

I'm practicing web scraping with BeautifulSoup but I struggle to finish printing a dictionary including the items I've scraped
The web targeted can be any telegram public channel (web version) and I pretend to collect and add as part of the dictionary the text message, timestamp, views and image url (if exist attached to the post).
I've inspected the code for the 4 elements but the one related to the image url has no class or span, so I've ended scraping them it via regex. The other 3 elements are easily retrievable.
Let's go by parts:
Importing modules
from bs4 import BeautifulSoup
import requests
import re
Function to get the images url from the public channel
def pictures(url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
link = str(soup.find_all('a', class_ = 'tgme_widget_message_photo_wrap')) #converted to str in order to be able to apply regex
image_url = re.findall(r"https://cdn4.*.*.jpg", link)
return image_url
Soup to get the text message, timestamp and views
picture_list = pictures(url)
url = "https://t.me/s/computer_science_and_programming"
channel = requests.get(url).text
soup = BeautifulSoup(channel, 'lxml')
tgpost = soup.find_all('div', class_ ='tgme_widget_message')
full_message = {}
for content in tgpost:
full_message['views'] = content.find('span', class_ = 'tgme_widget_message_views').text
full_message['timestamp'] = content.find('time', class_ = 'time').text
full_message['text'] = content.find('div', class_ = 'tgme_widget_message_text').text
print(full_message)
I would really appreciate if someone can help me, I'm new to Python and I don't know how I could do it to
Check if the post contains an image and if so, add it to the dictionary
Print the dictionary including image_url as key and the url as value for each post.
Thank you very much
I think you want something like this.
from bs4 import BeautifulSoup
import requests, re
url = "https://t.me/s/computer_science_and_programming"
channel = requests.get(url).text
soup = BeautifulSoup(channel, 'lxml')
tgpost = soup.find_all('div', class_ ='tgme_widget_message')
full_message = {}
for content in tgpost:
full_message['views'] = content.find('span', class_ = 'tgme_widget_message_views').text
full_message['timestamp'] = content.find('time', class_ = 'time').text
full_message['text'] = content.find('div', class_ = 'tgme_widget_message_text').text
if content.find('a', class_ = 'tgme_widget_message_photo_wrap') != None :
link = str(content.find('a', class_ = 'tgme_widget_message_photo_wrap'))
full_message['url_image'] = re.findall(r"https://cdn4.*.*.jpg", link)[0]
elif 'url_image' in full_message:
full_message.pop('url_image')
print(full_message)

Using beautiful soup to scrape data from indeed

i am trying to use bs to scrape resume on indeed but i met some problems
here is the sample site: https://www.indeed.com/resumes?q=java&l=&cb=jt
here is my code:
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')
def scrape_job_title(soup):
job = []
for div in soup.find_all(name='li', attrs={'class':'sre'}):
for a in div.find_all(name='a', attrs={'class':'app-link'}):
job.append(a['title'])
return(job)
scrape_job_title(soup)
it print out nothing: []
As you can see in the picture, I want to grab the job title "Java developer".
The class is app_link, not app-link. Additionally, a['title'] doesn't do what you want. Use a.contents[0] instead.
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')
def scrape_job_title(soup):
job = []
for div in soup.find_all(name='li', attrs={'class':'sre'}):
for a in div.find_all(name='a', attrs={'class':'app_link'}):
job.append(a.contents[0])
return(job)
scrape_job_title(soup)
Try this to get all the job titles:
import requests
from bs4 import BeautifulSoup
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html5lib')
for items in soup.select('.sre'):
data = [item.text for item in items.select('.app_link')]
print(data)

Unable to scrape name from google finance

I want to scrape name, url and description of companies as listed on google finance. So far I am successful in getting description and url but unable to fetch the name. In the source code of myUrl, name is 024 Pharma Inc. When I see the div, the class is named 'appbar-snippet-primary'. But still the code doesn't find it. I ma new to web scraping so may be I am missing something. Please guide me in this regard.
from bs4 import BeautifulSoup
import urllib
import csv
myUrl = 'https://www.google.com/finance?q=OTCMKTS%3AEEIG'
r = urllib.urlopen(myUrl).read()
soup = BeautifulSoup(r, 'html.parser')
name_box = soup.find('div', class_='appbar-snippet-primary') # !! This div is not found
#name = name_box.text
#print name
description = soup.find('div', class_='companySummary')
desc = description.text.strip()
#print desc
website = soup.find('div', class_='item')
site = website.text
#print site
from bs4 import BeautifulSoup
import requests
myUrl = 'https://www.google.com/finance?q=OTCMKTS%3AEEIG'
r = requests.get(myUrl).content
soup = BeautifulSoup(r, 'html.parser')
name = soup.find('title').text.split(':')[0] # !! This div is not found
#print name
description = soup.find('div', class_='companySummary')
desc = description.text.strip()
#print desc
website = soup.find('div', class_='item')
site = website.text
write soup.find_all() instead of soup.find()

Categories