Indeed scraper bs4, splitting parsed HTML code after grabbing it

Indeed scraper bs4, splitting parsed HTML code after grabbing it - python

import pandas as pd
from bs4 import BeautifulSoup
import requests
import os
url = 'https://fr.indeed.com/jobs?q=data%20anlayst&l=france'
#grabbing page content and parsing it into html
def data_grabber(url):
    """Download *url* and return the job-card ``<div>`` elements it contains.

    The response body is parsed with the built-in ``html.parser`` and every
    ``div`` whose class is ``job_seen_beacon`` is returned.
    """
    response = requests.get(url)
    parsed = BeautifulSoup(response.text, 'html.parser')
    return parsed.find_all('div', {"class": "job_seen_beacon"})
def job_title(url):
    """Return the ``<tbody>`` elements of the first job card found at *url*.

    Returns ``None`` when the page yields no job cards.
    """
    for card in data_grabber(url):
        # The return sits inside the loop, so the function ends on the
        # very first card and the remaining cards are never visited.
        tbodies = card.find_all('tbody')
        return tbodies
This is my source code. I'm testing it in a Jupyter notebook to make sure my functions work correctly, but I've hit a small roadblock. The HTML soup from my first function works perfectly: it grabs all the info from Indeed, in particular every element with the job_seen_beacon class.
My job_title function is wrong because it only outputs the first 'tbody' element it finds (refer to the image here; I don't have enough points on Stack Overflow),
while my data_grabber returns every single job_seen_beacon. If you were able to scroll, you would easily see the multiple job_seen_beacon elements.
I'm clearly missing something but I can't see it. Any ideas?

What happens?
The moment you return something from a function, you leave that function, and here that happens in the first iteration of the loop.
I'm not sure where you want to end up with your code, but you can do something like this:
def job_title(item):
    """Return the job title of one job card, or ``'No Title'``.

    Args:
        item: a parsed job-card element exposing ``select_one``.

    Returns:
        The last non-empty text fragment of the card's ``<h2>``, or
        ``'No Title'`` when there is no ``<h2>`` or it holds no text.
    """
    title = item.select_one('h2')
    if title is None:
        return 'No Title'
    # Walk the heading's text fragments directly instead of joining them
    # with '|' and splitting again: a title that itself contains '|' would
    # otherwise be cut at that character.
    fragments = list(title.stripped_strings)
    return fragments[-1] if fragments else 'No Title'
Example
from bs4 import BeautifulSoup
import requests
url = 'https://fr.indeed.com/jobs?q=data%20anlayst&l=france'
#grabbing page content and parsing it into html
def data_grabber(url, timeout=10):
    """Download *url* and return the job-card ``<div>`` elements it contains.

    Args:
        url: address of the search-results page to fetch.
        timeout: seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        Every ``div`` with the ``job_seen_beacon`` class found in the page,
        parsed with the built-in ``html.parser``.

    Raises:
        requests.exceptions.Timeout: the server did not answer in time.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    return soup.find_all('div', {"class": "job_seen_beacon"})
def job_title(item):
    """Return the job title of one job card, or ``'No Title'``.

    Args:
        item: a parsed job-card element exposing ``select_one``.

    Returns:
        The last non-empty text fragment of the card's ``<h2>``, or
        ``'No Title'`` when there is no ``<h2>`` or it holds no text.
    """
    title = item.select_one('h2')
    if title is None:
        return 'No Title'
    # Walk the heading's text fragments directly instead of joining them
    # with '|' and splitting again: a title that itself contains '|' would
    # otherwise be cut at that character.
    fragments = list(title.stripped_strings)
    return fragments[-1] if fragments else 'No Title'
def job_location(item):
    """Return the location text of one job card, or ``'No Location'``.

    The location is read from the first ``div.companyLocation`` element
    inside *item*, with surrounding whitespace stripped.
    """
    place = item.select_one('div.companyLocation')
    if not place:
        return 'No Location'
    return place.get_text(strip=True)
# One record per job card. Leaving `data` as the last expression makes a
# notebook cell display the resulting list.
data = [
    {'title': job_title(card), 'companyLocation': job_location(card)}
    for card in data_grabber(url)
]
data
Output
[{'title': 'Chef de Projet Big Data H/F', 'companyLocation': 'Lyon (69)'},{'title': 'Chef de Projet Big Data F/H', 'companyLocation': 'Lyon 9e (69)'}]

Related

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- code that (somewhat) successfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# Fetch one player's summary page and parse it.
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
# Attribute filters are a dict of {attribute: value}. The original passed
# the set literal {'class', 'main-container'}, which is not an attribute
# mapping and only matched by accident.
container = soup.find('div', attrs={'class': 'main-container'})
# Every <p> inside the main container.
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
    """Print the ``href`` of every ``<a>`` tag at *url*, then their count.

    Anchors without an ``href`` attribute are printed as ``None`` and are
    included in the count.
    """
    website = requests.get(url)
    # Name the parser explicitly. Without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # from one machine to another.
    soup = BeautifulSoup(website.text, 'html.parser')
    links = [link.get('href') for link in soup.find_all('a')]
    for link in links:
        print(link)
    print(len(links))


get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
    """Print, for every ``<a>`` in the page's main container, its ``p`` attribute."""
    response = requests.get(url)
    page = BeautifulSoup(response.text, 'html.parser')
    container = page.find('div', attrs={'class', 'main-container'})
    # Tag.get() reads an attribute of the tag, so this asks each <a> element
    # for an attribute named 'p'.
    profiles = [anchor.get('p') for anchor in container.find_all('a')]
    for profile in profiles:
        print(profile)


get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!

You need to combine your two scripts and make a request for each player. Try the following approach. It searches for <td> tags that have the data-th="Player" attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
    """Collect the profile-box data of every player listed at *url*.

    For each ``<td data-th="Player">`` cell the linked player page is
    fetched and every "key: value" paragraph of its ``profile-box`` div is
    stored.

    Returns:
        A list with one dict per player, holding ``"Name"``, ``"URL"`` and
        one entry per key/value paragraph found.
    """
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        if a_tag is None:
            # Cell without a link: there is no player page to follow.
            continue
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")

        row = {"Name": name, "URL": player_url}
        # A page without a profile box still yields a row with name and URL.
        paragraphs = div_profile_box.find_all("p") if div_profile_box is not None else []
        for p in paragraphs:
            key, colon, value = p.get_text(strip=True).partition(':')
            if colon:
                # Paragraphs without a colon carry no key/value pair.
                row[key.strip()] = value.strip()
        data.append(row)

    return data
# Roster pages to scrape, newest season first.
urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
]

for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    # Print the rows of this page before moving on to the next one.
    for entry in data:
        print(entry)

Remove html element from scraping result python

I'm scraping an Indonesian news website from here. When I scrape the news articles from each news link, there are some HTML elements in the text. The output looks like this:
I want to remove those elements so that the output is just the article. I already use .strip(), but it doesn't affect the output. This is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
# Fetch the "most popular" listing and collect its article cards.
detik = requests.get('https://www.detik.com/terpopuler')
beautify = BeautifulSoup(detik.content, 'html5lib')
# Attribute filters are dicts of {attribute: value}. The original passed
# set literals such as {'class', 'list-content__item'}.
news = beautify.find_all('article', {'class': 'list-content__item'})
arti = []
for each in news:
    try:
        title = each.find('h3', {'class': 'media__title'}).text
        lnk = each.a.get('href')
        r = requests.get(lnk)
        soup = BeautifulSoup(r.text, 'html5lib')
        content = soup.find('div', {'class': 'detail__body-text itp_bodycontent'}).text.strip()
    except (AttributeError, requests.RequestException):
        # AttributeError: an expected element is missing (find() gave None).
        # RequestException: the article link is missing, invalid or unreachable.
        # Either way, skip this card and carry on with the next one.
        continue
    print(title)
    print(lnk)
    arti.append({
        'Headline': title,
        'Content': content,
        'Link': lnk
    })
df = pd.DataFrame(arti)
df.to_csv('detik.csv', index=False)
Any help would be appreciated

You might be dealing with invalid tags. This thread might be useful:
https://stackoverflow.com/a/8439761/6100602

How to scrape data from interactive chart using python?

I have the following link, which shows the exact graph I want to scrape: https://index.minfin.com.ua/ua/economy/index/svg.php?indType=1&fromYear=2010&acc=1
I simply can't work out whether it is an XML or an SVG graph, or how to scrape its data. I think I need to use bs4 and requests, but I don't know how.
Could anyone help?

You will load HTML like this:
import requests
# Address of the chart document to download.
url = "https://index.minfin.com.ua/ua/economy/index/svg.php?indType=1&fromYear=2010&acc=1"
resp = requests.get(url)
# Response body as text; this is the markup that gets parsed afterwards.
data = resp.text
Then you create a BeautifulSoup object from this HTML.
from bs4 import BeautifulSoup
# `data` holds the response text downloaded above. The original passed an
# undefined name `html` here, which raises NameError.
soup = BeautifulSoup(data, features="html.parser")
After this, how you parse out what you want is quite subjective, and candidate solutions may vary a lot. This is how I did it:
Using BeautifulSoup, I collected all "rect" elements and checked whether each one has an "onmouseover" attribute.
# Collect the [first, second] token pair quoted in each rect's onmouseover
# handler.
yx_points = []
for rect in soup.svg.find_all("rect"):
    if not rect.has_attr("onmouseover"):
        continue
    handler = rect["onmouseover"]
    # Take what sits between the first pair of single quotes.
    start = handler.index("'") + 1
    stop = handler.index("'", start)
    quoted = handler[start:stop]
    print(quoted)
    yx_points.append(quoted.split())
As you can see from the image below, I scraped the onmouseover= part and got values such as 02.2015 155,1.
Here, this is how yx_points looks like now:
[['12.2009', '100,0'], ['01.2010', '101,8'], ['02.2010', '103,7'], ...]

from bs4 import BeautifulSoup
import requests
import re
# First get all the text from the url.
url = "https://index.minfin.com.ua/ua/economy/index/svg.php?indType=1&fromYear=2010&acc=1"
response = requests.get(url)
html = response.text

# Find all the tags in which the data is stored.
soup = BeautifulSoup(html, 'lxml')
# Text between the first pair of single quotes in a handler string.
first_quoted = re.compile(r"'(.*?)'")
final = []
for rect in soup.find_all("rect"):
    handler = rect.get('onmouseover')
    if handler is None:
        # Rect without a hover handler: skip it rather than relying on an
        # exception to do so.
        continue
    match = first_quoted.search(handler)
    if match:
        final.append(match.group(1))
# The details are appended to the final variable

none returned when trying to get tag value

In this html snippet from https://letterboxd.com/shesnicky/list/top-50-favourite-films/, I'm trying to go through all the different li tags and get the info from 'data-target-link' so I can then use that to create a new link that takes me to the page for that film, however every time I try and get the data it simply returns None or an error along those lines.
<li class="poster-container numbered-list-item" data-owner-rating="10"> <div class="poster film-poster really-lazy-load" data-image-width="125" data-image-height="187" data-film-slug="/film/donnie-darko/" data-linked="linked" data-menu="menu" data-target-link="/film/donnie-darko/" > <img src="https://s3.ltrbxd.com/static/img/empty-poster-125.c6227b2a.png" class="image" width="125" height="187" alt="Donnie Darko"/><span class="frame"><span class="frame-title"></span></span> </div> <p class="list-number">1</p> </li>
I'm going to be using the links to grab imgs for a twitter bot, so I tried doing this within my code:
# Stream listener for the bot. on_status downloads the Letterboxd list page,
# picks an entry at a random index, loads that film's page, reads the poster
# <img> src and hands it to tweet_reply (defined elsewhere) together with
# the status author's screen name and the status id.
# NOTE(review): this is Python 2 code (print statements), and the snippet's
# indentation is missing, so the nesting shown here is not reliable.
class BotStreamer(tweepy.StreamListener):
print "Bot Streamer"
#on_data method of Tweepy's StreamListener
#passes data from statuses to the on_status method
def on_status(self, status):
print "on status"
link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
page = requests.get(link)
soup = BS(page.content, 'html.parser')
movies_ul = soup.find('ul', {'class':'poster-list -p125 -grid film-list'})
movies = []
# NOTE(review): find() takes a tag name as its first argument, but
# 'data-film-slug' is an attribute name, so this looks for a
# <data-film-slug> tag.
for mov in movies_ul.find('data-film-slug'):
movies.append(mov)
# randint includes both endpoints, so the index ranges over 0..51.
# NOTE(review): the list is titled "top 50" - confirm the index can never
# exceed len(movies) - 1.
rand = randint(0,51)
newLink = "https://letterboxd.com%s" % (str(movies[rand]))
newPage = requests.get(newLink)
code = BS(newPage.content, 'html.parser')
# NOTE(review): the class string contains 'film-poster-51910', presumably
# specific to one film - verify it matches every film page.
code_div = code.find\
('div', {'class':'react-component film-poster film-poster-51910 poster'})
image = code_div.find('img')
url = image.get('src')
username = status.user.screen_name
status_id = status.id
tweet_reply(url, username, status_id)
However, I kept getting errors about a list index being out of range, or about not being able to iterate over NoneType. So I made a test program just to see if I could somehow get the data:
import requests
from bs4 import BeautifulSoup as BS
# Minimal reproduction: fetch the list page, narrow down to the first poster
# <li>, then try to read its data-target-link.
# NOTE(review): Python 2 code (print statement).
link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
page = requests.get(link)
soup = BS(page.content, 'html.parser')
movies_ul = soup.find('ul', {'class':'poster-list -p125 -grid film-list'})
more = movies_ul.find('li', {'class':'poster-container numbered-list-item'})
# NOTE(review): 'data-target-link' is passed where find() expects a tag
# name, so this searches for a <data-target-link> tag instead of reading the
# attribute.
k = more.find('data-target-link')
print k
And again, all I get is None. Any help greatly appreciated.

Read the documentation: find() expects a tag name as its first argument, not an attribute name.
You may do
soup.find('div', {'data-target-link': True})
or
soup.find(attrs={'data-target-link': True})
Full example
import requests
from bs4 import BeautifulSoup as BS
link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
page = requests.get(link)
soup = BS(page.content, 'html.parser')
# Keep only the <div> elements that carry a data-target-link attribute
# (True matches any value), then print that attribute for each of them.
for poster in soup.find_all('div', {'data-target-link': True}):
    print(poster['data-target-link'])

Findall to div tag using beautiful soup yields blank return

<div class="columns small-5 medium-4 cell header">Ref No.</div>
<div class="columns small-7 medium-8 cell">110B60329</div>
Website is https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+BOSWALL+PARKWAY%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&DISPLAY_COUNT=10&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY&UARN=110B60329&PPRN=000000000001745&ASSESSOR_IDX=10&DISPLAY_MODE=FULL#results
I would like to run a loop and return '110B60329'. I ran Beautiful Soup and did a find_all('div'), then defined the two different tags as head and data based on their class. I then iterated through the 'head' tags, hoping it would return the info in the div tag I have defined as data.
Python returns a blank (the command prompt just reprinted the file path).
Would anyone kindly know how I might fix this? My full code is below. Thanks.
import requests
from bs4 import BeautifulSoup as soup
import csv
url = 'https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+BOSWALL+PARKWAY%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&DISPLAY_COUNT=10&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY&UARN=110B60329&PPRN=000000000001745&ASSESSOR_IDX=10&DISPLAY_MODE=FULL#results'
baseurl = 'https://www.saa.gov.uk'
# Fetch the search page and look, inside every <div>, for header cells and
# value cells, pairing them by position.
# NOTE(review): Python 2 code (print statement), and the snippet's
# indentation is missing, so the nesting of the loops and of the final
# print is not reliable.
session = requests.session()
response = session.get(url)
# content of search page in soup
html= soup(response.content,"lxml")
properties_col = html.find_all('div')
for col in properties_col:
ref = 'n/a'
des = 'n/a'
# Header cells and value cells nested inside this div, selected by class.
head = col.find_all("div",{"class": "columns small-5 medium-4 cell header"})
data = col.find_all("div",{"class":"columns small-7 medium-8 cell"})
for i,elem in enumerate(head):
#for i in range(elems):
# The raw .text is compared with "Ref No." exactly; no whitespace is
# stripped before the comparison.
if head [i].text == "Ref No.":
ref = data[i].text
print ref

You can do this by two ways.
1) If you are sure that the website you are scraping won't change its layout, you can find all divs with that class and get the content by index.
2) Find all the left-side divs (the titles) and, if one of them matches what you want, take its next sibling to get the text.
Example:
import requests
from bs4 import BeautifulSoup as soup
url = 'https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+BOSWALL+PARKWAY%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&DISPLAY_COUNT=10&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY&UARN=110B60329&PPRN=000000000001745&ASSESSOR_IDX=10&DISPLAY_MODE=FULL#results'
baseurl = 'https://www.saa.gov.uk'
session = requests.session()
response = session.get(url)
# content of search page in soup
html = soup(response.content, "lxml")

# Method 1: rely on the position of the value cells within the page.
value_cells = html.find_all("div", class_="columns small-7 medium-8 cell")
reference = value_cells[0].get_text().strip()
description = value_cells[2].get_text().strip()
print(reference)
print(description)

# Method 2: find a header cell by its label and read the cell beside it.
for header in html.find_all("div", class_="columns small-5 medium-4 cell header"):
    label = header.get_text().strip()
    if "Ref No." not in label and "Description" not in label:
        continue
    # find_next_sibling skips the whitespace text node between the two
    # divs, so there is no need to hop over it with .next_sibling twice.
    value_cell = header.find_next_sibling("div")
    if value_cell is None:
        continue
    value = value_cell.get_text().strip()
    if "Ref No." in label:
        print(value)
    if "Description" in label:
        print(value)
The prints will output (in order):
110B60329
STORE
110B60329
STORE
Your problem is that you are trying to match a node's text, which contains a lot of tabs, against a string without any surrounding whitespace.
For example, your head[i].text variable contains "Ref No." surrounded by whitespace,
so comparing it with the plain string "Ref No." gives a false result. Stripping it will solve the problem.

import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+BOSWALL+PARKWAY%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&DISPLAY_COUNT=10&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY&UARN=110B60329&PPRN=000000000001745&ASSESSOR_IDX=10&DISPLAY_MODE=FULL#results")
soup = BeautifulSoup(r.text, 'lxml')
# Each element with the table-row class becomes a list of its text
# fragments: whitespace is stripped and the fragments are joined with '|'
# before being split again.
for row in soup.find_all(class_='table-row'):
    cells = row.get_text(separator='|', strip=True).split('|')
    print(cells)
out:
['Ref No.', '110B60329']
['Office', 'LOTHIAN VJB']
['Description', 'STORE']
['Property Address', '29 BOSWALL PARKWAY', 'EDINBURGH', 'EH5 2BR']
['Proprietor', 'SCOTTISH MIDLAND CO-OP SOCIETY LTD.']
['Tenant', 'PROPRIETOR']
['Occupier']
['Net Annual Value', '£1,750']
['Marker']
['Rateable Value', '£1,750']
['Effective Date', '01-APR-10']
['Other Appeal', 'NO']
['Reval Appeal', 'NO']
get_text() is a very powerful tool: you can strip the whitespace and put a separator in the text.
You can use this method to get clean data and filter it.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Indeed scraper bs4, splitting parsed HTML code after grabbing it - python

Related

Pulling p tags from multiple URLs

Remove html element from scraping result python

How to scrape data from interactive chart using python?

none returned when trying to get tag value

Findall to div tag using beautiful soup yields blank return

Categories

Resources