Passing an argument containing quotes in Python

I'm learning to scrape text from the web. I've written the following function:
from bs4 import BeautifulSoup
import requests

def get_url(source_url):
    r = requests.get(source_url)
    data = r.text
    # extract HTML for parsing
    soup = BeautifulSoup(data, 'html.parser')
    # get H3 tags with class ...
    h3list = soup.findAll("h3", {"class": "entry-title td-module-title"})
    # create data structure to store links in
    ulist = []
    # pull links from each article heading
    for href in h3list:
        ulist.append(href.a['href'])
    return ulist
I am calling this from a separate file...
from print1 import get_url
ulist = get_url("http://www.startupsmart.com.au/")
print(ulist[3])
The problem is that the CSS selector I am using is quite specific to the site I am parsing, so the function is a bit 'brittle'. I want to pass the CSS selector as an argument to the function.
If I add a parameter to the function definition
def get_url(source_url, css_tag):
and try to pass "h3", { "class" : "entry-title td-module-title" }
it fails with:
TypeError: get_url() takes exactly 1 argument (2 given)
I tried escaping all the quotes but it still doesn't work.
I'd really appreciate some help. I can't find a previous answer to this one.

When you pass "h3", { "class" : "entry-title td-module-title" }, that's two separate arguments (a string and a dict), so the function needs two extra parameters, not one. Here's a version that works:
from bs4 import BeautifulSoup
import requests

def get_url(source_url, tag_name, attrs):
    r = requests.get(source_url)
    data = r.text
    # extract HTML for parsing
    soup = BeautifulSoup(data, 'html.parser')
    # get H3 tags with class ...
    h3list = soup.findAll(tag_name, attrs)
    # create data structure to store links in
    ulist = []
    # pull links from each article heading
    for href in h3list:
        ulist.append(href.a['href'])
    return ulist

ulist = get_url("http://www.startupsmart.com.au/", "h3", {"class": "entry-title td-module-title"})
print(ulist[3])
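If you'd rather pass the whole selector as a single string, BeautifulSoup's select() accepts CSS selector syntax, so the tag name and classes travel in one argument. A minimal sketch of that variant (the dotted selector string is just the class list above rewritten in CSS syntax):

from bs4 import BeautifulSoup
import requests

def get_url(source_url, css_selector):
    r = requests.get(source_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # select() takes one CSS selector string, so quoting is no longer an issue
    return [h3.a['href'] for h3 in soup.select(css_selector)]

ulist = get_url("http://www.startupsmart.com.au/", "h3.entry-title.td-module-title")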

Related

Pulling p tags from multiple URLs

I've struggled with this for days and I'm not sure what the issue could be. Basically, I'm trying to extract the profile box data (picture below) for each link. Going through the inspector, I thought I could do so by pulling the p tags.
I'm new to this and trying to understand, but here's what I have thus far:
First, code that (somewhat) successfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class': 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have code that pulls all of the href attributes from multiple links:
from bs4 import BeautifulSoup
import requests

def get_links(url):
    links = []
    website = requests.get(url)
    website_text = website.text
    soup = BeautifulSoup(website_text, 'html.parser')  # explicit parser avoids a bs4 warning
    for link in soup.find_all('a'):
        links.append(link.get('href'))
    for link in links:
        print(link)
    print(len(links))

get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two and get one script that will pull all of the p tags from multiple URLs. I've been trying to do it, and I'm really not sure why this isn't working:
from bs4 import BeautifulSoup
import requests

def get_profile(url):
    profiles = []
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    container = soup.find('div', attrs={'class': 'main-container'})
    for profile in container.find_all('a'):
        profiles.append(profile.get('p'))
    for profile in profiles:
        print(profile)

get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python, but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once (Player name, Current Team, Born, Birthplace, etc.). Maybe I'm doing it entirely wrong, but any guidance is welcome!
You need to combine your two scripts together and make a request for each player. Try the following approach. This searches for <td> tags that have the attribute data-th="Player":
import requests
from bs4 import BeautifulSoup

def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name": name, "URL": player_url}

        for p in div_profile_box.find_all("p"):
            try:
                key, value = p.get_text(strip=True).split(':', 1)
                row[key.strip()] = value.strip()
            except ValueError:  # not all entries have values
                pass

        data.append(row)
    return data

urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
]

for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    for entry in data:
        print(entry)
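Since the end goal is clean, tabular data, the list of dicts that get_links() returns can be fed straight into pandas. A small sketch, assuming pandas is installed (the CSV filename is just an example):

import pandas as pd

all_rows = []
for url in urls:
    all_rows.extend(get_links(url))

# one row per player, one column per profile-box field (Born, Birthplace, ...)
df = pd.DataFrame(all_rows)
df.to_csv("players.csv", index=False)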

How to get the href value of a link with bs4?

I need help extracting the URLs of all the 2020/21 matches from this [website][1] so I can scrape them.
I am sending a request to this link.
The section of the HTML that I want to retrieve is this part:
Here's the code that I am using:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.parse

website = 'https://www.espncricinfo.com/series/ipl-2020-21-1210595/match-results'
response = requests.get(website)
soup = BeautifulSoup(response.content, 'html.parser')

match_result = soup.find_all('a', {'class': 'match-info-link-FIXTURES'})
soup.get('href')

url_part_1 = 'https://www.espncricinfo.com/'
url_part_2 = []
for item in match_result:
    url_part_2.append(item.get('href'))

url_joined = []
for link_2 in url_part_2:
    url_joined.append(urllib.parse.urljoin(url_part_1, link_2))

first_link = url_joined[0]

match_url = soup.find_all('div', {'class': 'link-container border-bottom'})
soup.get('href')

url_part_3 = 'https://www.espncricinfo.com/'
url_part_4 = []
for item in match_result:
    url_part_4.append(item.get('href'))
print(url_part_4)
[1]: https://www.espncricinfo.com/series/ipl-2020-21-1210595/match-results
You don't need the second soup.find_all call, since match_result already contains the tags with the hrefs.
You can get the href with item.get('href').
You can do:
url_part_1 = 'https://www.espncricinfo.com/'
url_part_2 = []
for item in match_result:
    url_part_2.append(item.get('href'))
The result will look something like:
['/series/ipl-2020-21-1210595/delhi-capitals-vs-mumbai-indians-final-1237181/full-scorecard',
'/series/ipl-2020-21-1210595/delhi-capitals-vs-sunrisers-hyderabad-qualifier-2-1237180/full-scorecard',
'/series/ipl-2020-21-1210595/royal-challengers-bangalore-vs-sunrisers-hyderabad-eliminator-1237178/full-scorecard',
'/series/ipl-2020-21-1210595/delhi-capitals-vs-mumbai-indians-qualifier-1-1237177/full-scorecard',
'/series/ipl-2020-21-1210595/sunrisers-hyderabad-vs-mumbai-indians-56th-match-1216495/full-scorecard',
...
]
From the official docs:
It’s very useful to search for a tag that has a certain CSS class, but the name of the CSS attribute, “class”, is a reserved word in Python. Using class as a keyword argument will give you a syntax error. As of Beautiful Soup 4.1.2, you can search by CSS class using the keyword argument class_.
Try
soup.find_all("a", class_="match-info-link-FIXTURES")

Indeed scraper bs4, splitting parsed HTML code after grabbing it

import pandas as pd
from bs4 import BeautifulSoup
import requests
import os

url = 'https://fr.indeed.com/jobs?q=data%20anlayst&l=france'

# grabbing page content and parsing it into html
def data_grabber(url):
    page = requests.get(url)
    html = page.text
    soup = BeautifulSoup(html, 'html.parser')
    job_soup = soup.find_all('div', {"class": "job_seen_beacon"})
    return job_soup

def job_title(url):
    titles = data_grabber(url)
    for title in titles:
        t = title.find_all('tbody')
        return t
This is my source code, and I'm testing it out in a Jupyter notebook to make sure my functions work correctly, but I've hit a small road block. The HTML soup from my first function works perfectly: it grabs all the info from Indeed, especially the job_seen_beacon class.
My job_title function is wrong because it only outputs the first 'tbody' element it finds (I'd refer to an image here, but I don't have enough reputation on Stack Overflow), while my data_grabber returns every single job_seen_beacon. If you were able to scroll, you would easily see the multiple job_seen_beacons.
I'm clearly missing something, but I can't see it. Any ideas?
What happens?
The moment you return something from a function, you leave it, and that happens in the first iteration.
Not sure where you will end up with your code, but you can do something like this:
def job_title(item):
    title = item.select_one('h2')
    return title.get_text('|', strip=True).split('|')[-1] if title else 'No Title'
Example
from bs4 import BeautifulSoup
import requests

url = 'https://fr.indeed.com/jobs?q=data%20anlayst&l=france'

# grabbing page content and parsing it into html
def data_grabber(url):
    page = requests.get(url)
    html = page.text
    soup = BeautifulSoup(html, 'html.parser')
    job_soup = soup.find_all('div', {"class": "job_seen_beacon"})
    return job_soup

def job_title(item):
    title = item.select_one('h2')
    return title.get_text('|', strip=True).split('|')[-1] if title else 'No Title'

def job_location(item):
    location = item.select_one('div.companyLocation')
    return location.get_text(strip=True) if location else 'No Location'

data = []
for item in data_grabber(url):
    data.append({
        'title': job_title(item),
        'companyLocation': job_location(item)
    })
data
Output
[{'title': 'Chef de Projet Big Data H/F', 'companyLocation': 'Lyon (69)'}, {'title': 'Chef de Projet Big Data F/H', 'companyLocation': 'Lyon 9e (69)'}]
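Since the question already imports pandas, that same list of dicts converts directly to a DataFrame if you want tabular output. A sketch (the CSV filename is an assumption):

import pandas as pd

df = pd.DataFrame(data)  # columns: title, companyLocation
df.to_csv('indeed_jobs.csv', index=False)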

Access attributes with beautifulSoup and print

I'd like to scrape a site to find all the title attributes of h2 tags:
<h2 class="1">Titanic_Caprio</h2>
Using this code, I'm accessing the entire h2 tag:
from bs4 import BeautifulSoup
import urllib2
url = "http://www.example.it"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
links = soup.findAll('h2')
print "".join([str(x) for x in links] )
Using findAll('h2', attrs = {'title'}) returns no results. What am I doing wrong? How can I print the entire list of titles to a file?
The problem is that title is not an attribute of the h2 tag, but of a tag included in it. So you must first search for <h2> tags, and then for subtags that have a title attribute:
titles = []
h2_list = soup.findAll('h2')
for h2 in h2_list:
    titles.extend(h2.findAll(lambda x: x.has_attr('title')))
It works because BeautifulSoup can use functions as search filters.
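As for writing the results to a file: each element of titles is a tag whose title attribute can be read by indexing. A minimal sketch (the filename is just an example):

# write one title attribute per line; 'titles.txt' is a placeholder name
with open('titles.txt', 'w') as f:
    for tag in titles:
        f.write(tag['title'] + '\n')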
You need to pass key-value pairs in attrs:
findAll('h2', attrs={"key": "value"})

BeautifulSoup: Extracting string between tags does not seem to work

I am currently parsing this URL; the URL will be the argument for the parse function.
from bs4 import BeautifulSoup
import urllib.request

def parse(sitemap):
    req = urllib.request.urlopen(sitemap)
    soup = BeautifulSoup(req, 'lxml')
    soup.prettify()
    inventory_url = []
    inventory_url_set = set()
    for item in soup.find_all('url'):
        print(item.find('lastmod'))
        # print(item.find('lastmod').text)
        inventory_url_set.add(item.find('loc').text)
However, item.find('lastmod').text returns an AttributeError, whereas if I print the whole tag, item.find('lastmod'), it works fine.
I'd like to obtain only the text between the 'lastmod' tags within each 'item'.
Thanks
Not all of the url entries contain a lastmod, so you need to test for that. If you use a dictionary, you could store the lastmod as values and still benefit from having unique URLs as follows:
from bs4 import BeautifulSoup
import urllib.request

def parse(sitemap):
    req = urllib.request.urlopen(sitemap)
    soup = BeautifulSoup(req, 'lxml')
    inventory_urls = {}

    for url in soup.find_all('url'):
        if url.lastmod:
            lastmod = url.lastmod.text
        else:
            lastmod = None
        inventory_urls[url.loc.text] = lastmod

    for url, lastmod in inventory_urls.items():
        print(lastmod, url)

parse("https://www.kith.com/sitemap_products_1.xml")
This would give you a list starting as follows:
2017-02-12T03:55:25Z https://kith.com/products/adidas-originals-stan-smith-wool-pk-grey-white
2017-03-13T18:55:24Z https://kith.com/products/norse-projects-niels-pocket-boucle-tee-black
2017-03-15T17:20:47Z https://kith.com/products/ronnie-fieg-x-fracap-rf120-rust
2017-03-17T01:30:25Z https://kith.com/products/new-balance-696-birch
2017-01-23T08:43:56Z https://kith.com/products/ronnie-fieg-x-diamond-supply-co-x-asics-gel-lyte-v-1
2017-03-17T00:41:03Z https://kith.com/products/off-white-diagonal-ferns-hoodie-black
2017-03-16T15:01:55Z https://kith.com/products/norse-projects-skagen-bubble-crewneck-charcoal
2017-02-21T15:57:56Z https://kith.com/products/vasque-eriksson-gtx-brown-black
