soup.find does not return anything - python

I would like to pull some information from an Amazon page. I've written these few basic lines, but they are not working.
import requests
from bs4 import BeautifulSoup
url = 'https://www.amazon.com/Cooler-Master-SickleFlow-120-Radiators/dp/B0046U6DWO/ref=sr_1_3?keywords=green+case+fan&qid=1578069342&sr=8-3'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
price = soup.find(id='priceblock_ourprice')
print(price)

Your code is all-right, but html.parser parses the page content badly. Use html5lib or lxml instead:
import requests
from bs4 import BeautifulSoup
url = 'https://www.amazon.com/Cooler-Master-SickleFlow-120-Radiators/dp/B0046U6DWO/ref=sr_1_3?keywords=green+case+fan&qid=1578069342&sr=8-3'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'lxml') # <-- use `html5lib` or `lxml`
price = soup.find(id='priceblock_ourprice')
print(price)
Prints:
<span class="a-size-medium a-color-price priceBlockBuyingPriceString" id="priceblock_ourprice">$10.50</span>

Related

Beautifulsoup : href link is undifined

I want to scrap a website, when I reach any tag the link is "job/undifined" , I used post request to fetch data from the page :
post request with postdata in this code :
from bs4 import BeautifulSoup
import requests
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"}
postData = {
'search': 'search',
'facets[camp_type]':'day_camp',
'open[choices-made-content]': 'true'}
url = 'https://www.trustme.work/en'
html_1 = requests.post(url, headers=headers, data=postData)
soup1 = BeautifulSoup(html_1.text, 'lxml')
a = soup1.select('div.MuiGrid-root MuiGrid-grid-xs-12 ')
b = soup1.select('span[class="MuiTypography-root MuiTypography-h2"]')
print('soup:',b)
sample from the output :
<span class="MuiTypography-root MuiTypography-h2" style="cursor:pointer">
<a href="job/undefined" style="color:#413E52;text-decoration:none">
Network and Security engineer
</a>
</span>
EDIT
Part of content is served dynamically so, you have to fetch the jobs hashid via api and then create the link yourself or use the data from JSON response:
import requests
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"}
url = 'https://api.trustme.work/api/job_offers?include=technologies%2Cjob%2Ccompany%2Ccontract_type%2Clevel'
jobs = requests.get(url, headers=headers).json()['included']['jobs']
['https://www.trustme.work/job/' + v['hashid'] for k,v in jobs.items()]
To get the links from each job post change your css selector to select your elements more specific, also try to use static identifiers or HTML structure over classes:
.select('h2 a')
To get a list of all links use a list comprehension:
['https://www.trustme.work' + a.get('href') for a in soup1.select('h2 a')]
Example
from bs4 import BeautifulSoup
import requests
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"}
postData = {
'search': 'search',
'facets[camp_type]':'day_camp',
'open[choices-made-content]': 'true'}
url = 'https://www.trustme.work/en'
html_1 = requests.post(url, headers=headers, data=postData)
soup1 = BeautifulSoup(html_1.text, 'lxml')
['https://www.trustme.work' + a.get('href') for a in soup1.select('h2 a')]

requests.get not returning anything

I am trying to retrieve a url using requests.get
import requests
from bs4 import BeautifulSoup
baseurl = "https://www.olx.com.eg/"
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}
r = requests.get('https://www.olx.com.eg/jobs/')
soup = BeautifulSoup(r.content, 'lxml')
product_list = soup.findAll('div',class_ = 'ads__item')
print(product_list)
but it returns an empty list because it does not even open the URL.
What is the issue here?
Add headers= parameter to requests.get:
import requests
from bs4 import BeautifulSoup
baseurl = "https://www.olx.com.eg/"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
r = requests.get("https://www.olx.com.eg/jobs/", headers=headers)
soup = BeautifulSoup(r.content, "lxml")
product_list = soup.findAll("div", class_="ads__item")
print(len(product_list))
Prints:
45

How do I get the URLs for all the pages?

I have a code to collect all of the URLs from the "oddsportal" website for a page:
from bs4 import BeautifulSoup
import requests
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
source = requests.get("https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/",headers=headers)
soup = BeautifulSoup(source.text, 'html.parser')
main_div=soup.find("div",class_="main-menu2 main-menu-gray")
a_tag=main_div.find_all("a")
for i in a_tag:
print(i['href'])
which returns these results:
/soccer/africa/africa-cup-of-nations/results/
/soccer/africa/africa-cup-of-nations-2019/results/
/soccer/africa/africa-cup-of-nations-2017/results/
/soccer/africa/africa-cup-of-nations-2015/results/
/soccer/africa/africa-cup-of-nations-2013/results/
/soccer/africa/africa-cup-of-nations-2012/results/
/soccer/africa/africa-cup-of-nations-2010/results/
/soccer/africa/africa-cup-of-nations-2008/results/
I would like the URLs to be returned as:
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/#/page/2/
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/#/page/3/
for all the parent urls generated for results.
I can see that the urls can be appended as seen from inspect element as below for div id = "pagination"
The data under id="pagination" is loaded dynamically, so requests won't support it.
However, you can get the table of all those pages (1-3) via sending a GET request to:
https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/MN8PaiBs/X0/1/0/{page}/?_={timestampe}"
where {page} is corresponding to the page number (1-3) and {timestampe} is the current time
You'll also need to add:
"Referer": "https://www.oddsportal.com/"
to your headers.
also, use the lxml parser instead of html.parser to avoid a RecursionError.
import re
import requests
from datetime import datetime
from bs4 import BeautifulSoup
headers = {
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
"Referer": "https://www.oddsportal.com/",
}
with requests.Session() as session:
session.headers.update(headers)
for page in range(1, 4):
response = session.get(
f"https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/MN8PaiBs/X0/1/0/{page}/?_={datetime.now().timestamp()}"
)
table_data = re.search(r'{"html":"(.*)"}', response.text).group(1)
soup = BeautifulSoup(table_data, "lxml")
print(soup.prettify())

Web Scraping Prices with Python

I was following an online tutorial at the following webpage, https://www.youtube.com/watch?v=nCuPv3tf2Hg&list=PLRzwgpycm-Fio7EyivRKOBN4D3tfQ_rpu&index=1. I have no idea what I am doing wrong. I have tried the code in both Visual Studio and Jupyter notebooks to no avail.
Code:
import requests
from bs4 import BeautifulSoup as bs
bURL = 'https://www.thewhiskyexchange.com/c/540/taiwanese-whisky'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
r = requests.get('https://www.thewhiskyexchange.com/c/540/taiwanese-whisky')
soup = bs(r.content, 'lxml')
productlist = soup.find_all('div', class_='item')
productlinks = []
for item in productlist:
for link in item.find_all('a', href=True):
print(link['href'])
The structure of that website has changed since the video was posted.
I've fixed your code below:
import requests
from bs4 import BeautifulSoup as bs
bURL = 'https://www.thewhiskyexchange.com/c/540/taiwanese-whisky'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
r = requests.get(bURL, headers=headers)
soup = bs(r.text, 'html.parser')
for x in soup.find_all('li', {'class':'product-grid__item'}):
link = x.find('a')
print(x.text, 'https://www.thewhiskyexchange.com'+link['href'])

Unable to get links to the articles in Python web scraping

I tried everything I could think of. How do I fix this?
Here is What I am trying to do:
Scrape categories names & Get categories links and follow them.
Scrape Novels' names and follow their links.
Scrape Novels' info and their chapters.
Open each chapter and scrape the Article with images.
I am a complete beginner...
from requests import get
from bs4 import BeautifulSoup
import re
site = "https://readlightnovel.org/"
r = get(site, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"})
soup = BeautifulSoup(r.text, "lxml")
category = soup.findAll(class_="search-by-genre")
#Getting all categories
categories = []
for link in soup.findAll(href=re.compile(r'/category/\w+$')):
print("Category:", link.text)
category_link = site + "category/" + link.text
categories.append(category_link)
#break
#Getting all Novel Headers
for category in categories:
r = get(category_link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"})
soup = BeautifulSoup(r.text, "lxml")
Novels_header = soup.findAll(class_="top-novel-header")
#Getting Novels' Title and Link
for Novel_names in Novels_header:
print(Novel_names.text)
r = get(Novel_names, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"})
soup = BeautifulSoup(r.text, "lxml")
Novel_link = soup.findAll("a", {"href"})
print(Novel_link.text)
You are nearly there. Your pulling the text, not the url with category_link = site + "category/" + link.text
if you print 'link', you'll see Action
you'd probably be better off getting the href as opposed to the text. if the text doesn't match, then you have an unknown link.
Secondly, you need the links from the Novels_header, which is already there....no need to make another request (which wouldn't work anyway as you are not actually feeding in an url at that point).
Give this a try, and see wher eI made the edits:
from requests import get
from bs4 import BeautifulSoup
import re
site = "https://readlightnovel.org/"
r = get(site, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"})
soup = BeautifulSoup(r.text, "lxml")
category = soup.findAll(class_="search-by-genre")
#Getting all categories
categories = []
for link in soup.findAll(href=re.compile(r'/category/\w+$')):
print("Category:", link.text)
category_link = link['href'] #<--- made edit here
categories.append(category_link)
#break
#Getting all Novel Headers
for category in categories:
r = get(category_link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"})
soup = BeautifulSoup(r.text, "lxml")
Novels_header = soup.findAll(class_="top-novel-header")
#Getting Novels' Title and Link
for Novel_names in Novels_header: #<---- edit in this block
print(Novel_names.text.strip())
Novel_link = Novel_names.find('a')['href']
print(Novel_link + '\n')

Categories