Has anyone had success scraping Amazon with BeautifulSoup? - python

I want to build a web scraper for Amazon.
But every piece of data I try to extract comes back as None.
Searching on Google shows that many people have built Amazon scrapers.
Please give me some advice on how to solve this NoneType issue.
Here is my code:
import requests
from bs4 import BeautifulSoup
amazon_dir = requests.get("https://www.amazon.es/s?k=docking+station&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91&crid=34FO3BVVCJS4V&sprefix=docking%2Caps%2C302&ref=nb_sb_ss_ts-doa-p_1_7")
amazon_soup = BeautifulSoup(amazon_dir.text, "html.parser")
product_table = amazon_soup.find("div", {"class": "sg-col-inner"})
print(product_table)
products = product_table.find("div", {"class": "a-section"})
name = products.find("span", {"class": "a-size-base-plus"})
rating = products.find("span", {"class": "a-icon-alt"})
price = products.find("span", {"class": "a-price-whole"})
print(name, rating, price)
Thank you

Portals may check the User-Agent header to send different HTML to different browsers or devices, and sometimes this makes it harder to find elements on the page.
More often, though, portals check this header to block scripts/bots.
For example, requests sends User-Agent: python-requests/2.26.0.
If I use a User-Agent header from a real browser, or at least the shorter version Mozilla/5.0, then the code works.
There is another problem.
There are almost 70 <div class="sg-col-inner" ...> elements and the table you want is only the 3rd of them, but find() returns just the first element. You have to use find_all() and then pick the element you need by index (e.g. [2] for the 3rd one).
import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0',
}
url = "https://www.amazon.es/s?k=docking+station&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91&crid=34FO3BVVCJS4V&sprefix=docking%2Caps%2C302&ref=nb_sb_ss_ts-doa-p_1_7"
response = requests.get(url, headers=headers)
print(response.text[:1000])
print('---')
amazon_soup = BeautifulSoup(response.text, "html.parser")
all_divs = amazon_soup.find_all("div", {"class": "sg-col-inner"})
print('len(all_divs):', len(all_divs))
print('---')
products = all_divs[3].find("div", {"class": "a-section"})
name = products.find("span", {"class": "a-size-base-plus"})
rating = products.find("span", {"class": "a-icon-alt"})
price = products.find("span", {"class": "a-price-whole"})
print('name:', name.text)
print('rating:', rating.text)
print('price:', price.text)
EDIT:
A version which displays all products:
import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0',
}
url = "https://www.amazon.es/s?k=docking+station&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91&crid=34FO3BVVCJS4V&sprefix=docking%2Caps%2C302&ref=nb_sb_ss_ts-doa-p_1_7"
response = requests.get(url, headers=headers)
#print(response.text[:1000])
#print('---')
soup = BeautifulSoup(response.text, "html.parser")
results = soup.find("div", {"class": "s-main-slot s-result-list s-search-results sg-row"})
all_products = results.find_all("div", {"class": "sg-col-inner"})
print('len(all_products):', len(all_products))
print('---')
for item in all_products:
    name = item.find("span", {"class": "a-size-base-plus"})
    rating = item.find("span", {"class": "a-icon-alt"})
    price = item.find("span", {"class": "a-price-whole"})
    if name:
        print('name:', name.text)
    if rating:
        print('rating:', rating.text)
    if price:
        print('price:', price.text)
    if name or rating or price:
        print('---')
BTW:
From time to time portals refresh the code and HTML on their servers, so if you find a tutorial, check how old it is. Older tutorials may not work because the portal may have changed something since they were written.
Many modern pages use JavaScript to add elements, but requests and BeautifulSoup can't run JavaScript. In that case you may need Selenium to control a real web browser, which can run JavaScript.
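For example, a minimal Selenium sketch of the same search (a shortened URL and the same class names as above; it assumes Chrome and a matching chromedriver are available):
from selenium import webdriver
from bs4 import BeautifulSoup

url = "https://www.amazon.es/s?k=docking+station"

# a real browser runs the page's JavaScript before we read the HTML
driver = webdriver.Chrome()
driver.get(url)
soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()

for item in soup.find_all("div", {"class": "sg-col-inner"}):
    name = item.find("span", {"class": "a-size-base-plus"})
    if name:
        print(name.text)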

Related

Terminal showing [] when trying to run

I am following a price tracker tutorial I found on YouTube. However, when I try to run "python main.py" in the terminal, it shows me this:
(venv) julia@Julias-Maccie-3 pythonProject1 % python main.py
[]
[]
(venv) julia@Julias-Maccie-3 pythonProject1 %
Where the two [] are, it is supposed to show me the price and title of the product.
Here's my code:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.lookfantastic.nl/olaplex-no.3-hair-perfector-100ml/11416400.html'
headers = {"User-Agent": 'My user agent'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find_all("div", class_="productName_title")
price = soup.find_all("div", class_="productPrice_price")
converted_price = price[0:4]
print(converted_price)
print(title)
Does anyone know how to solve this?
NOTE: I did fill in my user agent. Just removed it for the purpose of this question
Check your soup and adjust the tag names you expect to find:
title = soup.find_all("h1", class_="productName_title")
price = soup.find_all("p", class_="productPrice_price")
Output:
[<p class="productPrice_price" data-product-price="price">
€22,45
</p>, <p class="productPrice_price" data-product-price="price">
€22,45
</p>]
[<h1 class="productName_title" data-product-name="title">Olaplex No.3 Hair Perfector 100ml</h1>, <h1 class="productName_title" data-product-name="title">Olaplex No.3 Hair Perfector 100ml</h1>]
Be aware that find_all() will give you a ResultSet; if you only want the first match, go with find() instead:
title = soup.find("h1", class_="productName_title").get_text(strip=True)
price = soup.find("p", class_="productPrice_price").get_text(strip=True)
converted_price = price[1:]
Output:
22,45
Olaplex No.3 Hair Perfector 100ml
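If the price is needed as a number (e.g. for a price tracker), here is a small follow-up sketch; it assumes the comma is the decimal separator in this shop's locale:
price = soup.find("p", class_="productPrice_price").get_text(strip=True)
# '€22,45' -> 22.45: drop the currency symbol, swap the decimal comma
price_value = float(price[1:].replace(',', '.'))
print(price_value)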

Scrape data using a CSS selector (Python, BS4)

I am scraping data using CSS selectors for the first time.
There is a problem scraping the content of an anchor tag.
Here is my code:
import requests
from bs4 import BeautifulSoup
url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term=ruby"
wwr_result = requests.get(url)
wwr_soup = BeautifulSoup(wwr_result.text, "html.parser")
posts = wwr_soup.find_all("li", {"class": "feature"})
link = post.select("#category-2 > article > ul > li:nth-child(1) > a[href]")
title = post.find("span", {"class": "title"}).get_text()
company = post.find("span", {"class": "company"}).get_text()
location = post.find("span", {"class": "region company"}).get_text()
link = post.select("#category-2 > article > ul > li:nth-child(1) > a[href]")
print({"title": title, "company": company, "location": location, "link": f"https://weworkremotely.com/{link}"})
I want to scrape the content of the anchor to build a link for each post, so I put a[href] in the selector.
But it doesn't work; it scrapes the contents of the whole subcategory instead.
What do I have to change to scrape just the content of the anchor?
Assuming you have correctly selected the jobs of interest from all jobs listed, you need a loop, then extract the first href attribute containing the substring -jobs, i.e. post.select_one('[href*=-jobs]'), during the loop:
import requests
from bs4 import BeautifulSoup
url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term=ruby"
wwr_result = requests.get(url)
wwr_soup = BeautifulSoup(wwr_result.text, "html.parser")
posts = wwr_soup.find_all("li", {"class": "feature"})
for post in posts:
    print('https://weworkremotely.com' + post.select_one('a[href*=-jobs]')['href'])
To get all the listings on the page switch to:
posts = wwr_soup.select('li:has(.tooltip)')
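For example, a minimal sketch combining that selector with the loop above (the span class names are carried over from the original question):
posts = wwr_soup.select('li:has(.tooltip)')
for post in posts:
    title = post.find("span", {"class": "title"})
    link = post.select_one('a[href*=-jobs]')
    if title and link:
        print(title.get_text(strip=True), 'https://weworkremotely.com' + link['href'])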

Web Scraping with Python - blank return

I'm trying to scrape reviews from TrustPilot, but the code always returns a blank sheet with only the headers/categories I specified. Could someone help me with this?
from selenium import webdriver
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
driver = webdriver.Chrome()
names=[] #List to store name of the product
headers=[] #List to store price of the product
bodies=[]
ratings=[] #List to store rating of the product
dates=[]
#driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = driver.page_source
soup = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a'))
for a in soup.findAll('a', href=True, attrs={'class':'reviews-container'}):
    name = a.find('div', attrs={'class':'consumer-information_name'})
    header = a.find('div', attrs={'class':'review-content_title'})
    body = a.find('div', attrs={'class':'review-content_text'})
    rating = a.find('div', attrs={'class':'star-rating star-rating--medium'})
    date = a.find('div', attrs={'class':'review-date--tooltip-target'})
    names.append(name.text)
    headers.append(header.text)
    bodies.append(body.text)
    ratings.append(rating.text)
    dates.append(date.text)
print ('webpage, no errors')
df = pd.DataFrame({'User Name':names,'Header':headers,'Body':bodies,'Rating':ratings,'Date':dates})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')
print ('csv made')
The issue is soup.findAll('a', href=True, attrs={'class':'reviews-container'}) is not finding any results, so there are 0 iterations in the loop. Make sure you are using the correct tags and class names. Also you don't need to use a loop because BeautifulSoup has a find_all method. I used the requests module to open the web page, though it shouldn't make a difference.
from bs4 import BeautifulSoup
import requests
req = requests.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = req.content
soup = BeautifulSoup(content, "lxml")
names = soup.find_all('div', attrs={'class': 'consumer-information__name'})
headers = soup.find_all('h2', attrs={'class':'review-content__title'})
bodies = soup.find_all('p', attrs={'class':'review-content__text'})
ratings = soup.find_all('div', attrs={'class':'star-rating star-rating--medium'})
dates = soup.find_all('div', attrs={'class':'review-content-header__dates'})
And now each list has 20 entries.
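To get back to the CSV the original code was building, here is one possible follow-up (a sketch; it assumes the five lists line up one-to-one and that the tag text is what you want to store):
import pandas as pd

df = pd.DataFrame({
    'User Name': [n.text.strip() for n in names],
    'Header': [h.text.strip() for h in headers],
    'Body': [b.text.strip() for b in bodies],
    'Rating': [r.text.strip() for r in ratings],
    'Date': [d.text.strip() for d in dates],
})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')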

Python bs4 BeautifulSoup: findall gives empty bracket

When I run this code it gives me empty brackets. I'm new to web scraping, so I don't know what I'm doing wrong.
import requests
from bs4 import BeautifulSoup
url = 'https://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=laptop'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
container = soup.findAll('li', {'class': 's-result-item celwidget '})
#btw the space is also there in the html code
print(container)
results:
[]
What I tried is to grab the HTML code from the site and to search through the li tags where all the information is stored, so I can print out all the information in a for loop.
Also, if someone wants to explain how to use BeautifulSoup, we can always talk.
Thank you guys.
So working code that grabs the product and price could look something like this.
import requests
from bs4 import BeautifulSoup
url = 'https://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=laptop'
r = requests.get(url, headers={'User-Agent': 'Mozilla Firefox'})
soup = BeautifulSoup(r.text, 'html.parser')
container = soup.findAll('li', {'class': 's-result-item celwidget '})
for cont in container:
    h2 = cont.h2.text.strip()
    # Amazon lists prices in two ways. If one fails, use the other
    try:
        currency = cont.find('sup', {'class': 'sx-price-currency'}).text.strip()
        price = currency + cont.find('span', {'class': 'sx-price-whole'}).text.strip()
    except:
        price = cont.find('span', {'class': 'a-size-base a-color-base'})
    print('Product: {}, Price: {}'.format(h2, price))
Let me know if that helps you further...

Parsing multiple urls with Python and BeautifulSoup

I started learning Python today and so it is not a surprise that I am struggling with some basics. I am trying to parse data from a school website for a project and I managed to parse the first page. However, there are multiple pages (results are paginated).
I have an idea about how to go about it, i.e. run through the URLs in a loop since I know the URL format, but I have no idea how to proceed. I figured it would be better to somehow search for the "next" button and run the function if it is there, and stop otherwise.
I would appreciate any help I can get.
import requests
from bs4 import BeautifulSoup
url = "http://www.myschoolwebsite.com/1"
#url2 = "http://www.myschoolwebsite.com/2"
r = requests.get(url)
soup = BeautifulSoup(r.content,'lxml')
g_data = soup.find_all('ul', {"class": "searchResults"})
for item in g_data:
    for li in item.findAll('li'):
        for resultnameh2 in li.findAll('h2'):
            for resultname in resultnameh2.findAll('a'):
                print(resultname).text
        for resultAddress in li.findAll('p', {"class": "resultAddress"}):
            print(resultAddress).text.replace('Get directions','').strip()
        for resultContact in li.findAll('ul', {"class": "resultContact"}):
            for resultContact in li.findAll('a', {"class": "resultMainNumber"}):
                print(resultContact).text
First, you can assume a maximum number of pages in the directory (if you know the pattern of the URL). I am assuming the URL is of the form http://base_url/page. Next you can write this:
base_url = 'http://www.myschoolwebsite.com'
total_pages = 100
def parse_content(r):
    soup = BeautifulSoup(r.content,'lxml')
    g_data = soup.find_all('ul', {"class": "searchResults"})
    for item in g_data:
        for li in item.findAll('li'):
            for resultnameh2 in li.findAll('h2'):
                for resultname in resultnameh2.findAll('a'):
                    print(resultname).text
            for resultAddress in li.findAll('p', {"class": "resultAddress"}):
                print(resultAddress).text.replace('Get directions','').strip()
            for resultContact in li.findAll('ul', {"class": "resultContact"}):
                for resultContact in li.findAll('a', {"class": "resultMainNumber"}):
                    print(resultContact).text

for page in range(1, total_pages):
    response = requests.get(base_url + '/' + str(page))
    if response.status_code != 200:
        break
    parse_content(response)
I would make an array with all the URLs and loop through it, or if there is a clear pattern, write a regex to search for that pattern.
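For example, a minimal sketch of the list-of-URLs idea, reusing parse_content from the answer above (the page count of 10 is just a placeholder):
import requests

base_url = 'http://www.myschoolwebsite.com'
# build every page URL up front, then loop over the list
urls = [base_url + '/' + str(page) for page in range(1, 11)]
for url in urls:
    response = requests.get(url)
    if response.status_code == 200:
        parse_content(response)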
