How to extract Amazon product links using Python

I'm a beginner in Python, and I just want to scrape product links from an Amazon page.
For example, I want to scrape this page:
http://www.amazon.com/s/ref=sr_in_-2_p_4_18?me=A3MZ96G5C78IVQ&fst=as%3Aoff&rh=p_4%3AFunKo&ie=UTF8&qid=1477811368
and I use this code in Python:
from bs4 import BeautifulSoup
import requests

url = "http://www.amazon.com/s/ref=sr_in_-2_p_4_18?me=A3MZ96G5C78IVQ&fst=as%3Aoff&rh=p_4%3AFunKo&ie=UTF8&qid=1477811368"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
file = open("parseddata.txt", "wb")
links = soup.find_all('a', {'class': 'a-link-normal s-access-detail-page a-text-normal'})
for link in links:
    print(link.get('href'))
    file.write(href + '\n')
file.close()
I just want the product title links as the output. Can anyone tell me where I am going wrong?

Add a user-agent to the request header so the site doesn't treat you as a robot. Also note two bugs in your loop: you write href, which is never defined (it should be link.get('href')), and the file is opened in binary mode ("wb") while you write a str, so open it in text mode instead.
from bs4 import BeautifulSoup
import requests

url = "http://www.amazon.com/s/ref=sr_in_-2_p_4_18?me=A3MZ96G5C78IVQ&fst=as%3Aoff&rh=p_4%3AFunKo&ie=UTF8&qid=1477811368"

# add a header
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
file = open(r"parseddata.txt", "w")
links = soup.find_all('a', {'class': 'a-link-normal s-access-detail-page a-text-normal'})
for link in links:
    print(link.get('href'))
    file.write(link.get('href') + '\n')
file.close()
Result
https://www.amazon.com/Funko-POP-Marvel-Dancing-Bobble/dp/B00N1EJXUU/ref=sr_1_1/160-5408618-6684940?m=A3MZ96G5C78IVQ&s=merchant-items&ie=UTF8&qid=1477822032&sr=1-1&refinements=p_4%3AFunKo
https://www.amazon.com/Funko-POP-Movies-Potter-Action/dp/B019JIA4IQ/ref=sr_1_2/160-5408618-6684940?m=A3MZ96G5C78IVQ&s=merchant-items&ie=UTF8&qid=1477822032&sr=1-2&refinements=p_4%3AFunKo
https://www.amazon.com/FunKo-2390-Funko-Darth-Maul/dp/B005F1QBMK/ref=sr_1_3/160-5408618-6684940?m=A3MZ96G5C78IVQ&s=merchant-items&ie=UTF8&qid=1477822032&sr=1-3&refinements=p_4%3AFunKo
........
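As a small follow-up, here is a minimal sketch of the same write loop using a with block (reusing the links list from above), so the file is closed automatically even if an error occurs:

with open("parseddata.txt", "w") as file:
    for link in links:
        href = link.get('href')
        print(href)
        file.write(href + '\n')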

Related

scraping comments from booking.com

I'm trying to get all the reviews from a specific hotel page on booking.com.
I have tried this code, but I'm not getting anything printed at all.
This is the code I tried:
import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.booking.com/hotel/sa/sarwat-park.ar.html?aid=304142&label=gen173nr-1DCAEoggI46AdIM1gEaMQBiAEBmAERuAEHyAEM2AED6AEBiAIBqAIDuAL_oY-aBsACAdICJDE5YzYxY2ZiLWRlYjUtNDRjNC04Njk0LTlhYWY4MDkzYzNhNNgCBOACAQ&sid=c7009aac67195c0a7ef9aa63f6537581&dest_id=6376991;dest_type=hotel;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;req_adults=2;req_children=0;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1665388865;srpvid=1219386046550156;type=total;ucfs=1&#tab-reviews'
req = urllib.request.Request(
    url,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    }
)
f = urllib.request.urlopen(req)
soup = BeautifulSoup(f.read().decode('utf-8'), 'html.parser')
reviews = soup.findAll("li", {"class": "review_item clearfix "})
for review in reviews:
    print(review.find("div", {"class": "review_item_header_content"}).text)
To begin with, there is no class "review_item" on the entire page.
A better approach would be to use lxml's etree and the XPath of the reviews list that is actually on the page:
//*[@id="b2hotelPage"]/div[25]/div/div/div/div[1]/div[2]/div/ul
Then you could do something like
import requests as req
from bs4 import BeautifulSoup as bs
from lxml import etree

webpage = req.get(URL, headers=headers)  # reuse the URL and headers from above
soup = bs(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
listTarget = dom.xpath('//*[@id="b2hotelPage"]/div[25]/div/div/div/div[1]/div[2]/div/ul')
This should give you a list of lxml objects which are essentially your comment cards.
Then you can work on them in a similar fashion.
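For example, here is a rough sketch of dumping the text of each card; it assumes listTarget[0] is the ul element whose children are the individual review cards, which is an assumption about the page structure rather than something verified here:

if listTarget:
    for card in listTarget[0]:
        # itertext() walks every text node inside the card element
        text = " ".join(t.strip() for t in card.itertext() if t.strip())
        print(text)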

I am trying to get one element of a website but it prints "none" (Python Requests)

from bs4 import BeautifulSoup
import requests
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', id_="most-recorded")
print(element)
This always prints "none" but when I go to the website, I can see it. I even deleted all cookies and it's still there.
Without specifying a user agent, the site does not give you the tag you need.
from bs4 import BeautifulSoup
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', {'id': "most-recorded"}).get_text(strip=True)
print(element)
OUTPUT:
Fortnite

How to extract data from a dynamic website?

I am trying to get the restaurant name and address of each restaurant from this platform:
https://customers.dlivery.live/en/list
So far I have tried with BeautifulSoup:
import requests
from bs4 import BeautifulSoup
import json

url = 'https://customers.dlivery.live/en/list'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/75.0.3770.80 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
soup
I noticed that soup does not contain the restaurant data.
How can I do this?
If you inspect the page, you will notice that the names are wrapped in the card_heading class and the addresses are wrapped in the card_distance class.
soup = BeautifulSoup(response.text, 'html.parser')
restaurantAddress = soup.find_all(class_='card_distance')
for address in restaurantAddress:
    print(address.text)
and
soup = BeautifulSoup(response.text, 'html.parser')
restaurantNames = soup.find_all(class_='card_heading')
for name in restaurantNames:
    print(name.text)
Not sure if this exact code will work, but this is pretty close to what you are looking for.
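If both lookups do return elements, here is a small sketch that pairs each name with its address; it assumes the two lists line up one-to-one, which is an assumption about the page rather than something verified:

names = soup.find_all(class_='card_heading')
addresses = soup.find_all(class_='card_distance')
# zip stops at the shorter list, so mismatched cards are silently dropped
for name, address in zip(names, addresses):
    print(name.get_text(strip=True), '-', address.get_text(strip=True))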

Web scraping finviz for fundamental data on marketcap

I'm trying to scrape finviz (https://finviz.com/quote.ashx?t=aapl) for the market cap in the fundamental table, but I couldn't for the life of me locate the table or the class with Beautiful Soup. It seems that every time I use soup.find_all() it works for 'div', 'td', 'table', etc., but it returns an empty list when I try to add a class like {'class': 'snapshot-td2'}. Does anyone know how I can fix this?
import requests
from bs4 import BeautifulSoup
import bs4

def parseMarketCap():
    response = requests.get("https://finviz.com/quote.ashx?t=aapl")
    soup = bs4.BeautifulSoup(response.content, "lxml")
    table = soup.find_all('td', {'class': 'snapshot-td2'})
    print(table)
I also tried the following for the table, but no luck as well:
table = soup.find_all('table', {'class': "snapshot-table2"})
(Screenshots: browser inspector view and the fundamental table.)
You need a user-agent header; then you can use the :-soup-contains() pseudo-class to target the preceding td by its text and move on to the desired value field:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finviz.com/quote.ashx?t=aapl', headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.content, 'lxml')
print(soup.select_one('td:-soup-contains("Market Cap")').find_next('b').text)
As QHarr suggests, add a user-agent to get a proper response:
response = requests.get(
    "https://finviz.com/quote.ashx?t=aapl",
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
)
soup = BeautifulSoup(response.text, "lxml")
content_table = soup.find('table', {'class': 'snapshot-table2'})
table_rows = content_table.find_all('tr')
market_cap = table_rows[1].find_all('td')[1].text
print(market_cap)
Try using a User-Agent header in your request, like this:
user_agent = {'User-Agent': 'YOUR_USER_AGENT'}
r = requests.get('YOURURL', headers=user_agent)
...

Find specific Tag Python BeautifulSoup

Hey, I'm trying to extract URLs between two tags.
This is what I've got so far:
html_doc = '<div class="b_attribution" u="1|5075|4778623818559697|b0YAhIRjW_h9ERBLSt80gnn9pWk7S76H"><cite>https://www.developpez.net/forums/d1497343/environnements-developpem...</cite><span class="c_tlbxTrg">'
soup = BeautifulSoup(html_doc, "html.parser")
links = []
for links in soup.findAll('cite'):
    print(links.get('cite'))
I have tried different things but I couldn't extract the URL between
<cite>.....</cite>
My updated code:
import requests
from bs4 import BeautifulSoup as bs

dorks = input("Keyword : ")
binglist = "http://www.bing.com/search?q="
with open(dorks, mode="r", encoding="utf-8") as my_file:
    for line in my_file:
        clean = binglist + line
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'}
        r = requests.get(clean, headers=headers)
        soup = bs(r.text, 'html.parser')
        links = soup.find('cite')
        print(links)
In the keyword file you just need to put keywords like:
test
games
Thanks for your help
You can do it as follows:
from bs4 import BeautifulSoup

html_doc = '<div class="b_attribution" u="1|5075|4778623818559697|b0YAhIRjW_h9ERBLSt80gnn9pWk7S76H"><cite>https://www.developpez.net/forums/d1497343/environnements-developpem...</cite><span class="c_tlbxTrg">'
soup = BeautifulSoup(html_doc, "html.parser")
links = soup.find_all('cite')
for link in links:
    print(link.text)
You can web scrape Bing as follows:
import requests
from bs4 import BeautifulSoup as bs

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'}
r = requests.get("https://www.bing.com/search?q=test", headers=headers)
soup = bs(r.text, 'html.parser')
links = soup.find_all('cite')
for link in links:
    print(link.text)
This code does the following:
With requests we get the web page we're looking for. We set headers to avoid being blocked by Bing (more information: https://oxylabs.io/blog/5-key-http-headers-for-web-scraping).
Then we parse the HTML and extract all <cite> tags (this returns a list).
For each element in the list, we only want what's inside the <cite> tag; using .text we print the contents of that tag.
Please pay attention to the headers!
Try this:
html_doc = '<div class="b_attribution" u="1|5075|4778623818559697|b0YAhIRjW_h9ERBLSt80gnn9pWk7S76H"><cite>https://www.developpez.net/forums/d1497343/environnements-developpem...</cite><span class="c_tlbxTrg">'
soup = BeautifulSoup(html_doc, "html.parser")
links = soup.find_all('cite')
for link in links:
    print(link.text)
You're looking for this to get links from Bing organic results:
# container with needed data: title, link, snippet, etc.
for result in soup.select(".b_algo"):
    link = result.select_one("h2 a")["href"]
Specifically for the example you provided:
from bs4 import BeautifulSoup
html_doc = '<div class="b_attribution" u="1|5075|4778623818559697|b0YAhIRjW_h9ERBLSt80gnn9pWk7S76H"><cite>https://www.developpez.net/forums/d1497343/environnements-developpem...</cite><span class="c_tlbxTrg">'
soup = BeautifulSoup(html_doc, "html.parser")
link = soup.select_one('.b_attribution cite').text
print(link)
# https://www.developpez.net/forums/d1497343/environnements-developpem...
Code and example in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml

headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
}
params = {
    "q": "lasagna",
    "hl": "en",
}
html = requests.get("https://www.bing.com/search", headers=headers, params=params)
soup = BeautifulSoup(html.text, "lxml")

for links in soup.select(".b_algo"):
    link = links.select_one("h2 a")["href"]
    print(link)
------------
'''
https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/
https://www.foodnetwork.com/topics/lasagna
https://www.tasteofhome.com/recipes/best-lasagna/
https://www.simplyrecipes.com/recipes/lasagna/
'''
Alternatively, you can achieve the same thing by using Bing Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to deal with the extraction, maintenance, and block-bypassing parts; instead, you only need to iterate over structured JSON and get what you want.
Code to integrate to achieve your goal:
from serpapi import GoogleSearch
import os

params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "bing",
    "q": "lion king"
}
search = GoogleSearch(params)
results = search.get_dict()

for result in results['organic_results']:
    link = result['link']
    print(link)
------------
'''
https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/
https://www.foodnetwork.com/topics/lasagna
https://www.tasteofhome.com/recipes/best-lasagna/
https://www.simplyrecipes.com/recipes/lasagna/
'''
Disclaimer: I work for SerpApi.
