Python Href scraping - python

I'm trying to loop over each href on a page and get its URL. I've managed to extract the href, but I need the full URL in order to follow the link. This is my code at the moment:
import requests
from bs4 import BeautifulSoup
webpage_response = requests.get('http://www.harness.org.au/racing/results/?activeTab=tab')
webpage_response.content
webpage_response = requests.get
soup = BeautifulSoup(webpage, "html.parser")
#only finding one track
#soup.table to find all links for days racing
harness_table = soup.table
#scraps a href that is an incomplete URL that im trying to get to
for link in soup.select(".meetingText > a"):
link.insert(0, "http://www.harness.org.au")
webpage = requests.get(link)
new_soup = BeautifulSoup(webpage.content, "html.parser")
#work through table to get links to tracks
print(new_soup)'''

You can store the base URL of the website in a variable and then, once you get the href from a link, join the two to create the next URL.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Root of the site; the hrefs scraped below are incomplete URLs that must be
# resolved against it before they can be requested.
base_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?activeTab=tab')
soup = BeautifulSoup(webpage_response.content, "html.parser")
# only finding one track
# soup.table to find all links for days racing
harness_table = soup.table
# Visit every anchor that is a direct child of a ".meetingText" element.
for link in soup.select(".meetingText > a"):
    # urljoin copes with "/path", "path" and already-absolute hrefs alike,
    # whereas plain string concatenation only works for the first form.
    webpage = requests.get(urljoin(base_url, link["href"]))
    new_soup = BeautifulSoup(webpage.content, "html.parser")
    # work through table to get links to tracks
    print(new_soup)

Try this solution. Maybe you'll like this library.
from simplified_scrapy import SimplifiedDoc,req
# Results page whose meeting links are collected below.
url = 'http://www.harness.org.au/racing/results/?activeTab=tab'
# Download the page through simplified_scrapy's request helper.
html = req.get(url)
doc = SimplifiedDoc(html)
# For each element matched by 'td.meetingText', take its <a> href and pass it
# with the page URL to absoluteUrl; the sample output shows absolute URLs.
links = [doc.absoluteUrl(url,ele.a['href']) for ele in doc.selects('td.meetingText')]
print(links)
Result:
['http://www.harness.org.au/racing/fields/race-fields/?mc=BA040320', 'http://www.harness.org.au/racing/fields/race-fields/?mc=BH040320', 'http://www.harness.org.au/racing/fields/race-fields/?mc=RE040320']

Related

How to extract links from HTML using BeautifulSoup?

I am trying to use a regex function in Python to filter links out of the HTML that I scraped from the eBay website.
My question is: how can I extract the links that match the following pattern: https://www.ebay.com/itm/ + all other characters?
I can successfully match the https://www.ebay.com/itm/ part, but I am not sure how to capture the rest.
Python version that I am using: 3.8.8.
Here is the code:
from bs4 import BeautifulSoup
import requests
import re
url = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=Universal+Adjustable+Hand+Shower+Holder+Suction+Cup+Holder+Full+Plating+Shower+Rail+Head+Holder+Bathroom+Bracket+Stable+rotation&_sacat=0'
r = requests.get(url)
soup = BeautifulSoup(r.content, features="lxml")
listings = soup.find_all('li')
pattern = 'https://www.ebay.com/itm/'
results = re.findall('https://www.ebay.com/itm/', str(listings))
print(results)
To get links that start with https://www.ebay.com/itm/ you can do:
import requests
from bs4 import BeautifulSoup
# eBay search-results page to scan for item links.
url = "https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=Universal+Adjustable+Hand+Shower+Holder+Suction+Cup+Holder+Full+Plating+Shower+Rail+Head+Holder+Bathroom+Bracket+Stable+rotation&_sacat=0"
r = requests.get(url)
soup = BeautifulSoup(r.content, features="lxml")
# Every anchor nested anywhere inside an <li>.
listings = soup.select("li a")
for a in listings:
    # Anchors without an href would raise KeyError with a["href"]; treat a
    # missing attribute as an empty string so they are simply skipped.
    link = a.get("href", "")
    if link.startswith("https://www.ebay.com/itm/"):
        print(link)
Prints:
https://www.ebay.com/itm/01920391?epid=26039819083&_trkparms=ispr%3D1&hash=item3b542eae7a:g:FQkAAOSwK21gKvEZ&amdata=enc%3AAQAFAAACcBaobrjLl8XobRIiIML1V4Imu%252Fn%252BzU5L90Z278x5ickkrDx%252B2NLp21dg6hHbHAkGMYdiW1E6zjXxnQ0bf7c%252Fx%252Fvs5PW%252FYFw1ZdbGMi8wsGV6qXw8OFLl4Os1ACX3bnQxFkVpRib9hMb5gVyLha4q9L0xiporu5InbX0LrSgg7nCCCwtC7y3vOE3hc8PszsrXWLb5KFdj7%252BD98et12MdkEfMPFhJZuS%252BkFsp2esVTRCYctOhcwzPSdfzCOYprlr2miQc4czCv1Tcfs3LKUPJn8uQyRc%252BAnKY1oyTeYnJ7wYuGkBU%252FSVYjziLBaPhT%252FlVu0hR9ZX6OnAeRaJ1g0iCaDjrRXEXRwUO87riWeI8kExm1zzY7QicPeMnfWZdBvVhg05GOScPOlLTVPHakqGLX0y2GUXV6fkTLua3nSF5YBmLX%252FqdCxT6yS0dutVs5MPWvQYlN474hUzbubkZVAs7Y%252BBBEsHrGjVzCj0szZ6w1%252BHgkV5O9jrXGnyew5%252Bnxy7VCq5xEkUDIt1nSg996AeDksNmSNumhfsIOGltIXbqAbjqEUpPcVO%252BDPymxlh0iMxCZQalYnmljBRzKILYWkES0vfA14Gh5E7KWrztdC6WzEEFtgVuABakQ1eAOZnuEueqK6IakC%252BIfRbXv96Tv01IPDvwPeM8wMo6j8bMjY3D5KHS5EXPVdHKUnjCJiYCcVUqcKwhL6eN2MZ%252Bn9yxmWESUPN394NPrX%252FI2z7t0Bbo7iqmsWNQcyi0EHzDwJPMK%252FNSif8%252F2adRF7dT1JrbL9sryKSN2kv9OsdGQ0fMMC1LV3Ph43HivUJdqkgjGxqEqX5v1xQ%253D%253D%7Ccksum%3A25481541593068896952f4834d93a0bb998f5b5ba5fe%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/333909214974?_trkparms=ispr%3D1&hash=item4dbe8a46fe:g:wjYAAOSwXZtgQKMK&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsSprd4IgPv6LBzlteBBg9Pe7gpGas6KEYWvIQmgmNrIcf1ZIcLwSyYlVIojBTlJPAd4XoDj5k2AzULwOu2K1678su8RhEOIAhD%252FWN3J3erMgWugg2CT0y8zcUjbbWHtNfzYrnVuG%252FUvWmYzvxdGvHZvRdzha0juBNJzGKXnChJ6MECxb2uzy5G631xzrIxblEW3AM1ogDVv19il%252Fn7Vlzr0cERgdjLqGUj5KIynU8Xj9sHRx%252F%252FcSxPq8W4v1RHP%252FYZQhE4keU2U3pZGBCy9hyaYaf5pr78zEcjg2aoyh%252B6DUFkxtjTnmPhNqiwO5QNkl21UaDagoNQYRkZb4iW0QpXX628qHxIqBVL4GjtRczXt%252Bi0ZVn0B6GBqNzHtHarXqCSe7966rspJOMXwYB%252Fj3m%252Fpe3oJO8dKBNDmzxUNhnE4Bf76ZlFiW%252Fh9TXWEO3vdhLjRjjmjJKjgj7IXVorj%252BypBfHCbTXCA7nHsBSsOdNR%252B3LopnwoQutVCsk%252FQ8bESJCn0unvLARcM6wBRakZl3Uxhe1iSnMV%252F69vpTVm2XC1ZjU0IzF7xB5laYPRPcrOwXSJHwZJHkwyziteo%252FWBZNkklgTjY%252BdEanyHKlzppyJWLAufvIbWt4YWy3Xz%252FAKaKScpMGsHWNqEKwGJ%252BPeSFwq57qnm5xK0InzfsHtyVHd6lWoaKqyP8e7nVQBjqfNciDdLD5D3fuiIHQupPyX2m1Z%252B%252FeQOayocP6GV%252BAfSHCS44Y%252Fgh4njFjCVO5KAR%252BZXiNddo%252BfFGSpUZO1q38Cq3MDvyeRF3vYd2r3FNaimwBT2d3%252Fnl3seechJuFBWHONudGgMuSiXwoy66SbYwIj0CD427%252B1FVDHw%252F4LSkJhi78oWKX2I9sGnvHrj4boKie6mlPN6ZGOenVXOpLWNK8ZSN%252FzEqUzt6iC32EFb8rj%252FC4HK9AWddRejsdJHnsFKs7QAysEUN%252Fh6ltSA3FR64qww0Qvp3A%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/333909214974?_trkparms=ispr%3D1&hash=item4dbe8a46fe:g:wjYAAOSwXZtgQKMK&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsSprd4IgPv6LBzlteBBg9Pe7gpGas6KEYWvIQmgmNrIcf1ZIcLwSyYlVIojBTlJPAd4XoDj5k2AzULwOu2K1678su8RhEOIAhD%252FWN3J3erMgWugg2CT0y8zcUjbbWHtNfzYrnVuG%252FUvWmYzvxdGvHZvRdzha0juBNJzGKXnChJ6MECxb2uzy5G631xzrIxblEW3AM1ogDVv19il%252Fn7Vlzr0cERgdjLqGUj5KIynU8Xj9sHRx%252F%252FcSxPq8W4v1RHP%252FYZQhE4keU2U3pZGBCy9hyaYaf5pr78zEcjg2aoyh%252B6DUFkxtjTnmPhNqiwO5QNkl21UaDagoNQYRkZb4iW0QpXX628qHxIqBVL4GjtRczXt%252Bi0ZVn0B6GBqNzHtHarXqCSe7966rspJOMXwYB%252Fj3m%252Fpe3oJO8dKBNDmzxUNhnE4Bf76ZlFiW%252Fh9TXWEO3vdhLjRjjmjJKjgj7IXVorj%252BypBfHCbTXCA7nHsBSsOdNR%252B3LopnwoQutVCsk%252FQ8bESJCn0unvLARcM6wBRakZl3Uxhe1iSnMV%252F69vpTVm2XC1ZjU0IzF7xB5laYPRPcrOwXSJHwZJHkwyziteo%252FWBZNkklgTjY%252BdEanyHKlzppyJWLAufvIbWt4YWy3Xz%252FAKaKScpMGsHWNqEKwGJ%252BPeSFwq57qnm5xK0InzfsHtyVHd6lWoaKqyP8e7nVQBjqfNciDdLD5D3fuiIHQupPyX2m1Z%252B%252FeQOayocP6GV%252BAfSHCS44Y%252Fgh4njFjCVO5KAR%252BZXiNddo%252BfFGSpUZO1q38Cq3MDvyeRF3vYd2r3FNaimwBT2d3%252Fnl3seechJuFBWHONudGgMuSiXwoy66SbYwIj0CD427%252B1FVDHw%252F4LSkJhi78oWKX2I9sGnvHrj4boKie6mlPN6ZGOenVXOpLWNK8ZSN%252FzEqUzt6iC32EFb8rj%252FC4HK9AWddRejsdJHnsFKs7QAysEUN%252Fh6ltSA3FR64qww0Qvp3A%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/313269877281?hash=item48f056fa21:g:hd0AAOSwz6tfknZF
https://www.ebay.com/itm/313269877281?hash=item48f056fa21:g:hd0AAOSwz6tfknZF
https://www.ebay.com/itm/184741430233?_trkparms=ispr%3D1&hash=item2b037283d9:g:TyQAAOSwVYBgZxlp&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsS7UXID%252BRPOSNsnm8kYPghtOhpdocScHh%252BDgY2CTi3nJTgQcXQEKAs8Ix13gdusXFa3yngmHdTBh2vatEHVnazTxou3w%252FDC26a237oG00lpNN0M8zpUGIN9nYQpAgSrl8Z2cNTqgManH1%252Bz3LIm4YNAlHJk%252BHZA8%252FzFNts2J8C7tWjwB%252FA16vRrvHG7nYYLh0tdLtgbf0dpc2Qe9fAJlzaEBwz89Vmv30NMzTFBw4HT%252FzDvPHrTnvmygxCs75Law6GL0yq5FZiufYo%252Fj1bGviIrCjul2tV%252FqMzojlafc7De3fEH0%252Fx7e%252B3S%252BRPOUwrq%252BjDlOH8%252Bp7tZBwZd70GEYnMNBPUFcFmnDcsZgJhS4xnlt5ibp7JgMABsL7TnNK6pm0ran5wu7KVyRsCnTEkWCx0WZ9emEe1xzzVQrJt%252B7ICl%252FNqMq9gaz3%252F2O%252B4Zf83FlMWgd2K0kvtYiBYlFzRZTKrLIkJUl9pDo8nYDJROrSUnT3xZdDu38mqKft9ckuwd8LsRn8emIR%252BJJQi2I6M1gWYnuPrD09rQQMKD7FA2VxWfA4JfePSSoAwyFVyvOkYN47cJkG3ymeQJZUZOFXt8h2j%252F3b0KlMlImQcYcancrGMk7cx4AfUBca%252BSPEC8e3w2RIDzp62%252F9If%252FVLm6Vq6rvdjjspmxA9r2LErZAgEIu6%252BXWuF3eicCrtX9dQic3TmmPPeordO3nK2QKgdSc487ywrfOX9i5XmQqmFmOyN5W%252B%252FwDmeGHmb1l1KPnWa14048eIHqDITTFibsKyRk7H04VwWrkqhfOxsrek7VxmnK7Ciqgli22Se7VPqPhlVRddgYe%252FIvp5Yw8%252BGoewf4mpOiRQ%252Bmm5QpRVpA%252BusDd2id6Gz2crCd%252FnoYT9shk8tfXCfGwFson3CxpgjeSoLd1tsQ%252FeY0qgZelpVjKdZ%252BeaaJpiC1uKa4r5FZxxJ0b5VxYp8j6F9255HFQGZzAQ2MVJyu647sA%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/184741430233?_trkparms=ispr%3D1&hash=item2b037283d9:g:TyQAAOSwVYBgZxlp&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsS7UXID%252BRPOSNsnm8kYPghtOhpdocScHh%252BDgY2CTi3nJTgQcXQEKAs8Ix13gdusXFa3yngmHdTBh2vatEHVnazTxou3w%252FDC26a237oG00lpNN0M8zpUGIN9nYQpAgSrl8Z2cNTqgManH1%252Bz3LIm4YNAlHJk%252BHZA8%252FzFNts2J8C7tWjwB%252FA16vRrvHG7nYYLh0tdLtgbf0dpc2Qe9fAJlzaEBwz89Vmv30NMzTFBw4HT%252FzDvPHrTnvmygxCs75Law6GL0yq5FZiufYo%252Fj1bGviIrCjul2tV%252FqMzojlafc7De3fEH0%252Fx7e%252B3S%252BRPOUwrq%252BjDlOH8%252Bp7tZBwZd70GEYnMNBPUFcFmnDcsZgJhS4xnlt5ibp7JgMABsL7TnNK6pm0ran5wu7KVyRsCnTEkWCx0WZ9emEe1xzzVQrJt%252B7ICl%252FNqMq9gaz3%252F2O%252B4Zf83FlMWgd2K0kvtYiBYlFzRZTKrLIkJUl9pDo8nYDJROrSUnT3xZdDu38mqKft9ckuwd8LsRn8emIR%252BJJQi2I6M1gWYnuPrD09rQQMKD7FA2VxWfA4JfePSSoAwyFVyvOkYN47cJkG3ymeQJZUZOFXt8h2j%252F3b0KlMlImQcYcancrGMk7cx4AfUBca%252BSPEC8e3w2RIDzp62%252F9If%252FVLm6Vq6rvdjjspmxA9r2LErZAgEIu6%252BXWuF3eicCrtX9dQic3TmmPPeordO3nK2QKgdSc487ywrfOX9i5XmQqmFmOyN5W%252B%252FwDmeGHmb1l1KPnWa14048eIHqDITTFibsKyRk7H04VwWrkqhfOxsrek7VxmnK7Ciqgli22Se7VPqPhlVRddgYe%252FIvp5Yw8%252BGoewf4mpOiRQ%252Bmm5QpRVpA%252BusDd2id6Gz2crCd%252FnoYT9shk8tfXCfGwFson3CxpgjeSoLd1tsQ%252FeY0qgZelpVjKdZ%252BeaaJpiC1uKa4r5FZxxJ0b5VxYp8j6F9255HFQGZzAQ2MVJyu647sA%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/154108126132?_trkparms=ispr%3D1&hash=item23e18f63b4:g:5g0AAOSwkApfch0l&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsStEKTPzZMfQmny3knR97t0JsD6L97oauzovvk2KO0WEt%252Fy1DvPmlzbWssMnX7EK9TVYjaGDcIoX%252FGC%252FmDBJiZuzo4Q57rySuBn9egod2B14d0XvKYJrNvSErcJ2RIJIQABAo473RTmNDf6Ql8ivp1PqCbAFg3a3CFzJJNK1in5oOpyIPlur6D%252BrPTA0SbM8%252F6ryLrpO9VpOBncBnX3aQFMBkALPsmmbolhK1Z2wtACsanrnaWudSur0%252BWE2VNOx8K2DSdSW7FqtEDE0lE%252F%252FUhnHvrdqsLg%252F1GsgbpoyWNV1TSap52SOr0Ndsb6HhxIaOdngXwWmW%252BD4qbhHsUxFesHcMrPsOtlb7gIRYj5ubReQpvgK9GiBSCwiEBMUcODZoXhNEsXp9MiBZF9qeWKTApnNy3pbBWAI65p3v0TiawoyWN%252BGywOpl8laWKrZseCWQMn4o0ZmSC08wU%252Fj7Zn2biBdULHfRiJm%252FPvDzSeAVy92JchTe69dUO9%252BHL7%252B5zurnYfPEjLiOaKhHW60bbsO3ru3PZ1RMwF4iMxP%252FgGIKc8xk11Zsms1%252BGfiApbCy7rQraZPQBzQJpzQOIzd7xrkPyOBuNjj2HdPE7bZjLGOCjuIYDIuBxCP0ZMajblQrr5pzvUjdMT4zJ76GmJ28qGDm5TkxD8tDv0eaWnBfDssMpHtAW8GND7g3hE%252Fhqvoc2072E78fzVz0Ts51H%252BW77rrbpC9DmtLHopXEp5fCMcP3vuTv%252BaCqKuaMuy8blc6S6ldjbuqSkRe5qtI5edG5HeingADQenYM%252FbViMlpVwv1ul9GZquueAo89zMOUKKRhsfq6oP4LyLN6%252BNUcOoAxOSBMw7bWC7oYD5yilolDFw7RauJooyv3wMOuZLCABtjABogDu63sdJpcNptiYVj84nsGwDi4AJ4uTPjw1jItB87NskmHFAyY2sdHH%252Be3OqwybMZKg7OKXzx2WPSDPVdN2K5TjZA%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
...and so on.
You could filter more efficiently within CSS by using the ^= (starts-with) attribute operator to select only the links that begin with that string. Use a set comprehension to return only unique items.
from bs4 import BeautifulSoup
import requests
# eBay search-results page to scan for item links.
url = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=Universal+Adjustable+Hand+Shower+Holder+Suction+Cup+Holder+Full+Plating+Shower+Rail+Head+Holder+Bathroom+Bracket+Stable+rotation&_sacat=0'
r = requests.get(url)
soup = BeautifulSoup(r.content, features="lxml")
# [href^="..."] selects only elements whose href starts with the item-URL
# prefix; collecting into a set drops duplicate links.
links = {i['href'] for i in soup.select('[href^="https://www.ebay.com/itm/"]')}
print(links)
If you wish to specify the href is a descendant of a li then add that into the selector with a descendant combinator and type selector:
links = {i['href'] for i in soup.select('li [href^="https://www.ebay.com/itm/"]')}

Extracting links from html from the link of the following website

I want to extract the link
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=2&next=0&durationType=Y&Year=2018&duration=1&news_type=
from the html of the page
http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05
The following is the code that is used
url_list = "http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05"
html = requests.get(url_list)
soup = BeautifulSoup(html.text,'html.parser')
link = soup.find_all('a')
print(link)
using Beautiful Soup. How would I go about it? Using find_all('a') doesn't return the required link in the returned HTML.
Please try this to get the exact URL you want.
import bs4 as bs
import requests
import re
# Request the stock-news listing directly; this is the page that carries the
# pagination links.
sauce = requests.get('https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=CHC&durationType=Y&Year=2018')
soup = bs.BeautifulSoup(sauce.text, 'html.parser')
# Only anchors whose href mentions "company_info" are candidates.
for a in soup.find_all('a', href=re.compile("company_info")):
    # Of those, keep the ones carrying a "pageno" query parameter.
    if 'pageno' in a['href']:
        print(a['href'])
output:
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=2&next=0&durationType=Y&Year=2018&duration=1&news_type=
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=3&next=0&durationType=Y&Year=2018&duration=1&news_type=
You just have to use the get method to find the href attribute:
from bs4 import BeautifulSoup as soup
import requests
url_list = "http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05"
html = requests.get(url_list)
# `soup` here is the BeautifulSoup class itself (imported under that alias).
page = soup(html.text, 'html.parser')
link = page.find_all('a')
for anchor in link:
    # Tag.get returns None instead of raising when an anchor has no href.
    print(anchor.get('href'))

Scraper not extracting url link:

Hi I am trying to scrape the Amazon url address on this site linked under "View item on Amazon".
My code is below, I get zero response. Appreciate any assistance. Thanks
import requests
url = "https://app.jumpsend.com/deals/230513"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'lxml')
tags = soup.find_all('a')
for tag in tags:
print(tag.get('href'))
The Amazon link (https://www.amazon.com/dp/B07MH9DK5B) isn't in the HTML page source. You need to use Selenium in order to read in the HTML of all the elements that are set by JavaScript:
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://app.jumpsend.com/deals/230513"
# A real browser is driven so that the HTML read back is the page as the
# browser holds it, not just the raw response body.
driver = webdriver.Firefox()
try:
    driver.get(url)
    html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page fails.
    driver.quit()
soup = BeautifulSoup(html, 'html.parser')
# Print the target of the first anchor with class "deal-modal-link".
print(soup.find('a', attrs={'class': 'deal-modal-link'})['href'])
The above code prints out the Amazon link:
'https://www.amazon.com/dp/B07MH9DK5B'

Scraping IMDB.com with beautifulsoup in python but can't get href from movie link

I'm trying to get the href link for a movie (ex: search Iron Man on IMDB) but I can't seem to get it. I keep getting "None" when I run the code but if I remove .get('href'), the code will return the entire line of html (including the link I want). I appreciate any help with this. Thanks!
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin # For joining next page url with base url
search_terms = input("What movie do you want to know about?\n> ").split()
url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + '+'.join(search_terms) + '&s=all'
def scrape_find_next_page(url):
    """Fetch *url* and return the ``href`` attribute of the first ``td.result_text``.

    The page is requested with a browser-like User-Agent header and parsed
    with ``html.parser``.

    Returns whatever ``Tag.get('href')`` yields for the matched ``<td>``,
    which is ``None`` when that tag carries no ``href`` attribute.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # NOTE: this is the line under discussion. The attribute is read from the
    # <td> itself, not from the <a> nested inside it, hence the None result.
    next_page = soup.find('td', 'result_text').get('href')
    return next_page
next_page_url = scrape_find_next_page(url)
You are trying to get the href from the td, which does not have that attribute. You need to get the a tag inside it, which contains the href attribute:
next_page = soup.find('td', 'result_text').find('a').get('href')

scrape href of each product from website using python

I'm using BeautifulSoup to scrape the href of each product on this webpage: http://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=digital+camera. These hrefs end with "keywords=digital+camera".
Here's my code:
from bs4 import BeautifulSoup
import requests
url = "http://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=digital+camera"
keyword = "keywords=digital+camera"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data)
for link in soup.find_all('a'):
href = link.get('href')
if href is None:
continue
elif keyword in href:
print href
I got nothing back from above script, is there anything I can do to fix it?
Thanks
Amazon is detecting the user-agent ("the name of your browser") and changing the content based on that value. If you add a user-agent to the request, you'll get the hrefs with "keywords=digital+camera" in them. Otherwise, you don't.
import urllib.request
from bs4 import BeautifulSoup

url = "http://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=digital+camera"
# Send a browser-like User-Agent: the content served depends on that header.
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req) as response:
    html = response.read()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")
for anchor in soup.find_all('a'):
    href = anchor.get('href')
    # Default to '' so the substring test below works for untitled anchors.
    title = anchor.get('title', '')
    if 'Sony W800/B 20.1 MP Digital' in title:
        print(href)  # prints: http://www.amazon.com/Sony-W800-Digital-Camera-Black/dp/B00I8BIBCW/ref=sr_1_2/183-0842534-8993425?s=photo&ie=UTF8&qid=1421400650&sr=1-2&keywords=digital+camera

Categories