Scrape the href of each product from a website using Python

I'm using BeautifulSoup to scrape the href of each product on this webpage: http://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=digital+camera. These hrefs end with "keywords=digital+camera".
Here's my code:
from bs4 import BeautifulSoup
import requests

url = "http://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=digital+camera"
keyword = "keywords=digital+camera"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data)
for link in soup.find_all('a'):
    href = link.get('href')
    if href is None:
        continue
    elif keyword in href:
        print href
I got nothing back from the above script. Is there anything I can do to fix it?
Thanks

Amazon detects the user agent (roughly, "the name of your browser") and changes the content based on that value. If you add a user agent to the request, you'll get the hrefs containing "keywords=digital+camera"; otherwise, you won't.
url ="http://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=digital+camera"
import urllib2
from bs4 import BeautifulSoup
req = urllib2.Request(url, headers={ 'User-Agent': 'Mozilla/5.0' })
html = urllib2.urlopen(req).read()
soup = BeautifulSoup(html)
links = soup.find_all('a')
for l in links:
href = l.get('href')
title = l.get('title', '')
if 'Sony W800/B 20.1 MP Digital' in title:
print(href) # prints: http://www.amazon.com/Sony-W800-Digital-Camera-Black/dp/B00I8BIBCW/ref=sr_1_2/183-0842534-8993425?s=photo&ie=UTF8&qid=1421400650&sr=1-2&keywords=digital+camera
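For reference, here's a minimal sketch of the same idea using requests under Python 3; the 'Mozilla/5.0' User-Agent value is just a placeholder, and Amazon's markup may have changed since this was asked, so treat it as a starting point rather than a guaranteed fix:
import requests
from bs4 import BeautifulSoup

url = "http://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=digital+camera"
keyword = "keywords=digital+camera"

# send a browser-like User-Agent so the server returns the full search markup
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(r.text, "html.parser")

for link in soup.find_all('a', href=True):  # href=True skips anchors without an href
    if keyword in link['href']:
        print(link['href'])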

Related

I'm trying to fetch the available URLs on a webpage, but the output looks like plain text

import requests
from bs4 import BeautifulSoup

url = 'https://www.worldometers.info/world-population/population-by-country/'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')

urls = []
for link in soup.find_all('a'):
    urls = link.get('href')
    print(urls)
I want the URL to be printed as "https://www.worldometers.info/world-population/china-population/", but it is just printed as "world-population/china-population".
After that, I need to fetch one particular table from each URL fetched.
Those are relative URLs, so you have to make them absolute, as follows:
import requests
from bs4 import BeautifulSoup

url = 'https://www.worldometers.info/world-population/population-by-country/'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')

for link in soup.find_all('a'):
    urls = link.get('href')
    full_url = 'https://www.worldometers.info' + urls
    print(full_url)
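If some hrefs on the page are already absolute (or a tag has no href at all), plain string concatenation can produce broken URLs or raise a TypeError. As a sketch, urllib.parse.urljoin from the standard library resolves relative hrefs against the page URL and leaves absolute ones untouched:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.worldometers.info/world-population/population-by-country/'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')

for link in soup.find_all('a', href=True):
    # urljoin resolves relative hrefs against the page URL; absolute hrefs pass through unchanged
    print(urljoin(url, link['href']))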

How do I exclude certain beautifulsoup results that I don't want?

I am having issues trying to exclude some of the results from my BeautifulSoup program. This is my code:
from bs4 import BeautifulSoup
import requests

URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

for link in soup.find_all('a'):
    print(link.get('href'))
I don't want the results that start with a "#", for example: #cite_ref-18.
I have tried using for loops, but I get this error message: KeyError: 0
You can use the str.startswith() method:
from bs4 import BeautifulSoup
import requests

URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

for tag in soup.find_all('a'):
    link = tag.get('href')
    if not str(link).startswith('#'):
        print(link)
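A small variant of the same approach, if you'd rather skip anchors that have no href at all instead of relying on str(None): find_all accepts href=True as a filter. This sketch reuses the soup object from the snippet above:
for tag in soup.find_all('a', href=True):  # only <a> tags that actually have an href
    if not tag['href'].startswith('#'):
        print(tag['href'])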
You can use the CSS selector a[href]:not([href^="#"]). This selects all <a> tags with an href attribute, except those whose href starts with the # character:
import requests
from bs4 import BeautifulSoup

URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

for link in soup.select('a[href]:not([href^="#"])'):
    print(link['href'])

Python Href scraping

I'm trying to loop over hrefs and get the URLs. I've managed to extract the href, but I need the full URL to follow the link. This is my code at the minute:
import requests
from bs4 import BeautifulSoup

webpage_response = requests.get('http://www.harness.org.au/racing/results/?activeTab=tab')
webpage = webpage_response.content
soup = BeautifulSoup(webpage, "html.parser")

# only finding one track
# soup.table to find all links for days racing
harness_table = soup.table

# scrapes an href that is an incomplete URL that I'm trying to get to
for link in soup.select(".meetingText > a"):
    link.insert(0, "http://www.harness.org.au")
    webpage = requests.get(link)
    new_soup = BeautifulSoup(webpage.content, "html.parser")
    # work through table to get links to tracks
    print(new_soup)
You can store the base URL of the website in a variable, and once you get the href from each link, join the two to create the next URL.
import requests
from bs4 import BeautifulSoup

base_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?activeTab=tab')
soup = BeautifulSoup(webpage_response.content, "html.parser")

# only finding one track
# soup.table to find all links for days racing
harness_table = soup.table

# follows each href, which on its own is an incomplete URL
for link in soup.select(".meetingText > a"):
    webpage = requests.get(base_url + link["href"])
    new_soup = BeautifulSoup(webpage.content, "html.parser")
    # work through table to get links to tracks
    print(new_soup)
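If you only want the meeting links from that first table rather than from the whole page, you could scope the selection to harness_table instead of soup. This is only a sketch and assumes the .meetingText cells actually live inside that first table:
# assumption: the meeting links sit inside the first table on the page
for link in harness_table.select(".meetingText > a"):
    full_url = base_url + link["href"]
    print(full_url)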
Try this solution. Maybe you'll like this library.
from simplified_scrapy import SimplifiedDoc, req

url = 'http://www.harness.org.au/racing/results/?activeTab=tab'
html = req.get(url)
doc = SimplifiedDoc(html)
links = [doc.absoluteUrl(url, ele.a['href']) for ele in doc.selects('td.meetingText')]
print(links)
Result:
['http://www.harness.org.au/racing/fields/race-fields/?mc=BA040320', 'http://www.harness.org.au/racing/fields/race-fields/?mc=BH040320', 'http://www.harness.org.au/racing/fields/race-fields/?mc=RE040320']

I cannot extract the body of the a tag using the web crawler in python

This is my code in Python. I can extract the href attributes, just not what's inside the body of the tag. Should I use "body" with the get() command, or "content", or something else?
import requests
from bs4 import BeautifulSoup

def web():
    url = 'https://www.phoenixmarketcity.com/mumbai/brands'
    source = requests.get(url)
    plain = source.text
    soup = BeautifulSoup(plain, "html.parser")
    for link in soup.findAll('a'):
        href = link.get('body')
        print(href)

web()
I think this is what you want to do:
from bs4 import BeautifulSoup
import requests

def web():
    url = 'https://www.phoenixmarketcity.com/mumbai/brands'
    source = requests.get(url)
    plain = source.text
    soup = BeautifulSoup(plain, "html.parser")
    tags = soup('a')
    for link in tags:
        href = link.get('href')
        print(href)

web()
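If by "what's inside the body" you mean the visible text of each <a> tag, BeautifulSoup exposes that through get_text() rather than get(). A small sketch of that variant:
from bs4 import BeautifulSoup
import requests

def web():
    url = 'https://www.phoenixmarketcity.com/mumbai/brands'
    source = requests.get(url)
    soup = BeautifulSoup(source.text, "html.parser")
    for link in soup.find_all('a'):
        # get('href') returns the link target; get_text() returns the text inside the tag
        print(link.get('href'), link.get_text(strip=True))

web()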

Scraping IMDB.com with beautifulsoup in python but can't get href from movie link

I'm trying to get the href link for a movie (ex: search Iron Man on IMDB) but I can't seem to get it. I keep getting "None" when I run the code but if I remove .get('href'), the code will return the entire line of html (including the link I want). I appreciate any help with this. Thanks!
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin  # For joining next page url with base url

search_terms = input("What movie do you want to know about?\n> ").split()
url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + '+'.join(search_terms) + '&s=all'

def scrape_find_next_page(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    next_page = soup.find('td', 'result_text').get('href')
    return next_page

next_page_url = scrape_find_next_page(url)
You are trying to get the href from the td, but that attribute does not exist on it. You need to get the a tag, which contains the href attribute:
next_page = soup.find('td', 'result_text').find('a').get('href')
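Since the question already imports urljoin, here's a sketch of the full function that returns an absolute URL instead of the relative href (the joining step is an addition to the answer above, and it reuses the imports from the question; IMDb's markup may have changed since):
def scrape_find_next_page(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # the href lives on the <a> inside the result cell, not on the <td> itself
    relative_href = soup.find('td', 'result_text').find('a').get('href')
    # resolve it against the search URL so the caller gets a full link
    return urljoin(url, relative_href)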
