Scrape through website and iterate over search results to get specific data

Scrape through website and iterate over search results to get specific data - python

I'm trying to work on a project to scrape www.boattrader.com to push 800 listings with the Make, Price, and Phone Number of each boat to a CSV file.
I'm looking for guidance on the best way to scrape the links to each boat listing from the search results and then parse through each individual page to grab the Make, Price and Phone number.
Any guidance would be much appreciated!
Thanks again!
from bs4 import BeautifulSoup, SoupStrainer
import requests
def extract_from_search(search_results):
    """Fetch a search-results page and return a listing link found on it.

    Args:
        search_results: URL of the search-results page.

    Returns:
        The href of the last 'btn btn-orange' anchor that has one.

    NOTE: ``boat_links`` is rebound on every pass through the loop, so only
    the final href survives; this is the behaviour the question asks about.
    """
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    for link in possible_links:
        if link.has_attr('href'):
            boat_links = link.attrs['href']
    return boat_links
search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)
# why does this only print one link? What would be the best way to iterate over the search results, so I can put those links into the boat_listing variable to grab the information I'm looking for?
print(boat_links)
def extract_from_listing(boat_listing):
    """Scrape the make, price and phone number from one listing page.

    Args:
        boat_listing: URL of an individual boat listing.

    Returns:
        A ``(make, price, phone)`` tuple. A field whose element is missing
        from the page is returned as ``None`` instead of raising.
    """
    r = requests.get(boat_listing)
    soup = BeautifulSoup(r.text, 'html.parser')

    # The make is the <td> that follows the <th> whose text is "Make".
    make = None
    for th in soup.find_all('th'):
        if th.text == "Make":
            cell = th.find_next_sibling("td")
            if cell is not None:
                make = cell.text

    formatted_price = None
    price = soup.find('span', {'class': 'bd-price'})
    if price is not None and price.string is not None:
        formatted_price = price.string.strip()

    # Reverse the text and swap the parentheses so that ")008(" reads
    # "(800)" -- presumably the site serves the number back to front.
    correct_phone = None
    contact_info = soup.find('div', {'class': 'phone'})
    if contact_info is not None and contact_info.string is not None:
        swap_parens = {ord('('): ord(')'), ord(')'): ord('(')}
        correct_phone = contact_info.string[::-1].translate(swap_parens)

    return make, formatted_price, correct_phone
boat_listing = 'http://www.boattrader.com/listing/2009-Briggs-BR9134-Sportfish-102290211'
make, price, phone = extract_from_listing(boat_listing)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(make)
print(price)
print(phone)

You are only returning the last link, you need to append:
def extract_from_search(search_results):
    """Return the href of every 'btn btn-orange' anchor on a search page.

    Args:
        search_results: URL of the search-results page.

    Returns:
        A list of href strings, in document order (empty if none match).
    """
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    boat_links = []  # create list to append all links to
    for link in possible_links:
        if link.has_attr('href'):
            boat_links.append(link.attrs['href'])  # append each link
    return boat_links
Or use a list comp:
def extract_from_search(search_results):
    """Return the href of every 'btn btn-orange' anchor on a search page.

    Args:
        search_results: URL of the search-results page.

    Returns:
        A list of href strings, in document order (empty if none match).
    """
    r = requests.get(search_results)
    # .content is the raw bytes, which leaves detecting the encoding to
    # BeautifulSoup rather than to requests.
    ad_page_html = r.content
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    return [link.attrs['href'] for link in possible_links if link.has_attr('href')]

Related

Python / BeautifulSoup webscraper returning "None"

I'm trying to build a web scraper that collects freelance gig postings from different websites into one place. My code is below, and it keeps returning "None". I'm a bit stuck at this point; if you can help identify why it keeps doing this, that would be great.
import requests
from bs4 import BeautifulSoup
import pprint
# --- Airtasker: search results for "python" ---
res1 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=python&badges=&sort_by=posted_desc') # this is where we will scrape the info from
soup1 = BeautifulSoup(res1.text, 'html.parser') # this tells BS to give us HTML code for the page
# NOTE(review): a space in a CSS selector is the descendant combinator, so the
# two selectors below look for a <new-task-list-item--open> / <at-icon-calendar>
# *element* nested inside the first class. An element carrying both classes
# would be written '.a.b' -- confirm against the page markup.
links1 = soup1.select('.new-task-list-item new-task-list-item--open') # link of each gig
subtext1 = soup1.select('.new-task-list-item__date at-icon-calendar') # date of each gig
# --- Airtasker: search results for "web developer" (same selectors as above) ---
res2 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=web%20developer&badges=&sort_by=posted_desc')
soup2 = BeautifulSoup(res2.text, 'html.parser')
links2 = soup2.select('.new-task-list-item new-task-list-item--open')
subtext2 = soup2.select('.new-task-list-item__date at-icon-calendar')
# --- Upwork: four category pages, each scraped with the same two selectors ---
res3 = requests.get('https://www.upwork.com/freelance-jobs/website/')
soup3 = BeautifulSoup(res3.text, 'html.parser')
links3 = soup3.select('.job-title')
subtext3 = soup3.select('.text-muted')
res4 = requests.get('https://www.upwork.com/freelance-jobs/data-science/')
soup4 = BeautifulSoup(res4.text, 'html.parser')
links4 = soup4.select('.job-title')
subtext4 = soup4.select('.text-muted')
res5 = requests.get('https://www.upwork.com/freelance-jobs/bot-development/')
soup5 = BeautifulSoup(res5.text, 'html.parser')
links5 = soup5.select('.job-title')
subtext5 = soup5.select('.text-muted')
res6 = requests.get('https://www.upwork.com/freelance-jobs/python-script/')
soup6 = BeautifulSoup(res6.text, 'html.parser')
links6 = soup6.select('.job-title')
subtext6 = soup6.select('.text-muted')
# select() returns lists, so + concatenates the results of all six pages.
mega_links = links1 + links2 + links3 + links4 + links5 + links6
mega_subtext = subtext1 + subtext2 + subtext3 + subtext4 + subtext5 + subtext6
def extract(links, subtexts):
    """Build one ``{'title', 'link'}`` dict per scraped element.

    Args:
        links: Iterable of parsed elements exposing ``getText()`` and
            ``get('href')`` (e.g. BeautifulSoup tags).
        subtexts: Accepted for call compatibility; not used.

    Returns:
        A list of dicts in input order. ``link`` is whatever
        ``item.get('href')`` returns, which is ``None`` for an element that
        has no ``href`` attribute.
    """
    return [
        {'title': item.getText(), 'link': item.get('href')}
        for item in links
    ]
pprint.pprint(extract(mega_links , mega_subtext))

I have no idea what exactly you are trying to extract from the scraped web page requests. Here's what I tried from my end:
Your links variables are empty lists because no element on the page matches the selectors you are using. For example, checking the first page you are scraping in the browser console shows that the element you are trying to select does not exist:
I would recommend that you confirm which element you are trying to scrape and confirm its class.
Another Point of Consideration:
When you print your soup variables you will notice that you get a Cloudflare page as the output.

How to extract text within h4 strong?

I am trying to extract each "Overall Rating" (number value in strong tags) from each product page
https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue
The structure goes as follows:
<div class="col-sm-12">
<h2 class="line-bottom"> Customer Reviews</h2>
<h4>
Overall Rating
<strong>5</strong>
<span></span>
</h4>
</div>
I am trying to extract only the strong values.
productsRating = soup.find("div", {"class": "col-sm-12"}).h4
This sometimes works, but the page makes use of same class for different elements so it extracts un-wanted html elements.
Is there any solution to only getting the products overall reviews?
EDITED!!
this is the whole loop for my program.
# Compiled once: strips everything except digits, dots and commas from a price.
trim = re.compile(r'[^\d.,]+')

# First pass: walk the listing pages and collect every product link.
for page in range(1, 2):
    guitarPage = requests.get('https://www.guitarguitar.co.uk/guitars/electric/page-{}'.format(page)).text
    soup = BeautifulSoup(guitarPage, 'lxml')
    guitars = soup.find_all(class_='col-xs-6 col-sm-4 col-md-4 col-lg-3')
    for guitar in guitars:
        title_text = guitar.h3.text.strip()
        print('Guitar Name: ', title_text)
        price = guitar.find(class_='price bold small').text.strip()
        int_price = trim.sub('', price)
        print('Guitar Price: ', int_price)
        priceSave = guitar.find('span', {'class': 'price save'})
        if priceSave is not None:
            priceOf = priceSave.text
            int_priceOff = trim.sub('', priceOf)
            print('Save: ', int_priceOff)
        else:
            print("No discount!")
        image = guitar.img.get('src')
        print('Guitar Image: ', image)
        productLink = guitar.find('a').get('href')
        linkProd = url + productLink
        print('Link of product', linkProd)
        productsPage.append(linkProd)

# Second pass: visit each product page for its description and rating.
for products in productsPage:
    response = requests.get(products)
    soup = BeautifulSoup(response.content, "lxml")
    productsDetails = soup.find("div", {"class": "description-preview"})
    if productsDetails is not None:
        description = productsDetails.text
        print('product detail: ', description)
    else:
        print('none')
    time.sleep(0.2)
    # Takes the first <strong> anywhere on the page, which is not necessarily
    # the "Overall Rating" value (the problem described above).
    productsRating = soup.find_all('strong')[0].text
    print(productsRating)

The review info is all in a script tag, which you can extract and load with json. It should be simple enough to see how to fit that into a loop.
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue'
r = requests.get(url)
soup = bs(r.content, 'lxml')
# The review data is embedded in the page as a JSON-LD <script> block.
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
# "@graph" is the JSON-LD keyword holding the list of nodes; index 2 is
# presumably the product node on this page -- verify against the live JSON.
overall_rating = data['@graph'][2]['aggregateRating']['ratingValue']
reviews = [review for review in data['@graph'][2]['review']] #extract what you want
Output:
Explore json
To handle products with no reviews you could use a simple try/except:
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
try:
    # "@graph" is the JSON-LD keyword holding the list of nodes.
    overall_rating = data['@graph'][2]['aggregateRating']['ratingValue']
    reviews = [review for review in data['@graph'][2]['review']] #extract what you want
except (KeyError, IndexError, TypeError):
    # No rating/review data in the JSON for this product.
    overall_rating = "None"
    reviews = ['None']
or, use an if statement:
# Cheap textual check on the raw JSON before indexing into the parsed data.
if 'aggregateRating' in script:
    overall_rating = data['@graph'][2]['aggregateRating']['ratingValue']
    reviews = [review for review in data['@graph'][2]['review']] #extract what you want
else:
    overall_rating = "None"
    reviews = ['None']

Try:
import requests
from bs4 import BeautifulSoup
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
try:
    # Locate the "Customer Reviews" heading, then read the <strong> inside the
    # element that follows it. The match ignores case, and the None guard
    # covers <h2> tags that have no single string.
    heading = soup.find('h2', string=lambda s: s is not None and "customer reviews" in s.lower())
    productsRating = heading.find_next_siblings()[0].find('strong').text
except (AttributeError, IndexError):
    # Heading, following sibling or <strong> not present: no rating.
    productsRating = None
print(productsRating)

How do I get hrefs from hrefs?

How do I get hrefs from hrefs using Python in class and method format?
I have tried:
root_url = 'https://www.iea.org'
class IEAData:
    def __init__(self):
        # The try/except body was elided in the original post.
        pass

    def get_links(self, url):
        all_links = []
        # NOTE: this fetches root_url; the url argument is never used.
        page = requests.get(root_url)
        soup = BeautifulSoup(page.text, 'html.parser')
        for href in soup.find_all(class_='omrlist'):
            all_links.append(root_url + href.find('a').get('href'))
        return all_links
        #print(all_links)
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
reportLinks = []
for url in yearLinks:
    # NOTE: passes the whole yearLinks list rather than the current url.
    links =iea_obj.get_links(yearLinks)
    print(links)
Expected: the links variable should contain all of the month hrefs, but I am not getting them, so please tell me how I should do it.

There were a couple of issues with your code. Your get_links() function was not using the url that was passed to it. When looping over the returned links, you were passing yearLinks rather than the url.
The following should get you going:
from bs4 import BeautifulSoup
import requests
root_url = 'https://www.iea.org'
class IEAData:
    """Collects links from pages under ``root_url``."""

    def get_links(self, url):
        """Return absolute URLs for the first link in each 'omrlist' element.

        Args:
            url: Page to fetch and scan.

        Returns:
            A list of ``root_url + href`` strings in document order.
            Elements without an ``<a href>`` are skipped.
        """
        all_links = []
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        for li in soup.find_all(class_='omrlist'):
            anchor = li.find('a')
            # Skip items that carry no link instead of failing on None.
            if anchor is not None and anchor.get('href'):
                all_links.append(root_url + anchor.get('href'))
        return all_links
iea_obj = IEAData()
# The first level gives one link per year; each year page is scanned again.
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
for url in yearLinks:
    links = iea_obj.get_links(url)
    print(url, links)
This would give you output starting:
https://www.iea.org/oilmarketreport/reports/2018/ ['https://www.iea.org/oilmarketreport/reports/2018/0118/', 'https://www.iea.org/oilmarketreport/reports/2018/0218/', 'https://www.iea.org/oilmarketreport/reports/2018/0318/', 'https://www.iea.org/oilmarketreport/reports/2018/0418/', 'https://www.iea.org/oilmarketreport/reports/2018/0518/', 'https://www.iea.org/oilmarketreport/reports/2018/0618/', 'https://www.iea.org/oilmarketreport/reports/2018/0718/', 'https://www.iea.org/oilmarketreport/reports/2018/0818/', 'https://www.iea.org/oilmarketreport/reports/2018/1018/']
https://www.iea.org/oilmarketreport/reports/2017/ ['https://www.iea.org/oilmarketreport/reports/2017/0117/', 'https://www.iea.org/oilmarketreport/reports/2017/0217/', 'https://www.iea.org/oilmarketreport/reports/2017/0317/', 'https://www.iea.org/oilmarketreport/reports/2017/0417/', 'https://www.iea.org/oilmarketreport/reports/2017/0517/', 'https://www.iea.org/oilmarketreport/reports/2017/0617/', 'https://www.iea.org/oilmarketreport/reports/2017/0717/', 'https://www.iea.org/oilmarketreport/reports/2017/0817/', 'https://www.iea.org/oilmarketreport/reports/2017/0917/', 'https://www.iea.org/oilmarketreport/reports/2017/1017/', 'https://www.iea.org/oilmarketreport/reports/2017/1117/', 'https://www.iea.org/oilmarketreport/reports/2017/1217/']

I'm fairly new to programming, and I'm still learning and trying to understand how classes and whatnot all work together. But gave it a shot (that's how we learn, right?)
Not sure if this is what you're looking for as your output. I changed 2 things and was able to put all the links from within the yearLinks into a list. Note that it'll also include the PDF links as well as the months links that I think you wanted. If you don't want those PDF links, and exclusively the months, then just don't include the pdf.
So here's the code I did it with, and maybe you can use that to fit into how you have it structured.
root_url = 'https://www.iea.org'
class IEAData:
    """Collects links from pages under ``root_url``."""

    def get_links(self, url):
        """Return absolute URLs for the first link in each 'omrlist' element.

        Args:
            url: Page to fetch and scan.

        Returns:
            A list of ``root_url + href`` strings in document order.
            Elements without an ``<a href>`` are skipped.
        """
        all_links = []
        page = requests.get(url)
        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        for href in soup.find_all(class_='omrlist'):
            anchor = href.find('a')
            # Skip items that carry no link instead of failing on None.
            if anchor is not None and anchor.get('href'):
                all_links.append(root_url + anchor.get('href'))
        return all_links
        #print(all_links)
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
reportLinks = []
for url in yearLinks:
    links = iea_obj.get_links(url)
    # uncomment line below if you do not want the .pdf links
    #links = [ x for x in links if ".pdf" not in x ]
    reportLinks += links

Execute loop based on a list - obtain result for each page (subpage)

I am trying to obtain the number of pages for each url from a list of urls. My code works as long as I have only one url; however, as soon as I try it with a list of urls I only get the result from one url. I guess the problem is related to my loop. Given that I am new to Python and Beautiful Soup, I can't manage to spot the mistake myself.
base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'
urls=[]
##Change URL into object (soup)
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html5lib")
#get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class':'link'})
res = []
for i in data:
    res.append(i.text) #writing each value to res list
res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except:
        print("current value is not a number")
last_page=max(res_int)
#print(last_page)
for i in range (1,last_page):
    page = main_page.format(i)
    for link in soup.find_all('div', {'class':'hotel-reviews-bar'}):
        urls = base_url + link.find('a').get('href')+"/-/p/{}"
print(urls)
So far, everything works, I obtain the max page number and get all the urls from each page. The problem lies in the code below (I believe):
for url in urls: #to loop through the list of urls
    r = requests.get(url.format(0))
    soup = BeautifulSoup(r.text, 'lxml')
    daten = soup.find_all('a', {'class':'link'})
    tes = []
    for z in daten:
        tes.append(z.text) #writing each value to res list
    print(tes)
    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except:
            print("current value is not a number")
    anzahl=max(tes_int)
    print(anzahl)
I am trying to apply the same concept as in the code above for each url from the list urls- but instead of obtaining the max page number for each url I obtain 241 every time, as if I am caught in a loop...
Any thoughts on that? Help is highly appreciated.

You're assigning urls to the last link generated by the loop, so it ends up as a single string.
To build a valid list of urls you need to replace the = assignment with append():
urls = []
for i in range (1,last_page):
    page = main_page.format(i)
    r = requests.get(page) #these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml') #these 2 rows added
    for link in soup.find_all('div', {'class':'hotel-reviews-bar'}):
        try:
            urls.append(base_url + link.find('a').get('href')+"/-/p/{}")
        except (AttributeError, TypeError):
            # find('a') returned None, or the anchor has no href.
            print('no link available', i)
print(urls)
EDIT: okay, as far as I can see you have several issues in your code. Along with my initial fix, I'm outlining my understanding of how your code is intended to work:
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'

##Change URL into object (soup)
r = requests.get(main_page.format(0))
#get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class':'link'})
res = []
for i in data:
    res.append(i.text) #writing each value to res list
res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except ValueError:
        print("current value is not a number")
last_page=max(res_int)
#print(last_page)

# Collect the review-page URL template of every hotel on every result page.
urls = []
for i in range (1,last_page):
    page = main_page.format(i)
    r = requests.get(page) #these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml') #these 2 rows added
    for link in soup.find_all('div', {'class':'hotel-reviews-bar'}):
        try: #also adding try-except for escaping broken/unavailable links
            urls.append(base_url + link.find('a').get('href')+"/-/p/{}")
        except (AttributeError, TypeError):
            # find('a') returned None, or the anchor has no href.
            print('no link available', i)
urls = list(set(urls)) #check and drop duplicated in links list

for url in urls: #to loop through the list of urls
    try:
        r = requests.get(url.format(0))
    except requests.RequestException:
        print('broken link')
        # Nothing was fetched: skip, rather than re-reading the links of
        # whichever page was parsed previously.
        continue
    print(url.format(0))
    soup = BeautifulSoup(r.text, 'lxml')
    daten = soup.find_all('a', {'class':'link'})
    tes = []
    for z in daten:
        tes.append(z.text) #writing each value to res list
    # print(tes)
    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except ValueError:
            print("current value is not a number")
    try:
        anzahl=max(tes_int)
        print(anzahl)
    except ValueError:
        # max() of an empty list: the page had no numeric links.
        print('maximum cannot be calculated')

web scraping with beautifulsoup getting error

I'm pretty new to Python and mainly need it for getting information from website.
def spider(max_pages):
    """Fetch the start page ``max_pages`` times and scrape each 'c5' link.

    Every ``<a class="c5">`` href found is passed to ``single_item`` after
    a 0.3 second pause.
    """
    page = 1
    while page <= max_pages:
        url = 'https://www.example.com'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'c5'}):
            href = link.get('href')
            time.sleep(0.3)
            # print(href)
            single_item(href)
        page += 1
def single_item(item_url):
    """Print the UPC, SKU and price elements of one item page and log them.

    The last matched element of each kind is appended to the output file
    via ``str()``, which writes the whole tag including its markup (the
    output the question asks about).
    """
    s_code = requests.get(item_url)
    p_text = s_code.text
    soup = BeautifulSoup(p_text, "html.parser")
    # Each placeholder tuple is overwritten by the loop variable below; it
    # only survives when the page has no matching element.
    upc = ('div', {'class': 'product-upc'})
    for upc in soup.findAll('span', {'class': 'upcNum'}):
        print(upc.string)
    sku = ('span', {'data-selenium': 'bhSku'})
    for sku in soup.findAll('span', {'class': 'fs16 c28'}):
        print(sku.text)
    price = ('span', {'class': 'price'})
    for price in soup.findAll('meta', {'itemprop': 'price'}):
        print(price)
    # 'with' closes the file even if one of the writes fails.
    with open(r'C:\Users\abc.txt', 'a') as outFile:
        outFile.write(str(upc))
        outFile.write("\n")
        outFile.write(str(sku))
        outFile.write("\n")
        outFile.write(str(price))
        outFile.write('\n')
spider(1)
What I want to get is "UPC:813066012487, price:26.45 and SKU:KBPTMCC2" without any span, meta or content attributes. I attached my output below.
Here is my output:
screenshot
What am I doing wrong?
Hope someone can figure it out! Thanks!!

The data you want is in the div attribute data-itemdata, you can call json.loads and it will give you a dict that you can access to get what you want:
from bs4 import BeautifulSoup
import requests
import json
soup = BeautifulSoup(requests.get("https://www.bhphotovideo.com/c/buy/accessories/ipp/100/mnp/25/Ns/p_PRICE_2%7c0/ci/20861/pn/1/N/4005352853+35").content, "html.parser")
# Each item <div> carries its details as JSON in the data-itemdata attribute.
for d in soup.select("div[data-selenium=itemDetail]"):
    data = json.loads(d["data-itemdata"])
    print(data)
Each data dict will look like:
{u'catagoryId': u'20861',
u'inCart': False,
u'inWish': False,
u'is': u'REG',
u'itemCode': u'KBPTMCC2',
u'li': [],
u'price': u'26.45',
u'searchTerm': u'',
u'sku': u'890522'}
So just access by key i.e price = data["price"].
To get the UPC we just need to visit the items page, we can get the url from h3 with the data-selenium attribute:
for d in soup.select("div[data-selenium=itemDetail]"):
    # The item page URL is the link inside the <h3 data-selenium=...> title.
    url = d.select_one("h3[data-selenium] a")["href"]
    upc = BeautifulSoup(requests.get(url).content, "html.parser").select_one("span.upcNum").text.strip()
    data = json.loads(d["data-itemdata"])
Not all pages have a UPC value, so you will have to decide what to do. If you just want products with UPCs, first check whether the select finds anything:
for d in soup.select("div[data-selenium=itemDetail]"):
    url = d.select_one("h3[data-selenium] a")["href"]
    # select_one returns None when the item page has no UPC element.
    upc = BeautifulSoup(requests.get(url).content, "html.parser").select_one("span.upcNum")
    if upc is not None:
        data = json.loads(d["data-itemdata"])
        text = upc.text.strip()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrape through website and iterate over search results to get specific data - python

Related

Python / BeautifulSoup webscraper returning "None"

How to extract text within h4 strong?

How do I get hrefs from hrefs?

Execute loop based on a list - obtain result for each page (subpage)

web scraping with beautifulsoup getting error

Categories

Resources