I am using BeautifulSoup and Selenium to access this page, https://www.chewy.com/blue-buffalo-basics-limited/dp/37047, and I am trying to get a list of the prices and ratings for all packaging types.
Below is my code:
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}
# use Selenium to get buttons through all pages
test_url = 'https://www.chewy.com/blue-buffalo-basics-limited/dp/37047'
test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')
btn_count = []
for btn_cnt in test.select('.js-sku-selector > div'):
    btn_cnt = btn_cnt['data-attributes'].count('isSelected')
    btn_count.append(btn_cnt)
buttons = list(range(1, btn_cnt + 1))
xpath = []
for b in buttons:
    btn_path = '//*[@id="variation-Size"]/div[2]/div[' + str(b) + ']/div/label'
    print(btn_path)
    xpath.append(btn_path)
print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format('brand', 'product', 'id','auto_ship', 'regular','rating'))
for btn in xpath:
    test_url = 'https://www.chewy.com/blue-buffalo-basics-limited/dp/37047'
    test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')
    driver = webdriver.Chrome(executable_path=r'C:\Users\public\chromedriver')
    driver.get(test_url)
    time.sleep(5)
    driver.find_element_by_xpath(btn).click()
    time.sleep(5)
    for brand, product, id, auto_ship, price, rating in zip(test.findAll('span', attrs={'itemprop': 'brand'}),
                                                            test.findAll('div', attrs={'id': 'product-title'}),
                                                            test.findAll('div', attrs={'class': 'value js-part-number'}),
                                                            test.findAll('p', attrs={'class': 'autoship-pricing p'}),
                                                            test.findAll('span', attrs={'class': 'ga-eec__price'}),
                                                            test.select('div.ugc')):
        #date = date.today()
        brand = brand.text
        product = ' '.join(product.h1.text.split())
        id = ' '.join(id.span.text.split())
        p1 = auto_ship.text.index('(')
        auto_ship = ' '.join(auto_ship.text[:p1].split())
        regular_price = ' '.join(price.text.split())
        rating = rating.picture.img['src'][-7:-4].replace('_', '.')
        print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format(brand, product, id, auto_ship, regular_price, rating))
    driver.quit()
The result I get is the same for every button: I would expect the data to differ for the three packaging types, but it only returns the values from the default page.
Is there anything else I should do to dynamically pick up the values for each button?
I copied the XPath of the labels from the page's HTML. Clicking does bring me to the target view for the different packages, and the underlying HTML values do change. However, my print statement still picks up the values from the main page. Any recommendations?
I found what happened: I wasn't loading the current page into soup, but was instead loading a brand-new copy of the source page with requests.
I now grab driver.page_source after the click, give the browser sufficient time to load (10 seconds), and soup that page source. It works now.
# use Selenium to get buttons through all pages
test_url = 'https://www.chewy.com/wellness-large-breed-complete-health/dp/34356'
test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')
btn_count = []
for btn_cnt in test.select('.js-sku-selector > div'):
    btn_cnt = btn_cnt['data-attributes'].count('isSelected')
    btn_count.append(btn_cnt)
buttons = list(range(1, btn_cnt + 1))
xpath = []
for b in buttons:
    btn_path = '//*[@id="variation-Size"]/div[2]/div[' + str(b) + ']/div/label'
    print(btn_path)
    xpath.append(btn_path)
print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format('brand', 'product', 'id','auto_ship', 'regular','rating'))
for btn in xpath:
    test_url = 'https://www.chewy.com/wellness-large-breed-complete-health/dp/34356'
    driver = webdriver.Chrome(executable_path=r'C:\Users\public\chromedriver')
    driver.get(test_url)
    time.sleep(1)
    driver.find_element_by_xpath(btn).click()
    time.sleep(5)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    for brand, product, id, auto_ship, price, rating in zip(soup.findAll('span', attrs={'itemprop': 'brand'}),
                                                            soup.findAll('div', attrs={'id': 'product-title'}),
                                                            soup.findAll('div', attrs={'class': 'value js-part-number'}),
                                                            soup.findAll('p', attrs={'class': 'autoship-pricing p'}),
                                                            soup.findAll('span', attrs={'class': 'ga-eec__price'}),
                                                            soup.select('div.ugc')):
        #date = date.today()
        brand = brand.text
        product = ' '.join(product.h1.text.split())
        id = ' '.join(id.span.text.split())
        p1 = auto_ship.text.index('(')
        auto_ship = ' '.join(auto_ship.text[:p1].split())
        regular_price = ' '.join(price.text.split())
        rating = rating.picture.img['src'][-7:-4].replace('_', '.')
        print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format(brand, product, id, auto_ship, regular_price, rating))
    driver.quit()
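As a side note, the fixed-length time.sleep calls can be replaced with Selenium's explicit waits, which continue as soon as the page is actually ready. A minimal sketch of the idea, reusing the btn XPath and the ga-eec__price class from the code above (the 10-second timeouts are illustrative):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(test_url)
# wait up to 10 s for the size button to become clickable, then click it
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, btn))).click()
# wait until a price element is present before grabbing the rendered HTML
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'ga-eec__price')))
soup = BeautifulSoup(driver.page_source, 'html.parser')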
For some days I have been trying to crawl all vessel data from vesselfinder, together with each vessel's description page: from the description page I want information such as vessel type, IMO number, etc., in table form. I have tried different ways to do this but still get a lot of errors. First I worked out how to follow the links to the description pages, how to collect those links from all of the listing pages, and how to get specific table data from a description page (which is still not complete, but I get some of it).
But today, when I combined the code and tried to get the data from all of the links and their description pages at the same time, I got a lot of errors that left me confused.
I attached my code, which is not good, but it works up to this point #print(len(vessellist)); after that... errors.
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {
    'user-agent': 'Mozilla/5.0',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
baseurl = 'https://www.vesselfinder.com/vessels'
vessellist = []
for x in range(1, 6):
    response = requests.get(
        f'https://www.vesselfinder.com/vessels?page={x}',
        headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    contents = soup.find_all('td', class_='v2')
    for property in contents:
        for item in property.find_all('a', href=True):
            vessellist.append(baseurl + item['href'])
for link in vessellist:
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='tparams')
    head = []
    for i in table.find_all('td', class_='n3'):
        title = i.text
        head.append(title)
    values = []
    for row in table.find_all('td', class_='v3'):
        data = row.text
        values.append(data)
    df = pd.DataFrame(values)
    print(df)
Two steps: first get the summary data (which includes the href), then get the detailed data. These two steps are implemented in two functions. Here I fetch only the first 10 pages; 200 are available.
import requests as rq
from bs4 import BeautifulSoup as bs
from requests.api import head
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}
def getSummaryData():
    data = []
    url = "https://www.vesselfinder.com/vessels"
    for page in range(1, 10+1, 1):  # only the first 200 pages seem to be authorized?
        print("Page : %d/10" % page)
        resp = rq.get(url + "?page=%s" % page, headers=headers)
        soup = bs(resp.content, "lxml")
        section = soup.find_all('section', {'class': 'listing'})[0]
        tbody = section.find_all('tbody')[0]
        trs = tbody.find_all('tr')
        for tr in trs:
            tds = tr.find_all('td')
            # column 1 data
            sub = tds[1].find('a')
            href = sub['href']
            divs = sub.find_all('div')
            country = divs[0]['title']
            sub_divs = divs[1].find_all('div')
            vessel_name = sub_divs[0].text
            vessel_type = sub_divs[1].text
            # column 2 data
            build_year = tds[2].text
            # column 3 data
            gt = tds[3].text
            # column 4 data
            dwt = tds[4].text
            # column 5 data
            size = tds[5].text
            # save data
            tr_data = {'country': country,
                       'vessel_name': vessel_name,
                       'vessel_type': vessel_type,
                       'build_year': build_year,
                       'gt': gt,
                       'dwt': dwt,
                       'size': size,
                       'href': href}
            data.append(tr_data)
    return data

def getDetailledData(data):
    for (iel, el) in enumerate(data):
        print("%d/%d" % (iel+1, len(data)))
        url = "https://www.vesselfinder.com" + el['href']
        # make get call
        resp = rq.get(url, headers=headers)
        soup = bs(resp.content, "lxml")
        # position and voyage data
        table = soup.find_all('table', {'class': 'aparams'})[0]
        trs = table.find_all('tr')
        labels = ["course_speed", "current_draught", "navigation_status",
                  "position_received", "IMO_MMSI", "callsign", "flag", "length_beam"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        # vessel particulars
        table = soup.find_all('table', {'class': 'tparams'})[0]
        trs = table.find_all('tr')
        labels = ["IMO_number", "vessel_name", "ship_type", "flag",
                  "homeport", "gross_tonnage", "summer_deadweight_t",
                  "length_overall_m", "beam_m", "draught_m", "year_of_built",
                  "builder", "place_of_built", "yard", "TEU", "crude", "grain",
                  "bale", "classification_society", "registered_owner", "manager"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        #break
    return data
Call these functions:
data = getSummaryData() # href include
data = getDetailledData(data)
Don't rely on the class attribute alone to target the data. Generally, you should go through table -> tbody and then take the trs or tds, to be sure you are getting the correct ones.
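A minimal sketch of that table -> tbody walk, assuming soup is a parsed description page as in the question, pairs each label cell with the value cell from the same row instead of collecting the two columns separately:

# illustrative only: pair each label with the value from the same row
table = soup.find('table', class_='tparams')
body = table.tbody or table          # fall back if there is no explicit tbody
particulars = {}
for tr in body.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) >= 2:
        particulars[tds[0].get_text(strip=True)] = tds[1].get_text(strip=True)
print(particulars)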
I have built a web scraper for real estate data with the help of some fellow members on this website.
It works perfectly, but after it crawls to page 6/7 or further, the typical cookie warning pops up and seems to disrupt my output in the CSV file.
Is there a way to handle the pop-up?
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
#open('output.csv', 'w').close()
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
        #browser.delete_all_cookies()
        browser.get(url)
        #session = requests.Session()
        #res1 = session.post(url, post_data)
        #res2 = session.get(url1)
        time.sleep(15)
        #input('Press Enter after bypassing Captcha')
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        info = soup.find_all('div', {'class': 'property-info'})
        inside = soup.find_all('a', {'class': 'property-inner'}, {'href'})
        # Make empty lists with header lines
        outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
        outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]
        for huis in info:
            street = huis.find('h2')
            street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
            address = huis.find('div')
            address = address.find('div').text.strip()
            price = huis.find('div', {'class': 'price-info'})
            price = price.find('div').text.strip()
            price = re.findall(r'\d', price)
            price = ''.join(price)
            pricetag = huis.find('div', {'class': 'property-price'})
            pricetag = pricetag.find('span').text.strip()
            outputlist_l1.append([street, address, price, pricetag])
        for items in inside:
            #browser.delete_all_cookies()
            href = items.get('href')
            url1 = href.format(page)
            browser.get(url1)
            kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
            details = kenmerken.find_all('div', {'class': 'detail-tab-content kenmerken'})
            try:
                tr = details[0].find_all('td', {'class': 'value'})
            except IndexError:
                size_space = 'Unknown'
            for inhoud in tr:
                soort = tr[0].get_text(separator='\n', strip=True)
                bouwjaar = tr[1].get_text(separator='\n', strip=True)
                woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
                inhoud = tr[3].get_text(separator='\n', strip=True)
                perceel = tr[4].get_text(separator='\n', strip=True)
                l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
                outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])
        page += 1
        # Merge outputlist_l1 with outputlist_l2
        outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]
        # transform to Pandas dataframe and export as csv
        #saveFile = open('output.csv', 'a')
        df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
        df.to_csv('output.csv', index=False)
        #saveFile.close()

jaap_spider(15)
The cookie script in the website:
(function(){function g(a){return{get:function(b){var c=JSON.parse(a.getItem(b));return!c||Date.parse(c.expires)<=(new Date).getTime()?(a.removeItem(b),null):c.value},set:function(b,c,d){c={value:c,expires:d.toUTCString()};a.setItem(b,JSON.stringify(c))},remove:function(b){a.removeItem(b)}}}function d(a,b,c,d){this.parseCommand=function(e,g){function h(){var a=JSON.stringify({messageId:k,value:l||!1});window.parent.postMessage(a,"")}var m=q[a],n=e.action,p=e.key,k=e.messageId,f=e.siteId,f=d?p:p+":"+
f,l=e.value,r=e.expiresMinutes||1440(e.expiresDays||365),s=function(){var a=new Date;a.setTime(a.getTime()+6E4*r);return a}();if(!function(){var a={_hjSet:c,_hjGet:b,_hjRemove:c}[n]||[];return 0<=a.indexOf("")||0<=a.indexOf(g)}())throw Error("Command "+n+" not allowed on key: "+p);switch(n){case "_hjSet":m.set(f,l,s);break;case "_hjGet":l=m.get(f);h();break;case "_hjRemove":m.remove(f)}}}function h(a){try{var b=JSON.parse(a.data);b.key&&k[b.key]&&k[b.key].parseCommand(b,a.origin)}catch(c){return null}}
var q;try{q={cookie:{get:function(a){return(a=RegExp("(?:^|; )"+a+"=([^;])").exec(document.cookie))?a[1]:void 0},set:function(a,b,c){document.cookie=a+"="+b+"; path=/; expires="+c.toUTCString()},remove:function(a){document.cookie=a+"=; expires=Tue, 13 Mar 1979 00:00:00 UTC; path=/;"}},localStorage:g(localStorage),sessionStorage:g(sessionStorage)}}catch(t){return}var k={_hjOptOut:new d("cookie",[""],["https://www.hotjar.com","https://local.hotjar.com","http://local.hotjar.com","https://insights-staging.hotjar.com",
"http://insights-staging.hotjar.com"],!0),grant_consent:new d("cookie",[""],[""],!1),screenshot_retake:new d("localStorage",[""],[""],!1),screenshot_active_retake:new d("sessionStorage",[""],["*"],!1)};window.addEventListener?window.addEventListener("message",h,!1):window.attachEvent("onmessage",h)})();
To overcome the pop-up problem, just check after loading the page whether the pop-up is present; if it is, click on it. Hope this helps.
page = 1
while page <= max_pages:
    url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
    browser.get(url)
    time.sleep(10)
    # Check here whether the popup is present
    if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']")) > 0:
        browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
        time.sleep(5)
    #input('Press Enter after bypassing Captcha')
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    info = soup.find_all('div', {'class': 'property-info'})
    inside = soup.find_all('a', {'class': 'property-inner'}, {'href'})
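Since the detail pages are opened with their own browser.get calls inside the loop, the same check is worth running there too. A small sketch of a helper (the function name is illustrative; the XPath is the one from the snippet above) keeps the check in one place:

def dismiss_cookie_popup(browser, pause=5):
    # click the cookie button if it is present; otherwise do nothing
    buttons = browser.find_elements_by_xpath("//a[@class='CookiesOK']")
    if buttons:
        buttons[0].click()
        time.sleep(pause)

Calling dismiss_cookie_popup(browser) right after every browser.get(...) should keep the pop-up from corrupting rows collected later in the crawl.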
I want to scrape the product name, price, and image source from this page: https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085. Only limited results are displayed.
I also want to scrape the filter checkboxes, but I don't know how to get the complete set of results; only 10 results are displayed. What should I do to scrape the complete results? If I remove the headers, the complete list of names and prices is returned, but the image sources are not scraped.
headers = {"Accept-Language": "en-US,en;q=0.5",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": "http://thewebsite.com",
"Connection": "keep-alive"}
scrap = requests.get('https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085',headers=headers)
# Grab title-artist classes and store in recordList
content = BeautifulSoup(scrap.text, "html.parser")
if content.findAll("div", {"class": "search-result-gridview-item-wrapper"}) != None:
products = content.findAll("div", {"class": "search-result-gridview-item-wrapper"})
for product in products:
name = product.find("div", {"class": "search-result-product-title gridview"})
title = name.find('a').text
price = product.find("div", {"class": "search-result-productprice gridview enable-2price-2"})
p = price.text
image=product.find("div",{"class":"display-inline-block pull-left prod-ProductCard--Image"})
img = image.find("img", {"class": "Tile-img"})['src']
hreff = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
href=hreff.find('a')['href']
if content.findAll("div", {"class": "search-result-listview-item clearfix"}) != None:
products = content.findAll("div", {"class": "search-result-listview-item clearfix"})
for product in products:
if product.find("span",{"class":"Price-group"}) !=None:
name = product.find("a", {"class": "product-title-link"}).text
price = product.find("span", {"class": "Price-group"}).text
image = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
img = image.find("img", {"class": "Tile-img"})['src']
hreff = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
href = hreff.find('a')['href']
Please see below sample code to scrape data from this website. I have just added one interaction (pagination to the next results page), but this should give you the general idea. (You need to use your browser's inspect-element functionality to find the XPaths.)
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome("./chromedriver")  # download chromedriver
browser.get("https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085")  # open page in browser
outDF = pd.DataFrame(columns=['prodname', 'imageurl', 'minprice', 'maxprice', 'actualprice'])  # template of data

prices = browser.find_elements(By.XPATH, "//div[contains(@class, 'price-main-block')]")  # finding prices
product = browser.find_elements(By.XPATH, "//a[contains(@class, 'product-title-link')]")  # product name
images = browser.find_elements(By.XPATH, "//img[contains(@class, 'Tile-img')]")  # images

# getting actual prices/ranges
for i in range(len(product)):
    prodname = product[i].get_attribute("aria-label")
    imageurl = images[i].get_attribute("src")
    pricerange = prices[i].find_elements_by_xpath(".//span[contains(@class, 'Price-group')]")
    if len(pricerange) > 1:
        minprice = pricerange[0].get_attribute("title")
        maxprice = pricerange[1].get_attribute("title")
        actualprice = None
    else:
        minprice = None
        maxprice = None
        actualprice = pricerange[0].get_attribute("title")
    thisline = [prodname, imageurl, minprice, maxprice, actualprice]
    outDF.loc[outDF.shape[0]] = thisline

# Reading next pages
next = True
while next:
    try:
        # clicking next button
        browser.find_element(By.XPATH, "//button[contains(@class, 'paginator-btn paginator-btn-next')]").click()
        # repeating process
        prices = browser.find_elements(By.XPATH, "//div[contains(@class, 'price-main-block')]")
        product = browser.find_elements(By.XPATH, "//a[contains(@class, 'product-title-link')]")
        images = browser.find_elements(By.XPATH, "//img[contains(@class, 'Tile-img')]")
        for i in range(len(product)):
            prodname = product[i].get_attribute("aria-label")
            imageurl = images[i].get_attribute("src")
            pricerange = prices[i].find_elements_by_xpath(".//span[contains(@class, 'Price-group')]")
            if len(pricerange) > 1:
                minprice = pricerange[0].get_attribute("title")
                maxprice = pricerange[1].get_attribute("title")
                actualprice = None
            else:
                minprice = None
                maxprice = None
                actualprice = pricerange[0].get_attribute("title")
            thisline = [prodname, imageurl, minprice, maxprice, actualprice]
            outDF.loc[outDF.shape[0]] = thisline
    except:
        print("Something went wrong")
        next = False

browser.quit()
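One caveat with the pagination loop above: after clicking the next button, the element lists are re-read immediately, so on a slow connection you can pick up the old page's elements or hit stale references. A sketch of one way to guard against that with an explicit wait (the XPaths are the same ones as above; the timeout is illustrative):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

old_first = product[0]  # remember an element from the page we are leaving
browser.find_element(By.XPATH, "//button[contains(@class, 'paginator-btn paginator-btn-next')]").click()
# wait until the old element has gone stale, i.e. the next page has replaced it
WebDriverWait(browser, 10).until(EC.staleness_of(old_first))
product = browser.find_elements(By.XPATH, "//a[contains(@class, 'product-title-link')]")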
I have this soup: the webpage lists company references in a grid view (16 rows x 5 columns), and I want to retrieve each reference's URL and title. The problem is that all 5 references in each row are inside one class named row, so when I scrape the page I only get the first reference of every row instead of all 5 of them. Here is my code so far:
url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
info_block = soup.find_all("div", attrs={"class": "row"})
references = pd.DataFrame(columns=['Company Name', 'Web Page'])
for entry in info_block:
    try:
        title = entry.find('img').get('title')
        url = entry.a['href']
        urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
        row = [{'Company Name': title, 'Web Page': url}]
        references = references.append(row, ignore_index=True)
    except:
        pass
Is there a way to fix this?
I think you should iterate over the "img" elements, or over the "a" elements, rather than over the rows.
You can write something like this:
for entry in info_block:
    try:
        for a in entry.find_all("a"):
            title = a.find('img').get('title')
            url = a.get('href')
            urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
            row = [{'Company Name': title, 'Web Page': url}]
            references = references.append(row, ignore_index=True)
    except:
        pass
import pandas as pd
from bs4 import BeautifulSoup
import requests
url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
info_block = soup.find_all("div", attrs={"class": "row"})
references = pd.DataFrame(columns=['Company Name', 'Web Page'])
for entry in info_block:
    anchors = entry.find_all("a")
    for a in anchors:
        try:
            title = a.find('img').get('title')
            url = a['href']
            # urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
            row = [{'Company Name': title, 'Web Page': url}]
            references = references.append(row, ignore_index=True)
        except:
            pass
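As a side note, DataFrame.append is deprecated in recent pandas releases (and removed in pandas 2.0), so with a newer pandas it may be cleaner to collect the rows in a list first and build the frame once. A small sketch of that variant, under the same assumptions as the code above:

rows = []
for entry in info_block:
    for a in entry.find_all("a"):
        img = a.find('img')
        if img and a.get('href'):
            rows.append({'Company Name': img.get('title'), 'Web Page': a['href']})
references = pd.DataFrame(rows, columns=['Company Name', 'Web Page'])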
I've been working on Python code to pull data from stock-trading simulation software (I get the tickers for the securities I'm trading, e.g. VTI, AAPL, GOOG, etc.), then search Morningstar for each ticker and pull pricing info and whatever else I want from that site. I save the data I want from lists into a .csv file for use in Excel. I'm using Selenium to run a webdriver (either Chrome, to watch the process visually, or PhantomJS, to run the program headless without a browser GUI) and BeautifulSoup to access the websites and work with the HTML.
The program works decently, but it takes upwards of 120 seconds to run through a portfolio of only 11 securities, and I'm hoping to expand this program to do more elaborate actions.
Is there anything in my coding style that could be changed to speed up the web-scraping process? Are there any general methods of writing Python code that allow for faster execution?
Here's the code:
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
#browser = webdriver.Chrome() #replace with .Firefox(), or with the browser of your choice
browser = webdriver.PhantomJS()
security_links_list = list()
equitysim_ticker_list = list()
url = ['https://www.equitysim.com/Home']

for item in url:
    browser.get(item)  # navigate to page behind login
    username = browser.find_element_by_id('placeholderContent_txtUserName')
    username.send_keys('EDITED_FOR_SECURITY')
    password = browser.find_element_by_id('placeholderContent_txtPassword')
    password.send_keys('EDITED_FOR_SECURITY')
    form = browser.find_element_by_id("placeholderContent_LoginButton")
    form.click()
    innerHTML = browser.execute_script("return document.body.innerHTML")  # returns the inner HTML as a string
    innerHTML = browser.page_source
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    table_a = soup.find('table', 'ba-tbl admintable')
    for a in table_a.find_all('a', href=True):
        security_links_list.append(a['href'])

links_set = set(security_links_list)
links_set.remove('#')
print(links_set)
mystring = "https://www.equitysim.com"
links_set_revised = [mystring + link_text for link_text in links_set]
print(links_set_revised)
for item in links_set_revised:
    browser.get(item)
    innerHTML = browser.execute_script("return document.body.innerHTML")  # returns the inner HTML as a string
    innerHTML = browser.page_source
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    title_element = soup.find("title")
    title = title_element.text
    ticker = title.split(':', 1)[0]
    ticker = ticker.replace('\n', '')
    ticker = ticker.replace('\t', '')
    equitysim_ticker_list.append(ticker)
print(equitysim_ticker_list)
morningstar_ticker_search = "http://quote.morningstar.com/TickerLookup.html"
uri_links = list()
for ticker in equitysim_ticker_list:
    browser.get(morningstar_ticker_search)
    enter_ticker = browser.find_element_by_xpath("//input[@value='Ticker']")
    enter_ticker.click()
    search_ticker = browser.find_element_by_class_name('F3')
    search_ticker.send_keys(ticker)
    go_box = browser.find_element_by_xpath("//input[@src='http://im.morningstar.com/im/go.gif']")
    go_box.click()
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    outer_div = soup.find('div', attrs={'id': 'quote_quicktake'})
    iframe = outer_div.find('iframe').get('src')
    full_url = 'https:' + iframe
    uri_links.append(full_url)
print(uri_links)
price_list = list()
ticker_list = list()
nav_list = list()
for item in uri_links:
    browser.get(item)  # navigate to page behind login
    innerHTML = browser.execute_script("return document.body.innerHTML")  # returns the inner HTML as a string
    innerHTML = browser.page_source
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    price_element = soup.find("div", attrs={"id": "lastPrice"})
    price = price_element.text  # strip() is used to remove starting and trailing
    nav_element = soup.find("span", attrs={"id": "NAV"} or {"vkey": "NAV"})
    nav = nav_element.text
    nav_split1 = nav.split('\n \t\t', 1)[1]
    nav_split2 = nav_split1.split(' ', 1)[0]
    title_element = soup.find("title")
    title = title_element.text
    ticker = title.split(' ', 1)[0]
    price_list.append(price)
    nav_list.append(nav_split2)
    ticker_list.append(ticker)
    print(ticker)
    print(price)
    print(nav_split2)
    #ticker =
print(ticker_list)
print(price_list)
print(nav_list)
csvfile = "C:\\Users\\USERNAME\\AppData\\Local\\Programs\\Python\\Python36\\personal\\exampleCsv.csv"
#Assuming res is a flat list
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='')
    writer.writerow(ticker_list)
    writer.writerow('\n')
    writer.writerow(price_list)
    writer.writerow('\n')
    writer.writerow(nav_list)