How to speed up my web scraping Python code?

I've been working on Python code to pull data from stock-trading simulation software (I get the tickers for the securities I'm trading, e.g. VTI, AAPL, GOOG, etc.), then search Morningstar for each ticker and pull pricing info and whatever else I want from that site. I save the data I want from lists into a .csv file for use in Excel. I'm using Selenium to run a webdriver (either Chrome, to watch the process visually, or PhantomJS, to run the program headless without a browser GUI), and BeautifulSoup to access the websites and work with the HTML.
I have the program working decently, but it takes upwards of 120 seconds to run through a portfolio of only 11 securities, and I'm hoping to expand this program to do more elaborate things.
Is there anything in my coding style that could be changed to speed up the scraping process? Are there any general methods of writing Python code that allow for faster execution?
Here's the code:
from selenium import webdriver
from bs4 import BeautifulSoup
import csv

#browser = webdriver.Chrome()  # replace with .Firefox(), or with the browser of your choice
browser = webdriver.PhantomJS()

security_links_list = list()
equitysim_ticker_list = list()

url = ['https://www.equitysim.com/Home']
for item in url:
    browser.get(item)  # navigate to page behind login
    username = browser.find_element_by_id('placeholderContent_txtUserName')
    username.send_keys('EDITED_FOR_SECURITY')
    password = browser.find_element_by_id('placeholderContent_txtPassword')
    password.send_keys('EDITED_FOR_SECURITY')
    form = browser.find_element_by_id("placeholderContent_LoginButton")
    form.click()

    innerHTML = browser.execute_script("return document.body.innerHTML")  # returns the inner HTML as a string
    innerHTML = browser.page_source
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')

    table_a = soup.find('table', 'ba-tbl admintable')
    for a in table_a.find_all('a', href=True):
        security_links_list.append(a['href'])

links_set = set(security_links_list)
links_set.remove('#')
print(links_set)

mystring = "https://www.equitysim.com"
links_set_revised = [mystring + link_text for link_text in links_set]
print(links_set_revised)

for item in links_set_revised:
    browser.get(item)
    innerHTML = browser.execute_script("return document.body.innerHTML")  # returns the inner HTML as a string
    innerHTML = browser.page_source
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')

    title_element = soup.find("title")
    title = title_element.text
    ticker = title.split(':', 1)[0]
    ticker = ticker.replace('\n', '')
    ticker = ticker.replace('\t', '')
    equitysim_ticker_list.append(ticker)
print(equitysim_ticker_list)

morningstar_ticker_search = "http://quote.morningstar.com/TickerLookup.html"
uri_links = list()
for ticker in equitysim_ticker_list:
    browser.get(morningstar_ticker_search)
    enter_ticker = browser.find_element_by_xpath("//input[@value='Ticker']")
    enter_ticker.click()
    search_ticker = browser.find_element_by_class_name('F3')
    search_ticker.send_keys(ticker)
    go_box = browser.find_element_by_xpath("//input[@src='http://im.morningstar.com/im/go.gif']")
    go_box.click()

    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')

    outer_div = soup.find('div', attrs={'id': 'quote_quicktake'})
    iframe = outer_div.find('iframe').get('src')
    full_url = 'https:' + iframe
    uri_links.append(full_url)
print(uri_links)

price_list = list()
ticker_list = list()
nav_list = list()

for item in uri_links:
    browser.get(item)  # navigate to page behind login
    innerHTML = browser.execute_script("return document.body.innerHTML")  # returns the inner HTML as a string
    innerHTML = browser.page_source
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')

    price_element = soup.find("div", attrs={"id": "lastPrice"})
    price = price_element.text  # strip() is used to remove starting and trailing
    nav_element = soup.find("span", attrs={"id": "NAV"} or {"vkey": "NAV"})
    nav = nav_element.text
    nav_split1 = nav.split('\n \t\t', 1)[1]
    nav_split2 = nav_split1.split(' ', 1)[0]

    title_element = soup.find("title")
    title = title_element.text
    ticker = title.split(' ', 1)[0]

    price_list.append(price)
    nav_list.append(nav_split2)
    ticker_list.append(ticker)
    print(ticker)
    print(price)
    print(nav_split2)
    #ticker =

print(ticker_list)
print(price_list)
print(nav_list)

csvfile = "C:\\Users\\USERNAME\\AppData\\Local\\Programs\\Python\\Python36\\personal\\exampleCsv.csv"
#Assuming res is a flat list
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='')
    writer.writerow(ticker_list)
    writer.writerow('\n')
    writer.writerow(price_list)
    writer.writerow('\n')
    writer.writerow(nav_list)
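One general way to cut the runtime is to avoid driving a full browser for pages that do not need JavaScript, and to fetch independent pages concurrently instead of one at a time. Below is a minimal sketch of that idea, assuming the Morningstar quote pages in uri_links return their data as plain HTML (the lastPrice id and the title format are taken from the code above; if those pages only render through JavaScript, Selenium would still be needed for that step):

import requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup

def scrape_quote(url):
    # plain HTTP fetch, no browser start-up or rendering cost
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')
    ticker = soup.find('title').text.split(' ', 1)[0]
    price_element = soup.find('div', attrs={'id': 'lastPrice'})
    price = price_element.text if price_element else None
    return ticker, price

# fetch all quote pages in parallel instead of sequentially
with ThreadPoolExecutor(max_workers=8) as pool:
    results = list(pool.map(scrape_quote, uri_links))
print(results)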

Related

Python / BeautifulSoup webscraper returning "None"

I'm trying to build a web scraper that collects lists of freelance gig postings from different websites into one place. My code is below, and it keeps returning "None". I'm a bit stuck at this point; if you can help identify why it keeps doing this, that would be great.
import requests
from bs4 import BeautifulSoup
import pprint
res1 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=python&badges=&sort_by=posted_desc') # this is where we will scrape the info from
soup1 = BeautifulSoup(res1.text, 'html.parser') # this tells BS to give us HTML code for the page
links1 = soup1.select('.new-task-list-item new-task-list-item--open') # link of each gig
subtext1 = soup1.select('.new-task-list-item__date at-icon-calendar') # date of each gig
res2 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=web%20developer&badges=&sort_by=posted_desc')
soup2 = BeautifulSoup(res2.text, 'html.parser')
links2 = soup2.select('.new-task-list-item new-task-list-item--open')
subtext2 = soup2.select('.new-task-list-item__date at-icon-calendar')
res3 = requests.get('https://www.upwork.com/freelance-jobs/website/')
soup3 = BeautifulSoup(res3.text, 'html.parser')
links3 = soup3.select('.job-title')
subtext3 = soup3.select('.text-muted')
res4 = requests.get('https://www.upwork.com/freelance-jobs/data-science/')
soup4 = BeautifulSoup(res4.text, 'html.parser')
links4 = soup4.select('.job-title')
subtext4 = soup4.select('.text-muted')
res5 = requests.get('https://www.upwork.com/freelance-jobs/bot-development/')
soup5 = BeautifulSoup(res5.text, 'html.parser')
links5 = soup5.select('.job-title')
subtext5 = soup5.select('.text-muted')
res6 = requests.get('https://www.upwork.com/freelance-jobs/python-script/')
soup6 = BeautifulSoup(res6.text, 'html.parser')
links6 = soup6.select('.job-title')
subtext6 = soup6.select('.text-muted')
mega_links = links1 + links2 + links3 + links4 + links5 + links6
mega_subtext = subtext1 + subtext2 + subtext3 + subtext4 + subtext5 + subtext6
def extract(links, subtexts):
    joblist = []
    for indx, item in enumerate(links):
        title = item.getText()
        href = item.get('href')
        joblist.append({'title': title, 'link': href})
    return joblist

pprint.pprint(extract(mega_links, mega_subtext))
I'm not sure exactly what you are trying to extract from these pages, but here is what I found when I tried it:
Your links variables are empty lists because nothing on those pages matches the selectors you are using. For example, on the first page you scrape, the element you're targeting doesn't exist in the served HTML.
I would recommend confirming the element you're trying to scrape and its class name.
Another point of consideration:
When you print your soup variables, you will notice that the response is a Cloudflare page rather than the job listings.
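There is also a problem with the selector syntax itself: in CSS, '.new-task-list-item new-task-list-item--open' means "an element with tag new-task-list-item--open somewhere inside an element with class new-task-list-item", which never matches. To match one element carrying both classes, chain the class selectors with no space. A small sketch of that fix, assuming those class names actually appear in the HTML that is served (the Cloudflare block mentioned above may still leave the list empty):

import requests
from bs4 import BeautifulSoup

res1 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=python&badges=&sort_by=posted_desc')
soup1 = BeautifulSoup(res1.text, 'html.parser')
# '.a.b' (no space) = one element with both classes; '.a b' = descendant lookup
links1 = soup1.select('.new-task-list-item.new-task-list-item--open')
print(len(links1))  # 0 here means the selector or the served HTML is the problem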

HTML values do not change after Selenium click button with Python

I am using Soup and Selenium to access this page https://www.chewy.com/blue-buffalo-basics-limited/dp/37047 and trying to get a list of all packaging types' prices and ratings.
Below is my code:
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

# use Selenium to get buttons through all pages
test_url = 'https://www.chewy.com/blue-buffalo-basics-limited/dp/37047'
test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')

btn_count = []
for btn_cnt in test.select('.js-sku-selector > div'):
    btn_cnt = btn_cnt['data-attributes'].count('isSelected')
    btn_count.append(btn_cnt)

buttons = list(range(1, btn_cnt + 1))

xpath = []
for b in buttons:
    btn_path = '//*[@id="variation-Size"]/div[2]/div[' + str(b) + ']/div/label'
    print(btn_path)
    xpath.append(btn_path)

print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format('brand', 'product', 'id', 'auto_ship', 'regular', 'rating'))

for btn in xpath:
    test_url = 'https://www.chewy.com/blue-buffalo-basics-limited/dp/37047'
    test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')
    driver = webdriver.Chrome(executable_path=r'C:\Users\public\chromedriver')
    driver.get(test_url)
    time.sleep(5)
    driver.find_element_by_xpath(btn).click()
    time.sleep(5)
    for brand, product, id, auto_ship, price, rating in zip(test.findAll('span', attrs={'itemprop': 'brand'}),
                                                            test.findAll('div', attrs={'id': 'product-title'}),
                                                            test.findAll('div', attrs={'class': 'value js-part-number'}),
                                                            test.findAll('p', attrs={'class': 'autoship-pricing p'}),
                                                            test.findAll('span', attrs={'class': 'ga-eec__price'}),
                                                            test.select('div.ugc')):
        #date = date.today()
        brand = brand.text
        product = ' '.join(product.h1.text.split())
        id = ' '.join(id.span.text.split())
        p1 = auto_ship.text.index('(')
        auto_ship = ' '.join(auto_ship.text[:p1].split())
        regular_price = ' '.join(price.text.split())
        rating = rating.picture.img['src'][-7:-4].replace('_', '.')
        print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format(brand, product, id, auto_ship, regular_price, rating))
    driver.quit()
The result I get is the same for every button: I would expect the data to differ across the three package sizes, but it only ever returns the values from the default page.
Is there anything else I should do to pick up the values for each button?
I copied the XPath of the labels. Clicking them does bring up the view for the different packages, and the underlying HTML values do change in the browser. However, my print statement still shows the values from the main page. Any recommendation?
I found what happened: I wasn't parsing the page the driver had loaded after the click, but a fresh copy of the original source page fetched separately.
I added a driver.page_source call after the click, gave the browser sufficient time to load (10 seconds), and then passed that source to BeautifulSoup. It works now.
# use Selenium to get buttons through all pages
test_url = 'https://www.chewy.com/wellness-large-breed-complete-health/dp/34356'
test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')

btn_count = []
for btn_cnt in test.select('.js-sku-selector > div'):
    btn_cnt = btn_cnt['data-attributes'].count('isSelected')
    btn_count.append(btn_cnt)

buttons = list(range(1, btn_cnt + 1))

xpath = []
for b in buttons:
    btn_path = '//*[@id="variation-Size"]/div[2]/div[' + str(b) + ']/div/label'
    print(btn_path)
    xpath.append(btn_path)

print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format('brand', 'product', 'id', 'auto_ship', 'regular', 'rating'))

for btn in xpath:
    test_url = 'https://www.chewy.com/wellness-large-breed-complete-health/dp/34356'
    driver = webdriver.Chrome(executable_path=r'C:\Users\public\chromedriver')
    driver.get(test_url)
    time.sleep(1)
    driver.find_element_by_xpath(btn).click()
    time.sleep(5)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    for brand, product, id, auto_ship, price, rating in zip(soup.findAll('span', attrs={'itemprop': 'brand'}),
                                                            soup.findAll('div', attrs={'id': 'product-title'}),
                                                            soup.findAll('div', attrs={'class': 'value js-part-number'}),
                                                            soup.findAll('p', attrs={'class': 'autoship-pricing p'}),
                                                            soup.findAll('span', attrs={'class': 'ga-eec__price'}),
                                                            soup.select('div.ugc')):
        #date = date.today()
        brand = brand.text
        product = ' '.join(product.h1.text.split())
        id = ' '.join(id.span.text.split())
        p1 = auto_ship.text.index('(')
        auto_ship = ' '.join(auto_ship.text[:p1].split())
        regular_price = ' '.join(price.text.split())
        rating = rating.picture.img['src'][-7:-4].replace('_', '.')
        print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format(brand, product, id, auto_ship, regular_price, rating))
    driver.quit()
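A further note on the fixed sleeps: time.sleep(5) either wastes time or is not long enough, depending on the page. Below is a sketch of an explicit wait instead, assuming the click replaces the price element in the DOM (if the page only updates the text in place, a different wait condition would be needed):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

old_price = driver.find_element_by_class_name('ga-eec__price')
driver.find_element_by_xpath(btn).click()
# wait up to 10 seconds for the old element to be detached from the DOM
WebDriverWait(driver, 10).until(EC.staleness_of(old_price))
soup = BeautifulSoup(driver.page_source, 'html.parser')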

Web Scraping with Python - blank return

I'm trying to scrape reviews from Trustpilot, but the code always returns a blank sheet containing only the headers/categories I specified. Could someone help me with this?
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome()

names = []    #List to store name of the product
headers = []  #List to store price of the product
bodies = []
ratings = []  #List to store rating of the product
dates = []

#driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.trustpilot.com/review/birchbox.com?page=2")

content = driver.page_source
soup = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a'))

for a in soup.findAll('a', href=True, attrs={'class': 'reviews-container'}):
    name = a.find('div', attrs={'class': 'consumer-information_name'})
    header = a.find('div', attrs={'class': 'review-content_title'})
    body = a.find('div', attrs={'class': 'review-content_text'})
    rating = a.find('div', attrs={'class': 'star-rating star-rating--medium'})
    date = a.find('div', attrs={'class': 'review-date--tooltip-target'})
    names.append(name.text)
    headers.append(header.text)
    bodies.append(body.text)
    ratings.append(rating.text)
    dates.append(date.text)

print('webpage, no errors')

df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Date': dates})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')
print('csv made')
The issue is that soup.findAll('a', href=True, attrs={'class':'reviews-container'}) doesn't find any results, so the loop runs zero times. Make sure you are using the correct tags and class names. Also, you don't need the loop at all, because BeautifulSoup's find_all can collect each field directly. I used the requests module to open the web page, though that shouldn't make a difference.
from bs4 import BeautifulSoup
import requests
req = requests.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = req.content
soup = BeautifulSoup(content, "lxml")
names = soup.find_all('div', attrs={'class': 'consumer-information__name'})
headers = soup.find_all('h2', attrs={'class':'review-content__title'})
bodies = soup.find_all('p', attrs={'class':'review-content__text'})
ratings = soup.find_all('div', attrs={'class':'star-rating star-rating--medium'})
dates = soup.find_all('div', attrs={'class':'review-content-header__dates'})
And now each list has 20 entries.
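To get from those element lists to the CSV the original code was writing, the text can be pulled out of each element and placed into the DataFrame. A sketch, assuming the five lists stay aligned for the 20 reviews on the page (the rating text may come back empty if the score is only shown as an image):

import pandas as pd

df = pd.DataFrame({
    'User Name': [n.get_text(strip=True) for n in names],
    'Header': [h.get_text(strip=True) for h in headers],
    'Body': [b.get_text(strip=True) for b in bodies],
    'Rating': [r.get_text(strip=True) for r in ratings],
    'Date': [d.get_text(strip=True) for d in dates],
})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')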

Handling cookie pop-up after page 6/7

I have built a web scraper for real estate data with the help of some fellow members on this website.
It works perfectly, but once it crawls to page 6/7 or further, the typical cookie warning pops up and seems to disrupt the output in my CSV file.
Is there a way to handle the pop-up?
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd

#open('output.csv', 'w').close()

browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0, 0)

def jaap_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
        #browser.delete_all_cookies()
        browser.get(url)
        #session = requests.Session()
        #res1 = session.post(url, post_data)
        #res2 = session.get(url1)
        time.sleep(15)
        #input('Press Enter after bypassing Captcha')
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        info = soup.find_all('div', {'class': 'property-info'})
        inside = soup.find_all('a', {'class': 'property-inner'}, {'href'})

        # Make empty lists with header lines
        outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
        outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]

        for huis in info:
            street = huis.find('h2')
            street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
            address = huis.find('div')
            address = address.find('div').text.strip()
            price = huis.find('div', {'class': 'price-info'})
            price = price.find('div').text.strip()
            price = re.findall(r'\d', price)
            price = ''.join(price)
            pricetag = huis.find('div', {'class': 'property-price'})
            pricetag = pricetag.find('span').text.strip()
            outputlist_l1.append([street, address, price, pricetag])

        for items in inside:
            #browser.delete_all_cookies()
            href = items.get('href')
            url1 = href.format(page)
            browser.get(url1)
            kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
            details = kenmerken.find_all('div', {'class': 'detail-tab-content kenmerken'})
            try:
                tr = details[0].find_all('td', {'class': 'value'})
            except IndexError:
                size_space = 'Unknown'
            for inhoud in tr:
                soort = tr[0].get_text(separator='\n', strip=True)
                bouwjaar = tr[1].get_text(separator='\n', strip=True)
                woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
                inhoud = tr[3].get_text(separator='\n', strip=True)
                perceel = tr[4].get_text(separator='\n', strip=True)
                l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
                outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])
        page += 1

    # Merge outputlist_l1 with outputlist_l2
    outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]
    # transform to Pandas dataframe and export as csv
    #saveFile = open('output.csv', 'a')
    df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
    df.to_csv('output.csv', index=False)
    #saveFile.close()

jaap_spider(15)
The cookie script on the website:
(function(){function g(a){return{get:function(b){var c=JSON.parse(a.getItem(b));return!c||Date.parse(c.expires)<=(new Date).getTime()?(a.removeItem(b),null):c.value},set:function(b,c,d){c={value:c,expires:d.toUTCString()};a.setItem(b,JSON.stringify(c))},remove:function(b){a.removeItem(b)}}}function d(a,b,c,d){this.parseCommand=function(e,g){function h(){var a=JSON.stringify({messageId:k,value:l||!1});window.parent.postMessage(a,"")}var m=q[a],n=e.action,p=e.key,k=e.messageId,f=e.siteId,f=d?p:p+":"+
f,l=e.value,r=e.expiresMinutes||1440(e.expiresDays||365),s=function(){var a=new Date;a.setTime(a.getTime()+6E4*r);return a}();if(!function(){var a={_hjSet:c,_hjGet:b,_hjRemove:c}[n]||[];return 0<=a.indexOf("")||0<=a.indexOf(g)}())throw Error("Command "+n+" not allowed on key: "+p);switch(n){case "_hjSet":m.set(f,l,s);break;case "_hjGet":l=m.get(f);h();break;case "_hjRemove":m.remove(f)}}}function h(a){try{var b=JSON.parse(a.data);b.key&&k[b.key]&&k[b.key].parseCommand(b,a.origin)}catch(c){return null}}
var q;try{q={cookie:{get:function(a){return(a=RegExp("(?:^|; )"+a+"=([^;])").exec(document.cookie))?a[1]:void 0},set:function(a,b,c){document.cookie=a+"="+b+"; path=/; expires="+c.toUTCString()},remove:function(a){document.cookie=a+"=; expires=Tue, 13 Mar 1979 00:00:00 UTC; path=/;"}},localStorage:g(localStorage),sessionStorage:g(sessionStorage)}}catch(t){return}var k={_hjOptOut:new d("cookie",[""],["https://www.hotjar.com","https://local.hotjar.com","http://local.hotjar.com","https://insights-staging.hotjar.com",
"http://insights-staging.hotjar.com"],!0),grant_consent:new d("cookie",[""],[""],!1),screenshot_retake:new d("localStorage",[""],[""],!1),screenshot_active_retake:new d("sessionStorage",[""],["*"],!1)};window.addEventListener?window.addEventListener("message",h,!1):window.attachEvent("onmessage",h)})();
To overcome the pop-up problem, just check after loading the page whether a pop-up is present. If it is, click on it. Hope this helps.
page = 1
while page <= max_pages:
    url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
    browser.get(url)
    time.sleep(10)
    #Check here if there popup available
    if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']")) > 0:
        browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
        time.sleep(5)
    #input('Press Enter after bypassing Captcha')
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    info = soup.find_all('div', {'class': 'property-info'})
    inside = soup.find_all('a', {'class': 'property-inner'}, {'href'})
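If the banner can appear at any point in the crawl, it may be cleaner to wrap the check in a small helper with an explicit wait and call it right after every browser.get. A sketch, assuming the accept link really carries the CookiesOK class used above:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def dismiss_cookie_banner(browser, timeout=5):
    try:
        WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.XPATH, "//a[@class='CookiesOK']"))
        ).click()
    except TimeoutException:
        pass  # no banner this time, carry on

# call dismiss_cookie_banner(browser) right after every browser.get(url)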

get the new html after .click() with selenium

I'm using Selenium to click on a link, but I can't get the new table. What code do I use to retrieve the new page?
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

df_list = []
url = 'https://www.cartolafcbrasil.com.br/scouts/cartola-fc-2018/rodada-1' #+ str(i)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('table')[0]
df = pd.read_html(str(table), encoding="UTF-8")
driver = webdriver.PhantomJS(executable_path='C:\\Python27\\phantomjs-2.1.1-windows\\bin\\phantomjs')
driver.get('https://www.cartolafcbrasil.com.br/scouts/cartola-fc-2018/rodada-1')
driver.find_element_by_xpath("/html[1]/body[1]/form[1]/div[1]/div[2]/div[3]/div[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/table[1]/tbody[1]/tr[52]/td[1]/table[1]/tbody[1]/tr[1]/td[2]/a[1]").click()
?????
table = soup.find_all('table')[0]
df = pd.read_html(str(table), encoding="UTF-8")
If I understand your question, it is "How do I get the HTML from my driver object for the new page I've loaded?" The answer is driver.page_source:
import bs4

driver.find_element_by_xpath("Some crazy shenanigans of an xpath").click()
html_from_page = driver.page_source
soup = bs4.BeautifulSoup(html_from_page, 'html.parser')
# more stuff
Welcome to SO. Here is another approach, where the script iterates through all of the pages and pulls the data from each table.
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

df_list = []
url = 'https://www.cartolafcbrasil.com.br/scouts/cartola-fc-2018/rodada-1' #+ str(i)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('table')[0]
df = pd.read_html(str(table), encoding="UTF-8")
driver = webdriver.PhantomJS(executable_path='C:\\Python27\\phantomjs-2.1.1-windows\\bin\\phantomjs')
driver.get('https://www.cartolafcbrasil.com.br/scouts/cartola-fc-2018/rodada-1')
# get the number of pages and iterate each of them
numberOfPage = driver.find_element_by_xpath("(//tr[@class='tbpaging']//a)[last()]").text
for i in range(2, int(numberOfPage)):
    # click on each page link and then get the details
    driver.find_element_by_xpath("(//tr[@class='tbpaging']//a)[" + str(i) + "]").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table), encoding="UTF-8")
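Note that the loop above overwrites df on every page. To keep every page's rows, each parsed table can be appended to df_list and concatenated afterwards; here is a sketch that continues from the loop above, assuming the first table on every page has the same columns (the output filename is just an example):

df_list = []
for i in range(2, int(numberOfPage)):
    driver.find_element_by_xpath("(//tr[@class='tbpaging']//a)[" + str(i) + "]").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find_all('table')[0]
    # pd.read_html returns a list of DataFrames; [0] is the parsed table
    df_list.append(pd.read_html(str(table), encoding="UTF-8")[0])

all_pages = pd.concat(df_list, ignore_index=True)  # one frame with every page's rows
all_pages.to_csv('rodada-1.csv', index=False)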
