How to use concurrent futures to web scrape faster? Selenium - python

I'm trying to scrape a list of URLs with Selenium and concurrent futures to speed up the process. I've found that I get a StaleElementReferenceException when using concurrent futures, and also the job titles do not correspond to the URLs. For instance, I get repeated job titles. When using a normal "for" I do not get this error.
I don't know what I'm doing wrong. Any help is welcomed.
My simplified code is:
import concurrent.futures
import time
from selenium import webdriver
options = webdriver.ChromeOptions()
#options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
PATH = "C:\Program Files (x86)\chromedriver.exe"
wd = webdriver.Chrome(PATH, options=options)
wd.maximize_window()
vurl = ['https://www.bumeran.com.pe/empleos/asistente-contable-exp.-en-concar-ssp-1114585777.html',
'https://www.bumeran.com.pe/empleos/asesor-a-comercial-digital-de-seguro-vehicular-1114584904.html',
'https://www.bumeran.com.pe/empleos/mecanico-de-mantenimiento-arequipa-1114585709.html',
'https://www.bumeran.com.pe/empleos/almacenero-l.o.-electronics-s.a.c.-1114585629.html',
'https://www.bumeran.com.pe/empleos/analista-de-comunicaciones-ingles-avanzado-teleperformance-peru-s.a.c.-1114564863.html',
'https://www.bumeran.com.pe/empleos/vendedores-adn-retail-s.a.c.-1114585422.html',
'https://www.bumeran.com.pe/empleos/especialista-de-intervencion-de-proyectos-mondelez-international-1114585461.html',
'https://www.bumeran.com.pe/empleos/desarrollador-java-senior-inetum-peru-1114584840.html',
'https://www.bumeran.com.pe/empleos/practicante-legal-coes-sinac-1114584788.html',
'https://www.bumeran.com.pe/empleos/concurso-publico-n-143-especialista-en-presupuesto-banco-central-de-reserva-del-peru-1114584538.html',
'https://www.bumeran.com.pe/empleos/concurso-n-147-especialista-en-analisis-de-infraestructuras-financieras-banco-central-de-reserva-del-peru-1114584444.html',
'https://www.bumeran.com.pe/empleos/asistente-legal-magdalena-del-mar-los-portales-1114584305.html',
'https://www.bumeran.com.pe/empleos/asistente-de-nuevos-negocios-inmobiliarios-madrid-ingenieros-1114584269.html',
'https://www.bumeran.com.pe/empleos/trabajo-desde-tres-horas-por-dia-ventas-ventas-por-internet-1114584205.html']
vtitle = []
def get_urls(url):
wd.get(url)
wd.implicitly_wait(20)
try:
title = wd.find_element_by_xpath("//h1").text
print('URL finished')
except:
title=''
print('Exception!')
vtitle.append(title)
vurl2.append(url)
# This throws an exception and does not scrape correctly
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
executor.map(get_urls, vurl)
# output is for example
#['ALMACENERO', 'ALMACENERO', 'ALMACENERO', 'ALMACENERO', 'Desarrollador Java (Senior)', 'Desarrollador Java (Senior)', 'Desarrollador Java (Senior)']
# when it should be:
# ['ALMACENERO', 'Analista de Comunicaciones - Inglés Avanzado', 'Vendedores', 'Especialista de Intervención de Proyectos', 'Desarrollador Java (Senior)', 'Practicante Legal', 'Asistente Legal - Magdalena del Mar']
# This works fine but is too slow
for url in vurl:
get_urls(url)

Related

how to get the scraped values on a website that has fingerprintjs?

I am trying to scrape this site using google colab.
https://www.blibli.com/p/facial-tissue-tisu-wajah-250-s-paseo/is--LO1-70001-00049-00001?seller_id=LO1-70001&sku_id=LO1-70001-00049-00001&sclid=7zuGEaS4hh5SowAA6tnfd5i2wKjR6e3p&sid=c5746ccfbb298d3b&pid=LO1-70001-00049-00001
The idea here is to get the fingerprint to be used and parsed in the requests.
currently my code is
from seleniumwire import webdriver
options = webdriver.ChromeOptions()
options.set_capability(
"goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}
)
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# open it, go to a website, and get results
driver = webdriver.Chrome('chromedriver',options=options)
dataranch=[1]
dataulrs=['https://www.blibli.com/p/facial-tissue-tisu-wajah-250-s-paseo/is--LO1-70001-00049-00001?seller_id=LO1-70001&sku_id=LO1-70001-00049-00001&sclid=7zuGEaS4hh5SowAA6tnfd5i2wKjR6e3p&sid=c5746ccfbb298d3b&pid=LO1-70001-00049-00001']
for prod_id,urls in zip(dataranch,dataulrs):
try:
driver.get(urls)
sleep(randint(3,5))
product_name=driver.find_element(By.CSS_SELECTOR, ".product-name").text
try:
normal_price=driver.find_element(By.CSS_SELECTOR, ".product-price__before").text
except:
normal_price="0"
normal_price=normal_price.replace('Rp',"").replace(".","")
try:
discount=driver.find_element(By.CSS_SELECTOR, ".product-price__discount").text
except:
discount="0"
compid=urls.split(".")[4].split("?")[0]
dat={
'product_name':product_name,
'normal_price':normal_price,
'discount':discount,
'competitor_id':compid,
'url':urls,
'prod_id':prod_id,
'date_key':today,
'web':'ranch market'
}
dat=pd.DataFrame([dat])
except Exception as e:
print(f"{urls} error")
print(e)
What am I doing wrong here? Can someone help? Because I tried inspecting the elements and the css . is there. Is there a way to scrape the data needed? Do I need to use a different module just selenium to get the data?

Web Scraping Python - Pubs

I am trying to extract the site name and address data from this website for each card but this doesn't seem to work. Any suggestions?
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://order.marstons.co.uk/")
all_cards = driver.find_elements_by_xpath("//div[#class='h3.body__heading']/div[1]")
for card in all_cards:
print(card.text) # do as you will
I'm glad that you are trying to help yourself, it seems you are new to this so let me offer some help.
Automating a browser via Selenium to do this is going to take you forever, the Marston's site is pretty straightforward to scrape if you know where to look: If you open your browser Developer Tools (F12 on pc) then - Network tab - fetch/Xhr and then hit refresh while on the Marston's site you'll see some backend api calls happening. If you click on the one that says "brand" then click the "preview" tab that should be available, you'll see a collapsible list of all sorts of information, that is a JSON file which is essentially a collection of python lists and dictionaries which make it easier to get the data you are after. The information in the "venue" list is going to be helpful when it comes to scraping the menus for each venue.
When you go to a specific pub you'll see an api call with the pubs name, this has all the menu info which you can see in the same way and we can make calls to these venue api's using the "slug" data from the venues response above.
So by making our own requests to these URLs and stepping through the JSON and getting the data we want we can have everything done in a couple minutes, far easier than trying to do this automating a browser! I've written the code below, feel free to ask questions if anything is unclear you'll need to pip install requests and pandas to make this work. You owe me a pint! :) Cheers
import requests
import pandas as pd
headers = {'origin':'https://order.marstons.co.uk'}
url = 'https://api-cdn.orderbee.co.uk/brand'
resp = requests.get(url,headers=headers).json()
venues = {}
for venue in resp['venues']:
venues[venue['slug']] = venue
print(f'{len(venues)} venues to scrape')
output = []
for venue in venues.keys():
try:
url = f'https://api-cdn.orderbee.co.uk/venues/{venue}'
print(f'Scraping: {venues[venue]["name"]}')
try:
info = requests.get(url,headers=headers).json()
except Exception as e:
print(e)
print(f'{venues[venue]["name"]} not available')
continue
for category in info['menus']['oat']['categories']: #oat = order at table?
cat_name = category['name']
for subcat in category['subCategories']:
subcat_name = subcat['name']
for item in subcat['items']:
info = {
'venue_name': venues[venue]['name'],
'venue_city': venues[venue]['address']['city'],
'venue_address': venues[venue]['address']['streetAddress'],
'venue_postcode': venues[venue]['address']['postCode'],
'venue_latlng': venues[venue]['address']['location']['coordinates'],
'category':cat_name,
'subcat':subcat_name,
'item_name' : item['name'],
'item_price' : item['price'],
'item_id' : item['id'],
'item_sku' : item['sku'],
'item_in_stock' : item['inStock'],
'item_active' : item['isActive'],
'item_last_update': item['updatedAt'],
'item_diet': item['diet']
}
output.append(info)
except Exception as e:
print(f'Problem scraping {venues[venue]["name"]}, skipping it') #when there is no menu available for some reason? Closed location?
continue
df = pd.DataFrame(output)
df.to_csv('marstons_dump.csv',index=False)
I use Firefox but it should work also for Chrome.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Firefox()
driver.get("https://order.marstons.co.uk/")
try:
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, '//*[#id="app"]/div/div/div/div[2]/div'))
).find_elements_by_tag_name('a')
for el in element:
print("heading", el.find_element_by_tag_name('h3').text)
print("address", el.find_element_by_tag_name('p').text)
finally:
driver.quit()

Driver.get a group of links?

How do I use driver.get to open several URLs in Chrome.
My code:
import requests
import json
import pandas as pd
from selenium import webdriver
chromeOptions = webdriver.ChromeOptions()
chromedriver = r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe",chrome_options=chromeOptions)
links = []
request1 = requests.get('https://api.beta.tab.com.au/v1/recommendation-service/featured-events?jurisdiction=NSW')
json1 = request1.json()
for n in json1['nextToGoRaces']:
if n['meeting']['location'] in ['VIC','NSW','QLD','SA','WA','TAS','IRL']:
links.append(n['_links']['self'])
driver.get('links')
Based on the comments - you'll want a class to manage your browsers, a class for your tests, then a runner to run in parallel.
Try this:
import unittest
import time
import testtools
from selenium import webdriver
class BrowserManager:
browsers=[]
def createBrowser(self, url):
browser = webdriver.Chrome()
browser.get(url)
self.browsers.append(browser)
def getBrowserByPartialURL(self, url):
for browser in self.browsers:
if url in browser.current_url:
return browser
def CloseItAllDown(self):
for browser in self.browsers:
browser.close()
class UnitTest1(unittest.TestCase):
def test_DoStuffOnGoogle(self):
browser = b.getBrowserByPartialURL("google")
#Point of this is to watch the output! you'll see this +other test intermingled (proves parallel run)
for i in range(10):
print(browser.current_url)
time.sleep(1)
def test_DoStuffOnYahoo(self):
browser = b.getBrowserByPartialURL("yahoo")
#Point of this is to watch the output! you'll see this +other test intermingled (proves parallel run)
for i in range(10):
print(browser.current_url)
time.sleep(1)
#create a global variable for the brwosers
b = BrowserManager()
# To Run the tests
if __name__ == "__main__":
##move to an init to Create your browers
b.createBrowser("https://www.google.com")
b.createBrowser("https://www.yahoo.com")
time.sleep(5) # This is so you can see both open at the same time
suite = unittest.TestLoader().loadTestsFromTestCase(UnitTest1)
concurrent_suite = testtools.ConcurrentStreamTestSuite(lambda: ((case, None) for case in suite))
concurrent_suite.run(testtools.StreamResult())
This code doesn't do anything exciting - it's an example of how to manage multiple browsers and run tests in parallel. It goes to the specified urls (which you should move to an init/setup), then prints out the URL it's on 10 times.
This is how you add a browser to the manager: b.createBrowser("https://www.google.com")
This is how you retrieve your browser: browser = b.getBrowserByPartialURL("google") - note it's a partial URL so you can use the domain as a keyword.
This is the output (just the first few lines- not all of it...) - It's a print URL for google then yahoo, then google then yahoo - showing that they're running at the same time:
PS C:\Git\PythonSelenium\BrowserManager> cd 'c:\Git\PythonSelenium'; & 'C:\Python38\python.exe' 'c:\Users\User\.vscode\extensions\ms-python.python-2020.7.96456\pythonFiles\lib\python\debugpy\launcher' '62426' '--' 'c:\Git\PythonSelenium\BrowserManager\BrowserManager.py'
DevTools listening on ws://127.0.0.1:62436/devtools/browser/7260dee3-368c-4f21-bd59-2932f3122b2e
DevTools listening on ws://127.0.0.1:62463/devtools/browser/9a7ce919-23bd-4fee-b302-8d7481c4afcd
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/

Multiprocessing with Python, Execution never completes

New to multiprocessing! please help.
All libraries are imported, get_links method works, I've tested it on a single case. Trying to make the method run for multiple urls that are designated to parallel processes to make it faster. Without multiprocessing my runtimes are 10 hours +
Edit 2:
Tried my best at a MCVE
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from multiprocessing import Pool
options = Options()
options.headless = True
options.binary_location = 'C:\\Users\\Liam\\AppData\\Local\\Google\\Chrome SxS\\Application\\Chrome.exe'
options.add_argument('--blink-settings=imagesEnabled=false')
options.add_argument('--no-sandbox')
options.add_argument("--proxy-server='direct://'")
options.add_argument("--proxy-bypass-list=*")
subsubarea_urls = []
with open('subsubarea_urls.txt') as f:
for item in f:
item = item.strip()
subsubarea_urls.append(item)
test_urls = subsubarea_urls[:3]
def get_links(url):
driver = webdriver.Chrome('....\Chromedriver', chrome_options=options)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
link = soup.find(class_ = 'listings__all')
if link is not None:
link = "example.com" + link.find('a')['href']
driver.close()
return link
def main():
how_many = 3
p = Pool(processes = how_many)
data = p.map(get_links, test_urls)
p.close()
with open('test_urls.txt', 'w') as f:
f.write(str(data))
if __name__ == '__main__':
main()
Unexpectedly the problem was not anything to do with the code. Multiprocessing in python does not seem to like Windows GUI's the sub processes called by Pool dont have std streams.
The code needs to be executed in IDLE python -m idlelib.idle (To open IDLE)
See Terry Jan Reedy's answer here

scrape website traffic from semrush using beautiful soup python

I'm trying to scrape website traffic from semrush.com.
my current code using BeautifulSoup is:
from bs4 import BeautifulSoup, BeautifulStoneSoup
import urllib
import json
req = urllib.request.Request('https://www.semrush.com/info/burton.com', headers={'User-Agent':'Magic Browser'})
response = urllib.request.urlopen(req)
raw_data = response.read()
response.close()
soup = BeautifulSoup(raw_data)
I've been trying data = soup.findAll("a", {"href":"/info/burton.com+(by+organic)"}) or data = soup.findAll("span", {"class":"sem-report-counter"}) without much luck.
I can see the numbers on the webpage that I would like to get. Is there a way to pull this information off? I'm not seeing it in the html I pull.
I went the extra mile and set up a working example of how you can use selenium to scrape that page. Install selenium and try it out!
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
url = 'https://www.semrush.com/info/burton.com' #your url
options = Options() #set up options
options.add_argument('--headless') #add --headless mode to options
driver = webdriver.Chrome(executable_path='/opt/ChromeDriver/chromedriver',
chrome_options=options)
#note: executable_path will depend on where your chromedriver.exe is located
driver.get(url) #get response
driver.implicitly_wait(1) #wait to load content
elements = driver.find_elements_by_xpath(xpath='//a[#href="/info/burton.com+(by+organic)"]') #grab that stuff you wanted?
for e in elements: print(e.get_attribute('text').strip()) #print text fields
driver.quit() #close the driver when you're done
Output that I see in my terminal:
356K
6.5K
59.3K
$usd305K
Organic keywords
Organic
Top Organic Keywords
View full report
Organic Position Distribution

Categories