Multiprocessing with Python, execution never completes

New to multiprocessing! Please help.
All libraries are imported, and the get_links method works; I've tested it on a single case. I'm trying to run the method for multiple URLs, each assigned to a parallel process, to make it faster. Without multiprocessing my runtime is 10+ hours.
Edit 2:
I've tried my best at an MCVE:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from multiprocessing import Pool

options = Options()
options.headless = True
options.binary_location = 'C:\\Users\\Liam\\AppData\\Local\\Google\\Chrome SxS\\Application\\Chrome.exe'
options.add_argument('--blink-settings=imagesEnabled=false')
options.add_argument('--no-sandbox')
options.add_argument("--proxy-server='direct://'")
options.add_argument("--proxy-bypass-list=*")

subsubarea_urls = []
with open('subsubarea_urls.txt') as f:
    for item in f:
        item = item.strip()
        subsubarea_urls.append(item)

test_urls = subsubarea_urls[:3]

def get_links(url):
    driver = webdriver.Chrome('....\Chromedriver', chrome_options=options)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    link = soup.find(class_ = 'listings__all')
    if link is not None:
        link = "example.com" + link.find('a')['href']
    driver.close()
    return link

def main():
    how_many = 3
    p = Pool(processes = how_many)
    data = p.map(get_links, test_urls)
    p.close()
    with open('test_urls.txt', 'w') as f:
        f.write(str(data))

if __name__ == '__main__':
    main()

Unexpectedly, the problem had nothing to do with the code. Multiprocessing in Python does not seem to like Windows GUIs: the subprocesses spawned by Pool don't have std streams.
The code needs to be executed in IDLE started from a console with python -m idlelib.idle (to open IDLE).
See Terry Jan Reedy's answer here
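As a hedged workaround sketch (an assumption on my part, not part of the accepted fix), if the missing std streams are the only blocker, each worker could be given writable stdout/stderr before doing any work; worker_errors.log and get_links_safe are hypothetical names:

import os
import sys

def _ensure_std_streams():
    # Under a Windows GUI launcher the Pool workers may see sys.stdout / sys.stderr
    # set to None; give them something writable so prints and tracebacks don't crash them.
    if sys.stdout is None:
        sys.stdout = open(os.devnull, 'w')
    if sys.stderr is None:
        sys.stderr = open('worker_errors.log', 'a')

def get_links_safe(url):
    _ensure_std_streams()
    return get_links(url)  # get_links as defined in the MCVE above

Mapping the pool over get_links_safe instead of get_links would keep the rest of the MCVE unchanged.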

Related

How to handle large scale Web Scraping?

The Situation:
I recently started web scraping using Selenium and Scrapy, and I was working on a project where I have a CSV file containing 42 thousand zip codes. My job is to take each zip code, go to this site, input the zip code, and scrape all the results.
The Problem:
The problem here is that, in doing this, I have to continuously click the 'load more' button until all the results have been displayed, and only once that has finished can I collect the data.
This may not be much of an issue, however it takes 2 minutes per zip code and I have 42,000 to do this with.
The Code:
import scrapy
from numpy.lib.npyio import load
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, ElementNotSelectableException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from items import CareCreditItem
from datetime import datetime
import os
from scrapy.crawler import CrawlerProcess

global pin_code
pin_code = input("enter pin code")

class CareCredit1Spider(scrapy.Spider):
    name = 'care_credit_1'
    start_urls = ['https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty//?Sort=D&Radius=75&Page=1']

    def start_requests(self):
        directory = os.getcwd()
        options = webdriver.ChromeOptions()
        options.headless = True
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        path = (directory+r"\\Chromedriver.exe")
        driver = webdriver.Chrome(path, options=options)

        # URL of the website
        url = "https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty/" + pin_code + "/?Sort=D&Radius=75&Page=1"
        driver.maximize_window()

        # opening link in the browser
        driver.get(url)
        driver.implicitly_wait(200)

        try:
            cookies = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
            cookies.click()
        except:
            pass

        i = 0
        loadMoreButtonExists = True
        while loadMoreButtonExists:
            try:
                load_more = driver.find_element_by_xpath('//*[@id="next-page"]')
                load_more.click()
                driver.implicitly_wait(30)
            except ElementNotInteractableException:
                loadMoreButtonExists = False
            except ElementClickInterceptedException:
                pass
            except StaleElementReferenceException:
                pass
            except NoSuchElementException:
                loadMoreButtonExists = False

        try:
            previous_page = driver.find_element_by_xpath('//*[@id="previous-page"]')
            previous_page.click()
        except:
            pass

        name = driver.find_elements_by_class_name('dl-result-item')
        r = 1
        temp_list = []
        j = 0
        for element in name:
            link = element.find_element_by_tag_name('a')
            c = link.get_property('href')
            yield scrapy.Request(c)

    def parse(self, response):
        item = CareCreditItem()
        item['Practise_name'] = response.css('h1 ::text').get()
        item['address'] = response.css('.google-maps-external ::text').get()
        item['phone_no'] = response.css('.dl-detail-phone ::text').get()
        yield item

now = datetime.now()
dt_string = now.strftime("%d/%m/%Y")
dt = now.strftime("%H-%M-%S")
file_name = dt_string+"_"+dt+"zip-code"+pin_code+".csv"

process = CrawlerProcess(settings={
    'FEED_URI' : file_name,
    'FEED_FORMAT': 'csv'
})

process.crawl(CareCredit1Spider)
process.start()
print("CSV File is Ready")
items.py
import scrapy

class CareCreditItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Practise_name = scrapy.Field()
    address = scrapy.Field()
    phone_no = scrapy.Field()
The Question:
Essentially my question is simple: is there a way to optimize this code so it performs faster? Or what other methods could handle scraping this data without it taking forever?
Since the site loads the data dynamically from an API, you can retrieve the data directly from the API. This will speed things up quite a bit, but I'd still implement a wait to avoid hitting any rate limit.
import requests
import time
import pandas as pd

zipcode = '00704'
radius = 75

url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page=1'
req = requests.get(url)
r = req.json()
data = r['results']

for i in range(2, r['maxPage']+1):
    url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page={i}'
    req = requests.get(url)
    r = req.json()
    data.extend(r['results'])
    time.sleep(1)

df = pd.DataFrame(data)
# use '-' instead of '/' in the timestamp so the result is a valid file name
df.to_csv(f'{pd.Timestamp.now().strftime("%d-%m-%Y_%H-%M-%S")}zip-code{zipcode}.csv')
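To cover the full seed list, here is a hedged sketch of the same API approach looped over every zip code; the input file name zipcodes.csv and the helper name scrape_zip are hypothetical:

import requests
import time
import pandas as pd

def scrape_zip(zipcode, radius=75):
    # Same endpoint as above; fetch page 1, then paginate up to maxPage.
    base = ('https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService'
            f'&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}'
            f'&PracticePhone=&Profession=&location={zipcode}&Page=')
    r = requests.get(base + '1').json()
    results = r['results']
    for page in range(2, r['maxPage'] + 1):
        results.extend(requests.get(base + str(page)).json()['results'])
        time.sleep(1)  # stay polite and avoid hammering the API
    return results

all_rows = []
with open('zipcodes.csv') as f:  # hypothetical input: one zip code per line
    for line in f:
        zipcode = line.strip()
        if zipcode:
            all_rows.extend(scrape_zip(zipcode))

pd.DataFrame(all_rows).to_csv('carecredit_results.csv', index=False)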
There are multiple ways in which you can do this.
1. Creating a distributed system in which you run the spider across multiple machines so they work in parallel.
This, in my opinion, is the best of the options, as you can also create a scalable, dynamic solution that you will be able to reuse many times over.
There are many ways of doing this. Normally it consists of dividing the seed list (the zip codes) into many separate seed lists so that each process works with its own seed list; the downloads then run in parallel, so on 2 machines it goes 2 times faster, on 10 machines 10 times faster, and so on.
In order to do this I might suggest looking into AWS, namely AWS Lambda, AWS EC2 instances or even AWS Spot Instances; these are the ones I have worked with previously and they are not terribly hard to work with.
2. Alternatively, if you want to run it on a single machine, you can look into multithreading with Python, which can help you run the process in parallel on that one machine (see the sketch after this list).
3. This is another option, particularly if it is a one-off process. You can try running it simply with requests, which may speed it up, but with a massive number of seeds it is usually faster to develop a process that runs in parallel.
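For option 2, a minimal multithreading sketch (assuming the requests-based scrape_zip helper sketched above rather than Selenium, and a hypothetical short list of zip codes) could look like this:

from concurrent.futures import ThreadPoolExecutor

zip_codes = ['00704', '10001', '94105']  # hypothetical sample; in practice read all 42,000 from the CSV

with ThreadPoolExecutor(max_workers=8) as executor:
    # scrape_zip is the requests-based helper from the sketch above
    batches = list(executor.map(scrape_zip, zip_codes))

all_rows = [row for batch in batches for row in batch]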

How to use concurrent futures to web scrape faster? Selenium

I'm trying to scrape a list of URLs with Selenium and concurrent futures to speed up the process. I've found that I get a StaleElementReferenceException when using concurrent futures, and also the job titles do not correspond to the URLs; for instance, I get repeated job titles. When using a normal for loop I do not get this error.
I don't know what I'm doing wrong. Any help is welcomed.
My simplified code is:
import concurrent.futures
import time
from selenium import webdriver

options = webdriver.ChromeOptions()
#options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

PATH = "C:\Program Files (x86)\chromedriver.exe"
wd = webdriver.Chrome(PATH, options=options)
wd.maximize_window()

vurl = ['https://www.bumeran.com.pe/empleos/asistente-contable-exp.-en-concar-ssp-1114585777.html',
        'https://www.bumeran.com.pe/empleos/asesor-a-comercial-digital-de-seguro-vehicular-1114584904.html',
        'https://www.bumeran.com.pe/empleos/mecanico-de-mantenimiento-arequipa-1114585709.html',
        'https://www.bumeran.com.pe/empleos/almacenero-l.o.-electronics-s.a.c.-1114585629.html',
        'https://www.bumeran.com.pe/empleos/analista-de-comunicaciones-ingles-avanzado-teleperformance-peru-s.a.c.-1114564863.html',
        'https://www.bumeran.com.pe/empleos/vendedores-adn-retail-s.a.c.-1114585422.html',
        'https://www.bumeran.com.pe/empleos/especialista-de-intervencion-de-proyectos-mondelez-international-1114585461.html',
        'https://www.bumeran.com.pe/empleos/desarrollador-java-senior-inetum-peru-1114584840.html',
        'https://www.bumeran.com.pe/empleos/practicante-legal-coes-sinac-1114584788.html',
        'https://www.bumeran.com.pe/empleos/concurso-publico-n-143-especialista-en-presupuesto-banco-central-de-reserva-del-peru-1114584538.html',
        'https://www.bumeran.com.pe/empleos/concurso-n-147-especialista-en-analisis-de-infraestructuras-financieras-banco-central-de-reserva-del-peru-1114584444.html',
        'https://www.bumeran.com.pe/empleos/asistente-legal-magdalena-del-mar-los-portales-1114584305.html',
        'https://www.bumeran.com.pe/empleos/asistente-de-nuevos-negocios-inmobiliarios-madrid-ingenieros-1114584269.html',
        'https://www.bumeran.com.pe/empleos/trabajo-desde-tres-horas-por-dia-ventas-ventas-por-internet-1114584205.html']

vtitle = []

def get_urls(url):
    wd.get(url)
    wd.implicitly_wait(20)
    try:
        title = wd.find_element_by_xpath("//h1").text
        print('URL finished')
    except:
        title = ''
        print('Exception!')
    vtitle.append(title)
    vurl2.append(url)

# This throws an exception and does not scrape correctly
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(get_urls, vurl)

# output is for example
# ['ALMACENERO', 'ALMACENERO', 'ALMACENERO', 'ALMACENERO', 'Desarrollador Java (Senior)', 'Desarrollador Java (Senior)', 'Desarrollador Java (Senior)']
# when it should be:
# ['ALMACENERO', 'Analista de Comunicaciones - Inglés Avanzado', 'Vendedores', 'Especialista de Intervención de Proyectos', 'Desarrollador Java (Senior)', 'Practicante Legal', 'Asistente Legal - Magdalena del Mar']

# This works fine but is too slow
for url in vurl:
    get_urls(url)
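For reference, a minimal sketch of one common pattern: give each task its own driver instead of sharing the single wd instance across threads (sharing one driver is what produces the stale elements and repeated titles). It assumes ChromeDriver at the same PATH as above:

import concurrent.futures
from selenium import webdriver

PATH = "C:\Program Files (x86)\chromedriver.exe"

def get_title(url):
    # One driver per task so threads never share a browser session.
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(PATH, options=options)
    try:
        driver.get(url)
        driver.implicitly_wait(20)
        return url, driver.find_element_by_xpath("//h1").text
    except Exception:
        return url, ''
    finally:
        driver.quit()

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(get_title, vurl))  # vurl as defined above; map preserves input order

vurl2, vtitle = zip(*results)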

Driver.get a group of links?

How do I use driver.get to open several URLs in Chrome?
My code:
import requests
import json
import pandas as pd
from selenium import webdriver

chromeOptions = webdriver.ChromeOptions()
chromedriver = r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe", chrome_options=chromeOptions)

links = []
request1 = requests.get('https://api.beta.tab.com.au/v1/recommendation-service/featured-events?jurisdiction=NSW')
json1 = request1.json()
for n in json1['nextToGoRaces']:
    if n['meeting']['location'] in ['VIC','NSW','QLD','SA','WA','TAS','IRL']:
        links.append(n['_links']['self'])

driver.get('links')
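As a side note, driver.get('links') passes the literal string 'links' rather than the collected URLs; a minimal sketch of visiting each link in turn in the same browser (sequentially, not in parallel) would be:

for link in links:
    driver.get(link)
    # ... scrape or inspect the page here before moving on to the next URL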
Based on the comments - you'll want a class to manage your browsers, a class for your tests, then a runner to run in parallel.
Try this:
import unittest
import time
import testtools
from selenium import webdriver

class BrowserManager:
    browsers = []

    def createBrowser(self, url):
        browser = webdriver.Chrome()
        browser.get(url)
        self.browsers.append(browser)

    def getBrowserByPartialURL(self, url):
        for browser in self.browsers:
            if url in browser.current_url:
                return browser

    def CloseItAllDown(self):
        for browser in self.browsers:
            browser.close()

class UnitTest1(unittest.TestCase):
    def test_DoStuffOnGoogle(self):
        browser = b.getBrowserByPartialURL("google")
        # Point of this is to watch the output! You'll see this + the other test intermingled (proves parallel run)
        for i in range(10):
            print(browser.current_url)
            time.sleep(1)

    def test_DoStuffOnYahoo(self):
        browser = b.getBrowserByPartialURL("yahoo")
        # Point of this is to watch the output! You'll see this + the other test intermingled (proves parallel run)
        for i in range(10):
            print(browser.current_url)
            time.sleep(1)

# create a global variable for the browsers
b = BrowserManager()

# To run the tests
if __name__ == "__main__":
    ## move to an init to create your browsers
    b.createBrowser("https://www.google.com")
    b.createBrowser("https://www.yahoo.com")
    time.sleep(5)  # This is so you can see both open at the same time

    suite = unittest.TestLoader().loadTestsFromTestCase(UnitTest1)
    concurrent_suite = testtools.ConcurrentStreamTestSuite(lambda: ((case, None) for case in suite))
    concurrent_suite.run(testtools.StreamResult())
This code doesn't do anything exciting - it's an example of how to manage multiple browsers and run tests in parallel. It goes to the specified urls (which you should move to an init/setup), then prints out the URL it's on 10 times.
This is how you add a browser to the manager: b.createBrowser("https://www.google.com")
This is how you retrieve your browser: browser = b.getBrowserByPartialURL("google") - note it's a partial URL so you can use the domain as a keyword.
This is the output (just the first few lines, not all of it). It prints the URL for Google, then Yahoo, then Google, then Yahoo, showing that they're running at the same time:
PS C:\Git\PythonSelenium\BrowserManager> cd 'c:\Git\PythonSelenium'; & 'C:\Python38\python.exe' 'c:\Users\User\.vscode\extensions\ms-python.python-2020.7.96456\pythonFiles\lib\python\debugpy\launcher' '62426' '--' 'c:\Git\PythonSelenium\BrowserManager\BrowserManager.py'
DevTools listening on ws://127.0.0.1:62436/devtools/browser/7260dee3-368c-4f21-bd59-2932f3122b2e
DevTools listening on ws://127.0.0.1:62463/devtools/browser/9a7ce919-23bd-4fee-b302-8d7481c4afcd
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/

NameError: name 'driver' is not defined

I use the code below to scrape a friend list from a Facebook UID, and I am getting this error:
File "C:\Users\Tn\PycharmProjects\untitled\test\1.py", line 15, in friend_uid_list
soup = from_uid(uid)
File "C:\Users\Tn\PycharmProjects\untitled\test\1.py", line 11, in from_uid
driver.get('https://www.facebook.com/' + uid + '/friends')
NameError: name 'driver' is not defined
"""
Can you show me how to fix it? Thank you very much! Below is my code:
import multiprocessing
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

def from_uid(uid):
    driver.get('https://www.facebook.com/' + uid + '/friends')
    return BeautifulSoup(driver.page_source, "html5lib")

def friend_uid_list(uid):
    soup = from_uid(uid)
    friends = soup.find_all("div", class_="fsl fwb fcb")
    target = open('C:/friend_uid_list.txt', 'a')
    for href in friends:
        href = href.find('a')
        try:
            target.write(href + "\n")
        except:
            pass
    target.close()

if __name__ == '__main__':
    driver = webdriver.Firefox()
    driver.get("https://www.facebook.com/")
    driver.find_element_by_css_selector("#email").send_keys("myemail@gmail.com")
    driver.find_element_by_css_selector("#pass").send_keys("mypass")
    driver.find_element_by_css_selector("#u_0_m").click()

    pool = multiprocessing.Pool(3)
    pool.map(friend_uid_list, [100004159542140, 100004159542140, 100004159542140])
The reason is simple: you create new processes, and they can't see the variables in the main process.
There are several solutions:
Pass the variables you need as arguments. But this is not possible here, since driver is not picklable.
Create a new driver for each process (see the sketch below).
Use multithreading instead of multiprocessing. However, I'm not sure whether Selenium works this way; you'll have to test it yourself.
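A minimal sketch of the second option (a new driver per process) might look like the following. It is only a sketch: the login step from the original script would also have to be repeated inside each worker before the friends page is reachable, and concurrent appends to the same file may interleave:

import multiprocessing
from bs4 import BeautifulSoup
from selenium import webdriver

def friend_uid_list(uid):
    # Each worker process creates (and would need to log in) its own driver.
    driver = webdriver.Firefox()
    try:
        driver.get('https://www.facebook.com/' + str(uid) + '/friends')
        soup = BeautifulSoup(driver.page_source, "html5lib")
        friends = soup.find_all("div", class_="fsl fwb fcb")
        with open('C:/friend_uid_list.txt', 'a') as target:
            for div in friends:
                a = div.find('a')
                if a is not None:
                    target.write(a.get('href', '') + "\n")
    finally:
        driver.quit()

if __name__ == '__main__':
    with multiprocessing.Pool(3) as pool:
        pool.map(friend_uid_list, ['100004159542140'])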

Python Selenium Function In Seperate File - NameError

I am building a Python script and want to split up certain functions into separate files to make maintenance easier.
I have two files currently called main.py and function1.py
main.py
#Setup Imports
import os
import os.path
import sys

# Import Functions
from function1 import myfunction

#Setup Selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

#Launch Firefox
def init_driver():
    driver = webdriver.Firefox()
    return driver

url_list = ['http://www.example.com/page1', 'http://www.example.com/contact', 'http://www.example.com/about', 'http://www.example.com/test'];

driver = init_driver()

# Init Blank List
checked_urls = []

for url in url_list:
    myfunction(driver)

print(checked_urls)
function1.py
def myfunction(driver):
    driver.get(url)
    htmlText = driver.find_element_by_css_selector("#phrase").text

    if "This Is My Phrase" in htmlText:
        checked_urls.extend(['PHRASE_FOUND'])
    else:
        checked_urls.extend(['PHRASE_FOUND'])
I am trying to get it to visit each URL in the list and check for "This Is My Phrase" on the page. If it finds it, then it should add to the list.
I am seeing the following error when running the script...
NameError: name 'url' is not defined
I'm pretty sure it's related to the way I'm importing the separate function, but I can't work out what's wrong. Can anyone help?
You also have to pass the url variable to myfunction:
def myfunction(driver, url):
    driver.get(url)
    htmlText = driver.find_element_by_css_selector("#phrase").text

    if "This Is My Phrase" in htmlText:
        checked_urls.extend(['PHRASE_FOUND'])
    else:
        checked_urls.extend(['PHRASE_FOUND'])
Then in the main file:
for url in url_list:
    myfunction(driver, url)
I think some code should be corrected:
First, delete the blank space before url_list:
#url_list = ['http://www.example.com/page1', 'http://www.example.com/contact', 'http://www.example.com/about', 'http://www.example.com/test'];
url_list = ['http://www.example.com/page1', 'http://www.example.com/contact', 'http://www.example.com/about', 'http://www.example.com/test'];
Then, url is a local variable; it's not directly accessible inside myfunction, but it can be accessed as a function parameter:
def myfunction(driver, url):
...
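Note that function1.py will then hit the same NameError for checked_urls, since that list also lives in main.py. A hedged sketch of one way around it (assuming the else branch was meant to record 'PHRASE_NOT_FOUND') is to return the result and collect it in the main file:

# function1.py
def myfunction(driver, url):
    driver.get(url)
    htmlText = driver.find_element_by_css_selector("#phrase").text
    return 'PHRASE_FOUND' if "This Is My Phrase" in htmlText else 'PHRASE_NOT_FOUND'

# main.py
checked_urls = []
for url in url_list:
    checked_urls.append(myfunction(driver, url))
print(checked_urls)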
