I have written a function based on Selenium and I want it to parse multiple webpages simultaneously. I have a list of URLs that I pass to the function, and I want to scrape them all at the same time to save time.
I created a scraper.py file where I put the scraper function:
def parser_od(url):
    """Scrape the listing price from a single offer page.

    Relies on a module-level ``driver`` (a Selenium WebDriver) being defined
    in the same module as this function.

    Args:
        url: Address of the offer page to load.

    Returns:
        A one-element list holding the price text with spaces and the "zł"
        suffix removed and the decimal comma replaced by a dot, or ``[""]``
        when the price element is not present on the page.
    """
    driver.get(url)
    try:
        # The lookup has to go through the same ``driver`` that loaded the
        # page, and XPath attribute tests are written with "@", not "#".
        raw_price = driver.find_element_by_xpath(
            "//*[@id='root']/article/header/div[2]/div[1]/div[2]"
        ).text
    except NoSuchElementException:
        return [""]
    return [raw_price.replace(" ", "").replace("zł", "").replace(",", ".")]
Now I want to use the function to parse multiple urls from my urls at the same time using multiprocessing library:
# Driver script: pull in the scraper, open Chrome, then hand the offer URLs
# to a pool of four worker processes.
from scraper import *
from multiprocessing import Pool

url_list = [
    'https://www.otodom.pl/oferta/2-duze-pokoje-we-wrzeszczu-do-zamieszania-ID42f6s',
    'https://www.otodom.pl/oferta/mieszkanie-na-zamknietym-osiedlu-z-ogrodkiem-ID40ZxM',
    'https://www.otodom.pl/oferta/zaciszna-nowe-mieszkanie-3-pokoje-0-ID41UaX',
    'https://www.otodom.pl/oferta/dwupoziomowe-dewel-mieszkanie-101-m2-lebork-i-p-ID3JEcQ',
]
driver = webdriver.Chrome(executable_path=r"C:\Users\Admin\chromedriver.exe")

with Pool(4) as pool:
    # One parser_od call per URL, spread over the worker processes.
    price = pool.map(parser_od, url_list)
But I get following error:
NameError: name 'driver' is not defined
Which is weird because chrome is opened up.
Edit:
I need to have the browser(s) open while running this scraper, so that the driver is opened once beforehand rather than every time this function is invoked.
You should probably just split the list of URLs you want to process into 4 equal parts, and have one driver per process in the Pool, each handling one of those parts.
def parser_od(urls, thread_index, n_workers=4):
    """Scrape prices for the share of ``urls`` that belongs to one worker.

    The worker opens its own Chrome driver, visits every URL whose position
    in ``urls`` satisfies ``position % n_workers == thread_index`` and quits
    the driver when it is done, even if a page load raises.

    Args:
        urls: The complete list of offer URLs shared by all workers.
        thread_index: Index of this worker.
        n_workers: Total number of workers the list is split across.

    Returns:
        A list with one entry per visited URL. Each entry is a one-element
        list holding the cleaned price text, or ``[""]`` when the price
        element was not found.
    """
    driver = webdriver.Chrome(executable_path=r"C:\Users\Admin\chromedriver.exe")
    prices = []
    try:
        for position, url in enumerate(urls):
            if position % n_workers != thread_index:
                continue  # this URL is handled by another worker
            driver.get(url)
            try:
                # Query the driver created above; XPath attributes use "@".
                raw_price = driver.find_element_by_xpath(
                    "//*[@id='root']/article/header/div[2]/div[1]/div[2]"
                ).text
                price = [raw_price.replace(" ", "").replace("zł", "").replace(",", ".")]
            except NoSuchElementException:
                price = [""]
            prices.append(price)
    finally:
        # Always release the browser, otherwise every worker leaks a Chrome.
        driver.quit()
    return prices
from functools import partial
from multiprocessing import Pool

# The guard is required where worker processes are spawned (e.g. Windows):
# each child re-imports this module and must not start a pool of its own.
if __name__ == "__main__":
    with Pool(4) as p:
        # partial() is picklable (a lambda is not) and binds the URL list as
        # the first argument, so each task only supplies its worker index.
        price = p.map(partial(parser_od, url_list), range(4))
Related
Can you help me find a bug in my code ?
I'm trying to speed up web scraping URLs gathered by using googlesearch.search.
*Note!
This seems to be a similar issue to the one described in this post:
Concurrent.futures + requests_html's render() = "There is no current event loop in thread 'ThreadPoolExecutor-0_0'."
But after attempting to implement it the way it was described there, I still can't get rid of my issue.
Here's my original code so far:
from requests_html import HTMLSession
import multiprocessing as mp
import concurrent.futures
from googlesearch import search
# Fetch 20 search results for "funny cats".
def getURLs():
    """Run the "funny cats" search and return its results as a list."""
    return list(search("funny cats", tld='com', num=20, stop=20, pause=2))
# Deal the items round-robin into one sub-list per core, e.g. 20 URLs on
# 4 cores become 4 lists of 5 URLs, each meant for one worker.
def fillContainer(some_iterable, cores=None):
    """Distribute the items of ``some_iterable`` round-robin over sub-lists.

    Args:
        some_iterable: Any iterable; it is consumed completely.
        cores: Number of sub-lists to create. Defaults to
            ``mp.cpu_count()`` when omitted.

    Returns:
        A list of ``cores`` lists; item ``i`` of the input lands in
        sub-list ``i % cores``.

    Raises:
        ValueError: If ``cores`` is smaller than 1.
    """
    if cores is None:
        cores = mp.cpu_count()
    if cores < 1:
        # With no sub-lists there is nowhere to put the items.
        raise ValueError("cores must be at least 1")
    container = [[] for _ in range(cores)]
    for position, item in enumerate(some_iterable):
        container[position % cores].append(item)
    return container
def processURL(urls):
    """Fetch and render every URL in ``urls`` through one shared HTMLSession.

    Any exception raised while handling a URL is printed and the loop moves
    on to the next URL. Nothing is returned.
    """
    with HTMLSession() as http:
        for u in urls:
            try:
                page = http.get(u)
                page.raise_for_status()
                page.html.render()
                # plus some regex to process html, but that's not the point
            except Exception as e:
                print(f"ERROR !!! {e} , accessing URL: {u} , Movinh on ...")
def main():
    """Collect the URLs, batch them and submit each batch to a thread pool."""
    batches = fillContainer(getURLs())
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # One processURL task per batch; leaving the ``with`` block waits
        # for all of them to finish.
        futures = [executor.submit(processURL, batch) for batch in batches]


if __name__ == '__main__':
    main()
I get the :
There is no current event loop in thread 'ThreadPoolExecutor'
Error for each URL I try to process using my processURL() function. I also tried using executor.map(processURL, URLs), but with no success.
Thank you for your help.
#EDIT #1
It seems that there's a problem with line: response.html.render() ,
however I don't know how to deal with it.
I have built a webscraper using python and selenium with geckodriver, it is currently running in an EC2 instance on a crontab schedule.
My issue is that it takes more than 5 minutes to finish downloading. I want to use AWS Lambda functions to run my scraper, but they only allow 5 minutes of runtime.
So I have a code similar to this.
from selenium import webdriver
def start_browser(url):
    """Start a Firefox WebDriver using ./geckodriver and open ``url``.

    Args:
        url: Address to load in the new browser.

    Returns:
        The WebDriver instance, already navigated to ``url``.
    """
    browser = webdriver.Firefox(executable_path="./geckodriver")
    browser.get(url)
    return browser
def log_in(user, password, user_elem, pass_elem, login_elem, browser):
    """Fill in the credential fields and press the login control.

    Args:
        user: User name to type into ``user_elem``.
        password: Password to type into ``pass_elem`` (``pass`` is a reserved
            word and cannot be used as a parameter name).
        user_elem: Element that receives the user name.
        pass_elem: Element that receives the password.
        login_elem: Element that submits the form when clicked.
        browser: The WebDriver the elements belong to; returned unchanged.

    Returns:
        ``browser``, so calls can be chained.
    """
    # click() returns None in Selenium, so send_keys cannot be chained onto
    # it; focus the field first, then type into the element itself.
    user_elem.click()
    user_elem.send_keys(user)
    pass_elem.click()
    pass_elem.send_keys(password)
    login_elem.click()
    return browser
def nav_to_data(browser, data_elem):
    """Click ``data_elem`` and return ``browser`` unchanged.

    Args:
        browser: The WebDriver in use; passed straight through.
        data_elem: Element to click.

    Returns:
        ``browser``.
    """
    data_elem.click()
    return browser
def find_data(browser, data_table):
    """Collect the ``tr`` elements found under ``data_table``.

    Args:
        browser: The WebDriver in use; passed straight through.
        data_table: Element that is searched for ``tr`` tags.

    Returns:
        A ``(data_links, browser)`` tuple, where ``data_links`` is whatever
        ``data_table.find_elements_by_tag_name("tr")`` returned.
    """
    data_links = data_table.find_elements_by_tag_name("tr")
    return data_links, browser
I'm thinking these functions could be run as Lambda functions, passing the browser/webdriver instance to each other?
The part I'm struggling with is looping through the data and waiting for all downloads to finish, this would take longer than 5 mins.
Is there any way around this?
def download_data(browser, link, wait_seconds=2):
    """Open a data page via ``link`` and trigger its XLS download.

    Args:
        browser: WebDriver used to locate the download button.
        link: Element that leads to the page with the download button.
        wait_seconds: Pause after clicking ``link`` before the button is
            looked up.

    Returns:
        Whatever ``click()`` on the ``download_xls_file`` element returns.
        NOTE(review): Selenium's ``click()`` returns None, so callers
        presumably receive None rather than a file path - confirm how the
        path is meant to be obtained.
    """
    link.click()
    time.sleep(wait_seconds)  # give the new page time to load
    download_elem = browser.find_element_by_id("download_xls_file")
    download_path = download_elem.click()
    return download_path
# THIS TAKES LONGER THAN 5 mins
# Each download_data call clicks a link to a new page with a download button
# and returns the path to the .xls file.
download_paths = [download_data(browser, link) for link in data_links]
upload_data()
You can partition your data and use a recursive lambda to process chunks of your list.
Taking an example from my blog
def invoke_self_async(data_list, context, chunk_size=20):
    """Process one chunk of ``data_list`` and delegate the rest to a new run.

    The first ``chunk_size`` items are downloaded in this invocation. Any
    remaining items are handed to a fresh asynchronous invocation of the same
    Lambda function as ``{"data": [...]}``.

    Args:
        data_list: Items still waiting to be processed.
        context: Lambda context object; ``invoked_function_arn`` identifies
            the function to re-invoke.
        chunk_size: How many items a single invocation handles.

    Returns:
        The list of ``download_data`` results for this invocation's chunk.
    """
    this_data_list = data_list[:chunk_size]
    remaining = data_list[chunk_size:]

    if remaining:
        # Re-invoke only while work is left; an unconditional call would keep
        # spawning invocations with an empty list forever.
        # NOTE(review): boto3 documents invoke_async as deprecated in favour
        # of invoke(InvocationType="Event") - consider migrating.
        boto3.client('lambda').invoke_async(
            FunctionName=context.invoked_function_arn,
            InvokeArgs=json.dumps({'data': remaining}),
        )

    # Only this invocation's chunk is processed here, not the whole list.
    # NOTE(review): ``browser`` is expected to exist at module level.
    return [download_data(browser, data) for data in this_data_list]
I need help with a feature I try to implement, unfortunately I'm not very comfortable with multithreading.
My script downloads 4 different files from the internet and calls a dedicated function for each one, then saves them all.
The problem is that I'm doing it step by step, therefore I have to wait for each download to finish in order to proceed to the next one.
I can see what I should do to solve this, but I haven't managed to code it.
Actual Behaviour:
# Current, sequential flow: every download is followed by its handler before
# the next download starts, and everything is saved at the end.
url_list = [Url1, Url2, Url3, Url4]
files_list = []
files_list.append(downloadFile(Url1))
# Handle the file that was just appended.
handleFile(files_list[-1], type=0)
...
files_list.append(downloadFile(Url4))
handleFile(files_list[-1], type=3)
saveAll(files_list)
Needed Behaviour:
# Desired flow (pseudocode, not runnable as written): download each URL in
# its own thread, run the matching handler once that download has finished,
# and save everything after all files have been handled.
url_list = [Url1, Url2, Url3, Url4]
files_list = []
for url in url_list:
callThread(files_list.append(downloadFile(url)), # function
handleFile(files_list[url.index], type=url.index) # trigger
#use a thread for downloading
#once file is downloaded, it triggers his associated function
#wait for all files to be treated
saveAll(files_list)
Thanks for your help !
A typical approach is to put the I/O-heavy part, such as fetching data over the internet, and the data processing into the same function:
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor
import requests
def fetch_and_process_file(url):
    """Download ``url`` and process the response in the calling thread.

    Progress is printed together with the current thread's name so the
    interleaving of the workers is visible.

    Args:
        url: Address to fetch with ``requests.get``.

    Returns:
        The squared length of the response text (a stand-in for real
        processing).
    """
    # current_thread() replaces the deprecated camelCase currentThread().
    thread_name = threading.current_thread().name
    print(thread_name, "fetch", url)
    data = requests.get(url).text
    # "process" result
    time.sleep(random.random() / 4)  # simulate work
    print(thread_name, "process data from", url)
    result = len(data) ** 2
    return result
# Run fetch_and_process_file for every URL on a two-thread pool, then print
# the collected results in input order.
threads = 2
urls = ["https://google.com", "https://python.org", "https://pypi.org"]

with ThreadPoolExecutor(max_workers=threads) as executor:
    results = executor.map(fetch_and_process_file, urls)

print()
print("results:", list(results))
outputs:
ThreadPoolExecutor-0_0 fetch https://google.com
ThreadPoolExecutor-0_1 fetch https://python.org
ThreadPoolExecutor-0_0 process data from https://google.com
ThreadPoolExecutor-0_0 fetch https://pypi.org
ThreadPoolExecutor-0_0 process data from https://pypi.org
ThreadPoolExecutor-0_1 process data from https://python.org
To scrape a pool of URLs, I am parallelizing Selenium with joblib. In this context, I am facing two challenges:
Challenge 1 is to speed up this process. At the moment, my code opens and closes a driver instance for every URL (ideally there would be one per process).
Challenge 2 is to get rid of the CPU-intensive while loop that I think I need to continue on empty results (I know that this is most likely wrong)
Pseudocode:
# Pseudocode from the question: scrape one URL with a freshly created Firefox
# driver, save the results, and record any exception.
# NOTE(review): as written, `continue` presumably restarts the `while True`
# loop and scrapes the same URL again instead of skipping it - confirm.
# NOTE(review): driver.close() looks unreachable when an earlier line in the
# `try` raises, which would leave that browser open - confirm.
URL_list = [URL1, URL2, URL3, ..., URL100000] # List of URLs to be scraped
def scrape(URL):
while True: # Loop needed to use continue
try: # Try scraping
driver = webdriver.Firefox(executable_path=path) # Set up driver
website = driver.get(URL) # Get URL
results = do_something(website) # Get results from URL content
driver.close() # Close worker
if len(results) == 0: # If do_something() failed:
continue # THEN Worker to skip URL
else: # If do_something() worked:
safe_results("results.csv") # THEN Save results
break # Go to next worker/URL
except Exception as e: # If something weird happens:
save_exception(URL, e) # THEN Save error message
break # Go to next worker/URL
# Two surplus closing parentheses removed so the call is balanced.
Parallel(n_jobs=40)(delayed(scrape)(URL) for URL in URL_list)  # Run in 40 processes
My understanding is that in order to re-use a driver instance across iterations, the # Set up driver-line needs to be placed outside scrape(URL). However, everything outside scrape(URL) will not find its way to joblib's Parallel(n_jobs = 40). This would imply that you can't reuse driver instances while scraping with joblib which can't be true.
Q1: How to reuse driver instances during parallel processing in the above example?
Q2: How to get rid of the while-loop while maintaining functionality in the above-mentioned example?
Note: Flash and image loading is disabled in firefox_profile (code not shown)
1) You should first create a set of drivers, one for each worker, and pass an instance to each worker. I don't know how to pass drivers to a Parallel object, but you can use threading.current_thread().name as a key to identify drivers. To do that, use backend="threading". Each thread will then have its own driver.
2) You don't need a loop at all. The Parallel object itself iterates over all your URLs (I hope I have correctly understood why you wanted the loop).
import threading
from joblib import Parallel, delayed
from selenium import webdriver
def scrape(URL):
    """Scrape ``URL`` with the Firefox driver owned by the current thread.

    Drivers are kept in the module-level ``drivers`` dict, keyed by thread
    name; a thread creates its driver the first time it runs this function.
    ``safe_results("results.csv")`` is called when ``do_something`` returns a
    truthy value.
    """
    thread_name = threading.current_thread().name
    if thread_name not in drivers:
        # First task on this thread: start its browser.
        drivers[thread_name] = webdriver.Firefox()
    driver = drivers[thread_name]

    driver.get(URL)
    results = do_something(driver)
    if results:
        safe_results("results.csv")
# Thread name -> Firefox driver, filled lazily by scrape().
drivers = {}
try:
    Parallel(n_jobs=-1, backend="threading")(delayed(scrape)(URL) for URL in URL_list)
finally:
    # Quit every browser that was started, even when a job raised; otherwise
    # the Firefox processes stay behind.
    for driver in drivers.values():
        driver.quit()
But I don't really think you gain anything by using more n_jobs than you have CPUs, so n_jobs=-1 is probably the best choice (of course I may be wrong; try it).
I'm trying to do the following: grab some information off a page and then insert it into a MongoDB database. There is a list of pages, and I want to use multiprocessing because these pages can take time to load. Once the webdriver returns the result, I want to insert it into the db. The problem I'm facing is that I'm only getting 1/4 of the results I'm expecting in the db, so I imagine the way I'm managing the results and the inserting isn't working. I was hoping someone could show me where I've gone wrong. The following is an example of the code:
from multiprocessing.dummy import Pool
from multiprocessing import cpu_count
from selenium import webdriver
import timeit
from pymongo import MongoClient
def mp_worker(urls):
    """Load one page in its own Chrome instance and return the scraped data.

    Args:
        urls: Page identifier appended to ``"http://website"`` to form the
            address to load.

    Returns:
        ``what_you_want`` - the question's placeholder for whatever is
        extracted from the page.
    """
    driver = webdriver.Chrome(chromedriver,
                              chrome_options=options)
    try:
        url = "http://website" + urls
        driver.get(url)
        return what_you_want
    finally:
        # The original quit() sat after the return and never ran, leaking one
        # browser per URL. quit() (rather than close()) shuts down the whole
        # driver session, and ``finally`` guarantees it runs on every path.
        driver.quit()
def mp_handler():
    """Scrape a fixed list of page ids on a thread pool and write each result.

    The pool has ``cpu_count() * 2`` workers; results are consumed in input
    order via ``imap`` and passed to ``db.restaurants.update``.
    """
    urls = [
        "14360705", "4584061", "13788961", "6877217", "13194596",
        "13400479", "9868014", "8524704", "16394198", "16315464",
    ]
    client = MongoClient()
    db = client.test
    # NOTE(review): ``collection`` is assigned but not used below.
    collection = db['test-collection']
    pool = Pool(cpu_count() * 2)
    for document in pool.imap(mp_worker, urls):
        db.restaurants.update(document, {"upsert":"True"})
if __name__ == '__main__':
    # Time one full mp_handler() run and print the elapsed seconds.
    started = timeit.default_timer()
    mp_handler()
    finished = timeit.default_timer()
    print(finished - started)
This syntax is incorrect:
db.restaurants.update(result,{"upsert":"True"})
You want, likely:
db.restaurants.insert(result)
Or:
db.restaurants.update(filter, result, upsert=True)
Where "filter" is a MongoDB query (expressed as a Python dict) that uniquely matches the document you want to update or create.