Python multithreading crawler for unknown size

I have a list of pages to crawl using Selenium.
Let's say the website is example.com/1...N (where N is unknown).
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

def crawl_example(page):
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(f"https://example.com/{page}")
    # Do some processing
    result = "Fetched data"
    driver.quit()
    return result

N_THREAD = 10
MAX_SIZE = 100

with ThreadPoolExecutor(N_THREAD) as ex:
    futures = [ex.submit(crawl_example, page) for page in range(MAX_SIZE)]
Setting MAX_SIZE makes unnecessary requests for pages after N, so I wanted to find a better solution.
I could only think of creating a global variable (is_done) or adding another parameter to the function.
What would be the most Pythonic approach to this issue?

Initialize a last_page variable to infinity (preferably as a class variable) and update it while crawling with the following logic. Since two threads can update last_page at the same time, guard it with a lock so that a higher page number cannot overwrite a last_page value already set by a lower page:
from threading import Lock

last_page = float("inf")     # pages above this are known not to exist
last_page_lock = Lock()

def crawl_page(page):
    global last_page
    if page > last_page:
        return               # a lower page already found the end, skip this one
    if page_empty():         # page_empty(): your check for "this page has no content"
        with last_page_lock:
            last_page = min(last_page, page)
    ...
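For completeness, here is a minimal end-to-end sketch of this approach, assuming a hypothetical fetch_page(page) helper that returns None for a page that does not exist. Pages are submitted in fixed-size batches, and submission stops once last_page has been found.

from concurrent.futures import ThreadPoolExecutor
from threading import Lock

N_THREAD = 10
BATCH = 50

last_page = float("inf")
last_page_lock = Lock()

def crawl_page(page):
    global last_page
    if page > last_page:
        return None                      # the end was already found below this page
    data = fetch_page(page)              # hypothetical helper: returns None for a missing page
    if data is None:
        with last_page_lock:
            last_page = min(last_page, page)
    return data

results = []
start = 1
with ThreadPoolExecutor(N_THREAD) as ex:
    while start <= last_page:
        futures = [ex.submit(crawl_page, p) for p in range(start, start + BATCH)]
        for fut in futures:
            data = fut.result()
            if data is not None:
                results.append(data)
        start += BATCH

Batching keeps the number of wasted requests past the real last page bounded by roughly one batch, instead of MAX_SIZE minus N.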

Related

How to handle large scale Web Scraping?

The Situation:
I recently started web scraping using Selenium and Scrapy, and I am working on a project where I have a CSV file containing 42 thousand zip codes. My job is to take each zip code, enter it on this site, and scrape all of the results.
The Problem:
The problem is that in doing this I have to keep clicking the 'load more' button until all the results have been displayed, and only once that has finished can I collect the data.
This may not sound like much of an issue, however it takes 2 minutes per zip code and I have 42,000 zip codes to get through.
The Code:
import scrapy
from numpy.lib.npyio import load
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, ElementNotSelectableException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from items import CareCreditItem
from datetime import datetime
import os
from scrapy.crawler import CrawlerProcess

global pin_code
pin_code = input("enter pin code")

class CareCredit1Spider(scrapy.Spider):
    name = 'care_credit_1'
    start_urls = ['https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty//?Sort=D&Radius=75&Page=1']

    def start_requests(self):
        directory = os.getcwd()
        options = webdriver.ChromeOptions()
        options.headless = True
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        path = (directory + r"\\Chromedriver.exe")
        driver = webdriver.Chrome(path, options=options)

        # URL of the website
        url = "https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty/" + pin_code + "/?Sort=D&Radius=75&Page=1"
        driver.maximize_window()

        # opening link in the browser
        driver.get(url)
        driver.implicitly_wait(200)

        try:
            cookies = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
            cookies.click()
        except:
            pass

        i = 0
        loadMoreButtonExists = True
        while loadMoreButtonExists:
            try:
                load_more = driver.find_element_by_xpath('//*[@id="next-page"]')
                load_more.click()
                driver.implicitly_wait(30)
            except ElementNotInteractableException:
                loadMoreButtonExists = False
            except ElementClickInterceptedException:
                pass
            except StaleElementReferenceException:
                pass
            except NoSuchElementException:
                loadMoreButtonExists = False

        try:
            previous_page = driver.find_element_by_xpath('//*[@id="previous-page"]')
            previous_page.click()
        except:
            pass

        name = driver.find_elements_by_class_name('dl-result-item')
        r = 1
        temp_list = []
        j = 0
        for element in name:
            link = element.find_element_by_tag_name('a')
            c = link.get_property('href')
            yield scrapy.Request(c)

    def parse(self, response):
        item = CareCreditItem()
        item['Practise_name'] = response.css('h1 ::text').get()
        item['address'] = response.css('.google-maps-external ::text').get()
        item['phone_no'] = response.css('.dl-detail-phone ::text').get()
        yield item

now = datetime.now()
dt_string = now.strftime("%d/%m/%Y")
dt = now.strftime("%H-%M-%S")
file_name = dt_string + "_" + dt + "zip-code" + pin_code + ".csv"

process = CrawlerProcess(settings={
    'FEED_URI': file_name,
    'FEED_FORMAT': 'csv'
})
process.crawl(CareCredit1Spider)
process.start()
print("CSV File is Ready")
items.py
import scrapy

class CareCreditItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Practise_name = scrapy.Field()
    address = scrapy.Field()
    phone_no = scrapy.Field()
The Question:
Essentially my question is simple: is there a way to optimize this code so that it performs faster? Or what other methods are there to handle scraping this data without it taking forever?
Since the site loads the data dynamically from an API, you can retrieve the data directly from the API. This will speed things up quite a bit, but I'd still implement a wait to avoid hitting a rate limit.
import requests
import time
import pandas as pd

zipcode = '00704'
radius = 75

url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page=1'
req = requests.get(url)
r = req.json()
data = r['results']

for i in range(2, r['maxPage'] + 1):
    url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page={i}'
    req = requests.get(url)
    r = req.json()
    data.extend(r['results'])
    time.sleep(1)

df = pd.DataFrame(data)
# use dashes in the timestamp so the file name contains no path separators
df.to_csv(f'{pd.Timestamp.now().strftime("%d-%m-%Y_%H-%M-%S")}zip-code{zipcode}.csv')
There are multiple ways in which you can do this.
1. Create a distributed system in which you run the spider across multiple machines so the work runs in parallel.
This, in my opinion, is the best of the options, as you can also build a scalable, dynamic solution that you will be able to reuse many times over.
There are many ways of doing this. Normally it consists of dividing the seed list (the zip codes) into many separate seed lists so that separate processes each work on their own seed list; the downloads then run in parallel, so on 2 machines it goes 2 times faster, on 10 machines 10 times faster, and so on.
To do this I would suggest looking into AWS, namely AWS Lambda, AWS EC2 instances or even AWS Spot Instances; these are the ones I have worked with previously and they are not terribly hard to work with.
2. Alternatively, if you want to run it on a single machine, you can look into multithreading with Python, which can help you run the process in parallel on that one machine (see the sketch after this list).
3. This is another option, particularly if it is a one-off process. You can try running it simply with requests, which may speed things up, but with a massive number of seeds it is usually faster to develop a process that runs in parallel.
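As a rough sketch of options 2 and 3 combined (not part of the original answer), the API calls above can be fanned out over a thread pool. fetch_zipcode below is a hypothetical helper that wraps the paged request loop shown earlier; the worker count and output file name are arbitrary.

from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import requests
import pandas as pd

def fetch_zipcode(zipcode, radius=75):
    """Hypothetical helper: pull every results page for one zip code from the API."""
    def page_url(page):
        return ('https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService'
                f'&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}'
                f'&PracticePhone=&Profession=&location={zipcode}&Page={page}')
    r = requests.get(page_url(1)).json()
    data = r['results']
    for page in range(2, r['maxPage'] + 1):
        data.extend(requests.get(page_url(page)).json()['results'])
        time.sleep(1)                      # keep each thread polite to the API
    return data

zip_codes = ['00704', '00705']             # in practice, load the 42,000 codes from the CSV
all_rows = []
with ThreadPoolExecutor(max_workers=8) as ex:
    futures = {ex.submit(fetch_zipcode, z): z for z in zip_codes}
    for fut in as_completed(futures):
        all_rows.extend(fut.result())

pd.DataFrame(all_rows).to_csv('carecredit_results.csv', index=False)

Keeping the one-second sleep inside each worker means every thread stays polite on its own, while the pool still processes several zip codes at once.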

Skip selenium Webdriver.get() call inside for loop if it takes too long

Hey guys, I'm having trouble understanding how to add exceptions to a for-in-range loop. Right now I'm pulling URLs from an Excel sheet and scraping the information while moving through the pages until I reach page 200. The thing is that not all URLs have pages up to 200, so it takes a lot of time until the loop ends and the program can continue with the next URL. Is there a way to implement exceptions in the code here?
from selenium import webdriver
import pandas as pd
import time

driver = webdriver.Chrome("C:/Users/Acer/Desktop/chromedriver.exe")

companies = []
df = pd.read_excel('C:/Users/Acer/Desktop/urls.xlsx')

for index, row in df.iterrows():
    base_url = (row['urls'])
    for i in range(1, 201, 1):
        url = "{base_url}?curpage={i}".format(base_url=base_url, i=i)
        driver.get(url)
        time.sleep(2)
        name = driver.find_elements_by_xpath('//a/div/div/p')
        for names in name:
            print(names.text, url)
            companies.append([names.text, url])
You can set a max timeout on the Webdriver and then watch for Timeout exceptions in the loop:
from selenium.common.exceptions import TimeoutException

MAX_TIMEOUT_SECONDS = 5
driver = webdriver.Chrome("C:/Users/Acer/Desktop/chromedriver.exe")
driver.set_page_load_timeout(MAX_TIMEOUT_SECONDS)

for i in range(1, 201):
    try:
        url = "{base_url}?curpage={i}".format(base_url=base_url, i=i)
        driver.get(url)
    except TimeoutException:
        # skip this page if it takes more than 5 seconds to load
        continue
    ...  # process the scraped URL as usual
If a timeout occurs, the current iteration is skipped via continue.

Is time.sleep() enough to safely create a delay for a simple webscraper?

I'm using web scraping code, without a headless browser, to scrape about 500 inputs from Transfermarkt for a personal project.
According to best practices, I need to randomize my scraping pattern, use a delay, and handle errors/loading delays in order to scrape Transfermarkt without raising any flags.
I understand how Selenium and ChromeDriver can help with all of these so I can scrape more safely, but I've used requests and BeautifulSoup to create a much simpler web scraper:
import requests, re, ast
from bs4 import BeautifulSoup
import pandas as pd

i = 1
url_list = []
while True:
    page = requests.get('https://www.transfermarkt.us/spieler-statistik/wertvollstespieler/marktwertetop?page=' + str(i), headers={'User-Agent': 'Mozilla/5.0'}).text
    parsed_page = BeautifulSoup(page, 'lxml')
    all_links = []
    for link in parsed_page.find_all('a', href=True):
        link = str(link['href'])
        all_links.append(link)
    r = re.compile('.*profil/spieler.*')
    player_links = list(filter(r.match, all_links))
    for plink in range(0, 25):
        url_list.append('https://www.transfermarkt.us' + player_links[plink])
    i += 1
    if i > 20:
        break

final_url_list = []
for i in url_list:
    int_page = requests.get(i, headers={'User-Agent': 'Mozilla/5.0'}).text
    parsed_int_page = BeautifulSoup(int_page, 'lxml')
    graph_container = parsed_int_page.find('div', class_='large-7 columns small-12 marktwertentwicklung-graph')
    graph_a = graph_container.find('a')
    graph_link = graph_a.get('href')
    final_url_list.append('https://www.transfermarkt.us' + graph_link)

for url in final_url_list:
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    p = re.compile(r"'data':(.*)}\],")
    s = p.findall(r.text)[0]
    s = s.encode().decode('unicode_escape')
    data = ast.literal_eval(s)
    # rest of the code to write scraped info below this
I was wondering if this is generally considered a safe enough way to scrape a website like Transfermarkt if I add the time.sleep() method from the time library, as detailed here, to create a delay (long enough to allow the page to load, say 10 seconds) so I can scrape the 500 inputs successfully without raising any flags.
I would also forgo randomized clicks (which I think can only be done with Selenium/ChromeDriver) to mimic human behavior, and was wondering if that too would be okay to exclude and still scrape safely.
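To illustrate the kind of delay being discussed, here is a minimal sketch (not part of the original question) that wraps requests.get in a randomized sleep; the jitter bounds are arbitrary:

import random
import time
import requests

def polite_get(url, min_delay=5, max_delay=10):
    # sleep a random amount before each request so the access pattern is less regular
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})

# usage inside the loops above, e.g.:
# int_page = polite_get(i).text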

optimize my python bank webscraper

I am using Python 3.4 to make a web scraper that logs into my bank account, clicks into each account copying the balance, adds up the total, then pastes it into Google Sheets.
I got it working, but as you can see from the code it is repetitive, ugly and long-winded.
I have identified a few issues:
I believe I should be using a function to loop through the different account pages, get each balance, and assign the values to different variables, but I couldn't think of a way to do this.
Converting the string to a float seems messy. What I am trying to do is turn a string like $1,000.00 into a float by stripping the '$' and ','. Is there a more elegant way?
from selenium import webdriver
import time
import bs4
import gspread
from oauth2client.service_account import ServiceAccountCredentials

driver = webdriver.Chrome()
driver.get(bank_url)  # placeholder for the bank's login URL

inputElement = driver.find_element_by_id("dUsername")
inputElement.send_keys('username')
pwdElement = driver.find_element_by_id("password")
pwdElement.send_keys('password')
driver.find_element_by_id('loginBtn').click()
time.sleep(3)

# copies saving account balance
driver.find_element_by_link_text('Savings').click()
time.sleep(3)
html = driver.page_source
soup = bs4.BeautifulSoup(html)
elems = soup.select('#CurrentBalanceAmount')
SavingsAcc = float(elems[0].getText().strip('$').replace(',', ''))
driver.back()

# copy cheque balance
driver.find_element_by_link_text('cheque').click()
time.sleep(3)
html = driver.page_source
soup = bs4.BeautifulSoup(html)
elems = soup.select('#CurrentBalanceAmount')
ChequeAcc = float(elems[0].getText().strip('$').replace(',', ''))

Total = SavingsAcc + ChequeAcc
driver.back()
try the following code:
from selenium import webdriver
import time
import bs4
import gspread
from oauth2client.service_account import ServiceAccountCredentials

driver = webdriver.Chrome()
driver.get(bank_url)  # placeholder for the bank's login URL

inputElement = driver.find_element_by_id("dUsername")
inputElement.send_keys('username')
pwdElement = driver.find_element_by_id("password")
pwdElement.send_keys('password')
driver.find_element_by_id('loginBtn').click()
time.sleep(3)

def getBalance(accountType):
    driver.find_element_by_link_text(accountType).click()
    time.sleep(3)
    html = driver.page_source
    soup = bs4.BeautifulSoup(html)
    elems = soup.select('#CurrentBalanceAmount')
    return float(elems[0].getText().strip('$').replace(',', ''))

# copies saving account balance
SavingsAcc = getBalance('Savings')
driver.back()

# copy cheque balance
ChequeACC = getBalance('cheque')

Total = SavingsAcc + ChequeACC
driver.back()
I made a getBalance method that takes the account type and returns the balance amount.
Note: you can move the driver.back() call into getBalance if you prefer, but place it before the return statement.
As for converting the string to a float, I don't know of a better way than the existing logic. Now that it is moved into a method, I hope it won't trouble you much. There is the float() built-in, which converts a string to a float, but '$' and ',' are not accepted; more details here.
Note: if the #CurrentBalanceAmount value differs for different account types, you can parameterize it the same way as accountType.
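If you want something a bit more defensive than chained strip/replace for the float conversion, one option (an illustrative sketch, not from the original answer) is to drop everything except digits, the decimal point and a leading minus with a regular expression:

import re

def parse_amount(text):
    """Turn a currency string such as '$1,000.00' into a float."""
    cleaned = re.sub(r"[^0-9.\-]", "", text)   # removes '$', ',', spaces, currency codes, etc.
    return float(cleaned)

print(parse_amount("$1,000.00"))    # 1000.0
print(parse_amount("-$2,500.75"))   # -2500.75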
I would use several python idioms to clean up the code:
Wrap all code in functions
Generally speaking, putting your code in functions makes it easier to read and follow
When you run a python script (python foo.py), the python interpreter runs every line it can, in order, one by one. When it encounters a function definition, it only runs the definition line (def bar():), and not the code within the function.
This article seems like a good place to get more info on it: Understanding Python's Execution Model
Use the if __name__ == "__main__": idiom to make it an importable module
Similar to the above bullet, this gives you more control on how and when your code executes, how portable it is, and how reusable it is.
"Importable module" means you can write your code in one file, and then import that code in another module.
More info on if __name__ == "__main__" here: What does if __name__ == "__main__": do?
Use try/finally to make sure your driver instances get cleaned up
Use explicit waits to interact with the page so you don't need to use sleep
By default, Selenium tries to find and return things immediately. If the element hasn't loaded yet, Selenium throws an exception because it isn't smart enough to wait for it to load.
Explicit waits are built into Selenium, and allow your code to wait for an element to load into the page. By default it checks every half a second to see if the element loaded in. If it hasn't, it simply tries again in another half second. If it has, it returns the element. If it doesn't ever load in, the Wait object throws a TimeoutException.
More here: Explicit and Implicit Waits
And here: WAIT IN SELENIUM PYTHON
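For reference, the same idea using only Selenium's built-in WebDriverWait and expected_conditions looks roughly like this (a minimal sketch that assumes an existing driver; the element ID comes from the code below and the 10-second timeout is arbitrary):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the balance element to appear (polling every 0.5s by default),
# then read its text; raises TimeoutException if it never shows up.
amount_str = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "CurrentBalanceAmount"))
).text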
Code (untested for obvious reasons):
from selenium import webdriver
from explicit import waiter, ID  # This package makes explicit waits easier to use
                                 # pip install explicit
from selenium.webdriver.common.by import By

# Are any of these needed?
# import time
# import bs4
# import gspread
# from oauth2client.service_account import ServiceAccountCredentials

def bank_login(driver, username, password):
    """Log into the bank account"""
    waiter.find_write(driver, 'dUsername', username, by=ID)
    waiter.find_write(driver, 'password', password, by=ID, send_enter=True)

def get_amount(driver, source):
    """Click the page and scrape the amount"""
    # Click the page in question
    waiter.find_element(driver, source, by=By.LINK_TEXT).click()

    # Why are you using BeautifulSoup? Because it is faster?
    # time.sleep(3)
    # html = driver.page_source
    # soup = bs4.BeautifulSoup(html)
    # elems = soup.select('#CurrentBalanceAmount')
    # SavingsAcc = float(elems[0].getText().strip('$').replace(',',''))
    # driver.back()

    # I would do it this way:
    # When using explicit waits there is no need to explicitly sleep
    amount_str = waiter.find_element(driver, "CurrentBalanceAmount", by=ID).text

    # This conversion scheme will handle non-$ characters too
    amount = float("".join([char for char in amount_str if char in "1234567890."]))
    driver.back()
    return amount

def main():
    driver = webdriver.Chrome()
    try:
        driver.get(bank_url)
        bank_login(driver, 'username', 'password')
        print(sum([get_amount(driver, source) for source in ['Savings', 'cheque']]))
    finally:
        driver.quit()  # Use this try/finally idiom to prevent a bunch of dead browser instances

if __name__ == "__main__":
    main()
Full disclosure: I maintain the explicit package. You could replace the waiter calls above with relatively short Wait calls if you would prefer. If you are using Selenium with any regularity it is worth investing the time to understand and use explicit waits.

Multiprocessing for browser in python

I want to hit a website at a very high frequency like 100 hits/sec.
In my code below, I have a sample process that opens google.com, finds an element, records the time, and then does some more random operations.
I observed that even for 5 requests, the recorded times vary across several seconds.
Example:
17:38:04
17:38:05
17:38:05
17:38:05
17:38:06
I need 100 requests to show the same time. Please improve my code or suggest something else to achieve this. I am open to using any technology that can help me reach 100 hits per second. My application is browser based; it needs to submit an HTML form and call an API in the background.
import multiprocessing as mp
from selenium import webdriver
import time

def run_test(params):
    driver = webdriver.Firefox()
    driver.get("http://google.com")
    a = driver.find_element_by_name("q")
    print(time.ctime())
    a.send_keys("priyanka chopra")
    a = driver.find_element_by_name("btnG")
    a.click()
    driver.quit()

if __name__ == '__main__':
    count = 5
    parameters = range(count)
    pool = mp.Pool(len(parameters))
    pool.map(run_test, parameters)
    time.sleep(5)
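Not from the original thread, but as a rough illustration of the kind of rate plain HTTP calls can reach compared with launching a browser per hit, assuming the form or API endpoint can be posted to directly (the URL and payload below are placeholders):

from concurrent.futures import ThreadPoolExecutor
import time
import requests

TARGET_URL = "http://example.com/api"   # placeholder endpoint
PAYLOAD = {"q": "priyanka chopra"}      # placeholder form data

def hit(_):
    stamp = time.ctime()
    requests.post(TARGET_URL, data=PAYLOAD, timeout=5)
    return stamp

with ThreadPoolExecutor(max_workers=100) as ex:
    timestamps = list(ex.map(hit, range(100)))

# if the endpoint responds quickly, most of these timestamps share the same second
print(timestamps)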
