How to set timeout for PhantomJS? - python

Here's the code I have for setting it:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.resourceTimeout"] = ("5000")
driver = webdriver.PhantomJS(desired_capabilities=dcap)
However, in my super long script it doesn't seem to time out when my internet is slow and a page takes longer than 5 seconds to load.
There is very little documentation on PhantomJS timeouts, and even less of it for Python, so I figure maybe this isn't even the way to do it.
Has anyone successfully set a timeout with PhantomJS in Python?
Thanks!

Instead of targeting the PhantomJS resource timeout, you can set a page-load timeout on the driver itself, as in this Firefox example:
browser = webdriver.Firefox()
browser.set_page_load_timeout(30)
You can change 30 to any number of seconds.
Thanks
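
The same driver-level API should work for PhantomJS too, since set_page_load_timeout is a generic WebDriver method. A minimal sketch, assuming the phantomjs binary is on your PATH (the URL is just a placeholder):

from selenium import webdriver
from selenium.common.exceptions import TimeoutException

driver = webdriver.PhantomJS()
driver.set_page_load_timeout(5)  # seconds; matches the 5000 ms from the question
try:
    driver.get("http://example.com")
except TimeoutException:
    print("Page took longer than 5 seconds to load.")
finally:
    driver.quit()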

You could use Python's built-in time.sleep to wait and retry a few times.
import time
time.sleep(5)  # sleep takes the number of seconds to wait as an argument
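
For example, a small retry helper built on sleep might look like this (the function name and the retry/wait values are illustrative, not from the question):

import time
from selenium.common.exceptions import TimeoutException

def get_with_retries(driver, url, retries=3, wait_seconds=5):
    # Try to load url up to `retries` times, sleeping between attempts.
    for _ in range(retries):
        try:
            driver.get(url)
            return True
        except TimeoutException:
            time.sleep(wait_seconds)  # back off before the next attempt
    return False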

import sys
import atexit
import platform
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

FIREFOX = 'firefox'
PHANTOM = 'phantom'
NO_IMAGES = False
NEED_IMAGES = True
opened_pages_counter = 0
driver = None
details = {}
user_agent_mozilla = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
print("[x] UNIVERSAL_DRIVER module loaded")

if platform.system() == 'Windows':
    PHANTOMJS_PATH = 'd:/bin/phantomjs/bin/phantomjs.exe'
else:
    PHANTOMJS_PATH = './phantomjs'

@atexit.register
def cleanup():
    print("universal_driver:: on_exit")
    closeDriver()

# PHANTOMJS_PATH = 'd:/bin/phantomjs/bin/phantomjs.exe'
# driver_test = buildDriver(FIREFOX, NO_IMAGES)
# timeout needs to be in seconds
# driver_test = buildDriver(FIREFOX, NEED_IMAGES, timeout=100, width=100, height=50)
def buildDriver(driverType, needImages, **kwargs):
    global driver, details
    closeDriver()
    timeout = 60
    width = 800
    height = 600
    x = 0
    y = 0
    for key in kwargs:
        print("another keyword arg: %s: %s" % (key, kwargs[key]))
        if key == "timeout": timeout = int(float(kwargs[key]))
        if key == "width": width = int(float(kwargs[key]))
        if key == "height": height = int(float(kwargs[key]))
        if key == "x": x = int(float(kwargs[key]))
        if key == "y": y = int(float(kwargs[key]))
    details['driverType'] = driverType
    details['needImages'] = needImages
    if driverType == FIREFOX:
        if driver is None:
            firefox_profile = webdriver.FirefoxProfile()
            if not needImages:
                firefox_profile.set_preference('permissions.default.image', 2)
                firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
            firefox_profile.set_preference("http.response.timeout", timeout)
            firefox_profile.set_preference("dom.max_script_run_time", timeout)
            driver = webdriver.Firefox(firefox_profile=firefox_profile)
            driver.set_window_size(width, height)
        return driver
    if driverType == PHANTOM:
        if driver is None:
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = user_agent_mozilla
            dcap["phantomjs.page.settings.resourceTimeout"] = timeout * 1000  # in milliseconds
            # dcap["phantomjs.page.settings.loadImages"] = needImages
            # driver = webdriver.PhantomJS(desired_capabilities=dcap)
            service_argsA = []
            if not needImages:
                service_argsA = ['--load-images=no']
            driver = webdriver.PhantomJS(PHANTOMJS_PATH, desired_capabilities=dcap, service_args=service_argsA)
            driver.set_window_size(width, height)
        return driver

def openPage(url):  # prevents opening the same page twice
    global driver
    global opened_pages_counter
    if driver is None:
        driver = buildDriver(PHANTOM, NO_IMAGES)
    if driver.current_url != url:
        driver.get(url)
        opened_pages_counter = opened_pages_counter + 1
        uprint("universal_driver::", details['driverType'], ", needImages:", details['needImages'], " ; page opened:", url)

def closeDriver():
    global driver
    if driver is None:
        return
    driver.close()
    driver.quit()
    driver = None
    uprint("universal_driver:: driver closed")

def uprint(*objects, sep=' ', end='\n', file=sys.stdout):
    # print() that survives non-UTF-8 consoles by backslash-escaping unencodable characters
    enc = file.encoding
    if enc == 'UTF-8':
        print(*objects, sep=sep, end=end, file=file)
    else:
        f = lambda obj: str(obj).encode(enc, errors='backslashreplace').decode(enc)
        print(*map(f, objects), sep=sep, end=end, file=file)
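
A usage sketch for the module above (the URL and the 5-second value are illustrative):

driver = buildDriver(PHANTOM, NO_IMAGES, timeout=5)  # 5 s becomes a 5000 ms resourceTimeout
openPage('http://example.com')
closeDriver()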

Related

Using Selenium to get a list of Instagram followers

I am trying to take an account with over 1,000,000 followers on Instagram and add their usernames to a txt file. I am trying to use Selenium for this, but my authorization fails every time I log in. Any advice on how to get around this? I assume the site believes this is a bot, but I'm not sure.
from selenium import webdriver as web
from selenium.webdriver.common.keys import Keys
import time
import random
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
bot_username = 'null'
bot_password = 'Null'
profiles = ['Enter Here']
amount = 300
# 'usernames' or 'links'
result = 'usernames'
us = ''
class Instagram():
    def __init__(self, username, password):
        self.username = username
        self.password = password
        options = Options()
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        self.browser = web.Chrome("chromedriver", options=options)
        self.browser.set_window_size(400, 900)

    def close_browser(self):
        self.browser.close()
        self.browser.quit()

    def login(self):
        browser = self.browser
        try:
            browser.get('https://www.instagram.com')
            time.sleep(random.randrange(3, 5))
            # Enter username:
            username_input = browser.find_element_by_name('username')
            username_input.clear()
            username_input.send_keys(self.username)
            time.sleep(random.randrange(2, 4))
            # Enter password:
            password_input = browser.find_element_by_name('password')
            password_input.clear()
            password_input.send_keys(self.password)
            time.sleep(random.randrange(1, 2))
            password_input.send_keys(Keys.ENTER)
            time.sleep(random.randrange(3, 5))
            print(f'[{self.username}] Successfully logged on!')
        except Exception as ex:
            print(f'[{self.username}] Authorization fail')
            self.close_browser()

    def xpath_exists(self, url):
        browser = self.browser
        try:
            browser.find_element_by_xpath(url)
            exist = True
        except NoSuchElementException:
            exist = False
        return exist

    def get_followers(self, users, amount):
        browser = self.browser
        followers_list = []
        for user in users:
            browser.get('https://instagram.com/' + user)
            time.sleep(random.randrange(3, 5))
            followers_button = browser.find_element_by_xpath('/html/body/div[1]/section/main/div/ul/li[2]/a/span')
            count = followers_button.get_attribute('title')
            if ',' in count:
                count = int(''.join(count.split(',')))
            else:
                count = int(count)
            if amount > count:
                print(f'You set amount = {amount} but there are {count} followers, then amount = {count}')
                amount = count
            followers_button.click()
            loops_count = int(amount / 12)
            print(f'Scraping. Total: {amount} usernames. Wait {loops_count} iterations')
            time.sleep(random.randrange(8, 10))
            followers_ul = browser.find_element_by_xpath("/html/body/div[6]/div/div/div[2]")
            time.sleep(random.randrange(5, 7))
            try:
                for i in range(1, loops_count + 1):
                    browser.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", followers_ul)
                    time.sleep(random.randrange(8, 10))
                    all_div = followers_ul.find_elements_by_tag_name("li")
                    for us in all_div:
                        us = us.find_element_by_tag_name("a").get_attribute("href")
                        if result == 'usernames':
                            us1 = us.replace("https://www.instagram.com/", "")
                            us = us1.replace("/", "")
                        followers_list.append(us)
                        time.sleep(1)
                    f3 = open('userlist.txt', 'w')
                    for username in followers_list:  # renamed from `list` to avoid shadowing the built-in
                        f3.write(username + '\n')
                    f3.close()  # close so the file is flushed to disk
                    print(f'Got: {len(followers_list)} usernames of {amount}. Saved to file.')
                    time.sleep(random.randrange(3, 5))
            except Exception as ex:
                print(ex)
        self.close_browser()
        return followers_list
bot = Instagram(bot_username, bot_password)
bot.login()
followers = bot.get_followers(profiles, amount)
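
One hedged improvement for the flaky login is to replace fixed sleeps with explicit waits, so keys are only sent once the fields are interactable. A sketch of a drop-in login method (the field names 'username' and 'password' come from the question's code; the 20-second timeout is an assumption):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def login(self):
    browser = self.browser
    browser.get('https://www.instagram.com')
    wait = WebDriverWait(browser, 20)  # assumed timeout
    # Wait until each field is actually clickable instead of sleeping a fixed time
    username_input = wait.until(EC.element_to_be_clickable((By.NAME, 'username')))
    username_input.send_keys(self.username)
    password_input = wait.until(EC.element_to_be_clickable((By.NAME, 'password')))
    password_input.send_keys(self.password)
    password_input.send_keys(Keys.ENTER)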

How to fix 'Options' object has no attribute 'add_experimental_option' in Selenium Python

Environment:
Python 3.10
selenium==4.3.0
Windows 10
Problem
I am trying to make a selenium script with Chrome.
chrome_options = Options()
chrome_options.add_argument(r"--user-data-dir=" + chrome_profile_path)
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-application-cache")
chrome_options.add_argument("--disable-session-crashed-bubble")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--start-maximized")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
I execute the code from my Windows terminal. It raises this error:
chrome_profile_path : C:\Users\gauth\AppData\Local\Google\Chrome\User Data
You don't have the Chrome browser installed on your computer or we didn't find your profile folder! : 'Options' object has no attribute 'add_experimental_option'
So in the same terminal, I executed python and typed these lines below in order to reproduce the issue:
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
And I didn't get any issue.
So why do I get the issue when I execute my script and not when I execute line by line in Python terminal?
Here is the full code:
import os
import platform
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def ChromeDriverWithProfile():
    """
    This function will return the chrome driver with the Chrome profile loaded
    """
    try:
        if platform.system() == 'Linux':
            chrome_profile_path = os.environ['HOME'] + "/.config/google-chrome"
            chrome_profile_path = os.path.expandvars(chrome_profile_path)
        elif platform.system() == 'Darwin':
            if not os.environ['HOME'] or os.environ['HOME'] == "":
                HOME_DIR = os.path.expanduser("~")
            else:
                HOME_DIR = os.environ['HOME']
            chrome_profile_path = HOME_DIR + "/Library/Application Support/Google/Chrome/Default"
            chrome_profile_path = os.path.expandvars(chrome_profile_path)
        else:
            chrome_profile_path = r"%LocalAppData%\Google\Chrome\User Data"
            chrome_profile_path = os.path.expandvars(chrome_profile_path)
        print(f"chrome_profile_path : {chrome_profile_path}")
        chrome_options = Options()
        chrome_options.add_argument(r"--user-data-dir=" + chrome_profile_path)
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-application-cache")
        chrome_options.add_argument("--disable-session-crashed-bubble")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])  # !!!! THE ISSUE COMES FROM THIS LINE !!!!
        chrome_options.add_experimental_option('useAutomationExtension', False)
        # overcome limited resource problems
        chrome_options.add_argument("--disable-dev-shm-usage")
        # Bypass OS security model
        chrome_options.add_argument("--no-sandbox")
        # We need to remove the 'Restore pages' bubble popup of Chrome:
        # https://dev.to/cuongld2/get-rid-of-chrome-restore-bubble-popup-when-automate-gui-test-using-selenium-3pmh
        if platform.system() == 'Linux':
            preference_file = chrome_profile_path + "/Default/Preferences"
        elif platform.system() == 'Darwin':
            preference_file = chrome_profile_path
        else:
            preference_file = chrome_profile_path + "\\Default\\Preferences"
        string_to_be_change = '"exit_type":"Crashed"'
        new_string = '"exit_type": "none"'
        # read input file
        fin = open(preference_file, "rt")
        # read file contents to string
        data = fin.read()
        # replace all occurrences of the required string
        data = data.replace(string_to_be_change, new_string)
        # close the input file
        fin.close()
        # open the input file in write mode
        fin = open(preference_file, "wt")
        # overwrite the input file with the resulting data
        fin.write(data)
        # close the file
        fin.close()
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
        return driver
    except Exception as ex:
        print(
            f"You don't have the Chrome browser installed on your computer or we didn't find your profile folder! : {ex}")
        return None

driver = ChromeDriverWithProfile()
I searched for this issue "'Options' object has no attribute 'add_experimental_option'" on the web. There isn't anything. Obviously, I am the only one on earth who has this issue. :-(
Can you help me, please?
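One quick check, in case the script accidentally shadows the Chrome Options class with another one (a guess, not a confirmed diagnosis), is to print where the name actually resolves from at the point of failure:

from selenium.webdriver.chrome.options import Options

print(Options.__module__)                             # expect: selenium.webdriver.chrome.options
print(hasattr(Options(), "add_experimental_option"))  # expect: True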

Save screenshot on test failure in python with 'splinter'

I am trying to save a screenshot on test failure in Python with 'splinter'.
1) This code works with Selenium:
# @pytest.fixture(scope="function")
# def browser(request):
#     options = Options()
#     options.add_argument("--headless")
#     options.add_argument("--start-maximized")
#     # browser = webdriver.Chrome(ChromeDriverManager().install())
#     browser = webdriver.Chrome(options=options)
#     browser.implicitly_wait(5)
#     failed_before = request.session.testsfailed
#     yield browser
#     if request.session.testsfailed != failed_before:
#         test_name = request.node.name
#         take_screenshot(browser, test_name)
#     browser.quit()
#
# def take_screenshot(browser, test_name):
#     screenshots_dir = "C:\\Users\\Ark\\PycharmProjects\\Gop\\Reports"
#     screenshot_file_path = "{}/{}.png".format(screenshots_dir, test_name)
#     browser.save_screenshot(screenshot_file_path)
But it doesn't work with Splinter (the browser doesn't close and no screenshot is taken):
@pytest.fixture(scope="function")
def browser(request):
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    browser = Browser("chrome", headless=False, incognito=True, options=options)
    failed_before = request.session.testsfailed
    yield browser
    if request.session.testsfailed != failed_before:
        test_name = request.node.name
        take_screenshot(browser, test_name)
    browser.quit()

def take_screenshot(browser, test_name):
    screenshots_dir = "C:\\Users\\Ark\\PycharmProjects\\Gop\\Reports"
    screenshot_file_path = "{}/{}.png".format(screenshots_dir, test_name)
    browser.save_screenshot(screenshot_file_path)
    print("\n!!! SCREENSHOT OF FAILURE '" + test_name + "' SAVED INTO: '" + screenshots_dir + "' WITH NAME '" + test_name + "'")
2) Or how does this function work? (pytest-splinter)
splinter_make_screenshot_on_failure
https://github.com/pytest-dev/pytest-splinter
Can you help?
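One hedged workaround, assuming splinter's Browser exposes the underlying Selenium WebDriver as .driver (as recent splinter versions do), is to take the screenshot through the wrapped driver instead:

def take_screenshot(browser, test_name):
    screenshots_dir = "C:\\Users\\Ark\\PycharmProjects\\Gop\\Reports"
    screenshot_file_path = "{}/{}.png".format(screenshots_dir, test_name)
    # splinter's Browser wraps a Selenium WebDriver; call save_screenshot on the wrapped driver
    browser.driver.save_screenshot(screenshot_file_path)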

Creating Multiple Instances of a Selenium Scraper Class and Running Them in Parallel

So I have created a web scraper with Selenium that infinitely crawls a web page. I am trying to create two instances of this scraper and run them in parallel, so that two different portions of the site (or two different sites entirely) are scraped at the same time. With my current code, both processes start and two Chrome instances launch, but only one actually starts scraping. The other just sits on the landing page and never moves. My current scraper class looks like this:
import re
import time
from random import randint
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException

# Scraper is the question author's own base class, defined elsewhere in this module
class clBot(Scraper):
    def __init__(self, light_or_dark):
        light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
                             '//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
        dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
        if light_or_dark == "light":
            self.xpaths_to_scrape = light_side_xpaths
            self.csv_file = "lightside.csv"
        elif light_or_dark == "dark":
            self.xpaths_to_scrape = dark_side_xpaths
            self.csv_file = "darkside.csv"
        else:
            print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
            quit()
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
        self.options = webdriver.ChromeOptions()
        # self.options.add_argument('--headless')
        self.options.add_argument(f'user-agent={self.user_agent}')  # f-string so the user agent is interpolated
        self.current_region = ''
        self.driver = webdriver.Chrome(chrome_options=self.options)
        self.driver.get('https://craigslist.org')

    def run(self):
        self.navigate_pages()

    def identify_phone_number(self, string, phone_number_list):
        reg = re.findall(r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
        if len(reg) > 0:
            for r in reg:
                if r.strip() not in phone_number_list:
                    with open(self.csv_file, 'a') as csv:
                        csv.write("{}\n".format(r.strip()))
                    print("Extracted {} from listing".format(r.strip()))
                else:
                    print('Phone number already in list.')

    def extract_phone_number(self):
        try:
            with open(self.csv_file, 'r') as csv:
                current_phone_numbers = csv.read()
            posting_body = self.driver.find_element_by_id('postingbody')
            self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
            contact_info = self.driver.find_element_by_class_name('showcontact')
            contact_info.click()
            time.sleep(1)
            self.identify_phone_number(posting_body.text, current_phone_numbers)
        except TimeoutException:
            self.identify_phone_number(posting_body.text, current_phone_numbers)
            print('There is no phone number in this listing.')

    def scrape_pages(self):
        i = 1
        while True:
            try:
                self.scraper_wait_class_until_all(self.driver, 'result-row')
                results = self.driver.find_elements_by_class_name('result-row')
                print("clicking result {}".format(i))
                results[i].find_element_by_class_name('result-title').click()
                self.extract_phone_number()
                self.driver.back()
                i += 1
            except IndexError:
                self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                print('Navigating to next page.')
                next_button.click()
                i = 1

    def choose_xpath_to_scrape(self, list_of_xpaths):
        xpath_index = randint(0, len(list_of_xpaths) - 1)
        xpath = list_of_xpaths[xpath_index]
        return xpath

    def navigate_pages(self):
        try:
            while True:
                try:
                    self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
                    rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
                    nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
                    child_items = nearby_cl.find_elements_by_class_name('s')
                    random = randint(1, len(child_items) - 1)
                    time.sleep(3)
                    print("Clicking {}".format(child_items[random].text))
                    child_items[random].click()
                    for xpath in self.xpaths_to_scrape:
                        area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
                        area_to_scrape.click()
                        self.scrape_pages()
                        self.driver.back()
                        time.sleep(1)
                except WebDriverException:
                    continue
        except Exception as e:
            print(e)
            return
        finally:
            self.driver.quit()
and the main.py file that opens the two processes and initializes them is as follows:
import scraper
from multiprocessing import Process, Manager
if __name__ == "__main__":
manager = Manager()
d = manager.dict()
l = manager.list(range(10))
darksideScraper = scraper.clBot('light')
lightsideScraper = scraper.clBot('dark')
darkside = Process(target=darksideScraper.navigate_pages())
lightside = Process(target=lightsideScraper.navigate_pages())
darkside.start()
lightside.start()
darkside.join()
lightside.join()
Any help would be appreciated!
Try passing your target as a reference to the function instead of calling it, like this: Process(target=darksideScraper.navigate_pages). Also refer to this for another example of how to use multiprocessing.
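A minimal sketch of the corrected main.py (the run_bot helper is illustrative; it also constructs each clBot inside its own child process, so each process owns its own browser rather than sharing one created in the parent):

import scraper
from multiprocessing import Process

def run_bot(light_or_dark):
    # build the driver inside the child process so each process owns its own browser
    bot = scraper.clBot(light_or_dark)
    bot.navigate_pages()

if __name__ == "__main__":
    lightside = Process(target=run_bot, args=('light',))
    darkside = Process(target=run_bot, args=('dark',))
    lightside.start()
    darkside.start()
    lightside.join()
    darkside.join()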

web scraping with python, with navigation controller

I am new to Python and I need help with web-scraping code to save a dynamic map every week.
This is the site I am interested in.
The purpose is to get to the page, select the season, select the week, and download the image to a local folder. I'll use the image in an automated weekly report built with SAS.
Thank you in advance!
import sys
import os
import time
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium import webdriver
import arrow

BASE_URL = 'https://gis.cdc.gov/grasp/fluview/main.html'
DOWNLOAD_PATH = "/Users/"

def closeWebDriver(driver):
    if os.name == 'nt':
        driver.quit()
    else:
        driver.close()

def getImage():
    profile = FirefoxProfile()
    profile.set_preference("browser.download.panel.shown", False)
    profile.set_preference("browser.helperApps.neverAsk.openFile", "image/png")
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "image/png")
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.dir", DOWNLOAD_PATH)
    driver = webdriver.Firefox(firefox_profile=profile)
    driver.get(BASE_URL)
    time.sleep(5)
    if not isValidTimeFrame(driver):
        print('Not the time to download yet!')
        closeWebDriver(driver)
        return
    selectFirstWeek(driver)
    print('- Consume the web.')
    wrapper = driver.find_element_by_class_name('downloads-help-area')
    download_img_els = wrapper.find_elements_by_class_name('downloads-button')
    for el in download_img_els:
        text = el.text  # keep as str: encoding to bytes would make the comparison below always False
        # print(text)
        if 'download image' == text.strip().lower():
            # Process
            downloadImage(el)
            break
    time.sleep(5)
    closeWebDriver(driver)

def isValidTimeFrame(driver):
    seasons_button = driver.find_element_by_class_name('seasons-button')
    time_frame = seasons_button.text.strip().lower()  # str, so it can compare equal to compare_year
    current_year = arrow.now().to('local')
    current_year_str = current_year.format('YYYY')
    next_year = current_year.shift(years=1)
    next_year_str = next_year.format('YY')
    print(time_frame)
    compare_year = '%s-%s' % (current_year_str, next_year_str)
    return time_frame == compare_year

def selectFirstWeek(driver):
    prev = driver.find_element_by_id('prevMap')
    week = driver.find_element_by_id('weekSlider')
    while True:
        print(week)
        current_number = week.get_property('value')
        print('- Week: ' + current_number)
        prev.click()
        if int(current_number) < 2:
            break
        time.sleep(1)

def downloadImage(el):
    print('- Click on ' + el.text)
    el.click()

getImage()
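
As a possible refinement, the fixed time.sleep(5) before closing the driver could be replaced by polling the download directory until the file actually lands. A sketch, assuming Firefox marks in-progress downloads with a .part suffix (the function name and 30-second timeout are illustrative):

def waitForDownload(path, timeout=30):
    # Poll until a .png exists and no Firefox .part file remains (an assumption
    # about how Firefox names partial downloads)
    deadline = time.time() + timeout
    while time.time() < deadline:
        files = os.listdir(path)
        if any(f.endswith('.png') for f in files) and not any(f.endswith('.part') for f in files):
            return True
        time.sleep(1)
    return False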
