I'm using Selenium to do headless scraping of a website within an endpoint of a Flask API in Python. I made several tests and my Selenium scraping code works perfectly as a standalone script and while running as an API on localhost. However, when I deploy the code to a remote server, the requests always return a 502 Bad Gateway error. It is weird because the logging shows that the scraping is working correctly, but the server responds with 502 before the scraping finishes processing, as if it were trying to set up a proxy and failing. I also noticed that removing the time.sleep in my code makes it return a 200, although the result could be wrong because it doesn't give Selenium enough time to load the whole page to scrape.
I also tried using Falcon instead of Flask and I get a similar error. This is a sample of my recent code using Falcon:
import json
import time

import falcon
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


class GetUrl(object):
    def on_get(self, req, resp):
        """
        Get Request
        :param req:
        :param resp:
        :return:
        """
        # read the requested url from the JSON body
        req_body = req.bounded_stream.read()
        json_data = json.loads(req_body.decode('utf8'))
        url = json_data.get("url")
        # open the url in headless Firefox
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        time.sleep(5)
        result = False
        # check for outbound links
        content = driver.find_elements_by_xpath("//a[@class='_52c6']")
        if len(content) > 0:
            href = content[0].get_attribute("href")
            result = True
        driver.quit()
        # build the response
        return_doc = {"result": result}
        resp.body = json.dumps(return_doc, sort_keys=True, indent=2)
        resp.content_type = 'text/string'
        resp.append_header('Access-Control-Allow-Origin', "*")
        resp.status = falcon.HTTP_200
I saw some other similar issues like this one, but even though I can see that there is a gunicorn process running on my server, I don't have nginx, or at least it is not running where it should be running, and I don't think Falcon uses it. So, what exactly am I doing wrong? Some light on this issue is highly appreciated, thank you!
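(One thing that may be worth checking, as a guess rather than a confirmed diagnosis: gunicorn kills its sync workers after a default timeout of 30 seconds, and whatever sits in front of the worker then tends to report a 502, which would fit the symptom that removing time.sleep turns the 502 into a 200. A hedged sketch of raising the timeout, assuming the WSGI entry point is app:app:

# raise gunicorn's worker timeout (default is 30s) so a slow Selenium
# scrape can finish before the worker is killed; app:app is an assumption
gunicorn --timeout 120 app:app
)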
This might work:
from IPython.display import clear_output
import time
import json
import falcon
!apt-get update
!apt install chromium-chromedriver
!which chromedriver
!pip install selenium
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import presence_of_element_located
!pip install page_objects
import page_objects
from page_objects import PageObject, PageElement
time.sleep(1)
clear_output()
class GetUrl(object):
    def on_get(self, req, resp):
        """
        Get Request
        :param req:
        :param resp:
        :return:
        """
        # read the requested url from the JSON body
        req_body = req.bounded_stream.read()
        json_data = json.loads(req_body.decode('utf8'))
        url = json_data.get("url")
        # open the url in headless Chrome
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=options)
        driver.implicitly_wait(3)
        driver.get(url)
        result = False
        # collect the text of every link on the page
        contentStorage = []
        content = driver.find_elements(By.TAG_NAME, 'a')
        for i in content:
            contentStorage.append(i.get_attribute('text'))
        result = True
        #driver.quit()
        # build the response
        return_doc = {"result": result}
        resp.body = json.dumps(return_doc, sort_keys=True, indent=2)
        resp.content_type = 'text/string'
        resp.append_header('Access-Control-Allow-Origin', "*")
        resp.status = falcon.HTTP_200
However, I was testing it without using a class object, and it also uses Chrome instead of Firefox:
from IPython.display import clear_output
import time
!apt-get update
!apt install chromium-chromedriver
!which chromedriver
!pip install selenium
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import presence_of_element_located
!pip install page_objects
import page_objects
from page_objects import PageObject, PageElement
time.sleep(1)
clear_output()
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(3)
driver.get('https://stackoverflow.com/questions/69038958/selenium-flask-falcon-in-python-502-bad-gateway-error/69546175#69546175')
content = driver.find_elements(By.TAG_NAME, 'a')
contentStorage = []
for i in content:
    contentStorage.append(i.get_attribute('text'))
#driver.quit()
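One tweak that follows from the question's symptom: a fixed time.sleep(5) keeps the request handler blocked for the full five seconds even when the page is ready sooner. An explicit wait returns as soon as the element shows up. A minimal sketch, reusing the driver from the snippet above and the '_52c6' link class from the question (an assumption about the target page):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10s for the links to appear instead of always sleeping 5s;
# the '_52c6' class comes from the original question and may not fit your page
wait = WebDriverWait(driver, 10)
links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='_52c6']")))
result = len(links) > 0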
I am trying to download pictures from a clothing website for academic research, using the code below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
import time
import os
import wget
import random
delay_choices = range(5,15)
delay = random.choice(delay_choices)
import requests
from fake_useragent import UserAgent
keyword = "jeans"
user_agent = UserAgent()
response = requests.get(url="https://www2.hm.com/en_asia3/ladies/shop-by-product/jeans.html", headers={ 'user-agent': user_agent.random })
driver = webdriver.Chrome("~~~~")
driver.get("https://www2.hm.com/en_asia3/ladies/shop-by-product/jeans.html")
time.sleep(4)
cookie = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
cookie.click()
time.sleep(2)
for i in range(6):
    driver.execute_script("window.scrollTo(0, 6900);")
    time.sleep(delay)
    loadmore = driver.find_element(By.XPATH, "/html/body/main/div/div/div/div[3]/div[2]/button")
    loadmore.click()
imgs = driver.find_elements(By.CLASS_NAME, 'item-image')
path = os.path.join("H&M" + keyword)
os.mkdir(path)
count = 0
for img in imgs:
    save_as = os.path.join(path, keyword + str(count) + '.jpg')
    #print(img.get_attribute("src"))
    wget.download(img.get_attribute("src"), save_as)
    count += 1
    time.sleep(6)
driver.quit()
and I got an error on this line: wget.download(img.get_attribute('src'), save_as)
I have also used "src" to download from other websites and nothing went wrong there, so I wonder if anyone knows what is happening. 😢
Thanks a lot.
I have searched and couldn't solve this problem, and I hope someone can give me some advice.
The URL you are trying to download with wget contains a lot of special characters, and these can cause problems for wget. This is an example of the URL you are attempting to download from: https://lp2.hm.com/hmgoepprod?set=source[/2b/bf/2bbf11a29fde773adcdOK],res[y],hmver[1]&call=url[file:/product/main]
Try to change the command a bit:
Instead of this:
wget.download(img.get_attribute("src"), save_as)
try this:
wget.download(f'"{img.get_attribute("src")}"', save_as)
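If wget still chokes even with the quoting, a fallback is to fetch the bytes with requests, which the question's code already imports. A sketch reusing the img, save_as, and user_agent names from the question:

import requests

# download the image bytes directly and write them to disk; this sidesteps
# wget's handling of the brackets and commas in the hmgoepprod URLs
src = img.get_attribute("src")
resp = requests.get(src, headers={"user-agent": user_agent.random}, timeout=30)
resp.raise_for_status()
with open(save_as, "wb") as f:
    f.write(resp.content)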
I have a selenium-python automation test and I am generating HTML/JSON reports using pytest. I want to add the response time to the HTML/JSON report. Is this even possible?
Following is my code:
test_screenshot.py
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pytest_html
from selenium.common.exceptions import InvalidSessionIdException

def test_Openurl(setup):
    driver = setup["driver"]
    url = setup["url"]
    try:
        before_time = datetime.now().strftime('%H%M%S%f')  # Timestamp
        driver.get(url)
        now_time = datetime.now().strftime('%H%M%S%f')  # Timestamp
        response_time = int(now_time) - int(before_time)
    except Exception as e:
        print(e)
    assert driver.current_url == url
    driver.save_screenshot("ss.png")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.save_screenshot("ss1.png")
    driver.close()
conftest.py
import pytest
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

def pytest_addoption(parser):
    parser.addoption("--url", action="store", default="https://google.com/")

@pytest.fixture()
def setup(pytestconfig):
    s = Service("C:/Users/Yash/Downloads/chromedriver_win32/chromedriver.exe")
    driver = webdriver.Chrome(service=s)
    driver.maximize_window()
    yield {"driver": driver, "url": pytestconfig.getoption("url")}
Following is the command I am using to generate the reports:
pytest -v -s --json-report --json-report-indent=4 --json-report-file=report/report.json --html=report/report.html test_screenshot.py
You can use one of the pytest-json-report hooks to do this, with something along the lines of this in your conftest.py file:
import pytest
from datetime import datetime
from pytz import timezone

@pytest.hookimpl(optionalhook=True)
def pytest_json_runtest_metadata(item, call):
    """
    Hook from the pytest-json-report plugin that will add your info.
    """
    if call.when != 'call':
        return {}
    # collect the start and finish times in ISO format for the US/Eastern timezone
    start_iso_dt = timezone('US/Eastern').localize(datetime.fromtimestamp(call.start)).isoformat()
    stop_iso_dt = timezone('US/Eastern').localize(datetime.fromtimestamp(call.stop)).isoformat()
    return {'start': start_iso_dt, 'stop': stop_iso_dt}
That will end up in your JSON report's metadata. (My code needed the US/Eastern timezone; you can obviously adjust accordingly, or just do the difference calculation in this function and return {'response_time': mydiff}.)
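As a minimal sketch of that response_time variant (same conftest.py, assuming pytest-json-report is installed):

import pytest

@pytest.hookimpl(optionalhook=True)
def pytest_json_runtest_metadata(item, call):
    """Attach the duration of the test call (in seconds) to the JSON metadata."""
    if call.when != 'call':
        return {}
    # call.start and call.stop are epoch timestamps supplied by pytest
    return {'response_time': call.stop - call.start}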
I have no idea why the following code reports the proxy as invalid, but only for the Chrome browser. Help is appreciated. Below are the imports and the code:
import requests
import json
import time
import random
import threading
from threading import Thread
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from datetime import datetime
from proxymanager import ProxyManager
from random import randint
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.chrome.options import Options
def getProxy():
    try:
        proxy_manager = ProxyManager('proxies.txt')
        proxydict = proxy_manager.random_proxy()
        proxies = proxydict.get_dict()
    except:
        proxies = []
    return proxies
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=https://%s' %getProxy)
chrome = webdriver.Chrome(chrome_options=chrome_options)
chrome.get("http://whatismyipaddress.com")
I'll go out on a limb and guess the problem is with the proxy expansion - you're trying to pass a dict to Chrome instead of the actual proxy address. You want to get the actual value from the Proxy() class in your getProxy() function, e.g.:
def get_proxy(string_only=True):
    try:
        proxy_manager = ProxyManager("proxies.txt")
        proxy = proxy_manager.random_proxy()
        if string_only:
            return proxy.proxy_string
        return proxy.get_dict()
    except (OSError, IOError, IndexError) as e:  # couldn't load the file / file is empty
        return None

# With Chrome:
chrome_options = webdriver.ChromeOptions()
proxy = get_proxy()
if proxy:
    chrome_options.add_argument("--proxy-server=" + proxy)
chrome = webdriver.Chrome(chrome_options=chrome_options)
chrome.get("http://whatismyipaddress.com")

# With requests:
response = requests.get("http://whatismyipaddress.com", proxies=get_proxy(False))
# etc.
I'd also recommend loading the proxy list only once if you intend to call this function often and proxies.txt is a static file.
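For example, a sketch that constructs the ProxyManager once at module level, so proxies.txt is only read a single time (this assumes ProxyManager loads the file in its constructor, which matches how it is used above):

# build the manager once; random_proxy() then picks from the in-memory list
try:
    _proxy_manager = ProxyManager("proxies.txt")
except (OSError, IOError):
    _proxy_manager = None

def get_proxy():
    if _proxy_manager is None:
        return None
    try:
        return _proxy_manager.random_proxy().proxy_string
    except IndexError:  # empty proxy list
        return None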
This works:
def get_proxy():
    try:
        proxy_manager = ProxyManager("proxies.txt")
        return proxy_manager.random_proxy().proxy_string
    except (OSError, IOError) as e:  # couldn't load the file
        return None

chrome_options = webdriver.ChromeOptions()
proxy = get_proxy()
if proxy:
    chrome_options.add_argument("--proxy-server=" + proxy)
chrome = webdriver.Chrome(chrome_options=chrome_options)
chrome.get("http://whatismyipaddress.com")
I am learning Python web crawling and I want to know how to deal with the "Load More" button located at the following URL:
https://www.photo.net/search/#//Sort-View-Count/All-Categories/All-Time/Page-1
(I am trying to crawl all the pictures.)
The current code I have uses BeautifulSoup:
from urllib.request import *
from http.cookiejar import CookieJar
from bs4 import BeautifulSoup

url = 'https://www.photo.net/search/#//Sort-View-Count/All-Categories/All-Time/Page-1'
cj = CookieJar()
opener = build_opener(HTTPCookieProcessor(cj))
try:
    p = opener.open(url)
    soup = BeautifulSoup(p, 'html.parser')
except Exception as e:
    print(str(e))
Well, I have a solution for you.
You should try the Selenium module for Python.
1) Download Chrome Driver
2) Install Selenium via pip
Here is an example of how to use it:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

browser = webdriver.Chrome('Path to chrome driver')
browser.get('https://www.photo.net/search/#//Sort-View-Count/All-Categories/All-Time/Page-1')
while True:
    try:
        button = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.LINK_TEXT, 'Load More')))
        button.click()
    except TimeoutException:  # no "Load More" button left on the page
        break
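Once the loop runs out of "Load More" buttons, the loaded pictures can be collected in the usual way. A short sketch, assuming the photos are plain img tags (the selector is an assumption, I haven't checked photo.net's actual markup):

from selenium.webdriver.common.by import By

# collect the source URL of every image that the repeated clicks loaded
images = browser.find_elements(By.TAG_NAME, 'img')
urls = [img.get_attribute('src') for img in images]
print(len(urls), 'image urls collected')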
I am trying to log in to my Morningstar.com premium account using the requests module in Python, as below. The post command runs through with status 200 but does not actually log me in.
(When I download the balance sheet, I only receive the 5-year (non-premium) version instead of the requested 10-year (premium) version. This indicates that my login script fails, since the 5-year data is available without login. The balance sheet URL works correctly when logging in manually in the browser.)
Does anybody know how to correctly set up the login script?
It seems very straightforward, but I have tried the whole day using different forms of the payload/headers etc. and can't find the right way... Also, I am confused because I cannot find the Form Data information when inspecting the login page.
import csv
import requests

urlLogin = 'http://members.morningstar.com/memberservice/login.aspx'
urlBalanceSheet = 'http://financials.morningstar.com/ajax/ReportProcess4CSV.html?&t=XNYS:F&region=usa&culture=en-US&cur=&reportType=bs&period=12&dataType=A&order=desc&columnYear=10&rounding=1&view=raw&r=149906&denominatorView=raw&number=1'

payload = {
    "uEmail": "<userEmail>",
    "uPassword": "<userPW>",
    "remember_me": "on",
    "login": "Sign In"
}

with requests.Session() as s:
    p = s.post(urlLogin, data=payload)
    print(p.status_code)
    download = s.get(urlBalanceSheet)
There are a few things you can do to automate downloading from Morningstar:
pip install selenium
http://selenium-python.readthedocs.io/installation.html
Install Firefox and find out where your profile is; here is a resource: http://toolsqa.com/selenium-webdriver/custom-firefox-profile/
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import requests
from xml.etree import cElementTree as ET
import csv
from selenium.webdriver.common.action_chains import ActionChains

def timeme(method):
    def wrapper(*args, **kw):
        startTime = int(round(time.time() * 1000))
        result = method(*args, **kw)
        endTime = int(round(time.time() * 1000))
        print(endTime - startTime, 'ms')
        return result
    return wrapper

class Driver():
    def __init__(self, profile, driver_path, url):
        self.profile = profile
        self.driver_path = driver_path
        self.url = url

    def start_driver(self):
        user_profile = webdriver.FirefoxProfile(self.profile)
        user_profile.set_preference("browser.helperApps.neverAsk.saveToDisk", 'text/csv')
        driver = webdriver.Firefox(executable_path=self.driver_path, firefox_profile=user_profile)
        driver.get(self.url)
        return driver

    def shutdown(self, driver):
        driver.quit()

@timeme
def login(driver, email='', password=''):
    wait_time = 1
    try:
        email_input = WebDriverWait(driver, wait_time).until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@id="uim-uEmail-input"]')))
        email_input = driver.find_element_by_xpath('//*[@id="uim-uEmail-input"]').send_keys(email)
        time.sleep(5)  # wait time to see if you have input, remove later
        pwd_input = driver.find_element_by_xpath('//*[@id="uim-uPassword-input"]').send_keys(password)
        time.sleep(5)
        sign_in = driver.find_element_by_xpath('//*[@id="uim-login-submit"]').click()
        title = driver.title
        driver.execute_script("window.open('http://financials.morningstar.com/ajax/ReportProcess4CSV.html?&t=XNYS:F&region=usa&culture=en-US&cur=&reportType=bs&period=12&dataType=A&order=desc&columnYear=10&rounding=1&view=raw&r=149906&denominatorView=raw&number=1','new_window');")
        time.sleep(1)
        return 0
    except Exception as e:
        return None

@timeme
def main():
    # I am using this on my Mac; if you are using Windows, change the paths accordingly
    Mozilla = Driver(profile='/Users/yourname/Library/Application Support/Firefox/Profiles/xxxxxxxxxxxx.default',
                     driver_path='/usr/local/bin/geckodriver',  # path to firefox driver
                     url='https://www.morningstar.com/members/login.html?vurl=')
    driver = Mozilla.start_driver()
    download = login(driver, password='', email='')
    if download == 0:
        time.sleep(10)  # let the browser download the csv
    Mozilla.shutdown(driver)  # shutdown

main()