I'm trying to scrape the website using Scrapy. To get the content which I want I need to login first. The url is login_url
There I have form as follows:
My code is as follows:
LOGIN_URL1 = "https://www.partslink24.com/partslink24/user/login.do"
class PartsSpider(scrapy.Spider):
    """Spider that loads the login page and then POSTs the login form to it."""

    name = "parts"
    login_url = LOGIN_URL1
    start_urls = [login_url]

    def parse(self, response):
        """Send the credentials to ``login_url`` once the login page has been fetched."""
        credentials = {
            'accountLogin': COMPANY_ID,
            'userLogin': USERNAME,
            'loginBean.password': PASSWORD,
        }
        login_request = FormRequest(
            url=self.login_url,
            formdata=credentials,
            callback=self.parse1,
        )
        yield login_request

    def parse1(self, response):
        """Open an inspection shell on the login response, then print the response."""
        inspect_response(response, self)
        print("RESPONSE: {}".format(response))
def start_scraper(vin_number):
    """Create a ``CrawlerProcess``, register ``PartsSpider`` on it and start it.

    ``vin_number`` is accepted but not used by the code shown here.
    """
    crawler_process = CrawlerProcess()
    crawler_process.crawl(PartsSpider)
    crawler_process.start()
But the problem is that they check if the session is activated and I get an error, the form can not be submitted.
When I check the response which I get after submitting the login form, I get the following error:
The code on their site which checks that is as follows:
// Session check run by the login page: POSTs to /partslink24/checkSessionCookies.do and
// shows the 'sessionCheckError' element unless the reply body is exactly 'true'.
// NOTE(review): Ajax.Request, Object.isFunction and $() look like Prototype.js — confirm.
var JSSessionChecker = {
// Fire the check; every failure path (bad reply, HTTP failure, exception) ends in showError().
check: function()
{
if (!Ajax.getTransport())
{
// No XMLHttpRequest transport available, so the check cannot be performed at all.
alert('NO_AJAX_IN_BROWSER');
}
else
{
new Ajax.Request('/partslink24/checkSessionCookies.do', {
method:'post',
onSuccess: function(transport)
{
// Anything other than the literal text 'true' is treated as "session not active".
if (transport.responseText != 'true')
{
if (Object.isFunction(JSSessionChecker.showError)) JSSessionChecker.showError();
}
},
onFailure: function(e)
{
if (Object.isFunction(JSSessionChecker.showError)) JSSessionChecker.showError();
},
onException: function (request, e)
{
if (Object.isFunction(JSSessionChecker.showError)) JSSessionChecker.showError();
}
});
}
},
// Reveal the error element with id 'sessionCheckError' if it exists on the page.
showError: function()
{
var errorElement = $('sessionCheckError');
if (errorElement)
{
errorElement.show();
}
}
};
// Run the check immediately when this script is evaluated.
JSSessionChecker.check();
And on success it returns only true.
Is there any way that I can activate the session before submitting a form?
Thanks in advance.
EDIT
The error page which I get using the answer from @fam.
Please check this code.
import scrapy
LOGIN_URL1 = "https://www.partslink24.com/partslink24/user/login.do"
class PartsSpider(scrapy.Spider):
    """Spider that fills in the login form found on the fetched login page."""

    name = "parts"
    login_url = LOGIN_URL1
    start_urls = [login_url]

    def parse(self, response):
        """Build the login request from the form in ``response`` and submit it."""
        login_fields = {
            'loginBean.accountLogin': "COMPANY_ID",
            'loginBean.userLogin': "USERNAME",
            'loginBean.sessionSqueezeOut' : "false",
            'loginBean.password': "PASSWORD",
            'loginBean.userOffsetSec' : "18000",
            'loginBean.code2f' : "",
        }
        # from_response starts from the fields already present in the page's form
        # and overrides them with the values given in formdata.
        login_request = scrapy.FormRequest.from_response(
            response=response,
            url=self.login_url,
            formdata=login_fields,
            callback=self.parse1,
        )
        yield login_request

    def parse1(self, response):
        """Print the response received after the login form was submitted."""
        print("RESPONSE: {}".format(response))
def start_scraper(vin_number):
    """Create a ``CrawlerProcess``, register ``PartsSpider`` on it and start it.

    ``vin_number`` is accepted but not used by the code shown here.
    """
    # CrawlerProcess lives in scrapy.crawler; the top-level scrapy package does not
    # export it, so scrapy.CrawlerProcess() raises AttributeError.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(PartsSpider)
    process.start()
I am not getting an error and the response is as follows:
RESPONSE: <200 https://www.partslink24.com/partslink24/user/login.do>
EDIT:
The following code is for Selenium. It will log you into the page easily. You only need to download the chrome driver and install Selenium.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.options import Options
import time
# Log into partslink24 through a real Chrome window driven by Selenium.
chrome_options = Options()
# Uncomment to run Chrome without a visible window:
#chrome_options.add_argument("--headless")
driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
driver.get("https://www.partslink24.com/partslink24/user/login.do")

# enter the form fields
company_ID = "company id"
user_name = "user name"
password = "password"

# XPath attribute tests are written with "@" (e.g. [@name='...']).
company_ID_input = driver.find_element_by_xpath("//input[@name='accountLogin']")
company_ID_input.send_keys(company_ID)
time.sleep(1)

user_name_input = driver.find_element_by_xpath("//input[@name='userLogin']")
user_name_input.send_keys(user_name)
time.sleep(1)

password_input = driver.find_element_by_xpath("//input[@id='inputPassword']")
password_input.send_keys(password)
time.sleep(1)

# click the anchor with tabindex 5 to submit the form
# NOTE(review): presumably the login button — confirm against the page markup
click_btn = driver.find_element_by_xpath("//a[@tabindex='5']")
click_btn.click()
# give the page time to load after the click
time.sleep(5)
Don't forget to change the credentials.
Related
I tried to extract some data from a dynamically loaded JavaScript website using scrapy-playwright, but I got stuck at the very beginning.
The part of my settings.py file where I'm facing trouble is as follows:
#playwright
# Route both http and https downloads through scrapy-playwright's download handler.
DOWNLOAD_HANDLERS = {
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
# Both settings below are disabled in this snippet; the text that follows enables them one at a time.
#TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
#ASYNCIO_EVENT_LOOP = 'uvloop.Loop'
When I add the following scrapy-playwright handler:
# Route both http and https downloads through scrapy-playwright's download handler.
DOWNLOAD_HANDLERS = {
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
Then I got:
scrapy.exceptions.NotSupported: Unsupported URL scheme 'https': The installed reactor
(twisted.internet.selectreactor.SelectReactor) does not match the requested one (twisted.internet.asyncioreactor.AsyncioSelectorReactor)
When I add TWISTED_REACTOR:
# Request Twisted's asyncio-based reactor (the one named in the error message above).
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
Then I got:
raise TypeError(
TypeError: SelectorEventLoop required, instead got: <ProactorEventLoop running=False closed=False debug=False>
Finally, when I add ASYNCIO_EVENT_LOOP (ASYNCIO_EVENT_LOOP = 'uvloop.Loop'):
Then I got:
ModuleNotFoundError: No module named 'uvloop'
Finally, installing 'uvloop' also fails:
pip install uvloop
Script
import scrapy
from scrapy_playwright.page import PageCoroutine
class ProductSpider(scrapy.Spider):
    """Spider that requests the demo shop page through scrapy-playwright."""

    name = 'product'

    def start_requests(self):
        """Yield one Playwright-rendered request for the demo shop page."""
        playwright_meta = {
            'playwright': True,
            'playwright_include_page': True,
            # wait until the element matching div#productListing exists
            'playwright_page_coroutines': [
                PageCoroutine("wait_for_selector", "div#productListing"),
            ],
        }
        yield scrapy.Request(
            'https://shoppable-campaign-demo.netlify.app/#/',
            meta=playwright_meta,
        )

    async def parse(self, response):
        """Placeholder callback; the parsing code is not written yet."""
        # parses content
        pass
The developers of scrapy_playwright have suggested setting DOWNLOAD_HANDLERS and TWISTED_REACTOR directly in your script.
A similar comment is provided here
Here's a working script implementing just this:
import scrapy
from scrapy_playwright.page import PageCoroutine
from scrapy.crawler import CrawlerProcess
class ProductSpider(scrapy.Spider):
    """Spider that renders the demo shop page with scrapy-playwright and yields one product title."""

    name = 'product'

    def start_requests(self):
        """Yield one Playwright-rendered request for the demo shop page."""
        yield scrapy.Request(
            'https://shoppable-campaign-demo.netlify.app/#/',
            callback=self.parse,
            meta={
                'playwright': True,
                'playwright_include_page': True,
                # wait until the element matching div#productListing exists
                'playwright_page_coroutines': [
                    PageCoroutine("wait_for_selector", "div#productListing"),
                ],
            },
        )

    async def parse(self, response):
        """Yield ``{'products': <title>}`` for the first ``col-md-6`` container."""
        # playwright_include_page=True hands the live Playwright page to the callback.
        # It is not needed here, so close it right away to avoid leaking browser pages.
        page = response.meta["playwright_page"]
        await page.close()

        # XPath attribute tests are written with "@" (e.g. [@class='...']).
        container = response.xpath("(//div[@class='col-md-6'])[1]")
        for items in container:
            # NOTE(review): this inner XPath starts with "//", so it searches the whole
            # document rather than only `items`; use ".//" if per-container results are wanted.
            yield {
                'products': items.xpath("(//h3[@class='card-title'])[1]//text()").get()
            }
if __name__ == "__main__":
    # Same handler for both schemes, so name it once.
    playwright_handler = "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler"
    crawl_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": playwright_handler,
            "http": playwright_handler,
        },
        "CONCURRENT_REQUESTS": 32,
        # write the scraped items to Products.jl in JSON-lines format
        "FEED_URI": 'Products.jl',
        "FEED_FORMAT": 'jsonlines',
    }
    process = CrawlerProcess(settings=crawl_settings)
    process.crawl(ProductSpider)
    process.start()
And we get the following output:
{'products': 'Oxford Loafers'}
If you are using Windows, then your problem is that scrapy-playwright doesn't support Windows. See https://github.com/scrapy-plugins/scrapy-playwright/issues/154
I am trying to fetch dynamic phone number from this page (among others): https://www.europages.fr/LEMMERFULLWOOD-GMBH/DEU241700-00101.html
The phone number appears after a click on the element div with the class page-action click-tel. I am trying to get to this data with scrapy_splash using a LUA script to execute a click.
After pulling splash on my ubuntu:
sudo docker run -d -p 8050:8050 scrapinghub/splash
Here is my code so far (I am using a proxy service) :
class company(scrapy.Spider):
    """Spider that renders one europages company page through Splash and clicks the phone button."""

    name = "company"
    custom_settings = {
        "FEEDS" : {
            '/home/ubuntu/scraping/europages/data/company.json': {
                'format': 'jsonlines',
                'encoding': 'utf8'
            }
        },
        "DOWNLOADER_MIDDLEWARES" : {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        "SPLASH_URL" : 'http://127.0.0.1:8050/',
        "SPIDER_MIDDLEWARES" : {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        "DUPEFILTER_CLASS" : 'scrapy_splash.SplashAwareDupeFilter',
        "HTTPCACHE_STORAGE" : 'scrapy_splash.SplashAwareFSCacheStorage'
    }
    allowed_domains = ['www.europages.fr']

    def __init__(self, company_url):
        """Set up the target URL, the item and the Lua script.

        ``company_url`` is currently ignored: the URL is hard-coded ("forced") below.
        """
        # Let scrapy.Spider run its own initialisation as well.
        super().__init__()
        self.company_url = "https://www.europages.fr/LEMMERFULLWOOD-GMBH/DEU241700-00101.html"  ##forced
        self.item = company_item()
        # Lua script for Splash: load the page, click the centre of the
        # '.page-action.click-tel' element, wait 4 s, return the resulting HTML.
        self.script = """
        function main(splash)
            splash.private_mode_enabled = false
            assert(splash:go(splash.args.url))
            assert(splash:wait(0.5))
            local element = splash:select('.page-action.click-tel')
            local bounds = element:bounds()
            element:mouse_click{x=bounds.width/2, y=bounds.height/2}
            splash:wait(4)
            return splash:html()
        end
        """

    def start_requests(self):
        """Yield the single Splash 'execute' request that runs ``self.script``."""
        yield scrapy.Request(
            url=self.company_url,
            callback=self.parse,
            dont_filter=True,
            meta={
                'splash': {
                    'endpoint': 'execute',
                    'url': self.company_url,
                    'args': {
                        'lua_source': self.script,
                        # placeholder credentials, in user:password@host:port form
                        'proxy': 'http://username:password@proxyhost:port',
                        'html': 1,
                        'iframes': 1
                    }
                }
            }
        )

    def parse(self, response):
        """Print the phone-button ``div`` found in the rendered HTML (``None`` if absent)."""
        soup = BeautifulSoup(response.body, "lxml")
        # The attribute filter must be a dict ({'class': ...}); the original passed a set literal.
        print(soup.find('div', {'class': 'page-action click-tel'}))
The problem is that it has no effect, I still have nothing as if no button were clicked.
Shouldn't the return splash:html() return the results of element:mouse_click{x=bounds.width/2, y=bounds.height/2} (as element:mouse_click() waits for the changes to appear) in response.body ?
Am I missing something here ?
Most times when sites load data dynamically, they do so via background XHR requests to the server. A close examination of the network tab when you click the 'telephone' button, shows that the browser sends an XHR request to the url https://www.europages.fr/InfosTelecomJson.json?uidsid=DEU241700-00101&id=1330. You can emulate the same in your spider and avoid using scrapy splash altogether. See sample implementation below using one url:
import scrapy
from urllib.parse import urlparse
class Company(scrapy.Spider):
    """Spider that fetches a company's phone number from the JSON endpoint behind the phone button."""

    name = 'company'
    allowed_domains = ['www.europages.fr']
    start_urls = ['https://www.europages.fr/LEMMERFULLWOOD-GMBH/DEU241700-00101.html']

    def parse(self, response):
        """Extract the two ids from the company page and request InfosTelecomJson.json."""
        # obtain the id and uuid to make xhr request
        page_name = urlparse(response.url).path.split('/')[-1]
        suffix = '.html'
        # Drop the ".html" suffix explicitly: str.rstrip('.html') strips any trailing
        # '.', 'h', 't', 'm' or 'l' characters and can eat part of the identifier.
        uuid = page_name[:-len(suffix)] if page_name.endswith(suffix) else page_name
        # XPath attribute access is written with "@" (@itemprop, @onclick).
        telecom_id = response.xpath(
            "//div[@itemprop='telephone']/a/@onclick"
        ).re_first(r"event,'(\d+)',")
        if telecom_id is None:
            # No phone button on this page; do not request a URL containing "id=None".
            self.logger.warning("No telephone id found on %s", response.url)
            return
        yield scrapy.Request(
            f"https://www.europages.fr/InfosTelecomJson.json?uidsid={uuid}&id={telecom_id}",
            callback=self.parse_address,
        )

    def parse_address(self, response):
        """Yield the decoded JSON body of the telecom response as the item."""
        yield response.json()
I get the response
{'digits': '+49 220 69 53 30'}
I'm trying to login to Magento account from Python script using requests module, the relevant code I made looks as below:
# NOTE: `headers`, `dst_verify_ssl` and `soup` are not defined in this excerpt;
# presumably they are set up elsewhere in the full script (soup from html_data) — confirm.
s = requests.session()
main_url = '<redacted.tld>/en/index.html'
html_data = s.get('https://'+main_url, headers=headers, timeout=(30, 30), verify=dst_verify_ssl)
web_user = 'test@test.com'
web_pass = '123test321'
# hidden form_key input taken from the parsed page
form_key = soup.find('input', {'name':'form_key'})['value']
l_url = 'https://<redacted.tld>/'
l_route = 'en/customer/account/loginPost/'
login_payload = {
    'form_key': form_key,
    'login[username]': web_user,
    'login[password]': web_pass
}
# Use the same timeout and TLS verification settings as the GET above.
login_req = s.post(
    l_url + l_route,
    headers=headers,
    data=login_payload,
    timeout=(30, 30),
    verify=dst_verify_ssl,
)
But it's not getting me logged in so I was wondering if someone could tell me what does it take to login via Python to the Magento account?
Thanks.
I gave this one a go on a public demo instance and I can see the data on the Magento 2 dashboard just fine:
import requests
from bs4 import BeautifulSoup
# Log into a Magento 2 storefront with one requests session and check the account page.
web_user = 'youremail@example.com'
web_pass = 'yourpassword'

s = requests.session()

# Fetch a page first to read the hidden form_key input that the login POST must carry.
main_url = 'https://magento2demo/'
html_data = s.get(main_url)
form_soup = BeautifulSoup(html_data.content, 'html.parser')
form_key = form_soup.find('input', {'name':'form_key'})['value']

login_route = 'https://magento2demo/customer/account/loginPost/'
login_payload = {
    'form_key': form_key,
    'login[username]': web_user,
    'login[password]': web_pass
}
login_req = s.post(login_route, data=login_payload)

# Request the account page with the same session and check that it shows the user's e-mail.
account_url = "https://magento2demo/customer/account/"
html_account = s.get(account_url)
account_soup = BeautifulSoup(html_account.content, 'html.parser')
info = account_soup.find('div', {'class':'box-information'}).find('div', {'class':'box-content'})

assert web_user in str(info)
Package versions used:
"beautifulsoup4": { "version": "==4.9.3" }
"requests": { "version": "==2.26.0" }
What's the response code on the POST? Anything peculiar in your headers?
Might wanna add more reproducible data if the above doesn't help.
I am new to Scrapy and to Python in general.
Here is the code:
import scrapy
import json
class MOOCSpider(scrapy.Spider):
    """Spider that posts search requests to plurk and yields one item per returned post."""

    name = 'mooc'
    start_urls = ['https://www.plurk.com/search?q=italy']
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }
    # id sent as "after_id"; parse_api overwrites it with the last post id of each response
    global_id = 1458122036

    def parse(self, response):
        """Yield nine search requests in a row, each built from the current ``global_id``."""
        url = 'https://www.plurk.com/Search/search2'
        headers = {
            # ...omitted...
        }
        for _ in range(1, 10):
            payload = {"after_id": str(self.global_id)}
            yield scrapy.FormRequest(url, callback=self.parse_api, formdata=payload, headers=headers)

    def parse_api(self, response):
        """Yield an item per post and remember the id of the last post in ``global_id``."""
        data = json.loads(response.body)
        posts = data["plurks"]
        users = data["users"]
        last_position = len(posts)
        for position, post in enumerate(posts, start=1):
            if position == last_position:
                self.global_id = post["plurk_id"]
            # ...omitted code... (defines user_name)
            yield {
                'Author': user_name,
                'Body': post['content'],
                'app': 'plurk'
            }
The problem I have is that Scrapy first makes all the requests in the for loop and only then executes the code in parse_api.
What I would like to do is let Scrapy do one iteration of the for loop, call the callback function, wait for it to return and then do another iteration.
This is because the id that I need for the next request will be set in the global_id variable by the callback function.
You can't achieve this by scheduling requests in a loop.
You can implement this only if you schedule just one (the next) request per parse/parse_api method call:
class MOOCSpider(scrapy.Spider):
    """Spider that pages through plurk search results one request at a time.

    Each response schedules at most one follow-up request, so the ``after_id``
    for the next page always comes from the page that was just processed.
    """

    name = 'mooc'
    start_urls = ['https://www.plurk.com/search?q=italy']
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        'DOWNLOAD_DELAY':5,
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36",
    }

    def _search_request(self, after_id):
        """Build the search POST for the page that follows ``after_id``."""
        formdata = {
            "query": 'italy',
            "start_date": "2019/12",
            "end_date": "2020/12",
            "after_id": str(after_id),
        }
        return scrapy.FormRequest(
            'https://www.plurk.com/Search/search2',
            callback=self.parse_api,
            formdata=formdata,
        )

    def parse(self, response):
        """Schedule only the first search request (without a loop)."""
        yield self._search_request(1458122036)  # <- your initial global_id

    def parse_api(self, response):
        """Yield one item per post, then schedule the next page if there is one."""
        data = json.loads(response.body)
        after_id = None
        for post in data["plurks"]:
            after_id = post["plurk_id"]
            yield {
                'Author': data["users"][str(post["owner_id"])]["nick_name"],  # instead of user_id?
                'Body': post["content"],
                'app': 'plurk'
            }
        # After the loop, after_id holds the id of the last post on this page.
        # response.meta["depth"] is used instead of a separate counter to limit the number of requests.
        if response.meta["depth"] <= 11 and after_id:  # schedule next request
            yield self._search_request(after_id)
Answering my own question:
Now the parse method does just one request and calls once the parse_api method. Parse_api processes the response and sets the global_id variable. Once it's done processing its own response it makes another request passing itself as the callback function.
By doing this you are guaranteed that the global_id variable will be properly set, since the new request will be made only once parse_api has finished running.
request.cb_kwargs["loop_l"] is used to pass an additional argument to the callback function. Here it is a counter that controls the number of requests we want to make. When the counter reaches 200, we stop crawling.
import scrapy
import json
plurk_id = []
class MOOCSpider(scrapy.Spider):
    """Spider that chains search requests: each response schedules the next request.

    A counter travels with every request through ``cb_kwargs`` and stops the chain
    once it reaches ``max_requests``.
    """

    name = 'mooc'
    start_urls = ['https://www.plurk.com/search?q=']
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }
    # id sent as "after_id"; the omitted code in parse_api is expected to update it
    global_id = 1455890167
    url = 'https://www.plurk.com/Search/search2'
    headers = {
        # ...OMITTED...
    }
    # the chain stops when the counter reaches this value
    max_requests = 200

    def _next_request(self, loop_l):
        """Build the search POST for the current ``global_id``, carrying the counter ``loop_l``."""
        formdata = {
            "after_id": str(self.global_id)
        }
        return scrapy.FormRequest(
            self.url,
            callback=self.parse_api,
            formdata=formdata,
            headers=self.headers,
            cb_kwargs={"loop_l": loop_l},
        )

    def parse(self, response):
        """Schedule the first search request with the counter at 0."""
        yield self._next_request(0)

    def parse_api(self, response, loop_l):
        """Process one search response and schedule the next request.

        ``loop_l`` is the counter received through ``cb_kwargs``; an int or a
        numeric string is accepted.
        """
        int_loop_l = int(loop_l) + 1
        # ">=" rather than "==" so the chain also stops if the counter ever starts above the limit
        if int_loop_l >= self.max_requests:
            return
        data = json.loads(response.body)
        # ...omitted code...
        # ... GET AND SET THE NEW global_id FROM THE RESPONSE ...
        # make another request with the new id
        yield self._next_request(int_loop_l)
I am trying to get the cookies from a splash request, but I keep getting an error.
Here is the code I am using:
class P2PEye(scrapy.Spider):
    """Spider that loads a p2peye page through Splash and prints the cookies Splash collected."""

    name = 'p2peyeSpider'
    allowed_domains = ['p2peye.com']
    start_urls = ['https://www.p2peye.com/platform/h9/']

    def start_requests(self):
        """Yield one Splash request per start URL that runs the Lua script below."""
        # Lua script for Splash: load the URL, wait 0.5 s, return the browser's cookies.
        script = '''
        function main(splash)
            local url = splash.args.url
            assert(splash:go(url))
            assert(splash:wait(0.5))
            return {
                cookies = splash:get_cookies(),
            }
        end
        '''
        for url in self.start_urls:
            # lua_source is only run by the 'execute' endpoint; 'render.html' ignores it and
            # returns plain HTML (a SplashTextResponse without data or cookiejar).
            yield SplashRequest(
                url,
                callback=self.parse,
                endpoint='execute',
                args={'wait': 1, 'lua_source': script},
            )

    def parse(self, response):
        """Print the cookies returned by the Lua script and the response's cookiejar."""
        # The table returned by the script is available as response.data; the cookies are
        # not in any Set-Cookie header of the *request*, which is what was printed before.
        print(response.data['cookies'])
        print(response.cookiejar)
This is my settings.py
# scrapy-splash configuration.
# Address of the Splash HTTP API.
SPLASH_URL = 'http://127.0.0.1:8050'
CRAWLERA_ENABLED= False
# Splash downloader middlewares, with HttpCompressionMiddleware given priority 810.
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100 }
# Splash-aware duplicate filter and HTTP cache storage.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
# Cookie handling switched on, with debug output for both Scrapy and Splash cookies.
COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True
The result of response.request.headers.getlist('Set-Cookie') is [],
and response.cookiejar got an error: AttributeError: 'SplashTextResponse' object has no attribute 'cookiejar'.
So how can I get the cookies without causing an error?
To access response.cookiejar, the response has to be a SplashJsonResponse.
Try returning extra fields from your Lua script:
# Lua script for Splash: loads the URL, waits 0.5 s, then returns the final URL, the headers
# and status of the last response in the browsing history, the cookies and the page HTML.
script = '''
function main(splash)
local url = splash.args.url
assert(splash:go(url))
assert(splash:wait(0.5))
local entries = splash:history()
local last_response = entries[#entries].response
return {
url = splash:url(),
headers = last_response.headers,
http_status = last_response.status,
cookies = splash:get_cookies(),
html = splash:html(),
}
end
'''
Using the Lua script below, the response will be a dict with the cookies located under the key `cookies`:
-- Splash entry point: load the requested URL, wait 0.5 s, and return a table
-- whose only field, `cookies`, holds the result of splash:get_cookies().
function main(splash)
local url = splash.args.url
-- assert() aborts the script if the page load or the wait fails
assert(splash:go(url))
assert(splash:wait(0.5))
return {
cookies = splash:get_cookies(),
}
end
So, to access them, use:
# `d` is assumed to be the decoded JSON reply from Splash, obtained as in the commented-out
# line below, where 'splash' stands in for the real endpoint URL — TODO confirm.
# d = requests.post('splash').json()
print(d['cookies'])