How to submit a form in scrapy? - python

I am trying to use Scrapy to log in to GitHub and collect my project commit count. Here is the code.
from scrapy.item import Item, Field
from scrapy.http import FormRequest
from scrapy.spider import Spider
from scrapy.utils.response import open_in_browser

class GitSpider(Spider):
    name = "github"
    allowed_domains = ["github.com"]
    start_urls = ["https://www.github.com/login"]

    def parse(self, response):
        formdata = {'login': 'username',
                    'password': 'password'}
        yield FormRequest.from_response(response,
                                        formdata=formdata,
                                        clickdata={'name': 'commit'},
                                        callback=self.parse1)

    def parse1(self, response):
        open_in_browser(response)
After running the code with

    scrapy runspider github.py

it should show me the result page of the form, which should be a failed login on the same page since the username and password are fake. However, it shows me the search page instead. The log file is located in pastebin.
How should the code be fixed? Thanks in advance.

Your problem is that FormRequest.from_response() uses a different form: the search form. You want it to use the login form instead. Provide a formnumber argument:
yield FormRequest.from_response(response,
                                formnumber=1,
                                formdata=formdata,
                                clickdata={'name': 'commit'},
                                callback=self.parse1)
Here is what I see opened in the browser after applying the change (using a fake user):
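If you are not sure which form index or name to pass, one way to check, as a small sketch using the interactive scrapy shell (standard Scrapy selector API, nothing page-specific assumed), is to list the forms found in the response first:

    # in a terminal: scrapy shell https://www.github.com/login
    # then enumerate the forms and their attributes to pick the right one
    for i, form in enumerate(response.xpath('//form')):
        print(i, form.xpath('@action').get(), form.xpath('@name').get())

The index printed next to the login form is the value to pass as formnumber; the name attribute, if it has one, can be passed as formname instead.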

Solution using webdriver.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from scrapy.contrib.spiders import CrawlSpider

class GitSpider(CrawlSpider):
    name = "gitscrape"
    allowed_domains = ["github.com"]
    start_urls = ["https://www.github.com/login"]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        login_form = self.driver.find_element_by_name('login')
        password_form = self.driver.find_element_by_name('password')
        commit = self.driver.find_element_by_name('commit')
        login_form.send_keys("yourlogin")
        password_form.send_keys("yourpassword")
        actions = ActionChains(self.driver)
        actions.click(commit)
        actions.perform()
        # by this point you are logged in to GitHub and have access
        # to all data in the main menu
        time.sleep(3)
        self.driver.close()
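If you want to keep using Scrapy selectors on the page that Selenium rendered, a minimal sketch (the commit-count XPath below is a made-up placeholder, not GitHub's real markup) is to wrap the driver's page source in a Selector before closing the browser:

    from scrapy.selector import Selector

    # inside parse(), after the login click has completed
    sel = Selector(text=self.driver.page_source)
    commit_count = sel.xpath('//span[@class="commit-count"]/text()').get()  # placeholder XPath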

Using the "formname" argument also works:
yield FormRequest.from_response(response,
                                formname='Login',
                                formdata=formdata,
                                clickdata={'name': 'commit'},
                                callback=self.parse1)

Related

Scrapy with Selenium Middleware to generate second response after first response

I'm trying to extract comments from a news page. The crawler starts at the homepage and follows all the internal links found on the site. The comments only exist on the article pages, and they are embedded from an external website, so the section with the comments sits in a JavaScript iframe. Here's an example article site.
My first step was to build a crawler and a Selenium middleware. The crawler follows all the links, and those are loaded through Selenium:
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class CrawlerSpider(CrawlSpider):
    name = 'crawler'
    allowed_domains = ['www.merkur.de', 'disqus.com/embed/comments/']
    start_urls = ['https://www.merkur.de/welt/novavax-corona-totimpfstoff-omikron-zulassung-impfstoff-weihnachten-wirkung-covid-lauterbach-zr-91197497.html']
    rules = [Rule(LinkExtractor(allow=r'.*'), callback='parse',
                  follow=True)]

    def parse(self, response):
        title = response.xpath('//html/head/title/text()').extract_first()
        iframe_url = response.xpath('//iframe[@title="Disqus"]//@src').get()
        yield Request(iframe_url, callback=self.next_parse, meta={'title': title})

    def next_parse(self, response):
        title = response.meta.get('title')
        comments = response.xpath("//div[@class='post-message ']/div/p").getall()
        yield {
            'title': title,
            'comments': comments
        }
To get access to the iframe elements the Scrapy Request goes through the middleware:
import time

from scrapy import signals, spiders
from selenium import webdriver
from scrapy.http import HtmlResponse
from selenium.webdriver.chrome.options import Options

class SeleniumMiddleware(object):
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)

    # Here you take the requests made to the urls found by the LinkExtractor,
    # fetch them with Selenium and return the rendered page as the response.
    def process_request(self, request, spider):
        self.driver.get(request.url)
        element = self.driver.find_element_by_xpath('//div[@id="disqus_thread"]')
        self.driver.execute_script("arguments[0].scrollIntoView();", element)
        time.sleep(1)
        body = self.driver.page_source
        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)
I am getting the right link from the iframe src here, but my CrawlerSpider is not yielding the iframe_url request, so I can't follow the link from the iframe. What am I doing wrong here? I really appreciate your help!

Scrapy crawl every link after authentication

Introduction
Since my crawler is more or less finished, I need to redo a crawler which only crawls the whole domain for links; I need this for my work.
The spider which crawls every link should run once per month.
I'm running Scrapy 2.4.0 and my OS is Ubuntu Server 18.04 LTS.
Problem
The website I have to crawl changed its "privacy", so you have to be logged in before you can see the products, which is the reason why my "linkcrawler" won't work anymore.
I already managed to log in and scrape all my stuff, but the start_urls were given in a csv file.
Code
import scrapy
from ..items import DuifItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import FormRequest, Request
from scrapy_splash import SplashRequest

class DuifLinkSpider(CrawlSpider):
    name = 'duiflink'
    allowed_domains = ['duif.nl']
    login_page = 'https://www.duif.nl/login'
    start_urls = ['https://www.duif.nl']
    custom_settings = {'FEED_EXPORT_FIELDS': ['Link']}

    def start_requests(self):
        yield SplashRequest(
            url=self.login_page,
            callback=self.parse_login,
            args={'wait': 3},
            dont_filter=True
        )

    rules = (
        Rule(LinkExtractor(deny='https://www.duif.nl/nl/'), callback='parse_login', follow=True),
    )

    def parse_login(self, response):
        return FormRequest.from_response(
            response,
            formid='login-form',
            formdata={
                'username': 'not real',
                'password': 'login data'},
            clickdata={'type': 'submit'},
            callback=self.after_login)

    def after_login(self, response):
        accview = response.xpath('//ul[@class="nav navbar-nav navbar-secondary navbar-right"]//a/@href')[13]
        if accview:
            print('success')
        else:
            print(':(')
        for url in self.start_urls:
            yield response.follow(url=url, callback=self.search_links)

    def search_links(self, response):
        link = response.xpath('//ul[@class="nav navbar-nav navbar-secondary navbar-right"]/li/a/@href').get()
        for a in link:
            link = response.url
            yield response.follow(url=link, callback=self.parse_page)

    def parse_page(self, response):
        productpage = response.xpath('//div[@class="product-details col-md-12"]')
        if not productpage:
            print('No productlink', response.url)
        for a in productpage:
            items = DuifItem()
            items['Link'] = response.url
            yield items
Unfortunately I can't provide a dummy account where you can try to log in yourself, because it's a B2B-service website.
I can imagine that my def search_links is wrong.
My planned structure is:
visit login_page and pass my login credentials
check whether I am logged in via an XPath that tests if the logout button is present or not
if logged in, it prints 'success'
given by the XPath expression, it should then start to follow links
by visiting every link, it should check via an XPath expression whether a specific container is present or not, so it knows whether it is a product page or not
if it is a product page, save the visited link; if it is not a product page, take the next link
Console output
As you can see, the authentication is working, but it won't do anything afterwards.
Update
I reworked my code a bit:
import scrapy
from ..items import DuifItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import FormRequest, Request
from scrapy_splash import SplashRequest

class DuifLinkSpider(CrawlSpider):
    name = 'duiflink'
    allowed_domains = ['duif.nl']
    login_page = 'https://www.duif.nl/login'
    start_urls = ['https://www.duif.nl/']
    custom_settings = {'FEED_EXPORT_FIELDS': ['Link']}

    def start_requests(self):
        yield SplashRequest(
            url=self.login_page,
            callback=self.parse_login,
            args={'wait': 3},
            dont_filter=True
        )

    rules = (
        Rule(LinkExtractor(), callback='parse_login', follow=True),
    )

    def parse_login(self, response):
        return FormRequest.from_response(
            response,
            formid='login-form',
            formdata={
                'username': 'not real',
                'password': 'login data'},
            clickdata={'type': 'submit'},
            callback=self.after_login)

    def after_login(self, response):
        accview = response.xpath('//ul[@class="nav navbar-nav navbar-secondary navbar-right"]//a/@href')[13]
        if accview:
            print('success')
        else:
            print(':(')
        for url in self.start_urls:
            yield response.follow(url=url, callback=self.search_links, dont_filter=True)

    def search_links(self, response):
        # link = response.xpath('//ul[@class="nav navbar-nav navbar-secondary navbar-right"]/li/a/@href')
        link = response.xpath('//a/@href')
        for a in link:
            link = a.get()
            link = 'https://www.duif.nl' + link if link else link
            yield response.follow(url=link, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        productpage = response.xpath('//div[@class="product-details col-md-12"]')
        if not productpage:
            print('No productlink', response.url)
        for a in productpage:
            items = DuifItem()
            items['Link'] = response.url
            yield items
Now I know that I am definitely logged in, but it doesn't follow the "sub"-links. I thought that if I use response.xpath('//a/@href') it would automatically search the whole DOM for every link.
Below is my new console output.
After you log in, you go back to parsing your start URL. Scrapy filters out duplicate requests by default, so in your case it stops there. You can avoid this by passing dont_filter=True in your request, like this:
yield response.follow(url=url, callback=self.search_links, dont_filter=True)
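As a hedged aside, Scrapy's DUPEFILTER_DEBUG setting makes this kind of problem easier to spot, since it logs every duplicate request instead of only the first one:

    # settings.py (or custom_settings on the spider)
    DUPEFILTER_DEBUG = True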

Scrapy with selenium for a webpage requiring authentication

I am trying to scrape data from a page which has a lot of AJAX calls and JavaScript execution to render the webpage, so I am trying to use Scrapy with Selenium to do this. The modus operandi is as follows:
Add the login page URL to the scrapy start_urls list
Use the FormRequest.from_response method to post the username and password to get authenticated.
Once logged in, request the desired page to be scraped
Pass this response to the Selenium webdriver to click buttons on the page.
Once the buttons are clicked and a new webpage is rendered, capture the result.
The code that I have thus far is as follows:
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest, Request
from selenium import webdriver
import time

class LoginSpider(BaseSpider):
    name = "sel_spid"
    start_urls = ["http://www.example.com/login.aspx"]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        return FormRequest.from_response(response,
                                         formdata={'User': 'username', 'Pass': 'password'},
                                         callback=self.check_login_response)

    def check_login_response(self, response):
        if "Log Out" in response.body:
            self.log("Successfully logged in")
            scrape_url = "http://www.example.com/authen_handler.aspx?SearchString=DWT+%3E%3d+500"
            yield Request(url=scrape_url, callback=self.parse_page)
        else:
            self.log("Bad credentials")

    def parse_page(self, response):
        self.driver.get(response.url)
        next = self.driver.find_element_by_class_name('dxWeb_pNext')
        next.click()
        time.sleep(2)
        # capture the html and store in a file
The two roadblocks I have hit so far are:
Step 4 does not work. Whenever Selenium opens the Firefox window, it is always at the login screen and does not know how to get past it.
I don't know how to achieve step 5.
Any help will be greatly appreciated.
I don't believe you can switch between Scrapy requests and Selenium like that. You need to log into the site using Selenium, not yield Request(). The login session you created with Scrapy is not transferred to the Selenium session. Here is an example (the element ids/xpath will be different for you):
scrape_url = "http://www.example.com/authen_handler.aspx"
self.driver.get(scrape_url)
time.sleep(2)
username = self.driver.find_element_by_id("User")
password = self.driver.find_element_by_name("Pass")
username.send_keys("your_username")
password.send_keys("your_password")
self.driver.find_element_by_xpath("//input[@name='commit']").click()
then you can do:
time.sleep(2)
next = self.driver.find_element_by_class_name('dxWeb_pNext').click()
time.sleep(2)
etc.
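For capturing the result (step 5), a minimal sketch: once the new page has rendered, the driver's page_source holds the full HTML and can simply be written to a file (the file name below is just a placeholder):

    # after the click has rendered the next page
    html = self.driver.page_source
    with open('results_page.html', 'w') as f:  # placeholder output path
        f.write(html)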
EDIT: If you need to render JavaScript and are worried about speed/non-blocking, you can use http://splash.readthedocs.org/en/latest/index.html which should do the trick.
http://splash.readthedocs.org/en/latest/scripting-ref.html#splash-add-cookie has details on passing a cookie; you should be able to pass it from Scrapy, but I have not done it before.
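A hedged, untested sketch of what that could look like with scrapy-splash, using the execute endpoint and splash:add_cookie (the cookie name, domain and session_cookie value are placeholders you would take from the Scrapy response):

    from scrapy_splash import SplashRequest

    lua_script = """
    function main(splash, args)
        splash:add_cookie{args.name, args.value, path="/", domain=args.domain}
        assert(splash:go(args.url))
        return splash:html()
    end
    """

    # inside a spider callback
    yield SplashRequest(
        url=scrape_url,
        endpoint='execute',
        args={'lua_source': lua_script,
              'url': scrape_url,
              'name': 'sessionid',       # placeholder cookie name
              'value': session_cookie,   # value extracted from the Scrapy response headers
              'domain': 'www.example.com'},
        callback=self.parse_page,
    )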
log in with scrapy api first
# call scrapy post request with after_login as callback
return FormRequest.from_response(
    response,
    # formxpath=formxpath,
    formdata=formdata,
    callback=self.browse_files
)
pass session to selenium chrome driver
# logged in previously with scrapy api
def browse_files(self, response):
    print "browse files for: %s" % (response.url)

    # response.headers
    cookie_list2 = response.headers.getlist('Set-Cookie')
    print cookie_list2

    self.driver.get(response.url)
    self.driver.delete_all_cookies()

    # extract all the cookies
    for cookie2 in cookie_list2:
        cookies = map(lambda e: e.strip(), cookie2.split(";"))
        for cookie in cookies:
            splitted = cookie.split("=")
            if len(splitted) == 2:
                name = splitted[0]
                value = splitted[1]
                # for my particular usecase I needed only these values
                if name == 'csrftoken' or name == 'sessionid':
                    cookie_map = {"name": name, "value": value}
                else:
                    continue
            elif len(splitted) == 1:
                cookie_map = {"name": splitted[0], "value": ''}
            else:
                continue

            print "adding cookie"
            print cookie_map
            self.driver.add_cookie(cookie_map)

    self.driver.get(response.url)

    # check if we have successfully logged in
    files = self.wait_for_elements_to_be_present(By.XPATH, "//*[@id='files']", response)
    print files

scrapy crawl spider ajax pagination

I was trying to scrape a link which uses an AJAX call for pagination.
I am trying to crawl http://www.demo.com, and in the .py file I provided this code using restrict_xpaths. The code is:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import sumSpider, Rule
from scrapy.selector import HtmlXPathSelector
from sum.items import sumItem

class Sumspider1(sumSpider):
    name = 'sumDetailsUrls'
    allowed_domains = ['sum.com']
    start_urls = ['http://www.demo.com']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='.//ul[@id="pager"]/li[8]/a'), callback='parse_start_url', follow=True),
    )

    # use parse_start_url if your spider wants to crawl from the first page, so overriding
    def parse_start_url(self, response):
        print '********************************************1**********************************************'
        # //div[@class="showMoreCars hide"]/a
        # .//ul[@id="pager"]/li[8]/a/@href
        self.log('Inside - parse_item %s' % response.url)
        hxs = HtmlXPathSelector(response)
        item = sumItem()
        item['page'] = response.url
        title = hxs.xpath('.//h1[@class="page-heading"]/text()').extract()
        print '********************************************title**********************************************', title
        urls = hxs.xpath('.//a[@id="linkToDetails"]/@href').extract()
        print '**********************************************2***url*****************************************', urls
        finalurls = []
        for url in urls:
            print '---------url-------', url
            finalurls.append(url)
        item['urls'] = finalurls
        return item
My items.py file contains:
from scrapy.item import Item, Field

class sumItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    page = Field()
    urls = Field()
Still, I'm not getting the exact output; I am not able to fetch all pages when crawling.
I hope the below code will help.
somespider.py
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
from demo.items import DemoItem
from selenium import webdriver

def removeUnicodes(strData):
    if(strData):
        strData = strData.encode('utf-8').strip()
        strData = re.sub(r'[\n\r\t]', r' ', strData.strip())
    return strData

class demoSpider(scrapy.Spider):
    name = "domainurls"
    allowed_domains = ["domain.com"]
    start_urls = ['http://www.domain.com/used/cars-in-trichy/']

    def __init__(self):
        self.driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub", webdriver.DesiredCapabilities.HTMLUNITWITHJS)

    def parse(self, response):
        self.driver.get(response.url)
        self.driver.implicitly_wait(5)
        hxs = Selector(response)
        item = DemoItem()
        finalurls = []
        while True:
            next = self.driver.find_element_by_xpath('//div[@class="showMoreCars hide"]/a')
            try:
                next.click()
                # get the data and write it to scrapy items
                item['pageurl'] = response.url
                item['title'] = removeUnicodes(hxs.xpath('.//h1[@class="page-heading"]/text()').extract()[0])
                urls = self.driver.find_elements_by_xpath('.//a[@id="linkToDetails"]')
                for url in urls:
                    url = url.get_attribute("href")
                    finalurls.append(removeUnicodes(url))
                item['urls'] = finalurls
            except:
                break
        self.driver.close()
        return item
items.py
from scrapy.item import Item, Field

class DemoItem(Item):
    page = Field()
    urls = Field()
    pageurl = Field()
    title = Field()
Note:
You need to have the Selenium RC server running, because HTMLUNITWITHJS works with Selenium RC only when using Python.
Run your Selenium RC server by issuing the command:

    java -jar selenium-server-standalone-2.44.0.jar

Run your spider using the command:

    scrapy crawl domainurls -o someoutput.json
You can check with your browser how the requests are made.
Behind the scenes, right after you click the "show more cars" button, your browser requests JSON data to feed your next page. You can take advantage of this fact and deal directly with the JSON data, without needing to work with a JavaScript engine such as Selenium or PhantomJS.
In your case, as a first step you should simulate a user scrolling down the page given by your start_url parameter, while profiling your network requests to discover the endpoint the browser uses to request that JSON. To discover this endpoint there is generally an XHR (XMLHttpRequest) section in the browser's developer tools, as here in Safari, where you can navigate through all the resources/endpoints used to request the data.
Once you discover this endpoint it's a straightforward task: you give your spider the endpoint you just discovered as start_url, and as you process and navigate through the JSON you can determine whether there is a next page to request.
P.S.: I saw that for you the endpoint URL is http://www.carwale.com/webapi/classified/stockfilters/?city=194&kms=0-&year=0-&budget=0-&pn=2
In this case my browser requested the second page, as you can see in the parameter pn. It is important that you set some header parameters before you send the request. I noticed that in your case the headers are (a sketch using them follows the list below):
Accept: text/plain, */*; q=0.01
Referer: http://www.carwale.com/used/cars-in-trichy/
X-Requested-With: XMLHttpRequest
sourceid: 1
User-Agent: Mozilla/5.0...
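A minimal sketch of what such a spider could look like, assuming the endpoint and headers above; the JSON key names (for example a 'stocks' list) are guesses and need to be adapted to the actual payload:

    import json
    import scrapy

    class CarwaleAjaxSpider(scrapy.Spider):
        name = 'carwale_ajax'
        # endpoint discovered via the browser's XHR profiling, starting at page 1
        start_urls = ['http://www.carwale.com/webapi/classified/stockfilters/?city=194&kms=0-&year=0-&budget=0-&pn=1']

        # headers observed in the browser request
        custom_headers = {
            'Accept': 'text/plain, */*; q=0.01',
            'Referer': 'http://www.carwale.com/used/cars-in-trichy/',
            'X-Requested-With': 'XMLHttpRequest',
            'sourceid': '1',
        }

        def start_requests(self):
            for url in self.start_urls:
                yield scrapy.Request(url, headers=self.custom_headers, callback=self.parse)

        def parse(self, response):
            data = json.loads(response.text)
            # 'stocks' is a hypothetical key; inspect the real JSON to find the item list
            for stock in data.get('stocks', []):
                yield stock
            # follow the next page by incrementing pn, assuming pagination stops
            # when the endpoint returns an empty list
            if data.get('stocks'):
                page = int(response.url.split('pn=')[-1]) + 1
                next_url = response.url.split('pn=')[0] + 'pn=%d' % page
                yield scrapy.Request(next_url, headers=self.custom_headers, callback=self.parse)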

Using Scrapy with Selenium to scrape a rendered page

I fixed all the issues; the only problem is that I am getting a permission error because I cannot figure out exactly how to connect Firefox to this script. I've installed the plugins and I can make it work from the UI, but not from this script.
How do I do this?
Here is my code...
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from selenium import selenium
from linkedpy.items import LinkedPyItem
import time

class LinkedPySpider(InitSpider):
    name = 'LinkedPy'
    allowed_domains = ['linkedin.com']
    login_page = 'https://www.linkedin.com/uas/login'
    start_urls = ["http://www.linkedin.com/csearch/results?type=companies&keywords=&pplSearchOrigin=GLHD&pageKey=member-home&search=Search#facets=pplSearchOrigin%3DFCTD%26keywords%3D%26search%3DSubmit%26facet_CS%3DC%26facet_I%3D80%26openFacets%3DJO%252CN%252CCS%252CNFR%252CF%252CCCR%252CI"]

    def __init__(self):
        InitSpider.__init__(self)
        self.verificationErrors = []
        self.selenium = selenium("localhost", 4444, "*firefox", "http://www.linkedin.com")
        self.log("\n\n\n Starting the Selenium Server! \n\n\n")
        self.selenium.start()
        self.log("\n\n\n Successfully, Started the Selenium Server! \n\n\n")

    def __del__(self):
        self.selenium.stop()
        print self.verificationErrors
        CrawlSpider.__del__(self)

    def init_request(self):
        # """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        # """Generate a login request."""
        return FormRequest.from_response(response,
                                         formdata={'session_key': 'email@address.com', 'session_password': 'password'},
                                         callback=self.check_login_response)

    def check_login_response(self, response):
        # """Check the response returned by a login request to see if we are successfully logged in."""
        if "Sign Out" in response.body:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin..
            return self.initialized()
        else:
            self.log("\n\n\nFailed, Bad times :(\n\n\n")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sel = self.selenium
        sel.open(response.url)
        time.sleep(2.5)
        sites = sel.select('//ol[@id=\'result-set\']/li')
        items = []
        for site in sites:
            item = LinkedPyItem()
            item['title'] = site.select('h2/a/text()').extract()
            item['link'] = site.select('h2/a/@href').extract()
            items.append(item)
        return items
