I have fixed all the other issues; the only remaining problem is a permission error, because I cannot figure out exactly how to connect Firefox to this script. I've installed the plugins and can make it work from the UI, but not from this script.
How do I do this?
Here is my code:
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from selenium import selenium
import time

from linkedpy.items import LinkedPyItem


class LinkedPySpider(InitSpider):
    name = 'LinkedPy'
    allowed_domains = ['linkedin.com']
    login_page = 'https://www.linkedin.com/uas/login'
    start_urls = ["http://www.linkedin.com/csearch/results?type=companies&keywords=&pplSearchOrigin=GLHD&pageKey=member-home&search=Search#facets=pplSearchOrigin%3DFCTD%26keywords%3D%26search%3DSubmit%26facet_CS%3DC%26facet_I%3D80%26openFacets%3DJO%252CN%252CCS%252CNFR%252CF%252CCCR%252CI"]

    def __init__(self):
        InitSpider.__init__(self)
        self.verificationErrors = []
        # Connect to a Selenium RC server listening on localhost:4444
        self.selenium = selenium("localhost", 4444, "*firefox", "http://www.linkedin.com")
        self.log("\n\n\n Starting the Selenium server! \n\n\n")
        self.selenium.start()
        self.log("\n\n\n Successfully started the Selenium server! \n\n\n")

    def __del__(self):
        self.selenium.stop()
        print self.verificationErrors
        CrawlSpider.__del__(self)

    def init_request(self):
        """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Generate a login request."""
        return FormRequest.from_response(response,
            formdata={'session_key': 'email@address.com', 'session_password': 'password'},
            callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in."""
        if "Sign Out" in response.body:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin..
            return self.initialized()
        else:
            self.log("\n\n\nFailed, bad times :(\n\n\n")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sel = self.selenium
        sel.open(response.url)
        time.sleep(2.5)
        sites = hxs.select('//ol[@id="result-set"]/li')
        items = []
        for site in sites:
            item = LinkedPyItem()
            item['title'] = site.select('h2/a/text()').extract()
            item['link'] = site.select('h2/a/@href').extract()
            items.append(item)
        return items
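For context on the Firefox connection: the old selenium("localhost", 4444, "*firefox", ...) client does not launch anything by itself, it only talks to a Selenium RC server that must already be running on port 4444. Below is a minimal connection check outside Scrapy; the jar name and the explicit Firefox path are assumptions, not taken from the original setup.

# Standalone sketch for checking the Selenium RC connection (no Scrapy involved).
# Assumes the RC server is already running, e.g.:
#   java -jar selenium-server-standalone.jar   # listens on port 4444 by default
# If Firefox cannot be launched, the "*firefox" launcher can also be given an
# explicit binary path, e.g. "*firefox /usr/bin/firefox".
from selenium import selenium

sel = selenium("localhost", 4444, "*firefox", "https://www.linkedin.com")
sel.start()              # this is the call that fails when the server or browser launcher is misconfigured
sel.open("/")
print sel.get_title()    # Python 2 print, to match the spider above
sel.stop()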
Introduction
Since my crawler is more or less finished, I need to redo a crawler which only crawls the whole domain for links; I need this for my work.
The spider which crawls every link should run once per month.
I'm running Scrapy 2.4.0 and my OS is Ubuntu Server 18.04 LTS.
Problem
The website I have to crawl changed its privacy settings, so you have to be logged in before you can see the products, which is why my "linkcrawler" won't work anymore.
I already managed to log in and scrape all my stuff, but the start_urls were given in a CSV file.
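For reference, seeding the start requests from a CSV file can be sketched roughly as below; this is not the original project's code, and the file name start_urls.csv and the url column are assumed names.

# Hypothetical sketch of seeding requests from a CSV file
# ('start_urls.csv' and the 'url' column are assumed names).
import csv

import scrapy


class CsvSeedSpider(scrapy.Spider):
    name = 'csvseed'

    def start_requests(self):
        with open('start_urls.csv', newline='') as f:
            for row in csv.DictReader(f):
                yield scrapy.Request(row['url'], callback=self.parse)

    def parse(self, response):
        self.logger.info('Visited %s', response.url)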
Code
import scrapy
from ..items import DuifItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import FormRequest, Request
from scrapy_splash import SplashRequest


class DuifLinkSpider(CrawlSpider):
    name = 'duiflink'
    allowed_domains = ['duif.nl']
    login_page = 'https://www.duif.nl/login'
    start_urls = ['https://www.duif.nl']
    custom_settings = {'FEED_EXPORT_FIELDS': ['Link']}

    def start_requests(self):
        yield SplashRequest(
            url=self.login_page,
            callback=self.parse_login,
            args={'wait': 3},
            dont_filter=True
        )

    rules = (
        Rule(LinkExtractor(deny='https://www.duif.nl/nl/'), callback='parse_login', follow=True),
    )

    def parse_login(self, response):
        return FormRequest.from_response(
            response,
            formid='login-form',
            formdata={
                'username': 'not real',
                'password': 'login data'},
            clickdata={'type': 'submit'},
            callback=self.after_login)

    def after_login(self, response):
        accview = response.xpath('//ul[@class="nav navbar-nav navbar-secondary navbar-right"]//a/@href')[13]
        if accview:
            print('success')
        else:
            print(':(')

        for url in self.start_urls:
            yield response.follow(url=url, callback=self.search_links)

    def search_links(self, response):
        link = response.xpath('//ul[@class="nav navbar-nav navbar-secondary navbar-right"]/li/a/@href').get()
        for a in link:
            link = response.url
            yield response.follow(url=link, callback=self.parse_page)

    def parse_page(self, response):
        productpage = response.xpath('//div[@class="product-details col-md-12"]')
        if not productpage:
            print('No productlink', response.url)

        for a in productpage:
            items = DuifItem()
            items['Link'] = response.url
            yield items
Unfortunately I can't provide a dummy account where you can try the login yourself, because it's a B2B-service website.
I can imagine that my def search_links is wrong.
My planned structure is:
visit login_page, pass my login credentials
check via XPath whether I am logged in, by testing whether the logout button is present (a sketch of this check follows the list)
if logged in, it prints 'success'
starting from an XPath expression, it should then follow links:
when visiting every link, it should check via an XPath expression whether a specific container is present, so it knows whether the page is a product page or not
if it is a product page, save the visited link; if not, take the next link
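A rough sketch of that login check, written against the presence of a logout link rather than the hard-coded index [13]; the assumption that the logout link's href contains "logout" would need to be verified against the real page markup.

    # Illustration only: treat the session as logged in if any logout link exists.
    # The 'logout' substring in the href is an assumed marker, not taken from the site.
    def after_login(self, response):
        logged_in = bool(response.xpath('//a[contains(@href, "logout")]'))
        if logged_in:
            self.logger.info('Logged in, starting the link crawl')
            for url in self.start_urls:
                yield response.follow(url, callback=self.search_links)
        else:
            self.logger.warning('Login appears to have failed')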
Console output
As you can see, the authentication is working, but it won't do anything afterwards.
Update
I reworked my code a bit:
import scrapy
from ..items import DuifItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import FormRequest, Request
from scrapy_splash import SplashRequest


class DuifLinkSpider(CrawlSpider):
    name = 'duiflink'
    allowed_domains = ['duif.nl']
    login_page = 'https://www.duif.nl/login'
    start_urls = ['https://www.duif.nl/']
    custom_settings = {'FEED_EXPORT_FIELDS': ['Link']}

    def start_requests(self):
        yield SplashRequest(
            url=self.login_page,
            callback=self.parse_login,
            args={'wait': 3},
            dont_filter=True
        )

    rules = (
        Rule(LinkExtractor(), callback='parse_login', follow=True),
    )

    def parse_login(self, response):
        return FormRequest.from_response(
            response,
            formid='login-form',
            formdata={
                'username': 'not real',
                'password': 'login data'},
            clickdata={'type': 'submit'},
            callback=self.after_login)

    def after_login(self, response):
        accview = response.xpath('//ul[@class="nav navbar-nav navbar-secondary navbar-right"]//a/@href')[13]
        if accview:
            print('success')
        else:
            print(':(')

        for url in self.start_urls:
            yield response.follow(url=url, callback=self.search_links, dont_filter=True)

    def search_links(self, response):
        # link = response.xpath('//ul[@class="nav navbar-nav navbar-secondary navbar-right"]/li/a/@href')
        link = response.xpath('//a/@href')
        for a in link:
            link = a.get()
            link = 'https://www.duif.nl' + link if link else link
            yield response.follow(url=link, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        productpage = response.xpath('//div[@class="product-details col-md-12"]')
        if not productpage:
            print('No productlink', response.url)

        for a in productpage:
            items = DuifItem()
            items['Link'] = response.url
            yield items
Now I know that I am definitely logged in, but it doesn't follow the "sub"-links. I thought that if I used response.xpath('//a/@href'), it would automatically search the whole DOM for every link.
Below is my new console output.
After you log in, you go back to parsing your start URL. Scrapy filters out duplicate requests by default, so in your case the crawl stops there. You can avoid this by passing dont_filter=True in your request, like this:
yield response.follow(url=url, callback=self.search_links, dont_filter=True)
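If many requests in a crawl legitimately need to revisit the same URLs, the duplicate filter can also be disabled for the whole project instead; this is a standard Scrapy setting, shown here only as a heavier-handed alternative to per-request dont_filter.

# settings.py: swap in the no-op duplicate filter so no request is ever
# skipped as a duplicate (heavier-handed than per-request dont_filter).
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'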
I have the code below to log in and scrape a given URL, but the login is never attempted; it moves between the login and forgot-password screens. I tried passing a login cookie, with no luck. I am not sure whether FormRequest.from_response ever worked for anyone. Please help.
import scrapy
from scrapy.selector import Selector


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    handle_httpstatus_list = [401]
    start_urls = ['https://xyz/login']

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'pwd='},
            callback=self.after_login
        )

    def after_login(self, response):
        # check login succeeded before going on
        if "authentication failed" in response.body:
            self.logger.error("Login failed")
            return

        for quote in response.xpath('//select'):
            yield {
                'url': response.url,
                'text': quote.xpath('option::text').extract(),
            }

        for next_page in response.xpath('//a/@href').extract():
            if next_page is not None:
                yield response.follow(next_page, self.after_login)
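One thing worth checking: when the page also contains a search or forgot-password form, FormRequest.from_response may be submitting the wrong one by default. It accepts formid, formname, formnumber, and formxpath arguments to target a specific form. A sketch of the parse callback follows; 'loginForm' is only a placeholder id, not taken from the real page.

    # Sketch only: 'loginForm' is a placeholder; the real id/name/position of the
    # login form would need to be read from the page source.
    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formid='loginForm',   # or formname=..., formnumber=..., formxpath=...
            formdata={'username': 'user', 'password': 'pwd='},
            callback=self.after_login,
        )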
I'm using a Scrapy crawler to extract some details like username, upvotes, join date, etc.
I'm using XPath to extract the contents from each user's webpage.
Code:
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.spiders import BaseSpider
from scrapy.http import FormRequest
from loginform import fill_login_form
from scrapy.selector import Selector
from scrapy.http import HtmlResponse


class UserSpider(scrapy.Spider):
    name = 'userspider'
    start_urls = ['http://forum.nafc.org/login/']

    # Getting the list of usernames
    user_names = ['Bob', 'Tom']  # List of usernames

    def __init__(self, *args, **kwargs):
        super(UserSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        return [FormRequest.from_response(response,
                    formdata={'registerUserName': 'user', 'registerPass': 'password'},
                    callback=self.after_main_login)]

    def after_main_login(self, response):
        for user in self.user_names:
            user_url = 'profile/' + user
            yield response.follow(user_url, callback=self.parse_user_pages)

    def parse_user_pages(self, response):
        yield {
            "USERNAME": response.xpath('//div[contains(@class, "main") and contains(@class, "no-sky-main")]/h1[contains(@class, "thread-title")]/text()').extract_first()
            "UPVOTES": response.xpath('//div[contains(@class, "proUserInfoLabelLeft") and @id="proVotesCap"]/text()').extract()[0]
        }


if __name__ == "__main__":
    spider = UserSpider()
The error looks like this.
P.S. I have manually checked the syntax of my XPath expressions in the Scrapy shell and they were working fine.
Is there anything I'm not noticing in the code?
You're missing a , after your first dict element:
{"USERNAME": response.xpath(...).extract_first(),
"UPVOTES": response.xpath(...).extract()[0]}
I've just started to dabble in Python with the purpose of scraping data from a website. I found a Scrapy tutorial on website authentication and had a plan, but unfortunately it doesn't do what it says on the tin: the login page doesn't seem to get populated with the username or password. I hoped someone could take a look at the code, point out where I am going wrong, and offer some help. Here is the code:
from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spiders import Rule


class controlantSpider(InitSpider):
    name = 'controlant'
    allowed_domains = ['controlant.com']
    login_page = 'https://grp.controlant.com/user/login?redirect=%2f'
    start_urls = ['https://grp.controlant.com/group',
                  'https://grp.controlant.com/webforms/Admin/Overview.aspx']

    rules = (
        Rule(SgmlLinkExtractor(allow=r'-\w+.html$'),
             callback='parse_item', follow=True),
    )

    def init_request(self):
        """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Generate a login request."""
        return FormRequest.from_response(response,
            formdata={'username': 'username', 'password': 'password'},
            callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        if "Hi wessex@alliance" in response.body:
            self.log("Successfully logged in. Let's start crawling!")
            # Now the crawling can begin..
            self.initialized()
        else:
            self.log("Bad times :(")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse_item(self, response):
        filename = response.url.split("/")[-2] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
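A quick way to see why the fields are not being populated is to list the form's real input names in the Scrapy shell and compare them with the keys passed to formdata; the snippet below is generic, not specific to this site, though WebForms pages like Overview.aspx often use prefixed field names rather than plain 'username'/'password'.

# Sketch: run `scrapy shell "https://grp.controlant.com/user/login?redirect=%2f"`,
# then inspect the login form's actual field names and action URL.
response.xpath('//form//input/@name').extract()
response.xpath('//form/@action').extract()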
I tried to use Scrapy to complete the login and collect my project commit count. Here is the code.
from scrapy.item import Item, Field
from scrapy.http import FormRequest
from scrapy.spider import Spider
from scrapy.utils.response import open_in_browser


class GitSpider(Spider):
    name = "github"
    allowed_domains = ["github.com"]
    start_urls = ["https://www.github.com/login"]

    def parse(self, response):
        formdata = {'login': 'username',
                    'password': 'password'}
        yield FormRequest.from_response(response,
                                        formdata=formdata,
                                        clickdata={'name': 'commit'},
                                        callback=self.parse1)

    def parse1(self, response):
        open_in_browser(response)
After running the code
scrapy runspider github.py
It should show me the result page of the form submission, which should be a failed login on the same page since the username and password are fake. However, it shows me the search page. The log file is located on pastebin.
How should the code be fixed? Thanks in advance.
Your problem is that FormRequest.from_response() uses a different form, the "search form", but you wanted it to use the "log in form" instead. Provide a formnumber argument:
yield FormRequest.from_response(response,
                                formnumber=1,
                                formdata=formdata,
                                clickdata={'name': 'commit'},
                                callback=self.parse1)
Here is what I see opened in the browser after applying the change (used "fake" user):
Solution using webdriver.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time

from scrapy.contrib.spiders import CrawlSpider


class GitSpider(CrawlSpider):
    name = "gitscrape"
    allowed_domains = ["github.com"]
    start_urls = ["https://www.github.com/login"]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        login_form = self.driver.find_element_by_name('login')
        password_form = self.driver.find_element_by_name('password')
        commit = self.driver.find_element_by_name('commit')
        login_form.send_keys("yourlogin")
        password_form.send_keys("yourpassword")
        actions = ActionChains(self.driver)
        actions.click(commit)
        actions.perform()
        # by this point you are logged in to GitHub and have access
        # to all data in the main menu
        time.sleep(3)
        self.driver.close()
Using the "formname" argument also works:
yield FormRequest.from_response(response,
                                formname='Login',
                                formdata=formdata,
                                clickdata={'name': 'commit'},
                                callback=self.parse1)