Scrapy help scraping after logging into a page - python

I am trying to scrape a table that comes after a login page using scrapy. The login page is http://subscribers.footballguys.com/amember/login.php, and the webpage I am trying to scrape is https://subscribers.footballguys.com/myfbg/myweeklycheatsheet.php.
I have tried to follow the tutorials from scrapy's documentation as well as here, but I am not getting any responses back (not even the "hello world"). Below is my code. I can also provide any other information needed. Thank you in advance!
import scrapy

class FbgQbSpider(scrapy.Spider):
    name = 'fbg_qb'
    allowed_domains = ['www.footballguys.com/']
    start_urls = ['http://subscribers.footballguys.com/amember/login.php']

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'amember_login': 'example@gmail.com', 'amember_pass': 'examplepassword'},
            callback=self.after_login
        )

    def after_login(self, response):
        # check login success before going on
        View(response)
        if "authentication failed" in response.body:
            self.logger.error("Login failed")
            return
        fetch("https://subscribers.footballguys.com/myfbg/myweeklycheatsheet.php")
        players = response.css("span::text").extract()
        for item in zip(players):
            scraped_info = {
                'player': item[0]
            }
            yield scraped_info
        print("hello world")

"hello world" is not printing because of an indentation issue.
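For reference, the fetch() and View() calls here are Scrapy-shell-style helpers that aren't available inside a spider; in a callback you chain a new scrapy.Request instead. A minimal sketch of what after_login could look like once those calls are removed (parse_cheatsheet is a made-up callback name, the CSS selector is carried over from the question, and the failure check is switched to a bytes literal because response.body is bytes):

def after_login(self, response):
    # response.body is bytes, so compare against a bytes literal
    if b"authentication failed" in response.body:
        self.logger.error("Login failed")
        return
    # chain a request to the page we actually want to scrape
    yield scrapy.Request(
        "https://subscribers.footballguys.com/myfbg/myweeklycheatsheet.php",
        callback=self.parse_cheatsheet,
    )

def parse_cheatsheet(self, response):
    # selector carried over from the question; adjust it to the real page
    for player in response.css("span::text").extract():
        yield {'player': player}
    print("hello world")  # now runs once per scraped page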

Related

Scrapy to login and then grab data from Weibo

I am still trying to use Scrapy to collect data from pages on Weibo which need to be logged in to access.
I now understand that I need to use Scrapy FormRequests to get the login cookie. I have updated my Spider to try to make it do this, but it still isn't working.
Can anybody tell me what I am doing wrong?
import scrapy

class LoginSpider(scrapy.Spider):
    name = 'WB'

    def start_requests(self):
        return [
            scrapy.Request("https://www.weibo.com/u/2247704362/home?wvr=5&lf=reg", callback=self.parse_item)
        ]

    def parse_item(self, response):
        return scrapy.FormRequest.from_response(response, formdata={'user': 'user', 'pass': 'pass'}, callback=self.parse)

    def parse(self, response):
        print(response.body)
When I run this spider, Scrapy redirects from the URL under start_requests and then returns the following error:
ValueError: No element found in <200 https://passport.weibo.com/visitor/visitor?entry=miniblog&a=enter&url=https%3A%2F%2Fweibo.com%2Fu%2F2247704362%2Fhome%3Fwvr%3D5%26lf%3Dreg&domain=.weibo.com&ua=php-sso_sdk_client-0.6.28&_rand=1585243156.3952>
Does that mean I need to get the spider to look for something other than form data in the original page? How do I tell it to look for the cookie?
I have also tried a spider like the one below, based on this post.
import scrapy

class LoginSpider(scrapy.Spider):
    name = 'WB'
    login_url = "https://www.weibo.com/overseas"
    test_url = 'https://www.weibo.com/u/2247704362/'

    def start_requests(self):
        yield scrapy.Request(url=self.login_url, callback=self.parse_login)

    def parse_login(self, response):
        return scrapy.FormRequest.from_response(response, formid="W_login_form", formdata={"loginname": "XXXXX", "password": "XXXXX"}, callback=self.start_crawl)

    def start_crawl(self, response):
        yield Request(self.test_url, callback=self.parse_item)

    def parse_item(self, response):
        print("Test URL " + response.url)
But it still doesn't work, giving the error:
ValueError: No element found in <200 https://www.weibo.com/overseas>
I would really appreciate any help anybody can offer, as this is kind of beyond my range of knowledge.
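For background, the ValueError: No element found comes from FormRequest.from_response: it means the response it was handed contains no <form> element at all, which is what both error messages above are saying about the pages the spider actually landed on. If you already have a working login in a normal browser, one option is to copy its session cookies and pass them to Scrapy via the cookies argument of scrapy.Request. A minimal sketch, with placeholder cookie names and values rather than Weibo's real ones:

import scrapy

class WeiboCookieSpider(scrapy.Spider):
    # hypothetical spider for illustration only
    name = 'wb_cookies'

    def start_requests(self):
        # placeholders: copy the real name/value pairs from a logged-in
        # browser session (dev tools -> storage -> cookies)
        cookies = {'session_cookie_name': 'session_cookie_value'}
        yield scrapy.Request(
            'https://www.weibo.com/u/2247704362/',
            cookies=cookies,
            callback=self.parse_item,
        )

    def parse_item(self, response):
        print(response.url, response.status)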

Scrapy login authentication not working

I have just started playing around with scrapy. I am trying to crawl a website that requires a login. I got it working just fine for GitHub: I found the form id, added the required fields, and everything went as planned.
However, when I tried the same on the Investopedia website, I ran into trouble. I am attaching the code.
class Investo_spider(InitSpider):
    name = 'investo_spider'
    allowed_domains = ['investopedia.com']
    login_page = 'http://www.investopedia.com/accounts/login.aspx'
    start_urls = ['http://www.investopedia.com']

    def init_request(self):
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        return FormRequest.from_response(response,
                                         formdata={'email': 'mymail', 'password': 'mypass'},
                                         callback=self.check_login_response)

    def check_login_response(self, response):
        if "myname" in response.body:
            self.log("Successfully logged in. Let's start crawling!")
            self.initialized()
        else:
            self.log("Login was unsuccessful")

    def parse_item(self, response):
        print 'I got in here, finally!!!!'
        pass
I have tried adding formnumber=0 and clickdata={'nr': 0} and changing the method (POST or GET), although the defaults were already selecting the right form and clickable element.
Surprisingly, I got it working with a mechanize browser using the same parameters. I can convert the HTML to an HtmlResponse object that Scrapy can process.
br = mechanize.Browser()
br.open("http://www.investopedia.com/accounts/login.aspx")
br.select_form(nr=0)
br.form["email"] = 'mymail'
br.form["password"] = 'mypass'
br.submit()
br.open('http://www.investopedia.com')
response = HtmlResponse(url="some_url", body=br.response().read())
However, this would mean I would have to carry the mechanize browser around, which I assume is not the best of solutions. I think I might be missing something. I would really appreciate your input on this. Thanks!
You would have to handle the redirection yourself: post directly to the login endpoint, stop Scrapy from following the redirect, and treat the 302 response as success. This will work for you:
class Investo_spider(scrapy.Spider):
    name = 'investo_spider'
    allowed_domains = ['investopedia.com']
    login_page = 'http://www.investopedia.com/accounts/login.aspx'
    start_urls = ['http://www.investopedia.com']

    def init_request(self):
        return scrapy.Request(url=self.login_page, callback=self.login)

    def parse(self, response):
        return scrapy.FormRequest('http://www.investopedia.com/accounts/login.aspx',
                                  formdata={'email': 'you_email', 'password': 'your_password',
                                            'form_build_id': 'form-v14V92zFkSSVFSerfvWyH1WEUoxrV2khjfhAETJZydk',
                                            'form_id': 'account_api_form',
                                            'op': 'Sign in'},
                                  meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
                                  callback=self.check_login_response)

    def check_login_response(self, response):
        return scrapy.Request('http://www.investopedia.com/accounts/manageprofile.aspx', self.validate_login)

    def validate_login(self, response):
        if "myname" in response.body:
            self.log("Successfully logged in. Let's start crawling!")
            self.initialized()
        else:
            self.log("Login was unsuccessful")

    def parse_item(self, response):
        print 'I got in here, finally!!!!'
        pass

Scrapy crawling stackoverflow questions matching multiple tags

I am trying out scrapy now. I tried the example code on the http://doc.scrapy.org/en/1.0/intro/overview.html page. I tried extracting the recent questions with the tag 'bigdata'. Everything worked well. But when I tried to extract questions with both the tags 'bigdata' and 'python', the results were not correct: only questions with the 'bigdata' tag came back. In the browser, however, I get questions with both tags correctly. Please find the code below:
import scrapy

class StackOverflowSpider(scrapy.Spider):
    name = 'stackoverflow'
    start_urls = ['https://stackoverflow.com/questions/tagged/bigdata?page=1&sort=newest&pagesize=50']

    def parse(self, response):
        for href in response.css('.question-summary h3 a::attr(href)'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        yield {
            'title': response.css('h1 a::text').extract()[0],
            'votes': response.css('.question .vote-count-post::text').extract()[0],
            'body': response.css('.question .post-text').extract()[0],
            'tags': response.css('.question .post-tag::text').extract(),
            'link': response.url,
        }
When I change start_urls to
start_urls = ['https://stackoverflow.com/questions/tagged/bigdata+python?page=1&sort=newest&pagesize=50']
the results contain only questions with the 'bigdata' tag. How do I get questions with both tags only?
Edit: I think what is happening is that scrapy is going into pages with the tag 'bigdata' from the main page I gave, because the tags are links to the main page for that tag. How can I edit this code to make scrapy not go into the tag pages and only into the questions on that page? I tried using rules like the one below, but the results were still not right.
rules = (Rule(LinkExtractor(restrict_css='.question-summary h3 a::attr(href)'), callback='parse_question'),)
The url you have (as well as the initial css rules) is correct; or more simply:
start_urls = ['https://stackoverflow.com/questions/tagged/python+bigdata']
Extrapolating from this, this will also work:
start_urls = ['https://stackoverflow.com/questions/tagged/bigdata%20python']
The issue you are running into however, is that stackoverflow appears to require you to be logged in to access the multiple tag search feature. To see this, simply log out of your stackoverflow session and try the same url in your browser. It will redirect you to a page of results for the first of the two tags only.
TL;DR the only way to get the multiple tags feature appears to be logging in (enforced via session cookies)
Thus, when using scrapy, the fix is to authenticate the session (log in) before doing anything else and then proceed to parse as normal; everything then works. To do this, you can use an InitSpider instead of a Spider and add the appropriate login methods. Assuming you log in with StackOverflow directly (as opposed to through Google or the like), I was able to get it working as expected like this:
import scrapy
import getpass
from scrapy.spiders.init import InitSpider

class StackOverflowSpider(InitSpider):
    name = 'stackoverflow'
    login_page = 'https://stackoverflow.com/users/login'
    start_urls = ['https://stackoverflow.com/questions/tagged/bigdata+python']

    def parse(self, response):
        ...

    def parse_question(self, response):
        ...

    def init_request(self):
        return scrapy.Request(url=self.login_page, callback=self.login)

    def login(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formdata={'email': 'yourEmailHere@foobar.com',
                                                          'password': getpass.getpass()},
                                                callback=self.check_login_response)

    def check_login_response(self, response):
        if b"/users/logout" in response.body:
            self.log("Successfully logged in")
            return self.initialized()
        else:
            self.log("Failed login")

Why doesn't this FormRequest log me in?

Complete Python newb here so I may be asking something painfully obvious, but I've searched through this site, the Scrapy docs, and Google and I'm completely stuck on this problem.
Essentially, I want to use Scrapy's FormRequest to log me in to a site so that I can scrape and save some stats from various pages. The issue is that the response I receive from the site after submitting the form just returns me to the home page (without any login error notifications in the response body). I'm not sure how I am botching this login process. Although it is a pop-up login form, I don't think that should be an issue, since with Firebug I can extract the relevant HTML (and XPath) for the form embedded in the webpage.
Thanks for any help. The code is pasted below (I replaced my actual username and password):
# -*- coding: utf-8 -*-
import scrapy

class dkspider(scrapy.Spider):
    name = "dkspider"
    allowed_domains = ["draftkings.com"]
    start_urls = ['https://www.draftkings.com/contest-lobby']

    def parse(self, response):
        return scrapy.http.FormRequest.from_response(response,
                                                     formxpath='//*[@id="login_form"]',
                                                     formdata={'username': 'myusername', 'password': 'mypass'},
                                                     callback=self.started)

    def started(self, response):
        filename = 'attempt1.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        if 'failed' in response.body:
            print 'Errors!'
        else:
            print 'Success'
Seems like your parameters don't match (the field is called login instead of username) and you are missing some of them in your formdata. Firebug shows that the login POST actually carries login, password, profillingSessionId, returnUrl and layoutType.
It seems layoutType and returnUrl can just be hardcoded, but profillingSessionId needs to be retrieved from the page source; checking the source, it sits in an input with the id tmxSessionId.
So your spider should look something like this:
def parse(self, response):
    return FormRequest(
        url='https://www.draftkings.com/account/login',
        formdata={'login': 'login',  # login instead of username
                  'password': 'password',
                  'profillingSessionId': ''.join(
                      response.xpath("//input[@id='tmxSessionId']/@value").extract()),
                  'returnUrl': '',
                  'layoutType': '2'},
        callback=self.started)

def started(self, response):
    # Reload the landing page
    return Request(self.start_urls[0], self.logged_in)

def logged_in(self, response):
    # logged in page here
    pass
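One thing the snippet leaves implicit: FormRequest and Request here are Scrapy's own classes, so the spider module needs an import along these lines at the top:

from scrapy.http import FormRequest, Request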

Scrapy with selenium for a webpage requiring authentication

I am trying to scrape data from a page which has a lot of AJAX calls and JavaScript execution to render the webpage, so I am trying to use scrapy with selenium to do this. The modus operandi is as follows:
Add the login page URL to the scrapy start_urls list.
Use the FormRequest.from_response method to post the username and password to get authenticated.
Once logged in, request the desired page to be scraped.
Pass this response to the Selenium WebDriver to click buttons on the page.
Once the buttons are clicked and a new webpage is rendered, capture the result.
The code that I have thus far is as follows:
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest, Request
from selenium import webdriver
import time

class LoginSpider(BaseSpider):
    name = "sel_spid"
    start_urls = ["http://www.example.com/login.aspx"]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        return FormRequest.from_response(response,
                                         formdata={'User': 'username', 'Pass': 'password'},
                                         callback=self.check_login_response)

    def check_login_response(self, response):
        if "Log Out" in response.body:
            self.log("Successfully logged in")
            scrape_url = "http://www.example.com/authen_handler.aspx?SearchString=DWT+%3E%3d+500"
            yield Request(url=scrape_url, callback=self.parse_page)
        else:
            self.log("Bad credentials")

    def parse_page(self, response):
        self.driver.get(response.url)
        next = self.driver.find_element_by_class_name('dxWeb_pNext')
        next.click()
        time.sleep(2)
        # capture the html and store in a file
The two roadblocks I have hit so far are:
Step 4 does not work. Whenever selenium opens the Firefox window, it is always at the login screen and does not know how to get past it.
I don't know how to achieve step 5.
Any help will be greatly appreciated.
I don't believe you can switch between scrapy requests and selenium like that. You need to log into the site using selenium, not yield Request(). The login session you created with scrapy is not transferred to the selenium session. Here is an example (the element IDs/XPath will be different for you):
scrape_url = "http://www.example.com/authen_handler.aspx"
driver.get(scrape_url)
time.sleep(2)
username = self.driver.find_element_by_id("User")
password = self.driver.find_element_by_name("Pass")
username.send_keys("your_username")
password.send_keys("your_password")
self.driver.find_element_by_xpath("//input[#name='commit']").click()
then you can do:
time.sleep(2)
next = self.driver.find_element_by_class_name('dxWeb_pNext').click()
time.sleep(2)
etc.
EDIT: If you need to render javascript and are worried about speed/non-blocking, you can use http://splash.readthedocs.org/en/latest/index.html which should do the trick.
http://splash.readthedocs.org/en/latest/scripting-ref.html#splash-add-cookie has details on passing a cookie, you should be able to pass it from scrapy, but I have not done it before.
Log in with the scrapy API first:
# call scrapy post request with after_login as callback
return FormRequest.from_response(
    response,
    # formxpath=formxpath,
    formdata=formdata,
    callback=self.browse_files
)
Pass the session to the selenium chrome driver:
# logged in previously with scrapy api
def browse_files(self, response):
    print "browse files for: %s" % (response.url)
    # response.headers
    cookie_list2 = response.headers.getlist('Set-Cookie')
    print cookie_list2
    self.driver.get(response.url)
    self.driver.delete_all_cookies()
    # extract all the cookies
    for cookie2 in cookie_list2:
        cookies = map(lambda e: e.strip(), cookie2.split(";"))
        for cookie in cookies:
            splitted = cookie.split("=")
            if len(splitted) == 2:
                name = splitted[0]
                value = splitted[1]
                # for my particular usecase I needed only these values
                if name == 'csrftoken' or name == 'sessionid':
                    cookie_map = {"name": name, "value": value}
                else:
                    continue
            elif len(splitted) == 1:
                cookie_map = {"name": splitted[0], "value": ''}
            else:
                continue
            print "adding cookie"
            print cookie_map
            self.driver.add_cookie(cookie_map)
    self.driver.get(response.url)
    # check if we have successfully logged in
    files = self.wait_for_elements_to_be_present(By.XPATH, "//*[@id='files']", response)
    print files
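wait_for_elements_to_be_present is the answer author's own helper and its body isn't shown; here is a minimal sketch of what it could look like, assuming Selenium's WebDriverWait and expected_conditions (the 10-second timeout is arbitrary, and response is accepted only to match the call site above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_elements_to_be_present(self, by, locator, response, timeout=10):
    # block until at least one matching element is in the DOM,
    # then return all matches
    WebDriverWait(self.driver, timeout).until(
        EC.presence_of_element_located((by, locator))
    )
    return self.driver.find_elements(by, locator)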
