Scrapy crawler not processing XHR Request - python

My spider is only crawling the first 10 pages, so I am assuming it is not entering the load more button though the Request.
I am scraping this website: http://www.t3.com/reviews.
My spider code:
import scrapy
from scrapy.conf import settings
from scrapy.http import Request
from scrapy.selector import Selector
from reviews.items import ReviewItem
class T3Spider(scrapy.Spider):
name = "t3" #spider name to call in terminal
allowed_domains = ['t3.com'] #the domain where the spider is allowed to crawl
start_urls = ['http://www.t3.com/reviews'] #url from which the spider will start crawling
def parse(self, response):
sel = Selector(response)
review_links = sel.xpath('//div[#id="content"]//div/div/a/#href').extract()
for link in review_links:
yield Request(url="http://www.t3.com"+link, callback=self.parse_review)
#if there is a load-more button:
if sel.xpath('//*[#class="load-more"]'):
req = Request(url=r'http://www\.t3\.com/more/reviews/latest/\d+', headers = {"Referer": "http://www.t3.com/reviews", "X-Requested-With": "XMLHttpRequest"}, callback=self.parse)
yield req
else:
return
def parse_review(self, response):
pass #all my scraped item fields
What I am doing wrong? Sorry but I am quite new to scrapy. Thanks for your time, patience and help.

If you inspect the "Load More" button, you would not find any indication of how the link to load more reviews is constructed. The idea behind is rather easy - the numbers after http://www.t3.com/more/reviews/latest/ suspiciously look like a timestamp of the last loaded article. Here is how you can get it:
import calendar
from dateutil.parser import parse
import scrapy
from scrapy.http import Request
class T3Spider(scrapy.Spider):
name = "t3"
allowed_domains = ['t3.com']
start_urls = ['http://www.t3.com/reviews']
def parse(self, response):
reviews = response.css('div.listingResult')
for review in reviews:
link = review.xpath("a/#href").extract()[0]
yield Request(url="http://www.t3.com" + link, callback=self.parse_review)
# TODO: handle exceptions here
# extract the review date
time = reviews[-1].xpath(".//time/#datetime").extract()[0]
# convert a date into a timestamp
timestamp = calendar.timegm(parse(time).timetuple())
url = 'http://www.t3.com/more/reviews/latest/%d' % timestamp
req = Request(url=url,
headers={"Referer": "http://www.t3.com/reviews", "X-Requested-With": "XMLHttpRequest"},
callback=self.parse)
yield req
def parse_review(self, response):
print response.url
Notes:
this requires dateutil module to be installed
you should recheck the code and make sure you are getting all of the reviews without skipping any of them
you should somehow end this "Load more" thing

Related

Scrapy crawl every link after authentication

Introduction
Since my crawler is more or less finished yet, i need to redo a crawler which only crawls whole domain for links, i need this for my work.
The spider which crawls every link should run once per month.
I'm running scrapy 2.4.0 and my os is Linux Ubuntu server 18.04 lts
Problem
The website which i have to crawl changed their "privacy", so you have to be logged in before you can see the products, which is the reason why my "linkcrawler" wont work anymore.
I already managed to login and scrape all my stuff, but the start_urls where given in a csv file.
Code
import scrapy
from ..items import DuifItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import FormRequest, Request
from scrapy_splash import SplashRequest
class DuifLinkSpider(CrawlSpider):
name = 'duiflink'
allowed_domains = ['duif.nl']
login_page = 'https://www.duif.nl/login'
start_urls = ['https://www.duif.nl']
custom_settings = {'FEED_EXPORT_FIELDS' : ['Link']}
def start_requests(self):
yield SplashRequest(
url=self.login_page,
callback=self.parse_login,
args={'wait': 3},
dont_filter=True
)
rules = (
Rule(LinkExtractor(deny='https://www.duif.nl/nl/'), callback='parse_login', follow=True),
)
def parse_login(self, response):
return FormRequest.from_response(
response,
formid='login-form',
formdata={
'username' : 'not real',
'password' : 'login data'},
clickdata={'type' : 'submit'},
callback=self.after_login)
def after_login(self, response):
accview = response.xpath('//ul[#class="nav navbar-nav navbar-secondary navbar-right"]//a/#href')[13]
if accview:
print('success')
else:
print(':(')
for url in self.start_urls:
yield response.follow(url=url, callback=self.search_links)
def search_links(self, response):
link = response.xpath('//ul[#class="nav navbar-nav navbar-secondary navbar-right"]/li/a/#href').get()
for a in link:
link = response.url
yield response.follow(url=link, callback=self.parse_page)
def parse_page(self, response):
productpage = response.xpath('//div[#class="product-details col-md-12"]')
if not productpage:
print('No productlink', response.url)
for a in productpage:
items = DuifItem()
items['Link'] = response.url
yield items
Unfortunately i cant provide a dummyaccount, where you can try login by yourself, because its a b2b-service website.
I can imagine that my "def search_links" is wrong.
My planned structure is:
visit login_page, pass my login credentials
check if logged in by xpath, where it checks, if the logout button is given or not.
If logged in, it prints 'success'
Given by xpath expression, it should start to follow links by:
by visiting every link, it should check by xpath xpression, if specific container is given or not, so it knows whether its a productpage or not.
if product page, save visited link, if not productpage, take next link
Console output
Like you can see, the authentication is working, but it wont do anything afterwards.
Update
i reworked my code a very bit:
import scrapy
from ..items import DuifItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import FormRequest, Request
from scrapy_splash import SplashRequest
class DuifLinkSpider(CrawlSpider):
name = 'duiflink'
allowed_domains = ['duif.nl']
login_page = 'https://www.duif.nl/login'
start_urls = ['https://www.duif.nl/']
custom_settings = {'FEED_EXPORT_FIELDS' : ['Link']}
def start_requests(self):
yield SplashRequest(
url=self.login_page,
callback=self.parse_login,
args={'wait': 3},
dont_filter=True
)
rules = (
Rule(LinkExtractor(), callback='parse_login', follow=True),
)
def parse_login(self, response):
return FormRequest.from_response(
response,
formid='login-form',
formdata={
'username' : 'not real',
'password' : 'login data'},
clickdata={'type' : 'submit'},
callback=self.after_login)
def after_login(self, response):
accview = response.xpath('//ul[#class="nav navbar-nav navbar-secondary navbar-right"]//a/#href')[13]
if accview:
print('success')
else:
print(':(')
for url in self.start_urls:
yield response.follow(url=url, callback=self.search_links, dont_filter=True)
def search_links(self, response):
# link = response.xpath('//ul[#class="nav navbar-nav navbar-secondary navbar-right"]/li/a/#href')
link = response.xpath('//a/#href')
for a in link:
link = a.get()
link = 'https://www.duif.nl' + link if link else link
yield response.follow(url=link, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
productpage = response.xpath('//div[#class="product-details col-md-12"]')
if not productpage:
print('No productlink', response.url)
for a in productpage:
items = DuifItem()
items['Link'] = response.url
yield items
Now i know, that i am definitely logged in, but it doesnt follow the "sub"-links, but i thought if i use response.xpath('//a/#href') , it will automatically searches the whole dom for every link.
Below my new console output
After you login, you go back to parsing your start url. Scrapy filters out duplicate requests by default, so in your case it stops here. You can avoid this by using 'dont_filter=True' in your request, like this:
yield response.follow(url=url, callback=self.search_links, dont_filter=True)

Scrapy spider outputs empy csv file

This is my first question here and I'm learning how to code by myself so please bear with me.
I'm working on a final CS50 project which I'm trying to built a website that aggregates online Spanish course from edx.org and other open online couses websites maybe. I'm using scrapy framework to scrap the filter results of Spanish courses on edx.org... Here is my first scrapy spider which I'm trying to get in each courses link to then get it's name (after I get the code right, also get the description, course url and more stuff).
from scrapy.item import Field, Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractor import LinkExtractor
from scrapy.loader import ItemLoader
class Course_item(Item):
name = Field()
#description = Field()
#img_url = Field()
class Course_spider(CrawlSpider):
name = 'CourseSpider'
allowed_domains = ['https://www.edx.org/']
start_urls = ['https://www.edx.org/course/?language=Spanish']
rules = (Rule(LinkExtractor(allow=r'/course'), callback='parse_item', follow='True'),)
def parse_item(self, response):
item = ItemLoader(Course_item, response)
item.add_xpath('name', '//*[#id="course-intro-heading"]/text()')
yield item.load_item()
When I run the spider with "scrapy runspider edxSpider.py -o edx.csv -t csv" I get an empty csv file and I also think is not getting into the right spanish courses results.
Basically I want to get in each courses of this link edx Spanish courses and get the name, description, provider, page url and img url.
Any ideas for why might be the problem?
You can't get edx content with a simple request, it uses javascript rendering for getting the course element dynamically, so CrawlSpider won't work on this case, because you need to find specific elements inside the response body to generate a new Request that will get what you need.
The real request (to get the urls of the courses) is this one, but you need to generate it from the previous response body (although you could just visit it an also get the correct data).
So, to generate the real request, you need data that is inside a script tag:
from scrapy import Spider
import re
import json
class Course_spider(Spider):
name = 'CourseSpider'
allowed_domains = ['edx.org']
start_urls = ['https://www.edx.org/course/?language=Spanish']
def parse(self, response):
script_text = response.xpath('//script[contains(text(), "Drupal.settings")]').extract_first()
parseable_json_data = re.search(r'Drupal.settings, ({.+})', script_text).group(1)
json_data = json.loads(parseable_json_data)
...
Now you have what you need on json_data and only need to create the string URL.
This page use JavaScript to get data from server and add to page.
It uses urls like
https://www.edx.org/api/catalog/v2/courses/course-v1:IDBx+IDB33x+3T2017
Last part is course's number which you can find in HTML
<main id="course-info-page" data-course-id="course-v1:IDBx+IDB33x+3T2017">
Code
from scrapy.http import Request
from scrapy.item import Field, Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractor import LinkExtractor
from scrapy.loader import ItemLoader
import json
class Course_spider(CrawlSpider):
name = 'CourseSpider'
allowed_domains = ['www.edx.org']
start_urls = ['https://www.edx.org/course/?language=Spanish']
rules = (Rule(LinkExtractor(allow=r'/course'), callback='parse_item', follow='True'),)
def parse_item(self, response):
print('parse_item url:', response.url)
course_id = response.xpath('//*[#id="course-info-page"]/#data-course-id').extract_first()
if course_id:
url = 'https://www.edx.org/api/catalog/v2/courses/' + course_id
yield Request(url, callback=self.parse_json)
def parse_json(self, response):
print('parse_json url:', response.url)
item = json.loads(response.body)
return item
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
'FEED_FORMAT': 'csv', # csv, json, xml
'FEED_URI': 'output.csv', #
})
c.crawl(Course_spider)
c.start()
from scrapy.http import Request
from scrapy import Spider
import json
class edx_scraper(Spider):
name = "edxScraper"
start_urls = [
'https://www.edx.org/api/v1/catalog/search?selected_facets[]=content_type_exact%3Acourserun&selected_facets[]=language_exact%3ASpanish&page=1&page_size=9&partner=edx&hidden=0&content_type[]=courserun&content_type[]=program&featured_course_ids=course-v1%3AHarvardX+CS50B+Business%2Ccourse-v1%3AMicrosoft+DAT206x+1T2018%2Ccourse-v1%3ALinuxFoundationX+LFS171x+3T2017%2Ccourse-v1%3AHarvardX+HDS2825x+1T2018%2Ccourse-v1%3AMITx+6.00.1x+2T2017_2%2Ccourse-v1%3AWageningenX+NUTR101x+1T2018&featured_programs_uuids=452d5bbb-00a4-4cc9-99d7-d7dd43c2bece%2Cbef7201a-6f97-40ad-ad17-d5ea8be1eec8%2C9b729425-b524-4344-baaa-107abdee62c6%2Cfb8c5b14-f8d2-4ae1-a3ec-c7d4d6363e26%2Ca9cbdeb6-5fc0-44ef-97f7-9ed605a149db%2Cf977e7e8-6376-400f-aec6-84dcdb7e9c73'
]
def parse(self, response):
data = json.loads(response.text)
for course in data['objects']['results']:
url = 'https://www.edx.org/api/catalog/v2/courses/' + course['key']
yield response.follow(url, self.course_parse)
if 'next' in data['objects'] is not None:
yield response.follow(data['objects']['next'], self.parse)
def course_parse(self, response):
course = json.loads(response.text)
yield{
'name': course['title'],
'effort': course['effort'],
}

scrapy crawl multiple pages, extracting data and saving into mysql

Hi can someone help me out I seem to be stuck, I am learning how to crawl and save into mysql us scrapy. I am trying to get scrapy to crawl all of the website pages. Starting with "start_urls", but it does not seem to automatically crawl all of the pages only the one, it does save into mysql with pipelines.py. It does also crawl all pages when provided with urls in a f = open("urls.txt") as well as saves data using pipelines.py.
here is my code
test.py
import scrapy
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from gotp.items import GotPItem
from scrapy.log import *
from gotp.settings import *
from gotp.items import *
class GotP(CrawlSpider):
name = "gotp"
allowed_domains = ["www.craigslist.org"]
start_urls = ["http://sfbay.craigslist.org/search/sss"]
rules = [
Rule(SgmlLinkExtractor(
allow=('')),
callback ="parse",
follow=True
)
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
prices = hxs.select("//div[#class="sliderforward arrow"]")
for price in prices:
item = GotPItem()
item ["price"] = price.select("text()").extract()
yield item
If I understand correctly, you are trying to follow the pagination and extract the results.
In this case, you can avoid using CrawlSpider and use regular Spider class.
The idea would be to parse the first page, extract total results count, calculate how much pages to go and yield scrapy.Request instances to the same URL providing s GET parameter value.
Implementation example:
import scrapy
class GotP(scrapy.Spider):
name = "gotp"
allowed_domains = ["www.sfbay.craigslist.org"]
start_urls = ["http://sfbay.craigslist.org/search/sss"]
results_per_page = 100
def parse(self, response):
total_count = int(response.xpath('//span[#class="totalcount"]/text()').extract()[0])
for page in xrange(0, total_count, self.results_per_page):
yield scrapy.Request("http://sfbay.craigslist.org/search/sss?s=%s&" % page, callback=self.parse_result, dont_filter=True)
def parse_result(self, response):
results = response.xpath("//p[#data-pid]")
for result in results:
try:
print result.xpath(".//span[#class='price']/text()").extract()[0]
except IndexError:
print "Unknown price"
This would follow the pagination and print prices on the console. Hope this is a good starting point.

scrapy crawl spider ajax pagination

I was trying to scrap link which has ajax call for pagination.
I am trying to crawl http://www.demo.com link. and in .py file I provided this code for restrict XPATH and coding is:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import sumSpider, Rule
from scrapy.selector import HtmlXPathSelector
from sum.items import sumItem
class Sumspider1(sumSpider):
name = 'sumDetailsUrls'
allowed_domains = ['sum.com']
start_urls = ['http://www.demo.com']
rules = (
Rule(LinkExtractor(restrict_xpaths='.//ul[#id="pager"]/li[8]/a'), callback='parse_start_url', follow=True),
)
#use parse_start_url if your spider wants to crawl from first page , so overriding
def parse_start_url(self, response):
print '********************************************1**********************************************'
#//div[#class="showMoreCars hide"]/a
#.//ul[#id="pager"]/li[8]/a/#href
self.log('Inside - parse_item %s' % response.url)
hxs = HtmlXPathSelector(response)
item = sumItem()
item['page'] = response.url
title = hxs.xpath('.//h1[#class="page-heading"]/text()').extract()
print '********************************************title**********************************************',title
urls = hxs.xpath('.//a[#id="linkToDetails"]/#href').extract()
print '**********************************************2***url*****************************************',urls
finalurls = []
for url in urls:
print '---------url-------',url
finalurls.append(url)
item['urls'] = finalurls
return item
My items.py file contains
from scrapy.item import Item, Field
class sumItem(Item):
# define the fields for your item here like:
# name = scrapy.Field()
page = Field()
urls = Field()
Still I'm not getting exact output not able to fetch all pages when I am crawling it.
I hope the below code will help.
somespider.py
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
from demo.items import DemoItem
from selenium import webdriver
def removeUnicodes(strData):
if(strData):
strData = strData.encode('utf-8').strip()
strData = re.sub(r'[\n\r\t]',r' ',strData.strip())
return strData
class demoSpider(scrapy.Spider):
name = "domainurls"
allowed_domains = ["domain.com"]
start_urls = ['http://www.domain.com/used/cars-in-trichy/']
def __init__(self):
self.driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub", webdriver.DesiredCapabilities.HTMLUNITWITHJS)
def parse(self, response):
self.driver.get(response.url)
self.driver.implicitly_wait(5)
hxs = Selector(response)
item = DemoItem()
finalurls = []
while True:
next = self.driver.find_element_by_xpath('//div[#class="showMoreCars hide"]/a')
try:
next.click()
# get the data and write it to scrapy items
item['pageurl'] = response.url
item['title'] = removeUnicodes(hxs.xpath('.//h1[#class="page-heading"]/text()').extract()[0])
urls = self.driver.find_elements_by_xpath('.//a[#id="linkToDetails"]')
for url in urls:
url = url.get_attribute("href")
finalurls.append(removeUnicodes(url))
item['urls'] = finalurls
except:
break
self.driver.close()
return item
items.py
from scrapy.item import Item, Field
class DemoItem(Item):
page = Field()
urls = Field()
pageurl = Field()
title = Field()
Note:
You need to have selenium rc server running because HTMLUNITWITHJS works with selenium rc only using Python.
Run your selenium rc server issuing the command :
java -jar selenium-server-standalone-2.44.0.jar
Run your spider using command:
spider crawl domainurls -o someoutput.json
You can check with your browser how the requests are made.
Behind the scene, right after you click on that button "show more cars" your browser will request a JSON data to feed your next page. You can take advantage of this fact and deal directly with the JSON data without the necessity to work with a JavaScript engine as Selenium or PhantomJS.
In your case, as the first step you should simulate an user scrolling down the page given by your start_url parameter and profile at the same time your network requests to discover the endpoint used by the browser to request that JSON. To discover this endpoint in general there is a XHR(XMLHttpRequest) section on the browser's profile tool as here in Safari where you can navigate thought all resources/endpoints used to request the data.
Once you discover this endpoint it's a straightforward task: you give your Spider as start_url the endpoint that you just discovered and according you process and navigate through the JSON's you can discover if it a next page to request.
P.S.: I saw for you that the endpoint url is http://www.carwale.com/webapi/classified/stockfilters/?city=194&kms=0-&year=0-&budget=0-&pn=2
In this case my browser requested the second page, as you can see in the parameter pn. It's is important you set the some header parameters before you send the request. I noticed in your case the headers are:
Accept text/plain, /; q=0.01
Referer http://www.carwale.com/used/cars-in-trichy/
X-Requested-With XMLHttpRequest
sourceid 1
User-Agent Mozilla/5.0...

using scrapy to extract dynamic data - location based on postcodes

I'm new to Scrapy, and with some tutorials I was able to scrape a few simple websites, but I'm facing an issue now with a new website where I have to fill a search form and extract the results. The response I get doesn't have the results.
Let's say for example, for the following site: http://www.beaurepaires.com.au/store-locator/
I want to provide a list of postcodes and extract information about stores in each postcode (store name and address).
I'm using the following code but it's not working, and I'm not sure where to start from.
class BeaurepairesSpider(BaseSpider):
name = "beaurepaires"
allowed_domains = ["http://www.beaurepaires.com.au"]
start_urls = ["http://www.beaurepaires.com.au/store-locator/"]
#start_urls = ["http://www.beaurepaires.com.au/"]
def parse(self, response):
yield FormRequest.from_response(response, formname='frm_dealer_locator', formdata={'dealer_postcode_textfield':'2115'}, callback=self.parseBeaurepaires)
def parseBeaurepaires(self, response):
hxs = HtmlXPathSelector(response)
filename = "postcodetest3.txt"
open(filename, 'wb').write(response.body)
table = hxs.select("//div[#id='jl_results']/table/tbody")
headers = table.select("tr[position()<=1]")
data_rows = table.select("tr[position()>1]")
Thanks!!
The page load here heavily uses javascript and is too complex for Scrapy. Here's an example of what I've come up to:
import re
from scrapy.http import FormRequest, Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
class BeaurepairesSpider(BaseSpider):
name = "beaurepaires"
allowed_domains = ["beaurepaires.com.au", "gdt.rightthere.com.au"]
start_urls = ["http://www.beaurepaires.com.au/store-locator/"]
def parse(self, response):
yield FormRequest.from_response(response, formname='frm_dealer_locator',
formdata={'dealer_postcode_textfield':'2115'},
callback=self.parseBeaurepaires)
def parseBeaurepaires(self, response):
hxs = HtmlXPathSelector(response)
script = str(hxs.select("//div[#id='jl_container']/script[4]/text()").extract()[0])
url, script_name = re.findall(r'LoadScripts\("([a-zA-Z:/\.]+)", "(\w+)"', script)[0]
url = "%s/locator/js/data/%s.js" % (url, script_name)
yield Request(url=url, callback=self.parse_js)
def parse_js(self, response):
print response.body # here are your locations - right, inside the js file
see that regular expressions are used, hardcoded urls, and you'll have to parse js in order to get your locations - too fragile even if you'll finish it and get the locations.
Just switch to in-browser tools like selenium (or combine scrapy with it).

Categories