I made a crawler. Splash is working (I tested it in my browser), but Scrapy can't crawl and extract items.
My actual code is:
# -*- coding: utf-8 -*-
import scrapy
import json
from scrapy.http.headers import Headers
from scrapy.spiders import CrawlSpider, Rule
from oddsportal.items import OddsportalItem
class OddbotSpider(CrawlSpider):
    """Fetch the oddsportal tennis matches page through Splash and dump its body."""

    name = "oddbot"
    allowed_domains = ["oddsportal.com"]
    start_urls = (
        'http://www.oddsportal.com/matches/tennis/',
    )

    def start_requests(self):
        """Yield one request per start URL, routed through Splash.

        The ``splash`` meta key selects the ``render.html`` endpoint and
        passes a ``wait`` argument of 5.5 to it.
        """
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    'args': {'wait': 5.5},
                },
            })

    def parse(self, response):
        """Print the raw body of the response; no item is yielded yet."""
        # Placeholder item: created but not populated or yielded so far.
        item = OddsportalItem()
        # print() call form works on both Python 2 and Python 3.
        print(response.body)
Try importing scrapy_splash and issuing the request through SplashRequest:
from scrapy_splash import SplashRequest
yield SplashRequest(url, endpoint='render.html', args={'any':any})
You should override CrawlSpider's `_requests_to_follow` so that it also accepts Splash responses:
def _requests_to_follow(self, response):
    """Yield the follow-up requests produced by the spider's rules.

    Responses that are neither HTML nor Splash (JSON / text) responses
    produce nothing. For each rule, links already emitted by an earlier
    rule are skipped before the rule's optional ``process_links`` hook
    is applied.
    """
    accepted_types = (HtmlResponse, SplashJsonResponse, SplashTextResponse)
    if not isinstance(response, accepted_types):
        return

    visited = set()
    for index, rule in enumerate(self._rules):
        fresh_links = []
        for candidate in rule.link_extractor.extract_links(response):
            if candidate not in visited:
                fresh_links.append(candidate)

        if fresh_links and rule.process_links:
            fresh_links = rule.process_links(fresh_links)

        for link in fresh_links:
            visited.add(link)
            request = self._build_request(index, link)
            yield rule.process_request(request)
Related
My problem is the following: my spider has just successfully clicked on a button within the function parse_search_page(). In the function parse_identity() I am on the next page, where I can start scraping some information. But the variable "response" is of type SplashJsonResponse, which is not supported by xpath(), and response.body is of type bytes, which is not supported either.
The solutions of my problem that I think can work are:
Convert SplashJsonResponse to SplashTextResponse (which is an html response)
Use xpath on bytes
Convert scrapy_splash.response.SplashJsonResponse to scrapy.http.response.html.HtmlResponse
Code:
import scrapy
from scrapy.utils.response import open_in_browser
from scrapy_splash import SplashRequest
class QuotesSpider(scrapy.Spider):
    """Log in, open the search page, click a button via Splash, then scrape."""

    name = "quotes"
    start_urls = ['https://app.nominations.hospimedia.fr']

    def parse(self, response):
        """Submit the login form found in ``response``.

        ``parse_landing_page`` is called with the page returned after login.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={'user[email]': 'XXX', 'user[password]': 'XXX'},
            callback=self.parse_landing_page
        )

    def parse_landing_page(self, response):
        """Follow the first "l-action l-action--small" link of the landing page."""
        # open webpage after logging in
        # open_in_browser(response)
        print("hello1")
        # TODO: check what extract() is for, i.e. what happens without it
        next_page_partial_url = response.xpath(
            '//div[@class="l-action l-action--small"]/a/@href'
        ).extract_first()
        if next_page_partial_url is None:
            # No such link on the page: nothing to follow.
            self.logger.warning("No search-page link found on %s", response.url)
            return
        # urljoin resolves the partial URL against the current page instead
        # of concatenating it to a hard-coded host string.
        next_page_url = response.urljoin(next_page_partial_url)
        yield scrapy.Request(next_page_url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        """Re-request the current URL through Splash and click a button.

        The Lua script loads the page, clicks the first element with the
        classes "button tertiary", waits briefly and returns the HTML under
        the ``html`` key. The result is handled by ``parse_identity``.
        """
        # if you open the page below you know if your scrapy-splash is working
        # http://localhost:8050/
        script = '''
        function main(splash, args)
            -- fail loudly if the page could not be loaded
            assert(splash:go(splash.args.url))
            splash:runjs('document.getElementsByClassName("button tertiary")[0].click()')
            -- give the click a moment to take effect before snapshotting
            assert(splash:wait(0.5))
            return {
                html = splash:html(),
            }
        end
        '''
        open_in_browser(response)
        print("----------")
        # scrapy.http.response.html.HtmlResponse
        print(type(response))
        print("------------")
        yield SplashRequest(callback=self.parse_identity,
                            endpoint='execute',
                            args={'url': response.request.url,
                                  'lua_source': script}
                            )

    def parse_identity(self, response):
        """Print debugging information and the text of the target ``div``."""
        print("----------------------------------------")
        # scrapy_splash.response.SplashJsonResponse
        print(type(response))
        # <class 'bytes'>
        print(type(response.body))
        print(response.body)
        print(("----------------------------------------"))
        # NOTE(review): the response here is reported to be a
        # SplashJsonResponse; if xpath() is not usable on it, re-wrap
        # response.text in scrapy.http.HtmlResponse first. Confirm against
        # the installed scrapy-splash version.
        next_page_partial_url = response.xpath(
            '//div[@class="medium-6 small-12 columns"]/text()'
        ).extract()
        print(next_page_partial_url)
        print(("----------------------------------------"))
Actually the solution is to use HtmlResponse
# Build a plain HtmlResponse from the URL and text of the Splash response.
from scrapy.http import HtmlResponse
html_response = HtmlResponse(url=response.url, body=response.text, encoding='utf-8')
I'm building a scraper with Scrapy for the Swedish e-commerce site Blocket.se.
It scrapes the first page as it should, but it won't move on to the next one.
The command for next url
response.xpath(u'//a[contains(text(), "Nästa")]/@href').extract()
outputs an "incomplete" link when I try it in Scrapy shell:
?q=cykel&cg=0&w=1&st=s&c=&ca=11&l=0&md=th&o=2
Does it have to be a "full" link to work?:
https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&l=0&md=th&o=2
Starting-url: https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&is=1&l=0&md=th
Full code:
import scrapy
class BlocketSpider(scrapy.Spider):
    """Crawl Blocket search results page by page and scrape each listing."""

    name = "blocket"
    start_urls = ["https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&is=1&l=0&md=th"]

    def parse(self, response):
        """Request every listing on the page, then the next result page."""
        for href in response.css("h1.media-heading > a::attr(href)").extract():
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.parse_details)

        # Follow the pagination link. extract_first() returns a single
        # string (or None); extract() would return a list, which urljoin()
        # cannot handle.
        next_page_url = response.xpath(
            u'//a[contains(text(), "Nästa")]/@href'
        ).extract_first()
        if next_page_url:
            yield scrapy.Request(url=response.urljoin(next_page_url),
                                 callback=self.parse)

    def parse_details(self, response):
        """Yield one dict with the fields scraped from a listing page."""
        yield {
            "Objekt": response.css("h1.h3::text").extract(),
            "Säljare": response.css("li.mrl > strong > a::text").extract(),
            "Uppladdad": response.css("li.mrl > time::text").extract(),
            "Pris": response.css("div.h3::text").extract(),
            "Område": response.css("span.area_label::text").extract(),
            "Bild-URL": response.css("div.item > img::attr(src)").extract(),
        }
Yes, Scrapy usually needs the full URL. You can either keep using urljoin() or use the response.follow() method:
next_page_url = response.xpath(u'//a[contains(text(), "Nästa")]/#href').extract()
if next_page_url:
yield response.follow(url=next_page_url, callback=self.parse)
More about this in Scrapy Tutorial.
I'm trying to scrape a website using scrapy.
When I scrape a specific page, pagination works, but when I try to scrape all the pages in one go, pagination does not work.
I tried creating an extra function for the pagination but this does not fix the problem. All help would be appreciated. What am I doing wrong ? Here's my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader.processors import MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.http import Request
from avtogumi.items import AvtogumiItem
class BasicSpider(scrapy.Spider):
    """Visit every brand link on the start page and load items from its pages."""

    name = 'gumi'
    allowed_domains = ['avtogumi.bg']
    start_urls = ['https://bg.avtogumi.bg/oscommerce/index.php']

    def parse(self, response):
        """Request each brand page linked from the ``brands`` block."""
        for href in response.xpath('//div[@class="brands"]//a/@href').extract():
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.parse_params)

    def parse_params(self, response):
        """Load one item from the whole page, then follow the "next" link."""
        loader = ItemLoader(item=AvtogumiItem(), response=response)
        loader.add_xpath('title', '//h4/a/text()')
        loader.add_xpath('subtitle', '//p[@class="ft-darkgray"]/text()')
        loader.add_xpath('price', '//span[@class="promo-price"]/text()',
                         MapCompose(str.strip, str.title))
        loader.add_xpath('stock', '//div[@class="product-box-stock"]//span/text()')
        # XPath positions are 1-based: a [0] predicate never matches.
        loader.add_xpath('category', '//div[@class="labels hidden-md hidden-lg"][1]//text()')
        loader.add_xpath('brand', '//h4[@class="brand-header"][1]//text()',
                         MapCompose(str.strip, str.title))
        loader.add_xpath('img_path', '//div/img[@class="prod-imglist"]/@src')
        yield loader.load_item()

        next_page_url = response.xpath('//li/a[@class="next"]/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(url=response.urljoin(next_page_url),
                                 callback=self.parse_params)
The issue here is this:
l = ItemLoader(item=AvtogumiItem(), response=response)
l.add_xpath('title', '//h4/a/text()')
l.add_xpath('subtitle', '//p[#class="ft-darkgray"]/text()')
l.add_xpath('price', '//span[#class="promo-price"]/text()',
MapCompose(str.strip, str.title))
l.add_xpath('stock', '//div[#class="product-box-stock"]//span/text()')
l.add_xpath('category', '//div[#class="labels hidden-md hidden-lg"][0]//text()')
l.add_xpath('brand', '//h4[#class="brand-header"][0]//text()',
MapCompose(str.strip, str.title))
l.add_xpath('img_path', '//div/img[#class="prod-imglist"]/#src')
yield l.load_item()
This snippet of code will parse and load exactly one result. If you have a page with multiple results, you would have to put this code inside a for loop and iterate over all the search results you want to parse:
objects = response.xpath('my_selector_here')
for object in objects:
l = ItemLoader(item=AvtogumiItem(), response=response)
l.add_xpath('title', '//h4/a/text()')
l.add_xpath('subtitle', '//p[#class="ft-darkgray"]/text()')
l.add_xpath('price', '//span[#class="promo-price"]/text()',
MapCompose(str.strip, str.title))
l.add_xpath('stock', '//div[#class="product-box-stock"]//span/text()')
l.add_xpath('category', '//div[#class="labels hidden-md hidden-lg"][0]//text()')
l.add_xpath('brand', '//h4[#class="brand-header"][0]//text()',
MapCompose(str.strip, str.title))
l.add_xpath('img_path', '//div/img[#class="prod-imglist"]/#src')
yield l.load_item()
Hope this helps
use/rewrite this code
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
class BasicSpider(scrapy.Spider):
    """Visit every brand page and yield one dict per product box."""

    name = 'gumi'
    allowed_domains = ['avtogumi.bg']
    start_urls = ['https://bg.avtogumi.bg/oscommerce/']

    def parse(self, response):
        """Request each brand page linked from the ``brands`` block."""
        for href in response.xpath('//div[@class="brands"]//a/@href').extract():
            yield Request(url=response.urljoin(href), callback=self.parse_params)

    def parse_params(self, response):
        """Yield every product on the page, then follow the "next" link."""
        boxes = response.xpath('//div[@class="full-product-box search-box"]')
        for box in boxes:
            # All XPaths are relative ('.//') to the current product box.
            yield {
                'title': box.xpath('.//h4/a/text()').extract_first(),
                'subtitle': box.xpath('.//p[@class="ft-darkgray"]/text()').extract_first(),
                'price': box.xpath('.//span[@class="promo-price"]/text()').extract_first(),
                'stock': box.xpath('.//div[@class="product-box-stock"]//span/text()').extract_first(),
                # XPath positions are 1-based: a [0] predicate never matches.
                'category': box.xpath('.//div[@class="labels hidden-md hidden-lg"][1]//text()').extract_first(),
                'brand': box.xpath('.//h4[@class="brand-header"][1]//text()').extract_first(),
                'img_path': box.xpath('.//div/img[@class="prod-imglist"]/@src').extract_first(),
            }

        next_page_url = response.xpath('//li/a[@class="next"]/@href').extract_first()
        if next_page_url:
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield Request(url=response.urljoin(next_page_url),
                          callback=self.parse_params)
13407 items scraped
Up to now I have found how to scrape one page or multiple pages with same url, but changing number. However, I could not find how to scrape pages with subcategories and their subcategories and finally get the content needed.
I am trying to scrape this website: http://www.askislam.org/index.html
I am using Scrapy, but I do not know where to start.
Or you can suggest a better option, I just use python and check from there.
Thanks
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy import Selector
from ask_islam.items import AskIslamItem
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
import re
class AskislamSpider(Spider):
    """Walk the category tree of askislam.org by following category links."""

    name = "askislam"
    allowed_domains = ["askislam.org"]
    start_urls = ['http://www.askislam.org/']

    def parse(self, response):
        """Request every category linked from the start page."""
        return self._category_requests(response)

    def parse_page(self, response):
        """Request every sub-category linked from a category page."""
        return self._category_requests(response)

    def _category_requests(self, response):
        """Yield one request per link found in the ``categories`` block.

        The href attribute of each link is used (not the markup of the
        ``<li>`` element), 'index.html' is stripped from it, and the result
        is resolved against the current page URL.
        """
        hrefs = response.css('div[id="categories"] li a::attr(href)').extract()
        for href in hrefs:
            url = response.urljoin(href.replace('index.html', ''))
            yield Request(url, callback=self.parse_page)
EDIT
def start_requests(self):
    """Start the crawl at the site's home page, handled by ``parse_page``."""
    home_page = Request("http://www.askislam.org", callback=self.parse_page)
    yield home_page
def parse_page(self, response):
    """Request every category of the page twice: to recurse and to scrape.

    For each ``#categories li`` entry with a link, one request goes back
    to ``parse_page`` (to discover sub-categories) and one goes to
    ``parse_categories``, carrying the item in ``meta['item']``.
    """
    for cat in response.css('#categories li'):
        href = cat.css('a::attr(href)').extract_first()
        if href is None:
            # <li> without a link: nothing to follow.
            continue
        link = response.urljoin(href)
        item = AskIslamItem()
        item['catLink'] = link
        logging.info("Scraping Link: %s", link)
        yield Request(link, callback=self.parse_page)
        # Same URL as the request above: without dont_filter=True the
        # duplicate filter would drop it and parse_categories would never run.
        yield Request(link, callback=self.parse_categories,
                      meta={'item': item}, dont_filter=True)
def parse_categories(self, response):
    """Log a marker message for a category page; nothing is extracted yet."""
    marker = "The Cat Url"
    logging.info(marker)
Read the links to those sub-categories from the http://www.askislam.org/index.html page using XPath or CSS selectors, and then issue another Request() for each of them.
EDIT:
import logging
class AskislamSpider(Spider):
    """Recursively follow the category links of askislam.org."""

    name = "askislam"

    def start_requests(self):
        """Start the crawl at the home page, handled by ``parse_page``."""
        yield Request("http://www.askislam.org/", callback=self.parse_page)

    def parse_page(self, response):
        """Request every category linked from the ``#categories`` list."""
        # Iterate over selectors, not over extract()ed strings, so that
        # .css() can still be called on each entry.
        for cat in response.css('#categories li'):
            href = cat.css("a::attr(href)").extract_first()
            if href is None:
                # <li> without a link: nothing to follow.
                continue
            # urljoin avoids a doubled slash when the href is absolute.
            link = response.urljoin(href)
            logging.info("Scraping Link: %s", link)
            yield Request(link, callback=self.parse_page)
I am using Splash 2.0.2 + Scrapy 1.0.5 + Scrapyjs 0.1.1 and I'm still not able to render JavaScript with a click. Here is an example URL: https://olx.pt/anuncio/loja-nova-com-250m2-garagem-em-box-fechada-para-arrumos-IDyTzAT.html#c49d3d94cf
I am still getting the page without the phone number rendered:
class OlxSpider(scrapy.Spider):
    """Crawl olx.pt listings and render each advert through a Splash script."""

    name = "olx"
    rotate_user_agent = True
    allowed_domains = ["olx.pt"]
    start_urls = [
        "https://olx.pt/imoveis/"
    ]

    def parse(self, response):
        """Request every advert via Splash ``execute``, then the next pages.

        The Lua script loads the advert, clicks the second ``span`` inside
        ``#contact_methods``, waits 0.5 and returns the page HTML.
        """
        script = """
        function main(splash)
            -- fail loudly if the page could not be loaded
            assert(splash:go(splash.args.url))
            splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
            assert(splash:wait(0.5))
            return splash:html()
        end
        """
        for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_house_contents, meta={
                'splash': {
                    'args': {'lua_source': script},
                    'endpoint': 'execute',
                }
            })

        for next_page in response.css('.pager .br3.brc8::attr(href)'):
            url = response.urljoin(next_page.extract())
            yield scrapy.Request(url, self.parse)

    def parse_house_contents(self, response):
        """Drop into an ipdb session to inspect the rendered advert."""
        # Debugging aid only: halts the crawl at every advert response.
        import ipdb
        ipdb.set_trace()
How can I get this to work?
Add
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
to Lua script and it will work.
-- Load the page with jQuery auto-loaded, click the second <span> inside
-- #contact_methods, wait briefly and return the resulting HTML.
function main(splash)
    -- autoload registers the script for subsequent page loads, so it has
    -- to be set up before navigating.
    assert(splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js"))
    -- fail loudly if the page could not be loaded
    assert(splash:go(splash.args.url))
    splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
    assert(splash:wait(0.5))
    return splash:html()
end
.click() is JQuery function https://api.jquery.com/click/
You can avoid having to use Splash in the first place and make the appropriate GET request to get the phone number yourself. Working spider:
import json
import re
import scrapy
class OlxSpider(scrapy.Spider):
    """Crawl olx.pt listings and fetch each advert's phone number via AJAX."""

    name = "olx"
    rotate_user_agent = True
    allowed_domains = ["olx.pt"]
    start_urls = [
        "https://olx.pt/imoveis/"
    ]

    def parse(self, response):
        """Request every advert on the listing page, then the next pages."""
        for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_house_contents)

        for next_page in response.css('.pager .br3.brc8::attr(href)'):
            url = response.urljoin(next_page.extract())
            yield scrapy.Request(url, self.parse)

    def parse_house_contents(self, response):
        """Request the phone endpoint for the advert id found in the URL.

        The id is the run of word characters between "ID" and the next
        "." in ``response.url``. URLs without such an id are skipped with
        a warning instead of raising AttributeError.
        """
        match = re.search(r"ID(\w+)\.", response.url)
        if match is None:
            self.logger.warning("No property id found in %s", response.url)
            return
        property_id = match.group(1)
        phone_url = "https://olx.pt/ajax/misc/contact/phone/%s/" % property_id
        yield scrapy.Request(phone_url, callback=self.parse_phone)

    def parse_phone(self, response):
        """Print the ``value`` field of the JSON phone response."""
        phone_number = json.loads(response.body)["value"]
        print(phone_number)
If there are more things to extract from this "dynamic" website, see if Splash is really enough and, if not, look into browser automation and selenium.