There is an example in the Scrapy documentation (Release 1.0.3) where the urljoin() method is used (line 7 of the example) when the link is relative. What should I do when the link is absolute?
Example code:
import scrapy

class StackOverflowSpider(scrapy.Spider):
    name = 'stackoverflow'
    start_urls = ['http://stackoverflow.com/questions?sort=votes']

    def parse(self, response):
        for href in response.css('.question-summary h3 a::attr(href)'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        yield {
            'title': response.css('h1 a::text').extract()[0],
            'votes': response.css('.question .vote-count-post::text').extract()[0],
            'body': response.css('.question .post-text').extract()[0],
            'tags': response.css('.question .post-tag::text').extract(),
            'link': response.url,
        }
You don't need to worry about it; urljoin() handles both cases properly:
In [1]: response.urljoin("http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery")
Out[1]: 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'
In [2]: response.urljoin("/questions/426258/checking-a-checkbox-with-jquery")
Out[2]: 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'
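For reference, Response.urljoin() resolves the argument against response.url using essentially the same rules as the standard library's urllib.parse.urljoin, so an absolute URL is returned unchanged. A quick sketch with the standard library alone (not Scrapy itself):

# Minimal sketch: an absolute URL overrides the base,
# a relative one is resolved against it.
from urllib.parse import urljoin

base = 'http://stackoverflow.com/questions?sort=votes'
print(urljoin(base, 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'))
# -> http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery
print(urljoin(base, '/questions/426258/checking-a-checkbox-with-jquery'))
# -> http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery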
I'm working on a Scrapy bot that can get specific details for optics.
I need to click a JavaScript button to show a virtual page, so that
my Scrapy bot can scrape the optic details.
This is what I need Playwright to click on, shown in a red rectangle:
[Image: Details tab highlighted in red]
On certain pages, the first item's details page is already showing. Example:
[Image: Virtual page details tab open]
I probably need to create some sort of if/else statement for this?
I would like to work on it, but I've been stuck on the prior issue.
import scrapy

class UpperSpider(scrapy.Spider):
    name = 'Optic'

    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url,
                                 meta={'playwright': True})

    # Issue here: I'm not sure if this is working. I want to click on the Details tab.
    def virtualpage(self, response, page):
        # Virtual page button
        vpButton = response.css('div[id="wrap"]')
        for page in vpButton:
            page.click('#detailTab')

    # Also, some pages, for instance https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/carbine-optic-aco--prod73112.aspx
    # already have their virtual pages showing. I think I would need an if statement to make sure it doesn't close the page.
    def parse(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }
Okay, so I tried to make the Scrapy crawler work. I'm pretty sure the problem is in my start_requests loop (for url in self.start_urls:): it only tells Playwright to handle the start URLs. How do I tell Playwright to also run on each crawled page, so that "clickallbtns" can run?
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine

class UpperSpider(CrawlSpider):
    name = 'Upper'
    allowed_domains = ['brownells.com']
    start_urls = ['https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm']

    le_item_details = LinkExtractor(restrict_css='.listing')
    rule_product_detail = Rule(le_item_details,
                               callback='parse_item',
                               follow=True,
                               )
    rules = (
        rule_product_detail,
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,
                                 meta={'playwright': True,
                                       'playwright_page_coroutines': {
                                           # "waitforload": PageCoroutine("waitforNavagation", 'url'),
                                           "clickallbtns": PageCoroutine("evaluate", 'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
                                       }
                                       }
                                 )

    def parse_item(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }
You need to include the clicking logic in the playwright_page_coroutines dictionary so that the buttons are clicked before the response is returned.
See the sample code below. If you define the scrapy-playwright values in settings.py, you can comment out the custom_settings attribute. Otherwise, if you are running it from a script, the code below is sufficient (using Scrapy 2.6.1).
import scrapy
from scrapy_playwright.page import PageCoroutine

class UpperSpider(scrapy.Spider):
    name = 'Optic'

    custom_settings = dict(
        DOWNLOAD_HANDLERS={
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        TWISTED_REACTOR="twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    )

    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url,
                                 meta={'playwright': True,
                                       'playwright_page_coroutines': {
                                           "clickallbtns": PageCoroutine("evaluate", 'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
                                       }
                                       }
                                 )

    def parse(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }
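To run the same click logic on the pages that a CrawlSpider rule follows (the second part of the question), one option in Scrapy 2.x is to attach the Playwright meta to every request the rule extracts via Rule's process_request hook. A rough sketch, assuming the same scrapy-playwright version and selectors as above:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine

class UpperCrawlSpider(CrawlSpider):
    name = 'UpperCrawl'
    allowed_domains = ['brownells.com']
    start_urls = ['https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm']

    rules = (
        Rule(LinkExtractor(restrict_css='.listing'),
             callback='parse_item',
             follow=True,
             # Called for every request this rule extracts, so each followed
             # page also goes through Playwright.
             process_request='attach_playwright'),
    )

    def attach_playwright(self, request, response):
        request.meta['playwright'] = True
        request.meta['playwright_page_coroutines'] = {
            "clickallbtns": PageCoroutine("evaluate", 'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
        }
        return request

    def parse_item(self, response):
        ...  # same parsing logic as above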
I'm having trouble structuring Scrapy data the way I want. My spider gets some data from one page, then follows a list of links on that page to get a link from each of those pages.
def parse_page(self, response):
    links = response.css(LINK_SELECTOR).extract()
    data = {
        'name': response.css(NAME_SELECTOR).extract_first(),
        'date': response.css(DATE_SELECTOR).extract(),
    }
    for link in links:
        next_link = response.urljoin(link)
        yield scrapy.Request(next_link, callback=self.parse_url, meta={'data': data})

def parse_url(self, response):
    data = response.meta['data']
    data['url'] = response.css('a::attr(href)').get()
    yield data
What I would like is to get the data with the following structure:
{'name': name, 'date': date, 'url': [url1, url2, url3, url4]}
Instead of
{'name': name, 'date': date, 'url': url1}
{'name': name, 'date': date, 'url': url2}
{'name': name, 'date': date, 'url': url3}
{'name': name, 'date': date, 'url': url4}
I've tried to use Items, but I don't understand how to pass the data from parse_url back to parse_page. How would I do that?
Thanks in advance.
You can use Scrapy's coroutine support to do this pretty easily.
The code would look something like this:
async def parse_page(self, response):
    ...
    for link in links:
        request = response.follow(link)
        response = await self.crawler.engine.download(request, self)
        urls.append(response.css('a::attr(href)').get())
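For context, a fuller version of that callback might look like the sketch below, reusing the selector constants from the question (LINK_SELECTOR, NAME_SELECTOR and DATE_SELECTOR are assumed to be defined as in the original spider):

async def parse_page(self, response):
    data = {
        'name': response.css(NAME_SELECTOR).extract_first(),
        'date': response.css(DATE_SELECTOR).extract(),
    }
    urls = []
    for link in response.css(LINK_SELECTOR).extract():
        request = response.follow(link)
        # Download each linked page inline instead of scheduling a separate callback.
        linked_response = await self.crawler.engine.download(request, self)
        urls.append(linked_response.css('a::attr(href)').get())
    # All URLs end up in a single item, as requested.
    data['url'] = urls
    yield data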
The following is one of the ways you can achieve that. The inline_requests library will help you get the expected output.
import scrapy
from scrapy.crawler import CrawlerProcess
from inline_requests import inline_requests

class YellowpagesSpider(scrapy.Spider):
    name = "yellowpages"
    start_urls = ["https://www.yellowpages.com/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771"]

    @inline_requests
    def parse(self, response):
        data = {
            'name': response.css(".sales-info > h1::text").get(),
            'phone': response.css(".contact > p.phone::text").get(),
            'target_link': []
        }
        for item_link in response.css(".review-info > a.author[href]::attr(href)").getall():
            resp = yield scrapy.Request(response.urljoin(item_link), meta={'handle_httpstatus_all': True})
            target_link = resp.css("a.review-business-name::attr(href)").get()
            data['target_link'].append(target_link)
        print(data)

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'ERROR',
    })
    c.crawl(YellowpagesSpider)
    c.start()
Output it produces:
{'name': 'Honey Honey Cafe & Crepery', 'phone': '(415) 351-2423', 'target_link': ['/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771', '/walnut-ca/mip/akasaka-japanese-cuisine-455476824', '/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771']}
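Note that inline_requests is a third-party package, separate from Scrapy itself; if it is not already installed, it is presumably available as the scrapy-inline-requests distribution:

pip install scrapy-inline-requests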
import scrapy

class rottenTomatoesSpider(scrapy.Spider):
    name = "movieList"
    start_urls = [
        'https://www.rottentomatoes.com/'
    ]

    def parse(self, response):
        for movieList in response.xpath('//div[@id="homepage-opening-this-week"]'):
            yield {
                'score': response.css('td.left_col').extract_first(),
                'title': response.css('td.middle_col').extract_first(),
                'openingDate': response.css('td.right_col right').extract_first()
            }
So the spider is instead scraping <div id='homepage-tv-top'>.
I'm assuming it is the homepage- prefix that is confusing the script. Does anyone know a workaround?
You need to iterate over each tr, and also use movieList instead of response inside the for loop (otherwise the selectors search the whole page):
for movieList in response.xpath('//div[@id="homepage-opening-this-week"]//tr'):
    yield {
        'score': "".join(a for a in movieList.css('td.left_col *::text').extract()),
        'title': "".join(a for a in movieList.css('td.middle_col *::text').extract()),
        'openingDate': "".join(a for a in movieList.css('td.right_col *::text').extract())
    }
I am new to Scrapy and Python, so my question may be a simple one. Following an existing website guide, I've written a scraper which scrapes a website's pages and writes the image URLs, names, and so on to an output file. I want to download the images into a directory, but the output directory is empty!
Here is my code:
myspider.py
import scrapy

class BrickSetSpider(scrapy.Spider):
    name = 'brick_spider'
    start_urls = ['http://brickset.com/sets/year-2016']

    def parse(self, response):
        SET_SELECTOR = '.set'
        for brickset in response.css(SET_SELECTOR):
            NAME_SELECTOR = 'h1 a ::text'
            PIECES_SELECTOR = './/dl[dt/text() = "Pieces"]/dd/a/text()'
            MINIFIGS_SELECTOR = './/dl[dt/text() = "Minifigs"]/dd[2]/a/text()'
            IMAGE_SELECTOR = 'img ::attr(src)'
            yield {
                'name': brickset.css(NAME_SELECTOR).extract_first(),
                'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
                'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
                'image': brickset.css(IMAGE_SELECTOR).extract_first(),
            }

        NEXT_PAGE_SELECTOR = '.next a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
settings.py
ITEM_PIPELINES = {'brickset.pipelines.BricksetPipeline': 1}
IMAGES_STORE = '/home/nmd/brickset/brickset/spiders/output'
items.py
import scrapy

class BrickSetSpider(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
Scrapy provides a media pipeline if you're interested in downloading files or images:
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
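The built-in images pipeline also needs IMAGES_STORE set to a writable directory (you already have this in your settings.py) and the Pillow library installed. A minimal settings.py sketch, reusing the path from the question:

# settings.py (sketch): enable the built-in images pipeline and
# point IMAGES_STORE at a writable directory. Pillow must be installed.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/home/nmd/brickset/brickset/spiders/output'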
Then you need to add image_urls to your item for the pipeline to download the file, so change
yield {
    'name': brickset.css(NAME_SELECTOR).extract_first(),
    'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
    'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
    'image': brickset.css(IMAGE_SELECTOR).extract_first(),
}
to
yield {
    'name': brickset.css(NAME_SELECTOR).extract_first(),
    'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
    'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
    # image_urls must be a list of URLs for the pipeline to iterate over
    'image_urls': [brickset.css(IMAGE_SELECTOR).extract_first()],
}
For more details refer to https://doc.scrapy.org/en/latest/topics/media-pipeline.html
When I execute this code I get results in the form {[text1, author1, tag1], [text2, author2, tag2], ...}:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
But with the same code for another URL (below) I get results in the form {[name1, name2, ...], [city1, city2, ...]}.
I want it in the form {[name1, city1], [name2, city2], ...}, as was happening for the code above.
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "student"
    start_urls = [
        'http://www.engineering.careers360.com/colleges/list-of-engineering-colleges-in-karnataka?sort_filter=alpha',
    ]

    def parse(self, response):
        for students in response.css('div.list-pages'):
            yield {
                'name': students.css('div.title a::text').extract(),
                'city': students.css('div.clg-state a::text').extract(),
            }
Your students selector is faulty:
for students in response.css('div.list-pages'):
This selects only the page-level wrapper, so all names and cities end up in a single item.
What you are looking for here, I think, is:
for students in response.css('li.search-result'):
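With that change, each result yields its own item. A rough sketch of the full callback, assuming the per-result selectors from the question still match inside each li.search-result:

def parse(self, response):
    # One item per college instead of one item for the whole page.
    for student in response.css('li.search-result'):
        yield {
            'name': student.css('div.title a::text').extract_first(),
            'city': student.css('div.clg-state a::text').extract_first(),
        }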