I'm learning how to use Scrapy
spider.py
import scrapy

class TestSetSpider(scrapy.Spider):
    name = "test_spider"
    start_urls = ['https://example.html']

    def parse(self, response):
        for brickset in response.xpath('//div[@class="product-name"]'):
            yield {
                'name': brickset.xpath('h1/text()').extract_first(),
            }
I run this spider with the command: scrapy crawl test_spider -o test.csv
This works fine for //div[@class="product-name"], but I don't know how to add another CSS/XPath class in the same spider file.
I tried this, but it doesn't work:
import scrapy

class TestSetSpider(scrapy.Spider):
    name = "test_spider"
    start_urls = ['https://example.html']

    def parse(self, response):
        for test in response.xpath('//div[@class="product-name"]'):
            yield {
                'name': test.xpath('h1/text()').extract_first(),
            }

    def parse(self, response):
        for attempt in response.xpath('//div[@class="another-class"]'):
            yield {
                'color': attempt.xpath('h1/a/text()').extract_first(),
            }
Please help me to do this.
Just use two for loops:
import scrapy

class TestSetSpider(scrapy.Spider):
    name = "test_spider"
    start_urls = ['https://example.html']

    def parse(self, response):
        for brickset in response.xpath('//div[@class="product-name"]'):
            yield {
                'name': brickset.xpath('h1/text()').extract_first(),
            }

        for brickset in response.xpath('//div[@class="another-class"]'):
            yield {
                'name': brickset.xpath('h1/text()').extract_first(),
            }
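If both blocks yield the same kind of item, a single loop over an XPath union would also work; a minimal sketch of that alternative (the variable name node is just illustrative):

    def parse(self, response):
        # one pass over divs of either class
        for node in response.xpath('//div[@class="product-name" or @class="another-class"]'):
            yield {
                'name': node.xpath('h1/text()').extract_first(),
            }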
def parse(self, response):
    product_name_lst = []
    # we will append all data to product_name_lst
    for test in response.xpath('//div[@class="product-name"]'):
        product_name_lst.append({'name': test.xpath('h1/text()').extract_first()})

    another_product_name_lst = []
    # we will append all data to another_product_name_lst
    for test in response.xpath('//div[@class="another-product-name"]'):
        another_product_name_lst.append({'name': test.xpath('h1/text()').extract_first()})

    # after that, write to out.csv all the data you need from
    # the product_name_lst and another_product_name_lst lists
    out_file = open('out.csv', 'a')  # 'a' means append to the file instead of rewriting it
    # here you write into the out.csv file
    out_file.write(data)  # data is whatever string you need to write
    # and close the file
    out_file.close()
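If you do write the CSV by hand, Python's csv module is less error-prone than raw write() calls; a minimal sketch, assuming each list holds dicts with a 'name' key and out.csv is the target file:

import csv

def write_rows(rows, filename='out.csv'):
    # rows is a list of dicts such as [{'name': '...'}]
    with open(filename, 'a', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=['name'])
        writer.writeheader()
        writer.writerows(rows)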
I'm using generic spiders with a list of multiple urls in the start_urls field.
Is it possible to export one json file for each URL?
As far as I know it's only possible to set one path to one specific output file.
Any ideas how to solve this are rewarded!
EDIT: This is my spider class:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class MySpider(CrawlSpider):
    name = 'my_spider'
    start_urls = ['www.domain1.com', 'www.domain2.com', 'www.domain3.com']
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'FEED_URI': 'file:///C:/path/to/result.json',
    }
    rules = (
        Rule(LinkExtractor(allow=r"abc"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        all_text = response.xpath("//p/text()").getall()
        yield {
            "text": " ".join(all_text),
            "url": response.url,
        }
First option
You can save the items in the spider itself, as in the Scrapy tutorial, for example:
import scrapy
import json

DICT = {
    'https://quotes.toscrape.com/page/1/': 'domain1.json',
    'https://quotes.toscrape.com/page/2/': 'domain2.json',
}

class MydomainSpider(scrapy.Spider):
    name = "mydomain"
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        filename = DICT[response.url]
        with open(filename, 'w') as fp:
            json.dump({"content": response.body.decode("utf-8")}, fp)
The DICT variable is just for specifying the JSON filename, but you can use the domain as the filename too.
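For example, a small sketch of deriving the filename from the URL's domain instead of a lookup table (note that two start URLs on the same domain would then share one file):

from urllib.parse import urlparse

def filename_for(url):
    # 'https://quotes.toscrape.com/page/1/' -> 'quotes.toscrape.com.json'
    return urlparse(url).netloc + '.json'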
Second option
You can try using process_item in pipelines.py as follows:
from scrapy.exporters import JsonItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        JsonItemExporter(open(filename, "wb")).export_item(item)
        return item
item['filename'] holds the filename for each start_url. You need to set up items.py too, for example:
import scrapy

class MydomainItem(scrapy.Item):
    filename = scrapy.Field()
    content = scrapy.Field()
your spider:
import scrapy
from ..items import MydomainItem

DICT = {
    'https://quotes.toscrape.com/page/1/': 'domain1.json',
    'https://quotes.toscrape.com/page/2/': 'domain2.json',
}

class MydomainSpider(scrapy.Spider):
    name = 'mydomain'
    allowed_domains = ['mydomain.com']
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        item = MydomainItem()
        item["filename"] = DICT[response.url]
        item["content"] = response.body.decode("utf-8")
        yield item
Before running, you need to add the pipeline in your settings:
ITEM_PIPELINES = {
    'myproject.pipelines.SaveJsonPipeline': 300,
}
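With the pipeline enabled, scrapy crawl mydomain should write domain1.json and domain2.json. One caveat worth noting: filename = DICT[response.url] assumes the response URL matches the dictionary key exactly; a redirect would change response.url and raise a KeyError, so DICT.get(response.url, 'other.json') is a safer lookup if that can happen.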
I am using Scrapy to extract data line by line with a spider, but I have an issue: while yielding the result, it saves the list brackets in a cell.
Here is my spider:
from scrapy.spiders import Spider

class TestCCodeSpider(Spider):
    name = 'test_c_code'
    start_urls = ['http://github.com/gouravthakur39/beginners-C-program-examples/blob/master/AllTempScalesConv.c/']
    custom_settings = {'FEED_URI': "test_ c3.csv",
                       'FEED_FORMAT': 'csv'}

    def parse(self, response):
        ids = response.xpath("//table[@class='highlight tab-size js-file-line-container']/tr/td/@data-line-number").extract()
        for i in ids:
            yield {
                'extract': response.xpath("string(//td[@id='LC%s'])" % i).extract()
            }
and the result is (output screenshot omitted; each CSV cell contains a bracketed list).
I also tried this:
from scrapy.spiders import Spider

class TestCCodeSpider(Spider):
    name = 'test_c_code'
    start_urls = ['http://github.com/gouravthakur39/beginners-C-program-examples/blob/master/AllTempScalesConv.c/']
    custom_settings = {'FEED_URI': "test_ c4.csv",
                       'FEED_FORMAT': 'csv'}

    def parse(self, response):
        ids = response.xpath("//table[@class='highlight tab-size js-file-line-container']/tr/td/@data-line-number").extract()
        Code = []
        for i in ids:
            result = response.xpath("string(//td[@id='LC%s'])" % i)
            Code.append(result.extract())
        yield {'extract': Code}
But it gives this error (screenshot omitted).
The required result (screenshot omitted) is the plain text of each line, without the list brackets.
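A sketch of one way to get plain text per cell, assuming the goal is each source line without list brackets: join the extracted pieces into a single string before yielding.

    def parse(self, response):
        ids = response.xpath("//table[@class='highlight tab-size js-file-line-container']/tr/td/@data-line-number").extract()
        for i in ids:
            # string() flattens the <td>; joining the extracted pieces gives a
            # single str, so the CSV cell holds text instead of a list
            line = "".join(response.xpath("string(//td[@id='LC%s'])" % i).extract())
            yield {'extract': line}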
I am having a problem with my Scrapy program. I want to crawl information from the following website:
https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=
I want to get the "Part No." information inside the span id="resPartNum" tag. I have already tried:
- NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
- NAME_SELECTOR = './/span[@class="resPartNum"]/text()'
- NAME_SELECTOR = './/tr/td/span[@class="resPartNum"]/a/text()'
Here is my full CODE:
import scrapy

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = '.set'
        for part in response.css(SET_SELECTOR):
            NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css(NAME_SELECTOR).extract_first(),
            }
I am not very advanced in scrapy and would appreciate ANY HELP!!
Use the CSS selector table.partlookup_table to collect each table item, then loop over it to get partNum and partName. Note that extract() returns a list.
import scrapy
from scrapy.crawler import CrawlerProcess

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = 'table.partlookup_table'
        for part in response.css(SET_SELECTOR):
            # NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css('span.resPartName a::text').extract(),
                'partnumber': part.css('span.resPartNum a::text').extract()
            }

process = CrawlerProcess()
process.crawl(PartSpider)
process.start()
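The CrawlerProcess lines at the end are only needed if you run the file directly as a script (python spider.py, assuming the spider lives in a standalone file); inside a Scrapy project you can drop them and run scrapy crawl part_spider instead.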
I'm very new to Scrapy and Python and could really do with some help. I've got this code to work on the command line, and I can see it pulling out all the right information as it goes through the different pages.
My problem is that when I try to save the output of the script to a file, it comes out empty. I have looked at lots of other questions on here but can't find anything that helps.
Here is the code:
import scrapy
from urlparse import urljoin

class Aberdeenlocations1Spider(scrapy.Spider):
    name = "aberdeenlocations2"
    start_urls = [
        'http://brighthouse.co.uk/store-finder/all-stores',
    ]

    def parse(self, response):
        products = response.xpath('//ul/li/a/@href').extract()
        for p in products:
            url = urljoin(response.url, p)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        for div in response.css('div'):
            yield {
                title: (response.css('title::text').extract()),
                address: (response.css('[itemprop=streetAddress]::text').extract()),
                locality: (response.css('[itemprop=addressLocality]::text').extract()),
                region: (response.css('[itemprop=addressRegion]::text').extract()),
                postcode: (response.css('[itemprop=postalCode]::text').extract()),
                telephone: (response.css('[itemprop=telephone]::text').extract()),
                script: (response.xpath('//div/script').extract()),
                gmaplink: (response.xpath('//div/div/div/p/a/@href').extract_first())
            }
I am then running this command on the above script
scrapy crawl aberdeenlocations2 -o data.json
What am I doing wrong?
Just some Python errors in your yield, I think. Like this, I get some data in the output:
import scrapy
from urlparse import urljoin

class Aberdeenlocations1Spider(scrapy.Spider):
    name = "aberdeenlocations2"
    start_urls = [
        'http://brighthouse.co.uk/store-finder/all-stores',
    ]

    def parse(self, response):
        products = response.xpath('//ul/li/a/@href').extract()
        for p in products:
            url = urljoin(response.url, p)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        # not sure why this loop is there
        for div in response.css('div'):
            yield {
                'title': response.css('title::text').extract(),
                'address': response.css('[itemprop=streetAddress]::text').extract(),
                'locality': response.css('[itemprop=addressLocality]::text').extract(),
                'region': response.css('[itemprop=addressRegion]::text').extract(),
                'postcode': response.css('[itemprop=postalCode]::text').extract(),
                'telephone': response.css('[itemprop=telephone]::text').extract(),
                'script': response.xpath('//div/script').extract(),
                'gmaplink': response.xpath('//div/div/div/p/a/@href').extract_first()
            }
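One caveat (an assumption about your environment): from urlparse import urljoin only works on Python 2. On Python 3 the equivalent import is:

from urllib.parse import urljoin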
scrapy crawl raamatuvahetus -o raamatuvahetus.csv is outputting an empty csv file. I have no idea why this is. All other Scrapy files generated by scrapy startproject are untouched, and all settings are left at default.
import scrapy
from scrapy.exceptions import CloseSpider

class RaamatuvahetusSpider(scrapy.Spider):
    name = 'raamatuvahetus'
    start_urls = ['https://www.raamatuvahetus.ee/et/bookwished.wishall?limit=200']

    def parse(self, response):
        for href in response.xpath("//a[@class='b-info']/@href"):
            yield response.follow(href, callback=self.parse_book)

    def parse_book(self, response):
        wishings = response.xpath("//img[@class='uimg']")
        wishings_count = 0
        if wishings:
            wishings_count = len(wishings)
        if wishings_count < 15:
            raise CloseSpider('Wishings fewer than 15.')

        title = response.xpath("//article[@class='text']/h1/text()").extract_first()
        author = response.xpath("//div[@class='author']/a/text()").extract_first()
        year = response.xpath("//div[@class='year']/text()").extract_first()

        yield
        {
            "Pealkiri": title,
            "Autor": author,
            "Aasta": year,
            "Soovid": wishings_count
        }
Edit:
Solved! Heed, all travelers who encounter a similar complication -- fret not! I have the answers you seek.
Instead of

yield
{
    ...
}

write

yield {
    ...
}

A bare yield on its own line yields None, and the dict in braces on the following lines is parsed as a separate, unused expression, which is why the spider produced no items and the CSV file came out empty.