I am using Scrapy to extract data line by line with a spider, but I am having an issue: while yielding the result, it saves the brackets of the list in a cell.
Here is my spider:
from scrapy.spiders import Spider


class TestCCodeSpider(Spider):
    name = 'test_c_code'
    start_urls = ['http://github.com/gouravthakur39/beginners-C-program-examples/blob/master/AllTempScalesConv.c/']
    custom_settings = {'FEED_URI': "test_ c3.csv",
                       'FEED_FORMAT': 'csv'}

    def parse(self, response):
        ids = response.xpath("//table[@class='highlight tab-size js-file-line-container']/tr/td/@data-line-number").extract()

        for i in ids:
            yield {
                'extract': response.xpath("string(//td[@id='LC%s'])" % i).extract()
            }
and the result saves the list brackets in each cell.
I also tried this:
from scrapy.spiders import Spider


class TestCCodeSpider(Spider):
    name = 'test_c_code'
    start_urls = ['http://github.com/gouravthakur39/beginners-C-program-examples/blob/master/AllTempScalesConv.c/']
    custom_settings = {'FEED_URI': "test_ c4.csv",
                       'FEED_FORMAT': 'csv'}

    def parse(self, response):
        ids = response.xpath("//table[@class='highlight tab-size js-file-line-container']/tr/td/@data-line-number").extract()
        Code = []

        for i in ids:
            result = response.xpath("string(//td[@id='LC%s'])" % i)
            Code.append(result.extract())

        yield {'extract': Code}
But it gives an error.
The required result is each source line in its own cell, without the list brackets.
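Note (my sketch, not part of the original post): extract() always returns a list, which is why the brackets end up in the CSV cell; get() / extract_first() returns a single string instead:

    def parse(self, response):
        ids = response.xpath("//table[@class='highlight tab-size js-file-line-container']"
                             "/tr/td/@data-line-number").extract()
        for i in ids:
            # get() yields one plain string per line (or None), so no brackets in the cell
            yield {'extract': response.xpath("string(//td[@id='LC%s'])" % i).get()}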
Related
I'm using generic spiders with a list of multiple URLs in the start_urls field.
Is it possible to export one JSON file for each URL?
As far as I know, it's only possible to set one path to one specific output file.
Any ideas on how to solve this are appreciated!
EDIT: This is my spider class:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MySpider(CrawlSpider):
    name = 'my_spider'
    start_urls = ['www.domain1.com', 'www.domain2.com',
                  'www.domain3.com']
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'FEED_URI': 'file:///C:/path/to/result.json',
    }
    rules = (
        Rule(LinkExtractor(allow=r"abc"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        all_text = response.xpath("//p/text()").getall()
        yield {
            "text": " ".join(all_text),
            "url": response.url,
        }
First option
You can save the items in the spider itself, as in the Scrapy tutorial, for example:
import scrapy
import json

DICT = {
    'https://quotes.toscrape.com/page/1/': 'domain1.json',
    'https://quotes.toscrape.com/page/2/': 'domain2.json',
}


class MydomainSpider(scrapy.Spider):
    name = "mydomain"
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        filename = DICT[response.url]
        with open(filename, 'w') as fp:
            json.dump({"content": response.body.decode("utf-8")}, fp)
The DICT variable is just for specifying the JSON filename but you can use the domain as the filename too.
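For example, a small helper (my sketch, not from the original answer; filename_for is a hypothetical name) that derives the filename from the URL's host instead of keeping a DICT:

from urllib.parse import urlparse

def filename_for(url):
    # 'https://quotes.toscrape.com/page/1/' -> 'quotes.toscrape.com.json'
    return urlparse(url).netloc + '.json'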
Second option
You can try using process_item in pipelines.py as follows:
from scrapy.exporters import JsonItemExporter


class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        JsonItemExporter(open(filename, "wb")).export_item(item)
        return item
item['filename'] stores the filename for each start_url. You need to set up items.py too, for example:
import scrapy


class MydomainItem(scrapy.Item):
    filename = scrapy.Field()
    content = scrapy.Field()
your spider:
import scrapy
from ..items import MydomainItem

DICT = {
    'https://quotes.toscrape.com/page/1/': 'domain1.json',
    'https://quotes.toscrape.com/page/2/': 'domain2.json',
}


class MydomainSpider(scrapy.Spider):
    name = 'mydomain'
    allowed_domains = ['mydomain.com']
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        item = MydomainItem()
        item["filename"] = DICT[response.url]
        item["content"] = response.body.decode("utf-8")
        yield item
Before running, you need to add the pipeline to your settings:
ITEM_PIPELINES = {
    'myproject.pipelines.SaveJsonPipeline': 300,
}
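Alternatively (a sketch, same pipeline class assumed), you can enable the pipeline for this spider only via custom_settings:

class MydomainSpider(scrapy.Spider):
    name = 'mydomain'
    custom_settings = {
        'ITEM_PIPELINES': {'myproject.pipelines.SaveJsonPipeline': 300},
    }
    # ... rest of the spider as above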
I have to scrape data (name, price, description, brand, ...) from this website: https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww%7Cnew+in%7Cnew+products%7Cclothing
My code is as such:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TestcrawlSpider(CrawlSpider):
    name = 'testcrawl'

    def remove_characters(self, value):
        return value.strip('\n')

    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//article[@class='_2qG85dG']/a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths="//a[@class='_39_qNys']")),
    )

    def parse_item(self, response):
        yield {
            'name': response.xpath("//div[@class='product-hero']/h1/text()").get(),
            'price': response.xpath("//span[@data-id='current-price']").get(),
            'description': response.xpath("//div[@class='product-description']/ul/li/text()").getall(),
            'about_me': response.xpath("//div[@class='about-me']//text()").getall(),
            'brand_description': response.xpath("//div[@class='brand-description']/p/text()").getall()
        }
However, due to JavaScript, I cannot get the price; I need to get it through XHR.
My code for getting the price of only one item in the list is as follows:
import scrapy
import json


class AsosSpider(scrapy.Spider):
    name = 'asos'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=200369183&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28']

    def parse(self, response):
        # print(response.body)
        resp = json.loads(response.text)[0]
        price = resp.get('productPrice').get('current').get('text')
        print(price)
        yield {
            'price': price
        }
Here, my start_urls is the Request URL, and it keeps changing for each item.
Item1: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=23443988&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28
Item2: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=22495685&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28
Only the productIds are changing!
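One way to handle that (my sketch, assuming the other query parameters stay fixed) is to format each product id into the URL template:

# hypothetical helper: build the stockprice URL for a given product id
API_URL = ('https://www.asos.com/api/product/catalogue/v3/stockprice'
           '?productIds={pid}&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28')

url = API_URL.format(pid=23443988)  # Item1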
I need to insert the second code into the first one to get the price as well. How can I do it, please?
Thanks!
items.py:
import scrapy


class AsosItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    about_me = scrapy.Field()
    brand_description = scrapy.Field()
As I said in your last post, I have a problem with this website on my computer for some reason, but you need to do something like this:
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import AsosItem


class TestcrawlSpider(CrawlSpider):
    name = 'testcrawl'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//article[@class='_2qG85dG']/a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths="//a[@class='_39_qNys']")),
    )

    def remove_characters(self, value):
        return value.strip('\n')

    def parse_item(self, response):
        # the stock-price API URL is embedded in the page's inline JavaScript
        price_url = 'https://www.asos.com' + re.search(r'window.asos.pdp.config.stockPriceApiUrl = \'(.+)\'', response.text).group(1)
        item = AsosItem()
        item['name'] = response.xpath("//div[@class='product-hero']/h1/text()").get()
        item['description'] = response.xpath("//div[@class='product-description']/ul/li/text()").getall()
        item['about_me'] = response.xpath("//div[@class='about-me']//text()").getall()
        item['brand_description'] = response.xpath("//div[@class='brand-description']/p/text()").getall()

        # fetch the price from the API and carry the partly-filled item along
        request = scrapy.Request(url=price_url, callback=self.parse_price)
        request.meta['item'] = item
        return request

    def parse_price(self, response):
        jsonresponse = response.json()[0]
        price = jsonresponse['productPrice']['current']['text']
        item = response.meta['item']
        item['price'] = price
        return item
Test the code, and if it doesn't work, take the general idea and tweak it a bit; I can't test it myself.
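On newer Scrapy versions (cb_kwargs landed in 1.7), passing the item as a real callback argument is a cleaner alternative to response.meta; a sketch of the same hand-off, under the same assumptions as the code above:

    def parse_item(self, response):
        # ... build the item as above, then pass it explicitly:
        yield scrapy.Request(url=price_url, callback=self.parse_price,
                             cb_kwargs={'item': item})

    def parse_price(self, response, item):
        item['price'] = response.json()[0]['productPrice']['current']['text']
        yield item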
I am having a problem with my Scrapy program. I want to crawl information from the following website:
https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=
I want to get the "Part No." information inside the span id="resPartNum" tag. I have already tried:
- NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
- NAME_SELECTOR = './/span[@class="resPartNum"]/text()'
- NAME_SELECTOR = './/tr/td/span[@class="resPartNum"]/a/text()'
Here is my full code:
import scrapy


class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = '.set'
        for part in response.css(SET_SELECTOR):
            NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css(NAME_SELECTOR).extract_first(),
            }
I am not very advanced in Scrapy and would appreciate any help!
Use the CSS selector table.partlookup_table to collect each table, then loop over them and extract partNum and partName. Note that extract() returns a list.
import scrapy
from scrapy.crawler import CrawlerProcess


class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = 'table.partlookup_table'
        for part in response.css(SET_SELECTOR):
            # NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css('span.resPartName a::text').extract(),
                'partnumber': part.css('span.resPartNum a::text').extract()
            }


process = CrawlerProcess()
process.crawl(PartSpider)
process.start()
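Note that with CrawlerProcess the spider runs as a plain Python script (python part_spider.py, assuming that is the filename) rather than via the scrapy crawl command.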
I'm learning how to use Scrapy.
spider.py:
import scrapy


class TestSetSpider(scrapy.Spider):
    name = "test_spider"
    start_urls = ['https://example.html']

    def parse(self, response):
        for brickset in response.xpath('//div[@class="product-name"]'):
            yield {
                'name': brickset.xpath('h1/text()').extract_first(),
            }
I run this spider with the command: scrapy crawl test_spider -o test.csv
This works fine for //div[@class="product-name"], but I don't know how to add another CSS/XPath class in the same spider file.
I'm trying this, but it doesn't work:
import scrapy


class TestSetSpider(scrapy.Spider):
    name = "test_spider"
    start_urls = ['https://example.html']

    def parse(self, response):
        for test in response.xpath('//div[@class="product-name"]'):
            yield {
                'name': test.xpath('h1/text()').extract_first(),
            }

    def parse(self, response):
        for attempt in response.xpath('//div[@class="another-class"]'):
            yield {
                'color': attempt.xpath('h1/a/text()').extract_first(),
            }
Please help me to do this.
Just use two for loops:
import scrapy


class TestSetSpider(scrapy.Spider):
    name = "test_spider"
    start_urls = ['https://example.html']

    def parse(self, response):
        for brickset in response.xpath('//div[@class="product-name"]'):
            yield {
                'name': brickset.xpath('h1/text()').extract_first(),
            }

        for brickset in response.xpath('//div[@class="another-class"]'):
            yield {
                'name': brickset.xpath('h1/text()').extract_first(),
            }
Alternatively, collect the data into lists and write the file yourself:

def parse(self, response):
    product_name_lst = []
    # we will append all data to product_name_lst
    for test in response.xpath('//div[@class="product-name"]'):
        product_name_lst.append({'name': test.xpath('h1/text()').extract_first()})

    another_product_name_lst = []
    # we will append all data to another_product_name_lst
    for test in response.xpath('//div[@class="another-product-name"]'):
        another_product_name_lst.append({'name': test.xpath('h1/text()').extract_first()})

    # after that, write to out.csv all the data you need from
    # product_name_lst and another_product_name_lst
    out_file = open('out.csv', 'a')  # 'a' means append to the file, not rewrite it
    # and here you need to write into the out.csv file
    out_file.write(data)  # data is whatever you need to write
    # and close the file
    out_file.close()
I am trying to learn Scrapy.
# -*- coding: utf-8 -*-
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com/']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            text = quote.xpath(".//*[@class='text']/text()").extract_first()
            author = quote.xpath("//*[@itemprop='author']/text()").extract_first()
            tags = quote.xpath(".//*[@class='tag']/text()").extract();
            item = {
                'author_name': author,
                'text': text,
                'tags': tags
            }
            yield item

next_page_url = response.xpath("//*[@class='next']/a/@href").extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url=absolute_next_page_url, callback=self.parse)
But Scrapy only parses the first page. What is wrong with this code? I copied it from a YouTube tutorial.
Please help.
It is just that all the requests except the first one are getting filtered as "offsite". This is because of the extra / at the end of the allowed_domains value:

allowed_domains = ['quotes.toscrape.com/']
# REMOVE THIS SLASH^

Remove or comment out allowed_domains. Optionally, remove the stray semicolon at the end of the tags = ... line.
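If you want to keep the offsite filtering, the correct value (my addition, not in the original answer) is the bare domain name:

allowed_domains = ['quotes.toscrape.com']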
Moreover, indent the following code into the parse method:
next_page_url = response.xpath("//*[@class='next']/a/@href").extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url=absolute_next_page_url, callback=self.parse)
so it will become this code:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    #allowed_domains = ['quotes.toscrape.com/']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            text = quote.xpath(".//*[@class='text']/text()").extract_first()
            author = quote.xpath("//*[@itemprop='author']/text()").extract_first()
            tags = quote.xpath(".//*[@class='tag']/text()").extract()
            item = {
                'author_name': author,
                'text': text,
                'tags': tags
            }
            yield item

        next_page_url = response.xpath("//*[@class='next']/a/@href").extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(url=absolute_next_page_url, callback=self.parse)