After writing my first "recursive" spider, I ran into some problems that I haven't been able to fix all day.
I researched which mistakes can cause a 301 error, but none of the solutions I tried have helped so far.
My console output:
My modified settings.py:
USER_AGENT = 'kartonage (Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0)'
DOWNLOAD_DELAY = 0.5
HTTPERROR_ALLOW_ALL = True
This USER_AGENT and HTTPERROR_ALLOW_ALL were solutions for other people who ran into 301 redirects.
My modified items.py:
import scrapy

class KartonageItem(scrapy.Item):
    SKU = scrapy.Field()
    Title = scrapy.Field()
    Link = scrapy.Field()
    Price = scrapy.Field()
    Delivery_Status = scrapy.Field()
    Weight = scrapy.Field()
    QTY = scrapy.Field()
    Volume = scrapy.Field()
The spider code I used:
import scrapy
from ..items import KartonageItem

class KartonSpider(scrapy.Spider):
    name = "kartons12"
    allow_domains = ['karton.eu']
    start_urls = [
        'https://www.karton.eu/Faltkartons'
    ]
    custom_settings = {'FEED_EXPORT_FIELDS': ['SKU', 'Title', 'Link', 'Price', 'Delivery_Status', 'Weight', 'QTY', 'Volume']}

    def parse(self, response):
        url = response.xpath('//div[@class="cat-thumbnails"]')
        for a in url:
            link = a.xpath('a/@href')
            yield response.follow(url=link.get(), callback=self.parse_category_cartons)

    def parse_category_cartons(self, response):
        url2 = response.xpath('//div[@class="cat-thumbnails"]')
        for a in url2:
            link = a.xpath('a/@href')
            yield response.follow(url=link.get(), callback=self.parse_target_page)

    def parse_target_page(self, response):
        card = response.xpath('//div[@class="text-center articelbox"]')
        for a in card:
            items = KartonageItem()
            link = a.xpath('a/@href')
            items['SKU'] = a.xpath('.//div[@class="delivery-status"]/small/text()').get()
            items['Title'] = a.xpath('.//h5[@class="title"]/a/text()').get()
            items['Link'] = a.xpath('.//h5[@class="text-center artikelbox"]/a/@href').extract()
            items['Price'] = a.xpath('.//strong[@class="price-ger price text-nowrap"]/span/text()').get()
            items['Delivery_Status'] = a.xpath('.//div[@class="signal_image status-2"]/small/text()').get()
            yield response.follow(url=link.get(), callback=self.parse_item, meta={'items': items})

    def parse_item(self, response):
        table = response.xpath('//span[@class="product-info-inner"]')
        items = KartonageItem()
        items = response.meta['items']
        items['Weight'] = a.xpath('.//span[@class="staffelpreise-small"]/text()').get()
        items['Volume'] = a.xpath('.//td[@class="icon_contenct"][7]/text()').get()
        yield items
HTTP 301 isn't an error; it is a "Moved Permanently" response. Scrapy automatically follows the redirect to the page's new address, and you can see in your execution logs that you were redirected.
That by itself shouldn't be a problem. Is there something else this may be causing? Any behavior from the spider that is unexpected?
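If you want to confirm in the spider itself that the redirect is harmless, a minimal sketch (an assumption about your setup, not part of the original code) is to log the redirect chain that Scrapy's RedirectMiddleware records on each response:

    def parse(self, response):
        # RedirectMiddleware stores the original URL(s) in response.meta when a redirect happened
        redirected_from = response.meta.get('redirect_urls', [])
        if redirected_from:
            self.logger.info("Landed on %s after being redirected from %s", response.url, redirected_from)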
I am building a Scrapy spider, WuzzufLinks, that scrapes all the links to specific jobs on a job website, at this link:
https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt
After scraping the links, I would like to send them to another spider WuzzufSpider, which scrapes data from inside each link. The start_urls would be the first link in the scraped list, and the next_page would be the following link, and so on.
I have thought of importing WuzzufLinks into WuzzufSpider and then accessing its data:
import scrapy
from ..items import WuzzufscraperItem

class WuzzuflinksSpider(scrapy.Spider):
    name = 'WuzzufLinks'
    page_number = 1
    start_urls = ['https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt']

    def parse(self, response):
        items = WuzzufscraperItem()
        jobURL = response.css('h2[class=css-m604qf] a::attr(href)').extract()
        items['jobURL'] = jobURL
        yield items
        next_page = 'https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt&start=' + str(WuzzuflinksSpider.page_number)
        if WuzzuflinksSpider.page_number <= 100:
            yield response.follow(next_page, callback=self.parse)
            WuzzuflinksSpider.page_number += 1
# WuzzufSpider
import scrapy
from ..items import WuzzufscraperItem
from spiders.WuzzufLinks import WuzzuflinksSpider

class WuzzufspiderSpider(scrapy.Spider):
    name = 'WuzzufSpider'
    parseClass = WuzzuflinksSpider().parse()
    start_urls = []

    def parse(self, response):
        items = WuzzufscraperItem()
        # CSS selectors
        title = response.css('').extract()
        company = response.css('').extract()
        location = response.css('').extract()
        country = response.css('').extract()
        date = response.css('').extract()
        careerLevel = response.css('').extract()
        experienceNeeded = response.css('').extract()
        jobType = response.css('').extract()
        jobFunction = response.css('').extract()
        salary = response.css('').extract()
        description = response.css('').extract()
        requirements = response.css('').extract()
        skills = response.css('').extract()
        industry = response.css('').extract()
        jobURL = response.css('').extract()
        # next_page and if statement here
Regardless of whether I have written the outlined parts correctly, I have realized that accessing jobURL would return an empty value, since it is only a temporary container. I have thought of saving the scraped links in another file and then importing them into WuzzufSpider, but I don't know whether the import is valid and whether they will still be a list:
# links.xml
<?xml version="1.0" encoding="utf-8"?>
<items>
<item><jobURL><value>/jobs/p/P5A2NWkkWfv6-Sales-Operations-Specialist-Amreyah-Cement---InterCement-Alexandria-Egypt?o=1&l=sp&t=sj&a=search-v3</value><value>/jobs/p/pEmZ96R097N3-Senior-Laravel-Developer-Learnovia-Cairo-Egypt?o=2&l=sp&t=sj&a=search-v3</value><value>/jobs/p/IgHkjP37ymQp-French-Talent-Acquisition-Specialist-Guide-Academy-Giza-Egypt?o=3&l=sp&t=sj&a=search-v3</value><value>/jobs/p/zOLTqLqegEZe-Export-Sales-Representative-packtec-Cairo-Egypt?o=4&l=sp&t=sj&a=search-v3</value><value>/jobs/p/U3Q1TDpxzsJJ-Finishing-Site-Engineer--Assiut-Assiut-Egypt?o=5&l=sp&t=sj&a=search-v3</value><value>/jobs/p/7aQ4QxtYV8N6-Senior-QC-Automation-Engineer-FlairsTech-Cairo-Egypt?o=6&l=sp&t=sj&a=search-v3</value><value>/jobs/p/qHWyGU7ClMG6-Technical-Office-Engineer-Cairo-Egypt?o=7&l=sp&t=sj&a=search-v3</value><value>/jobs/p/ptN7qnERUvPT-B2B-Sales-Representative-Smart-Zone-Cairo-Egypt?o=8&l=sp&t=sj&a=search-v3</value><value>/jobs/p/VUVc0ZAyUNYU-Digital-Marketing-supervisor-National-Trade-Distribution-Cairo-Egypt?o=9&l=sp&t=sj&a=search-v3</value><value>/jobs/p/WzJhyeVpT5jb-Receptionist-Value-Cairo-Egypt?o=10&l=sp&t=sj&a=search-v3</value><value>/jobs/p/PAdZOdzWjqbr-Insurance-Specialist-Bancassuranc---Sohag-Allianz-Sohag-Egypt?o=11&l=sp&t=sj&a=search-v3</value><value>/jobs/p/nJD6YbE4QjNX-Senior-Research-And-Development-Specialist-Cairo-Egypt?o=12&l=sp&t=sj&a=search-v3</value><value>/jobs/p/DVvMG4BFWEeI-Technical-Sales-Engineer-Masria-Group-Cairo-Egypt?o=13&l=sp&t=sj&a=search-v3</value><value>/jobs/p/3RtCveEFjveW-Technical-Office-Engineer-Masria-Group-Cairo-Egypt?o=14&l=sp&t=sj&a=search-v3</value><value>/jobs/p/kswGaw4kXTe8-Administrator-Kreston-Cairo-Egypt?o=15&l=sp&t=sj&a=search-v3</value></jobURL></item>
</items>
# WuzzufSpider
import scrapy
from ..items import WuzzufscraperItem
from links import jobURL

class WuzzufspiderSpider(scrapy.Spider):
    name = 'WuzzufSpider'
    start_urls = [jobURL[0]]

    def parse(self, response):
        items = WuzzufscraperItem()
        # CSS selectors
        title = response.css('').extract()
        company = response.css('').extract()
        location = response.css('').extract()
        country = response.css('').extract()
        date = response.css('').extract()
        careerLevel = response.css('').extract()
        experienceNeeded = response.css('').extract()
        jobType = response.css('').extract()
        jobFunction = response.css('').extract()
        salary = response.css('').extract()
        description = response.css('').extract()
        requirements = response.css('').extract()
        skills = response.css('').extract()
        industry = response.css('').extract()
        jobURL = response.css('').extract()
        # next_page and if statement here
Is there a way to make the second method work, or should I take a completely different approach?
I have checked the threads Scrapy: Pass data between 2 spiders and Pass scraped URL's from one spider to another. I understand that I can do all of the work in one spider, and that there is a way to save to a database or temporary file in order to send data to another spider. However, I am not yet very experienced and don't understand how to implement such changes, so marking this question as a duplicate won't help me. Thank you for your help.
First of all, you can keep crawling the URLs from the same spider, and honestly I don't see a reason not to.
Anyway, if you really want to have two spiders, where the output of the first is the input of the second, you can do something like this:
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.signalmanager import dispatcher
from scrapy import signals
from twisted.internet import reactor, defer

# grab all the products urls
class ExampleSpider(scrapy.Spider):
    name = "exampleSpider"
    start_urls = ['https://scrapingclub.com/exercise/list_basic']

    def parse(self, response):
        all_urls = response.xpath('//div[@class="card"]/a/@href').getall()
        for url in all_urls:
            yield {'url': 'https://scrapingclub.com' + url}

# get the product's details
class ExampleSpider2(scrapy.Spider):
    name = "exampleSpider2"

    def parse(self, response):
        title = response.xpath('//h3/text()').get()
        price = response.xpath('//div[@class="card-body"]//h4//text()').get()
        yield {
            'title': title,
            'price': price
        }

if __name__ == "__main__":
    # this will be the yielded items from the first spider
    output = []

    def get_output(item):
        output.append(item)

    configure_logging()
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    runner = CrawlerRunner(settings)

    # run spiders sequentially
    # (https://docs.scrapy.org/en/latest/topics/practices.html#running-multiple-spiders-in-the-same-process)
    @defer.inlineCallbacks
    def crawl():
        dispatcher.connect(get_output, signal=signals.item_scraped)
        yield runner.crawl('exampleSpider')
        urls = [url['url'] for url in output]  # create a list of the urls from the first spider
        # crawl the second spider with the urls from the first spider
        yield runner.crawl('exampleSpider2', start_urls=urls)
        reactor.stop()

    crawl()
    reactor.run()
Run this and see that you first get the results from the first spider, and that those results are passed as the "start_urls" for the second spider.
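Note that because this script drives the crawl itself through CrawlerRunner, it is started with plain Python rather than with scrapy crawl. Assuming it is saved as run_spiders.py (a hypothetical file name) inside the project directory, it would be run with:

python run_spiders.py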
EDIT:
Doing it all in the same spider. See how we loop over all the URLs and scrape them in the parse_item function. I filled in some of the values you want to scrape as an example, so just fill in the rest and you're done.
import scrapy
# from ..items import WuzzufscraperItem

class WuzzufscraperItem(scrapy.Item):
    title = scrapy.Field()
    company = scrapy.Field()
    location = scrapy.Field()
    country = scrapy.Field()
    jobURL = scrapy.Field()
    date = scrapy.Field()
    careerLevel = scrapy.Field()
    experienceNeeded = scrapy.Field()
    jobType = scrapy.Field()
    jobFunction = scrapy.Field()
    salary = scrapy.Field()
    description = scrapy.Field()
    requirements = scrapy.Field()
    skills = scrapy.Field()
    industry = scrapy.Field()

class WuzzuflinksSpider(scrapy.Spider):
    name = 'WuzzufLinks'
    page_number = 1
    start_urls = ['https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt']

    def parse(self, response):
        all_urls = response.css('h2[class=css-m604qf] a::attr(href)').getall()
        if all_urls:
            for url in all_urls:
                yield response.follow(url=url, callback=self.parse_item)
            next_page = 'https://wuzzuf.net/search/jobs/?filters%5Bcountry%5D%5B0%5D=Egypt&start=' + str(WuzzuflinksSpider.page_number)
            if WuzzuflinksSpider.page_number <= 100:
                yield response.follow(next_page)
                WuzzuflinksSpider.page_number += 1

    def parse_item(self, response):
        items = WuzzufscraperItem()
        # CSS selectors
        # Some values as an example:
        items['title'] = response.xpath('(//h1)[last()]/text()').get(default='')
        items['company'] = response.xpath('(//a[@class="css-p7pghv"])[last()]/text()').get(default='')
        items['location'] = response.xpath('(//strong[@class="css-9geu3q"])[last()]/text()').get(default='')
        items['country'] = response.xpath('//meta[@property="og:country_name"]/@content').get(default='')
        items['jobURL'] = response.url
        # items['date'] = response.css('').get(default='')
        # items['careerLevel'] = response.css('').get(default='')
        # items['experienceNeeded'] = response.css('').get(default='')
        # items['jobType'] = response.css('').get(default='')
        # items['jobFunction'] = response.css('').get(default='')
        # items['salary'] = response.css('').get(default='')
        # items['description'] = response.css('').get(default='')
        # items['requirements'] = response.css('').get(default='')
        # items['skills'] = response.css('').get(default='')
        # items['industry'] = response.css('').get(default='')
        yield items
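Assuming this spider lives in a regular Scrapy project, it can then be run and its items exported in the usual way, for example (the output file name is just an example):

scrapy crawl WuzzufLinks -o wuzzuf_jobs.csv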
I am trying to use scrapy on this page: http://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/
But I can't get the image of the product; the spider can't find anything. What might I be missing?
I tried by attribute, by ID, by class, and nothing worked.
import scrapy
from scrapy import Request
import random

class BrickSetSpider(scrapy.Spider):
    name = 'spider'
    USER_AGENT_LIST = [
        'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
    ]
    start_urls = [
        'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/',
    ]
    download_delay = 5
    FEED_EXPORT_ENCODING = 'utf-8'

    def start_requests(self):
        for url in self.start_urls:
            headers = {'User-Agent': random.choice(self.USER_AGENT_LIST)}
            yield Request(url, headers=headers)

    def parse(self, response):
        SET_SELECTOR = '.content-left'
        for brickset in response.css(SET_SELECTOR):
            SEARCH_SELECTOR = response.url
            NAME_SELECTOR = 'span.keyValue span ::text'
            IMAGE_SELECTOR = 'img[itemprop="image"] ::attr(src)'
            yield {
                'search': SEARCH_SELECTOR,
                'name': brickset.css(NAME_SELECTOR).re('[^\t\n]+'),
                'link': brickset.css(IMAGE_SELECTOR).extract(),
            }
If you are using Chrome, you can test this in the console with $$(".images [data-test='zoom-wrap'] img") to get the image.
So, you can use this CSS selector in the Scrapy code. You will have to extract the src attribute.
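In Scrapy that would look roughly like the sketch below (an assumption based on the selector above; as the next answer points out, if the image is injected by JavaScript this will return None on the raw HTML):

    # grab the src attribute of the zoomable product image, if it is present in the HTML
    image_src = response.css('.images [data-test="zoom-wrap"] img::attr(src)').get()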
I hope it helps!
The image is generated dynamically by JS. Try the following code.
from simplified_scrapy.spider import Spider, SimplifiedDoc
import re

class MySpider(Spider):
    name = 'rs-online.com'
    # allowed_domains = ['example.com']
    start_urls = [
        'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/'
    ]
    # refresh_urls = True  # For debugging. If refresh_urls = True, start_urls will be crawled again.

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        # print(doc.html)
        div = doc.getElementByClass('content-left')
        imgs = re.compile(u'largeImageURL: ".*"').findall(div.script.html)
        imgs = ['https:' + img[len('largeImageURL: "'):-1] for img in imgs]
        lis = doc.getElementByClass('keyDetailsLL').lis
        names = {}
        for li in lis:
            spans = li.spans
            names[spans[0].text] = spans[1].text
        data = [{'imgs': imgs, 'names': names}]
        print(data)
        return {"Urls": [], "Data": data}  # Return data to the framework

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(MySpider())  # Start crawling
Result:
[{'imgs': ['https://media.rs-online.com/t_large/F7858468-01.jpg', 'https://media.rs-online.com/t_large/F7858468-02.jpg'], 'names': {'Codice RS': '785-8468', 'Codice costruttore': 'E2E-S05S12-WC-B1 2M', 'Costruttore': 'Omron'}}]
I am trying to grab details from a real estate listing page. I can grab all the data, I just can't seem to export it.
Perhaps there is a problem with the way I use the yield keyword. The code works for the most part:
It visits page 1, example.com/kittens.
It goes to page 2, example.com/puppers, where 10 apartments are listed in blocks. I can get data from each block, but I need additional info from inside the hyperlink.
It visits the hyperlink, say, example.com/puppers/apartment1, and grabs some info from there as well, but I can't seem to return this data to include it in my HousingItem() class.
import scrapy
from urllib.parse import urljoin

class HousingItem(scrapy.Item):
    street = scrapy.Field()
    postal = scrapy.Field()
    city = scrapy.Field()
    url = scrapy.Field()
    buildY = scrapy.Field()
    on_m = scrapy.Field()
    off_m = scrapy.Field()

class FAppSpider(scrapy.Spider):
    name = 'f_app'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/kittens']

    def parse(self, response):
        yield scrapy.Request(url="https://www.example.com/puppers",
                             callback=self.parse_puppers)

    def parse_inside_pupper(self, response):
        item = HousingItem()
        item['buildY'] = response.xpath('').extract_first().strip()
        item['on_m'] = response.xpath('').extract_first().strip()
        item['off_m'] = response.xpath('').extract_first().strip()

    def parse_puppers(self, response):
        base_url = 'https://www.example.com/'
        for block in response.css('div.search-result-main'):
            item = HousingItem()
            item['street'] = block.css(''),
            item['postcode'] = block.css(''),
            item['city'] = block.css('')
            item['url'] = urljoin(base_url, block.css('div.search-result-header > a::attr(href)')[0].extract())
            # Problem area from here..
            yield response.follow(url=item['url'], callback=self.parse_inside_pupper)
            # yield scrapy.request(url=item['url'], callback=self.parse_inside_pupper)?
            yield item
FEED_EXPORT_FIELDS is adjusted in my settings.py. The 4 fields from parse_puppers() get exported correctly; the parse_inside_pupper() data is correct in the console, but won't export.
I use scrapy crawl f_app -o raw_data.csv to run my spider. Thanks in advance, I appreciate all the help.
P.S. I'm fairly new to Python and still practising, as I bet you noticed.
You need to send your current item to parse_inside_pupper using the meta param:
def parse_puppers(self, response):
    base_url = 'https://www.example.com/'
    for block in response.css('div.search-result-main'):
        item = HousingItem()
        item['street'] = block.css(''),
        item['postcode'] = block.css(''),
        item['city'] = block.css('')
        item['url'] = urljoin(base_url, block.css('div.search-result-header > a::attr(href)')[0].extract())
        yield response.follow(url=item['url'], callback=self.parse_inside_pupper, meta={"item": item})
After that you can use it inside parse_inside_pupper (and yield it from here):
def parse_inside_pupper(self, response):
    item = response.meta["item"]
    item['buildY'] = response.xpath('').extract_first().strip()
    item['on_m'] = response.xpath('').extract_first().strip()
    item['off_m'] = response.xpath('').extract_first().strip()
    yield item
I am new to Scrapy and am currently learning how to scrape information from a list of scraped URLs. I have been able to scrape information from a URL by going through the tutorial on the Scrapy website. However, I am facing a problem scraping information from a list of URLs scraped from a URL, even after googling for a solution online.
The scraper that I have written below is able to scrape from the first URL. However, it is unsuccessful in scraping from the list of scraped URLs. The problem starts at def parse_following_urls(self, response):, where I am unable to scrape from the list of scraped URLs.
Can anyone help solve this? Thanks in advance.
import scrapy
from scrapy.http import Request

class SET(scrapy.Item):
    title = scrapy.Field()
    open = scrapy.Field()
    hi = scrapy.Field()
    lo = scrapy.Field()
    last = scrapy.Field()
    bid = scrapy.Field()
    ask = scrapy.Field()
    vol = scrapy.Field()
    exp = scrapy.Field()
    exrat = scrapy.Field()
    exdat = scrapy.Field()

class ThaiSpider(scrapy.Spider):
    name = "warrant"
    allowed_domains = ["marketdata.set.or.th"]
    start_urls = ["http://marketdata.set.or.th/mkt/stocklistbytype.do?market=SET&language=en&country=US&type=W"]

    def parse(self, response):
        for sel in response.xpath('//table[@class]/tbody/tr'):
            item = SET()
            item['title'] = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/text()').extract()
            item['open'] = sel.xpath('td[3]/text()').extract()
            item['hi'] = sel.xpath('td[4]/text()').extract()
            item['lo'] = sel.xpath('td[5]/text()').extract()
            item['last'] = sel.xpath('td[6]/text()').extract()
            item['bid'] = sel.xpath('td[9]/text()').extract()
            item['ask'] = sel.xpath('td[10]/text()').extract()
            item['vol'] = sel.xpath('td[11]/text()').extract()
            yield item
        urll = response.xpath('//table[@class]/tbody/tr/td[1]/a[contains(@href,"ssoPageId")]/@href').extract()
        urls = ["http://marketdata.set.or.th/mkt/" + i for i in urll]
        for url in urls:
            request = scrapy.Request(url, callback=self.parse_following_urls, dont_filter=True)
            yield request
            request.meta['item'] = item

    def parse_following_urls(self, response):
        for sel in response.xpath('//table[3]/tbody'):
            item = response.meta['item']
            item['exp'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['exrat'] = sel.xpath('tr[2]/td[2]/text()').extract()
            item['exdat'] = sel.xpath('tr[3]/td[2]/text()').extract()
            yield item
I have rewritten the code after trying the suggestions given and looking at the output. Below is the edited code. However, I got another error stating Request url must be str or unicode, got list. How do I convert the URL from a list to a string?
I thought the URL would be a string, since it is built in a for loop. I have added this as a comment in the code below. Is there any way to solve this?
import scrapy
from scrapy.http import Request

class SET(scrapy.Item):
    title = scrapy.Field()
    open = scrapy.Field()
    hi = scrapy.Field()
    lo = scrapy.Field()
    last = scrapy.Field()
    bid = scrapy.Field()
    ask = scrapy.Field()
    vol = scrapy.Field()
    exp = scrapy.Field()
    exrat = scrapy.Field()
    exdat = scrapy.Field()

class ThaiSpider(scrapy.Spider):
    name = "warrant"
    allowed_domains = ["marketdata.set.or.th"]
    start_urls = ["http://marketdata.set.or.th/mkt/stocklistbytype.do?market=SET&language=en&country=US&type=W"]

    def parse(self, response):
        for sel in response.xpath('//table[@class]/tbody/tr'):
            item = SET()
            item['title'] = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/text()').extract()
            item['open'] = sel.xpath('td[3]/text()').extract()
            item['hi'] = sel.xpath('td[4]/text()').extract()
            item['lo'] = sel.xpath('td[5]/text()').extract()
            item['last'] = sel.xpath('td[6]/text()').extract()
            item['bid'] = sel.xpath('td[9]/text()').extract()
            item['ask'] = sel.xpath('td[10]/text()').extract()
            item['vol'] = sel.xpath('td[11]/text()').extract()
            url = ["http://marketdata.set.or.th/mkt/"] + sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/@href').extract()
            request = scrapy.Request(url, callback=self.parse_following_urls, dont_filter=True)  # Request url must be str or unicode, got list: how to solve this?
            request.meta['item'] = item
            yield item
            yield request

    def parse_following_urls(self, response):
        for sel in response.xpath('//table[3]/tbody'):
            item = response.meta['item']
            item['exp'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['exrat'] = sel.xpath('tr[2]/td[2]/text()').extract()
            item['exdat'] = sel.xpath('tr[3]/td[2]/text()').extract()
            yield item
I see what you are trying to do here; it's called chaining requests.
What this means is that you want to keep yielding Requests while carrying your filled Item along in the Request's meta attribute.
For your case, all you need to do is yield a Request with the item in it instead of yielding the Item directly. Change your parse to:
def parse(self, response):
    for sel in response.xpath('//table[@class]/tbody/tr'):
        item = SET()
        item['title'] = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/text()').extract()
        item['open'] = sel.xpath('td[3]/text()').extract()
        item['hi'] = sel.xpath('td[4]/text()').extract()
        item['lo'] = sel.xpath('td[5]/text()').extract()
        item['last'] = sel.xpath('td[6]/text()').extract()
        item['bid'] = sel.xpath('td[9]/text()').extract()
        item['ask'] = sel.xpath('td[10]/text()').extract()
        item['vol'] = sel.xpath('td[11]/text()').extract()
    urll = response.xpath('//table[@class]/tbody/tr/td[1]/a[contains(@href,"ssoPageId")]/@href').extract()
    urls = ["http://marketdata.set.or.th/mkt/" + i for i in urll]
    for url in urls:
        yield scrapy.Request(url,
                             callback=self.parse_following_urls,
                             meta={'item': item})
I tried changing the 5th line from the end,
item = response.meta['item']
to
item = SET()
and then it works!
Actually, I didn't really understand your "meta" way, since I have never used it to pass an item.
I am using Scrapy to collect some data. My Scrapy program collects 100 elements in one session. I need to limit it to 50 or any arbitrary number. How can I do that? Any solution is welcome. Thanks in advance.
# -*- coding: utf-8 -*-
import re
import scrapy

class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["raleigh.craigslist.org"]
    start_urls = [
        "http://raleigh.craigslist.org/search/bab"
    ]
    BASE_URL = 'http://raleigh.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/ral/bab/" + item_id
            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
This is what the CloseSpider extension and the CLOSESPIDER_ITEMCOUNT setting were made for:
An integer which specifies a number of items. If the spider scrapes more than that amount of items and those items are passed by the item pipeline, the spider will be closed with the reason closespider_itemcount. If zero (or not set), spiders won't be closed by the number of passed items.
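For the spider in the question, a minimal way to apply it (assuming the spider name dmoz from the code above) is to pass the setting per run on the command line:

scrapy crawl dmoz -s CLOSESPIDER_ITEMCOUNT=50

or to set it project-wide in settings.py:

CLOSESPIDER_ITEMCOUNT = 50

Note that a few extra items may still come through, since requests already in flight when the threshold is reached can still produce items.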
I tried alecxe's answer, but I had to combine all 3 limits to make it work, so I'm leaving it here just in case someone else has the same issue:
class GenericWebsiteSpider(scrapy.Spider):
    """This generic website spider extracts text from websites"""
    name = "generic_website"
    custom_settings = {
        'CLOSESPIDER_PAGECOUNT': 15,
        'CONCURRENT_REQUESTS': 15,
        'CLOSESPIDER_ITEMCOUNT': 15
    }
    ...