I am new to scrapy and is currently learning how to scrape information from a list of scraped URL. I have been able to scrape information from a url by going thru the tutorial in scrapy website. However, i am facing problem scraping information from a list of url scraped from a url even after googling for solution online.
The scraper that i have written below is able to scrape from the first url. However, it is unsuccessful in scraping from a list of scraped URL. The problem starts at def parse_following_urls(self, response): whereby i am unable to scrape from the list of scraped URL
Can anyone help to solve this? Thank in advance.
import scrapy
from scrapy.http import Request
class SET(scrapy.Item):
title = scrapy.Field()
open = scrapy.Field()
hi = scrapy.Field()
lo = scrapy.Field()
last = scrapy.Field()
bid = scrapy.Field()
ask = scrapy.Field()
vol = scrapy.Field()
exp = scrapy.Field()
exrat = scrapy.Field()
exdat = scrapy.Field()
class ThaiSpider(scrapy.Spider):
name = "warrant"
allowed_domains = ["marketdata.set.or.th"]
start_urls = ["http://marketdata.set.or.th/mkt/stocklistbytype.do?market=SET&language=en&country=US&type=W"]
def parse(self, response):
for sel in response.xpath('//table[#class]/tbody/tr'):
item = SET()
item['title'] = sel.xpath('td[1]/a[contains(#href,"ssoPageId")]/text()').extract()
item['open'] = sel.xpath('td[3]/text()').extract()
item['hi'] = sel.xpath('td[4]/text()').extract()
item['lo'] = sel.xpath('td[5]/text()').extract()
item['last'] = sel.xpath('td[6]/text()').extract()
item['bid'] = sel.xpath('td[9]/text()').extract()
item['ask'] = sel.xpath('td[10]/text()').extract()
item['vol'] = sel.xpath('td[11]/text()').extract()
yield item
urll = response.xpath('//table[#class]/tbody/tr/td[1]/a[contains(#href,"ssoPageId")]/#href').extract()
urls = ["http://marketdata.set.or.th/mkt/"+ i for i in urll]
for url in urls:
request = scrapy.Request(url, callback=self.parse_following_urls, dont_filter=True)
yield request
request.meta['item'] = item
def parse_following_urls(self, response):
for sel in response.xpath('//table[3]/tbody'):
item = response.meta['item']
item['exp'] = sel.xpath('tr[1]/td[2]/text()').extract()
item['exrat'] = sel.xpath('tr[2]/td[2]/text()').extract()
item['exdat'] = sel.xpath('tr[3]/td[2]/text()').extract()
yield item
I have re wrote the code after trying suggestions given and looking at the output. Below is the edited code. However, i got another error that states that Request url must be str or unicode, got %s:' % type(url).__name__). How do i convert the URL from list to a string?
I thought URL should be in string as it is in a For loop. I have added this as comment in the code below. Is there any way to solve this?
import scrapy
from scrapy.http import Request
class SET(scrapy.Item):
title = scrapy.Field()
open = scrapy.Field()
hi = scrapy.Field()
lo = scrapy.Field()
last = scrapy.Field()
bid = scrapy.Field()
ask = scrapy.Field()
vol = scrapy.Field()
exp = scrapy.Field()
exrat = scrapy.Field()
exdat = scrapy.Field()
class ThaiSpider(scrapy.Spider):
name = "warrant"
allowed_domains = ["marketdata.set.or.th"]
start_urls = ["http://marketdata.set.or.th/mkt/stocklistbytype.do?market=SET&language=en&country=US&type=W"]
def parse(self, response):
for sel in response.xpath('//table[#class]/tbody/tr'):
item = SET()
item['title'] = sel.xpath('td[1]/a[contains(#href,"ssoPageId")]/text()').extract()
item['open'] = sel.xpath('td[3]/text()').extract()
item['hi'] = sel.xpath('td[4]/text()').extract()
item['lo'] = sel.xpath('td[5]/text()').extract()
item['last'] = sel.xpath('td[6]/text()').extract()
item['bid'] = sel.xpath('td[9]/text()').extract()
item['ask'] = sel.xpath('td[10]/text()').extract()
item['vol'] = sel.xpath('td[11]/text()').extract()
url = ["http://marketdata.set.or.th/mkt/"]+ sel.xpath('td[1]/a[contains(#href,"ssoPageId")]/#href').extract()
request = scrapy.Request(url, callback=self.parse_following_urls, dont_filter=True) #Request url must be str or unicode, got list: How to solve this?
request.meta['item'] = item
yield item
yield request
def parse_following_urls(self, response):
for sel in response.xpath('//table[3]/tbody'):
item = response.meta['item']
item['exp'] = sel.xpath('tr[1]/td[2]/text()').extract()
item['exrat'] = sel.xpath('tr[2]/td[2]/text()').extract()
item['exdat'] = sel.xpath('tr[3]/td[2]/text()').extract()
yield item
I see what you are trying to do here, it's called - chaining requests.
What this means is that you want to keep yielding Requests and keep carrying your filled Item in the Request
s meta attribute.
For your case all you need to do is instead of yielding Item yield a Request with an item in it. Change your parse to:
def parse(self, response):
for sel in response.xpath('//table[#class]/tbody/tr'):
item = SET()
item['title'] = sel.xpath('td[1]/a[contains(#href,"ssoPageId")]/text()').extract()
item['open'] = sel.xpath('td[3]/text()').extract()
item['hi'] = sel.xpath('td[4]/text()').extract()
item['lo'] = sel.xpath('td[5]/text()').extract()
item['last'] = sel.xpath('td[6]/text()').extract()
item['bid'] = sel.xpath('td[9]/text()').extract()
item['ask'] = sel.xpath('td[10]/text()').extract()
item['vol'] = sel.xpath('td[11]/text()').extract()
urll = response.xpath('//table[#class]/tbody/tr/td[1]/a[contains(#href,"ssoPageId")]/#href').extract()
urls = ["http://marketdata.set.or.th/mkt/" + i for i in urll]
for url in urls:
yield scrapy.Request(url,
callback=self.parse_following_urls,
meta={'item': item})
I try to change the inverse 5th line
item = response.meta['item']
to
item = SET()
then it works!
Actually I didn't realize your "meta"way very much,since I never use this to describe item.
Related
I am trying to scrape data of # pages. I have already done a scraper which can scrape data from a single # page. But it suddenly finished the work after scraping of the first page
The whole file with parse function and scrapd function - Scraper.py
# -*- coding: utf-8 -*-
import scrapy
import csv
import os
from scrapy.selector import Selector
from scrapy import Request
class Proddduct(scrapy.Item):
price = scrapy.Field()
description = scrapy.Field()
link = scrapy.Field()
content = scrapy.Field()
class LapadaScraperSpider(scrapy.Spider):
name = 'lapada_scraper2'
allowed_domains = ['http://www.lapada.org']
start_urls = ['https://lapada.org/art-and-antiques/?search=antique']
def parse(self, response):
next_page_url = response.xpath("//ul/li[#class='next']//a/#href").get()
for item in self.scrape(response):
yield item
if next_page_url:
print("Found url: {}".format(next_page_url))
yield scrapy.Request(url=next_page_url, callback=self.parse)
def scrape(self, response):
parser = scrapy.Selector(response)
products = parser.xpath("//div[#class='content']")
for product in products:
item = Proddduct()
XPATH_PRODUCT_DESCRIPTION = ".//strong/text()"
XPATH_PRODUCT_PRICE = ".//div[#class='price']/text()"
XPATH_PRODUCT_LINK = ".//a/#href"
raw_product_description = product.xpath(XPATH_PRODUCT_DESCRIPTION).extract()
raw_product_price = product.xpath(XPATH_PRODUCT_PRICE).extract()
raw_product_link = product.xpath(XPATH_PRODUCT_LINK).extract_first()
item['description'] = raw_product_description
item['price'] = raw_product_price
item['link'] = raw_product_link
yield item
def get_information(self, response):
item = response.meta['item']
item['phonenumber'] = "12345"
yield item
How can I scrape all items in all pages?
Thanks
Change allowed_domains = ['http://www.lapada.org'] to allowed_domains = ['lapada.org']
I am scraping some news website with scrapy framework, it seems only store the last item scraped and repeated in loop
I want to store the Title,Date,and Link, which i scrape from the first page
and also store the whole news article. So i want to merge the article which stored in a list into a single string.
Item code
import scrapy
class ScrapedItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
source = scrapy.Field()
date = scrapy.Field()
paragraph = scrapy.Field()
Spider code
import scrapy
from ..items import ScrapedItem
class CBNCSpider(scrapy.Spider):
name = 'kontan'
start_urls = [
'https://investasi.kontan.co.id/rubrik/28/Emiten'
]
def parse(self, response):
box_text = response.xpath("//ul/li/div[#class='ket']")
items = ScrapedItem()
for crawl in box_text:
title = crawl.css("h1 a::text").extract()
source ="https://investasi.kontan.co.id"+(crawl.css("h1 a::attr(href)").extract()[0])
date = crawl.css("span.font-gray::text").extract()[0].replace("|","")
items['title'] = title
items['source'] =source
items['date'] = date
yield scrapy.Request(url = source,
callback=self.parseparagraph,
meta={'item':items})
def parseparagraph(self, response):
items_old = response.meta['item'] #only last item stored
paragraph = response.xpath("//p/text()").extract()
items_old['paragraph'] = paragraph #merge into single string
yield items_old
I expect the output that the Date,Title,and Source can be updated through the loop.
And the article can be merged into single string to be stored in mysql
I defined an empty dictionary and put those variables within it. Moreover, I've brought about some minor changes in your xpaths and css selectors to make them less error prone. The script is working as desired now:
import scrapy
class CBNCSpider(scrapy.Spider):
name = 'kontan'
start_urls = [
'https://investasi.kontan.co.id/rubrik/28/Emiten'
]
def parse(self, response):
for crawl in response.xpath("//*[#id='list-news']//*[#class='ket']"):
d = {}
d['title'] = crawl.css("h1 > a::text").get()
d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
d['date'] = crawl.css("span.font-gray::text").get().strip("|")
yield scrapy.Request(
url=d['source'],
callback=self.parseparagraph,
meta={'item':d}
)
def parseparagraph(self, response):
items_old = response.meta['item']
items_old['paragraph'] = response.xpath("//p/text()").getall()
yield items_old
I am trying to grab details from a real estate listing page. I can grab all the data, I just can't seem to export it..
Perhaps a problem with the way I use the yield keyword. The code work for the most part:
Visits page 1, example.com/kittens
Goes to page 2, example.com/puppers. Here are 10 apartments listed in blocks. I can get data from each block, but I need additional info from inside the hyperlink.
Visits the hyperlink, say, example.com/puppers/apartment1. It grabs some info from here as well, but I can't seem to return this data to include it in my HousingItem() class.
import scrapy
from urllib.parse import urljoin
class HousingItem(scrapy.Item):
street = scrapy.Field()
postal = scrapy.Field()
city = scrapy.Field()
url = scrapy.Field()
buildY = scrapy.Field()
on_m = scrapy.Field()
off_m = scrapy.Field()
class FAppSpider(scrapy.Spider):
name = 'f_app'
allowed_domains = ['example.com']
start_urls = ['https://www.example.com/kittens']
def parse(self, response):
yield scrapy.Request(url="https://www.example.com/puppers",
callback=self.parse_puppers)
def parse_inside_pupper(self, response):
item = HousingItem()
item['buildY'] = response.xpath('').extract_first().strip()
item['on_m'] = response.xpath('').extract_first().strip()
item['off_m'] = response.xpath('').extract_first().strip()
def parse_puppers(self, response):
base_url = 'https://www.example.com/'
for block in response.css('div.search-result-main'):
item = HousingItem()
item['street'] = block.css(''),
item['postcode'] = block.css(''),
item['city'] = block.css('')
item['url'] = urljoin(base_url, block.css('div.search-result-header > a::attr(href)')[0].extract())
# Problem area from here..
yield response.follow(url=item['url'],callback=self.parse_inside_pupper)
# yield scrapy.request(url=item['url'],callback=self.parse_inside_pupper)?
yield item
FEED_EXPORT_FIELDS is adjusted in my SETTINGS.py. The 4 items from parse_puppers() get exported correctly, parse_inside_puppers() data is correct in the console, but wont export.
I use scrapy crawl f_app -o raw_data.csv to run me spider. Thanks in advance, appreciate all the help.
p.s. im fairly new to python and practising, i bet you noticed.
You need to send you current item to the parse_inside_pupper using meta param:
def parse_puppers(self, response):
base_url = 'https://www.example.com/'
for block in response.css('div.search-result-main'):
item = HousingItem()
item['street'] = block.css(''),
item['postcode'] = block.css(''),
item['city'] = block.css('')
item['url'] = urljoin(base_url, block.css('div.search-result-header > a::attr(href)')[0].extract())
yield response.follow(url=item['url'],callback=self.parse_inside_pupper, meta={"item": item})
After that you can use it inside parse_inside_pupper (and yield it from here):
def parse_inside_pupper(self, response):
item = response.meta["item"]
item['buildY'] = response.xpath('').extract_first().strip()
item['on_m'] = response.xpath('').extract_first().strip()
item['off_m'] = response.xpath('').extract_first().strip()
yield item
I try to get this spider work and if request the components to be scraped separately it works, however when try to use Srapy callback function to receive the arguments later i get crashed. The goal is to craw over multiple pages and scrape data while write in output json file in format:
author | album | title | lyrics
the data for each is located on separate web pages, so that is why I'm tying to use Scrapy callback function to get that accomplished.
Also each of the above items are defined under Scrapy items.py as:
import scrapy
class TutorialItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
author = scrapy.Field()
album = scrapy.Field()
title = scrapy.Field()
lyrics = scrapy.Field()
Spider Code start here:
import scrapy
import re
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tutorial.items import TutorialItem
# urls class
class DomainSpider(scrapy.Spider):
name = "domainspider"
allowed_domains = ['www.domain.com']
start_urls = [
'http://www.domain.com',
]
rules = (
Rule(LinkExtractor(allow='www\.domain\.com/[A-Z][a-zA-Z_/]+$'),
'parse', follow=True,
),
)
# Parsing start here
# crawling and scraping the links from menu list
def parse(self, response):
links = response.xpath('//html/body/nav[1]/div/ul/li/div/a/#href')
for link in links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
yield scrapy.Request(next_page, callback=self.parse_artist_page)
# crawling and scraping artist names and links
def parse_artist_page(self, response):
artist_links = response.xpath('//*/div[contains(#class, "artist-col")]/a/#href')
author = response.xpath('//*/div[contains(#class, "artist-col")]/a/text()').extract()
item = TutorialItem(author=author)
for link in artist_links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
yield scrapy.Request(next_page, callback=self.parse_album_page)
request.meta['author'] = item
yield item
return
# crawling and scraping album names and links
def parse_album_page(self, response):
album_links = response.xpath('//*/div[contains(#id, "listAlbum")]/a/#href')
album = response.xpath('//*/div[contains(#class, "album")]/b/text()').extract()
item = TutorialItem(album=album)
for link in album_links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
yield scrapy.Request(next_page, callback=self.parse_lyrics_page)
request.meta['album'] = item
yield item
return
# crawling and scraping titles and lyrics
def parse_lyrics_page(self, response):
title = response.xpath('//html/body/div[3]/div/div[2]/b/text()').extract()
lyrics = map(unicode.strip, response.xpath('//html/body/div[3]/div/div[2]/div[6]/text()').extract())
item = response.meta['author', 'album']
item = TutorialItem(author=author, album=album, title=title, lyrics=lyrics)
yield item
The code crash when get to call back function:
request.meta['author'] = item
yield item
return
Can anyone help?
I did found where was the problem, the way callback function was set by me, now works:
# crawling and scraping artist names and links
def parse_artist_page(self, response):
artist_links = response.xpath('//*/div[contains(#class, "artist-col")]/a/#href')
author = response.xpath('//*/div[contains(#class, "artist-col")]/a/text()').extract()
for link in artist_links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
request = scrapy.Request(next_page, callback=self.parse_album_page)
request.meta['author'] = author
return request
# crawling and scraping album names and links
def parse_album_page(self, response):
author = response.meta.get('author')
album_links = response.xpath('//*/div[contains(#id, "listAlbum")]/a/#href')
album = response.xpath('//*/div[contains(#class, "album")]/b/text()').extract()
for link in album_links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
request = scrapy.Request(next_page, callback=self.parse_lyrics_page)
request.meta['author'] = author
request.meta['album'] = album
return request
# crawling and scraping song titles and lyrics
def parse_lyrics_page(self, response):
author = response.meta.get('author')
album = response.meta.get('album')
title = response.xpath('//html/body/div[3]/div/div[2]/b/text()').extract()
lyrics = map(unicode.strip, response.xpath('//html/body/div[3]/div/div[2]/div[6]/text()').extract())
item = TutorialItem(author=author, album=album, title=title, lyrics=lyrics)
yield item
I am using scrapy to collect some data. My scrapy program collects 100 elements at one session. I need to limit it to 50 or any random number. How can i do that? Any solution is welcomed. Thanks in advance
# -*- coding: utf-8 -*-
import re
import scrapy
class DmozItem(scrapy.Item):
# define the fields for your item here like:
link = scrapy.Field()
attr = scrapy.Field()
title = scrapy.Field()
tag = scrapy.Field()
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["raleigh.craigslist.org"]
start_urls = [
"http://raleigh.craigslist.org/search/bab"
]
BASE_URL = 'http://raleigh.craigslist.org/'
def parse(self, response):
links = response.xpath('//a[#class="hdrlnk"]/#href').extract()
for link in links:
absolute_url = self.BASE_URL + link
yield scrapy.Request(absolute_url, callback=self.parse_attr)
def parse_attr(self, response):
match = re.search(r"(\w+)\.html", response.url)
if match:
item_id = match.group(1)
url = self.BASE_URL + "reply/ral/bab/" + item_id
item = DmozItem()
item["link"] = response.url
item["title"] = "".join(response.xpath("//span[#class='postingtitletext']//text()").extract())
item["tag"] = "".join(response.xpath("//p[#class='attrgroup']/span/b/text()").extract()[0])
return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)
def parse_contact(self, response):
item = response.meta['item']
item["attr"] = "".join(response.xpath("//div[#class='anonemail']//text()").extract())
return item
This is what CloseSpider extension and CLOSESPIDER_ITEMCOUNT setting were made for:
An integer which specifies a number of items. If the spider scrapes
more than that amount if items and those items are passed by the item
pipeline, the spider will be closed with the reason
closespider_itemcount. If zero (or non set), spiders won’t be closed
by number of passed items.
I tried alecxe answer but I had to combine all 3 limits to make it work, so leaving it here just in case someone else is having the same issue:
class GenericWebsiteSpider(scrapy.Spider):
"""This generic website spider extracts text from websites"""
name = "generic_website"
custom_settings = {
'CLOSESPIDER_PAGECOUNT': 15,
'CONCURRENT_REQUESTS': 15,
'CLOSESPIDER_ITEMCOUNT': 15
}
...