I am building a spider with Scrapy. I want to access every item in a list and then scrape all the data inside each link, but when I run the spider it doesn't scrape the data. What am I missing?
import scrapy
from scrapy.linkextractors import LinkExtractor

from ..items import JobscraperItem


class JobscraperSpider(scrapy.Spider):
    name = 'jobspider'
    start_urls = ['https://cccc/bolsa/ofertas?oferta=&lugar=&categoria=']

    def parse(self, response):
        job_detail = response.xpath('//div[@class="list"]/div/a')
        yield from response.follow_all(job_detail, self.parse_jobspider)

    def parse(self, response):
        items = JobscraperItem()
        job_title = response.xpath('//h1/text()').extract()
        company = response.xpath('//h2/b/text()').extract()
        company_url = response.xpath('//div[@class="pull-left"]/a/text()').extract()
        description = response.xpath('//div[@class="aviso"]/text()').extract()
        salary = response.xpath('//div[@id="aviso"]/p[1]/text()').extract()
        city = response.xpath('//div[@id="aviso"]/p[2]/text()').extract()
        district = response.xpath('//div[@id="aviso"]/p[5]/text()').extract()
        publication_date = response.xpath('//div[@id="publicado"]/text()').extract()
        apply = response.xpath('//p[@class="text-center"]/b/text()').extract()
        job_type = response.xpath('//div[@id="resumen"]/p[3]/text()').extract()

        items['job_title'] = job_title
        items['company'] = company
        items['company_url'] = company_url
        items['description'] = description
        items['salary'] = salary
        items['city'] = city
        items['district'] = district
        items['publication_date'] = publication_date
        items['apply'] = apply
        items['job_type'] = job_type
        yield items
From what I can see, one of the issues is that you are creating two functions called parse(). Since you are using self.parse_jobspider in your first parse function, I'm guessing that your second parse function is named incorrectly and should be called parse_jobspider.
Also, are you sure that the URL in start_urls is correct? https://cccc/bolsa/ofertas?oferta=&lugar=&categoria= doesn't lead anywhere, which would also explain why no data is being scraped.
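A minimal sketch of that rename (keeping the question's selectors and trimming the item assignments to two fields for brevity; the rest would be filled in the same way):

import scrapy

from ..items import JobscraperItem


class JobscraperSpider(scrapy.Spider):
    name = 'jobspider'
    start_urls = ['https://cccc/bolsa/ofertas?oferta=&lugar=&categoria=']

    def parse(self, response):
        # Follow every job link found on the listing page.
        job_detail = response.xpath('//div[@class="list"]/div/a')
        yield from response.follow_all(job_detail, self.parse_jobspider)

    def parse_jobspider(self, response):
        # The callback name now matches the one referenced in follow_all above.
        items = JobscraperItem()
        items['job_title'] = response.xpath('//h1/text()').extract()
        items['company'] = response.xpath('//h2/b/text()').extract()
        yield items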
I resolved this by adding the following rule to access every link and scrape the data inside:

rules = (
    Rule(LinkExtractor(allow=('/bolsa/166',)), follow=True, callback='parse_item'),
)
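For that rules attribute to take effect, the spider has to inherit from CrawlSpider rather than plain scrapy.Spider, and Rule/LinkExtractor must be imported. A rough sketch of how this self-answer fits together (the parse_item body is assumed to be the same detail-page extraction as above, trimmed here to two fields):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from ..items import JobscraperItem


class JobscraperSpider(CrawlSpider):
    name = 'jobspider'
    start_urls = ['https://cccc/bolsa/ofertas?oferta=&lugar=&categoria=']

    rules = (
        # Follow every link whose URL contains /bolsa/166 and parse each job page.
        Rule(LinkExtractor(allow=('/bolsa/166',)), follow=True, callback='parse_item'),
    )

    def parse_item(self, response):
        items = JobscraperItem()
        items['job_title'] = response.xpath('//h1/text()').extract()
        items['company'] = response.xpath('//h2/b/text()').extract()
        yield items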
I am scraping a news website with the Scrapy framework, but it seems to store only the last scraped item, repeated in a loop.
I want to store the title, date and link, which I scrape from the first page, and also store the whole news article. So I want to merge the article, which is stored as a list, into a single string.
Item code
import scrapy


class ScrapedItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()
Spider code
import scrapy

from ..items import ScrapedItem


class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        box_text = response.xpath("//ul/li/div[@class='ket']")
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + (crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")

            items['title'] = title
            items['source'] = source
            items['date'] = date

            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        items_old = response.meta['item']  # only last item stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # merge into single string
        yield items_old
I expect the date, title and source to be updated through the loop, and the article to be merged into a single string so it can be stored in MySQL.
I defined an empty dictionary and put those variables within it. Moreover, I've made some minor changes to your XPath and CSS selectors to make them less error-prone. The script is working as desired now:
import scrapy


class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
            d['date'] = crawl.css("span.font-gray::text").get().strip("|")
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d}
            )

    def parseparagraph(self, response):
        items_old = response.meta['item']
        items_old['paragraph'] = response.xpath("//p/text()").getall()
        yield items_old
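Since the original goal was to store the article as a single string in MySQL, a small follow-up sketch (the space separator and whitespace stripping are assumptions) would be to join the paragraphs before yielding, for example by replacing parseparagraph with:

def parseparagraph(self, response):
    items_old = response.meta['item']
    # Join the <p> text nodes into one string instead of keeping a list.
    items_old['paragraph'] = " ".join(
        p.strip() for p in response.xpath("//p/text()").getall() if p.strip()
    )
    yield items_old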
I am trying to scrape the underlying data on the table in the following pages: https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries
What I want to do is access the underlying link for each row, and capture:
The ID tag (e.g. QDE001),
The name
The reason for listing / additional information
Other linked entities
This is what I have, but it does not seem to be working. I keep getting a NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__)). I believe that the XPaths I have defined are OK; I'm not sure what I am missing.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class UNSCItem(scrapy.Item):
    name = scrapy.Field()
    uid = scrapy.Field()
    link = scrapy.Field()
    reason = scrapy.Field()
    add_info = scrapy.Field()


class UNSC(scrapy.Spider):
    name = "UNSC"
    start_urls = [
        'https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries?type=All&page=0',
        'https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries?type=All&page=1',
        'https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries?type=All&page=2',
        'https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries?type=All&page=3',
        'https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries?type=All&page=4',
        'https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries?type=All&page=5',
        'https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries?type=All&page=6',
    ]

    rules = Rule(LinkExtractor(allow=('/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries/',)), callback='data_extract')

    def data_extract(self, response):
        item = UNSCItem()
        name = response.xpath('//*[@id="content"]/article/div[3]/div//text()').extract()
        uid = response.xpath('//*[@id="content"]/article/div[2]/div/div//text()').extract()
        reason = response.xpath('//*[@id="content"]/article/div[6]/div[2]/div//text()').extract()
        add_info = response.xpath('//*[@id="content"]/article/div[7]//text()').extract()
        related = response.xpath('//*[@id="content"]/article/div[8]/div[2]//text()').extract()
        yield item
Try the approach below. It should fetch all the IDs and corresponding names from all six pages. I suppose you can manage the rest of the fields yourself.
Just run it as it is:
import scrapy


class UNSC(scrapy.Spider):
    name = "UNSC"
    start_urls = ['https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries?type=All&page={}'.format(page) for page in range(0, 7)]

    def parse(self, response):
        for item in response.xpath('//*[contains(@class,"views-table")]//tbody//tr'):
            idnum = item.xpath('.//*[contains(@class,"views-field-field-reference-number")]/text()').extract()[-1].strip()
            name = item.xpath('.//*[contains(@class,"views-field-title")]//span[@dir="ltr"]/text()').extract()[-1].strip()
            yield {'ID': idnum, 'Name': name}
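For the remaining fields (reason for listing, additional information, linked entities), one option is to follow each row's detail link from the same loop. This is only a sketch: the detail-page XPaths are copied from the question, and the link selector inside the title cell is an assumption, so both need checking against the real markup.

import scrapy


class UNSCDetail(scrapy.Spider):
    name = "UNSC_detail"
    start_urls = ['https://www.un.org/sc/suborg/en/sanctions/1267/aq_sanctions_list/summaries?type=All&page={}'.format(page) for page in range(0, 7)]

    def parse(self, response):
        for row in response.xpath('//*[contains(@class,"views-table")]//tbody//tr'):
            idnum = row.xpath('.//*[contains(@class,"views-field-field-reference-number")]/text()').extract()[-1].strip()
            name = row.xpath('.//*[contains(@class,"views-field-title")]//span[@dir="ltr"]/text()').extract()[-1].strip()
            link = row.xpath('.//*[contains(@class,"views-field-title")]//a/@href').get()
            if link:
                # Carry the fields from the listing row over to the detail page.
                yield response.follow(link, callback=self.parse_detail,
                                      meta={'ID': idnum, 'Name': name})

    def parse_detail(self, response):
        item = {'ID': response.meta['ID'], 'Name': response.meta['Name']}
        # These XPaths come from the question and are assumptions about the detail-page layout.
        item['reason'] = response.xpath('//*[@id="content"]/article/div[6]/div[2]/div//text()').extract()
        item['add_info'] = response.xpath('//*[@id="content"]/article/div[7]//text()').extract()
        yield item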
I want to create a Scrapy script to scrape all of the results for computer gigs in any Craigslist subdomain, for example here: http://losangeles.craigslist.org/search/cpg/.
This query returns a list of many articles, and I've tried to scrape the title and href of each of these results (not only the ones on the first page) using CrawlSpider and LinkExtractor, to no avail: the script returns nothing.
I'll paste my script here, thanks.
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor


class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = (
        'http://losangeles.craigslist.org/search/cpg/',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")

    # This for loop is not indented inside parse_page (the answer below points this out).
    for i in items:
        link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
        title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
        print link, title
According to the code you pasted, parse_page:
does not return/yield anything, and
only contains one line: "items = response.selector..."
The reason for #2 above is that the for loop is not properly indented.
Try to indent the for loop:
class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = ('http://losangeles.craigslist.org/search/cpg/',)
    rules = (Rule(
        LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
        callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")
        for i in items:
            link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
            title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
            print link, title
            yield dict(link=link, title=title)
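As a side note (an observation about the pasted code, not something the answer above raised): allowed_domains entries are expected to be bare domain names. An entry carrying the http:// scheme can cause Scrapy's offsite filtering to drop every followed link, which would also make the spider return nothing. Something like this is safer:

# Domain only, no scheme; subdomains such as losangeles.craigslist.org still match.
allowed_domains = ["craigslist.org"]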
I'm writing a spider (CrawlSpider) for an online store. According to the client's requirements, I need to write two rules: one for determining which pages have items and another for extracting the items.
I have both rules already working independently:
If my start_urls = ["www.example.com/books.php", "www.example.com/movies.php"] and I comment out the Rule and the code of parse_category, my parse_item will extract every item.
On the other hand, if start_urls = "http://www.example.com" and I comment out the Rule and the code of parse_item, parse_category will return every link in which there are items for extracting, i.e. parse_category will return www.example.com/books.php and www.example.com/movies.php.
My problem is that I don't know how to merge both modules, so that start_urls = "http://www.example.com" and then parse_category extracts www.example.com/books.php and www.example.com/movies.php and feeds those links to parse_item, where I actually extract the info of each item.
I need to find a way to do it this way instead of just using start_urls = ["www.example.com/books.php", "www.example.com/movies.php"], because if in the future a new category is added (e.g. www.example.com/music.php), the spider wouldn't be able to detect that new category automatically and would have to be edited manually. Not a big deal, but the client doesn't want this.
class StoreSpider(CrawlSpider):
    name = "storyder"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]
    # start_urls = ["http://www.example.com/books.php", "http://www.example.com/movies.php"]

    rules = (
        Rule(LinkExtractor(), follow=True, callback='parse_category'),
        Rule(LinkExtractor(), follow=False, callback="parse_item"),
    )

    def parse_category(self, response):
        category = StoreCategory()
        # some code for determining whether the current page is a category, or just other stuff
        if is_a_category:  # placeholder for that check
            category['name'] = name
            category['url'] = response.url
        return category

    def parse_item(self, response):
        item = StoreItem()
        # some code for extracting the item's data
        return item
The CrawlSpider rules don't work the way you want here; you'll need to implement the logic yourself. When you specify follow=True you can't use a callback, because the idea is to keep getting links (not items) while following the rules; check the documentation.
You could try something like this:
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider


class StoreSpider(CrawlSpider):
    name = "storyder"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]
    # no rules

    def parse(self, response):  # this is parse_category
        category_le = LinkExtractor("something for categories")
        for a in category_le.extract_links(response):
            yield Request(a.url, callback=self.parse_category)
        item_le = LinkExtractor("something for items")
        for a in item_le.extract_links(response):
            yield Request(a.url, callback=self.parse_item)

    def parse_category(self, response):
        category = StoreCategory()
        # some code for determining whether the current page is a category, or just other stuff
        if is_a_category:  # placeholder for that check
            category['name'] = name
            category['url'] = response.url
        yield category
        for req in self.parse(response):
            yield req

    def parse_item(self, response):
        item = StoreItem()
        # some code for extracting the item's data
        return item
Instead of using parse_category, I used restrict_css in the LinkExtractor to get the links I want, and it seems to feed the second Rule with the extracted links, so my question is answered. It ended up this way:
class StoreSpider(CrawlSpider):
    name = "storyder"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]

    rules = (
        Rule(LinkExtractor(restrict_css=("#movies", "#books"))),
        Rule(LinkExtractor(), callback="parse_item"),
    )

    def parse_item(self, response):
        item = StoreItem()
        # some code for extracting the item's data
        return item
Still, it can't detect newly added categories (and there is no clear pattern to use in restrict_css without fetching other garbage), but at least it complies with the client's requirements: two rules, one for extracting category links and another for extracting item data.
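If the category pages do share a URL pattern (an assumption here, e.g. every category living at /<name>.php), an allow= regex in the first Rule would let the spider pick up a future www.example.com/music.php automatically:

rules = (
    # Follow any top-level *.php page as a potential category listing (the pattern is an assumption).
    Rule(LinkExtractor(allow=(r'/\w+\.php$',))),
    # Extract item data from the pages reached that way.
    Rule(LinkExtractor(), callback="parse_item"),
)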
I am attempting to scrape the Library of Congress/THOMAS website. This Python script is intended to access a sample of 40 bills from the site (numbers 1-40 in the URLs). I want to parse the body of each piece of legislation, search the body/content, extract links to potential multiple versions and follow them.
Once on the version page(s), I want to parse the body of each version, search the body/content and extract links to potential sections and follow them.
Once on the section page(s), I want to parse the body of each section of a bill.
I believe there is some issue with the Rules/LinkExtractor segment of my code. The Python code executes and crawls the start URLs, but does not parse them or do any of the subsequent tasks.
Three issues:
Some bills do not have multiple versions (and therefore no links in the body portion of the page).
Some bills do not have linked sections because they are so short, while some are nothing but links to sections.
Some section links do not contain just section-specific content, and most of the content is just redundant inclusion of prior or subsequent section content.
My question is again, why is Scrapy not crawling or parsing?
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector


class BillItem(Item):
    title = Field()
    body = Field()


class VersionItem(Item):
    title = Field()
    body = Field()


class SectionItem(Item):
    body = Field()


class Lrn2CrawlSpider(CrawlSpider):
    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    start_urls = ["http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill for bill in xrange(000001, 00040, 00001)  ### Sample of 40 bills; total range of bills is 1-5767
                  ]

# NOTE: this rules block and the functions below it are not indented inside the class (see the answer below).
rules = (
    # Extract links matching the /query/ fragment (restricted to those inside the content body of the page)
    # and follow links from them (since no callback means follow=True by default).
    # Desired result: scrape all bill text & in the event that there are multiple versions, follow them & parse.
    Rule(SgmlLinkExtractor(allow=(r'/query/'), restrict_xpaths=('//div[@id="content"]')), callback='parse_bills', follow=True),

    # Extract links in the body of a bill version & follow them.
    # Desired result: scrape all version text & in the event that there are multiple sections, follow them & parse.
    Rule(SgmlLinkExtractor(restrict_xpaths=('//div/a[2]')), callback='parse_versions', follow=True)
)

def parse_bills(self, response):
    hxs = HtmlXPathSelector(response)
    bills = hxs.select('//div[@id="content"]')
    scraped_bills = []
    for bill in bills:
        scraped_bill = BillItem()  ### Bill object defined previously
        scraped_bill['title'] = bill.select('p/text()').extract()
        scraped_bill['body'] = response.body
        scraped_bills.append(scraped_bill)
    return scraped_bills

def parse_versions(self, response):
    hxs = HtmlXPathSelector(response)
    versions = hxs.select('//div[@id="content"]')
    scraped_versions = []
    for version in versions:
        scraped_version = VersionItem()  ### Version object defined previously
        scraped_version['title'] = version.select('center/b/text()').extract()
        scraped_version['body'] = response.body
        scraped_versions.append(scraped_version)
    return scraped_versions

def parse_sections(self, response):
    hxs = HtmlXPathSelector(response)
    sections = hxs.select('//div[@id="content"]')
    scraped_sections = []
    for section in sections:
        scraped_section = SectionItem()  ## Segment object defined previously
        scraped_section['body'] = response.body
        scraped_sections.append(scraped_section)
    return scraped_sections

spider = Lrn2CrawlSpider()
Just for the record, the problem with your script is that the variable rules is not inside the scope of Lrn2CrawlSpider because it doesn't share the same indentation, so when alecxe fixed the indentation the variable rules became an attribute of the class. Later, the inherited method __init__() reads that attribute, compiles the rules and enforces them:

def __init__(self, *a, **kw):
    super(CrawlSpider, self).__init__(*a, **kw)
    self._compile_rules()

Erasing the last line had nothing to do with it.
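A tiny illustration of that scoping point (the class names here are made up for the example): a rules block dedented out of the class body is just a module-level variable, so the spider keeps the empty default CrawlSpider.rules and only fetches its start_urls.

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class ProperlyIndented(CrawlSpider):
    name = "ok"
    # Class attribute: CrawlSpider.__init__() finds self.rules and compiles it.
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/query/',)), follow=True),
    )


class Dedented(CrawlSpider):
    name = "broken"

# Module-level variable: Dedented never sees this and falls back to the empty default rules.
rules = (
    Rule(SgmlLinkExtractor(allow=(r'/query/',)), follow=True),
)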
I've just fixed the indentation, removed the spider = Lrn2CrawlSpider() line at the end of the script, ran the spider via scrapy runspider lrn2crawl.py, and it scrapes, follows links and returns items; your rules work.
Here's what I'm running:
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector


class BillItem(Item):
    title = Field()
    body = Field()


class VersionItem(Item):
    title = Field()
    body = Field()


class SectionItem(Item):
    body = Field()


class Lrn2CrawlSpider(CrawlSpider):
    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    start_urls = ["http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill for bill in xrange(000001, 00040, 00001)  ### Sample of 40 bills; total range of bills is 1-5767
                  ]

    rules = (
        # Extract links matching the /query/ fragment (restricted to those inside the content body of the page)
        # and follow links from them (since no callback means follow=True by default).
        # Desired result: scrape all bill text & in the event that there are multiple versions, follow them & parse.
        Rule(SgmlLinkExtractor(allow=(r'/query/'), restrict_xpaths=('//div[@id="content"]')), callback='parse_bills', follow=True),

        # Extract links in the body of a bill version & follow them.
        # Desired result: scrape all version text & in the event that there are multiple sections, follow them & parse.
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div/a[2]')), callback='parse_versions', follow=True)
    )

    def parse_bills(self, response):
        hxs = HtmlXPathSelector(response)
        bills = hxs.select('//div[@id="content"]')
        scraped_bills = []
        for bill in bills:
            scraped_bill = BillItem()  ### Bill object defined previously
            scraped_bill['title'] = bill.select('p/text()').extract()
            scraped_bill['body'] = response.body
            scraped_bills.append(scraped_bill)
        return scraped_bills

    def parse_versions(self, response):
        hxs = HtmlXPathSelector(response)
        versions = hxs.select('//div[#id="content"]'.replace('#', '@'))
        scraped_versions = []
        for version in versions:
            scraped_version = VersionItem()  ### Version object defined previously
            scraped_version['title'] = version.select('center/b/text()').extract()
            scraped_version['body'] = response.body
            scraped_versions.append(scraped_version)
        return scraped_versions

    def parse_sections(self, response):
        hxs = HtmlXPathSelector(response)
        sections = hxs.select('//div[@id="content"]')
        scraped_sections = []
        for section in sections:
            scraped_section = SectionItem()  ## Segment object defined previously
            scraped_section['body'] = response.body
            scraped_sections.append(scraped_section)
        return scraped_sections
Hope that helps.