Scrapy linkextractor, follow not working - python

I have tried to extract all links from a website. My spider is a subclass of a superclass called GeneralSpider. The problem is that when I rename the method 'parse_url' to 'parse' (overriding a method of the superclass), the link extractor gets all links of the main page but does not follow them. If I don't rename the method, the spider does not work at all. Am I doing something wrong?
# -*- coding: utf-8 -*-
from core.generalSpider import GeneralSpider
from scrapy.linkextractors import LinkExtractor
from scrapy import log
from scrapy.contrib.spiders import Rule
from scrapy.item import Item, Field
from spiders.settings import GET_ITEMS


class MyItem(Item):
    url = Field()
    text = Field()
    item = Field()


class GetItemsSpider(GeneralSpider):

    name = GET_ITEMS
    start_urls = 'http://www.example.com'
    allowed_domains = ['example.com']
    rules = (Rule(LinkExtractor(allow=()), callback='parse_url', follow=True), )

    def __init__(self, port, **kwargs):
        super(GetItemsSpider, self).__init__(port, **kwargs)
        # User agent
        self.user_agent = Utils.get_random_item_from_list(core_settings.USER_AGENT_LIST)
        # Scrapy logs
        self.log('GetItemsSpider init start_urls= %s parameters= %s ' %
                 (self.start_urls, str(self.parameters)), level=log.DEBUG)
        self.log('%s init start_urls= %s parameters= %s ' %
                 (self.name, self.start_urls, str(self.parameters)), level=log.INFO)
        self.log('USER AGENT = %s' % self.user_agent, level=log.INFO)
        self.log('PORT = %s' % self._proxy_port, level=log.INFO)

    def parse_url(self, response):
        items = []
        self.log('GetItemsSpider parse start %s' % response.url, level=log.DEBUG)
        for link in LinkExtractor().extract_links(response):
            item = MyItem()
            item['text'] = link.text
            item['url'] = link.url
            items.append(item)
        return items

There is no better explanation than the one in the documentation; check the warning there: when writing crawl spider rules, avoid using parse as the callback, since CrawlSpider uses the parse method itself to implement its logic. Just don't override parse.
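For illustration, a minimal sketch of the pattern that warning implies, assuming a plain CrawlSpider on Scrapy 1.0+ (the spider and domain names are illustrative): give the rule callback any name except parse and let CrawlSpider's own parse drive the rules.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']  # note: a list, not a plain string

    # The callback is deliberately not named 'parse'; CrawlSpider uses
    # parse() internally to apply the rules and follow links.
    rules = (Rule(LinkExtractor(allow=()), callback='parse_url', follow=True),)

    def parse_url(self, response):
        self.logger.debug('parse_url %s', response.url)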

In the end I could not find out why my code was not working, but I found an alternative solution:
def parse_url(self, response):
    # needs: import urlparse; from scrapy.http import Request
    self.log('GetItemsSpider parse start %s' % response.url, level=log.DEBUG)
    for link in LinkExtractor().extract_links(response):
        item = MyItem()
        item['text'] = link.text
        item['url'] = link.url
        if condition:  # 'condition' is a placeholder for whatever filter applies
            yield Request(urlparse.urljoin(response.url, link.url), callback=self.parse)
        yield item
This solution is based on Philip Adzanoukpe's example. I hope this can be useful.

Is it possible to run pipelines and crawl multiple URLs at the same time in scrapy?

My spider looks like this:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName


class SpidernameSpider(CrawlSpider):
    name = 'spidername'
    allowed_domains = ['webaddress']
    start_urls = ['webaddress/query1']

    rules = (
        Rule(LinkExtractor(restrict_css='horizontal css')),
        Rule(LinkExtractor(restrict_css='vertical css'),
             callback='parse_item')
    )

    def parse_item(self, response):
        item = ProjectName()
        css_1 = 'css1::text'
        item['1'] = response.css(css_1).extract()
        css_2 = 'css2::text'
        item['2'] = response.css(css_2).extract()
        return item
and my pipeline like this:
from scrapy.exceptions import DropItem


class RemoveIncompletePipeline(object):

    def process_item(self, item, spider):
        # Scrapy calls process_item() for every scraped item
        if item['1']:
            return item
        else:
            raise DropItem("Missing content in %s" % item)
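For completeness, a pipeline like this only takes effect once it is enabled in the project settings; a minimal sketch, assuming the module path follows the project layout above:

# settings.py
ITEM_PIPELINES = {
    'ProjectName.pipelines.RemoveIncompletePipeline': 300,
}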
Everything works fine: when the value for field 1 is missing, the corresponding item is taken out of the output.
But when I change start_urls, in order to do the job for multiple queries, like this:
f = open("queries.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
or like this:
start_urls = [i.strip() for i in open('queries.txt').readlines()]
then the output contains the items with a missing value for field 1.
What's going on? And how can I avoid that?
For the record, queries.txt looks like this:
webaddress/query1
webaddress/query2
According to the docs, you should override the start_requests method.
This method must return an iterable with the first Requests to crawl
for this spider.
This is the method called by Scrapy when the spider is opened for
scraping when no particular URLs are specified. If particular URLs are
specified, the make_requests_from_url() is used instead to create the
Requests. This method is also called only once from Scrapy, so it’s
safe to implement it as a generator.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName


class SpidernameSpider(CrawlSpider):
    name = 'spidername'
    allowed_domains = ['webaddress']
    start_urls = ['webaddress/query1']

    rules = (
        Rule(LinkExtractor(restrict_css='horizontal css')),
        Rule(LinkExtractor(restrict_css='vertical css'),
             callback='parse_item')
    )

    def start_requests(self):
        return [Request(i.strip(), callback=self.parse_item)
                for i in open('queries.txt').readlines()]

    def parse_item(self, response):
        item = ProjectName()
        css_1 = 'css1::text'
        item['1'] = response.css(css_1).extract()
        css_2 = 'css2::text'
        item['2'] = response.css(css_2).extract()
        return item
UPD:
Just put this code into your spider class:
def start_requests(self):
    # requires: from scrapy.http import Request
    return [Request(i.strip(), callback=self.parse_item)
            for i in open('queries.txt').readlines()]
UPD:
Your parse_item method has wrong logic in it. You need to fix it.
def parse_item(self, response):
    for job in response.css('div.card-top'):
        item = ProjectName()
        # just a quick example
        item['city'] = job.xpath('string(//span[@class="serp-location"])').extract()[0].replace(' ', '').replace('\n', '')
        # TODO: you should fill other item fields
        # ...
        yield item

scrapy not following links

The following scrapy code for returning medical treatment information does return the first set of results, but does not follow links. I'm learning, and I checked similar questions here on Stack Overflow, but integrating those answers did not work. Any pointers would be appreciated.
import urlparse

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
import w3lib.url

from yelp.items import YelpItem


class YelpSpider(BaseSpider):
    name = "yelp"
    download_delay = 10
    concurrent_requests = 1
    concurrent_requests_per_domain = 1
    allowed_domains = ["yelp.com"]
    start_urls = ["http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=0",
                  "http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=20",
                  "http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=30"]

    def parse(self, response):
        selector = Selector(response)
        for title in selector.css("span.indexed-biz-name"):
            page_url = urlparse.urljoin(response.url,
                                        title.xpath("a/@href").extract()[0])
            self.log("page URL: %s" % page_url)
            #continue
            yield Request(page_url,
                          callback=self.parse_page)
        for next_page in selector.css(u'ul > li > a.prev-next:contains(\u2192)'):
            next_url = urlparse.urljoin(response.url,
                                        next_page.xpath('@href').extract()[0])
            self.log("next URL: %s" % next_url)
            #continue
            yield Request(next_url,
                          callback=self.parse)

    def parse_page(self, response):
        selector = Selector(response)
        item = YelpItem()
        item["name"] = selector.xpath('.//h1[@itemprop="name"]/text()').extract()[0].strip()
        item["addresslocality"] = u"\n".join(
            selector.xpath('.//address[@itemprop="address"]//text()').extract()).strip()
        item["link"] = response.url
        website = selector.css('div.biz-website a')
        if website:
            website_url = website.xpath('@href').extract()[0]
            item["website"] = w3lib.url.url_query_parameter(website_url, "url")
        return item
Your next-URL extraction and selection logic is not correct. Target the link element having both the next and pagination-links_anchor classes. The following works for me:
next_url = response.css('a.pagination-links_anchor.next::attr(href)').extract_first()
if next_url:
    next_url = urlparse.urljoin(response.url, next_url)
    self.log("next URL: %s" % next_url)
    yield Request(next_url, callback=self.parse)
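As a side note, on Scrapy 1.4 or newer, response.follow accepts a relative URL directly, so the urljoin step can be dropped; a minimal sketch of the same pagination step under that assumption:

next_url = response.css('a.pagination-links_anchor.next::attr(href)').extract_first()
if next_url:
    # response.follow resolves the relative href against response.url
    yield response.follow(next_url, callback=self.parse)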

scrapy: A tiny "spider" in a spider?

So when I try to scrape product review info from epinions.com, if the main review text is too long, there is a "read more" link to another page.
I took an example from "http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1"; you'll see what I mean if you look at the first review.
I am wondering: is it possible to have a tiny spider in each iteration of the for loop to grab the url and scrape the review out of the new link? I have the following code, but it doesn't work for the tiny "spider".
Here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from epinions_test.items import EpinionsTestItem
from scrapy.http import Response, HtmlResponse


class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        items = []
        for sites in sites:
            item = EpinionsTestItem()
            item["title"] = sites.select('h2/a/text()').extract()
            item["star"] = sites.select('span/a/span/@title').extract()
            item["date"] = sites.select('span/span/span/@title').extract()
            item["review"] = sites.select('p/span/text()').extract()
            # Everything works fine and I do have those four columns beautifully printed out, until....

            url2 = sites.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url2)[3:-2])
            # This url is a string. When I print it out, it looks legit, e.g.
            # "http://www.epinions.com/review/samsung-galaxy-note-16-gb-cell-phone/content_624031731332".

            response2 = HtmlResponse(url)
            # I tried this in a scrapy shell; it shows that this is an HtmlResponse...

            hxs2 = HtmlXPathSelector(response2)
            fullReview = hxs2.select('//div[@class="user_review_full"]')
            item["url"] = fullReview.select('p/text()').extract()
            # The three lines above work in an independent spider, where start_urls is changed
            # to the url just generated. However, I get nothing from item["url"] in this code.

            items.append(item)
        return items
Why does item["url"] return nothing?
Thanks!
You should instantiate a new Request in the callback and pass your item in the meta dict:
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class EpinionsTestItem(Item):
    title = Field()
    star = Field()
    date = Field()
    review = Field()


class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        for sites in sites:
            item = EpinionsTestItem()
            item["title"] = sites.select('h2/a/text()').extract()
            item["star"] = sites.select('span/a/span/@title').extract()
            item["date"] = sites.select('span/span/span/@title').extract()

            url = sites.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url)[3:-2])

            yield Request(url=url, callback=self.parse_url2, meta={'item': item})

    def parse_url2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        fullReview = hxs.select('//div[@class="user_review_full"]')
        item["review"] = fullReview.select('p/text()').extract()
        yield item
Also see the documentation.
Hope that helps.
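On newer Scrapy releases (1.7+), cb_kwargs is the preferred way to hand data to the next callback instead of the meta dict; a minimal sketch of the same hand-off, with the XPath carried over from above and the old selector API replaced by response.xpath:

def parse(self, response):
    # ... build item and url as above ...
    yield Request(url=url, callback=self.parse_url2, cb_kwargs={'item': item})

def parse_url2(self, response, item):
    # 'item' arrives as a keyword argument instead of via response.meta
    item["review"] = response.xpath('//div[@class="user_review_full"]/p/text()').extract()
    yield item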

scrapy transport start_url to subsequent requests

For three days I have been trying to save the respective start_url in a meta attribute, to pass it as an item to subsequent requests in scrapy, so I can use the start_url to look up a dictionary and populate my output with additional data. Actually it should be straightforward, because it is explained in the documentation ...
There is a discussion in the google scrapy group, and there was a question here also, but I can't get it to run :(
I am new to scrapy and I think it is a great framework, but for my project I have to know the start_urls of all requests, and it is quite complicated, as it looks.
I would really appreciate some help!
At the moment my code looks like this:
class example(CrawlSpider):
    name = 'example'
    start_urls = ['http://www.example.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=('/blablabla/', )), callback='parse_item'),
    )

    def parse(self, response):
        for request_or_item in super(example, self).parse(response):
            if isinstance(request_or_item, Request):
                request_or_item = request_or_item.replace(meta={'start_url': response.meta['start_url']})
            yield request_or_item

    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True, meta={'start_url': url})

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = testItem()
        print response.request.meta, response.url
I wanted to delete this answer as it doesn't solve the OP's problem, but I thought I'd leave it as a scrapy example.
Warning:
When writing crawl spider rules, avoid using parse as callback, since
the CrawlSpider uses the parse method itself to implement its logic.
So if you override the parse method, the crawl spider will no longer
work.
Use BaseSpider instead:
class Spider(BaseSpider):
    name = "domain_spider"

    def start_requests(self):
        last_domain_id = 0
        chunk_size = 10
        cursor = settings.db.cursor()

        while True:
            cursor.execute("""
                SELECT domain_id, domain_url
                FROM domains
                WHERE domain_id > %s AND scraping_started IS NULL
                LIMIT %s
                """, (last_domain_id, chunk_size))
            self.log('Requesting %s domains after %s' % (chunk_size, last_domain_id))
            rows = cursor.fetchall()
            if not rows:
                self.log('No more domains to scrape.')
                break

            for domain_id, domain_url in rows:
                last_domain_id = domain_id
                request = self.make_requests_from_url(domain_url)
                item = items.Item()
                item['start_url'] = domain_url
                item['domain_id'] = domain_id
                item['domain'] = urlparse.urlparse(domain_url).hostname
                request.meta['item'] = item

                cursor.execute("""
                    UPDATE domains
                    SET scraping_started = %s
                    WHERE domain_id = %s
                    """, (datetime.now(), domain_id))

                yield request

    ...
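The callback is elided above; since make_requests_from_url defaults the callback to parse, a minimal sketch of how the attached item would be picked up again (the title field is purely illustrative and assumes the project's Item declares it):

def parse(self, response):
    # retrieve the item that start_requests attached to this request
    item = response.meta['item']
    item['title'] = response.xpath('//title/text()').extract()
    yield item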

Scrapy - Follow RSS links

I was wondering if anyone ever tried to extract/follow RSS item links using SgmlLinkExtractor/CrawlSpider. I can't get it to work...
I am using the following rule:
rules = (
    Rule(SgmlLinkExtractor(tags=('link',), attrs=False),
         follow=True,
         callback='parse_article'),
)
(having in mind that RSS links are located in the link tag).
I am not sure how to tell SgmlLinkExtractor to extract the text() of the link and not to search the attributes...
Any help is welcome.
Thanks in advance!
CrawlSpider rules don't work that way. You'll probably need to subclass BaseSpider and implement your own link extraction in your spider callback. For example:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector


class MySpider(BaseSpider):
    name = 'myspider'

    def parse(self, response):
        xxs = XmlXPathSelector(response)
        links = xxs.select("//link/text()").extract()
        return [Request(x, callback=self.parse_link) for x in links]
You can also try the XPath in the shell, by running for example:
scrapy shell http://blog.scrapy.org/rss.xml
And then typing in the shell:
>>> xxs.select("//link/text()").extract()
[u'http://blog.scrapy.org',
u'http://blog.scrapy.org/new-bugfix-release-0101',
u'http://blog.scrapy.org/new-scrapy-blog-and-scrapy-010-release']
There's an XMLFeedSpider one can use nowadays.
I have done it using CrawlSpider:
class MySpider(CrawlSpider):
    domain_name = "xml.example.com"

    def parse(self, response):
        xxs = XmlXPathSelector(response)
        items = xxs.select('//channel/item')
        for i in items:
            urli = i.select('link/text()').extract()
            request = Request(url=urli[0], callback=self.parse1)
            yield request

    def parse1(self, response):
        hxs = HtmlXPathSelector(response)
        # ...
        yield(MyItem())
but I am not sure that is a very proper solution...
XML example from the scrapy docs, XMLFeedSpider:
from scrapy.spiders import XMLFeedSpider
from myproject.items import TestItem


class MySpider(XMLFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        self.logger.info('Hi, this is a <%s> node!: %s', self.itertag, ''.join(node.extract()))

        #item = TestItem()
        item = {}  # changed to a plain dict to avoid the "class not found" error
        item['id'] = node.xpath('@id').extract()
        item['name'] = node.xpath('name').extract()
        item['description'] = node.xpath('description').extract()
        return item
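Such a feed spider is then run like any other; a hypothetical invocation from the project root (the -o flag simply exports the scraped items):

scrapy crawl example.com -o items.json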
