Parsing does not work with Scrapy — Python

It does not collect data from the title. I followed the sample, but it still refuses to work. Here is my code:
toster.py:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from toster.items import DjangoItem
class DjangoSpider(CrawlSpider):
    # Crawl toster.ru "django"-tagged question pages and scrape question titles.
    #
    # NOTE(review): as posted this spider collects nothing — see the answer below:
    #  - allowed_domains includes "www.", so the offsite middleware filters the
    #    followed pagination links,
    #  - the allow pattern's \d matches a single digit only,
    #  - there are no elements with class "thing" on these pages.
    name = "django"
    allowed_domains = ["www.toster.ru"]
    start_urls = [
        'http://www.toster.ru/tag/django/questions',
    ]
    rules = [
        Rule(LinkExtractor(
            allow=['/tag/django/questions\?page=\d']),
            callback='parse_item',
            follow=True)
    ]

    def parse_item(self, response):
        # One DjangoItem per matched block; extract() returns a list of strings.
        selector_list = response.css('div.thing')
        for selector in selector_list:
            item = DjangoItem()
            item['title'] = selector.xpath('div/h2/a/text()').extract()
            yield item
any help?

Multiple issues in your code:
remove www. from the allowed_domains
fix your regular expression inside the Link Extractor - replace \d with \d+
set unique=False to let Scrapy handle the pagination pages
fix your extracting logic in parse_item() - for instance, there are no elements with thing class on these pages
Fixed version (works for me):
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from toster.items import DjangoItem
class DjangoSpider(CrawlSpider):
    """Crawl the toster.ru "django" tag pagination and scrape question titles."""

    name = "django"
    # Bare domain (no "www."): otherwise the offsite middleware drops the
    # followed pagination links.
    allowed_domains = ["toster.ru"]
    start_urls = [
        'http://www.toster.ru/tag/django/questions',
    ]
    rules = [
        Rule(LinkExtractor(
            # Raw string: \? and \d are regex escapes, not string escapes.
            # \d+ (not \d) so pages 10, 11, ... match as well.
            allow=[r'/tag/django/questions\?page=\d+'], unique=False),
            callback='parse_item',
            follow=True,)
    ]

    def parse_item(self, response):
        # One item per question block on the listing page.
        for selector in response.css('div.question__content'):
            item = DjangoItem()
            # default='' guards against a missing link node, so .strip()
            # cannot raise AttributeError on None.
            title = selector.css(
                'a.question__title-link[itemprop=url]::text'
            ).extract_first(default='')
            item['title'] = title.strip()
            yield item

Related

Scrapy CrawlSpider unable to crawl first page

My problem is the crawler crawling all pages except first page & I'm unable to understand why?
I'm sure that there is no problem with my items.py or anything else.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from tutorial2.items import Tutorial2Item
class QuotesSpider(CrawlSpider):
    # Follow the "Next" pagination link and scrape every quote on each page.
    # NOTE(review): CrawlSpider rules do not run the callback on the start
    # URL's own response, which is why the first page is never parsed
    # (see the answer below).
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        # '@class', not '#class' — '#' is not valid XPath attribute syntax.
        Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for items in response.xpath('//div[@class="quote"]'):
            l = ItemLoader(item=Tutorial2Item(), selector=items)
            l.add_xpath('text', 'span[@class="text"]/text()')
            l.add_xpath('author', 'span/small[@class="author"]/text()')
            l.add_xpath('author_link', 'span/a/@href')
            l.add_xpath('tags', 'div[@class="tags"]/a[@class="tag"]/text()')
            l.add_xpath('tags_link', 'div[@class="tags"]/a[@class="tag"]/@href')
            yield l.load_item()
Just implement parse_start_url so the start URL's response is handed to your item callback — see the details in the CrawlSpider documentation:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
# from tutorial2.items import Tutorial2Item
class QuotesSpider(CrawlSpider):
    """Quote spider with the fix applied: parse_start_url routes the first
    page through parse_item, since CrawlSpider rules only fire for links
    extracted from responses, never for the start URL itself."""

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        # '@class', not '#class' — '#' is not valid XPath attribute syntax.
        Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'),
             callback='parse_item', follow=True),
    )

    def parse_start_url(self, response):
        # Hand the very first response to the same item-extraction logic.
        return self.parse_item(response)

    def parse_item(self, response):
        # NOTE(review): the Tutorial2Item import is commented out above — it
        # must be importable for this to run.
        for items in response.xpath('//div[@class="quote"]'):
            l = ItemLoader(item=Tutorial2Item(), selector=items)
            l.add_xpath('text', 'span[@class="text"]/text()')
            l.add_xpath('author', 'span/small[@class="author"]/text()')
            l.add_xpath('author_link', 'span/a/@href')
            l.add_xpath('tags', 'div[@class="tags"]/a[@class="tag"]/text()')
            l.add_xpath('tags_link', 'div[@class="tags"]/a[@class="tag"]/@href')
            yield l.load_item()
This may be due to a JavaScript rendering issue; try using Splash, as discussed here.

Regular Expression for website

http://www.bbc.com/news/business-41097280
Is the website I want the regular expression for.
So far, I am using the following, where
'.+\/news\/business[-.]\d{8}$
Which is part of this code segment here, used with Scrapy
from scrapy.item import Item, Field
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
class TryItem(Item):
    # Single-field item: the URL of a matched article page.
    url = Field()
class BbchrcrawlerSpider(CrawlSpider):
    """Collect URLs of BBC business articles (…/news/business-########)."""

    name = "bbchrcrawler"
    allowed_domains = ["www.bbc.com"]
    start_urls = ['http://www.bbc.com/news/business-']
    # FIX: the original pattern '.+\/news\/business+\-d{8}$' matched a run of
    # literal 's' characters ('business+') followed by eight literal 'd's
    # ('d{8}') — \d{8} is "exactly eight digits".
    rules = (Rule(LinkExtractor(allow=[r'.+/news/business-\d{8}$']),
                  callback='parse_item', follow=True),)

    def parse_item(self, response):
        # Lower-case local name so the scrapy Item class is not shadowed.
        item = TryItem()
        item['url'] = response.url
        yield item
What's the correct way to get the URL there for extracting multiple pages with the same format?
The result should collect URLs with the following format:
bbc.com/news/business-########
You can try this:
# Match article URLs like bbc.com/news/business-41097280 (one or more digits).
pattern = "bbc\.com/news/business-\d+"
# Follow every matching link and hand each response to parse_item.
rules = (Rule(LinkExtractor(allow=[pattern]), callback='parse_item', follow=True),)

scrapy spider code check

So I'm trying to scrape the website given in the SgmlLinkExtractor parameters below with Scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string
class DesidimeSpider(CrawlSpider):
    # Scrape the deal text and first link from each forum post.
    # NOTE(review): as posted this produces repeated/misaligned rows — the
    # XPaths inside the loop are absolute, so every iteration re-selects the
    # same first node (see the answer below).
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    rules = (
        # '@' restored throughout: '#' is not valid XPath attribute syntax.
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('''//td[not(@*)]/div
        [not(@*)]/a[not(@class)]/@href''')), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        # BUG(review): 'for deals in deals' rebinds the list's name, and the
        # absolute selects below ignore the per-deal context node.
        for deals in deals:
            item = DesidimeItem()
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
It should be quite obvious what I'm trying to do, but for some reason when I tell the spider to crawl and export the text and links to the CSV file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my above XPaths as response.xpath("xpath").extract() after scrapy shell "//corresponding-crawl-rule-url", you'll get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the deal context-specific locators have to be relative. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
    # Relative XPaths (leading dot) keep extraction scoped to each deal node;
    # '@' restored — '#' is not valid XPath attribute syntax.
    for deal in response.xpath("//div[@class='user-comment-text'][1]"):
        item = DesidimeItem()
        item["deal"] = deal.xpath(".//p/text()").extract()
        item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
        yield item
(note that I've also simplified the code).
Here is the complete spider, I'm executing (it does scrape the text and links, though I don't know what is your desired output):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class DesidimeItem(scrapy.Item):
    # Two fields per post: the deal's text and the first link inside it.
    deal = scrapy.Field()
    link = scrapy.Field()
class DesidimeSpider(CrawlSpider):
    """Working spider: relative (dot-prefixed) XPaths scope the extraction
    to each deal node instead of always hitting the document's first match."""

    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    rules = [
        # '@' restored: '#' is not valid XPath attribute syntax.
        Rule(LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
             callback="parse_items",
             follow=True),
    ]

    def parse_items(self, response):
        for deal in response.xpath("//div[@class='user-comment-text'][1]"):
            item = DesidimeItem()
            item["deal"] = deal.xpath(".//p/text()").extract()
            item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
            yield item

Scrapy, only follow internal URLS but extract all links found

I want to get all external links from a given website using Scrapy. Using the following code the spider crawls external links as well:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from myproject.items import someItem
class someSpider(CrawlSpider):
    """Emit one item — its own URL — for every page the crawl reaches."""

    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']
    rules = (
        Rule(LinkExtractor(), callback="parse_obj", follow=True),
    )

    def parse_obj(self, response):
        # Record the URL of the page that was just fetched.
        obj = someItem()
        obj['url'] = response.url
        return obj
What am I missing? Doesn't "allowed_domains" prevent the external links from being crawled? If I set "allow_domains" for LinkExtractor it does not extract the external links. Just to clarify: I want to crawl internal links but extract external links. Any help appreciated!
You can also use the link extractor to pull all the links once you are parsing each page.
The link extractor will filter the links for you. In this example the link extractor will deny links in the allowed domain so it only gets outside links.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LxmlLinkExtractor
from myproject.items import someItem
class someSpider(CrawlSpider):
    """Crawl only internal pages, but emit every *external* link found."""

    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']
    # Follow everything the offsite middleware allows (internal links only).
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        # deny=self.allowed_domains inverts the filter: only off-site links survive.
        for link in LxmlLinkExtractor(allow=(), deny=self.allowed_domains).extract_links(response):
            item = someItem()
            item['url'] = link.url
            # BUG FIX: the original built the item but never yielded it, so
            # nothing ever reached the item pipeline / feed exporter.
            yield item
An updated code based on 12Ryan12's answer,
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field
class MyItem(Item):
    # Holds the list of external link URLs found on one page.
    url= Field()
class someSpider(CrawlSpider):
    """Collect all external links found on each crawled page into one item."""

    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        # One item per page; its 'url' field is the list of off-site links.
        extractor = LxmlLinkExtractor(allow=(), deny=self.allowed_domains)
        item = MyItem()
        item['url'] = [link.url for link in extractor.extract_links(response)]
        return item
A solution would be make usage a process_link function in the SgmlLinkExtractor
Documentation here http://doc.scrapy.org/en/latest/topics/link-extractors.html
class testSpider(CrawlSpider):
    # Crawl news.google.com; process_links lets us inspect every extracted
    # link before it is scheduled.
    name = "test"
    bot_name = 'test'
    allowed_domains = ["news.google.com"]
    start_urls = ["https://news.google.com/"]
    rules = (
        Rule(SgmlLinkExtractor(allow_domains=()), callback='parse_items', process_links="filter_links", follow=True),
    )

    def filter_links(self, links):
        # Print (Python 2 syntax) every link pointing outside the allowed
        # domain; the full list is returned unchanged, so nothing is dropped.
        for link in links:
            if self.allowed_domains[0] not in link.url:
                print link.url
        return links

    def parse_items(self, response):
        # Body elided in the original post.
        ### ...

Constructing a regular expression for url in start_urls list in scrapy framework python

I am very new to Scrapy, and I haven't used regular expressions before.
The following is my spider.py code
class ExampleSpider(BaseSpider):
    # Scrape the first three bookstore listing pages.
    # FIX: the original 'name = "test_code' was missing the closing quote,
    # which is a SyntaxError.
    name = "test_code"
    allowed_domains = ["www.example.com"]
    start_urls = [
        "http://www.example.com/bookstore/new/1?filter=bookstore",
        "http://www.example.com/bookstore/new/2?filter=bookstore",
        "http://www.example.com/bookstore/new/3?filter=bookstore",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
Now, if we look at start_urls, all three URLs are the same except that they differ at the integer value 2?, 3? and so on — unlimited, depending on the URLs present on the site. I know that we can use CrawlSpider and construct a regular expression for the URL like below:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re
class ExampleSpider(CrawlSpider):
    # Sketch of the CrawlSpider variant the asker wants to write: the
    # allow=(........) is a placeholder for the regex being asked about, and
    # the Rule line carries an extra closing paren exactly as posted.
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        "http://www.example.com/bookstore/new/1?filter=bookstore",
        "http://www.example.com/bookstore/new/2?filter=bookstore",
        "http://www.example.com/bookstore/new/3?filter=bookstore",
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=(........),))),
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
Can you please guide me on how to construct a CrawlSpider Rule for the above start_urls list?
If I understand you correctly, you want a lot of start URLs with a certain pattern.
If so, you can override BaseSpider.start_requests method:
class ExampleSpider(BaseSpider):
    """Generate the bookstore listing requests programmatically rather than
    listing every page in start_urls."""

    name = "test_code"
    allowed_domains = ["www.example.com"]

    def start_requests(self):
        # One request per page number, 0..999 (Python 2: xrange).
        template = "http://www.example.com/bookstore/new/%d?filter=bookstore"
        for page in xrange(1000):
            yield self.make_requests_from_url(template % page)
    ...
If you are using CrawlSpider, it's not usually a good idea to override the parse method.
Rule object can filter the urls you are interesed to the ones you do not care for.
See CrawlSpider in the docs for reference.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re
class ExampleSpider(CrawlSpider):
    """Start at the bookstore index and let a Rule follow the numbered
    listing pages (/new/<digit>?...)."""

    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/bookstore']
    rules = (
        # NOTE(review): [0-9] matches a single digit — use [0-9]+ if the site
        # has pages beyond 9.
        Rule(SgmlLinkExtractor(allow=('\/new\/[0-9]\?',)), callback='parse_bookstore'),
    )

    def parse_bookstore(self, response):
        # FIX: the method was named 'parse_boostore', so the callback named in
        # the Rule ('parse_bookstore') could never be resolved at crawl time.
        hxs = HtmlXPathSelector(response)

Categories