Regular Expression for website - python

http://www.bbc.com/news/business-41097280
is the page I want the regular expression for.
So far, I am using the following pattern:
.+\/news\/business[-.]\d{8}$
It is part of this code segment, used with Scrapy:
from scrapy.item import Item, Field
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule

class TryItem(Item):
    url = Field()

class BbchrcrawlerSpider(CrawlSpider):
    name = "bbchrcrawler"
    allowed_domains = ["www.bbc.com"]
    start_urls = ['http://www.bbc.com/news/business-']

    rules = (Rule(LinkExtractor(allow=['.+\/news\/business+\-d{8}$']), callback='parse_item', follow=True),)

    def parse_item(self, response):
        Item = TryItem()
        Item['url'] = response.url
        yield Item
What's the correct way to write the expression so it extracts multiple pages with the same format?
The result should collect URLs with the following format:
bbc.com/news/business-########

You can try this:
pattern = "bbc\.com/news/business-\d+"
rules = (Rule(LinkExtractor(allow=[pattern]), callback='parse_item', follow=True),)
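As a quick sanity check outside Scrapy (a minimal sketch using Python's re module; LinkExtractor's allow patterns are searched against the full URL in much the same way):

import re

pattern = r"bbc\.com/news/business-\d+"

# the target URL matches, a URL from a different section does not
print(bool(re.search(pattern, "http://www.bbc.com/news/business-41097280")))   # True
print(bool(re.search(pattern, "http://www.bbc.com/news/technology-41097280"))) # False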

Related

Scrapy: response.xpath prints None, but upon clicking into weblink, xPath is correct

I am trying to print out the h1 title of the item that I am trying to scrape. I have tried printing the result of
print(response.xpath('/html/body/div[2]/div/div[5]/div[2]/div[2]/div/h1').get()) for a product like this: https://www.steinersports.com/football/tampa-bay-buccaneers/tom-brady-tampa-bay-buccaneers-super-bowl-lv-champions-autographed-white-nike-game-jersey-with-lv-mvp-inscription/o-8094+t-92602789+p-2679909745+z-8-2492872768?_ref=p-FALP:m-GRID:i-r20c0:po-60.
I am not sure how to go about debugging this, since when I click into the links that are returning None and check the XPath, it is correct. Any help is appreciated; full code below:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request

class SteinerSportsCrawlSpiderSpider(CrawlSpider):
    name = 'steinersports_crawl_spider'
    allowed_domains = ['steinersports.com']
    start_urls = [
        'https://www.steinersports.com/football/signed/o-1383+fa-56+z-95296299-3058648695?_ref=m-TOPNAV',
    ]
    base_url = 'https://www.steinersports.com/football/signed/o-1383+fa-56+z-95296299-3058648695?_ref=m-TOPNAV'

    rules = (
        Rule(LinkExtractor(allow=r'/signed'), follow=True),
        Rule(LinkExtractor(allow=r'football/', deny=r'/signed'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        description_flag = True
        price_flag = True
        item_description = response.xpath('/html/body/div[2]/div/div[5]/div[2]/div[17]/div/div[2]/div').get()
        print(item)
        #item_price = response.xpath('//span[@class="product__price"]/text()').get()
        print(response.xpath('html/body/div[2]/div/div[5]/div[2]/div[2]/div/h1').get())
        item['item_name'] = response.xpath('html/body/div[2]/div/div[5]/div[2]/div[2]/div/h1').get()
        return item
You can directly access the h1 tag using the data-talos attribute. This XPath should get the title:
response.xpath("//h1[@data-talos='labelPdpProductTitle']/text()").extract_first()

How do I ignore pdf links while scraping using Scrapy?

I'm new to Scrapy and I'm currently making a spider that extracts only the event title and event description from a website. I am able to get the title and description; however, the spider also tries to extract data from a PDF link, which causes a "raise NotSupported("Response content isn't text")" error. How can I prevent the spider from doing this?
Here is my code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class EventsspiderSpider(CrawlSpider):
    name = 'eventsspider'
    allowed_domains = ['cs.acadiau.ca']
    start_urls = ['https://cs.acadiau.ca/news-events/event-reader/using-dna-to-reverse-engineer-your-family-tree.html']

    rules = (
        Rule(LinkExtractor(allow=('news-events/event-reader/using-dna-to-reverse-engineer-your-family-tree.html', )), callback='parse_item', follow=True),)

    def parse_item(self, response):
        i = {}
        title_list = response.xpath('//*[@id="event-items-15421"]/div[2]/div/h1/text()').extract()
        data_list = response.xpath('//*[@id="event-items-15421"]/div[2]/div/div[1]/p[7]/span/text()').extract()
        for x in range(0, len(title_list)):
            i['title'] = title_list[x]
            i['data'] = data_list[x]
            yield i
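One way to keep the crawler away from PDF links (a sketch, not from the original thread; it assumes the offending links end in .pdf) is to add a deny pattern to the LinkExtractor. The allow pattern is also broadened here to the whole event-reader section so that more than one event page matches:

    rules = (
        Rule(
            LinkExtractor(
                allow=('news-events/event-reader/', ),
                deny=(r'\.pdf$', ),  # skip any link whose URL ends in .pdf
            ),
            callback='parse_item',
            follow=True,
        ),
    )

Links filtered out by the extractor are never requested, so the "Response content isn't text" error is not raised for them.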

scrapy spider code check

So I'm trying to scrape the website given in the SgmlLinkExtractor parameters below with Scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('''//td[not(@*)]/div[not(@*)]/a[not(@class)]/@href''')),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        for deals in deals:
            item = DesidimeItem()
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
It should be quite obvious what I'm trying to do, but for some reason when I tell the spider to crawl and export the text and links to the CSV file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my above XPaths as response.xpath("xpath").extract() after scrapy shell "<the corresponding crawl rule URL>", you'll get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the deal context-specific locators have to be relative. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
    for deal in response.xpath("//div[@class='user-comment-text'][1]"):
        item = DesidimeItem()
        item["deal"] = deal.xpath(".//p/text()").extract()
        item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
        yield item
(note that I've also simplified the code).
Here is the complete spider I'm executing (it does scrape the text and links, though I don't know what your desired output is):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DesidimeItem(scrapy.Item):
    deal = scrapy.Field()
    link = scrapy.Field()

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    rules = [
        Rule(LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
             callback="parse_items",
             follow=True),
    ]

    def parse_items(self, response):
        for deal in response.xpath("//div[@class='user-comment-text'][1]"):
            item = DesidimeItem()
            item["deal"] = deal.xpath(".//p/text()").extract()
            item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
            yield item
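To get the CSV export the question describes, this file can be run directly with Scrapy's built-in feed exporter (assuming it is saved as desidime_spider.py):

scrapy runspider desidime_spider.py -o deals.csv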

Scrapy, only follow internal URLS but extract all links found

I want to get all external links from a given website using Scrapy. Using the following code the spider crawls external links as well:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from myproject.items import someItem

class someSpider(CrawlSpider):
    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']

    rules = (Rule(LinkExtractor(), callback="parse_obj", follow=True),
    )

    def parse_obj(self, response):
        item = someItem()
        item['url'] = response.url
        return item
What am I missing? Doesn't "allowed_domains" prevent the external links from being crawled? If I set "allow_domains" for LinkExtractor it does not extract the external links. Just to clarify: I want to crawl internal links but extract external links. Any help appreciated!
You can also use the link extractor to pull all the links once you are parsing each page.
The link extractor will filter the links for you. In this example the link extractor will deny links in the allowed domain so it only gets outside links.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LxmlLinkExtractor
from myproject.items import someItem

class someSpider(CrawlSpider):
    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']

    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        # deny=self.allowed_domains keeps only links that point outside the allowed domain
        for link in LxmlLinkExtractor(allow=(), deny=self.allowed_domains).extract_links(response):
            item = someItem()
            item['url'] = link.url
            yield item
Updated code based on 12Ryan12's answer:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class someSpider(CrawlSpider):
    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']

    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        item = MyItem()
        item['url'] = []
        for link in LxmlLinkExtractor(allow=(), deny=self.allowed_domains).extract_links(response):
            item['url'].append(link.url)
        return item
A solution would be to make use of a process_links function in the Rule.
Documentation here: http://doc.scrapy.org/en/latest/topics/link-extractors.html
class testSpider(CrawlSpider):
    name = "test"
    bot_name = 'test'
    allowed_domains = ["news.google.com"]
    start_urls = ["https://news.google.com/"]

    rules = (
        Rule(SgmlLinkExtractor(allow_domains=()), callback='parse_items', process_links="filter_links", follow=True),
    )

    def filter_links(self, links):
        for link in links:
            if self.allowed_domains[0] not in link.url:
                print(link.url)
        return links

    def parse_items(self, response):
        ### ...
        pass
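For current Scrapy versions, where SgmlLinkExtractor is no longer available, the same approach can be sketched with LinkExtractor; external links still show up in process_links and can be recorded there, while allowed_domains keeps the crawl itself on-site:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class TestSpider(CrawlSpider):
    name = "test"
    allowed_domains = ["news.google.com"]
    start_urls = ["https://news.google.com/"]

    rules = (
        Rule(LinkExtractor(), callback='parse_items', process_links='filter_links', follow=True),
    )

    def filter_links(self, links):
        # log external links; offsite filtering based on allowed_domains stops them from being followed
        for link in links:
            if self.allowed_domains[0] not in link.url:
                self.logger.info("external link: %s", link.url)
        return links

    def parse_items(self, response):
        pass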

Constructing a regular expression for url in start_urls list in scrapy framework python

I am very new to Scrapy and I haven't used regular expressions before.
The following is my spider.py code:
class ExampleSpider(BaseSpider):
    name = "test_code"
    allowed_domains = ["www.example.com"]
    start_urls = [
        "http://www.example.com/bookstore/new/1?filter=bookstore",
        "http://www.example.com/bookstore/new/2?filter=bookstore",
        "http://www.example.com/bookstore/new/3?filter=bookstore",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
Now if we look at start_urls, all three URLs are the same except that they differ at the integer value 2?, 3? and so on (unlimited, depending on the URLs present on the site). I know that we can use CrawlSpider and construct a regular expression for the URL like below,
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re

class ExampleSpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        "http://www.example.com/bookstore/new/1?filter=bookstore",
        "http://www.example.com/bookstore/new/2?filter=bookstore",
        "http://www.example.com/bookstore/new/3?filter=bookstore",
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(........,))),
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
Can you please guide me on how I can construct a CrawlSpider Rule for the above start_urls list?
If I understand you correctly, you want a lot of start URLs with a certain pattern.
If so, you can override the BaseSpider.start_requests method:
class ExampleSpider(BaseSpider):
    name = "test_code"
    allowed_domains = ["www.example.com"]

    def start_requests(self):
        for i in xrange(1000):
            yield self.make_requests_from_url("http://www.example.com/bookstore/new/%d?filter=bookstore" % i)
    ...
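On Python 3 with recent Scrapy versions, where xrange and make_requests_from_url are no longer available, the same idea can be sketched with scrapy.Request directly (the page range is an assumption; adjust it to the site):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "test_code"
    allowed_domains = ["www.example.com"]

    def start_requests(self):
        # one request per numbered bookstore page
        for i in range(1, 1001):
            url = "http://www.example.com/bookstore/new/%d?filter=bookstore" % i
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        pass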
If you are using CrawlSpider, it's usually not a good idea to override the parse method.
The Rule object can filter the URLs you are interested in from the ones you do not care about.
See CrawlSpider in the docs for reference.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re

class ExampleSpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/bookstore']

    rules = (
        Rule(SgmlLinkExtractor(allow=('\/new\/[0-9]\?',)), callback='parse_bookstore'),
    )

    def parse_bookstore(self, response):
        hxs = HtmlXPathSelector(response)
