How to use Scrapy for URL crawling - python

I want to crawl the link https://www.aparat.com/.
I can crawl it correctly and get all the video links with the header tag, like this:
import scrapy
class BlogSpider(scrapy.Spider):
    """Print the ``onmousedown`` attribute of each link in the home-page trend grid."""

    name = 'aparatspider'
    start_urls = ['https://www.aparat.com/']

    def parse(self, response):
        """Collect and print the ``onmousedown`` values found in *response*.

        Nothing is yielded; the collected values are only printed.
        """
        # One string argument, so the banner prints the same on Python 2 and 3.
        print('=' * 80 + ' latest-trend :')
        grid = response.css(
            '.block-grid.xsmall-block-grid-2.small-block-grid-3'
            '.medium-block-grid-4.large-block-grid-5.is-not-center'
        )
        latesttrend = []
        for li5 in grid.css('ul').css('li'):
            # XPath selects attributes with "@"; "#onmousedown" is not valid XPath.
            handler = li5.xpath('div/div[1]/a').xpath('@onmousedown').extract_first()
            if handler is None:
                # No matching anchor/attribute in this <li>; skip it instead of
                # failing on None.encode().
                continue
            latesttrend.append(handler.encode('utf8'))
        print(latesttrend)
now my question is this:
How can I get all the links from the داغ ترین ها tag, more than 1000? Currently, I get only 60, more or less.

I did this with the following code :
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
class aparat_hotnewsItem(scrapy.Item):
    """Scrapy item with a single ``videourl`` field."""

    # Holds whatever the spider extracts as the video URL(s) of a page.
    videourl = scrapy.Field()
class aparat_hotnewsSpider(CrawlSpider):
    """Follow the links selected by ``xp`` and emit one item per crawled page."""

    name = 'aparat_hotnews'
    allowed_domains = ['www.aparat.com']
    start_urls = ['http://www.aparat.com/']

    # Placeholder: XPath of the region whose links should be followed.
    xp = 'your xpath'

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths=xp),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield an item holding every match of the placeholder XPath in *response*."""
        matches = response.xpath('your xpath').extract()
        result = aparat_hotnewsItem()
        result['videourl'] = matches
        yield result

Related

Get price from XHR and Combine Scrapy

I have to scrape data(name, price, description, brand,...) on this website: https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww%7Cnew+in%7Cnew+products%7Cclothing
My code is as such:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class TestcrawlSpider(CrawlSpider):
    """Crawl the ASOS "new in" listing and yield one dict per product page."""

    name = 'testcrawl'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']

    # XPath attribute tests are written "@attr"; "#attr" is a syntax error.
    rules = (
        # Links inside product tiles: parse each product page.
        Rule(LinkExtractor(restrict_xpaths="//article[@class='_2qG85dG']/a"), callback='parse_item', follow=True),
        # Links matched by the second XPath are followed without a callback.
        Rule(LinkExtractor(restrict_xpaths="//a[@class='_39_qNys']")),
    )

    def remove_characters(self, value):
        """Return *value* with leading and trailing newlines removed."""
        return value.strip('\n')

    def parse_item(self, response):
        """Yield the product fields extracted from the page HTML.

        NOTE(review): ``price`` is read from the downloaded HTML only; the
        accompanying question reports that it comes back empty because the
        page fills it in with JavaScript.
        """
        yield {
            'name': response.xpath("//div[@class='product-hero']/h1/text()").get(),
            'price': response.xpath("//span[@data-id='current-price']").get(),
            'description': response.xpath("//div[@class='product-description']/ul/li/text()").getall(),
            'about_me': response.xpath("//div[@class='about-me']//text()").getall(),
            'brand_description': response.xpath("//div[@class='brand-description']/p/text()").getall(),
        }
However, due to JavaScript I cannot get the price. I need to get it through XHR.
My code for getting the price of only one item in the list is as follows:
import scrapy
import json
class AsosSpider(scrapy.Spider):
    """Fetch the stock/price API response for one product and yield its price text."""

    name = 'asos'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=200369183&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28']

    def parse(self, response):
        """Print and yield ``productPrice.current.text`` of the first JSON entry."""
        # The body is a JSON list; only its first element is read.
        resp = json.loads(response.text)[0]
        price = resp.get('productPrice').get('current').get('text')
        print(price)
        # The dict literal was left unterminated in the original snippet.
        yield {
            'price': price,
        }
Here, my start_urls is the Request URL. And it keeps changing for each item.
Item1: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=23443988&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28
Item2: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=22495685&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28
Only the productIds value changes between items.
How can I combine the second snippet with the first one so that the price is scraped as well?
Thanks!
pix
items.py:
import scrapy
class AsosItem(scrapy.Item):
    """Item describing one product: its texts plus the price."""

    # Descriptive fields.
    name = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    about_me = scrapy.Field()
    brand_description = scrapy.Field()
As I said in your last post, I have a problem with this website on my computer for some reason, but you need to do something like this:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import AsosItem
class TestcrawlSpider(CrawlSpider):
    """Crawl the ASOS "new in" listing and emit one ``AsosItem`` per product.

    The product page supplies the descriptive fields. The price is fetched by
    a second request to the stock/price API URL embedded in the page source.
    """

    name = 'testcrawl'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']

    # XPath attribute tests are written "@attr"; "#attr" is a syntax error.
    rules = (
        Rule(LinkExtractor(restrict_xpaths="//article[@class='_2qG85dG']/a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths="//a[@class='_39_qNys']")),
    )

    def remove_characters(self, value):
        """Return *value* with leading and trailing newlines removed."""
        return value.strip('\n')

    def parse_item(self, response):
        """Fill the descriptive fields and return a request for the price.

        Returns ``None`` (no item) when the page does not expose a stock/price
        API URL. Otherwise returns a ``scrapy.Request`` that carries the
        half-filled item in ``meta['item']`` to ``parse_price``.
        """
        # Local import: the snippet's import list does not provide ``re``.
        import re

        # Dots are escaped and the capture stops at the closing quote, so the
        # match cannot run on to a later quote on the same line.
        match = re.search(
            r"window\.asos\.pdp\.config\.stockPriceApiUrl\s*=\s*'([^']+)'",
            response.text,
        )
        if match is None:
            self.logger.warning('No stock/price API URL found on %s', response.url)
            return None

        item = AsosItem()
        item['name'] = response.xpath("//div[@class='product-hero']/h1/text()").get()
        item['description'] = response.xpath("//div[@class='product-description']/ul/li/text()").getall()
        item['about_me'] = response.xpath("//div[@class='about-me']//text()").getall()
        item['brand_description'] = response.xpath("//div[@class='brand-description']/p/text()").getall()

        # urljoin resolves the captured path against the page's own URL.
        request = scrapy.Request(url=response.urljoin(match.group(1)), callback=self.parse_price)
        request.meta['item'] = item
        return request

    def parse_price(self, response):
        """Add the price text from the API response and return the finished item."""
        item = response.meta['item']
        # The API answers with a JSON list; the first entry is the one read.
        item['price'] = response.json()[0]['productPrice']['current']['text']
        return item
Test the code; if it doesn't work, take the general idea and tweak it a bit. I can't test it myself.

How do I ignore pdf links while scraping using Scrapy?

I'm new to Scrapy and I'm currently making a spider that extracts only the event title and event description from a website. I am able to get the title and description, however, the spider is also trying to extract data from a pdf link which causes a "raise NotSupported("Response content isn't text")" error. How can I prevent the spider from doing this?
Here is my code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class EventsspiderSpider(CrawlSpider):
    """Extract title/description pairs from the event page on cs.acadiau.ca."""

    name = 'eventsspider'
    allowed_domains = ['cs.acadiau.ca']
    start_urls = ['https://cs.acadiau.ca/news-events/event-reader/using-dna-to-reverse-engineer-your-family-tree.html']
    rules = (
        Rule(
            LinkExtractor(allow=('news-events/event-reader/using-dna-to-reverse-engineer-your-family-tree.html', )),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one ``{'title', 'data'}`` dict per title found in *response*.

        Responses that are not text (a PDF, for example) are skipped, because
        selectors on them raise ``NotSupported("Response content isn't text")``.
        """
        if not isinstance(response, scrapy.http.TextResponse):
            return

        # XPath attribute tests are written "@id"; "#id" is a syntax error.
        title_list = response.xpath('//*[@id="event-items-15421"]/div[2]/div/h1/text()').extract()
        data_list = response.xpath('//*[@id="event-items-15421"]/div[2]/div/div[1]/p[7]/span/text()').extract()

        # zip stops at the shorter list, and each pair gets its own dict so
        # items already yielded are never mutated afterwards.
        for title, data in zip(title_list, data_list):
            yield {'title': title, 'data': data}

Scrapy, only follow internal URLS but extract all links found

I want to get all external links from a given website using Scrapy. Using the following code the spider crawls external links as well:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from myproject.items import someItem
class someSpider(CrawlSpider):
    """Follow every extracted link and emit the URL of each crawled page."""

    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']
    rules = (
        Rule(LinkExtractor(), callback="parse_obj", follow=True),
    )

    def parse_obj(self, response):
        """Return an item whose ``url`` is the URL of *response*."""
        return someItem(url=response.url)
What am I missing? Doesn't "allowed_domains" prevent external links from being crawled? If I set "allow_domains" for LinkExtractor it does not extract the external links. Just to clarify: I want to crawl internal links but extract external links. Any help appreciated!
You can also use the link extractor to pull all the links once you are parsing each page.
The link extractor will filter the links for you. In this example the link extractor will deny links in the allowed domain so it only gets outside links.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LxmlLinkExtractor
from myproject.items import someItem
class someSpider(CrawlSpider):
    """Crawl the site and emit one item per external link found on each page."""

    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        """Yield an item for every link in *response* that leaves the allowed domains."""
        # deny_domains matches on the link's host. deny= would treat each
        # domain as a regex over the whole URL and also drop external links
        # that merely mention the domain in their path or query.
        external_links = LxmlLinkExtractor(allow=(), deny_domains=self.allowed_domains)
        for link in external_links.extract_links(response):
            item = someItem()
            item['url'] = link.url
            # The original built the item but never handed it back to Scrapy.
            yield item
Updated code based on 12Ryan12's answer:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field
class MyItem(Item):
    """Item with a single ``url`` field."""

    url = Field()
class someSpider(CrawlSpider):
    """Crawl the site and emit, per page, the list of external links it contains."""

    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        """Return one item whose ``url`` lists every external link in *response*."""
        # deny_domains matches on the link's host. deny= would treat each
        # domain as a regex over the whole URL and also drop external links
        # that merely mention the domain in their path or query.
        external_links = LxmlLinkExtractor(allow=(), deny_domains=self.allowed_domains)
        item = MyItem()
        item['url'] = [link.url for link in external_links.extract_links(response)]
        return item
A solution would be to use a process_links function in the Rule that wraps the SgmlLinkExtractor.
Documentation here http://doc.scrapy.org/en/latest/topics/link-extractors.html
class testSpider(CrawlSpider):
    """Crawl news.google.com and print the links that point elsewhere."""

    name = "test"
    bot_name = 'test'
    allowed_domains = ["news.google.com"]
    start_urls = ["https://news.google.com/"]
    rules = (
        Rule(
            SgmlLinkExtractor(allow_domains=()),
            callback='parse_items',
            process_links="filter_links",
            follow=True,
        ),
    )

    def filter_links(self, links):
        """Print each link whose URL lacks the first allowed domain; return all links unchanged."""
        for link in links:
            if self.allowed_domains[0] not in link.url:
                # Function-call form works on both Python 2 and Python 3.
                print(link.url)
        return links

    def parse_items(self, response):
        """Placeholder callback; page-specific extraction goes here."""
        ### ...

How to recursively crawl whole website using scrapy

I want to crawl a complete website using Scrapy, but right now it's only crawling a single page:
import scrapy
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import JsonItemExporter
class IzodspiderSpider(scrapy.Spider):
    """Extract the meta description and product details from the izod.com start page."""

    name = 'izodspider'
    allowed_domains = ['izod.com']
    start_urls = ['http://izod.com/']
    # NOTE(review): this class derives from scrapy.Spider and defines no
    # ``parse_item`` method, so the rule below does not drive any crawling
    # here. Left as posted because it is the subject of the question.
    rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]

    def parse(self, response):
        """Yield the meta description, product name and product text of *response*."""
        hxs = scrapy.Selector(response)
        # XPath attributes are written "@name" / "@content"; "#..." is invalid.
        yield {
            'meta': hxs.xpath("//meta[@name='description']/@content").extract(),
            'name': hxs.xpath("//div[@id='product-details']/h5").extract(),
            'desc': hxs.xpath("//div[@id='product-details']/p").extract(),
        }
is there any way to extract meta tags using portia ?
There is an error in the rule definition and in the callback.
The callback named in the rule must match the name of your parsing method, and that method should not be called parse.
You can find more information about the callback function on the documentation here http://doc.scrapy.org/en/latest/topics/request-response.html?highlight=callback#topics-request-response-ref-request-callback-arguments
class IzodspiderSpider(CrawlSpider):
    """Crawl izod.com and extract the meta description and product details of each page."""

    name = "izod"
    depth_limit = 0
    bot_name = 'izod'
    allowed_domains = ['izod.com']
    start_urls = ['http://www.izod.com']
    rules = (
        # An empty ``allow`` accepts every link, as the original ``('')`` did.
        Rule(SgmlLinkExtractor(allow=()), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        """Yield the meta description, product name and product text of *response*."""
        hxs = scrapy.Selector(response)
        # XPath attributes are written "@name" / "@content"; "#..." is invalid.
        yield {
            'meta': hxs.xpath("//meta[@name='description']/@content").extract(),
            'name': hxs.xpath("//div[@id='product-details']/h5").extract(),
            'desc': hxs.xpath("//div[@id='product-details']/p").extract(),
        }

Get RSS links given a domain

I have a file containing a list of domains. I need to crawl each domain (i.e. the whole website) to get its RSS links: recursively crawl every page of the website, collect the RSS links from each page, and write them to a JSON file corresponding to the domain. This is my code for just one website:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
class AppleSpider(CrawlSpider):
    """Collect the RSS ``<link>`` hrefs found in the head of apple.com pages."""

    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    # NOTE(review): the rule is commented out, so nothing in this class routes
    # responses to parse_item. Left as posted; it is part of the question.
    #rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        """Return an item with the page URL and the RSS hrefs matched in *response*."""
        sel = HtmlXPathSelector(response)
        # XPath attributes are written "@type" / "@href"; "#..." is invalid.
        # NOTE(review): the @type value is unquoted, so XPath reads it as an
        # expression rather than the string "application/rss+xml". Left as
        # posted; it is the defect the answer below addresses.
        sites = sel.select('/html/head/link[@type=application/rss+xml]/@href').extract()
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = sites
        return item
tried running
scrapy crawl apple -o items.json -t json
But items.json only contains a bracket [
This is my items.py file:
from scrapy.item import Item, Field
class AppleItem(Item):
    """Item pairing a page URL with the RSS links found on that page."""

    # URL of the page the links were taken from.
    reference_link = Field()
    # RSS hrefs extracted from that page.
    rss_link = Field()
Your XPath expression needs to have quotes around the "application/rss+xml" test value.
Try something like:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
class AppleItem(Item):
    """Item pairing a page URL with the RSS links found on that page."""

    # URL of the page the links were taken from.
    reference_link = Field()
    # RSS hrefs extracted from that page.
    rss_link = Field()
class AppleSpider(CrawlSpider):
    """Crawl apple.com and collect the RSS ``<link>`` hrefs from each page's head."""

    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        """Return an item with the page URL and its RSS link hrefs."""
        sel = HtmlXPathSelector(response)
        # XPath attributes are written "@type" / "@href"; "#..." is invalid.
        rsslinks = sel.select('/html/head/link[@type="application/rss+xml"]/@href').extract()
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = rsslinks
        return item

Categories