I want to extract the title and the pdf link of each paper in this link: https://iclr.cc/Conferences/2019/Schedule?type=Poster
My code is here
class ICLRCrawler(Spider):
name = "ICLRCrawler"
allowed_domains = ["iclr.cc"]
start_urls = ["https://iclr.cc/Conferences/2019/Schedule?type=Poster", ]
def parse(self, response):
papers = Selector(response).xpath('//*[#id="content"]/div/div[#class="paper"]')
titles = Selector(response).xpath('//*[#id="maincard_704"]/div[3]')
links = Selector(response).xpath('//*[#id="maincard_704"]/div[6]/a[2]')
for title, link in zip(titles, links):
item = PapercrawlerItem()
item['title'] = title.xpath('text()').extract()[0]
item['pdf'] = link.xpath('/#href').extract()[0]
item['sup'] = ''
yield item
However, it seems that it is not easy to get the title and link of each paper. Here, how can I change the code to get the data?
You can use much simpler approach:
def parse(self, response):
for poster in response.xpath('//div[starts-with(#id, "maincard_")]'):
item = PapercrawlerItem()
item["title"] = poster.xpath('.//div[#class="maincardBody"]/text()[1]').get()
item["pdf"] = poster.xpath('.//a[#title="PDF"]/#href').get()
yield item
you have to replace Extract()[0] with get_attribute('href')
Related
i am trying to crawl into zomato to get info of the restaurants in istanbul. so, i am trying to get all the hrefs in search result pages. however, i am only getting the first search result of every page.
import scrapy
from ..items import ZomatodataItem
class ZomatoSpider(scrapy.Spider):
name = 'zomato'
allowed_domains = ["zomato.com"]
start_urls = [
'https://www.zomato.com/istanbul/restaurants?page=1'
]
def parse(self, response):
all_css = response.css('.search_left_featured')
all_product = all_css.css('a::attr(href)').get()
yield scrapy.Request(all_product, callback=self.parse_dir_contents)
max_page_number = 6
for i in range(1, max_page_number):
url_next = 'https://www.zomato.com/istanbul/restaurants?page=' + str(i)+''
yield scrapy.Request(url_next, callback=self.parse)
def parse_dir_contents(self, response):
items = ZomatodataItem()
items['name'] = response.css('.diBDma::text').extract()
items['genre'] = response.css('.gQXqL::text').extract_first()
items['tags'] = response.css('.cunMUz::text').extract()
items['address'] = response.css('.clKRrC::text').extract()
items['phone_number'] = response.css('.kKemRh::text').extract()
yield items
Makes sense that you only get 1 result - 'all_product' will only contain 1 item. If you want to get the full list, you'll have to update it to this:
all_products = all_css.css('a::attr(href)').getall()
Now you can loop through the links and get the detailed information like this:
for product in all_products:
yield scrapy.Request(product, callback=self.parse_dir_contents)
import scrapy
class rlgSpider(scrapy.Spider):
name = 'bot'
start_urls = [
'https://rocket-league.com/trading?filterItem=0&filterCertification=0&filterPaint=0&filterPlatform=1&filterSearchType=1&filterItemType=0&p=1']
def parse(self, response):
data = {}
offers = response.xpath('//div[#class = "col-3-3"]')
for offer in offers:
for item in offer.xpath('//div[#class = "rlg-trade-display-container is--user"]/div[#class = "rlg-trade-display-items"]/div[#class = "col-1-2 rlg-trade-display-items-container"]/a'):
data['name'] = item.xpath('//div/div[#position ="relative"]/h2').extarct()
yield data
Here is what I did so far - it doesn't work well. It scrapes the url and not the h2 tag how do I do that when it's inside so many divs?
In order to parse though an element in scrapy you need to start your xpath with "." else you will be parsing through the response, this is the correct way of doing it.
def parse(self, response):
offers = response.xpath('//div[#class = "col-3-3"]')
for offer in offers:
for item in offer.xpath('.//div[#class = "rlg-trade-display-container is--user"]/div[#class = "rlg-trade-display-items"]/div[#class = "col-1-2 rlg-trade-display-items-container"]/a'):
data = {}
data['name'] = item.xpath('.//h2/text()').extarct_first()
yield data
I am scraping some news website with scrapy framework, it seems only store the last item scraped and repeated in loop
I want to store the Title,Date,and Link, which i scrape from the first page
and also store the whole news article. So i want to merge the article which stored in a list into a single string.
Item code
import scrapy
class ScrapedItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
source = scrapy.Field()
date = scrapy.Field()
paragraph = scrapy.Field()
Spider code
import scrapy
from ..items import ScrapedItem
class CBNCSpider(scrapy.Spider):
name = 'kontan'
start_urls = [
'https://investasi.kontan.co.id/rubrik/28/Emiten'
]
def parse(self, response):
box_text = response.xpath("//ul/li/div[#class='ket']")
items = ScrapedItem()
for crawl in box_text:
title = crawl.css("h1 a::text").extract()
source ="https://investasi.kontan.co.id"+(crawl.css("h1 a::attr(href)").extract()[0])
date = crawl.css("span.font-gray::text").extract()[0].replace("|","")
items['title'] = title
items['source'] =source
items['date'] = date
yield scrapy.Request(url = source,
callback=self.parseparagraph,
meta={'item':items})
def parseparagraph(self, response):
items_old = response.meta['item'] #only last item stored
paragraph = response.xpath("//p/text()").extract()
items_old['paragraph'] = paragraph #merge into single string
yield items_old
I expect the output that the Date,Title,and Source can be updated through the loop.
And the article can be merged into single string to be stored in mysql
I defined an empty dictionary and put those variables within it. Moreover, I've brought about some minor changes in your xpaths and css selectors to make them less error prone. The script is working as desired now:
import scrapy
class CBNCSpider(scrapy.Spider):
name = 'kontan'
start_urls = [
'https://investasi.kontan.co.id/rubrik/28/Emiten'
]
def parse(self, response):
for crawl in response.xpath("//*[#id='list-news']//*[#class='ket']"):
d = {}
d['title'] = crawl.css("h1 > a::text").get()
d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
d['date'] = crawl.css("span.font-gray::text").get().strip("|")
yield scrapy.Request(
url=d['source'],
callback=self.parseparagraph,
meta={'item':d}
)
def parseparagraph(self, response):
items_old = response.meta['item']
items_old['paragraph'] = response.xpath("//p/text()").getall()
yield items_old
I can't figure out how to make scrapy crawl links in order
I've got a page with articles and in each one there is a title but the article doesn't match the title
Also in settings.py I added:
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
I've got something like this:
class Getgot(Spider):
name = "getem"
allowed_domains = ["somesite.us"]
start_urls = ["file:local.html"]
el = '//div[#article]'
def parse(self,response):
hxs = HtmlXPathSelector(response)
s = hxs.select('//article')
filename = ("links.txt")
filly = open(filename, "w")
for i in s:
t = i.select('a/#href').extract()
filly.write(str(t[0])+'\n')
yield Request(str(t[0]),callback=self.parse_page)
def parse_page(self,res):
hxs = HtmlXPathSelector(res)
s = hxs.select('//iframe').extract()
if s:
filename = ("frames.txt")
filly = open(filename, "a")
filly.write(str(s[0])+'\n')
else:
filename = ("/frames.txt")
filly = open(filename, "a")
filly.write('[]\n')
I'm not sure I understand how your question and your code are related. Where is the title ?
A few tips: 1) update your scrapy syntax with the latest version 2) don't write any files from the spider, write it in a pipeline or export feed. 3) if you need to transfer data from one function to the next, use the meta attribute.
def parse(self, response):
for link in response.xpath("//article/a/#href").extract():
yield Request(link, callback=self.parse_page, meta={'link':link})
def parse_page(self, response):
for frame in response.xpath("//iframe").extract():
item = MyItem()
item['link'] = response.meta['link']
item['frame'] = frame
yield item
And then you export it to csv or json or whatever, to store the link and the frame together.
Scrapy is used to parse an html page. My question is why sometimes scrapy returns the response I want, but sometimes does not return a response. Is it my fault? Here's my parsing function:
class AmazonSpider(BaseSpider):
name = "amazon"
allowed_domains = ["amazon.org"]
start_urls = [
"http://www.amazon.com/s?rh=n%3A283155%2Cp_n_feature_browse-bin%3A2656020011"
]
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//div[contains(#class, "result")]')
items = []
titles = {'titles': sites[0].xpath('//a[#class="title"]/text()').extract()}
for title in titles['titles']:
item = AmazonScrapyItem()
item['title'] = title
items.append(item)
return items
I believe you are just not using the most adequate XPath expression.
Amazon's HTML is kinda messy, not very uniform and therefore not very easy to parse. But after some experimenting I could extract all the 12 titles of a couple of search results with the following parse function:
def parse(self, response):
sel = Selector(response)
p = sel.xpath('//div[#class="data"]/h3/a')
titles = p.xpath('span/text()').extract() + p.xpath('text()').extract()
items = []
for title in titles:
item = AmazonScrapyItem()
item['title'] = title
items.append(item)
return items
If you care about the actual order of the results the above code might not be appropriate but I believe that is not the case.