Scrapy: making a loop to add one item to other values - Python

I'm using Scrapy to scrape some pages, and in each row I want:
Title
Url
Author
The problem is that (sometimes) there are several titles and URLs, but the author appears only once per page. So I want to attach the respective author to the URLs and titles (which come out fine).
This is my (bad) code. I tried to make a loop, but I don't think it works very well, and it raises the error "Spider must return Request, BaseItem, dict or None, got 'list'". Can you tell me where my mistake is?
def parse(self, response):
    sels = response.xpath('//td[@class="default"]')
    items = []
    for sel in sels:
        item = ThisItem()
        item['URL'] = sel.xpath('//td[@class]/a/@href').extract()
        item['TITLE'] = sel.xpath('//td[@class]/a').extract()
        i = item['TITLE']
        for i in sels:
            item['AUTHOR'] = sel.xpath('//td[@class]/b[1]').extract()
        items.append(item)
    yield items
Thanks in advance.

You should yield every item separately. Try this
def parse(self, response):
    author = response.xpath('//td[@class]/b[1]').extract()
    for sel in response.xpath('//td[@class="default"]'):
        item = ThisItem()
        item['URL'] = sel.xpath('//td[@class]/a/@href').extract()
        item['TITLE'] = sel.xpath('//td[@class]/a').extract()
        item['AUTHOR'] = author
        yield item
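If you would rather store single strings than lists (extract() always returns a list), here is a hedged variant using extract_first() and relative XPaths; the field names are assumed to match ThisItem from the question:

def parse(self, response):
    # extract_first() gives one string (or None) instead of a one-element list
    author = response.xpath('//td[@class]/b[1]/text()').extract_first()
    for sel in response.xpath('//td[@class="default"]'):
        item = ThisItem()
        # The leading './/' keeps the lookup relative to the current cell
        item['URL'] = sel.xpath('.//a/@href').extract_first()
        item['TITLE'] = sel.xpath('.//a/text()').extract_first()
        item['AUTHOR'] = author
        yield item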

Related

Scrapy, crawl data by onclick

I want to extract the title and the PDF link of each paper from this page: https://iclr.cc/Conferences/2019/Schedule?type=Poster
My code is here:
class ICLRCrawler(Spider):
    name = "ICLRCrawler"
    allowed_domains = ["iclr.cc"]
    start_urls = ["https://iclr.cc/Conferences/2019/Schedule?type=Poster", ]

    def parse(self, response):
        papers = Selector(response).xpath('//*[@id="content"]/div/div[@class="paper"]')
        titles = Selector(response).xpath('//*[@id="maincard_704"]/div[3]')
        links = Selector(response).xpath('//*[@id="maincard_704"]/div[6]/a[2]')
        for title, link in zip(titles, links):
            item = PapercrawlerItem()
            item['title'] = title.xpath('text()').extract()[0]
            item['pdf'] = link.xpath('/@href').extract()[0]
            item['sup'] = ''
            yield item
However, it seems that it is not easy to get the title and link of each paper this way. How can I change the code to get the data?
You can use a much simpler approach:
def parse(self, response):
    for poster in response.xpath('//div[starts-with(@id, "maincard_")]'):
        item = PapercrawlerItem()
        item["title"] = poster.xpath('.//div[@class="maincardBody"]/text()[1]').get()
        item["pdf"] = poster.xpath('.//a[@title="PDF"]/@href').get()
        yield item
You have to replace extract()[0] with get_attribute('href').
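For completeness, a minimal sketch of the item class the snippet above relies on; the field names are taken from the question's code, and the class itself is an assumption about your project:

import scrapy

class PapercrawlerItem(scrapy.Item):
    # Fields mirror the keys used in the parse() methods above
    title = scrapy.Field()
    pdf = scrapy.Field()
    sup = scrapy.Field()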

Multiple pages per item - using scraped links

My spider looks like this:
class ScrapeMovies(scrapy.Spider):
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = loopitem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            yield item

        # This part scrapes all of the pages of a start url; commented out for convenience
        # next_page = response.xpath('//div[@class="page-nav-btm"]/ul/li[last()]/a/@href').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
What it does as of now is scrape the table (see the starting URL). I want it to then follow the link in the member's name column (e.g. https://www.trekearth.com/members/monareng/), extract some information from that page, and then return everything as a single item.
How should I approach this?
If anything is unclear please do not hesitate to ask for clarification.
EDIT:
Now my code looks as follows (however it still does not work):
class ScrapeMovies(scrapy.Spider):
    name = 'final'
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = FinalItem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            request = scrapy.Request(website,
                                     callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        item['other_url'] = response.url
        item['groups'] = response.xpath('//div[@class="groups-btm"]/ul/li/text()').extract_first()
        return item
Use the meta field to pass the item forward to the next callback:
def parse_page1(self, response):
    item = MyItem(main_url=response.url)
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
UPD: to process all rows, use yield in your loop:
for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
    item = FinalItem()
    website = row.xpath('./td[2]//a/@href/text()').extract_first()
    item['name'] = row.xpath('./td[2]//a/text()').extract_first()
    request = scrapy.Request(website,
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request
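As a side note, on Scrapy 1.7+ you can pass the item to the next callback through cb_kwargs instead of meta. A hedged sketch of the same loop; note that '@href' on its own (without a trailing /text()) is what actually extracts the link, and relative links may need response.urljoin():

def parse(self, response):
    for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
        item = FinalItem()
        item['name'] = row.xpath('./td[2]//a/text()').extract_first()
        # '@href' already yields the attribute value; no /text() step is needed
        website = response.urljoin(row.xpath('./td[2]//a/@href').extract_first())
        # cb_kwargs delivers the item to the callback as a named argument (Scrapy 1.7+)
        yield scrapy.Request(website, callback=self.parse_page2, cb_kwargs={'item': item})

def parse_page2(self, response, item):
    item['other_url'] = response.url
    return item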

Avoid redundant code in python/scrapy

I am VERY new to Python and Scrapy. I wrote a working script with Scrapy and need a little improvement to avoid redundancy.
In the parse_article_page function I came across two possibilities: either the article has variants (more pages to scrape) or it doesn't. Can you help me avoid duplicating the code between the else branch and the parse_data function?
I tried a second request, but this does not seem to work. The log says "DEBUG: Filtered duplicate request" or it says nothing.
def parse_article_page(self, response):
    # Check for variants
    variants = response.xpath('//div[@class="variants"]/select/option[not(@disabled)]/@variant_href').extract()
    if len(variants) > 1:
        for variant in variants:
            variant_url = response.urljoin(variant)
            # Request article variants:
            yield scrapy.Request(variant_url, callback=self.parse_data)
    else:
        # yield scrapy.Request(response.url, callback=self.parse_data)  # Does not work
        item = ShopItem()
        item['desc'] = response.css(description_selector).extract()
        item['price'] = response.css(price_selector).extract()
        item['itno'] = response.css(no_selector).extract()
        item['url'] = response.url
        yield item

def parse_data(self, response):
    item = ShopItem()
    item['desc'] = response.css(description_selector).extract()
    item['price'] = response.css(price_selector).extract()
    item['itno'] = response.css(no_selector).extract()
    item['url'] = response.url
    yield item
Calling self.parse_data(response) in the else branch won't work on its own, because parse_data is a generator; you still need to yield the items it produces for Scrapy to pick them up. You'll have to do something like this:
def parse_article_page(self, response):
    # Check for variants
    variants = response.xpath('//div[@class="variants"]/select/option[not(@disabled)]/@variant_href').extract()
    if len(variants) > 1:
        for variant in variants:
            variant_url = response.urljoin(variant)
            # Request article variants:
            yield scrapy.Request(variant_url, callback=self.parse_data)
    else:
        for item in self.parse_data(response):
            yield item

def parse_data(self, response):
    item = ShopItem()
    item['desc'] = response.css(description_selector).extract()
    item['price'] = response.css(price_selector).extract()
    item['itno'] = response.css(no_selector).extract()
    item['url'] = response.url
    yield item
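An alternative way to remove the duplication is to move the field extraction into a small helper that both callbacks reuse. A sketch assuming the same ShopItem and the description_selector/price_selector/no_selector constants from the question:

def _build_item(self, response):
    # Shared extraction logic used by both callbacks
    item = ShopItem()
    item['desc'] = response.css(description_selector).extract()
    item['price'] = response.css(price_selector).extract()
    item['itno'] = response.css(no_selector).extract()
    item['url'] = response.url
    return item

def parse_article_page(self, response):
    variants = response.xpath('//div[@class="variants"]/select/option[not(@disabled)]/@variant_href').extract()
    if len(variants) > 1:
        for variant in variants:
            # Request article variants:
            yield scrapy.Request(response.urljoin(variant), callback=self.parse_data)
    else:
        yield self._build_item(response)

def parse_data(self, response):
    yield self._build_item(response)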

Passing class between functions

I am building a simple(ish) parser in Scrapy and I am blissfully ignorant when it comes to Scrapy and Python :-) In the file item.py I have a definition of thisItem(), which I assign to item in the code below. It all worked rather swimmingly, with parse using a callback to get to parse_dir_contents... But then I realized I needed to scrape an extra bit of data and created another function, parse_other_content. How do I get what is already in item into parse_other_content?
import scrapy
from this-site.items import *
import re
import json

class DmozSpider(scrapy.Spider):
    name = "ABB"
    allowed_domains = ["this-site.com.au"]
    start_urls = [
        "https://www.this-site.com.au?page=1",
        "https://www.this-site.com.au?page=2",
    ]

    def parse(self, response):
        for href in response.xpath('//h3/a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//h1[@itemprop="name"]'):
            item = thisItem()
            item['title'] = sel.xpath('text()').extract()
            item['rate'] = response.xpath('//div[@class="rate"]/div/span/text()').extract()
            so = re.search(r'\d+', response.url)
            propID = so.group()
            item['propid'] = propID
            item['link'] = response.url
            yield scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID, callback=self.parse_other_content)
            #yield item

    def parse_other_content(self, reponse):
        sel = json.loads(reponse.body)
        item['rate_detail'] = sel["this"][0]["that"]
        yield item
I know I am missing something simple here, but I can't seem to figure it out.
Per the scrapy documentation (http://doc.scrapy.org/en/1.0/topics/request-response.html#topics-request-response-ref-request-callback-arguments):
In some cases you may be interested in passing arguments to those callback functions so you can receive the arguments later, in the second callback. You can use the Request.meta attribute for that.
In your case I would do something like this:
def parse_dir_contents(self, response):
    for sel in response.xpath('//h1[@itemprop="name"]'):
        item = thisItem()
        ...
        request = scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID, callback=self.parse_other_content)
        request.meta['item'] = item
        yield request

def parse_other_content(self, response):
    item = response.meta['item']
    # do something with the item
    return item
According to Steve (see comments) you can also pass a dictionary of meta data as a keyword argument to the Request constructor like so:
def parse_dir_contents(self, response):
    for sel in response.xpath('//h1[@itemprop="name"]'):
        item = thisItem()
        ...
        request = scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID, callback=self.parse_other_content, meta={'item': item})
        yield request
You can either make item visible to parse_other_content() by changing it to self.item, or send it as a parameter to the function. (The first one might be easier.)
For the first solution just add self. to any reference to the item variable. This makes it visible to the entire class.
def parse_dir_contents(self, response):
    for sel in response.xpath('//h1[@itemprop="name"]'):
        self.item = thisItem()
        self.item['title'] = sel.xpath('text()').extract()
        self.item['rate'] = response.xpath('//div[@class="rate"]/div/span/text()').extract()
        so = re.search(r'\d+', response.url)
        propID = so.group()
        self.item['propid'] = propID
        self.item['link'] = response.url
        yield scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID, callback=self.parse_other_content)
        #yield item

def parse_other_content(self, reponse):
    sel = json.loads(reponse.body)
    self.item['rate_detail'] = sel["this"][0]["that"]
    yield self.item
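For reference, a minimal sketch of the item class both answers assume; the field names are taken from the question's code, and the class itself is an assumption:

import scrapy

class thisItem(scrapy.Item):
    # Fields mirror the keys assigned in parse_dir_contents() and parse_other_content()
    title = scrapy.Field()
    rate = scrapy.Field()
    propid = scrapy.Field()
    link = scrapy.Field()
    rate_detail = scrapy.Field()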

Python Scrapy not always downloading data from website

Scrapy is used to parse an HTML page. My question is: why does Scrapy sometimes return the response I want, but sometimes not return a response? Is it my fault? Here's my parsing function:
class AmazonSpider(BaseSpider):
    name = "amazon"
    allowed_domains = ["amazon.org"]
    start_urls = [
        "http://www.amazon.com/s?rh=n%3A283155%2Cp_n_feature_browse-bin%3A2656020011"
    ]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[contains(@class, "result")]')
        items = []
        titles = {'titles': sites[0].xpath('//a[@class="title"]/text()').extract()}
        for title in titles['titles']:
            item = AmazonScrapyItem()
            item['title'] = title
            items.append(item)
        return items
I believe you are just not using the most appropriate XPath expression.
Amazon's HTML is kind of messy, not very uniform, and therefore not very easy to parse. But after some experimenting I could extract all 12 titles of a couple of search result pages with the following parse function:
def parse(self, response):
    sel = Selector(response)
    p = sel.xpath('//div[@class="data"]/h3/a')
    titles = p.xpath('span/text()').extract() + p.xpath('text()').extract()
    items = []
    for title in titles:
        item = AmazonScrapyItem()
        item['title'] = title
        items.append(item)
    return items
If you care about the actual order of the results, the above code might not be appropriate, but I believe that is not the case here.
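If you prefer to yield each item as soon as it is built instead of returning a list (Scrapy accepts either), here is a hedged variant of the same function:

def parse(self, response):
    sel = Selector(response)
    p = sel.xpath('//div[@class="data"]/h3/a')
    titles = p.xpath('span/text()').extract() + p.xpath('text()').extract()
    for title in titles:
        item = AmazonScrapyItem()
        item['title'] = title
        # Yielding avoids building an intermediate list
        yield item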
