I have a question about saving list items one by one in Scrapy.
My code is like this:
class MySpider(Spider):
    name = "test"
    start_urls = [""]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath("//a[contains(@href, '.html')]")  # many .html
        for i, site in enumerate(sites):
            item = YoutoItem()
            item['link'] = site.xpath("./@href").extract()[0]
            item['title'] = site.xpath("./text()").extract()[0]
            yield Request(url=item['link'], meta={'item': item}, callback=self.parse_ler)
            break  # just test the first one .html

    def parse_ler(self, response):
        item = response.meta['item']
        sel = Selector(response)
        url = sel.xpath("//embed/@src").extract()
        for t in url:
            print t  # it will print url1, url2, url3
            item['url'] = t
            yield item
And my pipeline.py:
class YoutoPipeline(object):
    def process_item(self, item, spider):
        item.save()
        return item
And the terminal will print out:
{'link': u'http://test.html',
'title': u'A',
'url': u'url1'}
{'link': u'http://test.html',
'title': u'A',
'url': u'url2'}
{'link': u'http://test.html',
'title': u'A',
'url': u'url3'}
But when it saves to the database, only one of them gets saved:
{'link': u'http://test.html',
'title': u'A',
'url': u'url1'}
I think it's because item['url'] is set in a for loop.
Please teach me how to edit this so that these 3 records are saved separately into the database.
My database is PostgreSQL.
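For reference, since the pipeline above only calls item.save() (so the actual storage is handled by whatever ORM backs YoutoItem), here is a rough sketch of what a pipeline writing each item straight to PostgreSQL with psycopg2 could look like. The class name, table name and connection parameters are placeholders, not something from the question:

import psycopg2  # assumption: writing to PostgreSQL directly instead of relying on item.save()

class YoutoPostgresPipeline(object):
    def open_spider(self, spider):
        # placeholder connection settings -- adjust to your own database
        self.conn = psycopg2.connect(dbname='youto', user='user',
                                     password='secret', host='localhost')
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        # one INSERT per yielded item, so three yielded items give three rows
        self.cur.execute(
            "INSERT INTO youto_items (link, title, url) VALUES (%s, %s, %s)",
            (item['link'], item['title'], item['url']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()

Whatever the storage layer, each database row still needs its own item object, which is what the EDIT below addresses.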
EDIT:
I found a method: just put item = YoutoItem() inside the for loop, and it works:
for t in url:
    item = YoutoItem()
    item['url'] = t
    yield item
If you need to store the items separately in the database, just construct another YoutoItem for each one. This line:

yield Request(url=item['link'], meta={'item': item}, callback=self.parse_ler)

can be rewritten to:
# in def parse
# use scrapy.Request (import scrapy); you don't need request.meta any more
yield scrapy.Request(item['link'], callback=self.parse_ler)

# in def parse_ler, which no longer needs request.meta
sel = Selector(response)
url = sel.xpath("//embed/@src").extract()
for t in url:
    item = YoutoItem()  # just construct a new YoutoItem
    item['url'] = t
    yield item
Try disabling the duplicate-request filter, as in this example:

Request(url=u, callback=self.parse_ler, meta={"item": item}, dont_filter=True)
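(By default Scrapy's duplicate filter drops any request whose fingerprint has already been seen, so repeated requests to the same URL are silently skipped; dont_filter=True bypasses that check for this request.)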
My spider looks like this:
class ScrapeMovies(scrapy.Spider):
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = loopitem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            yield item

        # This part is responsible for scraping all of the pages on a start url, commented out for convenience
        # next_page = response.xpath('//div[@class="page-nav-btm"]/ul/li[last()]/a/@href').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
As of now it scrapes the table (see the starting URL). I want it to then follow the link (the member's name column), extract some information from that page (e.g. https://www.trekearth.com/members/monareng/), and then return this as an item.
How should I approach this?
If anything is unclear, please do not hesitate to ask for clarification.
EDIT:
Now my code looks as follows (however it still does not work):
class ScrapeMovies(scrapy.Spider):
    name = 'final'
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = FinalItem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            request = scrapy.Request(website,
                                     callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        item['other_url'] = response.url
        item['groups'] = response.xpath('//div[@class="groups-btm"]/ul/li/text()').extract_first()
        return item
Use the meta field to pass the item forward to the next callback:
def parse_page1(self, response):
    item = MyItem(main_url=response.url)
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
UPD: to process all rows, use yield in your loop:
for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
    item = FinalItem()
    website = row.xpath('./td[2]//a/@href/text()').extract_first()
    item['name'] = row.xpath('./td[2]//a/text()').extract_first()
    request = scrapy.Request(website,
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request
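Two more things worth checking in the original parse (the second is an assumption about the page markup, not something verified here): ./td[2]//a/@href/text() asks for the text of an attribute node and returns nothing, so website ends up as None; and if the extracted href is a relative path, scrapy.Request needs it made absolute first. A possible adjustment:

website = row.xpath('./td[2]//a/@href').extract_first()
if website:
    # response.urljoin turns a relative href into an absolute URL
    request = scrapy.Request(response.urljoin(website),
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request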
I have a list of links, each of which also contains some interesting URLs:

start_urls = ['link1.com', 'link2.com', 'link3.com', ..., 'linkN.com']

Using Scrapy, how can I get:
'link1.com' 'extracted1.link.com'
'link2.com' 'extracted2.link.com'
'link3.com' 'extracted3.link.com'
...
'linkN.com' 'extractedN.link.com'
Since I am new to Scrapy, I tried this for just one link:
class ToySpider(scrapy.Spider):
    name = "toy"
    allowed_domains = ["https://www.example.com/"]
    start_urls = ['link1.com']

    def parse(self, response):
        for link in response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]"):
            item = ToyItem()
            item['link'] = link.xpath('@href').extract_first()
            item['interesting_link'] = link
            yield item
However, this returned:
{'link': 'extracted1.link.com',
'name': <Selector xpath=".//*[#id='object']//tbody//tr//td//span//a[2]" data='<a href="extracted1.link.com'>}
How can I do the above for all the elements of start_urls and return the following list:
[
{'link': 'extracted1.link.com',
'name': 'link1.com'},
{'link': 'extracted2.link.com',
'name': 'link2.com'},
{'link': 'extracted3.link.com',
'name': 'link3.com'},
....
{'link': 'extractedN.link.com',
'name': 'linkN.com'}
]
UPDATE
After trying @Granitosaurus's answer, which returns NaN for links that do not match response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]"), I did:
def parse(self, response):
    links = response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]")
    if not links:
        item = ToyItem()
        item['link'] = 'NaN'
        item['name'] = response.url
        return item
    for links in links:
        item = ToyItem()
        item['link'] = links.xpath('@href').extract_first()
        item['name'] = response.url  # <-- see here
        yield item

    list_of_dics = []
    list_of_dics.append(item)
    df = pd.DataFrame(list_of_dics)
    print(df)
    df.to_csv('/Users/user/Desktop/crawled_table.csv', index=False)
However, instead of returning (*):
'link1.com' 'NaN'
'link2.com' 'NAN'
'link3.com' 'extracted3.link.com'
I got:
'link3.com' 'extracted3.link.com'
How can I return (*)?
You can retrieve the current URL your spider is crawling from the response.url attribute:
start_urls = ['link1.com', 'link2.com', 'link3.com', ..., 'linkN.com']

def parse(self, response):
    links = response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]")
    if not links:
        item = ToyItem()
        item['link'] = None
        item['name'] = response.url
        return item
    for links in links:
        item = ToyItem()
        item['link'] = links.xpath('@href').extract_first()
        item['name'] = response.url  # <-- see here
        yield item
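As for the CSV in the UPDATE: a pandas DataFrame built inside parse only ever sees the items from that one response, which is why only one row appears. Scrapy can collect every yielded item across all start_urls and write the CSV itself, either by running the spider with the -o option (scrapy crawl toy -o crawled_table.csv) or, as a sketch, via the feed-export settings on the spider:

class ToySpider(scrapy.Spider):
    name = "toy"
    # assumption: let Scrapy's built-in CSV feed export write the file instead of pandas
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/user/Desktop/crawled_table.csv',
    }
    # ... start_urls and parse stay as above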
I'm using Scrapy to scrape some pages, and I want each row to contain:
Title
Url
Author
The problem is that (sometimes) there are several titles and URLs, but the author appears just once on each page. So I want to attach the respective author to the URLs and titles (which come out fine).
This is my (bad) code. I tried to make a loop, but I don't think it works very well; plus, it raises the error "Spider must return Request, BaseItem, dict or None, got 'list'". Can you tell me where my mistake is?
def parse(self, response):
    sels = response.xpath('//td[@class="default"]')
    items = []
    for sel in sels:
        item = ThisItem()
        item['URL'] = sel.xpath('//td[@class]/a/@href').extract()
        item['TITLE'] = sel.xpath('//td[@class]/a').extract()
        i = item['TITLE']
        for i in sels:
            item['AUTHOR'] = sel.xpath('//td[@class]/b[1]').extract()
        items.append(item)
    yield items
Thanks in advance.
You should yield every item separately. Try this:
def parse(self, response):
    author = response.xpath('//td[@class]/b[1]').extract()
    for sel in response.xpath('//td[@class="default"]'):
        item = ThisItem()
        item['URL'] = sel.xpath('//td[@class]/a/@href').extract()
        item['TITLE'] = sel.xpath('//td[@class]/a').extract()
        item['AUTHOR'] = author
        yield item
I use the following code in my spider:
def parse_item(self, response):
    item = MyItem()
    item['price'] = [i for i in self.get_usd_price(response)]
    return item

def get_usd_price(self, response):
    yield FormRequest(
        'url',
        formdata={'key': 'value'},
        callback=self.get_currency
    )

def get_currency(self, response):
    self.log('lalalalala')
The problem is that I can't reach my get_currency callback. In my logger I see that the price field takes the value [<POST url>]. What am I doing wrong? I have tried adding dont_filter to the FormRequest and changing the FormRequest to a simple GET Request.
Update
I've also tried GHajba's suggestion (so far without success):
def parse_item(self, response):
    item = MyItem()
    self.get_usd_price(response, item)
    return item

def get_usd_price(self, response, item):
    request = FormRequest(
        'url',
        formdata={'key': 'value'},
        callback=self.get_currency
    )
    request.meta['item'] = item
    yield request

def get_currency(self, response):
    self.log('lalalalala')
    item = response.meta['item']
    item['price'] = 123
    return item
This is not how Scrapy works. You can only yield a request or an item from each method, and you can't get the response back this way. If you want to update the item with the price information and then yield it, you should do something like:
def parse_item(self, response):
    item = MyItem()
    # populate the item with this response data
    yield FormRequest(
        'url',
        formdata={'key': 'value'},
        callback=self.get_currency, meta={'item': item}
    )

def get_currency(self, response):
    self.log('lalalalala')
    item = response.meta['item']
    item['price'] = 123  # get your price from the response body
    # keep populating the item with this response data
    yield item
So remember that for passing information between requests, you need to use the meta parameter.
Your problem is that you assign the values of the generator created in get_usd_price to your item. You can solve this by changing the method and how you call it.
You have to yield the FormRequest, but you mustn't use its return value; Scrapy takes care of the rest. Just call the function get_usd_price without assigning its result to item['price']:
self.get_usd_price(response, item)
You have to provide the item to your function because Scrapy works asynchronously, so you cannot be sure when the FormRequest is executed. You then pass the item along as a meta parameter of the FormRequest, so you can access it in the get_currency function and yield the item there.
You can read more about meta in the docs: http://doc.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta
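One likely reason the Update code above still didn't work is that parse_item calls self.get_usd_price(response, item) but never hands the resulting generator back to Scrapy, so the FormRequest is never scheduled. A minimal sketch of the missing wiring, reusing the names from the question:

def parse_item(self, response):
    item = MyItem()
    # return the generator so Scrapy consumes it and schedules the FormRequest
    return self.get_usd_price(response, item)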
Suppose I have a BookItem and I need to add information to it in both the parse phase and the detail phase:
def parse(self, response):
    data = json.loads(response)
    for book in data['result']:
        item = BookItem()
        item['id'] = book['id']
        url = book['url']
        yield Request(url, callback=self.detail)

def detail(self, response):
    hxs = HtmlXPathSelector(response)
    item['price'] = ......
    # I want to continue with the same book item as from the for loop above
Using the code as-is would lead to an undefined item in the detail phase. How can I pass the item to detail? detail(self, response, item) doesn't seem to work.
There is an argument named meta for Request:
yield Request(url, callback=self.detail, meta={'item': item})
Then, in the detail function, access it this way:
item = response.meta['item']
See more details in the jobs topic of the docs.
iMom0's approach still works, but as of Scrapy 1.7, the recommended approach is to pass user-defined information through cb_kwargs and leave meta for middlewares, extensions, etc.:
def parse(self, response):
    ....
    yield Request(url, callback=self.detail, cb_kwargs={'item': item})

def detail(self, response, item):
    item['price'] = ......
You could also pass the individual key-value pairs into cb_kwargs and only instantiate the BookItem in the final callback (detail in this case):
def parse(self, response):
    data = json.loads(response)
    for book in data['result']:
        yield Request(book['url'],
                      callback=self.detail,
                      cb_kwargs=dict(id_=book['id'],
                                     url=book['url']))

def detail(self, response, id_, url):
    hxs = HtmlXPathSelector(response)
    item = BookItem()
    item['id'] = id_
    item['url'] = url
    item['price'] = ......
You can define a variable in the __init__ method:
class MySpider(BaseSpider):
    ...

    def __init__(self):
        self.item = None

    def parse(self, response):
        data = json.loads(response)
        for book in data['result']:
            self.item = BookItem()
            self.item['id'] = book['id']
            url = book['url']
            yield Request(url, callback=self.detail)

    def detail(self, response):
        hxs = HtmlXPathSelector(response)
        self.item['price'] = ....