How to scrape from a list of links? - python

I have a list of links whose pages also contain some interesting URLs:
start_urls = ['link1.com', 'link2.com', 'link3.com', ...,'linkN.com']
Using Scrapy, how can I get the following?
'link1.com' 'extracted1.link.com'
'link2.com' 'extracted2.link.com'
'link3.com' 'extracted3.link.com'
...
'linkN.com' 'extractedN.link.com'
Since I am new to Scrapy, I tried this for just one link:
class ToySpider(scrapy.Spider):
    name = "toy"
    allowed_domains = ["https://www.example.com/"]
    start_urls = ['link1.com']

    def parse(self, response):
        for link in response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]"):
            item = ToyItem()
            item['link'] = link.xpath('@href').extract_first()
            item['interesting_link'] = link
            yield item
However, this returned:
{'link': 'extracted1.link.com',
 'name': <Selector xpath=".//*[@id='object']//tbody//tr//td//span//a[2]" data='<a href="extracted1.link.com'>}
How can I do the above for all the elements of start_urls and return the following list:
[
{'link': 'extracted1.link.com',
'name': 'link1.com'},
{'link': 'extracted2.link.com',
'name': 'link2.com'},
{'link': 'extracted3.link.com',
'name': 'link3.com'},
....
{'link': 'extractedN.link.com',
'name': 'linkN.com'}
]
UPDATE
After trying @Granitosaurus's answer, which returns NaN for pages that do not match response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]"), I did:
def parse(self, response):
    links = response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]")
    if not links:
        item = ToyItem()
        item['link'] = 'NaN'
        item['name'] = response.url
        return item
    for links in links:
        item = ToyItem()
        item['link'] = links.xpath('@href').extract_first()
        item['name'] = response.url  # <-- see here
        yield item

    list_of_dics = []
    list_of_dics.append(item)
    df = pd.DataFrame(list_of_dics)
    print(df)
    df.to_csv('/Users/user/Desktop/crawled_table.csv', index=False)
However, instead of returning (*):
'link1.com' 'NaN'
'link2.com' 'NAN'
'link3.com' 'extracted3.link.com'
I got:
'link3.com' 'extracted3.link.com'
How can I return (*)?

You can retrieve the current URL your spider is crawling from the response.url attribute:
start_urls = ['link1.com', 'link2.com', 'link3.com', ..., 'linkN.com']

def parse(self, response):
    links = response.xpath(".//*[@id='object']//tbody//tr//td//span//a[2]")
    if not links:
        item = ToyItem()
        item['link'] = None
        item['name'] = response.url
        return item
    for link in links:
        item = ToyItem()
        item['link'] = link.xpath('@href').extract_first()
        item['name'] = response.url  # <-- see here
        yield item
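As for the CSV in the UPDATE: parse() runs once per response, so a DataFrame built inside it never sees the items yielded for the other start URLs. A simpler route (a sketch, assuming a standard Scrapy project; the output file name is just an example) is to drop the pandas code from the spider and let Scrapy's feed exporter collect every yielded item:

scrapy crawl toy -o crawled_table.csv

If you still need a DataFrame, load the resulting CSV with pandas after the crawl finishes.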

Related

Unable to crawl next page in scrapy

My spider only shows results from the first page, but I want results from all the pages; it should crawl the 2nd page, then the 3rd page, and so on.
import scrapy

class QuoteSpider(scrapy.Spider):
    name = 'quotes'
    base_url = 'https://www.yell.com'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        all_data = response.css('div.row.businessCapsule--mainRow')
        for data in all_data:
            title = data.css('.text-h2::text').extract()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url + business_url
            yield response.follow(final_url, self.parse)
            avg_rating = response.css('span.starRating--average::text').get()
            items = {
                'Title': title,
                'Title Url': final_url,
                'Average Rating': avg_rating
            }
            yield items
            pass

        next_page = response.urljoin(response.css('a.pagination--next::attr(href)').extract_first())
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
This should do it.
class YellSpider(scrapy.Spider):
    name = 'yell'
    base_url = 'https://www.yell.com{}'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        for data in response.css('div.row.businessCapsule--mainRow'):
            title = data.css('.text-h2::text').get()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url.format(business_url)
            avg_rating = response.css('span.starRating--average::text').get()
            yield {
                'Title': title,
                'Title Url': final_url,
                'Average Rating': avg_rating
            }

        next_page = response.css('a.pagination--next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
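Two side notes on this rewrite (suggestions on my part, not part of the original answer): response.urljoin can build the absolute business URL without hard-coding the domain template, and, assuming the rating span sits inside each businessCapsule--mainRow block, reading it from data instead of response keeps the rating scoped to the current business:

final_url = response.urljoin(business_url)                      # resolves relative hrefs against the page URL
avg_rating = data.css('span.starRating--average::text').get()   # scoped to this capsule, not the whole page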

Scrapy: How to pass an item between methods using meta

I'm new to Scrapy and Python, and I'm trying to pass the item field item['author'] from parse_quotes to the next parse method, parse_bio.
I tried the request.meta and response.meta approach as shown in the Scrapy documentation, but without success. See the code below.
import scrapy
from tutorial.items import QuotesItem

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/login',
        #'http://quotes.toscrape.com/page/2',
    ]
    # Scraping a site with login
    # Important: Cookie settings must be "True" to keep the login session alive
    custom_settings = {'COOKIES_ENABLED': True}

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.parse_quotes
        )

    def parse_quotes(self, response):
        for sel in response.css('div.quote'):
            item = QuotesItem()
            item['text'] = sel.css('span.text::text').get()
            item['author'] = sel.css('small.author::text').get()
            item['tags'] = sel.css('div.tags a.tag::text').getall()
            item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
            item['author_bio_link'] = sel.css('.author + a')
            yield item

            # follow the detail links # shortcut
            # vertical crawling
            for a in item['author_bio_link']:
                yield response.follow(a, callback=self.parse_bio)

    def parse_bio(self, response):
        item = QuotesItem()
        item['author_born'] = response.css('p span::text').getall()
        item['author_born'] = item['author_born'][:2]
        item['author_bio'] = response.css('div.author-description ::text').get().strip()
        yield item

        # follow pagination links # shortcut
        # horizontal crawling
        for a in response.css('li.next a'):
            yield response.follow(a, callback=self.parse_quotes)
I expect to get item['author'] from parse_quotes passed to parse_bio
I suggest you use meta in this way:
def parse_quotes(self, response):
    for sel in response.css('div.quote'):
        item = QuotesItem()
        item['text'] = sel.css('span.text::text').get()
        item['author'] = sel.css('small.author::text').get()
        item['tags'] = sel.css('div.tags a.tag::text').getall()
        item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
        item['author_bio_link'] = sel.css('.author + a')
        yield item

        # follow the detail links # shortcut
        # vertical crawling
        for a in item['author_bio_link']:
            yield response.follow(a, self.parse_bio,
                                  meta={'author': item['author']})  # <- you set it here

def parse_bio(self, response):
    item = QuotesItem()
    item['author_born'] = response.css('p span::text').getall()
    item['author_born'] = item['author_born'][:2]
    item['author_data'] = response.meta.get('author')  # <- you get it here
    item['author_bio'] = response.css('div.author-description ::text').get().strip()
    yield item

    # follow pagination links # shortcut
    # horizontal crawling
    for a in response.css('li.next a'):
        yield response.follow(a, callback=self.parse_quotes)
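On Scrapy 1.7 and newer, cb_kwargs is an alternative to meta for this kind of hand-off; a minimal sketch using the same names as the code above:

def parse_quotes(self, response):
    for sel in response.css('div.quote'):
        item = QuotesItem()
        item['author'] = sel.css('small.author::text').get()
        item['author_bio_link'] = sel.css('.author + a')
        yield item
        for a in item['author_bio_link']:
            # cb_kwargs delivers the value as a keyword argument of the callback
            yield response.follow(a, self.parse_bio, cb_kwargs={'author': item['author']})

def parse_bio(self, response, author):
    item = QuotesItem()
    item['author_data'] = author  # same value that meta carried before
    yield item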

Multiple pages per item - using scraped links

My spider looks like this:
class ScrapeMovies(scrapy.Spider):
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = loopitem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            yield item

        # This part is responsible for scraping all of the pages on a start url; commented out for convenience
        # next_page = response.xpath('//div[@class="page-nav-btm"]/ul/li[last()]/a/@href').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
What it does as of now: it scrapes the table (see the starting URL). I then want it to follow the link from the member's name column (e.g. https://www.trekearth.com/members/monareng/), extract some information from that page, and return it as part of the item.
How should I approach this?
If anything is unclear please do not hesitate to ask for clarification.
EDIT:
Now my code looks as follows (however, it still does not work):
class ScrapeMovies(scrapy.Spider):
    name = 'final'
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = FinalItem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            request = scrapy.Request(website,
                                     callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        item['other_url'] = response.url
        item['groups'] = response.xpath('//div[@class="groups-btm"]/ul/li/text()').extract_first()
        return item
Use the meta field to pass the item forward to the next callback:
def parse_page1(self, response):
    item = MyItem(main_url=response.url)
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
UPD: to process all rows, use yield in your loop:
for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
    item = FinalItem()
    website = row.xpath('./td[2]//a/@href/text()').extract_first()
    item['name'] = row.xpath('./td[2]//a/text()').extract_first()
    request = scrapy.Request(website,
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request
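One caveat worth checking (an observation about the row XPath, not a change the original answer makes): ./td[2]//a/@href/text() returns nothing, because attribute nodes have no text() child, and if the href in the member table is relative the Request also needs an absolute URL. A sketch of both adjustments:

website = row.xpath('./td[2]//a/@href').extract_first()     # take the attribute itself, not its text()
request = scrapy.Request(response.urljoin(website),          # make the URL absolute before requesting it
                         callback=self.parse_page2)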

Exporting scraped data to a CSV file

I'm trying to get data from a website that requires me to follow 2 URLs before scraping the data.
The goal is to get an exported file that looks like this:
My code is as follows:
import scrapy
from scrapy.item import Item, Field
from scrapy import Request

class myItems(Item):
    info1 = Field()
    info2 = Field()
    info3 = Field()
    info4 = Field()

class mySpider(scrapy.Spider):
    name = 'techbot'
    start_urls = ['']

    def parse(self, response):
        # Extracts first link
        items = []
        list1 = response.css("").extract()  # extract all info from here
        for i in list1:
            link1 = 'https:...' + str(i)
            request = Request(link1, self.parseInfo1, dont_filter=True)
            request.meta['item'] = items
            yield request
        yield items

    def parseInfo1(self, response):
        # Extracts second link
        item = myItems()
        items = response.meta['item']
        list1 = response.css("").extract()
        for i in list1:
            link1 = '' + str(i)
            request = Request(link1, self.parseInfo2, dont_filter=True)
            request.meta['item'] = items
        items.append(item)
        return request

    def parseInfo2(self, response):
        # Extracts all data
        item = myItems()
        items = response.meta['item']
        item['info1'] = response.css("").extract()
        item['info2'] = response.css("").extract()
        item['info3'] = response.css("").extract()
        item['info4'] = response.css("").extract()
        items.append(item)
        return items
I've executed the spider in the terminal with the command:
scrapy crawl techbot
The data I get is out of order, and with gaps like this:
For example, it scrapes the first set of data multiple times, and the rest is out of order.
If anyone could point me in the direction to get the results in a cleaner format as shown in the beginning that would be greatly appreciated.
Thanks
Solved it by consolidating the following of both links into one function instead of two. My spider now works as follows:
class mySpider(scrapy.Spider):
    name = 'techbot'
    start_urls = ['']

    def parse(self, response):
        # Extracts links
        items = []

        list1 = response.css("").extract()
        for i in list1:
            link1 = 'https:...' + str(i)
            request = Request(link1, self.parse, dont_filter=True)
            request.meta['item'] = items
            yield request

        list2 = response.css("").extract()
        for i in list2:
            link2 = '' + str(i)
            request = Request(link2, self.parseInfo2, dont_filter=True)
            request.meta['item'] = items
            yield request

        yield items

    def parseInfo2(self, response):
        # Extracts all data
        item = myItems()
        items = response.meta['item']
        item['info1'] = response.css("").extract()
        item['info2'] = response.css("").extract()
        item['info3'] = response.css("").extract()
        item['info4'] = response.css("").extract()
        items.append(item)
        return items
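To get the exported CSV the question asks for, the items yielded by parseInfo2 can go straight through Scrapy's feed exports instead of being appended to a shared list; a sketch assuming Scrapy 2.1 or newer (the file name is just an example):

class mySpider(scrapy.Spider):
    name = 'techbot'
    custom_settings = {
        'FEEDS': {
            'output.csv': {'format': 'csv'},  # one CSV row per yielded item
        },
    }

With this in place, parseInfo2 can simply yield each myItems() object and Scrapy writes the file for you.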

Scrapy only saves one item

I have a question about saving list items one by one in Scrapy.
My code is like this:
class MySpider(Spider):
    name = "test"
    start_urls = [""]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath("//a[contains(@href, '.html')]")  # many .html
        for i, site in enumerate(sites):
            item = YoutoItem()
            item['link'] = site.xpath("./@href").extract()[0]
            item['title'] = site.xpath("./text()").extract()[0]
            yield Request(url=link, meta={'item': item}, callback=self.parse_ler)
            break  # just test the first one.html

    def parse_ler(self, response):
        item = response.meta['item']
        sel = Selector(response)
        url = sel.xpath("//embed/@src").extract()
        for t in url:
            print t  # it will print url1, url2, url3
            item['url'] = t
            yield item
And my pipelines.py:
class YoutoPipeline(object):
    def process_item(self, item, spider):
        item.save()
        return item
And the terminal prints out:
{'link': u'http://test.html',
'title': u'A',
'url': u'url1'}
{'link': u'http://test.html',
'title': u'A',
'url': u'url2'}
{'link': u'http://test.html',
'title': u'A',
'url': u'url3'}
But when it saves to the database, only one of them is saved:
{'link': u'http://test.html',
'title': u'A',
'url': u'url1'}
I think it's because item['url'] is set inside the for loop.
Please teach me how to edit this so that these 3 records are saved separately into the database.
My database is PostgreSQL.
EDIT:
I found a method: just put item = YoutoItem() inside the for loop, and it works:
for t in url:
    item = YoutoItem()
    item['url'] = t
    yield item
If you need to store the items separately in the database, just construct a new YoutoItem() for each one.
yield Request(url=link, meta={'item': item}, callback=self.parse_ler)
can be rewritten to:
# def parse
# or Request if you import scrapy.Request, and you don't need request.meta
yield scrapy.Request(link, callback=self.parse_ler)

# def parse_ler, which doesn't need request.meta anymore
sel = Selector(response)
url = sel.xpath("//embed/@src").extract()
for t in url:
    item = YoutoItem()  # just construct a new YoutoItem
    item['url'] = t
    yield item
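For context on why the first version stored a single row (my reading of it, since the answer above does not spell it out): every pass of the loop mutated and yielded the same YoutoItem instance, so the pipeline called .save() three times on one object, and with a Django-backed item each later save just updates the row created by the first. A fresh instance per URL gives the pipeline three independent objects to persist:

# one shared instance: three yields, but only one row survives in the database
item = YoutoItem()
for t in url:
    item['url'] = t
    yield item

# one instance per URL: three separate rows
for t in url:
    item = YoutoItem()
    item['url'] = t
    yield item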
Try disabling the duplicate request filter, as in this example:
Request(url=u, callback=self.parse_ler, meta={"item": item}, dont_filter=True)
