How to scrape on two different domain using scrapy? - python

Hi I would like to scrape 2 different domain in my script I have tried my if statement but I it seems that it is not working, any idea please?
Here's my code
class SalesitemSpiderSpider(scrapy.Spider):
name = 'salesitem_spider'
allowed_domains = ['www2.hm.com']
start_urls = [
'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999',
'https://www.forever21.com/us/shop/catalog/category/f21/sale',
]
def parse_start_url(response):
if (response.url == 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'):
parse_1(response)
if (response.url == 'https://www.forever21.com/us/shop/catalog/category/f21/sale'):
parse_2(response)
def parse_1(self, response):
for product_item in response.css('li.product-item'):
item = {
'title': product_item.css('h3.item-heading a.link::text').extract_first(),
'regular-price': product_item.css('strong.item-price span.price.regular::text').extract_first(),
'sale-price': product_item.css('strong.item-price span.price.sale::text').extract_first(),
'photo-url': product_item.css('.image-container img::attr(data-src)').extract_first(),
'description-url': "https://www2.hm.com/" + product_item.css('h3.item-heading a::attr(href)').extract_first(),
}
yield item
def parse_2(self, response):
#Some code getting item on domain 2
Please Help thank you

Check your allowed_domains variable. You should add new domain, like ['www2.hm.com', 'forever21.com'] or remove it at all. Also you have no parse function.
I can suppose to remove your start_urls with if and use start_requests instead. Your code will be more readable.
import scrapy
class SalesitemSpiderSpider(scrapy.Spider):
name = 'salesitem_spider'
allowed_domains = ['www2.hm.com', 'forever21.com']
def start_requests(self):
urls = (
(self.parse_1, 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'),
(self.parse_2, 'https://www.forever21.com/us/shop/catalog/category/f21/sale'),
)
for cb, url in urls:
yield scrapy.Request(url, callback=cb)
def parse_1(self, response):
print 111111111
def parse_2(self, response):
print 2222222222

Related

Scraping Tripadvisor attractions using scrapy and python

I am trying to scrape TripAdvisor's attractions, but I cannot get the names and addresses of each attraction. I suspect I wrote product.css(...) wrong (there are jsons?).
Can anyone tell me how to correct the code to get the name and address of each attraction?
My current code:
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Attractions-g187427-Activities-oa90-Spain'
]
def parse(self, response):
for link in response.css('.EsZYd a::attr(href)'):
yield response.follow(link.get(), callback=self.parse_categories)
def parse_categories(self, response):
products = response.css('div.eeqnt')
for product in products:
yield {
'name' : product.css('h1.WlYyy cPsXC GeSzT::text').get().strip(),
'address' : product.css('span.WlYyy cacGK Wb::text').get().strip(),
}
Updated code (exporting infro from each atrraction on each page from list):
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Attractions-g274862-Activities-a_allAttractions.true-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa30-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa60-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa90-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa120-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa150-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa180-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa210-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa240-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa270-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa300-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa330-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa360-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa390-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa420-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa450-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa480-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa510-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa540-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa570-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa600-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa630-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa660-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa690-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa720-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa750-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa780-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa810-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa840-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa870-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa900-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa930-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa960-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa990-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1020-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1050-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1080-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1110-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1140-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1170-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1200-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1230-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1260-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1290-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1320-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1350-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1380-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1410-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1440-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1470-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1500-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1530-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1560-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1590-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1620-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1650-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1680-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1710-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1740-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1770-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1800-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1830-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1860-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1890-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1920-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1950-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1980-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2010-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2040-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2070-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2100-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2130-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2160-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2190-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2220-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2250-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2280-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2310-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2340-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2370-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2400-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2430-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2460-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2490-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2520-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2550-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2580-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2610-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2640-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2670-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2700-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2730-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2760-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2790-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2820-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2850-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2880-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2910-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2940-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2970-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3000-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3030-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3060-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3090-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3120-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3150-Slovenia.html'
]
def parse(self, response):
for link in response.css('.EsZYd a::attr(href)').getall():
yield response.follow(link, callback=self.parse_categories)
def parse_categories(self, response):
yield {
'name': response.css('h1.WlYyy.cPsXC.GeSzT::text').get(),
'reviews': response.xpath('(//*[#class="cfIVb"])[1]//text()').getall(),
'address': response.xpath('(//*[#class="dGWve"])//text()').getall(),
'url': response.url,
}
It's not really related to python, but css-selectors.
CSS classes should separate with dot and not space WlYyy.cPsXC.GeSzT.
Best suggestion would be to use chrome with dev-toolbar. It will give you an ability to get path to the specific element via css-selector or xpath, just right-click on the element in a DOM-tree and select copy menu-item.
Avoid using classes (especially one without semantic meaning) as an anchor point. They might change from page to page, or in time.
Better to use semantically meaningful nodes, like in your case:
XPath for the title would looks like this //main//header//div[#data-automation="main_h1"]//h1.
You can't use for loop in each listing page
from scrapy.crawler import CrawlerProcess
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Attractions-g187427-Activities-oa90-Spain'
]
def parse(self, response):
for link in response.css('.EsZYd a::attr(href)').getall():
#print(link)
yield response.follow(link, callback=self.parse_categories)
def parse_categories(self, response):
yield {
'name' : response.css('h1.WlYyy.cPsXC.GeSzT::text').get(),
'address' :''.join(response.xpath('(//*[#class="hxQKk"])[1]//text()').getall()[:-1]),
'url':response.url
}
if __name__ == "__main__":
process =CrawlerProcess(QuotesSpider)
process.crawl()
process.start()

How to run both items in scrapy function?

Whenever I use the link of captions and transcription in start_urls variable, it gives me the price of caption in both captions and transcription variable and again give me the price of transcription in both variables. Why and how to solve this issue?
import scrapy
from .. items import FetchingItem
class SiteFetching(scrapy.Spider):
name = 'Site'
start_urls = ['https://www.rev.com/freelancers/captions',
'https://www.rev.com/freelancers/transcription']
def parse(self, response):
items = FetchingItem()
Transcription_price = response.css('#middle-benefit .mt1::text').extract()
Caption_price = response.css('#middle-benefit .mt1::text').extract()
items['Transcription_price'] = Transcription_price
items['Caption_price'] = Caption_price
yield items
I suspect that you need another structure of class, sequential:
import scrapy
from .. items import FetchingItem
class SiteFetching(scrapy.Spider):
name = 'Site'
start_urls = ['https://www.rev.com/freelancers/captions']
def parse(self, response):
items = FetchingItem()
items['Caption_price'] = response.css('#middle-benefit .mt1::text').extract()
yield Request('https://www.rev.com/freelancers/transcription', self.parse_transcription, meta={'items': items})
def parse_transcription(self, response):
items = response.meta['items']
items['Transcription_price'] = response.css('#middle-benefit .mt1::text').extract()
yield items

Scrapy - scrape of all of the item instead of 1 item

I need to scrape all of the items but only 1 item is scrape.
My code is working fine before but when I transfer it to other project which is same code this happens I don't know why
I need to get all of the items according to the page size in start_url
here's my working code
class HmSalesitemSpider(scrapy.Spider):
name = 'HM_salesitem'
allowed_domains = ['www2.hm.com']
start_urls = ['https://www2.hm.com/en_us/sale/shopbyproductladies/view-
all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-
size=3002']
def parse(self, response):
for product_item in response.css('li.product-item'):
url = "https://www2.hm.com/" + product_item.css('a::attr(href)').extract_first()
yield scrapy.Request(url=url, callback=self.parse_subpage)
def parse_subpage(self, response):
item = {
'title': response.xpath("normalize-space(.//h1[contains(#class, 'primary') and contains(#class, 'product-item-headline')]/text())").extract_first(),
'sale-price': response.xpath("normalize-space(.//span[#class='price-value']/text())").extract_first(),
'regular-price': response.xpath('//script[contains(text(), "whitePrice")]/text()').re_first("'whitePrice'\s?:\s?'([^']+)'"),
'photo-url': response.css('div.product-detail-main-image-container img::attr(src)').extract_first(),
'description': response.css('p.pdp-description-text::text').extract_first()
}
yield item
Please Help. Thank you
It seems you have problem with indents. Move yielding request to for loop:
def parse(self, response):
for product_item in response.css('li.product-item'):
url = "https://www2.hm.com/" + product_item.css('a::attr(href)').get()
yield scrapy.Request(url=url, callback=self.parse_subpage)
Or this is a bit cleared version:
def parse(self, response):
for link in response.css('li.product-item a::attr(href)').extract():
yield response.follow(link, self.parse_subpage)

How do I recurse in scrapy?

Here's my code
import scrapy
class PvSpider(scrapy.Spider):
name = 'pv'
allowed_domains = ['www.piaov.com']
start_urls = ['http://www.piaov.com/']
def start_requests(self):
yield scrapy.Request(url='http://www.piaov.com/list/7.html')
def parse(self, response):
names = response.xpath("//ul[#class='mlist']//li/a/#title").extract()
on = response.meta.get("names", [])
cmp_names = on + names
for p in range(2, 7):
yield scrapy.Request(url='http://www.piaov.com/list/7_{}.html'.format(p),
meta={"names": cmp_names},
callback=self.parse)
yield scrapy.Request("http://www.piaov.com", meta={"names": cmp_names}, callback=self.parse_item)
def parse_item(self, response):
pass
When i debug my code in 'parse_item' function,the 'response.meta["names"]' only include the first page datas(12 titles in this case), how could i get the 6 pages datas list.
Its because you have URL http://www.piaov.com and scrapy ignores the duplicate URLs unless dont_filter=True is specified in Request like Request(url_here, dont_filter=True)
Also I don't like your logic of scraper, why are you calling parse_item at all? it is not necessary. Please see the code below and do it like that.
import scrapy
class PvSpider(scrapy.Spider):
name = 'pv'
allowed_domains = ['www.piaov.com']
start_urls = ['http://www.piaov.com/']
def start_requests(self):
yield scrapy.Request(url='http://www.piaov.com/list/7.html')
def parse(self, response):
for name in response.xpath("//ul[#class='mlist']//li/a/#title").extract():
yield {"name": name}
for p in range(2, 7):
yield scrapy.Request(url='http://www.piaov.com/list/7_{}.html'.format(p),
callback=self.parse)

Python Scrapy allowed_damins Attriute

I am studying with coding simple code, getting post information of stack over flow questions.
I set allowed_domains = ["http://stackoverflow.com/questions/] with a base Spider.
And its parse() Method only return a Request with the url of format. "http://stackoverflow.com/questions/%d/" % no
I thought it will work...maybe i have a misunderstanding on allowed_domain.
All requests returned by parse() seem filtered by allowed_domain.
It only works when i remove the allowed_domain.
Can you explain..? sorry about my trivial question.
class StackOverFlowPost(scrapy.Spider):
startNo = 26200877
endNo = 26200880
curNo = 26200877
name = "stackOverFlowPost"
start_urls = ["http://stackoverflow.com/questions/%d/" % startNo ]
allowed_domains = ["http://stackoverflow.com/questions"]
baseUrl = "http://stackoverflow.com/questions/%d/"
def parse(self, response):
itemObj = items.StackOverFlowItem()
# getting items information from the page
...
yield itemObj
StackOverFlowPost.curNo += 1
nextPost = StackOverFlowPost.baseUrl % StackOverFlowPost.curNo
yield scrapy.Request(nextPost, callback = self.parse)
In you spider, allowed_domains should be a list of domain (not url):
allowed_domains = ["stackoverflow.com"]
Notice that, you can also set start_urls with a list of url:
start_urls = ["http://stackoverflow.com/questions/%d/" % i for i in range(startNo, endNo+1)]
It makes parse() easy to write.

Categories