I'm doing the Scrapy tutorial and am on the 'Craigslist Scrapy Spider #3 – Multiple Pages' section, but I'm unable to get more than one page after following the instructions. The only difference between what I did and what the tutorial showed is that I used 'all jobs' rather than only engineering jobs (since there was only one page of engineering jobs). Below is the code I have:
import scrapy
from scrapy import Request

class JobsSpider(scrapy.Spider):
    name = 'jobs-new'
    allowed_domains = ['craigslist.org']
    start_urls = ['https://newyork.craigslist.org/search/jjj']

    def parse(self, response):
        jobs = response.xpath('//p[@class="result-info"]')

        for job in jobs:
            title = job.xpath('a/text()').extract_first()
            address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1]
            relative_url = job.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)

            yield {'URL': absolute_url, 'Title': title, 'Address': address}

        relative_next_url = response.xpat('//a[@class="button next"]/@href').extract_first()
        absolute_next_url = response.urljoin(relative_next_url)

        yield request(absolute_next_url, callback=self.parse)
I ran this in a terminal using
scrapy crawl jobs-new -o jobs-new.csv
but there was only the first page of results within the .csv file.
What do I need to do to get more than one page? Is the tutorial incorrect or did I understand it incorrectly?
I just edited your code and it works now; the two fixes are response.xpath (you had xpat) and scrapy.Request (you had lowercase request).
import scrapy
from scrapy import Request

class JobsSpider(scrapy.Spider):
    name = 'jobs-new'
    allowed_domains = ['craigslist.org']
    start_urls = ['https://newyork.craigslist.org/search/jjj']

    def parse(self, response):
        jobs = response.xpath('//p[@class="result-info"]')

        for job in jobs:
            title = job.xpath('a/text()').extract_first()
            address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1]
            relative_url = job.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)

            yield {'URL': absolute_url, 'Title': title, 'Address': address}

        relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        absolute_next_url = response.urljoin(relative_next_url)

        yield scrapy.Request(absolute_next_url, callback=self.parse)
Here is some output
{'URL': 'https://newyork.craigslist.org/brk/trp/d/brooklyn-overnight-parking-attendant/7166876233.html', 'Title': 'Overnight Parking Attendant', 'Address': 'Brooklyn, NY'}
{'URL': 'https://newyork.craigslist.org/wch/fbh/d/yonkers-experience-grill-man/7166875818.html', 'Title': 'Experience grill man', 'Address': 'Yonkers'}
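One small addition worth making (not in the tutorial): on the last results page there is no next button, so extract_first() returns None, and the spider ends up issuing one redundant request for the current page. A minimal sketch of guarding the tail of parse against that:

        # inside parse(), after the for loop over jobs
        relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        if relative_next_url:  # None on the last results page
            absolute_next_url = response.urljoin(relative_next_url)
            yield scrapy.Request(absolute_next_url, callback=self.parse)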
I'm new to Scrapy. I have written a script for an e-commerce website and need to scrape the details mentioned below from that site. I'm facing an issue with this script; please help me figure out what's wrong.
Website: https://savedbythedress.com/collections/maternity-tops
import scrapy

class DressSpider(scrapy.Spider):
    name = 'dress'
    allowed_domains = ['savedbythedress.com']
    start_urls = ['https://savedbythedress.com/collections/maternity-tops']

    def parse(self, response):
        # scrape all product links
        domain = "https://savedbythedress.com"
        link_products = response.css('div[class="product-info-inner"] ::attr(href)').get()
        for link in link_products:
            product_link = domain + link
            yield {
                'product_link': product_link.css('div[class="product-info-inner"] ::attr(href)').get(),
            }
            yield scrapy.Request(url=product_link, callback=self.parse_contents)

    def parse_contents(self, response):
        # scrape needed information
        productlink = response.url
        yield {
            'product_title': response.css('.sbtd-product-title ::text').get(),
            'product_price': response.css('.product-price ::text').get(),
            'product_review': response.css('.Natsob ::text').getall()
        }
Use yield response.follow(page_url, self.parse_contents); it will work for you.
import scrapy

class DressSpider(scrapy.Spider):
    name = 'dress'
    allowed_domains = ['savedbythedress.com']
    start_urls = ['https://savedbythedress.com/collections/maternity-tops']

    def parse(self, response):
        # scrape all product links
        domain = "https://savedbythedress.com"
        # link_products = response.css('div[class="product-info-inner"] ::attr(href)').get()
        for link in response.css('div.product-info'):
            page_url = link.css('div[class="product-info-inner"] ::attr(href)').get()
            print('PAGE URL IS ', page_url)
            yield response.follow(page_url, self.parse_contents)
            # product_link = domain + link
            # yield {
            #     'product_link': link.css('div[class="product-info-inner"] ::attr(href)').get(),
            # }
            print(page_url)
        # yield scrapy.Request(response.follow(page_url), callback=self.parse_contents)

    def parse_contents(self, response):
        print()
        # scrape needed information
        print(response.url)
        productlink = response.url
        yield {
            'product_title': response.css('.sbtd-product-title ::text').get(),
            'product_price': response.css('.product-price ::text').get(),
            'product_review': response.css('.Natsob ::text').getall()
        }
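For what it's worth, the reason response.follow helps here is that it accepts relative URLs and resolves them against response.url, whereas scrapy.Request needs an absolute URL. A roughly equivalent sketch using scrapy.Request explicitly, assuming the same selectors as above:

    def parse(self, response):
        for link in response.css('div.product-info'):
            page_url = link.css('div[class="product-info-inner"] ::attr(href)').get()
            if page_url:
                # build an absolute URL first, since scrapy.Request won't resolve relative ones
                yield scrapy.Request(response.urljoin(page_url), callback=self.parse_contents)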
I am having a problem with my Scrapy program. I want to crawl information from the following website:
https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=
I want to get the "Part No." information inside the span id="resPartNum" tag. I have already tried:
- NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
- NAME_SELECTOR = './/span[@class="resPartNum"]/text()'
- NAME_SELECTOR = './/tr/td/span[@class="resPartNum"]/a/text()'
Here is my full CODE:
import scrapy

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = '.set'
        for part in response.css(SET_SELECTOR):
            NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css(NAME_SELECTOR).extract_first(),
            }
I am not very advanced in scrapy and would appreciate ANY HELP!!
Use the CSS selector table.partlookup_table to collect each table item, then loop over them to pull partNum and partName; extract() returns a list here.
import scrapy
from scrapy.crawler import CrawlerProcess  # needed for the standalone run below

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = 'table.partlookup_table'
        for part in response.css(SET_SELECTOR):
            # NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css('span.resPartName a::text').extract(),
                'partnumber': part.css('span.resPartNum a::text').extract()
            }

process = CrawlerProcess()
process.crawl(PartSpider)
process.start()
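For reference, the last three lines run the spider as a plain Python script instead of via scrapy crawl. To actually save the items somewhere when run that way, you can pass a feed export setting; a minimal sketch, assuming you want JSON output (the parts.json filename is just an example, and the FEEDS setting needs Scrapy 2.1+):

from scrapy.crawler import CrawlerProcess

# equivalent of `scrapy crawl part_spider -o parts.json` for a standalone script
process = CrawlerProcess(settings={
    "FEEDS": {"parts.json": {"format": "json"}},
})
process.crawl(PartSpider)
process.start()  # blocks until the crawl is finished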
I wrote a script to get the data from a website. I have an issue with collecting the website URL, since the @href is a redirect link. How can I convert the redirect URL into the actual website it redirects to?
import scrapy
import logging

class AppSpider(scrapy.Spider):
    name = 'app'
    allowed_domains = ['www.houzz.in']
    start_urls = ['https://www.houzz.in/professionals/searchDirectory?topicId=26721&query=Design-Build+Firms&location=Mumbai+City+District%2C+India&distance=100&sort=4']

    def parse(self, response):
        lists = response.xpath('//li[@class="hz-pro-search-results__item"]/div/div[@class="hz-pro-search-result__info"]/div/div/div/a')
        for data in lists:
            link = data.xpath('.//@href').get()
            yield scrapy.Request(url=link, callback=self.parse_houses, meta={'Links': link})

        next_page = response.xpath('(//a[@class="hz-pagination-link hz-pagination-link--next"])[1]/@href').extract_first()
        if next_page:
            yield response.follow(response.urljoin(next_page), callback=self.parse)

    def parse_houses(self, response):
        link = response.request.meta['Links']
        firm_name = response.xpath('//div[@class="hz-profile-header__title"]/h1/text()').get()
        name = response.xpath('//div[@class="profile-meta__val"]/text()').get()
        phone = response.xpath('//div[@class="hz-profile-header__contact-info text-right mrm"]/a/span/text()').get()
        website = response.xpath('(//div[@class="hz-profile-header__contact-info text-right mrm"]/a)[2]/@href').get()

        yield {
            'Links': link,
            'Firm_name': firm_name,
            'Name': name,
            'Phone': phone,
            'Website': website
        }
You have to make a request to that target URL to see where it leads.
In your case, you can simply make a HEAD request, which does not load the body of the target URL, so it saves bandwidth and speeds up your script as well.
    # needs `from scrapy import Request` at the top of the file
    def parse_houses(self, response):
        link = response.request.meta['Links']
        firm_name = response.xpath('//div[@class="hz-profile-header__title"]/h1/text()').get()
        name = response.xpath('//div[@class="profile-meta__val"]/text()').get()
        phone = response.xpath('//div[@class="hz-profile-header__contact-info text-right mrm"]/a/span/text()').get()
        website = response.xpath('(//div[@class="hz-profile-header__contact-info text-right mrm"]/a)[2]/@href').get()

        yield Request(url=website,
                      method="HEAD",
                      callback=self.get_final_link,
                      meta={'data':
                          {
                              'Links': link,
                              'Firm_name': firm_name,
                              'Name': name,
                              'Phone': phone,
                              'Website': website
                          }
                      })

    def get_final_link(self, response):
        data = response.meta['data']
        data['website'] = response.headers['Location']
        yield data
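One caveat with the HEAD approach that is easy to trip over: by default Scrapy's redirect middleware follows 3xx responses before your callback runs, so Location may no longer be present, and header values come back as bytes. A hedged sketch of the same request with the redirect kept for inspection (dont_redirect and handle_httpstatus_list are standard Scrapy meta keys; the rest mirrors the code above):

        yield Request(
            url=website,
            method="HEAD",
            callback=self.get_final_link,
            meta={
                'data': {'Links': link, 'Firm_name': firm_name, 'Name': name,
                         'Phone': phone, 'Website': website},
                'dont_redirect': True,                # hand the 301/302 response to the callback
                'handle_httpstatus_list': [301, 302],
            },
        )

    def get_final_link(self, response):
        data = response.meta['data']
        # header values are bytes; decode before storing the real destination URL
        data['Website'] = response.headers['Location'].decode('utf-8')
        yield data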
If your goal is just to get the website, the actual website link is available in the source code of each listing as well, so you can grab it with a regex; there is no need to visit the encrypted redirect URL.
    # needs `import re` at the top of the file
    def parse_houses(self, response):
        link = response.request.meta['Links']
        firm_name = response.xpath('//div[@class="hz-profile-header__title"]/h1/text()').get()
        name = response.xpath('//div[@class="profile-meta__val"]/text()').get()
        phone = response.xpath('//div[@class="hz-profile-header__contact-info text-right mrm"]/a/span/text()').get()
        website = re.findall(r"\"url\"\: \"(.*?)\"", response.text)[0]
You can do something like this:
class AppSpider(scrapy.Spider):
    base_url = 'www.houzz.in{}'
    .
    .
    .
    def foo(self):
        actual_url = self.base_url.format(redirect_url)
CODE:
import scrapy
from scrapy.spiders import CrawlSpider
from scrapy import Request

class TestSpider(CrawlSpider):
    name = "test_spyder"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags']

    def parse(self, response):
        title_1 = response.xpath('//h1/text()').extract_first()
        next_url = 'https://stackoverflow.com/users'
        title_2 = Request(url=next_url, callback=self.parse_some)
        yield {'title_1': title_1, 'title_2': title_2}

    def parse_some(self, response):
        return response.xpath('//h1/text()').extract_first()
I don't understand why, instead of the second page's title (Users), I get a different value (https://stackoverflow.com/users>).
Scrapy should return the values Tags + Users, but it returns Tags + <Request GET htt..., at least I think so.
Where is the error and how do I fix it?
To crawl a URL you need to yield a Request object, so your parse callbacks should do one of two things:
Yield a dictionary/Item - this is the end of the crawl chain. The item is generated, sent through the pipelines, and finally saved somewhere if you have that set up.
Yield a Request object - this continues the crawl chain with another callback.
Example of this process:
crawl url1 (2)
crawl url2 (2)
yield item (1)
Your spider in this case should look like this:
def parse(self, response):
    title = response.xpath('//h1/text()').extract_first()
    yield {'title': title}
    next_url = 'https://stackoverflow.com/users'
    yield Request(url=next_url, callback=self.parse_some)
And your end result when crawling with scrapy crawl test_spyder -o output.json:
# output.json
[
{'title': 'title1'},
{'title': 'title2'}
]
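One more detail so the second title actually shows up as an item: a callback has to yield a dict/Item or a Request; Scrapy will log an error for a bare string return, so parse_some needs a small change too. A minimal sketch:

def parse_some(self, response):
    # a plain string is not a valid spider output; wrap it in a dict
    yield {'title': response.xpath('//h1/text()').extract_first()}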
I need a Scrapy spider to scrape the following page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) for each product URL (30 products, so 30 URLs) and then follow each of those URLs and scrape the data inside.
I have the second part working exactly as I want:
import scrapy

class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }

        # next_page = response.css('div.ph-summary-entry-ctn a::attr("href")').extract_first()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
But I don't know how to do the first part. As you will see I have the main page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) set as the start_url. But how do I get it to populate the start_urls list with all 30 urls I need crawled?
I am not able to test at this moment, so please let me know if this works for you so I can edit it should there be any bugs.
The idea here is that we find every product link on the first page and yield new Scrapy requests, passing your product parsing method as a callback.
import scrapy
from urllib.parse import urljoin

class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        products = response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract()
        for p in products:
            url = urljoin(response.url, p)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }
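If you also want the pagination from your commented-out lines, it can be tacked onto the end of parse. A minimal sketch, assuming the selector from your comment (div.ph-summary-entry-ctn a::attr("href")) really does point at the next results page, which I haven't verified:

    def parse(self, response):
        products = response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract()
        for p in products:
            yield scrapy.Request(urljoin(response.url, p), callback=self.parse_product)

        # follow the next results page, if any, and parse it the same way
        next_page = response.css('div.ph-summary-entry-ctn a::attr("href")').extract_first()
        if next_page is not None:
            yield response.follow(next_page, self.parse)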