Here is my Scrapy code. I don't know my mistake, but it only scrapes the first page. How can I scrape and traverse through the pages? Is there another way to scrape the next pages?
import scrapy

class HurriyetEmlakPage(scrapy.Spider):
    name = 'hurriyetspider'
    allowed_domain = 'hurriyetemlak.com'
    start_urls = ['https://www.hurriyetemlak.com/satilik']

    def parse(self, response):
        fiyat = response.xpath('//div[@class="list-view-price"]//text()').extract()
        durum = response.xpath('//div[@class="middle sibling"]//div[@class="left"]//text()').extract()
        oda_sayisi = response.xpath('//span[@class="celly houseRoomCount"]//text()').extract()
        metrekare = response.xpath('//span[@class="celly squareMeter list-view-size"]//text()').extract()
        bina_yasi = response.xpath('//span[@class="celly buildingAge"]//text()').extract()
        bulundugu_kat = response.xpath('//span[@class="celly floortype"]//text()').extract()
        konum = response.xpath('//div[@class="list-view-location"]//text()').extract()

        scraped_info = {
            'fiyat': fiyat,
            'durum': durum,
            'oda_sayisi': oda_sayisi,
            'metrekare': metrekare,
            'bina_yasi': bina_yasi,
            'bulundugu_kat': bulundugu_kat,
            'konum': konum
        }
        yield scraped_info

        next_page_url = response.xpath('//li[@class="next-li pagi-nav"]//a').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
Actually, you could simply generate your url list like this:
url_list = [f"https://www.hurriyetemlak.com/satilik?page={page}" for page in range(1,7326)]
Output
['https://www.hurriyetemlak.com/satilik?page=1',
'https://www.hurriyetemlak.com/satilik?page=2',
'https://www.hurriyetemlak.com/satilik?page=3',
'https://www.hurriyetemlak.com/satilik?page=4',
'https://www.hurriyetemlak.com/satilik?page=5',
...]
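A minimal sketch of plugging that list into the spider (keeping the extraction logic from the question unchanged):

import scrapy

class HurriyetEmlakPage(scrapy.Spider):
    name = 'hurriyetspider'
    allowed_domains = ['hurriyetemlak.com']
    # generate every listing page up front instead of following "next" links
    start_urls = [f"https://www.hurriyetemlak.com/satilik?page={page}"
                  for page in range(1, 7326)]

    def parse(self, response):
        ...  # same extraction logic as in the question

For the record, the original pagination only fails because the XPath returns the serialized <a> element rather than its href attribute; '//li[@class="next-li pagi-nav"]//a/@href' with extract_first() would make the next-page approach work too.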
I'm new to Scrapy. I have written a script for an e-commerce website and need to scrape the details mentioned below. I'm facing an issue with this script; please help me resolve it.
Website: https://savedbythedress.com/collections/maternity-tops
import scrapy

class DressSpider(scrapy.Spider):
    name = 'dress'
    allowed_domains = ['savedbythedress.com']
    start_urls = ['https://savedbythedress.com/collections/maternity-tops']

    def parse(self, response):
        # scrape all product links
        domain = "https://savedbythedress.com"
        link_products = response.css('div[class="product-info-inner"] ::attr(href)').get()
        for link in link_products:
            product_link = domain + link
            yield {
                'product_link': product_link.css('div[class="product-info-inner"] ::attr(href)').get(),
            }
            yield scrapy.Request(url=product_link, callback=self.parse_contents)

    def parse_contents(self, response):
        # scrape the needed information
        productlink = response.url
        yield {
            'product_title': response.css('.sbtd-product-title ::text').get(),
            'product_price': response.css('.product-price ::text').get(),
            'product_review': response.css('.Natsob ::text').getall()
        }
Use yield response.follow(page_url, self.parse_contents); it will work for you:
import scrapy

class DressSpider(scrapy.Spider):
    name = 'dress'
    allowed_domains = ['savedbythedress.com']
    start_urls = ['https://savedbythedress.com/collections/maternity-tops']

    def parse(self, response):
        # follow every product link on the listing page
        for link in response.css('div.product-info'):
            page_url = link.css('div[class="product-info-inner"] ::attr(href)').get()
            yield response.follow(page_url, self.parse_contents)

    def parse_contents(self, response):
        # scrape the needed information from the product page
        yield {
            'product_title': response.css('.sbtd-product-title ::text').get(),
            'product_price': response.css('.product-price ::text').get(),
            'product_review': response.css('.Natsob ::text').getall()
        }
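Note that response.follow accepts a relative URL and joins it with the current page's URL, so the manual domain + link concatenation from the original code is not needed.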
I am having a problem with my Scrapy program. I want to crawl information from the following website:
https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=
I want to get the "Part No." information inside the span id="resPartNum" tag. I have already tried:
- NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
- NAME_SELECTOR = './/span[@class="resPartNum"]/text()'
- NAME_SELECTOR = './/tr/td/span[@class="resPartNum"]/a/text()'
Here is my full CODE:
import scrapy

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = '.set'
        for part in response.css(SET_SELECTOR):
            NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css(NAME_SELECTOR).extract_first(),
            }
I am not very advanced in scrapy and would appreciate ANY HELP!!
Use the CSS selector table.partlookup_table to collect each table item, then loop over partNum and partName. Note that extract() returns a list.
import scrapy
from scrapy.crawler import CrawlerProcess

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = 'table.partlookup_table'
        for part in response.css(SET_SELECTOR):
            yield {
                'name': part.css('span.resPartName a::text').extract(),
                'partnumber': part.css('span.resPartNum a::text').extract()
            }

process = CrawlerProcess()
process.crawl(PartSpider)
process.start()
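Since extract() returns a list, each field in the yielded item is a list of strings. If you would rather have a single string per field, extract_first() (or the newer .get()) returns just the first match, e.g.:

'partnumber': part.css('span.resPartNum a::text').extract_first()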
I am trying to crawl Zomato to get info about the restaurants in Istanbul, so I am trying to get all the hrefs on the search result pages. However, I am only getting the first search result of every page.
import scrapy
from ..items import ZomatodataItem

class ZomatoSpider(scrapy.Spider):
    name = 'zomato'
    allowed_domains = ["zomato.com"]
    start_urls = [
        'https://www.zomato.com/istanbul/restaurants?page=1'
    ]

    def parse(self, response):
        all_css = response.css('.search_left_featured')
        all_product = all_css.css('a::attr(href)').get()
        yield scrapy.Request(all_product, callback=self.parse_dir_contents)

        max_page_number = 6
        for i in range(1, max_page_number):
            url_next = 'https://www.zomato.com/istanbul/restaurants?page=' + str(i)
            yield scrapy.Request(url_next, callback=self.parse)

    def parse_dir_contents(self, response):
        items = ZomatodataItem()
        items['name'] = response.css('.diBDma::text').extract()
        items['genre'] = response.css('.gQXqL::text').extract_first()
        items['tags'] = response.css('.cunMUz::text').extract()
        items['address'] = response.css('.clKRrC::text').extract()
        items['phone_number'] = response.css('.kKemRh::text').extract()
        yield items
It makes sense that you only get one result: all_product will only contain one item. If you want the full list, you'll have to update it to this:
all_products = all_css.css('a::attr(href)').getall()
Now you can loop through the links and get the detailed information like this:
for product in all_products:
    yield scrapy.Request(product, callback=self.parse_dir_contents)
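Putting it together, the parse method would look like this (a sketch reusing the selectors from the question):

def parse(self, response):
    all_css = response.css('.search_left_featured')
    # getall() returns every matching href, not just the first one
    all_products = all_css.css('a::attr(href)').getall()
    for product in all_products:
        yield scrapy.Request(product, callback=self.parse_dir_contents)

    max_page_number = 6
    for i in range(1, max_page_number):
        url_next = 'https://www.zomato.com/istanbul/restaurants?page=' + str(i)
        yield scrapy.Request(url_next, callback=self.parse)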
I have this code:
import scrapy
import requests

class cvbankas(scrapy.Spider):
    name = 'bankas'
    allowed_domains = ['cvbankas.lt']
    start_urls = ['https://www.cvbankas.lt/']

    def parse(self, response):
        job_position_tag = response.css("h3.list_h3::text").extract()
        city_tag = response.css("span.list_city::text").extract()
        company_tag = response.css("span.dib.mt5::text").extract()
        salary_tag = response.css("span.salary_amount::text").extract()

        for item in zip(job_position_tag, city_tag, company_tag, salary_tag):
            scraped_info = {
                'company': company_tag,
                'city': city_tag,
                'position': job_position_tag,
                'salary': salary_tag,
            }
            yield scraped_info

        next_page = response.css('li > a::attr(href)').extract_first()
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)
And I don't know why it scrapes only 3 pages. The output shows only 3 pages out of 88. Where is the problem in the pagination?
Your selector was finding the first <a> tag it could find, which was the language <a> tag. You were changing languages, not pages.
import scrapy

class cvbankas(scrapy.Spider):
    name = 'bankas'
    allowed_domains = ['cvbankas.lt']
    start_urls = ['https://www.cvbankas.lt/']

    def parse(self, response):
        job_position_tag = response.css("h3.list_h3::text").extract()
        city_tag = response.css("span.list_city::text").extract()
        company_tag = response.css("span.dib.mt5::text").extract()
        salary_tag = response.css("span.salary_amount::text").extract()

        # yield one item per listing instead of the full lists every time
        for position, city, company, salary in zip(job_position_tag, city_tag, company_tag, salary_tag):
            yield {
                'company': company,
                'city': city,
                'position': position,
                'salary': salary,
            }

        # the last "prev_next" link on the page is the next-page link
        next_page = response.xpath('//a[@class="prev_next"]/@href').extract()[-1]
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)
It looks like the website you are scraping uses the URL format uri?page=x. A simple loop to replace x can solve your problem, as sketched below.
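For example (a sketch, assuming the site paginates with ?page= and has 88 pages as stated in the question):

start_urls = [f"https://www.cvbankas.lt/?page={page}" for page in range(1, 89)]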
I have to scrape all movies from this IMDb page: https://www.imdb.com/list/ls055386972/.
My approach is first to scrape all the values, which gives me something like [u'The Godfather', u'Schindler\'s List', ...], but I want only the "/title/tt0068646/?ref_=ttls_li_tt" portion. How do I proceed?
import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.imdb.com/list/ls055386972/")
soup = BeautifulSoup(page.content, 'html.parser')

movies = soup.findAll('h3', attrs={'class': 'lister-item-header'})
for movie in movies:
    print(movie.a['href'])
OUTPUT:
/title/tt0068646/?ref_=ttls_li_tt
/title/tt0108052/?ref_=ttls_li_tt
/title/tt0050083/?ref_=ttls_li_tt
/title/tt0118799/?ref_=ttls_li_tt
...
/title/tt0088763/?ref_=ttls_li_tt
/title/tt0266543/?ref_=ttls_li_tt
I would suggest you use requests-html to get all the hyperlinks and remove the ones that don't match your criteria. You can even get the absolute URLs using r.html.absolute_links.
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://www.imdb.com/list/ls055386972/')

# r.html.links is a set of strings; keep only the relative title links
links = [link for link in r.html.links if link.startswith('/title/')]
print(links)
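The same filter works with absolute URLs via r.html.absolute_links:

links = [link for link in r.html.absolute_links if '/title/' in link]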
Here is working code, please try:
import scrapy
from scrapy import Request

class MoviesSpider(scrapy.Spider):
    name = 'movies'  # name of the spider
    allowed_domains = ['imdb.com']
    start_url = 'http://imdb.com/list/ls055386972/'

    def start_requests(self):
        yield Request(self.start_url, callback=self.parse)

    def parse(self, response):
        # collect every movie link from the list page
        links = response.xpath('//h3[@class]/a/@href').extract()
        for link in links:
            absolute_url = response.urljoin(link)
            yield Request(absolute_url, callback=self.parse_movies)

        # process next page url
        # next_page_url = response.xpath('//a[text() = "Next"]/@href').extract_first()
        # absolute_next_page_url = response.urljoin(next_page_url)
        # yield Request(absolute_next_page_url)

    def parse_movies(self, response):
        title = response.xpath('//div[@class = "title_wrapper"]/h1[@class]/text()').extract_first()
        yield {
            'title': title,
        }
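If this spider is saved as movies.py (the filename is just an example), it can be run with:

scrapy runspider movies.py -o movies.json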