I'm trying to build a for-each loop for my products so that I can scrape them one by one from an array, and I would also like to know where to place the loop.
The array I want to use is called EAN.
import scrapy
import re
import MySQLdb


class ProductSpider(scrapy.Spider):
    db = MySQLdb.connect(host="localhost",    # Host name
                         user="root",         # User name
                         passwd="",           # Password
                         db="ProductSpider")  # Database name
    cur = db.cursor()
    cur.execute("SELECT EAN FROM product")

    name = 'product'
    EAN = []
    rows = cur.fetchall()
    for row in rows:
        EAN = (row[0])
        # print(row)  # activate to see EAN codes
    start_urls = ['https://www.google.nl/search?client=opera&biw=1880&bih=1008&output=search&tbm=shop&q='+EAN+'&oq='+EAN+'&gs_l=products-cc.12...0.0.0.2112.0.0.0.0.0.0.0.0..0.0....0...1ac..64.products-cc..0.0.0....0.Mgj-aNT06E4']
    custom_settings = {
        'FEED_URI': 'tmp/' + EAN + '.csv'
    }
Here is what I've made.
for EAN in range(len(EAN)):  # finish the for loop
    EAN.append('EAN')
    print(EAN)
    def parse(self, response):
        urls = response.css('.MCpGKc > a::attr("href")').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse)

        response.selector.remove_namespaces()
        all_sellers = response.css(".os-seller-name-primary > a::text").extract()
        all_prices = response.css("td.os-total-col::text").re("\d+\,\d{1,2}")
        all_urls = response.css(".os-seller-name-primary > a::attr('href')").extract()

        for item in zip(all_prices, all_sellers, all_urls):
            scrapped_info = {
                'price': item[0],
                'seller': item[1],
                'url': item[2]
            }
            yield scrapped_info

        next_page_url = response.css('.pag-prev-next-links > a:last-child::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
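For reference, one common place to put such a loop is in start_requests: fetch the EAN codes once and yield one request per code. This is only a sketch, not necessarily the intended design; the Google Shopping query URL is simplified here and the parsing logic is assumed to stay as in the question.

import scrapy
import MySQLdb


class ProductSpider(scrapy.Spider):
    name = 'product'

    def start_requests(self):
        # Fetch all EAN codes once, then build one request per code.
        db = MySQLdb.connect(host="localhost", user="root", passwd="", db="ProductSpider")
        cur = db.cursor()
        cur.execute("SELECT EAN FROM product")
        for (ean,) in cur.fetchall():
            url = 'https://www.google.nl/search?tbm=shop&q=' + str(ean) + '&oq=' + str(ean)
            # Carry the EAN along so parse() knows which code this page belongs to.
            yield scrapy.Request(url, callback=self.parse, meta={'ean': ean})

    def parse(self, response):
        ean = response.meta['ean']
        # ... same parsing logic as in the question ...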
My spider only shows results from the first page, but I want results from all the pages; it should crawl the 2nd page, then the 3rd page, and so on.
import scrapy


class QuoteSpider(scrapy.Spider):
    name = 'quotes'
    base_url = 'https://www.yell.com'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        all_data = response.css('div.row.businessCapsule--mainRow')
        for data in all_data:
            title = data.css('.text-h2::text').extract()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url + business_url
            yield response.follow(final_url, self.parse)

            avg_rating = response.css('span.starRating--average::text').get()
            items = {
                'Title': title,
                'Title Url': final_url,
                'Average Rating': avg_rating
            }
            yield items
            pass

        next_page = response.urljoin(response.css('a.pagination--next::attr(href)').extract_first())
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
This should do it.
class YellSpider(scrapy.Spider):
    name = 'yell'
    base_url = 'https://www.yell.com{}'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        for data in response.css('div.row.businessCapsule--mainRow'):
            title = data.css('.text-h2::text').get()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url.format(business_url)
            avg_rating = response.css('span.starRating--average::text').get()
            yield {
                'Title': title,
                'Title Url': final_url,
                'Average Rating': avg_rating
            }

        next_page = response.css('a.pagination--next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
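In this version the extra per-business request is dropped, so each item is built from the listing page and yielded right there, and the relative next_page href is passed straight to response.follow, which resolves it against the current page URL.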
I am scraping data from a URL (product name, price, etc.), but there is a JSON file in the backend which I also want to scrape, as it holds relevant information. In a nutshell, I want to change my request URL to the JSON one and then return to the original URL so the crawl can continue.
Product URL: https://www2.hm.com/hu_hu/productpage.0906822002.html
Related JSON URL (this can be found in the Network tab; I store it in a variable named availability_url):
https://www2.hm.com/hmwebservices/service/product/hu/availability/0906822.json
It is very important to have the availability data in a variable before the actual yield, because I have to return to the original URL before checking the colors at the end of the code:
import scrapy
import re


class HMSpider(scrapy.Spider):
    name = 'hm'
    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        items = response.css('h3.item-heading a::attr(href)').getall()
        for item in items:
            link = 'https://www2.hm.com' + item
            yield scrapy.Request(link, self.parse_item)

    def parse_item(self, response, request):
        page_source_data = response.xpath('//div[@class= "tealiumProductviewtag productview parbase"]//text()')[0].get()
        data = response.css('div.product.parbase script::text').get()
        base_product_code = re.findall("ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall("articleCode':'(.*)', 'baseProductCode", data)[0]
        current_price = int(re.findall(r'\d+', re.findall('product_list_price : \["(.*?)\],', page_source_data)[0])[0])
        original_price = int(re.findall(r'\d+', re.findall('product_original_price : \[(.*?)\],', page_source_data)[0])[0])
        availability_url = 'https://www2.hm.com/hmwebservices/service/product/hu/availability/' + base_product_code + ".json"
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"
        if current_price != original_price:
            yield {
                'product_name': re.findall('(?<= ).*$', response.css('section.name-price h1.primary.product-item-headline::text').get())[0],
                'vendor': 'H&M',
                'current_price': int(current_price),
                'original_price': int(original_price),
                'discount_percent': 100 - round((current_price / original_price) * 100),
                'colors': response.css('li.list-item a::attr(title)').getall(),
                'link': response.request.url,
                # rating
                # reviews
            }
        color_count = len(response.css('div.mini-slider li.list-item a::attr(href)').getall())
        if color_count > 1:
            additonal_colors = response.css('div.mini-slider li.list-item a::attr(href)').getall()
            color_index = 1
            for color in additonal_colors:
                if color_index <= color_count:
                    link = 'https://www2.hm.com' + color
                    yield scrapy.Request(link, self.parse_item)
                    color_index += 1
So to sum up: I want to change the scraped URL from
https://www2.hm.com/hu_hu/productpage.0906822002.html
to https://www2.hm.com/hmwebservices/service/product/hu/availability/0906822.json
and then return to
https://www2.hm.com/hu_hu/productpage.0906822002.html so my scraper can continue its work.
You can do something like this: if you make the JSON request after extracting all the item data, you don't have to return to the original function. (The colour-variation requests will still be created, since we're yielding the requests rather than returning.)
Try if this works for you:
import json
import scrapy
import re


class HMSpider(scrapy.Spider):
    name = 'hm'
    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        items = response.css('h3.item-heading a::attr(href)').getall()
        for item in items:
            link = 'https://www2.hm.com' + item
            yield scrapy.Request(link, self.parse_item)

    def parse_item(self, response):
        page_source_data = response.xpath('//div[@class= "tealiumProductviewtag productview parbase"]//text()')[0].get()
        data = response.css('div.product.parbase script::text').get()
        base_product_code = re.findall("ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall("articleCode':'(.*)', 'baseProductCode", data)[0]
        current_price = int(re.findall(r'\d+', re.findall('product_list_price : \["(.*?)\],', page_source_data)[0])[0])
        original_price = int(re.findall(r'\d+', re.findall('product_original_price : \[(.*?)\],', page_source_data)[0])[0])
        availability_url = 'https://www2.hm.com/hmwebservices/service/product/hu/availability/' + base_product_code + ".json"
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"
        if current_price != original_price:
            item = {
                'product_name': re.findall('(?<= ).*$', response.css('section.name-price h1.primary.product-item-headline::text').get())[0],
                'vendor': 'H&M',
                'current_price': int(current_price),
                'original_price': int(original_price),
                'discount_percent': 100 - round((current_price / original_price) * 100),
                'colors': response.css('li.list-item a::attr(title)').getall(),
                'link': response.request.url,
                # rating
                # reviews
            }
            if availability_url:
                yield scrapy.Request(
                    url=availability_url,
                    callback=self.parse_availability,
                    meta={
                        'item': item
                    }
                )
        color_count = len(response.css('div.mini-slider li.list-item a::attr(href)').getall())
        if color_count > 1:
            additonal_colors = response.css('div.mini-slider li.list-item a::attr(href)').getall()
            color_index = 1
            for color in additonal_colors:
                if color_index <= color_count:
                    link = 'https://www2.hm.com' + color
                    yield scrapy.Request(link, self.parse_item)
                    color_index += 1

    def parse_availability(self, response):
        item = response.meta.get('item')
        json_data = json.loads(response.body)
        # do something with the json data here and add it to item
        yield item
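As a side note, on Scrapy 1.7+ the same item hand-off can be written with cb_kwargs instead of meta. This is only a sketch; the structure of the availability JSON is not shown in the question, so the 'availability' key below is hypothetical.

            yield scrapy.Request(
                url=availability_url,
                callback=self.parse_availability,
                cb_kwargs={'item': item},
            )

    def parse_availability(self, response, item):
        data = json.loads(response.text)
        # Inspect the real JSON first; 'availability' is only a placeholder key.
        item['availability'] = data.get('availability')
        yield item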
I recently had a project crawling Google Play Store apps for the Vietnam region, and I realized that the callback function is not run for all of the URLs that are returned.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http.request import Request
from urllib.parse import urlparse, parse_qsl, urlencode
import scrapy


class GooglePlayStoreSpider(CrawlSpider):
    name = 'google_play'
    allowed_domains = ['play.google.com']
    start_urls = ['http://play.google.com']

    rules = (
        Rule(LinkExtractor(allow=('https://play.google.com/store/apps/details')), follow=True,
             process_links='process_links',
             callback='parse_1'),
    )

    crawled_ids = []
    first_init = False

    def parse_start_url(self, response):
        # print("-------------- PRINTING SECTION START_URL --------------")
        if not self.first_init:
            self.first_init = True
            extractor = LinkExtractor(allow=('/store/apps/category/.*',))
            raw_links = extractor.extract_links(response)
            links = self.process_links(raw_links)
            return [
                scrapy.Request('{}'.format(link.url))
                for link in links
            ]
        else:
            # print("============ START_URL ELSE PART ============")
            pass

    def process_links(self, links):
        new_links = []
        for link in links:
            old_url = link.url
            if not old_url.startswith('https://play.google.com/store/apps/'):
                continue
            old_url_obj = urlparse(old_url)
            old_url_query = dict(parse_qsl(old_url_obj.query))

            if old_url_obj.path == '/store/apps/details':
                if old_url_query['id'] in self.crawled_ids:
                    continue
                else:
                    self.crawled_ids.append(old_url_query['id'])
            old_url_query['hl'] = 'en'
            old_url_query['gl'] = 'vn'
            link.url = '{}://{}{}?{}'.format(old_url_obj.scheme, old_url_obj.netloc, old_url_obj.path,
                                             urlencode(old_url_query))
            new_links.append(link)
        # print("LINKKSSS ====", links)
        # print("NEW_LINKKSSS ====", new_links)
        # print("-------------- PRINTING SECTION PROCESS_LINKS --------------")
        return new_links

    def parse_1(self, response):
        selector = scrapy.Selector(response)
        urls = selector.xpath('//a[@class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb id-track-click "]/@href').extract()
        links = []
        for url in urls:
            if not url.startswith('https://play.google.com/'):
                url = "https://play.google.com" + url
            links.append(url)

        link_flag = 0
        for url in urls:
            # yield links_list.append(scrapy.Request(url, callback=self.parse_next, dont_filter=True))
            yield Request(links[link_flag], callback=self.parse_next, dont_filter=True)
            link_flag += 1

    def parse_next(self, response):
        # print("PARSE_NEXT ===========", response.request.url)
        selector = scrapy.Selector(response)
        app_urls = selector.xpath('//div[@class="details"]/a[@class="title"]/@href').extract()
        urls = []
        for url in app_urls:
            url = "https://play.google.com" + url + '&hl=en&gl=vn'
            urls.append(url)

        url_list = []
        link_flag = 0
        for url in app_urls:
            yield Request(urls[link_flag], callback=self.parse_detail, dont_filter=True)
            link_flag += 1
        # return url_list

    def parse_detail(self, response):
        print("Parsed ======= ", response.request.url)
        item = dict()
        item['name'] = response.xpath('//div[@itemscope]//meta[@itemprop="name"]/@content').extract_first()
        item['category'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="applicationCategory"]/@content').extract_first()
        item['review_score'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="ratingValue"]/@content').extract_first()
        item['review_count'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="reviewCount"]/@content').extract_first()
        item['link'] = response.request.url
        item['id'] = dict(parse_qsl(urlparse(response.request.url).query))['id']
        item['content_rating'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="contentRating"]/@content').extract_first()
        item['image'] = response.xpath('//div[@itemscope]//meta[@itemprop="image"]/@content').extract_first()
        item['price'] = response.xpath('//div[@itemscope]//meta[@itemprop="price"]/@content').extract_first()
        item['price_currency'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="priceCurrency"]/@content').extract_first()
        # item['operating_system'] = response.xpath('//div[@itemscope]//meta[@itemprop="operatingSystem"]/@content').extract_first()
        return item
When I run it in the terminal, it says that it crawled 100 pages but scraped only 15 pages (the numbers are estimates).
Please help.
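One thing worth checking first (a diagnostic sketch, not a diagnosis) is whether the missing pages are being dropped by Scrapy's duplicate filter rather than parsed. Setting DUPEFILTER_DEBUG makes the filter log every request it drops, instead of only the first duplicate:

    # hypothetical addition to the spider class (or to settings.py)
    custom_settings = {
        'DUPEFILTER_DEBUG': True,
    }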
I am actually very new to Scrapy and I'm not sure why I am not getting the information I want. I am using Scrapy on the website www.kayak.com and I want to extract the check-in and check-out times for all the hotels in New York. I have successfully scraped data from the page that the check-in and check-out times are on, but I could not scrape those two fields.
The code I have is shown below:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from hotel_crawl.items import HotelCrawlItem
from bs4 import BeautifulSoup
import time
import urlparse


class MySpider(CrawlSpider):
    name = "kayaksite"
    allowed_domains = ["www.kayak.com"]
    start_urls = ["http://www.kayak.com/New-York-Hotels.15830.hotel.ksp"]
    rules = (
        Rule(LinkExtractor(
            restrict_xpaths=("//a[@class='actionlink pagenumber'][contains(text(),'Next')]", )), callback="parse_item", follow=True),
    )
    def parse_start_url(self, response):
        print "test"
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = HotelCrawlItem()
        name = response.xpath("//a[@class='hotelname hotelresultsname']//text()").extract()
        price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[@class='pricerange']").extract()]
        review = response.xpath("//a[@class='reviewsoverview']/strong/text()").extract()
        url = response.xpath("//a[@class='hotelname hotelresultsname']//@href").extract()
        alldata = zip(name, price, review, url)
        for i in alldata:
            item['name'] = i[0]
            item['price'] = i[1]
            item['review'] = i[2]
            request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = HotelCrawlItem()
        name = response.xpath("//a[@class='hotelname hotelresultsname']//text()").extract()
        price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[@class='pricerange']").extract()]
        review = response.xpath("//a[@class='reviewsoverview']/strong/text()").extract()
        url = response.xpath("//a[@class='hotelname hotelresultsname']//@href").extract()
        alldata = zip(name, price, review, url)
        for i in alldata:
            item['name'] = i[0]
            item['price'] = i[1]
            item['review'] = i[2]
            request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
            request.meta['item'] = item
            yield request

    def parse_item2(self, response):
        print "test--------------"
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = response.meta['item']
        item['location'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[1]/text()").extract()
        item['postcode'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[3]/text()").extract()
        item['check_in'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
        item['check_out'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
        yield item
I am using Scrapy to scrape a website. I am getting all the products from the listing page. Now I want to go to each product's URL, but I am not getting a satisfactory result.
Here is my code:
import scrapy
from scrapy.http import Request
from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domain = ["test.com"]
    start_urls = [
        "http://www.test.com/?page=1"
    ]
    page_index = 1

    def parse(self, response):
        products = response.xpath('//li')
        items = []
        if products:
            for product in products:
                item = DmozItem()
                item['link'] = product.xpath('@data-url').extract()
                item['sku'] = product.xpath('@data-sku').extract()
                item['brand'] = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                item['img'] = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                page_url = "http://www.jabong.com/Lara-Karen-Black-Sweaters-893039.html"
                request = Request(url=page_url, callback=self.parse_page2,
                                  headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
                request.meta['item'] = item
                item['other'] = request
                yield item
        else:
            return

        self.page_index += 1
        if self.page_index:
            yield Request(url="http://www.test.com/?page=%s" % (self.page_index),
                          headers={"Referer": "http://www.test.com/", "X-Requested-With": "XMLHttpRequest"},
                          callback=self.parse)

    def parse_page2(self, response):
        item = response.meta['item']
        item['title'] = response.xpath("//span[@id='before_price']/text()")
        yield item
The result I am getting is
{"sku": [], "brand": [], "other": "<Request GET http://www.test.com/>", "link": [], "img": []},
Instead of the Request object, I need the data that I am returning from the parse_page2 function.
Where am I going wrong?
Your XPaths seem to be wrong here. Try this:
In [0]: products[0].xpath('./@data-url').extract()
Out[0]: [u'Sangria-Green-Kurtis-Kurtas-1081831.html']

In [1]: products[0].xpath('./a/@unbxdparam_sku').extract()
Out[1]: [u'SA038WA68OIXINDFAS']

In [2]: products[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
Out[2]: [u'Sangria']

In [3]: products[0].xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
Out[3]: [u'http://static14.jassets.com/p/Sangria-Green--Kurtis-26-Kurtas-5520-1381801-1-catalog.jpg']
So the code will be:
BASE_URL = 'http://www.jabong.com/'

for product in products:
    item = DmozItem()
    item_url = product.xpath('./@data-url').extract()
    item_url = self.BASE_URL + item_url[0] if item_url else ''
    item['link'] = product.xpath('./@data-url').extract()
    item['sku'] = product.xpath('./a/@unbxdparam_sku').extract()
    item['brand'] = product.xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
    item['img'] = product.xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
    if item_url:
        yield Request(url=item_url, callback=self.parse_page2,
                      headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"},
                      meta={'item': item})
EDIT
Complete spider code:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import Spider
from scrapy.http import Request


class JabongItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    link = scrapy.Field()
    sku = scrapy.Field()
    brand = scrapy.Field()
    img = scrapy.Field()


class JabongSpider(scrapy.Spider):
    name = "jabong"
    allowed_domains = ["jabong.com"]
    start_urls = ["http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"]
    page_index = 1
    BASE_URL = 'http://www.jabong.com/'

    def parse(self, response):
        products = response.xpath("//li[@data-url]")
        if products:
            for product in products:
                link = product.xpath('@data-url').extract()
                link = self.BASE_URL + link[0] if link else ''
                sku = product.xpath('@data-sku').extract()
                sku = sku[0].strip() if sku else 'n/a'
                brand = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                brand = brand[0].strip() if brand else 'n/a'
                img = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                img = img[0].strip() if img else 'n/a'
                item = JabongItem()
                item['link'] = link
                item['sku'] = sku
                item['brand'] = brand
                item['img'] = img
                if link:
                    yield Request(url=link, callback=self.parse_page2, meta={'item': item})
        else:
            return
        self.page_index += 1
        yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" % self.page_index,
                      callback=self.parse, dont_filter=True)

    def parse_page2(self, response):
        item = response.meta['item']
        # add whatever extra details you want to item
        yield item
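For reference, assuming the classes above are saved in a standalone script (the output filename below is only an example), the spider can also be run outside a Scrapy project with CrawlerProcess and the built-in feed export:

if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        'FEED_FORMAT': 'csv',            # export scraped items as CSV
        'FEED_URI': 'jabong_items.csv',  # example output file
    })
    process.crawl(JabongSpider)
    process.start()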