Getting Variant product details by sending POST requests in scrapy - python

I am trying to scrape a website for product details, and some of the products have variants. The product data changes when you click on a quantity, which sends a POST request that returns all the data. I am trying to make those requests in my spider, but it doesn't return any data. I also want to write each variant to its own row in a CSV file. The answers I have found online don't help me at all and I don't know what more to do. How do I make the POST request work, and how do I add the variant data to the output?
Here's the code:
import scrapy
import os
import json
from slugify import slugify

class GpSpider(scrapy.Spider):
    name = 'gp'
    start_urls = ['https://goldpet.pt/3-cao']
    # urls = ['https://goldpet.pt/3-cao','https://goldpet.pt/4-gato','https://goldpet.pt/7-roedor','https://goldpet.pt/6-ave','https://goldpet.pt/5-peixe','https://goldpet.pt/281-reptil']
    # for url in urls:
    #     start_urls.append(url)
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'

    def parse(self, response):
        products = response.css('h2.h3.product-title>a::attr(href)').extract()
        for product in products:
            product_link = product
            yield scrapy.Request(product_link, callback=self.parse_products)
        # next_page = response.css('a.next.js-search-link::attr(href)').get()
        # if next_page:
        #     yield scrapy.Request(next_page, callback=self.parse)

    def parse_products(self, response):
        image_links = []
        for img in response.css('img.js-qv-product-cover::attr(src)').getall():
            image_links.append(img)
        item = {}
        item['Title'] = response.css('h1.h1.product-title::text').get()
        item['Descrição'] = response.css('div.product-description>p::text').extract()
        item['Marca'] = response.css('img.img.img-thumbnail.manufacturer-logo::attr(alt)').get()
        if item['Marca'] is None:
            item['Marca'] = 'No brand'
        item['Quantidade'] = response.css('span.radio-label::text').extract()
        item['Idade'] = response.xpath('//dt[text()="Idade"]/following-sibling::dd/text()').get()
        item['Porte'] = response.xpath('//dt[text()="Porte"]/following-sibling::dd/text()').get()
        item['Características'] = response.xpath('//dt[text()="Características"]/following-sibling::dd/text()').get()
        item['Gama'] = response.xpath('//dt[text()="Gama"]/following-sibling::dd/text()').get()
        item['Alimento'] = response.xpath('//dt[text()="Alimento"]/following-sibling::dd/text()').get()
        item['ean13'] = response.xpath('//dt[text()="ean13"]/following-sibling::dd/text()').get()
        item['Price'] = response.css('div.current-price>span::text').get().replace('\xa0€', '').strip()
        item['product_url'] = response.url
        item['image_urls'] = image_links
        breadcrumbs = list(filter(None, map(str.strip, response.css('li[itemprop=itemListElement]>a>span::text').extract())))
        try:
            item['category'] = breadcrumbs[0]
        except:
            item['category'] = ''
        try:
            item['sub_category1'] = breadcrumbs[1]
        except:
            item['sub_category1'] = ''
        try:
            item['sub_category2'] = breadcrumbs[2]
        except:
            item['sub_category2'] = ''
        product_img = response.css('img.js-qv-product-cover::attr(src)').getall()
        item['img_urls'] = product_img[0]
        ext = item['img_urls'].split('?')[0].rsplit('.', 1)[-1]
        filename = slugify(item['Title']) + '_1.' + ext
        item['Photo_0'] = filename
        item['Photo_Path0'] = os.path.join('product images', 'images', item['Marca'], filename)
        for i in range(10):
            item[f'Photo_{i + 1}'] = ''
            item[f'Photo_Path_{i + 1}'] = ''
        for i, image in enumerate(product_img[1:]):
            ext = image.split('?')[0].rsplit('.', 1)[-1]
            filename = slugify(item['Title']) + f'_{i + 1}.{ext}'
            item[f'Photo_{i + 1}'] = filename
            item[f'Photo_Path_{i + 1}'] = os.path.join('product images', 'images', item['Marca'], filename)
        variants = response.css('div.products-variants')
        if variants:
            for variant in variants:
                var_item = item.copy()
                group = response.css('li.input-container.float-xs-left label input.input-radio::attr(value)').get()
                token = response.css('div.product-actions form#add-to-cart-or-refresh input::attr(value)').get()
                product_id = response.css('input#product_page_product_id::attr(value)').get()
                customization_id = response.css('input#product_customization_id::attr(value)').get()
                ajax_url = f'https://goldpet.pt/index.php?controller=product&token=d41d8cd98f00b204e9800998ecf8427e&id_product=19107&id_customization=0&group%5B8%5D={group}&qty=1'
                payload = {"controller": item['title'],
                           "token": token,
                           "id_product": product_id,
                           "id_customization": customization_id,
                           "group%5B8%5D": group,
                           "qty": '1'
                           }
                yield scrapy.Request(ajax_url, callback=self.parse_variants, method="POST", body=json.dumps(payload), headers={'Content-Type': 'application/x-www-form-urlencoded'})

    def parse_variants(self, response):
        yield json.loads(response.text)
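No accepted answer is reproduced here, but as a hedged sketch of one way to approach both parts of the question: PrestaShop's product controller expects form-encoded fields (which scrapy.FormRequest builds for you) rather than a JSON body, and yielding a merged copy of the item from the variant callback gives one CSV row per variant. The group[8] key below mirrors the group%5B8%5D parameter in the question's URL, and the JSON key names in the callback are assumptions that need to be checked against the real response:

import json
import scrapy

class VariantSketch(scrapy.Spider):
    # Hypothetical stand-in: these two methods are meant to replace the
    # variant loop and parse_variants in the GpSpider above.
    name = 'gp_variant_sketch'

    def build_variant_request(self, var_item, group, token, product_id, customization_id):
        # FormRequest sends application/x-www-form-urlencoded data and sets
        # the POST method for you, matching what the site's AJAX call sends.
        return scrapy.FormRequest(
            'https://goldpet.pt/index.php',
            formdata={
                'controller': 'product',
                'token': token,
                'id_product': product_id,
                'id_customization': customization_id or '0',
                # mirrors group%5B8%5D in the question's URL; the attribute
                # group id (8) is an assumption and may vary per product
                'group[8]': group,
                'qty': '1',
            },
            callback=self.parse_variant,
            cb_kwargs={'item': var_item},   # carry the copied item to the callback
        )

    def parse_variant(self, response, item):
        data = json.loads(response.text)    # the endpoint replies with JSON
        # key names below are assumptions; inspect the real response in devtools
        item['Quantidade'] = data.get('quantity')
        item['Price'] = data.get('price')
        yield item                          # one yielded dict per variant

Because the Scrapy feed exporter writes one row per yielded dict, yielding the merged copy from parse_variant (instead of yielding the base item once in parse_products) is what puts each variant on its own CSV row.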

Related

How to import the data (a list of URLs) from a CSV instead of using a search string?

I'm new to Python and I need to combine the first and second snippets, because I want to read the data (phones and ZIPs) from a list of URLs stored in a CSV. I have tried a lot.
This code is from Python 360 on YouTube:
# read csv with just url per line
with open('urls.csv') as file:
    start_urls = [line.strip() for line in file]

def start_request(self):
    request = Request(url = self.start_urls, callback=self.parse)
    yield request

def parse(self, response):
    html = response.body
    soup = BeautifulSoup(html, 'lxml')
    text = soup.get_text()
And this is from nageshsinghc4 (GitHub):
...
for row in df2.iterrows(): # Parse through each url in the list.
    try:
        try:
            req1 = Request(row[1]['URL'], headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
            gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23) # Bypass SSL certification verification
            f = urlopen(req1, context=gcontext)
            url_name = f.geturl() # extract URL name
            s = f.read()
            phone = re.findall(r"((?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4})", s) # Phone regex
            emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}", s) # Email regex
            if len(phone) == 0:
                print("No phone number found.")
                err_msg_phn = "No phone number found."
                phn_1.append((url_name, err_msg_phn))
            else:
                count = 1
                for item in phone:
                    phn_1.append((url_name, item))
                    count += 1
                print(phn_1)
            if len(emails) == 0:
                print("No email address found.")
                err_msg_mail = "No email address found."
                mail_1.append((url_name, err_msg_mail))
            else:
                count = 1
                for item in emails:
                    mail_1.append((url_name, item))
                    count += 1
                print(mail_1)
Thank you very much.
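As a hedged sketch of how the two snippets above could be combined into a single Scrapy spider (assuming urls.csv holds one URL per row; the phone and email regexes are taken from the second snippet):

import csv
import re
import scrapy

class PhoneEmailSpider(scrapy.Spider):
    # Hypothetical spider name and input file; adjust to your project.
    name = 'phone_email'

    def start_requests(self):
        # One URL per row in the CSV, read at crawl time.
        with open('urls.csv') as f:
            for row in csv.reader(f):
                if row:
                    yield scrapy.Request(row[0].strip(), callback=self.parse)

    def parse(self, response):
        text = response.text
        phones = re.findall(r"(?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4}", text)
        emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}", text)
        # One item per page; the feed exporter (e.g. -o results.csv) writes it out.
        yield {
            'url': response.url,
            'phones': phones or ['No phone number found.'],
            'emails': emails or ['No email address found.'],
        }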

Handling redirecting <301> from Indeed with Scrapy

I'm building a scraper for Indeed, primarily to practice on. I've set it up so that I extract details per 100 results on each page. Using the search query, I loop a seed list of cities and job types through an f-string of the Indeed URL. I store these results as a dictionary so that I can get the degree types as a column when the results are read into pandas.
My issue is that I keep getting Redirecting (301); I suppose that's because not all the links fulfil the salary requirement. Alternatively, I have included meta={'handle_httpstatus_list': [301]}, but then I get no results at all.
Here's my scraper:
class IndeedItem(scrapy.Item):
    job_title = Field(output_processor = TakeFirst())
    salary = Field(output_processor = TakeFirst())
    category = Field(output_processor = TakeFirst())
    company = Field(output_processor = TakeFirst())

class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    max_results_per_city = 1000
    #names = pd.read_csv("indeed_names.csv")
    #degree = pd.read_csv("degree_names2.csv",encoding='unicode_escape')
    names = pd.DataFrame({'names':['London', 'Manchester']})
    degree = pd.DataFrame({'degrees':['degree+Finance+£25','degree+Engineering+£25'], 'degree_type':['Finance', 'Engineering']})

    start_urls = defaultdict(list)
    for city in names.names:
        for qualification, name in zip(degree.degrees, degree.degree_type):
            start_urls[name].append(f'https://uk.indeed.com/jobs?q={qualification}%2C000&l={city}&fromage=7&filter=0&limit=100')

    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 2
    }

    def start_requests(self):
        for category, url in self.start_urls.items():
            for link in url:
                yield scrapy.Request(
                    link,
                    callback = self.parse,
                    #meta={'handle_httpstatus_list': [301]},
                    cb_kwargs = {
                        'page_count': 0,
                        'category': category
                    }
                )

    def parse(self, response, page_count, category):
        if page_count > 30:
            return
        indeed = response.xpath('//div[@id="mosaic-zone-jobcards"]//div')
        for jobs in indeed:
            loader = ItemLoader(IndeedItem(), selector = jobs)
            loader.add_value('category', category)
            loader.add_xpath('job_title', './/h2[@class="jobTitle jobTitle-color-purple jobTitle-newJob"]/span//text()')
            loader.add_xpath('salary', './/div[@class="salary-snippet"]/span//text()')
            loader.add_xpath('company', './/a/div[@class="slider_container"]/div[@class="slider_list"]/div[@class="slider_item"]/div[@class="job_seen_beacon"]/table[@class="jobCard_mainContent"]/tbody/tr/td[@class="resultContent"]/div[@class="heading6 company_location tapItem-gutter"]/pre/span[@class="companyName"]//text()')
            yield loader.load_item

        next_page = response.xpath('//ul[@class="pagination-list"]/li[5]/a//@href').get()
        page_count += 1
        if next_page is not None:
            yield response.follow(
                next_page,
                callback = self.parse,
                cb_kwargs = {
                    'page_count': page_count,
                    'category': category
                }
            )
I didn't get any 301 status, but the start_urls gave me problems and your XPath was off.
This fixes the XPath:
import scrapy
from pandas._libs.internals import defaultdict
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
import pandas as pd

class IndeedItem(scrapy.Item):
    job_title = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())
    company = Field(output_processor=TakeFirst())

class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 2
    }
    max_results_per_city = 1000
    # names = pd.read_csv("indeed_names.csv")
    # degree = pd.read_csv("degree_names2.csv",encoding='unicode_escape')
    names = pd.DataFrame({'names': ['London', 'Manchester']})
    degree = pd.DataFrame({'degrees': ['degree+Finance+£25,000', 'degree+Engineering+£25,000'], 'degree_type': ['Finance', 'Engineering']})
    start_urls = defaultdict(list)

    def start_requests(self):
        for city in self.names.names:
            for qualification, name in zip(self.degree.degrees, self.degree.degree_type):
                self.start_urls[name].append(f'https://uk.indeed.com/jobs?q={qualification}&l={city}&fromage=7&filter=0&limit=100')
        for category, url in self.start_urls.items():
            for link in url:
                yield scrapy.Request(
                    link,
                    callback=self.parse,
                    #meta={'handle_httpstatus_list': [301]},
                    cb_kwargs={
                        'page_count': 0,
                        'category': category
                    }
                )

    def parse(self, response, page_count, category):
        if page_count > 30:
            return
        indeed = response.xpath('//div[@class="slider_container"]')
        for jobs in indeed:
            loader = ItemLoader(IndeedItem(), selector=jobs)
            loader.add_value('category', category)
            loader.add_xpath('job_title', './/span[@title]//text()')
            loader.add_xpath('salary', './/div[@class="salary-snippet"]/span//text()')
            loader.add_xpath('company', './/span[@class="companyName"]//text()')
            yield loader.load_item()

        next_page = response.xpath('//ul[@class="pagination-list"]//li[last()]/a/@href').get()
        page_count += 1
        if next_page:
            yield response.follow(
                next_page,
                callback=self.parse,
                cb_kwargs={
                    'page_count': page_count,
                    'category': category
                }
            )
If you can give an example of a URL that redirects, I can try to help you.
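One aside on the 301 issue itself (a hedged note, not part of the answer above): passing meta={'handle_httpstatus_list': [301]} tells Scrapy to hand the raw 301 response to the callback instead of following it, which is why no job cards are found. Leaving RedirectMiddleware enabled (the default) is usually enough; if you do want to receive 301s yourself, a minimal sketch of following the Location header manually:

# Sketch only: this would sit at the top of the spider's parse() if
# handle_httpstatus_list is kept; otherwise omit both and let the default
# RedirectMiddleware follow redirects for you.
def parse(self, response, page_count, category):
    if response.status == 301:
        location = response.headers.get('Location', b'').decode()
        if location:
            yield response.follow(
                location,
                callback=self.parse,
                cb_kwargs={'page_count': page_count, 'category': category},
            )
        return
    # ... normal parsing of the job cards continues here ...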

Why does my web scraper only work half the time?

My goal is to get the product name and price from all Amazon pages detected on any website that I feed to my program.
My input is a text file containing five websites. On each of these websites, five to fifteen Amazon links are to be found.
My code is this:
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json
from urllib2 import Request, urlopen, HTTPError, URLError

def isdead(url):
    user_agent = 'Mozilla/20.0.1 (compatible; MSIE 5.5; Windows NT)'
    headers = { 'User-Agent':user_agent }
    req = Request(url, headers = headers)
    sleep(10)
    try:
        page_open = urlopen(req)
    except HTTPError, e:
        return e.code #404 if link is broken
    except URLError, e:
        return e.reason
    else:
        return False

def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers = headers)
    doc = html.fromstring(page.content)
    XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
    RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
    AVAILABILITY = ''.join(RAw_AVAILABILITY).strip()
    #re.... is a list. if empty, available. if not, unavailable.
    #return re.findall(r'Available from',AVAILABILITY[:30], re.IGNORECASE)
    if len(re.findall(r'unavailable',AVAILABILITY[:30],re.IGNORECASE)) == 1:
        return "unavailable"
    else:
        return "available"

file_name = raw_input("Enter file name: ")
filepath = "%s"%(file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    htmls = req.get(i)
    doc = SimplifiedDoc(htmls)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
all_links = [x for x in all_links if "amazon.com/gp/prime" not in x]
all_links = [y for y in all_links if "amazon.com/product-reviews" not in y]

for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    soup = BeautifulSoup(response.content, features="lxml")
    if isdead(i) == 404:
        print "DOES NOT EXIST"
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        pass
    else:
        title = soup.select("#productTitle")[0].get_text().strip()
        if check(i) == "unavailable":
            price = "UNAVAILABLE"
        else:
            if (len(soup.select("#priceblock_ourprice")) == 0) and (len(soup.select("#priceblock_saleprice")) == 0):
                price = soup.select("#a-offscreen")
            elif len(soup.select("#priceblock_ourprice")) == 0:
                price = soup.select("#priceblock_saleprice")
            else:
                price = soup.select("#priceblock_ourprice")
        print "TITLE:%s"%(title)
        print "PRICE:%s"%(price)
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
    print "..............................................."

print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
Whenever it works, the output looks something like this (please don't judge the PRICE output; I have spent so much time trying to fix it, but nothing works because I can't turn it into a string and get_text() doesn't work. This project is just for personal use, so it's not that important, but if you have suggestions, I'm very receptive to them.):
LINK:
https://www.amazon.com/dp/B007Y6LLTM/ref=as_li_ss_tl?ie=UTF8&linkCode=ll1&tag=lunagtkf1-20&linkId=ee8c5299508af57c815ea6577ede4244
TITLE:Moen 7594ESRS Arbor Motionsense Two-Sensor Touchless One-Handle Pulldown Kitchen Faucet Featuring Power Clean, Spot Resist Stainless
PRICE:[<span class="a-size-medium a-color-price priceBlockBuyingPriceString" id="priceblock_ourprice">$359.99</span>]
/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/
... and so on.
The error looks like this:
Traceback (most recent call last):
  File "name.py", line 75, in <module>
    title = soup.select("#productTitle")[0].get_text().strip()
IndexError: list index out of range
It's so weird, because the same text file is fed many times: sometimes all sites are scraped well, but sometimes the error appears at the 10th Amazon product, and sometimes at the 1st.
I suspect it's a bot-detection problem, but I have a header. What's the problem?
Your code is too messy. I've organized it for you; please check whether it works.
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests

file_name = raw_input("Enter file name: ")
filepath = "%s"%(file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    htmls = req.get(i)
    doc = SimplifiedDoc(htmls)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
    amazon_links = amazon_links.notContains(['amazon.com/gp/prime','amazon.com/product-reviews'],attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    if response.status_code == 404:
        print "DOES NOT EXIST"
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        pass
    else:
        html = response.text
        doc = SimplifiedDoc(html)
        title = doc.getElementByID("productTitle").text
        if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable')>0:
            price = "UNAVAILABLE"
        else:
            if doc.getElementByID("priceblock_ourprice"):
                price = doc.getElementByID("priceblock_ourprice").text
            elif doc.getElementByID("priceblock_saleprice"):
                price = doc.getElementByID("priceblock_saleprice").text
            else:
                price = doc.getElementByID("a-offscreen").text
        print "TITLE:%s"%(title)
        print "PRICE:%s"%(price)
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
    print "..............................................."

print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
You should learn more :) Here is an example of using the framework. There are more simplified_scrapy examples here.
If you need any help, please let me know.
from simplified_scrapy.spider import Spider, SimplifiedDoc

class MySpider(Spider):
    name = 'amazon-product'
    # allowed_domains = ['example.com']
    start_urls = []
    refresh_urls = True # For debug. If refresh_urls = True, start_urls will be crawled again.
    filepath = '' # Your file path
    if filepath:
        with open(filepath) as f:
            start_urls = [line.rstrip('\n') for line in f]

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        amazon_links = None
        data = None
        if url['url'].find('https://www.amazon.com')>=0 or url['url'].find('https://amzn.to')>=0:
            title = doc.getElementByID("productTitle").text
            if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable')>0:
                price = "UNAVAILABLE"
            else:
                if doc.getElementByID("priceblock_ourprice"):
                    price = doc.getElementByID("priceblock_ourprice").text
                elif doc.getElementByID("priceblock_saleprice"):
                    price = doc.getElementByID("priceblock_saleprice").text
                else:
                    price = doc.getElementByID("a-offscreen").text
            data = [{"title":title,'price':price}] # Get target data
            print "TITLE:%s"%(title)
            print "PRICE:%s"%(price)
            print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        else:
            amazon_links = doc.getElements('a')
            amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
            amazon_links = amazon_links.notContains(['amazon.com/gp/prime','amazon.com/product-reviews'],attr='href')
        return {"Urls": amazon_links, "Data": data} # Return data to framework

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(MySpider()) # Start crawling
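A final hedged note on the intermittent IndexError in the original code: when Amazon serves a robot-check or otherwise unexpected page, #productTitle is simply absent, so soup.select("#productTitle")[0] raises. A small guard inside the loop (a sketch in the question's Python 2 style, replacing the single title = ... line) makes the failure visible instead of fatal:

# Sketch: defensive lookup instead of indexing blindly.
title_nodes = soup.select("#productTitle")
if not title_nodes:
    print "BLOCKED OR UNEXPECTED PAGE LAYOUT - skipping"
    print i
    continue
title = title_nodes[0].get_text().strip()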

Scrape page using Python requests

I have some problems with web scraping, here is my code:
from bs4 import BeautifulSoup
import requests
import re
import csv
import argparse

def save_csv_file(filename, array):
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["item_name","item_price","item_category"])
        writer.writerows(array)

def process_data(name, price, category):
    item_name = name.text if name else 'NA'
    item_price = price.text if price else 'NA'
    item_category = category.text if category else 'NA'
    item_name = item_name.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_price = item_price.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_category = item_category.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    return (item_name, item_price, item_category)

def do_scrap(filename, url, payload, headers):
    # Request the URL with parameters and headers
    r = requests.post(url, payload, headers = headers, allow_redirects = True)
    if(r.status_code == 200):
        # Save response content in html variable
        html = r.content
        # Parse html variable with bs4
        parsed_html = BeautifulSoup(html, "html.parser")
        # Print document title
        print parsed_html.head.find('title').text
        # Find all of the HTML elements which are describing hotels
        tables = parsed_html.find_all("a", {"class" : "result-link"})
        # Print the number of the hotels
        print "Found %s records." % len(tables)
        # Empty helpers
        items = []
        count = 0
        # Loop over the HTML elements and print properties for each hotel
        for table in tables:
            name = table.find("h3", {"class" : "result-title"})
            price = table.find("p", {"class" : "price text-truncate"})
            category = table.find("p", {"class" : "merchant-name text-truncate"})
            items.append(process_data(name, price, category))
            count += 1
        if count > 0:
            # Save array with data to csv file
            save_csv_file(filename = filename, array = items)
            # Print end of job info
            print "\n%s records downloaded and saved to %s." % (count, filename)
    else:
        print "Code error: %s" % r.status_code

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument("-p","--product",required=True,help="Product name")
    ap.add_argument("-c","--category",default="",help="Product category")
    args = vars(ap.parse_args())
    product = args['product']
    category = args['category']
    payload = {
        'siteSearchQuery':product,
        'from':'colibri'
    }
    headers = {
        'Host':'www.kelkoo.co.uk',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate, sdch',
        'Accept-Language':'pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
    }
    url = "http://www.kelkoo.co.uk/ctl/do/search"
    filename = "%s_co_uk_kelkoo_data.csv" % product
    do_scrap(
        filename=filename,
        url=url,
        payload=payload,
        headers=headers)
After this request I get a different result than when I enter this:
www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri
into my web browser. What is causing this problem? Is it something related to page redirection?
I can see multiple things that will cause you to get different results:
You initiate a POST, not a GET. Look up the params argument of requests.get.
They use JavaScript to modify the page.
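To illustrate the first point (a sketch; the parameter values are copied from the question and the endpoint's current behaviour is not verified), the browser URL corresponds to a GET with query parameters rather than a POST body:

import requests

# Let requests build the query string so the request matches the browser URL.
params = {
    'siteSearchQuery': 'nokia 130',
    'from': 'colibri',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
}
r = requests.get('http://www.kelkoo.co.uk/ctl/do/search', params=params, headers=headers)
print(r.url)          # e.g. ...search?siteSearchQuery=nokia+130&from=colibri
print(r.status_code)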

how to extract certain string from URL

I am trying to extract certain strings from the URLs below.
Sample URLs:
http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1
I want to extract:
productCategory = "sarees" productSubCategory = ""
productCategory = "ladies" productSubCategory = "suits"
productCategory = "women" productSubCategory = "fashion-accessories"
And so on. I am writing a spider and I need to extract productCategory and productSubCategory from URLs like those above, so I am trying to extract these fields from response.url inside the parse method. Can someone help me out, please?
My code :
import re
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]
    URLSList = []
    for n in range(1, 100):
        URLSList.append('http://www.ladyblush.com/buy-sarees-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-ladies-suits-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-fashion-accessories.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-nightwear-lingerie-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-dress-online-skirts-suits-kurtis-tops.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html?p=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="third thumbnailSpillLarge"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['currency'] = 'INR'
            item['productCategory'] = [""]
            item['productSubCategory'] = [""]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [site.select('./a/@href').extract()[0].replace(" ", "%20")]
            productMRP = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="old-price"]//span[@class="price"]/text()').extract()
            productPrice = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="special-price"]//span[@class="price"]/text()').extract()
            if productMRP and productPrice:
                price = [productMRP[1].strip()] + [productPrice[1].strip()]
            else:
                price = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//span[@class="regular-price"]//span[@class="price"]/text()').extract()
            item['productPrice'] = price
            items.append(item)
            secondURL = item['productURL'][0]
            request = Request(secondURL, callback=self.parsePage2)
            request.meta['item'] = item
            yield request

    def parsePage2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        if hxs.select('//div[@class="addtocart-container"]/div/text()').extract():
            item['availability'] = False
        else:
            item['availability'] = True
        if hxs.select('//label[@class="required"]/text()').extract():
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False
        item['image_urls'] = list(set(item['productImage']))
        item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]', "", i.strip()) for i in hxs.select('//div[@class="std"]/text()').extract()])]
        item['productImage'] = item['productImage'] + hxs.select('//div[@class="more-views"]/ul/li/a/img/@src').extract() + hxs.select('//div[@class="more-views"]/ul/li/a/@href').extract()
        return item
#------------------------------------------------------------------------------
You can get the URL from response.url in the parse method. You could then parse that to get just the URL path:
import os
test = 'buy-women-fashion-accessories.html?p=1'
parts = os.path.splitext(test)
# ('buy-women-fashion-accessories', '.html?p=1')
parts[0].split('-')[1:]
# ['women', 'fashion', 'accessories']
This is a rather flimsy solution, though. Are you sure the data is not stored somewhere in the page's HTML that you are parsing, instead of in the URL?
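If the URL really is the only place the information lives, a slightly sturdier sketch (assuming the pattern shown in the question: a buy-{category}[-{subcategory}...] slug, optionally ending in -online, before .html) is to split the path explicitly:

from urllib.parse import urlparse  # Python 3; on the Python 2 Scrapy in the question, use: from urlparse import urlparse

def split_category(url):
    # 'http://www.ladyblush.com/buy-ladies-suits-online.html?p=1' -> ('ladies', 'suits')
    slug = urlparse(url).path.rsplit('/', 1)[-1]  # 'buy-ladies-suits-online.html'
    slug = slug.rsplit('.html', 1)[0]             # drop the .html extension
    parts = slug.split('-')
    if parts and parts[0] == 'buy':
        parts = parts[1:]                         # drop the leading 'buy'
    if parts and parts[-1] == 'online':
        parts = parts[:-1]                        # drop the trailing 'online'
    category = parts[0] if parts else ''
    sub_category = '-'.join(parts[1:])            # '' when there is no subcategory
    return category, sub_category

Called as split_category(response.url) inside parse, this returns ('women', 'fashion-accessories') for the third sample URL and ('sarees', '') for the first.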
