Python Scrapy unexpected indent error - python

We're trying to crawl items such as 'product', 'price', etc., but we keep getting an indentation error.
The code we're using (crawlproduct.py):
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from productcrawl.items import ProductCrawlItem

class MySpider(BaseSpider):
    name = "crawlproduct"
    allowed_domains = ["yorcom.nl"]
    f = open("items.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        events = hxs.select("//div[@class='productOverview']")
        items = []
        for event in events:
            item = ProductCrawlItem()
            item["product"] = events.select("table/tbody/tr/td[@class='productTitle']/a/text()").extract()
            item["price"] = events.select("table/tbody/tr/td[@class='productPrice']/a/text()").extract()
            item["stock"] = events.select("table/tbody/tr/td[@class='productStock voorraad']/a/text()").extract()
            item["link"] = events.select("table/tbody/tr/td[@class='productTitle']/a").extract()
            yield item
and items.py:
from scrapy.item import Item, Field

class ProductCrawlItem(Item):
    product = Field()
    price = Field()
    stock = Field()
    link = Field()
When we only use 1 field, it does work...
Does anyone know the problem?
Thanks in advance,
Dean

With the following indentation, this is probably what you intended (note too that inside the loop you should select from event, not events, otherwise every item collects the data for all rows):
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from productcrawl.items import ProductCrawlItem

class MySpider(BaseSpider):
    name = "crawlproduct"
    allowed_domains = ["yorcom.nl"]
    f = open("items.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        events = hxs.select("//div[@class='productOverview']")
        for event in events:
            item = ProductCrawlItem()
            # Select relative to the current row (event), not the full result set (events).
            item["product"] = event.select("table/tbody/tr/td[@class='productTitle']/a/text()").extract()
            item["price"] = event.select("table/tbody/tr/td[@class='productPrice']/a/text()").extract()
            item["stock"] = event.select("table/tbody/tr/td[@class='productStock voorraad']/a/text()").extract()
            item["link"] = event.select("table/tbody/tr/td[@class='productTitle']/a").extract()
            yield item
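One more tidy-up worth considering: opening items.txt at class level works, but a with block closes the file even if reading fails partway. A minimal sketch of the same class attributes, reading the same items.txt used above:

class MySpider(BaseSpider):
    name = "crawlproduct"
    allowed_domains = ["yorcom.nl"]
    # One start URL per line; the file is closed automatically
    # when the with block ends.
    with open("items.txt") as f:
        start_urls = [url.strip() for url in f]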

Related

Get price from XHR and Combine Scrapy

I have to scrape data (name, price, description, brand, ...) from this website: https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww%7Cnew+in%7Cnew+products%7Cclothing
My code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class TestcrawlSpider(CrawlSpider):
    name = 'testcrawl'

    def remove_characters(self, value):
        return value.strip('\n')

    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//article[@class='_2qG85dG']/a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths="//a[@class='_39_qNys']")),
    )

    def parse_item(self, response):
        yield {
            'name': response.xpath("//div[@class='product-hero']/h1/text()").get(),
            'price': response.xpath("//span[@data-id='current-price']").get(),
            'description': response.xpath("//div[@class='product-description']/ul/li/text()").getall(),
            'about_me': response.xpath("//div[@class='about-me']//text()").getall(),
            'brand_description': response.xpath("//div[@class='brand-description']/p/text()").getall()
        }
However, due to JavaScript I cannot get the price; I need to get it through XHR.
My code for getting the price of only one item in the list is as follows:
import scrapy
import json

class AsosSpider(scrapy.Spider):
    name = 'asos'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=200369183&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28']

    def parse(self, response):
        # print(response.body)
        resp = json.loads(response.text)[0]
        price = resp.get('productPrice').get('current').get('text')
        print(price)
        yield {
            'price': price
        }
Here, my start_urls is the XHR request URL, and it changes for each item:
Item 1: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=23443988&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28
Item 2: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=22495685&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28
Only the productIds change!
How can I insert the second spider into the first so it gets the price as well?
Thanks!
pix
items.py:
import scrapy

class AsosItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    about_me = scrapy.Field()
    brand_description = scrapy.Field()
As I said in your last post, I have a problem with this website on my computer for some reason, but you need to do something like this:
import re
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import AsosItem

class TestcrawlSpider(CrawlSpider):
    name = 'testcrawl'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//article[@class='_2qG85dG']/a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths="//a[@class='_39_qNys']")),
    )

    def remove_characters(self, value):
        return value.strip('\n')

    def parse_item(self, response):
        # The stock-price API URL is embedded in the page's inline JavaScript.
        price_url = 'https://www.asos.com' + re.search(r'window.asos.pdp.config.stockPriceApiUrl = \'(.+)\'', response.text).group(1)
        item = AsosItem()
        item['name'] = response.xpath("//div[@class='product-hero']/h1/text()").get()
        item['description'] = response.xpath("//div[@class='product-description']/ul/li/text()").getall()
        item['about_me'] = response.xpath("//div[@class='about-me']//text()").getall()
        item['brand_description'] = response.xpath("//div[@class='brand-description']/p/text()").getall()
        # Hand the partially filled item to the price callback via meta.
        request = scrapy.Request(url=price_url, callback=self.parse_price)
        request.meta['item'] = item
        return request

    def parse_price(self, response):
        jsonresponse = response.json()[0]
        price = jsonresponse['productPrice']['current']['text']
        item = response.meta['item']
        item['price'] = price
        return item
Test the code; if it doesn't work, take the general idea and tweak it a bit, since I can't test it myself.
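If the spider does run, a quick way to inspect the combined output is Scrapy's built-in feed export, e.g.:

scrapy crawl testcrawl -o products.json

(products.json is just an example path; a .csv path works the same way.)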

Scrapy pipeline extracting in the wrong csv format

My Hacker News spider outputs all the results on one line instead of one per line, as can be seen here.
Here is my code.
import scrapy
import string
import urlparse
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors import LinkExtractor

class HnItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    score = scrapy.Field()

class HnSpider(scrapy.Spider):
    name = 'hackernews'
    allowed_domains = ["news.ycombinator.com"]
    start_urls = ["https://news.ycombinator.com/"]

    def parse(self, response):
        sel = response
        selector_list = response.xpath('.//table[@class="itemlist"]')
        for sel in selector_list:
            item = HnItem()
            item['title'] = sel.xpath('.//td[@class="title"]/text()').extract()
            item['link'] = sel.xpath('.//tr[@class="athing"]/td[3]/a/@href').extract()
            item['score'] = sel.xpath('.//td[@class="subtext"]/span/text()').extract()
            yield item
and my settings.py file:
BOT_NAME = 'hnews'
SPIDER_MODULES = ['hnews.spiders']
NEWSPIDER_MODULE = 'hnews.spiders'
USER_AGENT = 'hnews (+http://www.yourdomain.com)'
FEED_URI = '/used/scrapy/hnews/%(name)s/%(time)s.csv'
FEED_FORMAT = 'csv'
I've tried to implement this among many other solutions but no luck so far. I'm still very new at this, so bear with me if possible.
It is happening because your item pipeline is getting all the lists at once. For example, item['title'] receives a list of all the titles at once, which is then passed to the item pipeline and written to the CSV file directly.
The solution is to iterate over the lists and yield one item at a time to the pipeline. Here's the modified code:
import scrapy
from scrapy.selector import Selector

class HnItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    score = scrapy.Field()

class HnSpider(scrapy.Spider):
    name = 'hackernews'
    allowed_domains = ["news.ycombinator.com"]
    start_urls = ["https://news.ycombinator.com/"]

    def parse(self, response):
        sel = Selector(response)
        title_list = sel.xpath('.//td[@class="title"]/a/text()').extract()[:-2]
        link_list = sel.xpath('.//tr[@class="athing"]/td[3]/a/@href').extract()
        score_list = sel.xpath('.//td[@class="subtext"]/span/text()').extract()
        for x in range(0, len(title_list)):
            # Build a fresh item per row so each one is yielded separately.
            item = HnItem()
            item['title'] = title_list[x]
            item['link'] = link_list[x]
            item['score'] = score_list[x]
            yield item
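If you prefer, the index-based loop can be written with zip, which also stops at the shortest list if the three lists ever get out of step; a sketch with the same fields as above:

for title, link, score in zip(title_list, link_list, score_list):
    item = HnItem()
    item['title'] = title
    item['link'] = link
    item['score'] = score
    yield item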

Python: How to append a string to a scrapy list item?

I'm scraping a collection of URLs, but they all lack the base of the URL, so I want to prepend the "start_url" as a base to each scraped URL.
Spider class:
class MySpider(BaseSpider):
    name = "teslanews"
    allowed_domains = ["teslamotors.com"]
    start_urls = ["http://www.teslamotors.com/blog"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        updates = hxs.xpath('//div[@class="blog-wrapper no-image"]')
        items = []
        for article in updates:
            item = TeslanewsItem()
            item["date"] = article.xpath('./div/span/span/text()').extract()
            item["title"] = article.xpath('./h2/a/text()').extract()
            item["url"] = article.xpath('./h2/a/@href').extract()
            items.append(item)
        return items
I can't do a simple item["url"] = article.xpath('./h2/a/@href').extract() + base with base = "http://www.teslamotors.com",
because this adds the base at the end, and since extract() returns a list, the concatenation inside the for-loop happens letter by letter, with each letter separated by commas.
I'm relatively new to Scrapy, so I don't exactly know which way to go with this.
from scrapy.spider import BaseSpider
from urlparse import urljoin
# TeslanewsItem comes from your project's items module

class MySpider(BaseSpider):
    name = "teslanews"
    allowed_domains = ["teslamotors.com"]
    base = "http://www.teslamotors.com/blog"
    start_urls = ["http://www.teslamotors.com/blog"]

    def parse(self, response):
        updates = response.xpath('//div[@class="blog-wrapper no-image"]')
        items = []
        for article in updates:
            item = TeslanewsItem()
            item["date"] = article.xpath('./div/span/span/text()').extract()
            item["title"] = article.xpath('./h2/a/text()').extract()
            # Join the extracted list into one string, then resolve it
            # against the base URL.
            item['url'] = urljoin(self.base, ''.join(article.xpath('./h2/a/@href').extract()))
            items.append(item)
        return items
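urljoin is doing the real work here: it resolves the extracted href against the base instead of blindly concatenating strings. A couple of illustrative calls (the paths are made up):

from urlparse import urljoin  # urllib.parse.urljoin on Python 3

urljoin("http://www.teslamotors.com/blog", "/blog/some-post")
# -> 'http://www.teslamotors.com/blog/some-post'
urljoin("http://www.teslamotors.com/blog", "http://example.com/x")
# -> 'http://example.com/x' (an already-absolute href is returned unchanged)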

Using Regex to Select Index Page Only

Is there a regex to select only the index page when crawling a specific website? I'm selecting certain pages but also need the index page on top of those.
I can't seem to figure out the proper way to put it. Basically, I want to crawl the index page, contact page, about page, and advertise page to look for contact information.
Here is the code.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    desc = Field()
    title = Field()
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "marksey"
    parsed_hostnames = set()

    rules = [
        Rule(SgmlLinkExtractor(allow=(r'/contact', r'/about', r'/advertise',)), callback='parse_item', follow=True)
    ]
    ### r'^https?://[^/]+(/(\?.*|index\.php(\?.*)?)?)?$',

    start_urls = []
    allowed_domains = []
    with open(r'C:\Users\Vasily\MyStuff\emailtest\emailtest\scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[0].strip()
            if url != "":
                start_urls.append(url)
                hostname = urlparse(url).hostname
                allowed_domains.append(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for sel in response.xpath('//html/head'):
            item = MailItem()
            item['title'] = sel.xpath('title/text()').extract()
            item['desc'] = sel.xpath("//meta[@name='description']/@content").extract()
            item['url'] = response.url
            item['mail'] = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
            if not item['mail']:
                item['mail'] = item['url']
            items.append(item)
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items

    def process_links(self, links):
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
What you need to do is call the parse_item() callback from parse_start_url() - this way you would also parse the URLs coming from start_urls, which I am assuming are index pages:
class MailSpider(CrawlSpider):
    ...

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        ...
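If you also want a crawl rule that matches index pages when they are linked from elsewhere on the site, a pattern along the lines of the one commented out in the question should work; a sketch (the regex is an assumption, adjust it to your URLs):

# Matches bare hostnames, optional query strings, and index.php variants.
Rule(SgmlLinkExtractor(allow=(r'^https?://[^/]+(/(\?.*|index\.php(\?.*)?)?)?$',)), callback='parse_item')

Note this only fires for index links discovered during the crawl; the landing pages from start_urls are still handled by parse_start_url above.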
See also:
Scrapy CrawlSpider doesn't crawl the first landing page

scrapy: A tiny "spider" in a spider?

When I try to scrape product review info from epinions.com, if the main review text is too long, it has a "read more" link to another page.
I took an example from "http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1" - you'll see what I mean if you look at the first review.
I am wondering: is it possible to have a tiny spider in each iteration of the for loop to grab the url and scrape the review out of the new link? I have the following code, but it doesn't work for the tiny "spider".
Here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from epinions_test.items import EpinionsTestItem
from scrapy.http import Response, HtmlResponse

class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        items = []
        for sites in sites:
            item = EpinionsTestItem()
            item["title"] = sites.select('h2/a/text()').extract()
            item["star"] = sites.select('span/a/span/@title').extract()
            item["date"] = sites.select('span/span/span/@title').extract()
            item["review"] = sites.select('p/span/text()').extract()
            # Everything works fine and I do have those four columns beautifully printed out, until....
            url2 = sites.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url2)[3:-2])
            # This url is a string. When I print it out, it's like "http://www.epinions.com/review/samsung-galaxy-note-16-gb-cell-phone/content_624031731332", which looks legit.
            response2 = HtmlResponse(url)
            # I tried in a scrapy shell, it shows that this is an HtmlResponse...
            hxs2 = HtmlXPathSelector(response2)
            fullReview = hxs2.select('//div[@class="user_review_full"]')
            item["url"] = fullReview.select('p/text()').extract()
            # The three lines above work in an independent spider, where start_url is changed to the url just generated.
            # However, I got nothing from item["url"] in this code.
            items.append(item)
        return items
Why does item["url"] return nothing?
Thanks!
You should instantiate a new Request in the callback and pass your item in the meta dict. (Constructing an HtmlResponse by hand only creates an empty response object; Scrapy never downloads that URL, so your selectors have nothing to match.)
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class EpinionsTestItem(Item):
    title = Field()
    star = Field()
    date = Field()
    review = Field()

class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        for site in sites:
            item = EpinionsTestItem()
            item["title"] = site.select('h2/a/text()').extract()
            item["star"] = site.select('span/a/span/@title').extract()
            item["date"] = site.select('span/span/span/@title').extract()
            url = site.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url)[3:-2])
            # Hand the half-filled item to the next callback via meta.
            yield Request(url=url, callback=self.parse_url2, meta={'item': item})

    def parse_url2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        fullReview = hxs.select('//div[@class="user_review_full"]')
        item["review"] = fullReview.select('p/text()').extract()
        yield item
Also see documentation.
Hope that helps.
