Scrapy yielding items and requesting links consecutively - python

My spider starts on this page https://finviz.com/screener.ashx and visits every link in the table to yield some items on the other side. This worked perfectly fine. I then wanted to add another layer of depth by having my spider visit a link on each page it initially visits, like so:
start_urls > url > url_2
The spider is supposed to visit "url", yield some items along the way, then visit "url_2", yield a few more items, and then move on to the next url from start_urls.
Here is my spider code:
import scrapy
from scrapy import Request
from dimstatistics.items import DimstatisticsItem

class StatisticsSpider(scrapy.Spider):
    name = 'statistics'

    def __init__(self):
        self.start_urls = ['https://finviz.com/screener.ashx?v=111&f=ind_stocksonly&r=01']
        npagesscreener = 1000
        for i in range(1, npagesscreener + 1):
            self.start_urls.append("https://finviz.com/screener.ashx?v=111&f=ind_stocksonly&r=" + str(i) + "1")

    def parse(self, response):
        for href in response.xpath("//td[contains(@class, 'screener-body-table-nw')]/a/@href"):
            url = "https://www.finviz.com/" + href.extract()
            yield Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        item = {}
        # extract all snapshot-table2 cells once, then index into the list
        cells = response.xpath("//table[contains(@class, 'snapshot-table2')]/tr/td/descendant::text()").extract()
        item['statisticskey'] = response.xpath("//a[contains(@class, 'fullview-ticker')]//text()").extract()[0]
        item['shares_outstanding'] = cells[9]
        item['shares_float'] = cells[21]
        item['short_float'] = cells[33]
        item['short_ratio'] = cells[45]
        item['institutional_ownership'] = cells[7]
        item['institutional_transactions'] = cells[19]
        item['employees'] = cells[97]
        item['recommendation'] = cells[133]
        yield item

        url2 = response.xpath("//table[contains(@class, 'fullview-links')]//a/@href").extract()[0]
        yield response.follow(url2, callback=self.parse_dir_stats)

    def parse_dir_stats(self, response):
        item = {}
        item['effective_tax_rate_ttm_company'] = response.xpath("//tr[td[normalize-space()='Effective Tax Rate (TTM)']]/td[2]/text()").extract()
        item['effective_tax_rate_ttm_industry'] = response.xpath("//tr[td[normalize-space()='Effective Tax Rate (TTM)']]/td[3]/text()").extract()
        item['effective_tax_rate_ttm_sector'] = response.xpath("//tr[td[normalize-space()='Effective Tax Rate (TTM)']]/td[4]/text()").extract()
        item['effective_tax_rate_5_yr_avg_company'] = response.xpath("//tr[td[normalize-space()='Effective Tax Rate - 5 Yr. Avg.']]/td[2]/text()").extract()
        item['effective_tax_rate_5_yr_avg_industry'] = response.xpath("//tr[td[normalize-space()='Effective Tax Rate - 5 Yr. Avg.']]/td[3]/text()").extract()
        item['effective_tax_rate_5_yr_avg_sector'] = response.xpath("//tr[td[normalize-space()='Effective Tax Rate - 5 Yr. Avg.']]/td[4]/text()").extract()
        yield item
All of the xpaths and links are right; I just can't seem to yield anything at all now. I have a feeling there is an obvious mistake here. This is my first try at a more elaborate spider.
Any help would be greatly appreciated! Thank you!
*** EDIT 2
{'statisticskey': 'AMRB', 'shares_outstanding': '5.97M', 'shares_float':
'5.08M', 'short_float': '0.04%', 'short_ratio': '0.63',
'institutional_ownership': '10.50%', 'institutional_transactions': '2.74%',
'employees': '101', 'recommendation': '2.30'}
2019-03-06 18:45:19 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.finviz.com/quote.ashx?t=AMR&ty=c&p=d&b=1>
{'statisticskey': 'AMR', 'shares_outstanding': '154.26M', 'shares_float':
'89.29M', 'short_float': '13.99%', 'short_ratio': '4.32',
'institutional_ownership': '0.10%', 'institutional_transactions': '-',
'employees': '-', 'recommendation': '3.00'}
2019-03-06 18:45:19 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.finviz.com/quote.ashx?t=AMD&ty=c&p=d&b=1>
{'statisticskey': 'AMD', 'shares_outstanding': '1.00B', 'shares_float':
'997.92M', 'short_float': '11.62%', 'short_ratio': '1.27',
'institutional_ownership': '0.70%', 'institutional_transactions': '-83.83%',
'employees': '10100', 'recommendation': '2.50'}
2019-03-06 18:45:19 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.finviz.com/quote.ashx?t=AMCX&ty=c&p=d&b=1>
{'statisticskey': 'AMCX', 'shares_outstanding': '54.70M', 'shares_float':
'43.56M', 'short_float': '20.94%', 'short_ratio': '14.54',
'institutional_ownership': '3.29%', 'institutional_transactions': '0.00%',
'employees': '1872', 'recommendation': '3.00'}
2019-03-06 18:45:19 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.finviz.com/screener.ashx?v=111&f=geo_bermuda>
{'effective_tax_rate_ttm_company': [], 'effective_tax_rate_ttm_industry':
[], 'effective_tax_rate_ttm_sector': [],
'effective_tax_rate_5_yr_avg_company': [],
'effective_tax_rate_5_yr_avg_industry': [],
'effective_tax_rate_5_yr_avg_sector': []}
2019-03-06 18:45:25 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.finviz.com/screener.ashx?v=111&f=geo_china>
{'effective_tax_rate_ttm_company': [], 'effective_tax_rate_ttm_industry':
[], 'effective_tax_rate_ttm_sector': [],
'effective_tax_rate_5_yr_avg_company': [],
'effective_tax_rate_5_yr_avg_industry': [],
'effective_tax_rate_5_yr_avg_sector': []}
*** EDIT 3
Managed to actually have the spider travel to url2 and yield the items there. The problem is it only does it rarely. Most of the time it redirects to the correct link and gets nothing, or doesn't seem to redirect at all and continues on. Not really sure why there is such inconsistency here.
2019-03-06 20:11:57 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.reuters.com/finance/stocks/financial-highlights/BCACU.A>
{'effective_tax_rate_ttm_company': ['--'],
'effective_tax_rate_ttm_industry': ['4.63'],
'effective_tax_rate_ttm_sector': ['20.97'],
'effective_tax_rate_5_yr_avg_company': ['--'],
'effective_tax_rate_5_yr_avg_industry': ['3.98'],
'effective_tax_rate_5_yr_avg_sector': ['20.77']}
The other thing is, I know I've managed to yield a few values on url2 successfully, though they don't appear in my CSV output. I realize this could be an export issue. I updated my code to how it is currently.

url2 is a relative path, but scrapy.Request expects a full URL.
Try this:
yield Request(
    response.urljoin(url2),
    callback=self.parse_dir_stats)
Or even simpler:
yield response.follow(url2, callback=self.parse_dir_stats)
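response.follow (available since Scrapy 1.4) accepts relative URLs and does the join for you. For reference, response.urljoin resolves a relative href against the URL of the page currently being parsed; a minimal illustration with made-up values:

# Illustrative only: suppose the page currently being parsed is
# response.url == 'https://www.finviz.com/quote.ashx?t=AMD'
response.urljoin('/screener.ashx?v=111')
# -> 'https://www.finviz.com/screener.ashx?v=111'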

Related

Scrapy Parse function not passing found values to the parse_page2 function

I am trying to scrape the PlayStation web store for the title and game link from the main page, and the price for each game from its second page. However, when using a callback to parse_page2, all the returned items contain the title and item['link'] value of the most recent item (The Last of Us Remastered).
My code below:
class PsStoreSpider(scrapy.Spider):
    name = 'psstore'
    start_urls = ['https://store.playstation.com/en-ie/pages/browse']

    def parse(self, response):
        item = PlaystationItem()
        products = response.css('a.psw-link')
        for product in products:
            item['main_url'] = response.url
            item['title'] = product.css('span.psw-t-body.psw-c-t-1.psw-t-truncate-2.psw-m-b-2::text').get()
            item['link'] = 'https://store.playstation.com' + product.css('a.psw-link.psw-content-link').attrib['href']
            link = 'https://store.playstation.com' + product.css('a.psw-link.psw-content-link').attrib['href']
            request = Request(link, callback=self.parse_page2)
            request.meta['item'] = item
            yield request

    def parse_page2(self, response):
        item = response.meta['item']
        item['price'] = response.css('span.psw-t-title-m::text').get()
        item['other_url'] = response.url
        yield item
And part of the output:
2022-05-09 19:54:16 [scrapy.core.scraper] DEBUG: Scraped from <200 https://store.playstation.com/en-ie/concept/229261>
{'link': 'https://store.playstation.com/en-ie/concept/228638',
'main_url': 'https://store.playstation.com/en-ie/pages/browse',
'other_url': 'https://store.playstation.com/en-ie/concept/229261',
'price': 'Free',
'title': 'The Last of Us™ Remastered'}
2022-05-09 19:54:16 [scrapy.core.scraper] DEBUG: Scraped from <200 https://store.playstation.com/en-ie/concept/232847>
{'link': 'https://store.playstation.com/en-ie/concept/228638',
'main_url': 'https://store.playstation.com/en-ie/pages/browse',
'other_url': 'https://store.playstation.com/en-ie/concept/232847',
'price': '€59.99',
'title': 'The Last of Us™ Remastered'}
2022-05-09 19:54:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://store.playstation.com/en-ie/concept/224802> (referer: https://store.playstation.com/en-ie/pages/browse)
2022-05-09 19:54:16 [scrapy.core.scraper] DEBUG: Scraped from <200 https://store.playstation.com/en-ie/concept/224802>
{'link': 'https://store.playstation.com/en-ie/concept/228638',
'main_url': 'https://store.playstation.com/en-ie/pages/browse',
'other_url': 'https://store.playstation.com/en-ie/concept/224802',
'price': '€29.99',
'title': 'The Last of Us™ Remastered'}
As you can see, the price is correctly returned, but the title and link are taken from the last scraped object. What am I missing here?
Thanks
The thing is that you create your item at the beginning of your parse method and then update it over and over. That also means that you always pass the same item to parse_page2.
If you were to create your item in the for-loop you would get a new one in every iteration and should get the expected result.
Like this:
def parse(self, response):
    products = response.css('a.psw-link')
    for product in products:
        item = PlaystationItem()
        item['main_url'] = response.url
        item['title'] = product.css('span.psw-t-body.psw-c-t-1.psw-t-truncate-2.psw-m-b-2::text').get()
        item['link'] = 'https://store.playstation.com' + product.css('a.psw-link.psw-content-link').attrib['href']
        link = 'https://store.playstation.com' + product.css('a.psw-link.psw-content-link').attrib['href']
        request = Request(link, callback=self.parse_page2)
        request.meta['item'] = item
        yield request
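As a side note, if you are on Scrapy 1.7 or newer, cb_kwargs is a cleaner way to hand the item to the callback than request.meta. A minimal sketch of the same fix using it (selectors and PlaystationItem are taken from the question):

def parse(self, response):
    for product in response.css('a.psw-link'):
        # a fresh item per iteration, exactly as in the fix above
        item = PlaystationItem()
        item['main_url'] = response.url
        item['title'] = product.css('span.psw-t-body.psw-c-t-1.psw-t-truncate-2.psw-m-b-2::text').get()
        link = 'https://store.playstation.com' + product.css('a.psw-link.psw-content-link').attrib['href']
        item['link'] = link
        # entries in cb_kwargs are passed to the callback as keyword arguments
        yield Request(link, callback=self.parse_page2, cb_kwargs={'item': item})

def parse_page2(self, response, item):
    item['price'] = response.css('span.psw-t-title-m::text').get()
    item['other_url'] = response.url
    yield item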

Craigslist Scraper using Scrapy Spider not performing functions

2021-05-07 10:07:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/robots.txt> (referer: None)
2021-05-07 10:07:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/d/cell-phones/search/moa/> (referer: None)
2021-05-07 10:07:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/d/cell-phones/search/moa?s=120> (referer: https://tampa.craigslist.org/d/cell-phones/search/moa/)
2021-05-07 10:07:21 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/d/cell-phones/search/moa?s=240> (referer: https://tampa.craigslist.org/d/cell-phones/search/moa?s=120)
This is the output I get; it seems like it just moves through the pages of results, performed by selecting the next button and making a request (line 27 of my code).
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, Request
from craig.items import CraigItem
from scrapy.selector import Selector

class PhonesSpider(scrapy.Spider):
    name = 'phones'
    allowed_domains = ['tampa.craigslist.org']
    start_urls = ['https://tampa.craigslist.org/d/cell-phones/search/moa/']

    def parse(self, response):
        phones = response.xpath('//p[@class="result-info"]')
        for phone in phones:
            relative_url = phone.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)
            title = phone.xpath('a/text()').extract_first()
            price = phone.xpath('//*[@id="sortable-results"]/ul/li[3]/a/span').extract_first()
            yield Request(absolute_url, callback=self.parse_item, meta={'URL': absolute_url, 'Title': title, 'price': price})
        relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        absolute_next_url = "https://tampa.craigslist.org" + relative_next_url
        yield Request(absolute_next_url, callback=self.parse)

    def parse_item(self, response):
        item = CraigItem()
        item["cl_id"] = response.meta.get('Title')
        item["price"] = response.meta.get
        absolute_url = response.meta.get('URL')
        yield {'URL': absolute_url, 'Title': title, 'price': price}
Seems like in my code the for phone in phones loop doesn't run, which results in parse_item never running, and it just continues to request the next url. I am following some tutorials and reading documentation, but I'm still having trouble grasping what I am doing wrong. I have experience with coding Arduinos as a hobby when I was young, but no professional coding experience; this is my first foray into a project like this. I have an OK grasp of the basics of loops, functions, callbacks, etc.
Any help is greatly appreciated.
UPDATE
Current output:
2021-05-07 15:29:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/robots.txt> (referer: None)
2021-05-07 15:29:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/d/cell-phones/search/moa/> (referer: None)
2021-05-07 15:29:33 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://tampa.craigslist.org/hil/mob/d/tampa-cut-that-high-cable-bill-switch/7309734640.html> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2021-05-07 15:29:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/hil/mob/d/tampa-cut-that-high-cable-bill-switch/7309734640.html> (referer: https://tampa.craigslist.org/d/cell-phones/search/moa/)
2021-05-07 15:29:36 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tampa.craigslist.org/hil/mob/d/tampa-cut-that-high-cable-bill-switch/7309734640.html>
{'cl_id': 'postid_7309734640',
'price': '$35',
'title': 'Cut that high cable bill, switch to SPC TV and save. 1400 hd '
'channels',
'url': 'https://tampa.craigslist.org/hil/mob/d/tampa-cut-that-high-cable-bill-switch/7309734640.html'}
2021-05-07 15:29:36 [scrapy.core.engine] INFO: Closing spider (finished)
CURRENT CODE
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, Request
from craig.items import CraigItem
from scrapy.selector import Selector

class PhonesSpider(scrapy.Spider):
    name = 'phones'
    allowed_domains = ['tampa.craigslist.org']
    start_urls = ['https://tampa.craigslist.org/d/cell-phones/search/moa/']
    base_url = 'https://tampa.craigslist.org'

    def parse(self, response):
        phones = response.xpath('//div[@class="result-info"]')
        for phone in phones:
            x = response.meta.get('x')
            n = -1
            url = response.xpath('//a[@class="result-title hdrlnk"]/@href').getall()
            relative_url = phone.xpath('//a[@class="result-title hdrlnk"]/@href').get()
            absolute_url = response.urljoin(relative_url)
            title = phone.xpath('//a[@class="result-title hdrlnk"]/text()').getall()
            price = phone.xpath('//span[@class="result-price"]/text()').getall()
            cl_id = phone.xpath('//a[@class="result-title hdrlnk"]/@id').getall()
            yield Request(absolute_url, callback=self.parse_item, meta={'absolute_url': absolute_url, 'url': url, 'title': title, 'price': price, 'cl_id': cl_id, 'n': n})

    def parse_item(self, response):
        n = response.meta.get('n')
        x = n + 1
        item = CraigItem()
        item["title"] = response.meta.get('title')[x]
        item["cl_id"] = response.meta.get('cl_id')[x]
        item["price"] = response.meta.get('price')[x]
        item["url"] = response.meta.get('url')[x]
        yield item
        absolute_next_url = response.meta.get('url')[x]
        absolute_url = response.meta.get('absolute_url')
        yield Request(absolute_next_url, callback=self.parse, meta={'x': x})
I am now able to retrieve the desired content for a posting: URL, price, title and craigslist id. But now my spider automatically closes after pulling just one result. I am having trouble understanding how to share the variables (x) and (n) between the two functions. Logically, after pulling one listing's data, in the format
cl_id
Price
title
url
I would like to proceed back to the initial parse function and move on to the next item in the list of urls retrieved by
response.xpath('//a[@class="result-title hdrlnk"]/@href').getall()
which, when run in scrapy shell, successfully pulls all the URLs.
How do I implement this logic: start with [0] in the list, perform parse, perform parse_item, output the item, then update a variable (n, which starts at 0 and needs to increase by 1 after each item); then call n in parse_item with its updated value and use, for example, item["title"] = response.meta.get('title')[x] to pick the right entry from each list; then run parse_item again, outputting one item at a time, until all the values in the URL list have been output with their related price, cl_id and title.
I know the code is messy as hell and I don't fully understand the basics yet, but I'm committed to getting this to work and learning it the hard way rather than starting from the ground up with Python.
Class result-info is used within the div block, so you should write:
phones = response.xpath('//div[#class="result-info"]')
That being said, I didn't check/fix your spider further (it seems there are only parsing errors, not functional ones).
As a suggestion for the future, you can use Scrapy shell for quickly debugging the issues:
scrapy shell "your-url-here"
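For example, a hypothetical session against the listing page (output shapes are illustrative):

$ scrapy shell "https://tampa.craigslist.org/d/cell-phones/search/moa/"
>>> response.xpath('//p[@class="result-info"]')    # selector from the question: matches nothing
[]
>>> response.xpath('//div[@class="result-info"]')  # corrected selector: one entry per listing
[<Selector xpath='//div[@class="result-info"]' data='<div class="result-info">...'>, ...]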

Web Scraping: Empty / NA / Null entries when running the spider, correct entries in scrapy shell

I'm making a spider that crawls through a website's pagination for the newest videos, scraping metadata for each of the 32 videos per page.
Here is my code for the spider:
class NaughtySpider(scrapy.Spider):
    name = "naughtyspider"
    allowed_domains = ["example.com"]
    max_pages = 3

    # Start request
    def start_requests(self):
        for i in range(1, self.max_pages):
            yield scrapy.Request('https://www.example.com/video?o=cm&page=%s' % i, callback=self.parse_video)

    # First parsing method
    def parse_video(self, response):
        self.log('F i n i s h e d s c r a p i n g ' + response.url)
        # Correct path; chooses the 32 videos from the page, ignoring the links coming from ads
        video_links = response.css('ul#videoCategory').css('li.videoBox').css('div.thumbnail-info-wrapper').css('span.title > a').css('::attr(href)')
        links_to_follow = video_links.extract()
        for url in links_to_follow:
            yield response.follow(url=url, callback=self.parse_metadata)

    # Second parsing method
    def parse_metadata(self, response):
        # Create a SelectorList of the course titles text
        video_title = response.css('div.title-container > h1.title > span.inlineFree::text')
        # Extract the text and strip it clean
        video_title_ext = video_title.extract_first()
        # Extract views
        video_views = response.css('span.count::text').extract_first()
        # Extract tags
        video_tags = response.css('div.tagsWrapper a::text').extract()
        # Extract categories
        video_categories = response.css('div.categoriesWrapper a::text').extract()
        # Fill in the dictionary
        yield {
            'title': video_title_ext,
            'views': video_views,
            'tags': video_tags,
            'categories': video_categories,
        }
The thing is that almost half the entries end up being empty, with no title, views, tags or categories. Example of the log:
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.example.com/view_video.php?viewkey=ph5d594b093f8d6>
{'title': None, 'views': None, 'tags': [], 'categories': []}
But at the same time, if I fetch the very same link in the scrapy shell and copy and paste the very same selector paths from the spider, it gives me the correct values:
In [4]: fetch('https://www.example.com/view_video.php?viewkey=ph5d594b093f8d6')
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.example.com/view_video.php?viewkey=ph5d594b093f8d6> (referer: None)
In [5]: response.css('div.tagsWrapper a::text').extract()
Out[5]: ['alday', '559', '+ ']
In [6]: response.css('span.count::text').extract_first()
Out[6]: '6'
Thanks in advance for any help.
Edit: Would I be correct in thinking that this is not a problem with my code but rather a limitation on the server to avoid being scraped?
The data, such as the views, duration, ..., seems to be pulled in via an HTML <var> element: <var> DATA </var>. For example, if you enter the following line in your scrapy shell, you should get the duration element.
response.xpath(".//var[@class='duration']")
Not sure if it will work, but it is worth the try.
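If that does return the element, a follow-up to pull the value out of it might look like this (class name taken from the line above, otherwise hypothetical):

# Hypothetical: extract the text content of the <var class="duration"> element
response.xpath(".//var[@class='duration']/text()").get()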
P.s. I'll have to tell my wife it was for educational purposes..

Crawling too many links at the same time

I'm trying to crawl a website and my spider (I don't know why) is crawling my links in such disorder!!
It's crawling all the links I want, but it stores only the first one (rank and url_seller, as in the example below)... I'm new to this universe of crawling, Python and Scrapy, but all I want is to learn!! I'll post my code here; could somebody help me?
# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from MarketplacePulse.items import MarketplacepulseItem
import urllib.parse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class MarketplacePulseSpider(CrawlSpider):
    name = 'MP_A_UK'
    allowed_domains = ['marketplacepulse.com', 'amazon.co.uk']
    start_urls = ['https://www.marketplacepulse.com/amazon/uk']

    def parse(self, response):
        item = MarketplacepulseItem()
        rank = response.xpath('//div/table/tbody/tr/td[@class="number"]/text()').extract()
        print('\n', rank, '\n')
        url_1 = response.xpath('//div/table/tbody/tr/td/a/@href').extract()
        print('\n', url_1, '\n')
        for i in range(len(rank)-2):
            item['month_rank'] = ''.join(rank[i]).strip()
            item['year_rank'] = ''.join(rank[i+1]).strip()
            item['lifetime_rank'] = ''.join(rank[i+2]).strip()
            i += 3
        for i in range(len(url_1)):
            url_tmp = urllib.parse.urljoin('https://www.marketplacepulse.com', url_1[i])
            yield scrapy.Request(url_tmp, callback=self.parse_2, meta={'item': item})

    def parse_2(self, response):
        item = response.meta['item']
        url_2 = response.xpath('//body/div/section/div/div/div/p/a[contains(text(), "Amazon.co.uk")]/@href').extract()
        item['url_seller'] = ''.join(url_2).strip()
        yield scrapy.Request(str(url_2), callback=self.parse_3, meta={'item': item})

    def parse_3(self, response):
        item = response.meta['item']
        business_name = response.xpath('//div[@class="a-row a-spacing-medium"]/div[@class="a-column a-span6"]/ul[@class="a-unordered-list a-nostyle a-vertical"]/li//span[@class="a-list-item"]/span[.="Business Name:"]/following-sibling::text()').extract()
        phone_number = response.xpath('//div[@class="a-column a-span6"]/ul[@class="a-unordered-list a-nostyle a-vertical"]/li//span[@class="a-list-item"]/span[.="Phone number:"]/following-sibling::text()').extract()
        address = response.xpath('//div[@class="a-column a-span6"]/ul[@class="a-unordered-list a-nostyle a-vertical"]/li//span[span[contains(.,"Address:")]]/ul//li//text()').extract()
        item['business_name'] = ''.join(business_name).strip()
        item['phone_number'] = ''.join(phone_number).strip()
        item['address'] = '\n'.join(address).strip()
        yield item
I'll also post an example of what I want and of what I get... you'll see the problem, I hope!!
What I want:
2017-07-18 11:28:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.co.uk/sp?_encoding=UTF8&asin=&isAmazonFulfilled=&isCBA=&marketplaceID=A1F83G8C2ARO7P&orderID=&seller=A7CL6GT0UVQKS&tab=&vasStoreID=>
{'address': '55740 Currant Rd\nMishawaka\nIndiana\n46545\nUS',
'business_name': 'Better World Books Marketplace Inc',
'lifetime_rank': '863',
'month_rank': '218',
'phone_number': '',
'url_seller': 'https://www.amazon.co.uk/gp/aag/main?seller=A7CL6GT0UVQKS&tag=mk4343k-21',
'year_rank': '100'}
2017-07-18 11:28:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.co.uk/sp?_encoding=UTF8&asin=&isAmazonFulfilled=&isCBA=&marketplaceID=A1F83G8C2ARO7P&orderID=&seller=W5VG5JB9GHYUG&tab=&vasStoreID=>
{'address': 'ROOM 919, BLOCK 2 West, SEG TECHNOLOGY PARK\n'
'SHENZHEN\n'
'GUANGDONG\n'
'518000\n'
'CN\n'
'FU TIAN QU QIAO XIANG LU HAO FENG YUAN 7 DONG 7A\n'
'SHENZHEN\n'
'GUANGDONG\n'
'518000\n'
'CN',
'business_name': 'MUDDER TECHNOLOGY CO., LTD',
'lifetime_rank': '3',
'month_rank': '28',
'phone_number': '86 18565729081',
'url_seller': 'https://www.amazon.co.uk/gp/aag/main?seller=W5VG5JB9GHYUG&tag=mk4343k-21',
'year_rank': '10'}
And what I get:
2017-07-18 11:28:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.co.uk/sp?_encoding=UTF8&asin=&isAmazonFulfilled=&isCBA=&marketplaceID=A1F83G8C2ARO7P&orderID=&seller=A20T907OQC02JJ&tab=&vasStoreID=>
{'address': '55740 Currant Rd\nMishawaka\nIndiana\n46545\nUS',
'business_name': 'Better World Books Marketplace Inc',
'lifetime_rank': '863',
'month_rank': '218',
'phone_number': '',
'url_seller': 'https://www.amazon.co.uk/gp/aag/main?seller=A7CL6GT0UVQKS&tag=mk4343k-21',
'year_rank': '100'}
2017-07-18 11:28:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.co.uk/sp?_encoding=UTF8&asin=&isAmazonFulfilled=&isCBA=&marketplaceID=A1F83G8C2ARO7P&orderID=&seller=A1XG2K8M6HRQZ8&tab=&vasStoreID=>
{'address': 'ROOM 919, BLOCK 2 West, SEG TECHNOLOGY PARK\n'
'SHENZHEN\n'
'GUANGDONG\n'
'518000\n'
'CN\n'
'FU TIAN QU QIAO XIANG LU HAO FENG YUAN 7 DONG 7A\n'
'SHENZHEN\n'
'GUANGDONG\n'
'518000\n'
'CN',
'business_name': 'MUDDER TECHNOLOGY CO., LTD',
'lifetime_rank': '863',
'month_rank': '218',
'phone_number': '86 18565729081',
'url_seller': 'https://www.amazon.co.uk/gp/aag/main?seller=A7CL6GT0UVQKS&tag=mk4343k-21',
'year_rank': '100'}
You can see that the url_seller values are exactly the same, and the ranks (month, year and lifetime) too... but I want them to be different. Also, the url_seller is not the same as the link I crawled, but it should be. Any help please?
Ranking issues
I'll walk through it step by step:
You get a list of rank numbers:
rank = response.xpath('//div/table/tbody/tr/td[@class="number"]/text()').extract()
You get a list of URLs:
url_1 = response.xpath('//div/table/tbody/tr/td/a/@href').extract()
Here's where you go wrong:
for i in range(len(rank)-2):
    item['month_rank'] = ''.join(rank[i]).strip()
    item['year_rank'] = ''.join(rank[i+1]).strip()
    item['lifetime_rank'] = ''.join(rank[i+2]).strip()
    i += 3
Firstly, since you're using a for loop, your i variable is getting reset to the next item (in this case the next number) at the beginning of each loop, so you're looping through each one, not looping by threes. That i += 3 is doing nothing, sorry.
Anyway, the purpose of the loop appears to be to build the following structure:
{'month_rank': <rank>, 'year_rank': <rank>, 'lifetime_rank': <rank>}
So..., secondly, each time you run this loop, you overwrite the previous set of values without having done anything with them. Oops.
You then proceed to loop through your list of URLs, passing the last set of rankings your previous loop built, along with each url to your parse_2 function.
for i in range(len(url_1)):
    url_tmp = urllib.parse.urljoin('https://www.marketplacepulse.com', url_1[i])
    yield scrapy.Request(url_tmp, callback=self.parse_2, meta={'item': item})
You end up with each call to parse_2 having the same set of ranking data.
To fix this, you should deal with your URL and its associated rankings in the same loop:
for i in range(len(url_1)):
    url_tmp = urllib.parse.urljoin('https://www.marketplacepulse.com', url_1[i])
    item['month_rank'] = ''.join(rank[i*3]).strip()
    item['year_rank'] = ''.join(rank[i*3+1]).strip()
    item['lifetime_rank'] = ''.join(rank[i*3+2]).strip()
    yield scrapy.Request(url_tmp, callback=self.parse_2, meta={'item': item})
That should fix your rank issues.
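To sanity-check the index arithmetic: each listing contributes three consecutive cells to rank, so listing i owns rank[i*3] through rank[i*3+2]. A tiny illustration with made-up values:

# Illustrative values only: month/year/lifetime triples, one triple per listing
rank = ['28', '10', '3', '218', '100', '863']
for i in range(len(rank) // 3):
    print(rank[i*3], rank[i*3+1], rank[i*3+2])
# 28 10 3
# 218 100 863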
url_seller issue
I'm not too sure about the url_seller issue, because it seems like it should use the same url for both item['url_seller'] and its call to parse_3, and it seems like it's using the right info to call parse_3, but continuing to use the same information in item['url_seller'] over and over again.
I'm kind of going out on a limb here, since if I'm understanding the situation properly, both methods should (in the particular case that I think this is) make equal strings, but the only difference I've noticed so far is that for one you're using ''.join(url_2).strip() and for the other you're using str(url_2).
Since the part where you're using str(url_2) seems to be working properly where it's being used, perhaps you should try using it in the other spot too:
item['url_seller'] = str(url_2)

Scrapy Python loop to next unscraped link

I'm trying to make my spider go over a list and scrape all the URLs it can find, following them, scraping some data, and then returning to continue with the next unscraped link. If I run the spider I can see that it returns to the starting page, but tries to scrape the same page again and just quits afterwards. Any code suggestions? I'm pretty new to Python.
import scrapy
import re
from production.items import ProductionItem, ListResidentialItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["domain.com"]
    start_urls = [
        "http://domain.com/list"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            item['listurl'] = sel.xpath('//a[@id="link101"]/@href').extract()[0]
            request = scrapy.Request(item['listurl'], callback=self.parseBasicListingInfo)
            yield request

    def parseBasicListingInfo(item, response):
        item = ListResidentialItem()
        item['title'] = response.xpath('//span[@class="detail"]/text()').extract()
        return item
To clarify: I'm passing [0] so it only takes the first link of the list, but I want it to continue using the next unscraped link.
Output after running the spider:
2016-07-18 12:11:20 [scrapy] DEBUG: Crawled (200) <GET http://www.domain.com/robots.txt> (referer: None)
2016-07-18 12:11:20 [scrapy] DEBUG: Crawled (200) <GET http://www.domain.com/list> (referer: None)
2016-07-18 12:11:21 [scrapy] DEBUG: Crawled (200) <GET http://www.domain.com/link1> (referer: http://www.domain.com/list)
2016-07-18 12:11:21 [scrapy] DEBUG: Scraped from <200 http://www.domain.com/link1>
{'title': [u'\rlink1\r']}
This should just work fine. Change the domain and xpath and see:
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ProdItems(scrapy.Item):
    listurl = scrapy.Field()
    title = scrapy.Field()

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["domain.com"]
    start_urls = [
        "http://domain.com/list"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProdItems()
            list_urls = sel.xpath('//a[@id="link101"]/@href').extract()
            for url in list_urls:
                item['listurl'] = url
                yield scrapy.Request(url, callback=self.parseBasicListingInfo, meta={'item': item})

    def parseBasicListingInfo(self, response):
        item = response.request.meta['item']
        item['title'] = response.xpath('//span[@class="detail"]/text()').extract()
        yield item
This is the line that's causing your problem:
item['listurl'] = sel.xpath('//a[@id="link101"]/@href').extract()[0]
The "//" means "from the start of the document", which means that it scans from the very first tag and will always find the same first link. What you need to do is search relative to the start of the current tag using ".//", which means "from this tag onwards". Also, your current for loop is visiting every tag in the document, which is unnecessary. Try this:
def parse(self, response):
    for href in response.xpath('//a[@id="link101"]/@href').extract():
        item = ProductionItem()
        item['listurl'] = href
        yield scrapy.Request(href, callback=self.parseBasicListingInfo, meta={'item': item})
The xpath pulls the hrefs out of the links and returns them as a list you can iterate over.
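To make the absolute-versus-relative distinction concrete, here is a small illustrative sketch (the element names are hypothetical):

# Inside a loop over per-row selectors:
for row in response.xpath('//tr'):
    row.xpath('//a/@href').get()   # absolute: searches the whole document, same first link for every row
    row.xpath('.//a/@href').get()  # relative: searches only within this <tr>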
