SoundCloud Scrapy Spider - Python

I'm trying to build a Scrapy Spider to parse the artist and track info from SoundCloud.
Using the developer tools in Firefox I've determined that an API call can be made which returns a JSON object that converts to a Python dictionary. This API call needs an artist ID, and as far as I can tell these IDs are auto-incremented. This means I don't need to crawl the site, and can just have a list of starting URLs that make the initial API call and then parse the pages that follow from it. I believe this should make my spider friendlier to the site.
From the returned response the artist's URL can be obtained, and visiting and parsing this URL gives more information about the artist.
From the artist's URL we can visit their tracks and scrape a list of tracks along with each track's attributes.
I think the issues I'm having stem from not understanding Scrapy's framework...
If I put an artist's URL directly into start_urls, Scrapy passes a scrapy.http.response.html.HtmlResponse object to parse_artist. This lets me extract the data I need (I didn't include all of the page-parsing code to keep the snippet shorter). However, if I pass that same kind of object to the same function from parse_api_call, it results in an error...
I cannot understand why this is, and any help would be appreciated.
Side Note:
The initial API call grabs tracks from the artist; the offset and limit can be changed and the function called recursively to collect all of the tracks. This has proven unreliable, though: even when it doesn't raise an error that terminates the program, it doesn't get the artist's full list of tracks.
Here's the current code:
"""
Scrapes SoundCloud websites for artists and tracks
"""
import json
import scrapy
from ..items import TrackItem, ArtistItem
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
class SoundCloudBot(scrapy.Spider):
name = 'soundcloudBot'
allowed_domains = ['soundcloud.com']
start_urls = [
'https://api-v2.soundcloud.com/users/7436630/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/4803918/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/17364233/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/19697240/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/5949564/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en'
]
# This is added for testing purposes. When these links are added directly to the
# start_urls the code runs as expected, when these links are extracted using parse_api_call
# is when problems arise
# start_urls.extend([
# 'https://soundcloud.com/futureisnow',
# 'https://soundcloud.com/bigsean-1',
# 'https://soundcloud.com/defjam',
# 'https://soundcloud.com/ymcmbofficial',
# 'https://soundcloud.com/walefolarin',
# # 'https://soundcloud.com/futureisnow/tracks',
# # 'https://soundcloud.com/bigsean-1/tracks',
# # 'https://soundcloud.com/defjam/tracks',
# # 'https://soundcloud.com/ymcmbofficial/tracks',
# # 'https://soundcloud.com/walefolarin/tracks'
# ])
def parse(self, response):
url = response.url
if url[:35] == 'https://api-v2.soundcloud.com/users':
self.parse_api_call(response)
# 'https://soundcloud.com/{artist}'
elif url.replace('https://soundcloud.com', '').count('/') == 1: # One starting forward slash for artist folder
self.parse_artist(response)
# 'https://soundcloud.com/{artist}/{track}'
elif url.replace('https://soundcloud.com', '').count('/') == 2 and url[-6:] == 'tracks':
self.parse_tracks(response)
def parse_api_call(self, response):
data = json.loads(response.text)
artistItem = ArtistItem()
first_track = data['collection'][0]
artist_info = first_track.get('user')
artist_id = artist_info.get('id')
artist_url = artist_info.get('permalink_url')
artist_name = artist_info.get('username')
artistItem['artist_id'] = artist_id
artistItem['username'] = artist_name
artistItem['url'] = artist_url
artist_response = scrapy.http.response.html.HtmlResponse(artist_url)
self.parse_artist(artist_response)
# Once the pipelines are written this will be changed to yeild
return artistItem
def parse_artist(self, response):
# This prints out <class 'scrapy.http.response.html.HtmlResponse'>
# It doesn't matter if start_urls get extend with artists' URLS or not
print(type(response))
data = response.css('script::text').extract()
# This prints out a full HTML response if the function is called directly
# With scrapy, or an empty list if called from parse_api_call
print(data)
track_response = scrapy.http.response.html.HtmlResponse(f'{response.url}/tracks')
self.parse_tracks(track_response)
def parse_tracks(self, response):
pass

You have to use
Request(url)
to get data from a new URL. But you can't execute it like a normal function and get the result at once; you have to return Request() or yield Request(), and Scrapy puts it in a queue to fetch the data later.
After it gets the data it uses the parse() method to parse the response, but you can set your own callback in the request:
Request(url, self.parse_artist)
In parse_artist() you won't have access to the data you gathered in the previous function, so you have to send it along with the request using meta, i.e.
Request(artistItem['url'], self.parse_artist, meta={'item': artistItem})
Full working code below. You can put it all in one file and run it without creating a project.
It also saves the results in output.csv.
import json

import scrapy
from scrapy.http import Request


class MySpider(scrapy.Spider):

    name = 'myspider'
    allowed_domains = ['soundcloud.com']
    start_urls = [
        'https://api-v2.soundcloud.com/users/7436630/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
        'https://api-v2.soundcloud.com/users/4803918/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
        'https://api-v2.soundcloud.com/users/17364233/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
        'https://api-v2.soundcloud.com/users/19697240/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
        'https://api-v2.soundcloud.com/users/5949564/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en'
    ]

    def parse(self, response):
        data = json.loads(response.text)

        if len(data['collection']) > 0:
            artist_info = data['collection'][0]['user']

            artistItem = {
                'artist_id': artist_info.get('id'),
                'username': artist_info.get('username'),
                'url': artist_info.get('permalink_url'),
            }

            print('>>>', artistItem['url'])

            # make a request to artistItem['url'],
            # parse the response in parse_artist,
            # send artistItem to parse_artist
            return Request(artistItem['url'], self.parse_artist, meta={'item': artistItem})
        else:
            print("ERROR: no collections in data")

    def parse_artist(self, response):
        artistItem = response.meta['item']

        data = response.css('script::text').extract()
        # add data to artistItem
        #print(data)
        artistItem['new data'] = 'some new data'

        #print('>>>', response.urljoin('tracks'))
        print('>>>', response.url + '/tracks')

        # make a request to response.url + '/tracks',
        # parse the response in parse_tracks,
        # send artistItem to parse_tracks
        return Request(response.url + '/tracks', self.parse_tracks, meta={'item': artistItem})

    def parse_tracks(self, response):
        artistItem = response.meta['item']
        artistItem['tracks'] = 'some tracks'

        # send to CSV file
        return artistItem


#------------------------------------------------------------------------------
# run it without creating a project
#------------------------------------------------------------------------------

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',

    # save in file as CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
output.csv
artist_id,username,url,new data,tracks
17364233,Def Jam Recordings,https://soundcloud.com/defjam,some new data,some tracks
4803918,Big Sean,https://soundcloud.com/bigsean-1,some new data,some tracks
19697240,YMCMB-Official,https://soundcloud.com/ymcmbofficial,some new data,some tracks
5949564,WALE,https://soundcloud.com/walefolarin,some new data,some tracks
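If you also want an artist's full track list rather than just the first 20 entries, the same Request/meta pattern extends to paginating the API itself, which the question's side note asks about. A rough sketch of a parse() that does this (it assumes each entry in 'collection' is a track dict with 'id' and 'title' keys, and uses w3lib's add_or_replace_parameter helper, which ships with Scrapy):
from w3lib.url import add_or_replace_parameter  # goes with the other imports at the top

def parse(self, response):
    data = json.loads(response.text)
    collection = data.get('collection', [])

    # assumed field names -- adjust to whatever the API actually returns
    for track in collection:
        yield {'track_id': track.get('id'), 'title': track.get('title')}

    # keep bumping the offset until the API returns an empty collection;
    # the step of 20 matches the limit already used in start_urls
    if collection:
        offset = int(response.meta.get('offset', 0)) + 20
        next_url = add_or_replace_parameter(response.url, 'offset', str(offset))
        yield Request(next_url, callback=self.parse, meta={'offset': offset})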

Related

Python Scrapy: Return list of URLs scraped

I am using Scrapy to scrape all the links off a single domain. I follow all links on the domain but save all links off the domain. The following scraper works correctly, but I can't access its member variables since I am running it with a CrawlerProcess.
import scrapy
from scrapy.crawler import CrawlerProcess


class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://example.com']
    on_domain_urls = set()
    off_domain_urls = set()

    def parse(self, response):
        links = response.xpath('//a/@href')
        for link in links:
            url = link.get()
            if 'example.com' in url and url not in self.on_domain_urls:
                print('On domain links found: {}'.format(
                    len(self.on_domain_urls)))
                self.on_domain_urls.add(url)
                yield scrapy.Request(url, callback=self.parse)
            elif url not in self.off_domain_urls:
                print('Off domain links found: {}'.format(
                    len(self.off_domain_urls)))
                self.off_domain_urls.add(url)


process = CrawlerProcess()
process.crawl(MySpider)
process.start()
# Need access to off_domain_urls
How can I access off_domain_urls? I could probably move it to global scope, but this seems hacky. I could also append to a file, but I'd like to avoid file I/O if possible. Is there a better way to return aggregated data like this?
Did you check the Item Pipeline? I think you'll have to use one in this scenario and decide what needs to be done with the variable.
See:
https://docs.scrapy.org/en/latest/topics/item-pipeline.html
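For example, a minimal pipeline that aggregates the off-domain URLs and writes them out when the spider finishes could look roughly like this (a sketch: the class name, the output filename, and the assumption that the spider yields a dict like {'url': ..., 'on_domain': False} for every off-domain link are illustrative, not part of the original code):
# pipelines.py -- illustrative sketch
class OffDomainPipeline:

    def open_spider(self, spider):
        self.off_domain_urls = set()

    def process_item(self, item, spider):
        # collect URLs the spider marked as off-domain
        if not item.get('on_domain', True):
            self.off_domain_urls.add(item['url'])
        return item

    def close_spider(self, spider):
        # aggregate/report here; dumping to a file is just one option
        with open('off_domain_urls.txt', 'w') as f:
            f.write('\n'.join(sorted(self.off_domain_urls)))
You would enable it via the ITEM_PIPELINES setting (or pass that setting to CrawlerProcess) and have the spider yield an item for each link instead of only storing them on self.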

Scrapy: Maintain location cookie for redirects

Code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

from ..items import LowesspiderItem


class LowesSpider(scrapy.Spider):
    name = 'lowes'

    def start_requests(self):
        start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
        for url in start_urls:
            yield Request(url, cookies={'sn': '2333'})  # Added cookie to bypass location req

    def parse(self, response):
        items = response.css('.grid-container')
        for product in items:
            item = LowesspiderItem()

            # get product price
            productPrice = product.css('.art-pd-price::text').get()
            # get lowesNum
            productLowesNum = response.url.split("/")[-1]
            # get SKU
            productSKU = product.css('.met-product-model::text').get()

            item["productLowesNum"] = productLowesNum
            item["productSKU"] = productSKU
            item["productPrice"] = productPrice
            yield item
Output:
{'productLowesNum': '1001440644',
'productPrice': None,
'productSKU': '8654RM-42'}
Now, I'll have a list of SKUs, so that's how I'm going to format start_urls:
start_urls = ['https://www.lowes.com/search?searchTerm=('some sku)']
This URL would redirect me to this link: https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Ducted-Red-Matte-Wall-Mounted-Range-Hood-Common-42-Inch-Actual-42-in/1001440644
That's handled by Scrapy.
Now the problem
When I have:
start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
I get the SKU but not the price.
However when I use the actual URL in start_urls
start_urls = ['https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Ducted-Red-Matte-Wall-Mounted-Range-Hood-Common-42-Inch-Actual-42-in/1001440644']
then my output is fine:
{'productLowesNum': '1001440644',
'productPrice': '1,449.95',
'productSKU': '8654RM-42'}
So I believe that using a URL that has to be redirected somehow causes my scraper to not get the price, even though I still get the SKU.
Here's my guess: I had to preset a location cookie because the Lowes website does not let you see prices unless you give it a zip code/location, so I assume I would have to move or adjust cookies={'sn':'2333'} to make my program work as expected.
Problem
The main issue here is that some of your cookies which are set by the first request
https://www.lowes.com/search?searchTerm=8654RM-42
are carried forward to the request after the redirect which is
https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Ducted-Red-Matte-Wall-Mounted-Range-Hood-Common-42-Inch-Actual-42-in/1001440644
These cookies are overriding the cookies set by you.
Solution
You need to send explicit cookies with each request and prevent the cookies from previous requests from being added to the next one.
There is a request meta key in Scrapy called dont_merge_cookies which is used for this purpose. Set it in your request meta to prevent cookies from previous requests from being appended to the next request.
Then you need to set the cookies explicitly in the request header, something like this:
def start_requests(self):
    start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
    for url in start_urls:
        yield Request(url, headers={'Cookie': 'sn=2333;'}, meta={'dont_merge_cookies': True})
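Note that with 'dont_merge_cookies': True the cookies middleware is bypassed for that request, so any follow-up requests the spider yields will also need the header set by hand. A minimal sketch of what that could look like (the selector and the idea of a follow-up link are illustrative, not from the original spider):
def parse(self, response):
    # ... extract the item fields as before ...

    # hypothetical follow-up link; adjust the selector to the real page
    next_url = response.css('a.next-page::attr(href)').get()
    if next_url:
        yield Request(response.urljoin(next_url),
                      headers={'Cookie': 'sn=2333;'},     # same location cookie
                      meta={'dont_merge_cookies': True},  # keep it from being overridden
                      callback=self.parse)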

Scrapy: How To Start Scraping Data From a Search Result that uses Javascript

I am new to using Scrapy and Python.
I want to start scraping data from a search result. When the page loads, the default content appears; what I need to scrape is the filtered result, while also handling pagination.
Here's the URL
https://teslamotorsclub.com/tmc/post-ratings/6/posts
I need to scrape the items from the Time Filter: "Today" result.
I tried different approaches but none of them worked.
What I have so far is this, but it is mostly just the layout structure:
class TmcnfSpider(scrapy.Spider):
    name = 'tmcnf'
    allowed_domains = ['teslamotorsclub.com']
    start_urls = ['https://teslamotorsclub.com/tmc/post-ratings/6/posts']

    def start_requests(self):
        # Show form from a filtered search result
        pass

    def parse(self, response):
        # some code scraping items
        # yield URL for pagination
        pass
To get the posts for today's filter, you need to send a POST request to the URL https://teslamotorsclub.com/tmc/post-ratings/6/posts along with a payload. The following should fetch you the results you are interested in.
import scrapy


class TmcnfSpider(scrapy.Spider):
    name = "teslamotorsclub"
    start_urls = ["https://teslamotorsclub.com/tmc/post-ratings/6/posts"]

    def parse(self, response):
        payload = {'time_chooser': '4', '_xfToken': ''}
        yield scrapy.FormRequest(response.url, formdata=payload, callback=self.parse_results)

    def parse_results(self, response):
        for items in response.css("h3.title > a::text").getall():
            yield {"title": items.strip()}

How to increase number of scraped items in scrapy for reuters search

I am trying to scrape the Reuters search result page. It loads using JavaScript, as explained in this question.
I changed numResultsToShow to more than 2000, e.g. 9999. The total number of items on the page is over 45,000, but no matter what number I put in, Scrapy returns exactly 5000 scraped items.
My code is as follows:
class ReutersSpider(scrapy.Spider):
    name = "reuters"
    start_urls = [
        'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=steel.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=9999&pn=1&callback=addMoreNewsResults',
    ]

    def parse(self, response):
        html = response.body.decode('utf-8')
        json_string = re.search(r'addMoreNewsResults\((.+?) \);', html, re.DOTALL).group(1)

        # Below code is used to transform from Javascript-ish JSON-like structure to JSON
        json_string = re.sub(r'^\s*(\w+):', r'"\1":', json_string, flags=re.MULTILINE)
        json_string = re.sub(r'(\w+),\s*$', r'"\1",', json_string, flags=re.MULTILINE)
        json_string = re.sub(r':\s*\'(.+?)\',\s*$', r': "\1",', json_string, flags=re.MULTILINE)

        results = json.loads(json_string)

        for result in results["news"]:
            item = ReuterItem()
            item["href"] = result["href"]
            item["date"] = result["date"]
            item["headline"] = result["headline"]
            yield item
How can I increase this to cover all the search results?
There are more than a few considerations when crawling sites like this, even more so when using their internal APIs. Here are a few advice points from my experience, in no particular order:
Since you will likely be making a lot of requests while changing the query arguments, a good practice is to build them dynamically so you don't go crazy.
Always try to remove as much boilerplate from your requests as possible, like extra query parameters, headers, etc. It's useful to play around with the API using tools like Postman or similar, to arrive at the bare minimum working request.
As the spider gets more complicated and/or there is a more complex crawling logic in place, it's useful to extract relevant code into separate methods for usability and easier maintenance.
You can pass along valuable information in meta of your request, which will be copied to the response's meta. This can be useful in the given example to keep track of the current page being crawled. Alternatively you can just extract the page number from the URL to make it more robust.
Consider if you need any Cookies in order to visit a certain page. You might not be able to get a response directly from the API (or any page for that matter) if you don't have proper cookies. Usually it's enough to just visit the main page before proceeding, and Scrapy will take care of storing cookies.
Always be polite to avoid being banned and putting a lot of stress on the target site. Use high download delays if possible, and keep the concurrency low.
All that said, I've given it a quick run and put together a semi-working example which should be enough to get you started. There are still improvements to be made, like more complex retry logic, revisiting the main page in case the cookie expires, etc.
# -*- coding: utf-8 -*-
import json
import re
from urllib.parse import urlencode  # the original used urllib.urlencode (Python 2)

import scrapy


class ReuterItem(scrapy.Item):
    href = scrapy.Field()
    date = scrapy.Field()
    headline = scrapy.Field()


class ReutersSpider(scrapy.Spider):
    name = "reuters"

    NEWS_URL = 'https://www.reuters.com/search/news?blob={}'
    SEARCH_URL = 'https://www.reuters.com/assets/searchArticleLoadMoreJson?'
    RESULTS_PER_PAGE = 1000
    BLOB = 'steel.'

    custom_settings = {
        # blend in
        'USER_AGENT': ('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0)'
                       ' Gecko/20100101 Firefox/40.1'),
        # be polite
        'DOWNLOAD_DELAY': 5,
    }

    def _build_url(self, page):
        params = {
            'blob': self.BLOB,
            'bigOrSmall': 'big',
            'callback': 'addMoreNewsResults',
            'articleWithBlog': True,
            'numResultsToShow': self.RESULTS_PER_PAGE,
            'pn': page
        }
        return self.SEARCH_URL + urlencode(params)

    def _parse_page(self, response):
        html = response.body.decode('utf-8')
        json_string = re.search(r'addMoreNewsResults\((.+?) \);', html, re.DOTALL).group(1)

        # Below code is used to transform from Javascript-ish JSON-like structure to JSON
        json_string = re.sub(r'^\s*(\w+):', r'"\1":', json_string, flags=re.MULTILINE)
        json_string = re.sub(r'(\w+),\s*$', r'"\1",', json_string, flags=re.MULTILINE)
        json_string = re.sub(r':\s*\'(.+?)\',\s*$', r': "\1",', json_string, flags=re.MULTILINE)

        return json.loads(json_string)

    def start_requests(self):
        # visit the news page first to get the cookies needed
        # to visit the API in the next steps
        url = self.NEWS_URL.format(self.BLOB)
        yield scrapy.Request(url, callback=self.start_crawl)

    def start_crawl(self, response):
        # now that we have cookies set,
        # start crawling from the first page
        yield scrapy.Request(self._build_url(1), meta=dict(page=1))

    def parse(self, response):
        data = self._parse_page(response)

        # extract news from the current page
        for item in self._parse_news(data):
            yield item

        # Paginate if needed
        current_page = response.meta['page']
        total_results = int(data['totalResultNumber'])
        if total_results > (current_page * self.RESULTS_PER_PAGE):
            page = current_page + 1
            url = self._build_url(page)
            yield scrapy.Request(url, meta=dict(page=page))

    def _parse_news(self, data):
        for article in data["news"]:
            item = ReuterItem()
            item["href"] = article["href"]
            item["date"] = article["date"]
            item["headline"] = article["headline"]
            yield item

using scrapy to extract dynamic data - location based on postcodes

I'm new to Scrapy, and with some tutorials I was able to scrape a few simple websites, but I'm facing an issue now with a new website where I have to fill a search form and extract the results. The response I get doesn't have the results.
Let's say for example, for the following site: http://www.beaurepaires.com.au/store-locator/
I want to provide a list of postcodes and extract information about stores in each postcode (store name and address).
I'm using the following code but it's not working, and I'm not sure where to start from.
class BeaurepairesSpider(BaseSpider):
    name = "beaurepaires"
    allowed_domains = ["http://www.beaurepaires.com.au"]
    start_urls = ["http://www.beaurepaires.com.au/store-locator/"]
    #start_urls = ["http://www.beaurepaires.com.au/"]

    def parse(self, response):
        yield FormRequest.from_response(response, formname='frm_dealer_locator',
                                        formdata={'dealer_postcode_textfield': '2115'},
                                        callback=self.parseBeaurepaires)

    def parseBeaurepaires(self, response):
        hxs = HtmlXPathSelector(response)
        filename = "postcodetest3.txt"
        open(filename, 'wb').write(response.body)
        table = hxs.select("//div[@id='jl_results']/table/tbody")
        headers = table.select("tr[position()<=1]")
        data_rows = table.select("tr[position()>1]")
Thanks!!
The page load here relies heavily on JavaScript and is too complex for Scrapy. Here's an example of what I've come up with:
import re

from scrapy.http import FormRequest, Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider


class BeaurepairesSpider(BaseSpider):
    name = "beaurepaires"
    allowed_domains = ["beaurepaires.com.au", "gdt.rightthere.com.au"]
    start_urls = ["http://www.beaurepaires.com.au/store-locator/"]

    def parse(self, response):
        yield FormRequest.from_response(response, formname='frm_dealer_locator',
                                        formdata={'dealer_postcode_textfield': '2115'},
                                        callback=self.parseBeaurepaires)

    def parseBeaurepaires(self, response):
        hxs = HtmlXPathSelector(response)
        script = str(hxs.select("//div[@id='jl_container']/script[4]/text()").extract()[0])
        url, script_name = re.findall(r'LoadScripts\("([a-zA-Z:/\.]+)", "(\w+)"', script)[0]
        url = "%s/locator/js/data/%s.js" % (url, script_name)
        yield Request(url=url, callback=self.parse_js)

    def parse_js(self, response):
        print response.body  # here are your locations - right, inside the js file
Note that regular expressions and hardcoded URLs are used, and you'll have to parse JS in order to get your locations - too fragile, even if you finish it and get the locations.
Just switch to an in-browser tool like Selenium (or combine Scrapy with it).
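If you do go the Selenium route, a minimal combination with Scrapy could look roughly like this (a sketch, assuming a local Firefox driver on PATH; the element name and the results selector are guesses you would have to adjust to the rendered page):
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By


class BeaurepairesSeleniumSpider(scrapy.Spider):
    name = "beaurepaires_selenium"
    start_urls = ["http://www.beaurepaires.com.au/store-locator/"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()  # assumes geckodriver is installed

    def parse(self, response):
        # let a real browser run the javascript that the plain response lacks
        self.driver.get(response.url)
        postcode_box = self.driver.find_element(By.NAME, 'dealer_postcode_textfield')
        postcode_box.send_keys('2115')
        postcode_box.submit()

        # placeholder selector -- inspect the rendered page for the real one
        for row in self.driver.find_elements(By.CSS_SELECTOR, '#jl_results tr'):
            yield {'store': row.text}

    def closed(self, reason):
        self.driver.quit()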
