Scrapy - unexpected unindent for "yield" command - python

I have a silly problem that keeps me from running the spider. Every time I run it, I get an IndentationError pointing at the last "}" after the final "yield" in my spider code, and I cannot figure out why. Can someone help me out with this? Thanks a lot!
Here is my spider:
# -*- coding: utf-8 -*-
import scrapy
import json
import logging
import urlparse


class ArtsPodcastsSpider(scrapy.Spider):
    name = 'arts_podcasts'
    allowed_domains = ['www.castbox.fm']

    def start_requests(self):
        try:
            if response.request.meta['skip']:
                skip = response.request.meta['skip']
            else:
                skip = 0
            while skip < 201:
                url = 'https://everest.castbox.fm/data/top_channels/v2?category_id=10021&country=us&skip=0&limit=60&web=1&m=20201112&n=609584ea96edb64605bca96212128aa5&r=1'
                split_url = urlparse.urlsplit(url)
                path = split_url.path
                path.split('&')
                path.split('&')[:-5]
                '&'.join(path.split('&')[:-5])
                parsed_query = urlparse.parse_qs(split_url.query)
                query = urlparse.parse_qs(split_url.query, keep_blank_values=True)
                query['skip'] = skip
                updated = split_url._replace(path='&'.join(base_path.split('&')[:-5] + ['limit=60&web=1&m=20201112&n=609584ea96edb64605bca96212128aa5&r=1', '']),
                                             query=urllib.urlencode(query, doseq=True))
                updated_url = urlparse.urlunsplit(updated)
                yield scrapy.Request(url=updated_url, callback=self.parse_id, meta={'skip': skip})

    def parse_id(self, response):
        skip = response.request.meta['skip']
        data = json.loads(response.body)
        category = data.get('data').get('category').get('name')
        arts_podcasts = data.get('data').get('list')
        for arts_podcast in arts_podcasts:
            yield scrapy.Request(url='https://everest.castbox.fm/data/top_channels/v2?category_id=10021&country=us&skip={0}&limit=60&web=1&m=20201111&n=609ba0097bb48d4b0778a927bdcf69f4&r=1'.format(arts_podcast.get('list')[2].get('cid')), meta={'category': category, 'skip': skip}, callback=self.parse)

    def parse(self, response):
        skip = response.request.meta['skip']
        category = response.request.meta['category']
        arts_podcast = json.loads(response.body).get('data')
        yield scrapy.Request(callback=self.start_requests, meta={'skip': skip + 1})
        yield {
            'title': arts_podcast.get('title'),
            'category': arts_podcast.get('category'),
            'sub_category': arts_podcast.get('categories')
        }
Thank you!

The error is that you have a try without a matching except or finally.
I would expect this to result in a SyntaxError, but I'm guessing Python detects that you're back at the original indentation of the try statement before it figures out there is no matching except/finally.
There are other errors as well, such as accessing the nonexistent response in start_requests, and the parsing methods' indentation being wrong...
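If it helps, here is a minimal sketch of how start_requests could be restructured once the try block and the nonexistent response are removed (assumptions: skip simply starts at 0 and advances by the page limit of 60, and the m/n/r query parameters from the question's URL are reused as-is):

import scrapy


class ArtsPodcastsSpider(scrapy.Spider):
    name = 'arts_podcasts'
    allowed_domains = ['castbox.fm']

    # Assumed base URL, built from the query string shown in the question.
    base_url = ('https://everest.castbox.fm/data/top_channels/v2'
                '?category_id=10021&country=us&skip={skip}&limit=60&web=1'
                '&m=20201112&n=609584ea96edb64605bca96212128aa5&r=1')

    def start_requests(self):
        # start_requests receives no response, so skip is simply initialised here.
        for skip in range(0, 201, 60):
            yield scrapy.Request(
                url=self.base_url.format(skip=skip),
                callback=self.parse_id,
                meta={'skip': skip},
            )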

Related

First Python Scrapy Web Scraper Not Working

I took the Data Camp Web Scraping with Python course and am trying to run the 'capstone' web scraper in my own environment (the course takes place in a special in-browser environment). The code is intended to scrape the titles and descriptions of courses from the Data Camp webpage.
I've spent a good deal of time tinkering here and there, and at this point I'm hoping the community can help me out.
The code I am trying to run is:
# Import scrapy
import scrapy

# Import the CrawlerProcess
from scrapy.crawler import CrawlerProcess


# Create the Spider class
class YourSpider(scrapy.Spider):
    name = 'yourspider'

    # start_requests method
    def start_requests(self):
        yield scrapy.Request(url= https://www.datacamp.com, callback = self.parse)

    def parse(self, response):
        # Parser, Maybe this is where my issue lies
        crs_titles = response.xpath('//h4[contains(@class,"block__title")]/text()').extract()
        crs_descrs = response.xpath('//p[contains(@class,"block__description")]/text()').extract()
        for crs_title, crs_descr in zip(crs_titles, crs_descrs):
            dc_dict[crs_title] = crs_descr

# Initialize the dictionary **outside** of the Spider class
dc_dict = dict()

# Run the Spider
process = CrawlerProcess()
process.crawl(YourSpider)
process.start()

# Print a preview of courses
previewCourses(dc_dict)
I get the following output:
C:\Users*\PycharmProjects\TestScrape\venv\Scripts\python.exe C:/Users/*/PycharmProjects/TestScrape/main.py
File "C:\Users******\PycharmProjects\TestScrape\main.py", line 20
yield scrapy.Request(url=https://www.datacamp.com, callback=self.parse1)
^
SyntaxError: invalid syntax
Process finished with exit code 1
I notice that the parse method in line 20 remains grey in my PyCharm window. Maybe I am missing something important in the parse method?
Any help in getting the code to run would be greatly appreciated!
Thank you,
-WolfHawk
The error message is triggered in the following line:
yield scrapy.Request(url=https://www.datacamp.com, callback = self.parse)
The url argument must be a string, and strings are written with ' or " at the beginning and at the end.
Try this:
yield scrapy.Request(url='https://www.datacamp.com', callback = self.parse)
If this is your full code, you are also missing the function previewCourses. Check if it is provided to you or write it yourself with something like this:
def previewCourses(dict_to_print):
    for key, value in dict_to_print.items():
        print(key, value)
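Putting both fixes together, a minimal sketch of the full script (assuming the XPath selectors from the question still match the DataCamp page; dc_dict and previewCourses are the names used in the question):

import scrapy
from scrapy.crawler import CrawlerProcess

# Dictionary initialised before the crawl so parse() can fill it.
dc_dict = dict()


def previewCourses(dict_to_print):
    for key, value in dict_to_print.items():
        print(key, value)


class YourSpider(scrapy.Spider):
    name = 'yourspider'

    def start_requests(self):
        # The url must be a quoted string.
        yield scrapy.Request(url='https://www.datacamp.com', callback=self.parse)

    def parse(self, response):
        crs_titles = response.xpath('//h4[contains(@class,"block__title")]/text()').extract()
        crs_descrs = response.xpath('//p[contains(@class,"block__description")]/text()').extract()
        for crs_title, crs_descr in zip(crs_titles, crs_descrs):
            dc_dict[crs_title] = crs_descr


process = CrawlerProcess()
process.crawl(YourSpider)
process.start()

# Print a preview of the scraped courses once the crawl has finished.
previewCourses(dc_dict)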

Scrapy Splash Screenshot Pipeline not working

I'm trying to save screenshots of scraped webpages with Scrapy Splash. I've copied and pasted the code found here into my pipeline folder: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
Here's the code from the url:
import scrapy
import hashlib
from urllib.parse import quote


class ScreenshotPipeline(object):
    """Pipeline that uses Splash to render screenshot of
    every Scrapy item."""

    SPLASH_URL = "http://localhost:8050/render.png?url={}"

    async def process_item(self, item, spider):
        encoded_item_url = quote(item["url"])
        screenshot_url = self.SPLASH_URL.format(encoded_item_url)
        request = scrapy.Request(screenshot_url)
        response = await spider.crawler.engine.download(request, spider)

        if response.status != 200:
            # Error happened, return item.
            return item

        # Save screenshot to file, filename will be hash of url.
        url = item["url"]
        url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
        filename = "{}.png".format(url_hash)
        with open(filename, "wb") as f:
            f.write(response.body)

        # Store filename in item.
        item["screenshot_filename"] = filename
        return item
I've also followed the instructions for setting up splash found here: https://github.com/scrapy-plugins/scrapy-splash
When I run the command scrapy crawl spider, everything works correctly except the pipeline.
This is the "Error" I'm seeing.
<coroutine object ScreenshotPipeline.process_item at 0x7f29a9c7c8c0>
The spider is yielding the item correctly, but it will not process the item.
Does anyone have any advice? Thank you.
Edit:
I think what is going on is that Scrapy is calling the process_item() method as you normally would. However according to these docs: https://docs.python.org/3/library/asyncio-task.html a coroutine object must be called differently.
asyncio.run(process_item()) rather than process_item().
I think I may have to modify the source code?
You should use scrapy-splash inside the spider itself, not in the pipelines.
I followed those docs and it works for me.
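A minimal sketch of that approach (assuming a local Splash instance on port 8050 and the scrapy-splash settings from its README already configured; the spider name and start URL are placeholders):

import hashlib

import scrapy
from scrapy_splash import SplashRequest


class ScreenshotSpider(scrapy.Spider):
    name = 'screenshots'
    start_urls = ['https://example.com']

    def start_requests(self):
        for url in self.start_urls:
            # Ask Splash for a PNG rendering of the page instead of HTML.
            yield SplashRequest(url, self.parse, endpoint='render.png',
                                args={'wait': 0.5})

    def parse(self, response):
        # response.body is the PNG image returned by Splash.
        url_hash = hashlib.md5(response.url.encode('utf8')).hexdigest()
        filename = '{}.png'.format(url_hash)
        with open(filename, 'wb') as f:
            f.write(response.body)
        yield {'url': response.url, 'screenshot_filename': filename}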

How to integrate several 'yield' commands together in scrapy script

My issue is that when I added the redirect-handling code from "Can't get Scrapy to parse and follow 301, 302 redirects" to my script, it solved that problem, in that the script now runs without errors, but now I'm not getting any output to my csv file. The problem is that in parse_links1, the if and else branches each end with a yield statement, and this seems to be preventing the scrapy.Request line from running. This seems fairly clear, since the previous iteration of this code, which only went down two levels of links, ran perfectly; but since the latest level has a redirect issue, I had to add that code in.
My code is like this:
class TurboSpider(scrapy.Spider):
    name = "fourtier"
    handle_httpstatus_list = [404]
    start_urls = [
        "https://ttlc.intuit.com/browse/cd-download-support"]

    # def parse gets first set of links to use
    def parse(self, response):
        links = response.selector.xpath('//ul[contains(@class, "list-unstyled")]//@href').extract()
        for link in links:
            yield scrapy.Request(link, self.parse_links, dont_filter=True)

    def parse_links(self, response):
        tier2_text = response.selector.xpath('//a[contains(@class, "dropdown-item-link")]//@href').extract()
        for link in tier2_text:
            schema = 'https://turbotax.intuit.com/'
            links_to_use = urlparse.urljoin(schema, link)
            yield scrapy.Request(links_to_use, self.parse_links1)

    def parse_links1(self, response):
        tier2A_text = response.selector.xpath('//a').extract()
        for t in tier2A_text:
            if response.status >= 300 and response.status < 400:
                # HTTP header is ascii or latin1, redirected url will be percent-encoded utf-8
                location = to_native_str(response.headers['location'].decode('latin1'))
                request = response.request
                redirected_url = urljoin(request.url, location)
                if response.status in (301, 307) or request.method == 'HEAD':
                    redirected = request.replace(url=redirected_url)
                    yield redirected
                else:
                    redirected = request.replace(url=redirected_url, method='GET', body='')
                    redirected.headers.pop('Content-Type', None)
                    redirected.headers.pop('Content-Length', None)
                    yield redirected
            yield scrapy.Request((t, self.parse_links2))

    def parse_links2(self, response):
        divs = response.selector.xpath('//div')
        for p in divs.select('.//p'):
            yield {'text': p.extract()}
What is wrong with the way I've set up the 'yield' in the parse_links1 function so that now I don't get any output? How to integrate several 'yield' commands together?
See Debugging Spiders.
Some logging statements should allow you to determine where something unexpected is happening (execution not reaching a certain line, some variable containing unexpected data), which in turn should help you either understanding what the issue is or writing a more specific question that is easier to answer.
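For example, a few logger calls inside parse_links1 would show whether the redirect branch ever runs and what is being yielded. A sketch only: note it also extracts @href values rather than whole <a> elements, since scrapy.Request needs a URL string; everything else stays as in the question.

    def parse_links1(self, response):
        # Log the status so you can see whether the redirect branch can ever be taken.
        self.logger.debug('parse_links1: %s returned status %s', response.url, response.status)
        # The question extracts whole <a> elements; a Request needs a URL string,
        # so here the href values are extracted and joined against the page URL instead.
        links = response.selector.xpath('//a/@href').extract()
        self.logger.debug('parse_links1: found %d links', len(links))
        for t in links:
            if 300 <= response.status < 400:
                self.logger.debug('redirect branch taken for %s', response.url)
                # ... redirect handling from the question ...
            url = response.urljoin(t)
            self.logger.debug('about to yield a request for %s', url)
            yield scrapy.Request(url, callback=self.parse_links2)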

Python Scrapy not outputting to csv file

What am I doing wrong with the script so that it's not outputting a csv file with the data? I am running the script with scrapy runspider yellowpages.py -o items.csv and nothing is coming out but a blank csv file. I have followed different suggestions here and also watched YouTube videos trying to figure out where I am making the mistake, and I still cannot figure out what I am not doing correctly.
# -*- coding: utf-8 -*-
import scrapy
import requests

search = "Plumbers"
location = "Hammond, LA"
url = "https://www.yellowpages.com/search"
q = {'search_terms': search, 'geo_location_terms': location}
page = requests.get(url, params=q)
page = page.url
items = ()


class YellowpagesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['yellowpages.com']
    start_urls = [page]

    def parse(self, response):
        self.log("I just visited: " + response.url)
        items = response.css('a[class=business-name]::attr(href)')
        for item in items:
            print(item)
A simple spider, without a project.
Use my code; I wrote comments to make it easier to understand. This spider looks for all result blocks on all pages for a given pair of parameters, "service" and "location". To run it, in your case:
scrapy runspider yellowpages.py -a servise="Plumbers" -a location="Hammond, LA" -o Hammondsplumbers.csv
The code will also work with any queries. For example:
scrapy runspider yellowpages.py -a servise="Doctors" -a location="California, MD" -o MDDoctors.json
etc...
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy.exceptions import CloseSpider


class YellowpagesSpider(scrapy.Spider):
    name = 'yellowpages'
    allowed_domains = ['yellowpages.com']
    start_urls = ['https://www.yellowpages.com/']

    # We can use any pair of "servise" + "location" in our request
    def __init__(self, servise=None, location=None):
        self.servise = servise
        self.location = location

    def parse(self, response):
        # If "servise" and "location" are defined
        if self.servise and self.location:
            # Create the search URL using "servise" and "location"
            search_url = 'search?search_terms={}&geo_location_terms={}'.format(self.servise, self.location)
            # Send a request to "yellowpages.com" + "search_url", then call parse_result
            yield Request(url=response.urljoin(search_url), callback=self.parse_result)
        else:
            # Else close our spider
            # You can add a default value if you want.
            self.logger.warning('=== Please use keys -a servise="service_name" -a location="location" ===')
            raise CloseSpider()

    def parse_result(self, response):
        # All result blocks without AD posts
        posts = response.xpath('//div[@class="search-results organic"]//div[@class="v-card"]')
        for post in posts:
            yield {
                'title': post.xpath('.//span[@itemprop="name"]/text()').extract_first(),
                'url': response.urljoin(post.xpath('.//a[@class="business-name"]/@href').extract_first()),
            }

        next_page = response.xpath('//a[@class="next ajax-page"]/@href').extract_first()
        # If we have a next page url
        if next_page:
            # Send a request to "yellowpages.com" + "next_page", then call parse_result again
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse_result)
for item in items:
    print(item)

Put yield instead of print there:

for item in items:
    yield item
On inspection of your code, I notice a number of problems:
First, you initialize items to a tuple, when it should be a list: items = [].
You should change your name property to reflect the name you want on your crawler so you can use it like so: scrapy crawl my_crawler where name = "my_crawler".
start_urls is supposed to contain strings, not Request objects. You should change the entry from page to the exact search string you want to use. If you have a number of search strings and want to iterate over them, I would suggest using a middleware.
When you try to extract the data from the CSS selector, you're forgetting to call extract() (or getall()), which would actually turn your selector results into string data you could use (see the sketch below).
Also, you shouldn't be printing to the standard output stream, because a lot of logging goes there and it will make your output really messy. Instead, you should extract the responses into items, for example using item loaders.
Finally, you're probably missing the appropriate settings from your settings.py file. You can find the relevant documentation here.
FEED_FORMAT = "csv"
FEED_EXPORT_FIELDS = ["Field 1", "Field 2", "Field 3"]
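A minimal sketch of the spider rewritten along those lines (assuming the a.business-name selector from the question still matches the listing links; the hard-coded search URL is the one the question builds with requests):

# -*- coding: utf-8 -*-
import scrapy


class YellowpagesSpider(scrapy.Spider):
    name = 'my_crawler'
    allowed_domains = ['yellowpages.com']
    # start_urls holds plain string URLs, not Request objects.
    start_urls = [
        'https://www.yellowpages.com/search?search_terms=Plumbers&geo_location_terms=Hammond%2C+LA'
    ]

    def parse(self, response):
        self.logger.info('I just visited: %s', response.url)
        # extract() turns the selector list into a list of strings,
        # and yielding dicts is what lets -o items.csv receive rows.
        for href in response.css('a[class=business-name]::attr(href)').extract():
            yield {'url': response.urljoin(href)}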

Scrapy spider for JSON response is giving me error

import json
import scrapy


class SpidyQuotesSpider(scrapy.Spider):
    name = 'hotelspider'
    start_urls = [
        'https://tr.hotels.com/search/listings.json?destination-id=1648683&q-check-out=2016-10-22&q-destination=Didim,+T%C3%BCrkiye&q-room-0-adults=2&pg=2&q-rooms=1&start-index=7&q-check-in=2016-10-21&resolved-location=CITY:1648683:UNKNOWN:UNKNOWN&q-room-0-children=0&pn=1'
    ]

    def parse(self, response):
        myresponse = json.loads(response.body)
        data = myresponse.get('data')
        body = data.get('body')
        searchresults = body.get('searchResults')
        for item in searchresults.get('results', []):
            yield {
                'text': item[0]['altText']
            }
Here is a screenshot of the error.
I always get an error when I run this script. Can anybody help me figure out where I am going wrong?
I can't seem to reproduce your error, but upon copying your code I got a KeyError which pertains to your yield statement. See the code below:
import scrapy
import json


class SpidyQuotesSpider(scrapy.Spider):
    name = "hotelspider"
    allowed_domains = ["tr.hotels.com"]
    start_urls = (
        'https://tr.hotels.com/search/listings.json?destination-id=1648683&q-check-out=2016-10-22&q-destination=Didim,+T%C3%BCrkiye&q-room-0-adults=2&pg=2&q-rooms=1&start-index=7&q-check-in=2016-10-21&resolved-location=CITY:1648683:UNKNOWN:UNKNOWN&q-room-0-children=0&pn=1',
    )

    def parse(self, response):
        myresponse = json.loads(response.body)
        data = myresponse.get('data')
        body = data.get('body')
        searchresults = body.get('searchResults')
        for item in searchresults.get('results', []):
            yield {
                'text': item['altText']
            }
Make sure you are indenting using the same number of spaces, or just use tabs consistently. The indentation shown in your code seems fine, though. Try pasting mine and see what comes up.
You are mixing space and tab characters in your spider code (I copied your code from the "edit" view of your question):
Quoting Wikipedia, "Python uses whitespace to delimit control flow blocks". Indentation is crucial, and you need to stick to either spaces or tabs. Mixing the two will lead to these IndentationErrors.
Try to make it like so:
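A minimal sketch of the same parse method indented consistently with four spaces per level and no tabs, using the altText access from the answer above (the rest of the spider stays as in the question):

    def parse(self, response):
        myresponse = json.loads(response.body)
        data = myresponse.get('data')
        body = data.get('body')
        searchresults = body.get('searchResults')
        for item in searchresults.get('results', []):
            yield {
                'text': item['altText']
            }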
