I am having trouble building a basic spider program in Python. Whenever I try to run I get an error. The error occurs somewhere in the last seven lines of code.
#These modules do most of the work.
import sys
import urllib2
import urlparse
import htmllib, formatter
from cStringIO import StringIO
def log_stdout(msg):
"""Print msg to the screen."""
print msg
def get_page(url, log):
"""Retrieve URL and return contents, log errors."""
try:
page = urllib2.urlopen(url)
except urllib2.URLError:
log("Error retrieving: " + url)
return ''
body = page.read()
page.close()
return body
def find_links(html):
"""Return a list links in html."""
# We're using the parser just to get the HREFs
writer = formatter.DumbWriter(StringIO())
f = formatter.AbstractFormatter(writer)
parser = htmllib.HTMLParser(f)
parser.feed(html)
parser.close()
return parser.anchorlist
class Spider:
"""
The heart of this program, finds all links within a web site.
run() contains the main loop.
process_page() retrieves each page and finds the links.
"""
def __init__(self, startURL, log=None):
#This method sets initial values
self.URLs = set()
self.URLs.add(startURL)
self.include = startURL
self._links_to_process = [startURL]
if log is None:
# Use log_stdout function if no log provided
self.log = log_stdout
else:
self.log = log
def run(self):
#Processes list of URLs one at a time
while self._links_to_process:
url = self._links_to_process.pop()
self.log("Retrieving: " + url)
self.process_page(url)
def url_in_site(self, link):
#Checks whether the link starts with the base URL
return link.startswith(self.include)
def process_page(self, url):
#Retrieves page and finds links in it
html = get_page(url, self.log)
for link in find_links(html):
#Handle relative links
link = urlparse.urljoin(url, link)
self.log("Checking: " + link)
# Make sure this is a new URL within current site
if link not in self.URLs and self.url_in_site(link):
self.URLs.add(link)
self._links_to_process.append(link)
The error message pertains to this block of code.
if __name__ == '__main__':
#This code runs when script is started from command line
startURL = sys.argv[1]
spider = Spider(startURL)
spider.run()
for URL in sorted(spider.URLs):
print URL
The error message:
startURL = sys.argv[1]
IndexError: list index out of range
You aren't calling your spider program with an argument. sys.argv[0] is your script file, and sys.argv[1] would be the first argument you pass it. The "list index out of range" means you didn't give it any arguments.
Try calling it as python spider.py http://www.example.com (with your actual URL).
This doesn't directly answer your question, but:
I would go something as:
START_PAGE = 'http://some.url.tld'
ahrefs = lxml.html.parse(START_PAGE).getroottree('//a/#href')
Then use the available methods on lmxl.html objects and multiprocess the links
This handles "semi-malformed" HTML, and you can plug-in the BeautifulSoup library.
A bit of work is required if you want to even try to attempt to follow JavaScript generated links, but - that's life!
Related
I'm trying to save screenshots of scraped webpages with Scrapy Splash. I've copied and pasted the code found here into my pipeline folder: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
Here's the code from the url:
import scrapy
import hashlib
from urllib.parse import quote
class ScreenshotPipeline(object):
"""Pipeline that uses Splash to render screenshot of
every Scrapy item."""
SPLASH_URL = "http://localhost:8050/render.png?url={}"
async def process_item(self, item, spider):
encoded_item_url = quote(item["url"])
screenshot_url = self.SPLASH_URL.format(encoded_item_url)
request = scrapy.Request(screenshot_url)
response = await spider.crawler.engine.download(request, spider)
if response.status != 200:
# Error happened, return item.
return item
# Save screenshot to file, filename will be hash of url.
url = item["url"]
url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
filename = "{}.png".format(url_hash)
with open(filename, "wb") as f:
f.write(response.body)
# Store filename in item.
item["screenshot_filename"] = filename
return item
I've also followed the instructions for setting up splash found here: https://github.com/scrapy-plugins/scrapy-splash
When I call the command scrapy crawl spidereverything works correctly except the pipeline.
This is the "Error" I'm seeing.
<coroutine object ScreenshotPipeline.process_item at 0x7f29a9c7c8c0>
The spider is yielding the item correctly, but it will not process the item.
Does anyone have any advice? Thank you.
Edit:
I think what is going on is that Scrapy is calling the process_item() method as you normally would. However according to these docs: https://docs.python.org/3/library/asyncio-task.html a coroutine object must be called differently.
asyncio.run(process_item()) rather than process_item().
I think I may have to modify the source code?
You should use scrapy-splash in a script inside spider not in the pipelines.
I followed this docs and it works for me.
I have the following Python script using Scrapy:
import scrapy
class ChemSpider(scrapy.Spider):
name = "site"
def start_requests(self):
urls = [
'https://www.site.com.au'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
category_links = response.css('li').xpath('a/#href').getall()
category_links_filtered = [x for x in category_links if 'shop-online' in x] # remove non category links
category_links_filtered = list(dict.fromkeys(category_links_filtered)) # remove duplicates
for category_link in category_links_filtered:
if "medicines" in category_link:
next_page = response.urljoin(category_link) + '?size=10'
self.log(next_page)
yield scrapy.Request(next_page, callback=self.parse_subcategories)
def parse_subcategories(self, response):
for product in response.css('div.Product'):
yield {
'category_link': response.url,
'product_name': product.css('img::attr(alt)').get(),
'product_price': product.css('span.Price::text').get().replace('\n','')
}
My solution will run multiple instances of this script, each scraping a different subset of information from different 'categories'. I know you can run scrapy from the command line to output to a json file, but i want do to the output to a file from within the function, so each instance writes to a different file. Being a beginner with Python, I'm not sure where to go with my script. I need to get the output of the yield into a file while the script is executing. How do i achieve this? There will be hundreds of rows scraped, and I'm not familiar enough with how yield works to understand how to 'return' from it a set of data (or a list) that can then be written to the file.
You are looking to append a file. But being file writing an I/O operation, you need to lock the file from being written by other processes while a process is writing.
Easiest way to achieve is to write in different random files (files with random names) in a directory and concatenating them all using another process.
First let me suggest you some changes to your code. If you want to remove duplicates i you could use a set like this:
category_links_filtered = (x for x in category_links if 'shop-online' in x) # remove non category links
category_links_filtered = set(category_links_filtered) # remove duplicates
note that i'm also changing the [ to ( to make a generator instead of a list and save some memory. Search more about generators: https://www.python-course.eu/python3_generators.php
OK then the solution for your problem is using an Item Pipeline (https://docs.scrapy.org/en/latest/topics/item-pipeline.html), what this does perfom some action on every item yielded from your function parse_subcategories. What you do is add a class in your pipelines.py file and enable this pipeline in settings.py. This is:
In settings.py:
ITEM_PIPELINES = {
'YOURBOTNAME.pipelines.CategoriesPipeline': 300, #the number here is the priority of the pipeline, dont worry and just leave it
}
In pipelines.py:
import json
from urlparse import urlparse #this is library to parse urls
class CategoriesPipeline(object):
#This class dynamically saves the data depending on the category name obtained in the url or by an atrtribute
def open_spider(self, spider):
if hasattr(spider, 'filename'):
#the filename is an attribute set by -a filename=somefilename
filename = spider.filename
else:
#you could also set the name dynamically from the start url like this, if you set -a start_url=https://www.site.com.au/category-name
try:
filename = urlparse(spider.start_url).path[1:] #this returns 'category-name' and replace spaces with _
except AttributeError:
spider.crawler.engine.close_spider(self, reason='no start url') #this should not happen
self.file = open(filename+'.jl', 'w')
def close_spider(self, spider):
self.file.close()
def process_item(self, item, spider):
line = json.dumps(dict(item)) + "\n"
self.file.write(line)
return item
In spiders/YOURBOTNAME.py modify this:
class ChemSpider(scrapy.Spider):
name = "site"
if !hasattr(self, 'start_url'):
spider.crawler.engine.close_spider(self, reason='no start url') #we need a start url
start_urls = [ self.start_url ] #see why this works on https://docs.scrapy.org/en/latest/intro/tutorial.html#a-shortcut-for-creating-requests
def parse(self, response):#...
and then you start your crawl with this command: scrapy crawl site -a start_url=https://www.site.com.au/category-name and you could optionally add -a filename=somename
I have two sets of scripts. One to download a webpage and another to download links from the webpage. They both run but the links script doesn't return any scripts. Can anyone see or tell me why?
webpage script;
import sys, urllib
def getWebpage(url):
print '[*] getWebpage()'
url_file = urllib.urlopen(url)
page = url_file.read()
return page
def main():
sys.argv.append('http://www.bbc.co.uk')
if len(sys.argv) != 2:
print '[-] Usage: webpage_get URL'
return
else:
print getWebpage(sys.argv[1])
if __name__ == '__main__':
main()
Links Script
import sys, urllib, re
import getWebpage
def print_links(page):
print '[*] print_links()'
links = re.findall(r'\<a.*href\=.*http\:.+', page)
links.sort()
print '[+]', str(len(links)), 'HyperLinks Found:'
for link in links:
print link
def main():
sys.argv.append('http://www.bbc.co.uk')
if len(sys.argv) != 2:
print '[-] Usage: webpage_links URL'
return
page = webpage_get.getWebpage(sys.argv[1])
print_links(page)
This will fix most of your problems:
import sys, urllib, re
def getWebpage(url):
print '[*] getWebpage()'
url_file = urllib.urlopen(url)
page = url_file.read()
return page
def print_links(page):
print '[*] print_links()'
links = re.findall(r'\<a.*href\=.*http\:.+', page)
links.sort()
print '[+]', str(len(links)), 'HyperLinks Found:'
for link in links:
print link
def main():
site = 'http://www.bbc.co.uk'
page = getWebpage(site)
print_links(page)
if __name__ == '__main__':
main()
Then you can move on to fixing your regular expression.
While we are on the topic, though, I have two material recommendations:
use python library requests for getting web pages
use a real XML/HTML library for parsing HTML (recommend lxml)
Your regular expression doesn't have an end, so when you find the first it will display you the entire rest of page as you use the http\:.+ which means return all what is : till the end of the html page you need to specify the as end of the regular expression
friends.
I'm trying to rewrite one of my little tools. basically, it gets an input from user, and if that input doesn't contain the "base url" a function will construct that input into a valid url for other part of the program to work on.
if I were wrote it so the program only accepts valid url as input, it will work; however if I pass a string and construct it, urllib2.urlopen() will fail, and I have no idea why, as the value return is exactly the same str value...
import urllib2
import re
class XunLeiKuaiChuan:
kuaichuanBaseAddress = 'http://kuaichuan.xunlei.com/d/'
regexQuery = 'file_name=\"(.*?)\"\sfile_url=\"(.*?)\sfile_size=\"(.*?)\"'
agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)'
def buildLink(self, aLink):
if aLink == '':
return
if 'xunlei.com' not in aLink:
aLink = self.kuaichuanBaseAddress + aLink
return aLink
def decodeLink(self, url):
url = self.buildLink(url) #it will return correct url with the value provided.
print 'in decodeLink ' + url
urlReq = urllib2.Request(url)
urlReq.add_header('User-agent', self.agent)
pageContent = urllib2.urlopen(urlReq).read()
realLinks = re.findall(self.regexQuery, pageContent)
return realLinks
test = XunLeiKuaiChuan()
link='y7L1AwKuOwDeCClS528'
link2 = 'http://kuai.xunlei.com/d/y7L1AwKuOwDeCClS528'
s = test.decodeLink(link2)
print s
when I call it with link2 it will function as expected. and will fail when use 'link' someone tell me what I miss here? my "old version" work with only accept full url, but this unknown behavior is killing me here......Thank you.
btw if with full url it returns an empty list, just open the url and enter the catcha on the page. they do it to prevent some kind of 'attacks'....
never mind I got hostname in the code wrong.
For academic and performance sake, given this crawl recursive web-crawling function (which crawls only within the given domain) what would be the best approach to make it run iteratively? Currently when it runs, by the time it finishes python has climbed to using over 1GB of memory which isn't acceptable for running in a shared environment.
def crawl(self, url):
"Get all URLS from which to scrape categories."
try:
links = BeautifulSoup(urllib2.urlopen(url)).findAll(Crawler._match_tag)
except urllib2.HTTPError:
return
for link in links:
for attr in link.attrs:
if Crawler._match_attr(attr):
if Crawler._is_category(attr):
pass
elif attr[1] not in self._crawled:
self._crawled.append(attr[1])
self.crawl(attr[1])
Use a BFS instead of crawling recursively (DFS): http://en.wikipedia.org/wiki/Breadth_first_search
You can use an external storage solution (such as a database) for BFS queue to free up RAM.
The algorithm is:
//pseudocode:
var urlsToVisit = new Queue(); // Could be a queue (BFS) or stack(DFS). (probably with a database backing or something).
var visitedUrls = new Set(); // List of visited URLs.
// initialization:
urlsToVisit.Add( rootUrl );
while(urlsToVisit.Count > 0) {
var nextUrl = urlsToVisit.FetchAndRemoveNextUrl();
var page = FetchPage(nextUrl);
ProcessPage(page);
visitedUrls.Add(nextUrl);
var links = ParseLinks(page);
foreach (var link in links)
if (!visitedUrls.Contains(link))
urlsToVisit.Add(link);
}
Instead of recursing, you could put the new URLs to crawl into a queue. Then run until the queue is empty without recursing. If you put the queue into a file this uses almost no memory at all.
#Mehrdad - Thank you for your reply, the example you provided was concise and easy to understand.
The solution:
def crawl(self, url):
urls = Queue(-1)
_crawled = []
urls.put(url)
while not urls.empty():
url = urls.get()
try:
links = BeautifulSoup(urllib2.urlopen(url)).findAll(Crawler._match_tag)
except urllib2.HTTPError:
continue
for link in links:
for attr in link.attrs:
if Crawler._match_attr(attr):
if Crawler._is_category(attr):
continue
else:
Crawler._visit(attr[1])
if attr[1] not in _crawled:
urls.put(attr[1])
You can do it pretty easily just by using links as a queue:
def get_links(url):
"Extract all matching links from a url"
try:
links = BeautifulSoup(urllib2.urlopen(url)).findAll(Crawler._match_tag)
except urllib2.HTTPError:
return []
def crawl(self, url):
"Get all URLS from which to scrape categories."
links = get_links(url)
while len(links) > 0:
link = links.pop()
for attr in link.attrs:
if Crawler._match_attr(attr):
if Crawler._is_category(attr):
pass
elif attr[1] not in self._crawled:
self._crawled.append(attr[1])
# prepend the new links to the queue
links = get_links(attr[1]) + links
Of course, this doesn't solve the memory problem...