Python AttributeError: module 'requests' has no attribute 'requestURL' - python

I would like to run the code from the module requests_with_caching.py, which uses the requests library. I have two .py files in the same folder (requests_with_caching.py and test.py). The requests library is installed.
I get an AttributeError: module 'requests' has no attribute 'requestURL'.
I don't get what I'm missing ... .
import requests
import json
PERMANENT_CACHE_FNAME = "permanent_cache.txt"
TEMP_CACHE_FNAME = "this_page_cache.txt"
def _write_to_file(cache, fname):
with open(fname, 'w') as outfile:
outfile.write(json.dumps(cache, indent=2))
def _read_from_file(fname):
try:
with open(fname, 'r') as infile:
res = infile.read()
return json.loads(res)
except:
return {}
def add_to_cache(cache_file, cache_key, cache_value):
    """Store ``cache_value`` under ``cache_key`` in the cache file ``cache_file``.

    The whole cache is read, updated in memory, and written back.
    """
    entries = _read_from_file(cache_file)
    entries[cache_key] = cache_value
    _write_to_file(entries, cache_file)
def clear_cache(cache_file=TEMP_CACHE_FNAME):
    """Reset ``cache_file`` so that it holds an empty cache (an empty JSON object)."""
    empty_cache = {}
    _write_to_file(empty_cache, cache_file)
def make_cache_key(baseurl, params_d, private_keys=("api_key",)):
    """Makes a long string representing the query.

    The keys of ``params_d`` are visited in sorted order so the same
    parameters always produce the same key. Keys listed in ``private_keys``
    are omitted so private values never end up in the cache key.

    Returns ``baseurl`` followed by the ``"<key>-<value>"`` pairs joined
    with underscores.
    """
    # The default is an immutable tuple rather than a shared mutable list.
    visible_pairs = (
        "{}-{}".format(k, params_d[k])
        for k in sorted(params_d)
        if k not in private_keys
    )
    return baseurl + "_".join(visible_pairs)
# Cached stand-in for requests.get(): looks the query up in the temporary cache file first,
# then in the permanent cache file, and only performs a real HTTP request on a miss.
# The text of a freshly fetched response is saved to the temporary cache file only.
def get(baseurl, params={}, private_keys_to_ignore=["api_key"], permanent_cache_file=PERMANENT_CACHE_FNAME, temp_cache_file=TEMP_CACHE_FNAME):
# NOTE(review): requests.requestURL is not part of the pip-installed requests package; this call
# is the source of the AttributeError reported in the question. It exists only in Runestone's
# replacement requests module.
full_url = requests.requestURL(baseurl, params)
# Key built from the base URL and the non-private parameters.
cache_key = make_cache_key(baseurl, params, private_keys_to_ignore)
# Load the permanent and page-specific caches from files
permanent_cache = _read_from_file(permanent_cache_file)
temp_cache = _read_from_file(temp_cache_file)
if cache_key in temp_cache:
print("found in temp_cache")
# make a Response object containing text from the change, and the full_url that would have been fetched
# NOTE(review): this (text, url) constructor call presumably also relies on Runestone's
# requests module rather than the standard one - confirm before running locally.
return requests.Response(temp_cache[cache_key], full_url)
elif cache_key in permanent_cache:
print("found in permanent_cache")
# make a Response object containing text from the change, and the full_url that would have been fetched
return requests.Response(permanent_cache[cache_key], full_url)
else:
print("new; adding to cache")
# actually request it
resp = requests.get(baseurl, params)
# save it
add_to_cache(temp_cache_file, cache_key, resp.text)
return resp
import requests_with_caching
# it's not found in the permanent cache
res = requests_with_caching.get("https://api.datamuse.com/words?rel_rhy=happy", permanent_cache_file="datamuse_cache.txt")
print(res.text[:100])
# this time it will be found in the temporary cache
res = requests_with_caching.get("https://api.datamuse.com/words?rel_rhy=happy", permanent_cache_file="datamuse_cache.txt")
# This one is in the permanent cache.
res = requests_with_caching.get("https://api.datamuse.com/words?rel_rhy=funny", permanent_cache_file="datamuse_cache.txt")

The module requests_with_caching.py was written for Runestone by the University of Michigan in their Coursera course Data Collection and Processing with Python.
This module imports the requests module, and seems to use a special requests method called requestURL().
The thing is, the requests.requestURL() method used in the requests_with_caching module is particular to Runestone.
In fact the entire requests module was rewritten for Runestone because Runestone can't do API requests.
Take a look at the difference between Runestone's version of requests found in its src/lib. You will notice it's different than the requests module in your Python environment's site packages folder when you ran pip install requests.
You can view Runestone's rewritten requests module by running this in Runestone:
with open('src/lib/requests.py','r') as f:
module = f.read()
print(module)
I'd suggest looking at how the Runestone requests.requestURL() function was written and modify your copy of the requests_with_caching.py module to add this custom function.
There will be other changes to make as well to get requests_with_caching.py to work in your local python environment.

Related

How to convert requests.RequestsCookieJar to string

I have a requests.cookies.RequestCookieJar object which contains multiple cookies from different domain/path. How can I extract a cookies string for a particular domain/path following the rules mentioned in here?
For example
>>> r = requests.get("https://stackoverflow.com")
>>> print(r.cookies)
<RequestsCookieJar[<Cookie prov=4df137f9-848e-01c3-f01b-35ec61022540 for .stackoverflow.com/>]>
# the function I expect
>>> getCookies(r.cookies, "stackoverflow.com")
"prov=4df137f9-848e-01c3-f01b-35ec61022540"
>>> getCookies(r.cookies, "meta.stackoverflow.com")
"prov=4df137f9-848e-01c3-f01b-35ec61022540"
# meta.stackoverflow.com is also satisfied as it is subdomain of .stackoverflow.com
>>> getCookies(r.cookies, "google.com")
""
# r.cookies does not contains any cookie for google.com, so it return empty string
I think you need to work with a Python dictionary of the cookies. (See my comment above.)
def getCookies(cookie_jar, domain):
    """Return the cookies stored for ``domain`` as a ``Cookie``-header style string.

    ``cookie_jar`` must provide ``get_dict(domain=...)`` returning a
    name -> value mapping (as ``requests.cookies.RequestsCookieJar`` does).
    The pairs are rendered as ``name=value`` and separated by ``"; "``, the
    separator used in an HTTP ``Cookie`` header. An empty string is returned
    when the jar holds no cookie for the domain.
    """
    cookie_dict = cookie_jar.get_dict(domain=domain)
    found = ['%s=%s' % (name, value) for (name, value) in cookie_dict.items()]
    return '; '.join(found)
Your example:
>>> r = requests.get("https://stackoverflow.com")
>>> getCookies(r.cookies, ".stackoverflow.com")
"prov=4df137f9-848e-01c3-f01b-35ec61022540"
NEW ANSWER
Ok, so I still don't get exactly what it is you are trying to achieve.
If you want to extract the originating url from a requests.RequestCookieJar object (so that you could then check if there is a match with a given subdomain) that is (as far as I know) impossible.
However, you could off course do something like:
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import requests
import re
class getCookies():
    """Fetch a URL once and report whether its ``prov`` cookie applies to a domain."""

    # Matches the part of a host name after its first dot, provided that part
    # still contains a dot (e.g. "meta.stackoverflow.com" -> "stackoverflow.com").
    # Written as a raw string so the backslashes are not treated as (invalid)
    # string escapes, and compiled once instead of on every call.
    _BASE_DOMAIN_RE = re.compile(r"(?<=\.).+\..+$")

    def __init__(self, url):
        """Request ``url`` and keep both the URL and the cookies of the response."""
        self.cookiejar = requests.get(url).cookies
        self.url = url

    def check_domain(self, domain):
        """Print the ``prov`` cookie if ``domain`` belongs to the fetched URL.

        The left-most label of ``domain`` is stripped when the pattern matches;
        otherwise ``domain`` is used unchanged. The result is then searched for
        as a substring of the URL given to the constructor.

        Raises ``KeyError`` when the domain matches but the jar holds no
        cookie named ``prov``.
        """
        match = self._BASE_DOMAIN_RE.search(domain)
        base_domain = match.group() if match else domain
        if base_domain in self.url:
            print("\"prov=" + str(dict(self.cookiejar)["prov"]) + "\"")
        else:
            print("No cookies for " + domain + " in this jar!")
Then if you do:
new_instance = getCookies("https://stackoverflow.com")
You could then do:
new_instance.check_domain("meta.stackoverflow.com")
Which would give the output:
"prov=5d4fda78-d042-2ee9-9a85-f507df184094"
While:
new_instance.check_domain("google.com")
Would output:
"No cookies for google.com in this jar!"
Then, if you (if needed) fine-tune the regex & create a list of urls, you could first loop through the list to create many instances and save them in eg a list or dict. In a second loop you could check another list of urls to see if their cookies might be present in any of the instances.
OLD ANSWER
The docs you link to explain:
items()
Dict-like items() that returns a list of name-value
tuples from the jar. Allows client-code to call
dict(RequestsCookieJar) and get a vanilla python dict of key value
pairs.
I think what you are looking for is:
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import requests
def getCookies(url):
r = requests.get(url)
print("\"prov=" + str(dict(r.cookies)["prov"]) + "\"")
Now I can run it like this:
>>> getCookies("https://stackoverflow.com")
"prov=f7712c78-b489-ee5f-5e8f-93c85ca06475"
Actually, I had the same problem as you. But when I looked at the class definition
class RequestsCookieJar(cookielib.CookieJar, MutableMapping):
I found a func called def get_dict(self, domain=None, path=None):
you can simply write code like this
from http.cookies import SimpleCookie

# Placeholder: replace with the raw cookie string to parse.
raw = "rawCookide"
print(len(raw))
# SimpleCookie parses the raw string into name -> Morsel entries.
mycookie = SimpleCookie()
mycookie.load(raw)
# Flatten the parsed cookies into a plain name -> value dictionary.
UCookie = {key: morsel.value for key, morsel in mycookie.items()}
The following code is not promised to be "forward compatible" because I am accessing attributes of classes that were intentionally hidden (kind of) by their authors; however, if you must get into the attributes of a cookie, take a look here:
import http.cookies
import requests
import json
import sys
import os
aresponse = requests.get('https://www.att.com')
requestscookiejar = aresponse.cookies
for cdomain,cooks in requestscookiejar._cookies.items():
for cpath, cookgrp in cooks.items():
for cname,cattribs in cookgrp.items():
print(cattribs.version)
print(cattribs.name)
print(cattribs.value)
print(cattribs.port)
print(cattribs.port_specified)
print(cattribs.domain)
print(cattribs.domain_specified)
print(cattribs.domain_initial_dot)
print(cattribs.path)
print(cattribs.path_specified)
print(cattribs.secure)
print(cattribs.expires)
print(cattribs.discard)
print(cattribs.comment)
print(cattribs.comment_url)
print(cattribs.rfc2109)
print(cattribs._rest)
When you only need to access the simple attributes of cookies, it is likely less complicated to go about it the following way. This avoids the use of RequestsCookieJar. Here we construct a single SimpleCookie instance by reading from the headers attribute of a response object instead of the cookies attribute. The name SimpleCookie would seem to imply a single cookie, but that isn't what a SimpleCookie is: one instance can hold several cookies. Try it out:
import http.cookies
import requests
import json
import sys
import os
def parse_cookies(http_response):
    """Collect the cookies announced in a response's headers into one ``SimpleCookie``.

    Every header whose name contains ``set-cookie`` (case-insensitive) is split
    on commas and each piece is loaded into the returned ``SimpleCookie``.
    """
    jar = http.cookies.SimpleCookie()
    cookie_headers = (
        value
        for name, value in http_response.headers.items()
        if 'set-cookie' in name.lower()
    )
    for header_value in cookie_headers:
        # NOTE(review): a plain comma split also cuts attribute values that
        # contain commas (e.g. an Expires date) - confirm this is acceptable.
        for fragment in header_value.split(','):
            jar.load(fragment)
    return jar
aresponse = requests.get('https://www.att.com')
cookies = parse_cookies(aresponse)
print(str(cookies))
You can get list of domains in ResponseCookieJar and then dump the cookies for each domain with the following code:
import requests
response = requests.get("https://stackoverflow.com")
cjar = response.cookies
for domain in cjar.list_domains():
print(f'Cookies for {domain}: {cjar.get_dict(domain=domain)}')
Outputs:
Cookies for domain .stackoverflow.com: {'prov': 'efe8c1b7-ddbd-4ad5-9060-89ea6c29479e'}
In this example, only one domain is listed. It would have multiple lines in output if there were cookies for multiple domains in the Jar.
For many usecases, the cookie jar can be serialized by simply ignoring domains by calling:
dCookies = cjar.get_dict()
We can easily extract cookies string for a particular domain/path using functions already available in requests lib.
import requests
from requests.models import Request
from requests.cookies import get_cookie_header
session = requests.session()
r1 = session.get("https://www.google.com")
r2 = session.get("https://stackoverflow.com")
cookie_header1 = get_cookie_header(session.cookies, Request(method="GET", url="https://www.google.com"))
# '1P_JAR=2022-02-19-18; NID=511=Hz9Mlgl7DtS4uhTqjGOEolNwzciYlUtspJYxQ0GWOfEm9u9x-_nJ1jpawixONmVuyua59DFBvpQZkPzNAeZdnJjwiB2ky4AEFYVV'
cookie_header2 = get_cookie_header(session.cookies, Request(method="GET", url="https://stackoverflow.com"))
# 'prov=883c41a4-603b-898c-1d14-26e30e3c8774'
Request is used to prepare a :class:PreparedRequest <PreparedRequest>, which is sent to the server.
What you need is get_dict() method
a_session = requests.Session()
a_session.get('https://google.com/')
session_cookies = a_session.cookies
cookies_dictionary = session_cookies.get_dict()
# Now just print it or convert to json
as_string = json.dumps(cookies_dictionary)
print(cookies_dictionary)

Mersive Solstice API: AttributeError: 'dict' object has no attribute 'm_displayInformation'

I have around 100 machines running Mersive Solstice, which is a wireless display tool. I'm trying to gather a few important pieces of information, in particular the fulfillment ID for the license for each installed instance.
Using the Solstice OpenControl API, found here, I whipped up a python script to grab everything I needed using a json GET. However, even when using the example GET from the documentation,
import requests
import json
url = ‘http://ip-of-machine/api/stats’
r = requests.get(url)
jsonStats = json.loads(r.text)
usersConnected = jsonStats.m_statistics.m_connectedUsers
I encounter:
Traceback (most recent call last):
File "C:/Python27/test.py", line 7, in <module>
usersConnected = jsonStats.m_statistics.m_connectedUsers
AttributeError: 'dict' object has no attribute 'm_statistics'
Which is very confusing. I've found plenty of similar questions on SO regarding this problem, but not one that's been specifically regarding wrong GET requests from the API Reference guide.
Additionally, here is my script:
import requests
import json
from time import sleep
url = 'test'
f = open("ip.txt", "r")
while(url != ""):
url = f.readline()
url = url.rstrip('\n')
print(url)
try:
r = requests.get(url)
except:
sleep(5)
jsonConfig = json.loads(r.text)
displayName = jsonConfig.m_displayInformation.m_displayName
hostName = jsonConfig.m_displayInformation.m_hostName
ipv4 = jsonConfig.m_displayInformation.m_ipv4
fulfillmentId = jsonConfig.m_licenseCuration.fulfillmentId
r.close()
f.close
I import the URL's from a text document for easy keeping. I'm able to make the connection to the /api/config JSON, and when the URL is put into a browser it does spit out the JSON records:
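JSON objects are decoded into Python dicts, which are key/value mappings. You are just accessing them in the wrong way. I recommend reading about Python data structures.
json.loads()
returns a dictionary, not an object with attributes, so use key lookups instead of dot notation. Do: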
dict['key']['key']
Here is how your code should look:
import requests
import json
from time import sleep
# Read one URL per line from ip.txt and pull the display and licence details
# out of the JSON document each URL returns.
with open("ip.txt", "r") as f:
    for line in f:
        url = line.strip()
        if not url:
            # Skip blank lines (including a trailing one) instead of requesting "".
            continue
        print(url)
        try:
            response = requests.get(url)
        except requests.RequestException as err:
            # An unreachable machine is reported and skipped; the loop goes on.
            print("Request to %s failed: %s" % (url, err))
            continue
        try:
            # json.loads returns a dict, so values are read with key lookups.
            json_object = json.loads(response.text)
            displayName = json_object['m_displayInformation']['m_displayName']
            hostName = json_object['m_displayInformation']['m_hostName']
            ipv4 = json_object['m_displayInformation']['m_ipv4']
            fulfillmentId = json_object['m_licenseCuration']['fulfillmentId']
        except (ValueError, KeyError) as err:
            print("Unexpected response from %s: %r" % (url, err))
        finally:
            # Always release the connection, whether or not parsing succeeded.
            response.close()
I hope this was helpful!

JSON serialization Error in Python 3.2

I am using the json library and trying to import a Facebook page feed into a CSV file. I have tried many ways to get the result, but every time the code executes it reports that the object is not JSON serializable. Note that Facebook uses an auth code, which I have and have used, so the connection string will differ; however, if you use a page with public privacy you should still be able to get a result from the code below.
following is the code
import urllib3
import json
import requests
#from pprint import pprint
import csv
from urllib.request import urlopen
page_id = "abcd" # username or id
api_endpoint = "https://graph.facebook.com"
fb_graph_url = api_endpoint+"/"+page_id
try:
#api_request = urllib3.Requests(fb_graph_url)
#http = urllib3.PoolManager()
#api_response = http.request('GET', fb_graph_url)
api_response = requests.get(fb_graph_url)
try:
#print (list.sort(json.loads(api_response.read())))
obj = open('data', 'w')
# write(json_dat)
f = api_response.content
obj.write(json.dumps(f))
obj.close()
except Exception as ee:
print(ee)
except Exception as e:
print( e)
Tried many approach but not successful. hope some one can help
api_response.content is the text content of the API, not a Python object so you won't be able to dump it.
Try either:
f = api_response.content
obj.write(f)
Or
f = api_response.json()
obj.write(json.dumps(f))
requests.get(fb_graph_url).content
is probably a string. Using json.dumps on it won't work. This function expects a list or a dictionary as the argument.
If the request already returns JSON, just write it to the file.

Extracting data from a URL result with special formatting

I have a URL:
http://somewhere.com/relatedqueries?limit=2&query=seedterm
where modifying the inputs, limit and query, will generate wanted data. Limit is the max number of term possible and query is the seed term.
The URL provides text result formatted in this way:
oo.visualization.Query.setResponse({version:'0.5',reqId:'0',status:'ok',sig:'1303596067112929220',table:{cols:[{id:'score',label:'Score',type:'number',pattern:'#,##0.###'},{id:'query',label:'Query',type:'string',pattern:''}],rows:[{c:[{v:0.9894380670262618,f:'0.99'},{v:'newterm1'}]},{c:[{v:0.9894380670262618,f:'0.99'},{v:'newterm2'}]}],p:{'totalResultsCount':'7727'}}});
I'd like to write a python script that takes two arguments (limit number and the query seed), go fetch the data online, parse the result and return a list with the new terms ['newterm1','newterm2'] in this case.
I'd love some help, especially with the URL fetching since I have never done this before.
It sounds like you can break this problem up into several subproblems.
Subproblems
There are a handful of problems that need to be solved before composing the completed script:
Forming the request URL: Creating a configured request URL from a template
Retrieving data: Actually making the request
Unwrapping JSONP: The returned data appears to be JSON wrapped in a JavaScript function call
Traversing the object graph: Navigating through the result to find the desired bits of information
Forming the request URL
This is just simple string formatting.
url_template = 'http://somewhere.com/relatedqueries?limit={limit}&query={seedterm}'
url = url_template.format(limit=2, seedterm='seedterm')
Python 2 Note
You will need to use the string formatting operator (%) here.
url_template = 'http://somewhere.com/relatedqueries?limit=%(limit)d&query=%(seedterm)s'
url = url_template % dict(limit=2, seedterm='seedterm')
Retrieving data
You can use the built-in urllib.request module for this.
import urllib.request
data = urllib.request.urlopen(url) # url from previous section
This returns a file-like object called data. You can also use a with-statement here:
with urllib.request.urlopen(url) as data:
# do processing here
Python 2 Note
Import urllib2 instead of urllib.request.
Unwrapping JSONP
The result you pasted looks like JSONP. Given that the wrapping function that is called (oo.visualization.Query.setResponse) doesn't change, we can simply strip this method call out.
# read() returns bytes in Python 3; decode so the str prefix/suffix checks work.
result = data.read().decode('utf-8')
prefix = 'oo.visualization.Query.setResponse('
suffix = ');'
# Strip the JSONP wrapper only when both ends match.
if result.startswith(prefix) and result.endswith(suffix):
    result = result[len(prefix):-len(suffix)]
Parsing JSON
The resulting result string is just JSON data. Parse it with the built-in json module.
import json
result_object = json.loads(result)
Traversing the object graph
Now you have a result_object that represents the JSON response. The object itself will be a dict with keys like version, reqId, and so on. Based on your question, here is what you would need to do to create your list.
# Get the rows in the table, then get the second column's value for
# each row (each row's 'c' list has two cells, so the second one is index 1)
terms = [row['c'][1]['v'] for row in result_object['table']['rows']]
Putting it all together
#!/usr/bin/env python3
"""A script for retrieving and parsing results from requests to
somewhere.com.
This script works as either a standalone script or as a library. To use
it as a standalone script, run it as `python3 scriptname.py`. To use it
as a library, use the `retrieve_terms` function."""
import urllib.request
import json
import sys
E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2
def parse_result(result):
    """Parse a JSONP result string and return a list of terms

    ``result`` may be ``str`` or UTF-8 encoded ``bytes``. The
    ``oo.visualization.Query.setResponse(...);`` wrapper is removed when
    present, the remainder is parsed as JSON, and the ``v`` value of the
    second cell of every table row is returned.

    NOTE(review): the sample payload in the question uses unquoted keys and
    single quotes, which ``json.loads`` rejects - confirm that the real
    service returns strict JSON inside the wrapper.
    """
    prefix = 'oo.visualization.Query.setResponse('
    suffix = ');'
    if isinstance(result, bytes):
        # urlopen(...).read() yields bytes; the checks below need text.
        result = result.decode('utf-8')
    # Surrounding whitespace (e.g. a trailing newline) would defeat endswith().
    result = result.strip()
    # Strip JSONP function wrapper
    if result.startswith(prefix) and result.endswith(suffix):
        result = result[len(prefix):-len(suffix)]
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row (index 1; each row's 'c' list has exactly two cells)
    return [row['c'][1]['v'] for row in result_object['table']['rows']]
def retrieve_terms(limit, seedterm):
    """Retrieves and parses data and returns a list of terms

    ``limit`` is the maximum number of terms to request and ``seedterm`` is
    the query term. Both are URL-encoded into the request. If the server
    cannot be reached, a message is written to standard error and the
    process exits with ``E_OPERATION_ERROR``.
    """
    from urllib.parse import urlencode

    # urlencode escapes spaces and other special characters in the seed term.
    query = urlencode({'limit': limit, 'query': seedterm})
    url = 'http://somewhere.com/relatedqueries?' + query
    try:
        with urllib.request.urlopen(url) as data:
            # read() returns bytes; decode so the parser receives text.
            result = data.read().decode('utf-8')
    except OSError:
        # urllib.error.URLError and socket errors are OSError subclasses.
        print('Could not request data from server', file=sys.stderr)
        sys.exit(E_OPERATION_ERROR)
    return parse_result(result)
def main(limit, seedterm):
"""Retrieves and parses data and prints each term to standard output"""
terms = retrieve_terms(limit, seedterm)
for term in terms:
print(term)
if __name__ == '__main__':
    # Expect exactly: <script> <limit:int> <seedterm>
    try:
        limit = int(sys.argv[1])
        seedterm = sys.argv[2]
    except (IndexError, ValueError):
        # IndexError: an argument is missing; ValueError: limit is not an integer.
        error_message = '''{} limit seedterm
limit must be an integer'''.format(sys.argv[0])
        print(error_message, file=sys.stderr)
        sys.exit(E_INVALID_PARAMS)
    sys.exit(main(limit, seedterm))
Python 2.7 version
#!/usr/bin/env python2.7
"""A script for retrieving and parsing results from requests to
somewhere.com.
This script works as either a standalone script or as a library. To use
it as a standalone script, run it as `python2.7 scriptname.py`. To use it
as a library, use the `retrieve_terms` function."""
import urllib2
import json
import sys
E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2
def parse_result(result):
    """Parse a JSONP result string and return a list of terms

    The ``oo.visualization.Query.setResponse(...);`` wrapper is removed when
    present, the remainder is parsed as JSON, and the ``v`` value of the
    second cell of every table row is returned.
    """
    prefix = 'oo.visualization.Query.setResponse('
    suffix = ');'
    # Surrounding whitespace (e.g. a trailing newline) would defeat endswith().
    result = result.strip()
    # Strip JSONP function wrapper
    if result.startswith(prefix) and result.endswith(suffix):
        result = result[len(prefix):-len(suffix)]
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row (index 1; each row's 'c' list has exactly two cells)
    return [row['c'][1]['v'] for row in result_object['table']['rows']]
def retrieve_terms(limit, seedterm):
    """Retrieves and parses data and returns a list of terms

    ``limit`` must be an integer (it is formatted with ``%d``) and
    ``seedterm`` is the query term. If the server cannot be reached, a
    message is written to standard error and the process exits with
    ``E_OPERATION_ERROR``.
    """
    from contextlib import closing

    url_template = 'http://somewhere.com/relatedqueries?limit=%(limit)d&query=%(seedterm)s'
    # Use the caller's arguments rather than fixed sample values.
    url = url_template % dict(limit=limit, seedterm=seedterm)
    try:
        # The object returned by urllib2.urlopen is not a context manager in
        # Python 2, so closing() supplies the with-statement support.
        with closing(urllib2.urlopen(url)) as data:
            result = data.read()
    except IOError:
        # urllib2.URLError is an IOError subclass.
        sys.stderr.write('%s\n' % 'Could not request data from server')
        sys.exit(E_OPERATION_ERROR)
    return parse_result(result)
def main(limit, seedterm):
"""Retrieves and parses data and prints each term to standard output"""
terms = retrieve_terms(limit, seedterm)
for term in terms:
print term
if __name__ == '__main__':
    # Expect exactly: <script> <limit:int> <seedterm>
    try:
        limit = int(sys.argv[1])
        seedterm = sys.argv[2]
    except (IndexError, ValueError):
        # IndexError: an argument is missing; ValueError: limit is not an integer.
        error_message = '''{} limit seedterm
limit must be an integer'''.format(sys.argv[0])
        sys.stderr.write('%s\n' % error_message)
        sys.exit(E_INVALID_PARAMS)
    sys.exit(main(limit, seedterm))
I didn't fully understand your problem, because from your code it seems to me that you are using the Visualization API (which, by the way, I had not heard of before).
But if you are just looking for a way to fetch data from a web page, you could use urllib2. That only gets the data; if you want to parse the retrieved data you will have to use a more appropriate library such as BeautifulSoup.
If you are dealing with another kind of web service (RSS, Atom, RPC) rather than web pages, you can find a number of Python libraries that handle each of those services well.
import urllib2
from BeautifulSoup import BeautifulSoup
result = urllib2.urlopen('http://somewhere.com/relatedqueries?limit=%s&query=%s' % (2, 'seedterm'))
# Read the whole body before closing the connection.
htmltext = result.read()
result.close()
soup = BeautifulSoup(htmltext, convertEntities="html" )
# you can parse your data now check BeautifulSoup API.

Caching in urllib2?

Is there an easy way to cache things when using urllib2 that I am over-looking, or do I have to roll my own?
If you don't mind working at a slightly lower level, httplib2 (https://github.com/httplib2/httplib2) is an excellent HTTP library that includes caching functionality.
You could use a decorator function such as:
class cache(object):
    """Decorator that remembers a function's results per distinct set of arguments.

    Results are kept in ``self.cache`` for as long as the decorator object
    lives; nothing is ever evicted. Calls whose arguments are not hashable
    are passed straight through to the wrapped function without caching.
    Arguments that compare and hash equal (e.g. ``1`` and ``1.0``) share an
    entry.
    """

    def __init__(self, fun):
        """Wrap ``fun`` and start with an empty result cache."""
        self.fun = fun
        self.cache = {}

    def __call__(self, *args, **kwargs):
        """Return the cached result for these arguments, computing it on first use."""
        try:
            # Sorting the keyword items makes the key independent of the order
            # in which keyword arguments were written at the call site.
            key = (args, tuple(sorted(kwargs.items())))
            hash(key)
        except TypeError:  # in case key isn't a valid key - don't cache
            return self.fun(*args, **kwargs)
        if key not in self.cache:
            self.cache[key] = self.fun(*args, **kwargs)
        return self.cache[key]
and define a function along the lines of:
@cache
def get_url_src(url):
    """Return the content read from ``url``; decorated with ``cache`` so each URL is fetched once."""
    return urllib.urlopen(url).read()
This is assuming you're not paying attention to HTTP Cache Controls, but just want to cache the page for the duration of the application
This ActiveState Python recipe might be helpful:
http://code.activestate.com/recipes/491261/
I've always been torn between using httplib2, which does a solid job of handling HTTP caching and authentication, and urllib2, which is in the stdlib, has an extensible interface, and supports HTTP Proxy servers.
The ActiveState recipe starts to add caching support to urllib2, but only in a very primitive fashion. It fails to allow for extensibility in storage mechanisms, hard-coding the file-system-backed storage. It also does not honor HTTP cache headers.
In an attempt to bring together the best features of httplib2 caching and urllib2 extensibility, I've adapted the ActiveState recipe to implement most of the same caching functionality as is found in httplib2. The module is in jaraco.net as jaraco.net.http.caching. The link points to the module as it exists at the time of this writing. While that module is currently part of the larger jaraco.net package, it has no intra-package dependencies, so feel free to pull the module out and use it in your own projects.
Alternatively, if you have Python 2.6 or later, you can easy_install jaraco.net>=1.3 and then utilize the CachingHandler with something like the code in caching.quick_test().
"""Quick test/example of CacheHandler"""
import logging
import urllib2
from httplib2 import FileCache
from jaraco.net.http.caching import CacheHandler
logging.basicConfig(level=logging.DEBUG)
store = FileCache(".cache")
opener = urllib2.build_opener(CacheHandler(store))
urllib2.install_opener(opener)
response = opener.open("http://www.google.com/")
print response.headers
print "Response:", response.read()[:100], '...\n'
response.reload(store)
print response.headers
print "After reload:", response.read()[:100], '...\n'
Note that jaraco.net.http.caching does not provide a specification for the backing store for the cache, but instead follows the interface used by httplib2. For this reason, the httplib2.FileCache can be used directly with urllib2 and the CacheHandler. Also, other backing caches designed for httplib2 should be usable by the CacheHandler.
I was looking for something similar, and came across "Recipe 491261: Caching and throttling for urllib2" which danivo posted. The problem is I really dislike the caching code (lots of duplication, lots of manual joining of file paths instead of using os.path.join, uses staticmethods, not very PEP8-ish, and other things that I try to avoid).
The code below is a bit nicer (in my opinion anyway) and is functionally much the same, with a few additions - mainly the "recache" method (example usage can be seen here, or in the if __name__ == "__main__": section at the end of the code).
The latest version can be found at http://github.com/dbr/tvdb_api/blob/master/cache.py, and I'll paste it here for posterity (with my application specific headers removed):
#!/usr/bin/env python
"""
urllib2 caching handler
Modified from http://code.activestate.com/recipes/491261/ by dbr
"""
import os
import time
import httplib
import urllib2
import StringIO
from hashlib import md5
def calculate_cache_path(cache_location, url):
    """Return the header and body cache file paths for ``url``.

    The paths are ``[cache_location]/[md5_of_url].headers`` and
    ``[cache_location]/[md5_of_url].body``. Nothing is read from or written
    to disk here. ``url`` may be a byte string or text; text is encoded as
    UTF-8 before hashing.
    """
    if not isinstance(url, bytes):
        # md5 needs bytes: encode text URLs explicitly rather than relying on
        # an implicit (ASCII-only) conversion or failing outright.
        url = url.encode("utf-8")
    thumb = md5(url).hexdigest()
    header = os.path.join(cache_location, thumb + ".headers")
    body = os.path.join(cache_location, thumb + ".body")
    return header, body
def check_cache_time(path, max_age):
    """Tell whether ``path`` is a file modified within the last ``max_age`` seconds.

    Returns False when the file does not exist or is older than that,
    True when it is still fresh.
    """
    if not os.path.isfile(path):
        return False
    modified_at = os.stat(path).st_mtime
    oldest_allowed = time.time() - max_age
    # Fresh unless the modification time lies before the cut-off.
    return not (modified_at < oldest_allowed)
def exists_in_cache(cache_location, url, max_age):
    """Report whether both cache files for ``url`` exist and are at most ``max_age`` seconds old."""
    header_path, body_path = calculate_cache_path(cache_location, url)
    if not (os.path.exists(header_path) and os.path.exists(body_path)):
        # At least one of the two files is missing
        return False
    return (
        check_cache_time(header_path, max_age)
        and check_cache_time(body_path, max_age)
    )
def store_in_cache(cache_location, url, response):
    """Tries to store response in cache.

    Writes ``str(response.info())`` to the header cache file and
    ``response.read()`` to the body cache file for ``url``.

    Returns True if an IOError occurred while writing and False when both
    files were written.
    NOTE(review): returning True on failure looks inverted, but callers in
    this module use the value as ``set_cache_header`` - confirm the intent
    before changing it.
    """
    hpath, bpath = calculate_cache_path(cache_location, url)
    try:
        # "with" closes each file even when a write raises.
        with open(hpath, "w") as outf:
            headers = str(response.info())
            outf.write(headers)
        with open(bpath, "w") as outf:
            outf.write(response.read())
    except IOError:
        return True
    else:
        return False
class CacheHandler(urllib2.BaseHandler):
    """Stores responses in a persistant on-disk cache.

    If a subsequent GET request is made for the same URL, the stored
    response is returned, saving time, resources and bandwidth
    """

    def __init__(self, cache_location, max_age = 21600):
        """Remember the cache directory and the maximum entry age in seconds.

        The directory ``cache_location`` is created if it does not exist.
        """
        self.max_age = max_age
        self.cache_location = cache_location
        if not os.path.exists(self.cache_location):
            os.mkdir(self.cache_location)

    def default_open(self, request):
        """Handles GET requests, if the response is cached it returns it

        Returns None for non-GET requests and for cache misses.
        """
        # Compare by value: "is not" tests object identity, which is not a
        # reliable way to compare strings.
        if request.get_method() != "GET":
            return None # let the next handler try to handle the request

        if exists_in_cache(
            self.cache_location, request.get_full_url(), self.max_age
        ):
            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header = True
            )
        else:
            return None

    def http_response(self, request, response):
        """Gets a HTTP response, if it was a GET request and the status code
        starts with 2 (200 OK etc) it caches it and returns a CachedResponse
        """
        if (request.get_method() == "GET"
            and str(response.code).startswith("2")
        ):
            if 'x-local-cache' not in response.info():
                # Response is not cached
                set_cache_header = store_in_cache(
                    self.cache_location,
                    request.get_full_url(),
                    response
                )
            else:
                # Response already came from the cache
                set_cache_header = True

            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header = set_cache_header
            )
        else:
            return response
class CachedResponse(StringIO.StringIO):
    """An urllib2.response-like object for cached responses.

    To determine if a response is cached or coming directly from
    the network, check the x-local-cache header rather than the object type.
    """

    def __init__(self, cache_location, url, set_cache_header=True):
        """Load the cached body and headers for ``url`` from ``cache_location``.

        When ``set_cache_header`` is true, an ``x-local-cache`` header that
        names the body file is appended to the stored headers.
        """
        self.cache_location = cache_location
        hpath, bpath = calculate_cache_path(cache_location, url)
        # Read through "with" so both file handles are closed promptly.
        with open(bpath) as body_file:
            StringIO.StringIO.__init__(self, body_file.read())
        self.url = url
        self.code = 200
        self.msg = "OK"
        with open(hpath) as header_file:
            headerbuf = header_file.read()
        if set_cache_header:
            headerbuf += "x-local-cache: %s\r\n" % (bpath)
        self.headers = httplib.HTTPMessage(StringIO.StringIO(headerbuf))

    def info(self):
        """Returns headers
        """
        return self.headers

    def geturl(self):
        """Returns original URL
        """
        return self.url

    def recache(self):
        """Fetch the URL again, overwrite its cache files and reload this object from them."""
        new_request = urllib2.urlopen(self.url)
        # NOTE(review): the entry is stored under new_request.url but reloaded
        # via self.url; presumably these differ after a redirect - confirm.
        store_in_cache(
            self.cache_location,
            new_request.url,
            new_request
        )
        CachedResponse.__init__(self, self.cache_location, self.url, True)
if __name__ == "__main__":
def main():
"""Quick test/example of CacheHandler"""
opener = urllib2.build_opener(CacheHandler("/tmp/"))
response = opener.open("http://google.com")
print response.headers
print "Response:", response.read()
response.recache()
print response.headers
print "After recache:", response.read()
main()
This article on Yahoo Developer Network - http://developer.yahoo.com/python/python-caching.html - describes how to cache http calls made through urllib to either memory or disk.
@dbr: you may also need to add caching of HTTPS responses with:
# Presumably meant to be added as a method of the CacheHandler class (TODO confirm):
# it hands HTTPS responses to http_response so they are processed the same way.
def https_response(self, request, response):
return self.http_response(request,response)

Categories