I am working with an API and parsing different pieces of information from its response, then using the parsed information in different functions / files.
The issue is that this ends up calling quickInfo() multiple times, creating multiple API requests, which is unwanted because of the rate limit and because the very large API response causes performance issues.
I am trying to find a way to call the API once and then be able to use the content of the response in different situations.
I could make "response" a global variable, but I read that this is bad practice and could cause memory leaks.
Simplified code is as follows:
FILE 1
def quickInfo(name):
    response = requests.get('[website]/product/{}?token=No'.format(name), headers=headers, verify=False).json()
    return response

def parsing(name):
    r = quickInfo(name)
    name = "{}".format(r["product"]["name"])
    buyprice_raw = [i["buyprice"] for i in r["avgHistory"]]
    buy_orders = "{:,}".format(r["product"]["buyorders"])
    sell_orders = "{:,}".format(r["product"]["sellorders"])
    return name, buyprice_raw, buy_orders, sell_orders

def charting(name):
    buyprice, sellprice = parsing(name)
    #code continues
FILE 2
name, price = parsing(name)
print(name +"'s price is of " + price) #sample use
#code continues
Thanks for your help!
The absolute easiest way would be to decorate quickInfo with the @functools.lru_cache() decorator, but you'll just have to be aware that it will only ever make a real request once per name (unless you clear the cache):
import functools

@functools.lru_cache()
def quickInfo(name):
    response = requests.get(
        "[website]/product/{}?token=No".format(name),
        headers=headers,
        verify=False,
    )
    response.raise_for_status()
    return response.json()
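A quick usage sketch (the product name is a placeholder): repeated calls with the same name reuse the cached response, and cache_clear() forces a fresh request:
info = quickInfo("some-product")        # performs the real HTTP request
info_again = quickInfo("some-product")  # returned from the cache, no new request

quickInfo.cache_clear()                 # drop everything that has been cached
info_fresh = quickInfo("some-product")  # performs a real request again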
EDIT2: Solved! Thanks to Michael Butscher's comment, I made a shallow copy of req_params by renaming the argument req_params to req_params_arg and then adding req_params = req_params_arg.copy() in get_assets_api_for_range().
The dict was shared between the threads, so copying it before use solved the problem (a sketch of the fix is shown after the code below).
EDIT: It seems that the requests library doesn't like being called simultaneously by several threads. I activated debug mode and I can see that the requests sent to the API are sometimes identical (same range asked), which leads to the duplicates. Weird behavior... Do you think I need to use aiohttp or asyncio?
I'm struggling with the concurrent.futures module in order to fetch a large volume of data from an API.
The API I'm using limits results to 100 per page, so I'm calling it multiple times in order to get all the data.
To accelerate the process I tried to use multithreading (ThreadPoolExecutor) with a maximum of 10 threads.
It works fine and it's very quick, but the results are different each time... Sometimes I get duplicates, sometimes not.
Something seems not to be thread-safe, but I cannot figure out where... Maybe it is hidden in the functions that use pandas behind the scenes?
I tried to echo the page ranges it's fetching and they look correct (not in order, but that's normal):
Fetching 300-400
Fetching 700-800
Fetching 800-900
Fetching 500-600
Fetching 400-500
Fetching 0-100
Fetching 200-300
Fetching 100-200
Fetching 900-1000
Fetching 600-700
Fetching 1100-1159
Fetching 1000-1100
Another weird behavior: when I put the line req_sess = authenticate() after the line print("Fetching {}".format(req_params['range'])) in the get_assets_api_for_range function, only one page (the last one, I believe) is fetched multiple times.
Thanks for your help!!
Here is my code (I removed some parts; it should be enough, I think). The main function called is get_assets_from_api_in_df():
from functools import partial
import pandas as pd
import requests
import concurrent.futures as cfu
def get_assets_api_for_range(range_to_fetch, req_params):
    req_sess = authenticate()
    req_params.update({'range': range_to_fetch})
    print("Fetching {}".format(req_params['range']))
    r = req_sess.get(url=auth_config['endpoint_url'] + '/assets',
                     params=req_params)
    if r.status_code != 200:
        raise ConnectionError("API Get assets error: {}".format(r))
    json_response = r.json()
    # This function, in return, processes the list into a dataframe
    return process_get_assets_from_api_in_df(json_response["asset_list"])

def get_assets_from_api_in_df() -> pd.DataFrame:
    GET_NB_MAX = 100
    # First, fetch 1 value to get nb to fetch
    req_sess = authenticate()
    r = req_sess.get(url=auth_config['endpoint_url'] + '/assets',
                     params={'range': '0-1'})
    if r.status_code != 200:
        raise ConnectionError("API Get assets error: {}".format(r))
    json_response = r.json()
    nb_to_fetch_total = json_response['total']
    print("Nb to fetch: {}".format(nb_to_fetch_total))
    # Building a queue of ranges to fetch
    ranges_to_fetch_queue = []
    for nb in range(0, nb_to_fetch_total, GET_NB_MAX):
        if nb + GET_NB_MAX < nb_to_fetch_total:
            range_str = str(nb) + '-' + str(nb + GET_NB_MAX)
        else:
            range_str = str(nb) + '-' + str(nb_to_fetch_total)
        ranges_to_fetch_queue.append(range_str)
    params = {
    }
    func_to_call = partial(get_assets_api_for_range,
                           req_params=params)
    with cfu.ThreadPoolExecutor(max_workers=10) as executor:
        result = list(executor.map(func_to_call, ranges_to_fetch_queue))
    print("Fetch finished, merging data...")
    return pd.concat(result, ignore_index=True)
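For reference, a minimal sketch of the fix described in EDIT2; only the top of get_assets_api_for_range changes (authenticate, auth_config and process_get_assets_from_api_in_df are the same helpers as above):
def get_assets_api_for_range(range_to_fetch, req_params_arg):
    # req_params_arg is shared by every call made through functools.partial,
    # so copy it before mutating it with the per-thread range.
    req_params = req_params_arg.copy()
    req_sess = authenticate()
    req_params.update({'range': range_to_fetch})
    print("Fetching {}".format(req_params['range']))
    r = req_sess.get(url=auth_config['endpoint_url'] + '/assets',
                     params=req_params)
    if r.status_code != 200:
        raise ConnectionError("API Get assets error: {}".format(r))
    return process_get_assets_from_api_in_df(r.json()["asset_list"])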
I am trying to download a series of text files from different websites. I am using urllib.request with Python. I want to extend the list of URLs without making the code longer.
The working sequence is
import urllib.request
url01 = 'https://web.site.com/this.txt'
url02 = 'https://web.site.com/kind.txt'
url03 = 'https://web.site.com/of.txt'
url04 = 'https://web.site.com/link.txt'
[...]
urllib.request.urlretrieve(url01, "Liste n°01.txt")
urllib.request.urlretrieve(url02, "Liste n°02.txt")
urllib.request.urlretrieve(url03, "Liste n°03.txt")
[...]
The number of files to download is increasing and I want to keep the second part of the code short.
I tried
i = 0
while i<51
i = i +1
urllib.request.urlretrieve( i , "Liste n°0+"i"+.txt")
It doesn't work, and I am thinking that a while loop can be used for strings but not for requests.
So I was thinking of making it a function.
def newfunction(i)
return urllib.request.urlretrieve(url"i", "Liste n°0"+1+".txt")
But it seems that I am missing a big chunk of it.
The single request works, but it seems I cannot adapt it to a long list of URLs.
As a general suggestion, I'd recommend the requests module for Python, rather than urllib.
Based on that, some naive code for a possible function:
import requests

def get_file(site, filename):
    target = site + "/" + filename
    try:
        r = requests.get(target, allow_redirects=True)
        open(filename, 'wb').write(r.content)
        return r.status_code
    except requests.exceptions.RequestException as e:
        print("File not downloaded, error: {}".format(e))
You can then call the function, passing in parameters of site and file name:
get_file('https://web.site.com', 'this.txt')
If it cannot download a file, the function will report the error without stopping execution. You could expand the exception handling to deal with files not being writable, but this should be a start.
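To keep the calling code short as the list grows, one possible sketch is to keep the (site, filename) pairs in a list and loop over get_file (the entries below are placeholders based on the URLs in the question):
files_to_fetch = [
    ('https://web.site.com', 'this.txt'),
    ('https://web.site.com', 'kind.txt'),
    ('https://web.site.com', 'of.txt'),
    ('https://web.site.com', 'link.txt'),
]

for site, filename in files_to_fetch:
    # get_file returns the status code on success, None if the request failed
    status = get_file(site, filename)
    print(filename, status)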
It seems as if you're not casting the variable i to a string before concatenating it to the filename string; that may be why your code isn't working. The while-loop/for-loop approach shouldn't affect whether or not the requests get sent out. I recommend using the requests module for making requests as well; Mike's post covers what the function should roughly look like. I also recommend creating a Session object if you're going to be making a lot of requests in one piece of code. The Session object keeps the underlying TCP connection open while you make your requests, which should reduce latency, CPU usage, and network congestion (https://en.wikipedia.org/wiki/HTTP_persistent_connection#Advantages). The code would look something like this:
import requests

with requests.Session() as s:
    for i in range(10):
        s.get(str(i) + '.com')  # make request
        # write to file here
To cast to a string you would want something like this:
i = 0
while i < 51:
    i = i + 1
    # urls is assumed to be the list of links to download; only the filename
    # concatenation needed the str(i) cast
    urllib.request.urlretrieve(urls[i - 1], "Liste n°0" + str(i) + ".txt")
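Putting both suggestions together, a rough sketch (the urls list is hypothetical, built from the links in the question): one Session handles all downloads, and str.format() builds the numbered filenames:
import requests

urls = [
    'https://web.site.com/this.txt',
    'https://web.site.com/kind.txt',
    'https://web.site.com/of.txt',
    'https://web.site.com/link.txt',
]

with requests.Session() as s:
    for i, url in enumerate(urls, start=1):
        r = s.get(url)
        r.raise_for_status()
        # write each response body to its numbered file
        with open("Liste n°{:02d}.txt".format(i), 'wb') as f:
            f.write(r.content)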
I'm having difficulty determining the best practices for structuring data when calling various APIs that return bitcoin prices. I want to be able to call multiple APIs without duplicating code.
My initial thought was to build a class for each of the APIs I would be calling, and feed their attributes (api_id, url, and json_tree, the JSON path I'd like to pull data from) to the BtcAPI class, which spits out the result.
*Note, before reading, that while BtcAPI works, the Coindesk/Bitstamp classes do not yet interact with that class. I want to ask about the way I should do this before I go to the trouble...*
Now, I'm wondering if I shouldn't make them lists, like:
coindesk = ['http://www.something.com', 'coindesk', '["time"]["updated"]']
...and just iterate through each of them. Or dicts, or any of a variety of other things. What data structure is called for here?
I am basically looking for a bit of a code review (since this code isn't working, I don't want to send it to the Code Review stack) and an understanding of best practices: tell me where you think I'm going horribly wrong and what I can do to structure this data better. I'm a Python and OOP noob. I could do this procedurally, but it would be ugly and repetitive. I imagine I'm using classes a bit wrong, too. Insights? Help? Thank you!
import json
import urllib.request

# The BtcAPI class works well when you feed it static variables. It returns json.
class BtcAPI:
    def __init__(self, url, api_id):
        self.url = url
        self.api_id = api_id

    def btc_api_call(self):
        hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
        req = urllib.request.Request(self.url, headers=hdr)
        readdata = urllib.request.urlopen(req)
        json_data = readdata.read()
        json_dict = json.loads(json_data)
        return json_dict

class Coindesk:
    api_id = 'Coindesk'
    url = 'https://api.coindesk.com/v1/bpi/currentprice.json'
    json_tree = json_dict['time']['updated']  # json_dict doesn't exist here yet -- this is the part that doesn't work

    def __init__(self):
        self.current_us_price = current_us_price

class Bitstamp:
    api_id = 'Bitstamp'
    url = 'https://www.bitstamp.net/api/ticker/'
    json_tree = json_dict['last']  # same problem as above

    def __init__(self):
        self.current_us_price = current_us_price

coindesk_url = Coindesk()
coindeskoutput = coindesk_url.url
print(coindeskoutput)
If you want a generic piece of code, I would suggest splitting your code and your configuration data into two files. This way you can manage your configuration data (URLs, JSON attributes you want to retrieve) without having to modify your actual Python code. This is usually considered good practice, but it means managing two files instead of one, so it can be a bit of a burden on a very small project.
Example in your case, you can have:
conf.ini
bitcoin_api.py
conf.ini
Configuration file which would look like this:
[COINDESK]
url: https://api.coindesk.com/v1/bpi/currentprice.json
response: time.updated
[BITSTAMP]
url: https://www.bitstamp.net/api/ticker
response: last
bitcoin_api.py
Your code would look like this:
import configparser
import requests
import os

class BitcoinAPI:
    def __init__(self, API):
        config = configparser.ConfigParser()
        config.read(os.path.dirname(__file__) + '/conf.ini')
        self.url = config.get(API, 'url')
        self.attribute = config.get(API, 'response')
        self.header = {'content-type': 'application/json'}

    def get(self):
        response = requests.get(self.url, headers=self.header)
        response = response.json()
        # Browse the response to fetch only the attributes you want
        self.attribute = self.attribute.split('.')
        depth = len(self.attribute)
        for i in range(depth):
            response = response[self.attribute[i]]
        print(response)
        return response
Then you can call your class in your main script:
import bitcoin_api
result = bitcoin_api.BitcoinAPI('COINDESK').get()
result = bitcoin_api.BitcoinAPI('BITSTAMP').get()
Well, the sane way to structure your code would involve superclasses (inheritance) and some sort of standardisation (interfaces).
Assuming that I understand your question, you need a standard way to request prices from the various exchanges.
So consider this structure (of course it's a complete draft):
import requests

class BitcoinAPI:
    def get_price_usd(self):
        raise NotImplementedError

    @staticmethod
    def generic_unauthorized_request(request_method, url, **kwargs):
        return getattr(requests, request_method)(url, **kwargs)

class CoinDesk(BitcoinAPI):
    url = 'https://api.coindesk.com/v1/bpi/currentprice.json'

    def get_price_usd(self):
        return self.generic_unauthorized_request('get', self.url)
        # process result, bla bla

class Bitstamp(BitcoinAPI):
    def get_price_usd(self):
        # Implementation of get_price_usd for Bitstamp
        return 0.0
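A possible way to use the draft above (hypothetical calls, assuming each subclass finishes its own response processing):
apis = [CoinDesk(), Bitstamp()]
for api in apis:
    # Every exchange class exposes the same interface, so the caller doesn't
    # care which concrete API it is talking to.
    print(type(api).__name__, api.get_price_usd())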
Which would be a better way to get content from two different request handlers?
This is what my app structure looks like:
#/twitter/<query>
class TwitterSearch(webapp2.RequestHandler):
    def get(self, query):
        #get data from Twitter
        json_data = data_from_twiiter()
        return json_data

#/google/<query>
class GoogleSearch(webapp2.RequestHandler):
    def get(self, query):
        #get data from Google
        json_data = data_from_google()
        return json_data
Now I can access Twitter search data and Google search data separately by calling their respective URLs.
I also need to combine both of these search results and offer them to the user. What would be the best approach to do this?
Should I call the get method of the respective classes like this ?
#/search/<query>
#Combined search result from Google and Twitter
class Search(webapp2.RequestHandler):
    def get(self, query):
        t = TwitterSearch()
        twitterSearch = t.get(self, query)
        g = GoogleSearch()
        googlesearch = g.get(self, query)
Or fetch the data from URL using urllib or something like this ?
#/search/<query>
#Combined search result from Google and Twitter
class Search(webapp2.RequestHandler):
    def get(self, query):
        t = get_data_from_URL('/twitter/' + query)
        g = get_data_from_URL('/google/' + query)
Or is there some other way to handle this situation?
You shouldn't make HTTP calls to your own application; that introduces a completely unnecessary level of overhead.
I would do this by extracting the query code into a separate function and calling it from both request handlers.
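A minimal sketch of that idea (the helper names are made up; data_from_twiiter and data_from_google are the same functions used in the handlers above):
import json

import webapp2

# Hypothetical shared helpers: the /twitter/<query>, /google/<query> and
# combined handlers all call these instead of calling each other.
def twitter_results(query):
    return data_from_twiiter()

def google_results(query):
    return data_from_google()

#/search/<query>
class Search(webapp2.RequestHandler):
    def get(self, query):
        combined = {
            'twitter': twitter_results(query),
            'google': google_results(query),
        }
        self.response.headers['Content-Type'] = 'application/json'
        self.response.write(json.dumps(combined))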
Is there an easy way to cache things when using urllib2 that I am over-looking, or do I have to roll my own?
If you don't mind working at a slightly lower level, httplib2 (https://github.com/httplib2/httplib2) is an excellent HTTP library that includes caching functionality.
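For example, a minimal sketch of httplib2's built-in caching (the cache directory name and URL are placeholders):
import httplib2

# Responses are cached on disk in ".cache"; a repeat request for the same URL
# is served from the cache when the HTTP cache headers allow it.
h = httplib2.Http(".cache")
resp, content = h.request("http://example.org/", "GET")
resp, content = h.request("http://example.org/", "GET")  # likely served from cache
print(resp.fromcache)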
You could use a decorator function such as:
class cache(object):
    def __init__(self, fun):
        self.fun = fun
        self.cache = {}

    def __call__(self, *args, **kwargs):
        key = str(args) + str(kwargs)
        try:
            return self.cache[key]
        except KeyError:
            self.cache[key] = rval = self.fun(*args, **kwargs)
            return rval
        except TypeError:  # in case key isn't a valid key - don't cache
            return self.fun(*args, **kwargs)
and define a function along the lines of:
@cache
def get_url_src(url):
    return urllib.urlopen(url).read()
This assumes you're not paying attention to HTTP Cache-Control headers, but just want to cache the pages for the duration of the application.
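A quick usage sketch (the URL is a placeholder); the second call returns the memoized page without touching the network:
src_first = get_url_src("http://example.org/")   # real network call
src_again = get_url_src("http://example.org/")   # returned from the in-memory cache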
This ActiveState Python recipe might be helpful:
http://code.activestate.com/recipes/491261/
I've always been torn between using httplib2, which does a solid job of handling HTTP caching and authentication, and urllib2, which is in the stdlib, has an extensible interface, and supports HTTP Proxy servers.
The ActiveState recipe starts to add caching support to urllib2, but only in a very primitive fashion. It fails to allow for extensibility in storage mechanisms, hard-coding the file-system-backed storage. It also does not honor HTTP cache headers.
In an attempt to bring together the best features of httplib2 caching and urllib2 extensibility, I've adapted the ActiveState recipe to implement most of the same caching functionality as is found in httplib2. The module is in jaraco.net as jaraco.net.http.caching. The link points to the module as it exists at the time of this writing. While that module is currently part of the larger jaraco.net package, it has no intra-package dependencies, so feel free to pull the module out and use it in your own projects.
Alternatively, if you have Python 2.6 or later, you can easy_install jaraco.net>=1.3 and then utilize the CachingHandler with something like the code in caching.quick_test().
"""Quick test/example of CacheHandler"""
import logging
import urllib2
from httplib2 import FileCache
from jaraco.net.http.caching import CacheHandler
logging.basicConfig(level=logging.DEBUG)
store = FileCache(".cache")
opener = urllib2.build_opener(CacheHandler(store))
urllib2.install_opener(opener)
response = opener.open("http://www.google.com/")
print response.headers
print "Response:", response.read()[:100], '...\n'
response.reload(store)
print response.headers
print "After reload:", response.read()[:100], '...\n'
Note that jaraco.net.http.caching does not provide a specification for the backing store for the cache, but instead follows the interface used by httplib2. For this reason, httplib2.FileCache can be used directly with urllib2 and the CacheHandler. Other backing caches designed for httplib2 should also be usable by the CacheHandler.
I was looking for something similar, and came across "Recipe 491261: Caching and throttling for urllib2", which danivo posted. The problem is that I really dislike the caching code (lots of duplication, lots of manual joining of file paths instead of using os.path.join, use of staticmethods, not very PEP 8ish, and other things that I try to avoid).
The code is a bit nicer (in my opinion, anyway) and functionally much the same, with a few additions - mainly the "recache" method (example usage can be seen here, or in the if __name__ == "__main__": section at the end of the code).
The latest version can be found at http://github.com/dbr/tvdb_api/blob/master/cache.py, and I'll paste it here for posterity (with my application specific headers removed):
#!/usr/bin/env python
"""
urllib2 caching handler
Modified from http://code.activestate.com/recipes/491261/ by dbr
"""

import os
import time
import httplib
import urllib2
import StringIO
from hashlib import md5

def calculate_cache_path(cache_location, url):
    """Checks if [cache_location]/[hash_of_url].headers and .body exist
    """
    thumb = md5(url).hexdigest()
    header = os.path.join(cache_location, thumb + ".headers")
    body = os.path.join(cache_location, thumb + ".body")
    return header, body

def check_cache_time(path, max_age):
    """Checks if a file has been created/modified in the [last max_age] seconds.
    False means the file is too old (or doesn't exist), True means it is
    up-to-date and valid"""
    if not os.path.isfile(path):
        return False
    cache_modified_time = os.stat(path).st_mtime
    time_now = time.time()
    if cache_modified_time < time_now - max_age:
        # Cache is old
        return False
    else:
        return True

def exists_in_cache(cache_location, url, max_age):
    """Returns if header AND body cache file exist (and are up-to-date)"""
    hpath, bpath = calculate_cache_path(cache_location, url)
    if os.path.exists(hpath) and os.path.exists(bpath):
        return (
            check_cache_time(hpath, max_age)
            and check_cache_time(bpath, max_age)
        )
    else:
        # File does not exist
        return False

def store_in_cache(cache_location, url, response):
    """Tries to store response in cache."""
    hpath, bpath = calculate_cache_path(cache_location, url)
    try:
        outf = open(hpath, "w")
        headers = str(response.info())
        outf.write(headers)
        outf.close()

        outf = open(bpath, "w")
        outf.write(response.read())
        outf.close()
    except IOError:
        return True
    else:
        return False

class CacheHandler(urllib2.BaseHandler):
    """Stores responses in a persistent on-disk cache.

    If a subsequent GET request is made for the same URL, the stored
    response is returned, saving time, resources and bandwidth
    """
    def __init__(self, cache_location, max_age=21600):
        """The location of the cache directory"""
        self.max_age = max_age
        self.cache_location = cache_location
        if not os.path.exists(self.cache_location):
            os.mkdir(self.cache_location)

    def default_open(self, request):
        """Handles GET requests, if the response is cached it returns it
        """
        if request.get_method() != "GET":
            return None  # let the next handler try to handle the request

        if exists_in_cache(
            self.cache_location, request.get_full_url(), self.max_age
        ):
            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header=True
            )
        else:
            return None

    def http_response(self, request, response):
        """Gets a HTTP response, if it was a GET request and the status code
        starts with 2 (200 OK etc) it caches it and returns a CachedResponse
        """
        if (request.get_method() == "GET"
            and str(response.code).startswith("2")
        ):
            if 'x-local-cache' not in response.info():
                # Response is not cached
                set_cache_header = store_in_cache(
                    self.cache_location,
                    request.get_full_url(),
                    response
                )
            else:
                set_cache_header = True
            # end if x-cache in response

            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header=set_cache_header
            )
        else:
            return response

class CachedResponse(StringIO.StringIO):
    """An urllib2.response-like object for cached responses.

    To determine if a response is cached or coming directly from
    the network, check the x-local-cache header rather than the object type.
    """
    def __init__(self, cache_location, url, set_cache_header=True):
        self.cache_location = cache_location
        hpath, bpath = calculate_cache_path(cache_location, url)

        StringIO.StringIO.__init__(self, file(bpath).read())

        self.url = url
        self.code = 200
        self.msg = "OK"
        headerbuf = file(hpath).read()
        if set_cache_header:
            headerbuf += "x-local-cache: %s\r\n" % (bpath)
        self.headers = httplib.HTTPMessage(StringIO.StringIO(headerbuf))

    def info(self):
        """Returns headers
        """
        return self.headers

    def geturl(self):
        """Returns original URL
        """
        return self.url

    def recache(self):
        new_request = urllib2.urlopen(self.url)
        set_cache_header = store_in_cache(
            self.cache_location,
            new_request.url,
            new_request
        )
        CachedResponse.__init__(self, self.cache_location, self.url, True)

if __name__ == "__main__":
    def main():
        """Quick test/example of CacheHandler"""
        opener = urllib2.build_opener(CacheHandler("/tmp/"))
        response = opener.open("http://google.com")
        print response.headers
        print "Response:", response.read()

        response.recache()
        print response.headers
        print "After recache:", response.read()
    main()
This article on Yahoo Developer Network - http://developer.yahoo.com/python/python-caching.html - describes how to cache http calls made through urllib to either memory or disk.
@dbr: you may also need to add caching of https responses, with:
    def https_response(self, request, response):
        return self.http_response(request, response)