I am accessing an API and extracting JSON from it, but I want to make sure I stay within the hourly request limit. What would be the best way to do this?
This is where I make the request:
# return the json
def returnJSONQuestion(id):
    url = 'http://someApi.com?index_id={0}&output=json'
    format_url = url.format(id)
    jsondata = None  # make sure there is always something to return if the request fails
    try:
        urlobject = urllib2.urlopen(format_url)
        jsondata = json.loads(urlobject.read().decode("utf-8"))
        print jsondata
        shortRandomSleep()
    except urllib2.URLError, e:
        print e.reason
    except (json.decoder.JSONDecodeError, ValueError):
        print 'Decode JSON has failed'
    return jsondata
I usually use a cheap hack where I make the script run every other minute by checking the current time. This is the general form of the function:
def minuteMod(x, p=0):
    import datetime
    minute = datetime.datetime.now() + datetime.timedelta(seconds=15)
    minute = int(datetime.datetime.strftime(minute, "%M"))
    if minute % x == p:
        return True
    return False
p is the remainder here and defaults to 0, so there is no particular need to pass in the second argument.
So basically, if you want your script to run only every other minute, you use it like this:
def returnJSONQuestion(id):
    if not minuteMod(2):
        return None or ''
    # rest of the code
This will skip the request whenever the current minute is not even. Since this is not the best way to do things, you can also use the function to decide when to serve cached results (depending on whether caching is allowed). So basically, you would do something like this:
def returnJSONQuestion(id):
    if minuteMod(3):  # current minute is a multiple of 3
        return jsonFromCache  # open a file and output cached contents
    else:
        url = 'http://...'
        storeJSONToFile(url)
        return json  # return the freshly fetched JSON
You could use a token bucket algorithm, something like this: http://code.activestate.com/recipes/511490/
Have tokens added to the bucket at the rate the API allows you to make requests, and take a token from the bucket each time you make a request.
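To make the idea concrete, here is a minimal token-bucket sketch (not the linked recipe; the rate and capacity values are placeholders you would tune to your hourly limit):

import time

class TokenBucket(object):
    def __init__(self, rate, capacity):
        self.rate = rate          # tokens added per second
        self.capacity = capacity  # maximum burst size
        self.tokens = capacity
        self.last = time.time()

    def consume(self):
        # refill based on elapsed time, then take one token,
        # sleeping until a token is available if the bucket is empty
        while True:
            now = time.time()
            self.tokens = min(self.capacity, self.tokens + (now - self.last) * self.rate)
            self.last = now
            if self.tokens >= 1:
                self.tokens -= 1
                return
            time.sleep((1 - self.tokens) / self.rate)

# e.g. an hourly limit of 3600 requests is one token per second
bucket = TokenBucket(rate=3600 / 3600.0, capacity=10)
# bucket.consume()  # call this before each returnJSONQuestion(id)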
Related
I have an AJAX function that is posting data in intervals of 25 seconds to the class that you see below:
class StateInfo:
    flag = None

    def post(self):
        data = request.get_data()
        info = json.loads(data)
Now what I want to achieve is to set the flag variable to 0 when no POST requests arrive within 30 seconds of each other. I know that there is .elapsed, but it returns the time delta between request and response.
As I already said in the comments, my naive approach would be to store a timestamp for the last POST request made, then simply check whether that timestamp is older than 30 seconds.
from multiprocessing import Process
from datetime import timedelta, datetime

class StateInfo:
    flag = None
    ts = None

    def post(self):
        self.ts = datetime.now()
        data = request.get_data()
        info = json.loads(data)

    def check_state(self):
        # reset the flag once the last post is more than 30 seconds old
        if self.ts and self.ts < datetime.now() - timedelta(seconds=30):
            self.flag = 0

if __name__ == "__main__":
    state_info = StateInfo()
    proc = Process(target=state_info.check_state)  # pass the method, don't call it
    proc.start()
Note that I've added multiprocessing since you need a second process that runs the check alongside the rest of your code. You will probably also have to repeat the check rather than run it only once.
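If the check needs to repeat, a simple polling loop could look like the sketch below (the one-second poll interval is just an assumption; pick whatever granularity you need). Also note that a separate process gets its own copy of state_info, so a threading.Thread may be the simpler choice if the flag has to be shared in memory.

import time
from datetime import datetime, timedelta

def watch_state(state_info, threshold=30, poll=1):
    # keep resetting the flag whenever the last post is older than `threshold` seconds
    while True:
        if state_info.ts and state_info.ts < datetime.now() - timedelta(seconds=threshold):
            state_info.flag = 0
        time.sleep(poll)

# proc = Process(target=watch_state, args=(state_info,))
# proc.start()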
I hope this gives you a good idea of how you can achieve this. Also take a look at the suggestion from @match in the comments.
Here is a bit of context: I have a program that gets data from an API. It does this in two stages: one request for the total set of points, and then a request for each point in the data. The results get appended into an array.
def fetch_details(url: str):
    response = requests.get(url)
    # Makes request call to get the data of detail
    # save_file(folder_path,GipodId,text2)
    # any other processes
    return response.json()

def fetch_data_points(url: str):
    limit_request = 1000
    # Placeholder for limit: please do not remove = 1000000000 -JJ
    folder_path_reset("api_request_jsons","csv","Geographic_information")
    total_start_time = start_time_measure()
    start_time = start_time_measure(
        'Starting Phase 1: First request from API: Data Points')
    response = requests.get(url, params={"limit": limit_request})
    end_time = end_time_measure(start_time, "Request completed: ")
    print(len(response.json()))
    time_estimated = end_time/len(response.json())
    print(time_estimated)
    end_time_measure(total_start_time, "End of Phase 1, completed in: ")
    return response.json()

def fetch_details_of_data_points(url: str):
    input_json = fetch_data_points(url)
    fetch_points_save(input_json)
    all_location_points_details = []
    amount_of_objects = len(input_json)
    total_start_time = start_time_measure()
    start_time = start_time_measure(f'Starting Phase 2: Second request from API: {str(amount_of_objects)} requested')
    #for i in tqdm(range(amount_of_objects),miniters=0.000000001):
    #    for obj in input_json:
    #        all_location_points_details.append(fetch_details(obj.get("detail")))
    with tqdm(total=amount_of_objects) as pbar:
        for obj in input_json:
            all_location_points_details.append(fetch_details(obj.get("detail")))
            pbar.update(1)
However, I have noticed a flaw in my program that I may have a solution for, but I do not know how to implement it. When the amount of data requested is massive (more than 10,000 points), a disconnect can always happen, causing my program to fail. So as a solution I would like this loop:
with tqdm(total=amount_of_objects) as pbar:
    for obj in input_json:
        all_location_points_details.append(fetch_details(obj.get("detail")))
        pbar.update(1)
to be split into x sessions (iterated over by a counter i), where x is calculated as follows:
y = 1000
x = ceil(amount of objects / y)  --> ceil, because this needs to be rounded up no matter what.
So let's say I have 145862 objects to request details for; by my formula that is 145.862, rounded up to 146 sessions.
So session 1 requests the first 1000 objects, starting at obj 1 and ending at obj 1000. The next session starts requesting from obj 1001, the one after that from obj 2001, and so on.
So this is technically something like this (pseudocode):
i = 0
while i < x:
    for obj (starting at object i*y + 1, ending at object (i+1)*y) in input_json:
        all_location_points_details.append(fetch_details(obj.get("detail")))
    i += 1
The thing is, I do not know how to program this part. Can anyone help me with this? Something like the sketch below is what I am trying to get to.
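A rough sketch of the batched loop (fetch_details, input_json and tqdm come from the code above; the batch size and the bare ConnectionError retry are assumptions, not a tested solution):

import math
import time
import requests
from tqdm import tqdm

BATCH_SIZE = 1000  # y in the formula above

def fetch_in_batches(input_json, batch_size=BATCH_SIZE):
    all_location_points_details = []
    amount_of_objects = len(input_json)
    sessions = math.ceil(amount_of_objects / batch_size)  # x = ceil(objects / y)
    with tqdm(total=amount_of_objects) as pbar:
        for i in range(sessions):
            batch = input_json[i * batch_size:(i + 1) * batch_size]
            for obj in batch:
                try:
                    all_location_points_details.append(fetch_details(obj.get("detail")))
                except requests.exceptions.ConnectionError:
                    time.sleep(5)  # crude back-off, then retry this point once
                    all_location_points_details.append(fetch_details(obj.get("detail")))
                pbar.update(1)
    return all_location_points_details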
I have a question about rate limits.
I take data from a CSV file, use it in the query, and store the output in a list.
I get an error because I make too many requests at once
(I can only make 20 requests per second). How can I keep within the rate limit?
import requests
import pandas as pd

df = pd.read_csv("Data_1000.csv")
summoner_ids = []  # renamed from `list` so the built-in isn't shadowed

def requestSummonerData(summonerName, APIKey):
    URL = "https://euw1.api.riotgames.com/lol/summoner/v3/summoners/by-name/" + summonerName + "?api_key=" + APIKey
    response = requests.get(URL)
    return response.json()

def main():
    APIKey = str(input('Copy and paste your API Key here: '))
    for index, row in df.iterrows():
        summonerName = row['Player_Name']
        responseJSON = requestSummonerData(summonerName, APIKey)
        ID = int(responseJSON['accountId'])
        summoner_ids.insert(index, ID)
    df["accountId"] = summoner_ids
If you already know you can only make 20 requests per second, you just need to work out how long to wait between each request:
Divide 1 second by 20, which gives you 0.05. So you just need to sleep for 0.05 seconds between each request and you shouldn't hit the limit (maybe increase it a bit if you want to be safe).
Add import time at the top of your file and then call time.sleep(0.05) inside your for loop (you could also just write time.sleep(1/20)).
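Applied to the loop above, that would look roughly like this (using the summoner_ids list from the snippet in the question):

import time

def main():
    APIKey = str(input('Copy and paste your API Key here: '))
    for index, row in df.iterrows():
        summonerName = row['Player_Name']
        responseJSON = requestSummonerData(summonerName, APIKey)
        summoner_ids.insert(index, int(responseJSON['accountId']))
        time.sleep(0.05)  # 1 second / 20 requests
    df["accountId"] = summoner_ids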
I have the following program to scrape data from a website. I want to improve the code below by using a generator with yield instead of calling generate_url and callme multiple times sequentially. The purpose of this exercise is to properly understand yield and the contexts in which it can be used.
import requests
import shutil

start_date = '03-03-1997'
end_date = '10-04-2015'
yf_base_url = 'http://real-chart.finance.yahoo.com/table.csv?s=%5E'
index_list = ['BSESN', 'NSEI']

def generate_url(index, start_date, end_date):
    s_day = start_date.split('-')[0]
    s_month = start_date.split('-')[1]
    s_year = start_date.split('-')[2]
    e_day = end_date.split('-')[0]
    e_month = end_date.split('-')[1]
    e_year = end_date.split('-')[2]
    if (index == 'BSESN') or (index == 'NSEI'):
        url = yf_base_url + index + '&a={}&b={}&c={}&d={}&e={}&f={}'.format(s_day, s_month, s_year, e_day, e_month, e_year)
        return url

def callme(url, index):
    print('URL {}'.format(url))
    r = requests.get(url, verify=False, stream=True)
    if r.status_code != 200:
        print("Failure!!")
        exit()
    else:
        r.raw.decode_content = True
        with open(index + "file.csv", 'wb') as f:
            shutil.copyfileobj(r.raw, f)
        print("Success")

if __name__ == '__main__':
    url = generate_url(index_list[0], start_date, end_date)
    callme(url, index_list[0])
    url = generate_url(index_list[1], start_date, end_date)
    callme(url, index_list[1])
There are multiple options. You could use yield to iterate over URLs, or over request objects.
If your index_list were long, I would suggest yielding URLs.
Because then you could use multiprocessing.Pool to map a function that does a request and saves the output over these URLs. That would execute them in parallel, potentially making it a lot faster (assuming that you have enough network bandwidth, and that yahoo finance doesn't throttle connections).
import multiprocessing
import shutil
import requests

yf = ('http://real-chart.finance.yahoo.com/table.csv?s=%5E'
      '{}&a={}&b={}&c={}&d={}&e={}&f={}')
index_list = ['BSESN', 'NSEI']

def genurl(symbols, start_date, end_date):
    # assemble the URLs
    s_day, s_month, s_year = start_date.split('-')
    e_day, e_month, e_year = end_date.split('-')
    for s in symbols:
        url = yf.format(s, s_day, s_month, s_year, e_day, e_month, e_year)
        yield url

def download(url):
    # do the request, save the file (named after the symbol part of the URL)
    r = requests.get(url, verify=False, stream=True)
    r.raw.decode_content = True
    with open(url.split('s=%5E')[1].split('&')[0] + 'file.csv', 'wb') as f:
        shutil.copyfileobj(r.raw, f)

if __name__ == '__main__':
    p = multiprocessing.Pool()
    rv = p.map(download, genurl(index_list, '03-03-1997', '10-04-2015'))
If I understand you correctly, what you want to know is how to change the code so that you can replace the last part by
if __name__ == '__main__':
    for url, index in generate_url(index_list, start_date, end_date):
        callme(url, index)
If this is correct, you need to change generate_url, but not callme. Changing generate_url is rather mechanical: make the first parameter index_list instead of index, wrap the function body in a for index in index_list loop, and change return url to yield url, index (so callme still knows which index each URL belongs to).
You don't need to change callme, because you never want to say something like for call in callme(...); you only ever use it as a normal function call.
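A minimal sketch of that change, reusing the names from the question (yf_base_url, index_list, start_date, end_date, callme):

def generate_url(index_list, start_date, end_date):
    s_day, s_month, s_year = start_date.split('-')
    e_day, e_month, e_year = end_date.split('-')
    for index in index_list:
        if index in ('BSESN', 'NSEI'):
            url = yf_base_url + index + '&a={}&b={}&c={}&d={}&e={}&f={}'.format(
                s_day, s_month, s_year, e_day, e_month, e_year)
            yield url, index

if __name__ == '__main__':
    for url, index in generate_url(index_list, start_date, end_date):
        callme(url, index)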
I've got a Python CGI script that pulls data from a GPS service; I'd like this information to be updated on the webpage about once every 10s (the max allowed by the GPS service's TOS). But there could be, say, 100 users viewing the webpage at once, all calling the script.
I think the users' scripts need to grab data from a buffer page that itself only updates once every ten seconds. How can I make this buffer page auto-update if no one is directly viewing the content (and not accessing the CGI)? Are there better ways to accomplish this?
Cache the results of your GPS data query in a file or database (sqlite) along with a datetime.
You can then do a datetime check against the last cached datetime to initiate another GPS data query.
You'll probably run into concurrency issues with cgi and the datetime check though...
To get around concurrency issues, you can use sqlite, and put the write in a try/except.
Here's a sample cache implementation using sqlite.
import datetime
import sqlite3

class GpsCache(object):
    db_path = 'gps_cache.db'

    def __init__(self):
        self.con = sqlite3.connect(self.db_path)
        self.cur = self.con.cursor()

    def _get_period(self, dt=None):
        '''normalize time to 15 minute periods'''
        if dt is None:
            dt = datetime.datetime.now()
        if dt.minute < 15:
            minute_period = 0
        elif 15 <= dt.minute < 30:
            minute_period = 15
        elif 30 <= dt.minute < 45:
            minute_period = 30
        else:
            minute_period = 45
        period_dt = datetime.datetime(year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, minute=minute_period)
        return period_dt

    def get_cache(self, dt=None):
        period_dt = self._get_period(dt)
        select_sql = 'SELECT * FROM GPS_CACHE WHERE date_time = "%s";' % period_dt.strftime('%Y-%m-%d %H:%M')
        self.cur.execute(select_sql)
        row = self.cur.fetchone()
        return row[0] if row else None

    def put_cache(self, dt=None, data=None):
        period_dt = self._get_period(dt)
        insert_sql = 'INSERT ....'  # edit to your table structure
        try:
            self.cur.execute(insert_sql)
            self.con.commit()
        except sqlite3.OperationalError:
            # assume the db is being updated by another process with the current results and ignore
            pass
So that's the cache tool; now for the implementation side.
You'll want to check the cache first; if it's not 'fresh' (doesn't return anything), go grab the data using your current method, then cache the data you grabbed.
You should probably organize this better, but it should give you the general idea.
Using this sample, you just replace your current calls to 'remote_get_gps_data' with 'get_gps_data'.
import datetime

from gps_cacher import GpsCache

def remote_get_gps_data():
    # your function here
    return data

def get_gps_data():
    data = None
    gps_cache = GpsCache()
    current_dt = datetime.datetime.now()
    cached_data = gps_cache.get_cache(current_dt)
    if cached_data:
        data = cached_data
    else:
        data = remote_get_gps_data()
        gps_cache.put_cache(current_dt, data)
    return data