Can we put condition on REST API - python

I was taking a HackerRank test where I got this problem.
The problem was to find the number of football matches that are draws, i.e. data[index]['team1goals'] == data[index]['team2goals'].
Here is the API you can play with: https://jsonmock.hackerrank.com/api/football_matches?year=2011&page=1
This is what I tried:
import requests

year = 2011
draw = 0
r = requests.get('https://jsonmock.hackerrank.com/api/football_matches?year=' + str(year) + '&page=1').json()
total_pages = r['total_pages']
per_page = r['per_page']
for page in range(1, total_pages + 1):
    r = requests.get('https://jsonmock.hackerrank.com/api/football_matches?year=' + str(year) + '&page=' + str(page)).json()
    try:
        for i in range(0, per_page):
            if int(r['data'][i]['team1goals']) == int(r['data'][i]['team2goals']):
                draw += 1
    except:
        pass
print(draw)  # 516
It gives me the correct answer, but since the data is large it takes too long, which I want to avoid.
Is it possible to put a condition into the REST API call, something like this:
https://jsonmock.hackerrank.com/api/football_matches?year=2011&team1goals==team2goals&page=1
OR
https://jsonmock.hackerrank.com/api/football_matches?year=2011&team1goals-gt-lt&team2goals&page=1

If the API allows this many calls, you can use a multiprocessing.pool.Pool and iterate over the pages in parallel to reduce the time. This should work:
import requests
from functools import partial
from multiprocessing.pool import Pool

def loop(page, year, r, per_page):
    r = requests.get('https://jsonmock.hackerrank.com/api/football_matches?year=' + str(year) + '&page=' + str(page)).json()
    increase = 0  # number of draws found on this page
    try:
        for i in range(0, per_page):
            if int(r['data'][i]['team1goals']) == int(r['data'][i]['team2goals']):
                increase += 1
    except (KeyError, IndexError):
        pass  # the last page may contain fewer than per_page matches
    return increase

if __name__ == "__main__":
    year = 2011
    draw = []
    r = requests.get('https://jsonmock.hackerrank.com/api/football_matches?year=' + str(year) + '&page=1').json()
    total_pages = r['total_pages']
    per_page = r['per_page']
    pages = range(1, total_pages + 1)
    pool = Pool()
    f = pool.map(partial(loop, year=year, r=r, per_page=per_page), pages)
    draw += f
    final = 0
    for x in draw:
        x = int(x)
        final += x
    print(final)  # 516

You should use multithreading and make multiple requests in parallel.
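A minimal sketch of that idea (not from the original answer), assuming the same endpoint and response fields as in the question; the helper names and the max_workers value are illustrative choices:

import requests
from concurrent.futures import ThreadPoolExecutor

URL = 'https://jsonmock.hackerrank.com/api/football_matches'

def draws_on_page(year, page):
    # fetch one page and count the matches where both teams scored the same
    data = requests.get(URL, params={'year': year, 'page': page}).json()['data']
    return sum(int(m['team1goals']) == int(m['team2goals']) for m in data)

def count_draws(year):
    first = requests.get(URL, params={'year': year, 'page': 1}).json()
    pages = range(1, first['total_pages'] + 1)
    with ThreadPoolExecutor(max_workers=8) as ex:
        return sum(ex.map(lambda p: draws_on_page(year, p), pages))

print(count_draws(2011))  # 516 according to the question

The requests are I/O-bound, so threads overlap the network latency even with the GIL.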

You can do it another way.
import requests

def getNumDraws(year):
    counter = 0
    for z in range(0, 10):
        # 10 - maximum goals, it's in the description of this task
        link = f"https://jsonmock.hackerrank.com/api/football_matches?year={year}&team1goals={z}&team2goals={z}"
        r = requests.get(link)
        counter = counter + int(r.json()['total'])
    return counter
Steps:
Get ['total'] from the link where you add &team1goals={variable}&team2goals={variable}.
Add that ['total'] to your counter.
Repeat 10 times, because the task description says you can assume at most 10 goals are scored.
So you call the API only 10 times.
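For example, checking against the count reported in the question (516 draws for 2011):

print(getNumDraws(2011))  # expected to print 516, per the question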

Python iterate over each 100 elements

I don't know if this is a good way to optimize, but basically I am using Python inside a 3D app to create a random color per object. The code I have works well on objects of up to about 10k polygons, but it crashes at 100k polygons. Is there a way to process the loop in chunks? Basically I have the for loop and an if statement that filters the first 100 elements, but then I need the next 100, then another 100, and so on. How can I write that? Maybe with a time.sleep between each chunk. It won't be faster, but at least it might not crash the program. Thanks.
for i, n in enumerate(uvShellIds):
    # code can only perform well within sets of 100 elements
    limit = 100  # ?
    if 0 <= i <= 100:
        # do something
        print(n)
    # now I need it to work on a new set of 100 elements
    # if 101 <= i <= 200:
    # (...keep going between sets of 100...)
My current code:
import maya.OpenMaya as om
import maya.cmds as cmds
import random

def getUvShelList(name):
    selList = om.MSelectionList()
    selList.add(name)
    selListIter = om.MItSelectionList(selList, om.MFn.kMesh)
    pathToShape = om.MDagPath()
    selListIter.getDagPath(pathToShape)
    meshNode = pathToShape.fullPathName()
    uvSets = cmds.polyUVSet(meshNode, query=True, allUVSets=True)
    allSets = []
    for uvset in uvSets:
        shapeFn = om.MFnMesh(pathToShape)
        shells = om.MScriptUtil()
        shells.createFromInt(0)
        # shellsPtr = shells.asUintPtr()
        nbUvShells = shells.asUintPtr()
        uArray = om.MFloatArray()    # array for U coords
        vArray = om.MFloatArray()    # array for V coords
        uvShellIds = om.MIntArray()  # the container for the uv shell Ids
        shapeFn.getUVs(uArray, vArray)
        shapeFn.getUvShellsIds(uvShellIds, nbUvShells, uvset)
        # shellCount = shells.getUint(shellsPtr)
        shells = {}
        for i, n in enumerate(uvShellIds):
            # print(i, n)
            limit = 100
            if i <= limit:
                if n in shells:
                    # shells[n].append([uArray[i],vArray[i]])
                    shells[n].append('%s.map[%i]' % (name, i))
                else:
                    # shells[n] = [[uArray[i],vArray[i]]]
                    shells[n] = ['%s.map[%i]' % (name, i)]
        allSets.append({uvset: shells})
    for shell in shells:
        selection_shell = shells.get(shell)
        cmds.select(selection_shell)
        # print(shells.get(shell))
        facesSel = cmds.polyListComponentConversion(fromUV=True, toFace=True)
        cmds.select(facesSel)
        r = [random.random() for i in range(3)]
        cmds.polyColorPerVertex(facesSel, rgb=(r[0], r[1], r[2]), cdo=1)
        cmds.select(deselect=1)

getUvShelList('polySurface359')
You can use islice from itertools to chunk.
from itertools import islice

uvShellIds = list(range(1000))
iterator = iter(uvShellIds)

while True:
    chunk = list(islice(iterator, 100))
    if not chunk:
        break
    print(chunk)  # chunk contains 100 elements you can process
I don't know how well it fits into your current code, but below is how you can process the chunks:
from itertools import islice

uvShellIds = list(range(1000))
iterator = iter(uvShellIds)
offset = 0

while True:
    chunk = list(islice(iterator, 100))
    if not chunk:
        break
    # Processing chunk items
    for i, n in enumerate(chunk):
        # offset + i will give you the right index referring to the uvShellIds variable
        # Then, perform your actions
        if n in shells:
            # shells[n].append([uArray[i],vArray[i]])
            shells[n].append('%s.map[%i]' % (name, offset + i))
        else:
            # shells[n] = [[uArray[i],vArray[i]]]
            shells[n] = ['%s.map[%i]' % (name, offset + i)]
    offset += 100
    # Your sleep can come here
The snippet above should replace your for i, n in enumerate(uvShellIds): block.
As @David Culbreth's answer stated, I'm not sure the sleep will be of help, but I left a comment on where you can place it.
I use this generator to "chunkify" my long-running operations in python into smaller batches:
def chunkify_list(items, chunk_size):
    for i in range(0, len(items), chunk_size):
        yield items[i:i+chunk_size]
With this defined, you can write your program something like this:
items = [1,2,3,4,5 ...]
for chunk in chunkify_list(items, 100):
    for item in chunk:
        process_item(item)
    sleep(delay)
Now, I'm not going to guarantee that sleep will actually solve your problems, but this lets you handle your data one chunk at a time.

I'd like to cross-calculate the two formulas over and over again in Python

Q_optimal = ((np.array(demand_lt_std)**2 + np.array(Q_safety_stock)**2))**(1/2) + ((np.array(demand_lt_std)**2 + np.array(Q_safety_stock)**2 + (2*np.array(order_cost)*np.array(demand_lt_avg)/np.array(carrying_cost))))
# get optimal value
while 1:
    # new safety stock
    new_safety_stock = ((np.array(demand_lt_std))**2/(4*beta*np.array(Q_optimal))) - (beta*np.array(Q_optimal))
    new_safety_stock[np.isnan(new_safety_stock)] = 0
    new_safety_stock = new_safety_stock.tolist()
    # delete 0
    for i in range(len(order_cost)):
        if new_safety_stock[i] < 0:
            pos_1 = np.where(np.array(new_safety_stock) < 0)[0]
            for i in pos_1:
                new_safety_stock[i] = 0
    new_safety_stock = np.array(new_safety_stock)
    Q_a_result = (np.array(demand_lt_std)**2 + np.array(Q_safety_stock)**2)**0.5
    Q_b_result = (2*np.array(order_cost)*np.array(demand_lt_avg))/np.array(carrying_cost)
    Q_c_result = (np.array(demand_lt_std)**2 + np.array(Q_safety_stock)**2 + Q_b_result)**0.5
    Q_d_result = Q_a_result + Q_c_result
    # new Q
    new_Q = Q_d_result
    loop += 1
Above is all the code.
First I calculate new_safety_stock using Q_optimal, and from that I get new_Q.
After that, I want to keep alternating: use new_Q to get a new safety stock, then use that safety stock to get the next new_Q, and repeat this 10,000 times. For the first iteration I have no choice but to use Q_optimal to get new_safety_stock, but I don't know how to make the loop use new_Q on every iteration after that.
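A minimal sketch of the alternating update (an assumption about the intended structure, not code from the question), assuming demand_lt_std, demand_lt_avg, order_cost, carrying_cost, Q_safety_stock and Q_optimal are already defined as above and beta is a scalar. Note that here the newly computed safety stock feeds into Q, which is what the question describes, whereas the loop above still used Q_safety_stock there:

import numpy as np

# convert the inputs once instead of wrapping them on every iteration
demand_lt_std = np.asarray(demand_lt_std, dtype=float)
demand_lt_avg = np.asarray(demand_lt_avg, dtype=float)
order_cost = np.asarray(order_cost, dtype=float)
carrying_cost = np.asarray(carrying_cost, dtype=float)

Q = np.asarray(Q_optimal, dtype=float)  # the first pass uses Q_optimal
for loop in range(10000):
    # new safety stock from the current Q
    safety_stock = demand_lt_std**2 / (4 * beta * Q) - beta * Q
    safety_stock = np.nan_to_num(safety_stock)     # NaN -> 0
    safety_stock = np.clip(safety_stock, 0, None)  # negative -> 0
    # new Q from the new safety stock
    Q_a = (demand_lt_std**2 + safety_stock**2) ** 0.5
    Q_b = 2 * order_cost * demand_lt_avg / carrying_cost
    Q_c = (demand_lt_std**2 + safety_stock**2 + Q_b) ** 0.5
    Q = Q_a + Q_c  # this Q is reused by the next pass through the loop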

Python multiprocessing: how to create x number of processes and get return value back

I have a program that I created using threads, but then I learned that Python threads don't run CPU-bound work in parallel (because of the GIL), while separate processes do. As a result, I am trying to rewrite the program using multiprocessing, but I am having a hard time doing so. I have tried following several examples that show how to create processes and pools, but I don't think it's exactly what I want.
Below is my code with the attempts I have tried. The program estimates the value of pi by randomly placing points on a graph that contains a circle. It takes two command-line arguments: one is the number of threads/processes I want to create, and the other is the total number of points to try placing on the graph (N).
import math
import sys
from time import time
import concurrent.futures
import random
import multiprocessing as mp

def myThread(arg):
    # Take care of input argument
    n = int(arg)
    print("Thread received. n = ", n)
    # main calculation loop
    count = 0
    for i in range(0, n):
        x = random.uniform(0, 1)
        y = random.uniform(0, 1)
        d = math.sqrt(x * x + y * y)
        if (d < 1):
            count = count + 1
    print("Thread found ", count, " points inside circle.")
    return count
# end myThread

# receive command line arguments
if (len(sys.argv) == 3):
    N = sys.argv[1]  # original ex: 0.01
    N = int(N)
    totalThreads = sys.argv[2]
    totalThreads = int(totalThreads)
    print("N = ", N)
    print("totalThreads = ", totalThreads)
else:
    print("Incorrect number of arguments!")
    sys.exit(1)

if ((totalThreads == 1) or (totalThreads == 2) or (totalThreads == 4) or (totalThreads == 8)):
    print()
else:
    print("Invalid number of threads. Please use 1, 2, 4, or 8 threads.")
    sys.exit(1)

# start experiment
t = int(time() * 1000)  # begin run time
total = 0

# ATTEMPT 1
# processes = []
# for i in range(totalThreads):
#     process = mp.Process(target=myThread, args=(N/totalThreads))
#     processes.append(process)
#     process.start()
# for process in processes:
#     process.join()

# ATTEMPT 2
# pool = mp.Pool(mp.cpu_count())
# total = pool.map(myThread, [N/totalThreads])

# ATTEMPT 3
# for i in range(totalThreads):
#     total = total + pool.map(myThread, [N/totalThreads])
#     p = mp.Process(target=myThread, args=(N/totalThreads))
#     p.start()

# ATTEMPT 4
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     for i in range(totalThreads):
#         future = executor.submit(myThread, N/totalThreads)  # start thread
#         total = total + future.result()  # get result

# analyze results
pi = 4 * total / N
print("pi estimate =", pi)
delta_time = int(time() * 1000) - t  # calculate time required
print("Time =", delta_time, " milliseconds")
I thought that a loop from 0 to totalThreads that creates a process on each iteration would work. I also wanted to pass in N/totalThreads (to divide the work up), but it seems that pool.map takes an iterable rather than a single argument to pass to the function.
What am I missing with multiprocessing? Is it even possible to do what I want with processes?
Thank you in advance for any help, it is greatly appreciated :)
I have simplified your code and used some hard-coded values which may or may not be reasonable.
import math
import concurrent.futures
import random
from datetime import datetime

def myThread(arg):
    count = 0
    for i in range(0, arg[0]):
        x = random.uniform(0, 1)
        y = random.uniform(0, 1)
        d = math.sqrt(x * x + y * y)
        if (d < 1):
            count += 1
    return count

N = 10_000
T = 8
_start = datetime.now()
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(myThread, (int(N / T),)): _ for _ in range(T)}
    total = 0
    for future in concurrent.futures.as_completed(futures):
        total += future.result()
_end = datetime.now()
print(f'Estimate for PI = {4 * total / N}')
print(f'Run duration = {_end-_start}')
A typical output on my machine looks like this:-
Estimate for PI = 3.1472
Run duration = 0:00:00.008895
Bear in mind that the number of threads you start is effectively managed by the ThreadPoolExecutor (when constructed with no parameters): it decides how many worker threads to run based on your machine's processing capacity (number of cores, etc.). So you could, if you really wanted to, set T to a very high number, and the executor would simply queue the extra tasks until a worker thread becomes free.
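Since the original question was about processes rather than threads, a small variation (not from the answer above) is to swap in concurrent.futures.ProcessPoolExecutor, which sidesteps the GIL for CPU-bound work; the worker has to live at module level and the pool should be created under the __main__ guard. A sketch reusing the myThread function above, with the same hard-coded N and T:

import concurrent.futures

if __name__ == '__main__':
    N, T = 10_000, 8
    with concurrent.futures.ProcessPoolExecutor(max_workers=T) as executor:
        # one chunk of N/T points per process; map returns the counts in order
        counts = executor.map(myThread, [(int(N / T),)] * T)
        total = sum(counts)
    print(f'Estimate for PI = {4 * total / N}')

For a point count this small the process start-up cost usually outweighs the gain, so it only pays off for much larger N.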

How can I optimize the groupby.apply(function) in Python?

I have a function that uses collections.deque to track daily stock-in based on FIFO. An order is fulfilled if possible and subtracted from stock accordingly. I call the function via groupby.apply(my_function).
I am struggling with where to place the second loop. Both loops work properly when run on their own, but I cannot get them working combined.
The dataset is about 1.5 million rows.
Thanks.
from collections import deque
import numpy as np

# df_fin is an existing DataFrame
DOS = 7
WIP = 1
df_fin['list_stock'] = 0
df_fin['stock_new'] = 0

def create_stocklist(x):
    x['date_diff'] = x['dates'] - x['dates'].shift()
    x['date_diff'] = x['date_diff'].fillna(0)
    x['date_diff'] = (x['date_diff'] / np.timedelta64(1, 'D')).astype(int)
    x['list_stock'] = x['list_stock'].astype(object)
    x['stock_new'] = x['stock_new'].astype(object)
    var_stock = DOS * [0]
    sl = deque([0], maxlen=DOS)
    for i in x.index:
        order = x['order_bin'][i]
        if x['date_diff'][i] > 0:
            for p in range(0, x['date_diff'][i]):
                if p == WIP:
                    sl.appendleft(x.return_bin[i-1])
                else:
                    sl.appendleft(0)
            sl_list = list(sl)
            sl_list.reverse()
            new_list = []
        # from here the loop does not work as I wanted it to work.
        # I want to loop over the created sl_list
        # and then start the loop above with the outcome of the loop below.
        for elem in sl_list:
            while order > 0:
                val = max(0, elem - order)
                order = abs(min(0, elem - order))
                new_list.append(val)
                break
            else:
                new_list.append(elem)
        new_list.reverse()
        x.at[i, 'list_stock'] = new_list
        sl = deque(new_list)
    return x

df_fin.groupby(by=['ID']).apply(create_stocklist)
You do not have access to sl_list inside the second loop; you should define it in the enclosing scope, for example right at the top of the outer for loop over x.index:
for i in x.index:
    # define it just here
    sl_list = []
    order = x['order_bin'][i]
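For what it's worth, here is a small self-contained sketch (not from the answer) of what the allocation step in the second loop is trying to do: walk the FIFO stock buckets, oldest first, and subtract the order from them. The function name, ordering convention, and example numbers are made up:

def allocate(order, stock):
    # subtract `order` from the FIFO stock buckets (oldest first)
    new_stock = []
    for bucket in stock:
        taken = min(bucket, order)  # how much this bucket can supply
        order -= taken
        new_stock.append(bucket - taken)
    return new_stock, order         # remaining stock, unfulfilled order

stock, shortfall = allocate(5, [2, 1, 0, 4, 0, 0, 3])
print(stock, shortfall)  # [0, 0, 0, 2, 0, 0, 3] 0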

Parallelization/multiprocessing of conditional for loop

I want to use multiprocessing in Python to speed up a while loop.
More specifically:
I have a matrix (samples x features). I want to select x subsets of samples whose values at a random subset of features are all unequal to a certain value (-1 in this case).
My serial code:
import numpy as np
import pandas as pd

np.random.seed(43)

datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows=89)

no_feat = 500
no_samp = 5
no_trees = 5
i = 0
iter = 0

samples = np.zeros((no_trees, no_samp))
features = np.zeros((no_trees, no_feat))

while i < no_trees:
    rand_feat = np.random.choice(df.shape[1], no_feat, replace=False)
    iter_order = np.random.choice(df.shape[0], df.shape[0], replace=False)
    samp_idx = []
    a = 0
    #--------------
    # how to run in parallel?
    for j in iter_order:
        pot_samp = df.iloc[j, rand_feat]
        if len(np.where(pot_samp == -1)[0]) == 0:
            samp_idx.append(j)
        if len(samp_idx) == no_samp:
            print a
            break
        a += 1
    #--------------
    if len(samp_idx) == no_samp:
        samples[i, :] = samp_idx
        features[i, :] = rand_feat
        i += 1
    iter += 1
    if iter > 1000:  # break if subsets cannot be found
        break
Searching for fitting samples is the potentially expensive part (the for loop over j), which in theory could be run in parallel. In some cases it is not necessary to iterate over all samples to find a large enough subset, which is why I break out of the loop as soon as the subset is large enough.
I am struggling to find an implementation that lets the workers check how many valid results have been generated already. Is that even possible?
I have used joblib before. If I understand correctly, it uses the pool methods of multiprocessing as a backend, which only work for separate tasks? I am thinking that queues might be helpful, but so far I have failed at implementing them.
I found a working solution. I decided to run the while loop in parallel and have the different processes interact over a shared counter. Furthermore, I vectorized the search for suitable samples.
The vectorization gave a roughly 300x speedup, and running on 4 cores speeds up the computation about twofold.
First I tried to implement separate processes and put the results into a queue. It turns out queues aren't made to store large amounts of data.
If someone sees another bottleneck in this code, I would be glad if they pointed it out.
With my basically nonexistent knowledge of parallel computing I found it really hard to puzzle this together, especially since the examples on the internet are all very basic. I learnt a lot though =)
My code:
import numpy as np
import pandas as pd
import itertools
from multiprocessing import Pool, Lock, Value
from datetime import datetime
import settings
val = Value('i', 0)
worker_ID = Value('i', 1)
lock = Lock()
def findSamp(no_trees, df, no_feat, no_samp):
lock.acquire()
print 'starting worker - {0}'.format(worker_ID.value)
worker_ID.value +=1
worker_ID_local = worker_ID.value
lock.release()
max_iter = 100000
samp = []
feat = []
iter_outer = 0
iter = 0
while val.value < no_trees and iter_outer<max_iter:
rand_feat = np.random.choice(df.shape[1], no_feat, replace=False
#get samples with random features from dataset;
#find and select samples that don't have missing values in the random features
samp_rand = df.iloc[:,rand_feat]
nan_idx = np.unique(np.where(samp_rand == -1)[0])
all_idx = np.arange(df.shape[0])
notnan_bool = np.invert(np.in1d(all_idx, nan_idx))
notnan_idx = np.where(notnan_bool == True)[0]
if notnan_idx.shape[0] >= no_samp:
#if enough samples for random feature subset, select no_samp samples randomly
notnan_idx_rand = np.random.choice(notnan_idx, no_samp, replace=False)
rand_feat_rand = rand_feat
lock.acquire()
val.value += 1
#x = val.value
lock.release()
#print 'no of trees generated: {0}'.format(x)
samp.append(notnan_idx_rand)
feat.append(rand_feat_rand)
else:
#increase iter_outer counter if no sample subset could be found for random feature subset
iter_outer += 1
iter+=1
if iter >= max_iter:
print 'exiting worker{0} because iter >= max_iter'.format(worker_ID_local)
else:
print 'worker{0} - finished'.format(worker_ID_local)
return samp, feat
def initialize(*args):
global val, worker_ID, lock
val, worker_ID, lock = args
def star_findSamp(i_df_no_feat_no_samp):
return findSamp(*i_df_no_feat_no_samp)
if __name__ == '__main__':
np.random.seed(43)
datafile = '...'
df = pd.read_csv(datafile, sep=" ", nrows = 89)
df = df.fillna(-1)
df = df.iloc[:, 6:]
no_feat = 700
no_samp = 10
no_trees = 5000
startTime = datetime.now()
print 'starting multiprocessing'
ncores = 4
p = Pool(ncores, initializer=initialize, initargs=(val, worker_ID, lock))
args = itertools.izip([no_trees]*ncores, itertools.repeat(df), itertools.repeat(no_feat), itertools.repeat(no_samp))
result = p.map(star_findSamp, args)#, callback=log_result)
p.close()
p.join()
print '{0} sample subsets for tree training have been found'.format(val.value)
samples = [x[0] for x in result if x != None]
samples = np.vstack(samples)
features = [x[1] for x in result if x != None]
features = np.vstack(features)
print datetime.now() - startTime
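The core of the shared-counter idea above, distilled into a minimal Python 3 sketch with a dummy workload standing in for the subset search; the names, pool size, target, and the 10% hit rate are made up:

import random
from multiprocessing import Pool, Value, Lock

def init(shared_count, shared_lock):
    # make the shared objects visible inside each worker process
    global count, lock
    count, lock = shared_count, shared_lock

def worker(target):
    found = 0
    while True:
        with lock:
            if count.value >= target:  # enough results found globally -> stop
                break
        if random.random() < 0.1:      # stand-in for "found a valid subset"
            with lock:
                count.value += 1
            found += 1
    return found

if __name__ == '__main__':
    count, lock = Value('i', 0), Lock()
    with Pool(4, initializer=init, initargs=(count, lock)) as pool:
        per_worker = pool.map(worker, [20] * 4)
    print(per_worker, sum(per_worker))  # about 20 results in total (may overshoot slightly)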
