I am working on a stock market analysis project. I am attempting to find the highest price in the past 5 days, the volume on the day of the highest price, and how many days before that highest price occurred.
I have a working solution that uses a couple of for loops, but I would like to find a more efficient way to code this without them. Any suggestions would be appreciated.
A1 = pd.merge(A, B, left_index = True, right_index = True)
A1["Date"] = A1.index
A1.reset_index(inplace = True)
### 5 Day High and Volume
Indexes = []
for index in range(len(A1.index) - 5):
    M = 0
    H = 0
    for i in range(1, 6):
        if H < A1.iloc[i+index, 2]:
            H = A1.iloc[i+index, 2]
            M = i+index
    Indexes.append(M)
Vol = pd.DataFrame(columns = ['B','C'])
Vol5 = []
DH5 = []
Z = []
count = 0
for i in Indexes:
    Vol5.append(A1.iloc[i, 1])
    DH5.append(A1.iloc[i, 2])
    Z.append(count - i)
    count += 1
for i in range(5):
    Vol5.append(np.nan)
    DH5.append(np.nan)
    Z.append(np.nan)
Vol['B'] = Vol5
Vol.index = A1['Date']
Vol['C'] = DH5
Vol['D'] = Z
I suggest using the rolling method to find the index of the maximum value computed over the previous 5 rows:
import pandas as pd
import numpy as np
d={'date':np.random.random(10), 'open':np.random.random(10), 'high':np.random.random(10), 'low':np.random.random(10), 'close':np.random.random(10), 'volume':np.random.random(10)}
A1=pd.DataFrame(data=d)
df=A1.rolling(window=5).apply(np.argmax).shift(1).fillna(0)
Then, to find the volume associated with that maximum value (here for the high column), keep in mind that apply(np.argmax) returns the position of the maximum inside each 5-row window, so it still has to be offset by the start of that window before it can be used as a row position:
# after the shift, row r's window covers rows r-5 .. r-1, so its start is r-5
# (the first 5 rows have no complete window, so their values are placeholders)
abs_pos = (df['high'] + np.arange(len(A1)) - 5).clip(lower=0).astype(int)
A1['volume associated with maximum price'] = A1['volume'].to_numpy()[abs_pos]
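The question also asks for the 5-day high itself and for how many days before the current row it occurred, so here is a self-contained sketch along the same lines covering all three quantities. It assumes columns named high and volume, one row per trading day, and a trailing window that includes the current row (add .shift(1) as in the answer above if you want strictly the previous 5 days):
import numpy as np
import pandas as pd

# Toy frame standing in for A1; 'high' and 'volume' are assumed column names.
rng = np.random.default_rng(0)
A1 = pd.DataFrame({'high': rng.random(10), 'volume': rng.random(10)})

window = 5
# Position (0..4) of the highest 'high' inside each trailing 5-row window.
pos = A1['high'].rolling(window).apply(np.argmax, raw=True)
# Convert the in-window position to an absolute row number.
abs_idx = np.arange(len(A1)) - (window - 1) + pos

valid = abs_idx.notna()                      # the first window-1 rows have no full window
idx = abs_idx[valid].astype(int).to_numpy()

A1.loc[valid, 'high_5d'] = A1['high'].to_numpy()[idx]               # highest price in the window
A1.loc[valid, 'vol_at_high'] = A1['volume'].to_numpy()[idx]         # volume on the day of that high
A1.loc[valid, 'days_since_high'] = np.arange(len(A1))[valid] - idx  # rows since the high (0 = today)
print(A1)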
One of the rugby coaches at my school has asked me to code a conditional rugby match draw for the upcoming games. The task is laid out something like this: given a list of teams from 1-12 split into 3 groups ([Group1 = 1, 2, 3, 4], [Group2 = 5, 6, 7, 8], [Group3 = 9, 10, 11, 12]),
generate and print an 11-round round-robin matchup with the conditions that:
Teams in Group1 do NOT verse teams in Group3
Teams in Group1 verse every other team in Group1 twice (e.g. 1v2, 2v1, 1v3, 3v1, 1v4, 4v1, 1v5, 5v1, ...)
The same rule applies to teams in Group3 when they verse other teams in Group3
Teams in Group2 verse every other team once
Teams in Group1 and Group3 each need one bye game
I have attempted this multiple times but inevitably become stuck; below are my two attempts:
Attempt 1:
import operator
import functools
import random

### First Generation (Flawed unclean round robin)
def fixtures(teams):
    if len(teams) % 2:
        teams.append('Day off')  # if team number is odd - use 'day off' as fake team
    rotation = list(teams)  # copy the list
    random.shuffle(rotation)
    fixtures = []
    for i in range(0, len(teams) - 1):
        fixtures.append(rotation)
        rotation = [rotation[0]] + [rotation[-1]] + rotation[1:-1]
    return fixtures

def main():
    # demo code
    teams = ["Team1", "Team2", "Team3", "Team4", "Team5", "Team6", "Team7", "Team8", "Team9", "Team10", "Team11", "Team12"]
    groupA = ["Team1", "Team2", "Team3", "Team4"]
    groupB = ["Team5", "Team6", "Team7", "Team8"]
    groupC = ["Team9", "Team10", "Team11", "Team12"]
    # for one match each - use this block only
    matches = fixtures(teams)
    print("flawed matches:")
    RoundCounter = 0
    homeTeams = []
    awayTeams = []
    for f in matches:
        # print(f)
        homeTeams = f[::2]
        awayTeams = f[1::2]
        print("Home Teams:{}".format(homeTeams))
        print("Away Teams:{}".format(awayTeams))
        HomeTeamGroupA = set(homeTeams).intersection(groupA)
        HomeTeamGroupC = set(homeTeams).intersection(groupC)
        AwayTeamGroupA = set(awayTeams).intersection(groupA)
        AwayTeamGroupC = set(awayTeams).intersection(groupC)
        VSCounter = 0
        for p, o in zip(homeTeams, awayTeams):
            if p in HomeTeamGroupA:
                if o in AwayTeamGroupC:
                    AvsCPosition = awayTeams.index(o)
                    VSCounter += 1
                    RoundCleanUp(homeTeams, awayTeams, AvsCPosition, VSCounter)  # if this is returned begin cleaning the round
                else:
                    print("GroupA is versing either Group B or GroupA")  # if this is returned it is a team 1-4 but is vs either group b or group a
            elif p in HomeTeamGroupC:
                if o in AwayTeamGroupA:
                    AvsCPosition = awayTeams.index(o)
                    VSCounter += 1
                    RoundCleanUp(homeTeams, awayTeams, AvsCPosition, VSCounter)  # if this is returned begin cleaning the round
                else:
                    print("GroupC is versing either Group B or GroupC")  # if this is returned it is a team 9-12 but is vs either group b or group c
            else:
                pass

def RoundCleanUp(HTeam, ATeam, AvsCPos, VSCounter):
    ## gets Value of List at position
    HTeamVal = HTeam[AvsCPos]
    ATeamVal = ATeam[AvsCPos]

main()
Attempt 2:
import operator
import functools
import random

def make_round(rotation, num_teams, fixtures):
    for i in range(num_teams - 1):
        rotation = list(range(1, num_teams + 1))
        # clip to 0 .. num_teams - 2
        # if i == 0, no rotation is needed (and using -0 as list index will cause problems)
        i %= (num_teams - 1)
        if i:
            rotation = rotation[:1] + rotation[-i:] + rotation[1:-i]
        half = num_teams // 2
        fixtures.append(list(rotation[:half]))
        fixtures.append(list(rotation[half:][::-1]))
    return fixtures

def make_schedule(teams):
    """Produces RoundRobin"""
    # number of teams must be even
    TeamLength = len(teams)
    if TeamLength % 2:
        TeamLength += 1  # add a dummy team for padding
    # build first round-robin
    rotation = list(teams)
    Fixture = []
    schedule = make_round(rotation, TeamLength, Fixture)
    return schedule

def homeAwayRotation(matches):
    for homeTeams, awayTeams in zip(matches[0::2], matches[1::2]):
        print("Home Rotation: {}".format(homeTeams))
        print("Away Rotation: {}".format(awayTeams))
        validation(homeTeams, awayTeams)

def validation(homeTeams, awayTeams):
    groupA = [1, 2, 3, 4]
    groupC = [9, 10, 11, 12]
    for x, y in zip(homeTeams, awayTeams):
        if x in groupA:
            if y in groupC:
                AvsCPosition = awayTeams.index(y)
                cleanDirtyData(homeTeams, awayTeams, AvsCPosition)
            else:
                # if this is returned it is a team 1-4 but is vs either group b or group a
                print("Group A vsing either itself or GroupB\n")
        elif x in groupC:
            if y in groupA:
                AvsCPosition = awayTeams.index(y)
                cleanDirtyData(homeTeams, awayTeams, AvsCPosition)
            else:
                # if this is returned it is a team 9-12 but is vs either group b or group c
                print("Group C vsing either itself or GroupB\n")
        else:
            # if this is returned it is a team in group B
            print("This is team B\n")

def cleanDirtyData(homeTeams, awayTeams, AvsCPosition):
    HTeamVal = homeTeams[AvsCPosition]
    ATeamVal = awayTeams[AvsCPosition]
    Dirtlist = []
    Dirtlist.append(HTeamVal)
    Dirtlist.append(ATeamVal)

def main():
    # demo code
    teams = ["Team1", "Team2", "Team3", "Team4", "Team5", "Team6",
             "Team7", "Team8", "Team9", "Team10", "Team11", "Team12"]
    # for one match each - use this block only
    matches = make_schedule(teams)
    print("flawed matches:")
    homeAwayRotation(matches)

main()
My expected results would be a printout of each round showing which team is versing which, with each team ending up with a verse history a bit like this:
a team in Group1 has a verse history of (in any random order):
1v2, 2v1, 1v3, 3v1, 1v4, 4v1, 1v5, 1v6, 1v7, 1v8, bye
a team in Group2 has a verse history of (in any random order):
5v1, 5v2, 5v3, 5v4, 5v6, 5v7, 5v8, 5v9, 5v10, 5v11, 5v12
a team in Group3 has a verse history of (in any random order):
9v10, 10v9, 9v11, 11v9, 9v12, 12v9, 9v5, 9v6, 9v7, 9v8, bye
Any pointers or improvements would be greatly appreciated, as I have been stuck on this final hurdle for the last two weeks.
If I have understood the problem correctly, then all you need is to combine each team with the right opponents from the different groups.
I put some code together that should solve your problem:
def vs(team, group):
    matchups = map(lambda opponent: (team, opponent), group)
    matchups = filter(lambda tup: tup[0] != tup[1], matchups)
    return list(matchups)

def matches(teams):
    group_size = len(teams) // 3
    # Make the groups, basically just splitting the team list in three parts
    groups = [teams[:group_size], teams[group_size:2*group_size], teams[2*group_size:]]
    matchups = []
    for index, team in enumerate(teams):
        group_index = index // group_size
        current_matchup = []
        # Check if we're working with group 1 or 3
        if group_index == 0 or group_index == 2:
            # Flip the order of a tuple
            def flip(x):
                return (x[1], x[0])
            own_group = vs(team, groups[group_index])
            # Add matches against everyone in the group
            current_matchup.extend(own_group)
            # Add matches against everyone in the group again, but now the current team is 'away'
            current_matchup.extend(list(map(flip, own_group)))
            # Add matches against everyone in group 2
            current_matchup.extend(vs(team, groups[1]))
            # Lastly, add the bye
            current_matchup.append((team, "bye"))
        else:
            # Just all matches against all other teams, once.
            current_matchup.extend(vs(team, teams))
        matchups.append(current_matchup)
    return matchups

# This can be anything. Numbers, 'Team 1' or even 'The wondrous flying squirrels of death'
teams = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
# Make matches
matchups = matches(teams)
# Just pretty print
for i in range(len(matchups)):
    matches = '\n\t'.join(map(lambda m: f'{str(m[0]).rjust(10)} vs {str(m[1]).ljust(10)}', matchups[i]))
    print(f"Team '{teams[i]}' matches:\n\t{matches}")
I'm trying to create a nested dictionary with a set of values that are pulled from a for loop, to measure growth and revenue amounts for various customer-product pairings. However, when I loop through a dataframe to set elements of the dictionary, each dictionary element ends up with the same values. What's going on here?
I have already tried changing various elements of how the lists are built, but to no avail.
'''
TP_Name = customer name
Service_Level_1 = service name
100.2014 is just a marker to show that someone has started consuming the service
tpdict is already created with necessary nesting below with empty values at each endpoint
'''
for col in pivotdf.columns:
    growthlist = []
    amountlist = []
    first = True
    TP_Name, Service_Level_1 = col.split('___')
    for row in pivotdf[col]:
        if first == True:
            past = row + .00001
            first = False
        if row == 0 and past < .0001:
            growth = 0
        elif row != 0 and past == .00001:
            growth = 100.2014
        else:
            current = row
            growth = (current - past) / past
            growth = round(growth, 4)
        growthlist.append(growth)
        past = row + .00001
        amountlist.append(row)
    tpdict[TP_Name][Service_Level_1]['growth'] = growthlist
    tpdict[TP_Name][Service_Level_1]['amount'] = amountlist
'''
problem: Each value ends up being the same thing
'''
Expected results:
{'CUSTOMER NAME': {'PRODUCT1': {'growth': [unique_growthlist], 'amount': [unique_amountlist]}, 'PRODUCT2': {'growth': [unique_growthlist],'amount': [unique_amountlist]}}}
A dictionary is a collection of key-value pairs (as I am sure you know). If you write to a dictionary using a key that already exists, the dictionary will overwrite the value stored for that key.
Example:
d = dict()
d[1] = 'a' # d = {1: 'a'}
d[1] = 'b' # d = {1: 'b'}
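A related pitfall that produces exactly the symptom you describe is pre-building the nested dictionary so that every key shares the same inner dictionary object. Since tpdict is created before the loop (not shown in the question), this is worth checking; the snippet below is a hypothetical reconstruction, not your actual code:
# dict.fromkeys reuses ONE value object for every key, so every
# customer/product pair ends up pointing at the same inner dict:
services = ['PRODUCT1', 'PRODUCT2']
tpdict = dict.fromkeys(['CUST_A', 'CUST_B'],
                       dict.fromkeys(services, {'growth': [], 'amount': []}))
tpdict['CUST_A']['PRODUCT1']['growth'] = [1, 2]
print(tpdict['CUST_B']['PRODUCT2']['growth'])  # also [1, 2] -- shared objects

# Building each level with a comprehension creates independent dicts instead:
tpdict = {c: {s: {'growth': [], 'amount': []} for s in services}
          for c in ['CUST_A', 'CUST_B']}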
Your project seems like it may be a good use case for a namedtuple in Python.
A namedtuple is basically a lightweight class/object.
My example code may be wrong because I don't know how your for loop is working (commenting helps everyone). That being said, here is an example.
I only make this recommendation because dictionaries consume ~33% more memory than the objects they hold (though they are much faster).
from collections import namedtuple

Customer = namedtuple('Customer', 'name products')
Product = namedtuple('Product', 'growth amount')

customers = []
for col in pivotdf.columns:
    products = []
    growthlist = []
    amountlist = []
    first = True
    TP_Name, Service_Level_1 = col.split('___')
    for row in pivotdf[col]:
        if first == True:
            past = row + .00001
            first = False
        if row == 0 and past < .0001:
            growth = 0
        elif row != 0 and past == .00001:
            growth = 100.2014
        else:
            current = row
            growth = (current - past) / past
            growth = round(growth, 4)
        growthlist.append(growth)
        past = row + .00001
        amountlist.append(row)
    cur_product = Product(growth=growthlist, amount=amountlist)  # Create a new product
    products.append(cur_product)  # Add that product to our customer
    # Create a new customer with our products
    cur_customer = Customer(name=TP_Name, products=products)
    customers.append(cur_customer)  # Add our customer to our list of customers
Here customers is a list of Customer namedtuples that we can use as objects.
For example, this is how we can print them out:
for customer in customers:
    print(customer.name, customer.products)  # Print each name and their products
    for growth, amount in customer.products:
        print(growth, amount)  # Print growth and amount for each product.
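If you still need the nested-dict shape from your expected results, one small extension (my suggestion; the code above does not store the service name) is to record Service_Level_1 on the Product tuple and rebuild the dictionary afterwards:
from collections import namedtuple

# Hypothetical variant that also records the service/product name
Product = namedtuple('Product', 'name growth amount')
Customer = namedtuple('Customer', 'name products')

def to_nested_dict(customers):
    # Rebuild {customer: {product: {'growth': [...], 'amount': [...]}}}
    return {c.name: {p.name: {'growth': p.growth, 'amount': p.amount}
                     for p in c.products}
            for c in customers}

customers = [Customer('CUSTOMER NAME',
                      [Product('PRODUCT1', [0.1, 0.2], [10, 12]),
                       Product('PRODUCT2', [100.2014], [5])])]
print(to_nested_dict(customers))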
I'm trying to remove overlapping values from a collection of ranges.
The ranges are represented by a string like this:
499-505 100-115 80-119 113-140 500-550
I want the above to be reduced to two ranges: 80-140 499-550. That covers all the values without overlap.
Currently I have the following code.
cr = "100-115 115-119 113-125 80-114 180-185 500-550 109-120 95-114 200-250".split(" ")
ar = []
br = []
for i in cr:
(left,right) = i.split("-")
ar.append(left);
br.append(right);
inc = 0
for f in br:
i = int(f)
vac = []
jnc = 0
for g in ar:
j = int(g)
if(i >= j):
vac.append(j)
del br[jnc]
jnc += jnc
print vac
inc += inc
I split each range on "-" and store the lower limits in ar and the upper limits in br. I iterate over these limits pairwise, and if i is at least as great as j, I want to delete the element. But the program doesn't work. I expect it to produce this result: 80-125 500-550 200-250 180-185
For a quick and short solution:
from operator import itemgetter
from itertools import groupby

cr = "499-505 100-115 80-119 113-140 500-550".split(" ")
fullNumbers = []
for i in cr:
    a = int(i.split("-")[0])
    b = int(i.split("-")[1])
    fullNumbers += range(a, b+1)

# Remove duplicates and sort it
fullNumbers = sorted(list(set(fullNumbers)))

# Taken from http://stackoverflow.com/questions/2154249
def convertToRanges(data):
    result = []
    # consecutive numbers share the same (index - value), so groupby splits the runs
    for k, g in groupby(enumerate(data), lambda ix: ix[0] - ix[1]):
        group = list(map(itemgetter(1), g))
        result.append(str(group[0]) + "-" + str(group[-1]))
    return result

print(convertToRanges(fullNumbers))
# Output: ['80-140', '499-550']
For the given set in your program, output is ['80-125', '180-185', '200-250', '500-550']
The main possible drawback of this solution: it may not scale well, since it materializes every number covered by every range.
Let me offer another solution whose running time does not grow with the sum of the range sizes; apart from the sort, it is proportional to the number of ranges.
def reduce(range_text):
    parts = range_text.split()
    if parts == []:
        return ''
    ranges = [tuple(map(int, part.split('-'))) for part in parts]
    ranges.sort()
    new_ranges = []
    left, right = ranges[0]
    for rng in ranges[1:]:
        next_left, next_right = rng
        if right + 1 < next_left:              # Is the next range to the right?
            new_ranges.append((left, right))   # Close the current range.
            left, right = rng                  # Start a new range.
        else:
            right = max(right, next_right)     # Extend the current range.
    new_ranges.append((left, right))           # Close the last range.
    return ' '.join(['-'.join(map(str, r)) for r in new_ranges])
This function works by sorting the ranges, then scanning them in order and merging consecutive ranges that overlap or touch.
Examples:
print(reduce('499-505 100-115 80-119 113-140 500-550'))
# => 80-140 499-550
print(reduce('100-115 115-119 113-125 80-114 180-185 500-550 109-120 95-114 200-250'))
# => 80-125 180-185 200-250 500-550
I am new to NumPy. I have referred to the following SO question:
Why NumPy instead of Python lists?
The final comment in the above question seems to indicate that NumPy is probably slower on a particular dataset.
I am working on a 1650*1650*1650 data set. These are essentially similarity values for each movie in the MovieLens data set, along with the movie id.
My options are to either use a 3D NumPy array or a nested dictionary. On a reduced data set of 100*100*100, the run times were not too different.
Please find the IPython code snippet below:
for id1 in range(1, count+1):
    data1 = df[df.movie_id == id1].set_index('user_id')[cols]
    sim_score = {}
    for id2 in range(1, count+1):
        if id1 != id2:
            data2 = df[df.movie_id == id2].set_index('user_id')[cols]
            sim = calculatePearsonCorrUnified(data1, data2)
        else:
            sim = 1
        sim_matrix_panel[id1]['Sim'][id2] = sim
import pdb
from math import sqrt

def calculatePearsonCorrUnified(df1, df2):
    sim_score = 0
    common_movies_or_users = []
    for temp_id in df1.index:
        if temp_id in df2.index:
            common_movies_or_users.append(temp_id)
    # pdb.set_trace()
    n = len(common_movies_or_users)
    # print('No. of common movies: ' + str(n))
    if n == 0:
        return sim_score
    # Ratings corresponding to user_1 / movie_1, present in the common list
    rating1 = df1.loc[df1.index.isin(common_movies_or_users)]['rating'].values
    # Ratings corresponding to user_2 / movie_2, present in the common list
    rating2 = df2.loc[df2.index.isin(common_movies_or_users)]['rating'].values
    sum1 = sum(rating1)
    sum2 = sum(rating2)
    # Sum up the squares
    sum1Sq = sum(np.square(rating1))
    sum2Sq = sum(np.square(rating2))
    # Sum up the products
    pSum = sum(np.multiply(rating1, rating2))
    # Calculate Pearson score
    num = pSum - (sum1*sum2/n)
    den = sqrt(float(sum1Sq - pow(sum1, 2)/n) * float(sum2Sq - pow(sum2, 2)/n))
    if den == 0: return 0
    sim_score = (num/den)
    return sim_score
What would be the best way to precisely time the runtime of each of these options?
Any pointers would be greatly appreciated.
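Not an answer to which structure is faster, but for the timing itself a minimal sketch using the standard library's timeit module might look like this (run_numpy_version and run_dict_version are hypothetical placeholders for your two builds); inside IPython the %timeit magic gives the same effect:
import timeit

def run_numpy_version():
    pass  # build/fill the 3D NumPy array here

def run_dict_version():
    pass  # build/fill the nested dictionary here

for label, fn in [('numpy array', run_numpy_version), ('nested dict', run_dict_version)]:
    # repeat the measurement 5 times and keep the fastest run to reduce noise
    best = min(timeit.repeat(fn, repeat=5, number=1))
    print('{}: {:.3f}s'.format(label, best))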