Implementing Bellman-Ford in python - python

I'm trying to adapt a Bellman-Ford graph algorithm in Python to my needs.
I've worked out the parsing part from a json file.
Here is the Bellman Ford code I found on github:
https://github.com/rosshochwert/arbitrage/blob/master/arbitrage.py
And here is my code I adapted from it:
import math, urllib2, json, re
def download():
    """Fetch Bittrex market summaries and build a directed graph of
    -log(ask) edge weights keyed by currency code.

    Returns:
        dict: graph[from_currency][to_currency] = -log(ask rate).
    """
    graph = {}
    page = urllib2.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries")
    jsrates = json.loads(page.read())
    # Hoisted out of the loop: the pattern never changes per market.
    pattern = re.compile("([A-Z0-9]*)-([A-Z0-9]*)")
    for result in jsrates["result"]:
        ask = result["Ask"]
        matches = pattern.match(result["MarketName"])
        # Guard both conditions together so the edge weight is computed
        # only when it is used.  The original computed conversion_rate
        # under `if (float(ask != 0))` (a bool coerced to float) and then
        # used it unconditionally inside `if matches`, so a market with
        # ask == 0 could reuse a stale value from a previous iteration
        # or hit an unbound name on the first one.
        if matches and float(ask) != 0:
            from_rate = matches.group(1).encode('ascii', 'ignore')
            to_rate = matches.group(2).encode('ascii', 'ignore')
            if from_rate != to_rate:
                graph.setdefault(from_rate, {})[to_rate] = -math.log(float(ask))
    return graph
# Step 1: For each node prepare the destination and predecessor
def initialize(graph, source):
    """Set up Bellman-Ford state: tentative distances and predecessors."""
    distance = {node: float('Inf') for node in graph}
    predecessor = {node: None for node in graph}
    distance[source] = 0  # the source is reachable at zero cost
    return distance, predecessor
def relax(node, neighbour, graph, d, p):
    """Relax edge node->neighbour: adopt the shorter route if one exists."""
    candidate = d[node] + graph[node][neighbour]
    if candidate < d[neighbour]:
        d[neighbour] = candidate
        p[neighbour] = node
def retrace_negative_loop(p, start):
    """Follow predecessor links from `start` until a node repeats and
    return the cycle that was found.

    The predecessor map is walked backwards, so the collected nodes are
    reversed before returning: in a directed graph a cycle can be
    negative in one direction and positive in the other, and the
    original returned it in the wrong (reverse) travel order.
    """
    arbitrageLoop = [start]
    next_node = start
    while True:
        next_node = p[next_node]
        if next_node in arbitrageLoop:
            arbitrageLoop.append(next_node)
            # Keep only the cycle itself, then flip it to travel order.
            arbitrageLoop = arbitrageLoop[arbitrageLoop.index(next_node):]
            return list(reversed(arbitrageLoop))
        arbitrageLoop.append(next_node)
def bellman_ford(graph, source):
    """Run Bellman-Ford from `source`.

    Returns:
        list | None: the nodes of a negative-weight cycle if one is
        found, otherwise None.
    """
    d, p = initialize(graph, source)
    # |V| - 1 rounds of relaxation are enough for convergence.
    for _ in range(len(graph) - 1):
        for u in graph:
            for v in graph[u]:
                relax(u, v, graph, d, p)
    # Step 3: a still-improvable edge proves a negative-weight cycle.
    for u in graph:
        for v in graph[u]:
            if d[v] < d[u] + graph[u][v]:
                # Trace from v -- a node known to be affected by the
                # cycle -- not from source, which may not lie on the
                # cycle at all (the original passed `source` here).
                return retrace_negative_loop(p, v)
    return None
paths = []
graph = download()
print graph
for ask in graph:
path = bellman_ford(graph, ask)
if path not in paths and not None:
paths.append(path)
for path in paths:
if path == None:
print("No opportunity here :(")
else:
money = 100
print "Starting with %(money)i in %(currency)s" % {"money":money,"currency":path[0]}
for i,value in enumerate(path):
if i+1 < len(path):
start = path[i]
end = path[i+1]
rate = math.exp(-graph[start][end])
money *= rate
print "%(start)s to %(end)s at %(rate)f = %(money)f" % {"start":start,"end":end,"rate":rate,"money":money}
print "\n"
Error:
Traceback (most recent call last):
File "belltestbit.py", line 78, in <module>
path = bellman_ford(graph, ask)
File "belltestbit.py", line 61, in bellman_ford
relax(u, v, graph, d, p) #Lets relax it
File "belltestbit.py", line 38, in relax
if d[neighbour] > d[node] + graph[node][neighbour]:
KeyError: 'LTC'
When I print the graph I get everything needed. It is 'LTC' because it is the first one in the list. I tried executing it while filtering out LTC, and it gives me the same error with the first name appearing in the graph:
Traceback (most recent call last):
File "belltestbit.py", line 78, in <module>
path = bellman_ford(graph, ask)
File "belltestbit.py", line 61, in bellman_ford
relax(u, v, graph, d, p) #Lets relax it
File "belltestbit.py", line 38, in relax
if d[neighbour] > d[node] + graph[node][neighbour]:
KeyError: 'NEO'
I don't see how could I fix this.
Thanks everyone.
PS: It appears that an answer was deleted, I'm new to SO, so I don't know what happened. I edited the post, because the answer helped me to advance :)

Disclaimer: Note that although you can find "inefficiencies" in this way, the chances you could actually use them to earn money are quite low. Most probably you would actually lose some money. AFAICS from the data I've seen during testing, those "inefficiencies" come from the fact that exchange rates are more volatile over the course of minutes than the Bid-Ask spread. So what you see as an inefficiency is probably just stale data, and you can't actually execute all the required orders fast enough for the exchange rate to be stable enough to earn money. So be advised that you might lose your money if you try to use this application for anything more than your curiosity.
So now to the business:
Your data is in a different format than the one that the original code was designed for. Typical piece of data looks like this:
{
"MarketName": "BTC-ETH",
"High": 0.05076884,
"Low": 0.04818392,
"Volume": 77969.61816991,
"Last": 0.04978511,
"BaseVolume": 3875.47491925,
"TimeStamp": "2017-12-29T05:45:10.18",
"Bid": 0.04978511,
"Ask": 0.04986673,
"OpenBuyOrders": 4805,
"OpenSellOrders": 8184,
"PrevDay": 0.04955001,
"Created": "2015-08-14T09:02:24.817"
}
What you are interested in is MarketName, Bid and Ask. And you need to understand what those Bid and Ask mean. Roughly speaking the Ask value means that if you want to sell BTC for ETH there is (or rather was not too long ago) a buyer who is willing to buy your BTC using exchange rate 0.04986673 BTC for 1 ETH. Similarly the Bid value means that if you want to sell ETH for BTC there is (was) a buyer who is willing to buy your ETH using exchange rate 0.04978511 BTC for 1 ETH. Note that this structure means that you will not have a record with "MarketName": "ETH-BTC" because it provides no additional data.
So knowing that you can fill your graph with proper distances, which are logarithms of the corresponding rates. Also I believe that there is another bug in your code: since the argument p of the retrace_negative_loop is actually dictionary of predecessor nodes, retrace_negative_loop returns the negative loop in the reverse order. And since your graph is directed the same loop might be positive in one direction and negative in the other one.
import math, urllib2, json, re
def download():
    """Download Bittrex market summaries and build the exchange graph.

    Edge weights are log-rates: selling `from` for `to` at the Ask uses
    +log(ask); the reverse direction at the Bid uses -log(bid) -- a
    different sign of log is effectively 1/x.
    """
    market_pattern = re.compile("([A-Z0-9]*)-([A-Z0-9]*)")
    graph = {}
    response = urllib2.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries")
    summaries = json.loads(response.read())["result"]
    for summary in summaries:
        match = market_pattern.match(summary["MarketName"])
        if not match:
            continue
        from_rate = match.group(1).encode('ascii', 'ignore')
        to_rate = match.group(2).encode('ascii', 'ignore')
        ask = summary["Ask"]
        bid = summary["Bid"]
        if ask != 0:
            graph.setdefault(from_rate, {})[to_rate] = math.log(float(ask))
        if bid != 0:
            graph.setdefault(to_rate, {})[from_rate] = -math.log(float(bid))
    return graph
def initialize(graph, source):
    """Return (distance, predecessor) maps ready for Bellman-Ford."""
    d, p = {}, {}
    for node in graph:
        d[node], p[node] = float('Inf'), None
    d[source] = 0  # zero cost to reach the start node itself
    return d, p
def relax(node, neighbour, graph, d, p):
    """Improve the tentative distance to `neighbour` via `node`."""
    via = d[node] + graph[node][neighbour]
    if via < d[neighbour]:
        d[neighbour], p[neighbour] = via, node
def retrace_negative_loop(p, start):
    """Follow predecessor links from `start` and return the cycle found,
    flipped into forward (travel) order."""
    seen = [start]
    node = start
    while True:
        node = p[node]
        if node in seen:
            # Trim everything before the first occurrence of the
            # repeated node, close the cycle, and reverse it.
            cycle = seen[seen.index(node):] + [node]
            return list(reversed(cycle))
        seen.append(node)
def bellman_ford(graph, source):
    """Bellman-Ford from `source`; return a negative cycle or None."""
    d, p = initialize(graph, source)
    for _ in range(len(graph) - 1):  # enough rounds for convergence
        for u in graph:
            for v in graph[u]:
                relax(u, v, graph, d, p)
    # Step 3: any edge that can still be improved proves a
    # negative-weight cycle; retrace it from the violated node.
    for u in graph:
        for v in graph[u]:
            if d[u] + graph[u][v] < d[v]:
                return retrace_negative_loop(p, v)
    return None
graph = download()
# print graph
for k, v in graph.iteritems():
print "{0} => {1}".format(k, v)
print "-------------------------------"
paths = []
for currency in graph:
path = bellman_ford(graph, currency)
if path not in paths and not None:
paths.append(path)
for path in paths:
if path == None:
print("No opportunity here :(")
else:
money = 100
print "Starting with %(money)i in %(currency)s" % {"money": money, "currency": path[0]}
for i, value in enumerate(path):
if i + 1 < len(path):
start = path[i]
end = path[i + 1]
rate = math.exp(-graph[start][end])
money *= rate
print "%(start)s to %(end)s at %(rate)f = %(money)f" % {"start": start, "end": end, "rate": rate,
"money": money}
print "\n"
Also, the check if path not in paths and not None: is potentially not enough, because it doesn't filter out paths that are just rotations of one another, but I didn't bother fixing that either.

Related

I can get the distance of shortest path here but how do i get the whole path

This is my approach. I have successfully got the shortest distance in this graph, but I can't get the shortest path here.
import heapq as hq
# Adjacency map: graph[u][v] = weight of the directed edge u -> v.
graph = {
    1: {2: 1, 4: 2},
    2: {3: 4, 5: 5},
    3: {5: 1},
    4: {3: 2},
    5: {},
}
def dijkstra(G, src, dest):
    """Dijkstra's shortest paths from `src`.

    Args:
        G: adjacency map, G[u][v] = non-negative edge weight.
        src: start node.
        dest: kept for interface compatibility; distances and paths for
            all nodes are returned regardless.

    Returns:
        (dist, path): dist[v] is the shortest distance from src to v and
        path[v] the corresponding node sequence starting at src.
    """
    dist = {}  # initializing our distances dict
    for v in G:
        dist[v] = float('inf')
    dist[src] = 0
    Q = [(0, src)]
    # Original bug: `path` was seeded with [src] for every node and then
    # never touched.  Extend the predecessor's path on each relaxation.
    path = dict((key, [src]) for key in G)
    while len(Q) > 0:
        u, curr = hq.heappop(Q)
        if u > dist[curr]:
            continue  # stale queue entry, already superseded
        for v, w in G[curr].items():
            d = u + w
            if d < dist[v]:
                dist[v] = d
                path[v] = path[curr] + [v]
                hq.heappush(Q, (d, v))
    return dist, path
# Run the search from node 1 and print the (distances, paths) pair.
start_point = 1
end_point = 5
print(dijkstra(graph, start_point, end_point))
I have used several methods except this but they didn't work.

How to fill quantity values into network in python with networkx?

The main question is, it's possible distribute a determinate quantity in a network satisfying the nodal equations in every node?
e.g: adding quantity values in a network(mesh) {GIF}
**Nodal equations: Incoming flow (quantity) = Outgoing flow (quantity) for every node.
import networkx as nx
# Directed duct segments (from_node, to_node); the three lists below are
# positionally aligned with this one, one entry per edge.
inputName = [('D','H'),('G','E'),('B','F'),('H','C'),('E','B'),('C','G'),('H','A'),('C','E'),('A','G'),('D','A'),('F','D')]
quantity = [0,0,0,0,0,0,0,0,0,0,0]
resistance = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1]
# Only the 7th edge ('H','A') carries a non-zero fan value here.
fan = [0,0,0,0,0,0,2000,0,0,0,0]
def createDirectedGraph(inputName):
    """Build a directed ventilation network whose edges carry the
    positionally matching quantity/resistance/fan attributes.

    Args:
        inputName: iterable of (from_node, to_node) pairs.

    Returns:
        networkx.DiGraph with 'quantity', 'resistance' and 'fan' set on
        every edge (read from the module-level lists of the same names).
    """
    ventilationNetwork = nx.DiGraph()
    # zip pairs each edge with its attributes, replacing the original's
    # manually incremented index counter.
    for (node_from, node_to), q, r, f in zip(inputName, quantity, resistance, fan):
        ventilationNetwork.add_edge(node_from, node_to,
                                    quantity=q, resistance=r, fan=f)
    return ventilationNetwork
I was trying the following:
# NOTE(review): this snippet assumes `ventilationNetwork` was already built
# via createDirectedGraph(inputName); it is not defined here.
count = 1
for node in ventilationNetwork.nodes:
# Seed only the very first node visited with an arbitrary non-zero flow.
# NOTE(review): this rebinds the name `quantity`, shadowing the
# module-level list of the same name.
if count == 1:
quantity = 100 # Any quantity != 0
for node_from in ventilationNetwork.predecessors(node):
ventilationNetwork.edges[node_from, node]['quantity'] = quantity/(len(list(ventilationNetwork.predecessors(node))))
for node_to in ventilationNetwork.successors(node):
ventilationNetwork.edges[node, node_to]['quantity'] = quantity/(len(list(ventilationNetwork.successors(node))))
else:
# NOTE(review): sumQ keeps accumulating through BOTH loops below --
# the successor loop adds back values that were just written -- so
# inflow does not equal outflow; likely why the nodal balance fails.
sumQ = 0
for node_from in ventilationNetwork.predecessors(node):
sumQ += ventilationNetwork.edges[node_from, node]['quantity']
ventilationNetwork.edges[node_from, node]['quantity'] = sumQ/(len(list(ventilationNetwork.predecessors(node))))
for node_to in ventilationNetwork.successors(node):
sumQ += ventilationNetwork.edges[node, node_to]['quantity']
ventilationNetwork.edges[node, node_to]['quantity'] = sumQ/(len(list(ventilationNetwork.successors(node))))
count += 1
but it's not working. It's not satisfying the nodal equations.
Any idea how to solve this? With NetworkX or without NetworkX.

Breadth's Algorithm Runs Forever Despite Possible Ending

So I saw this coding interview question and tried to solve it. I am trying to employ Breadth's Path Finding Algorithm to find the optimum flight routes from a given airport to all other airports; given the list of all airports and routes. An element in routes means that there is a one way flight from the first airport to the second one.
I got this far; it was supposed to find the shortest routes to all other airports from a given airport. But when I run it, it never ends.
I figured that my algorithm doesn't append all possible next airports to my paths, but everything seems o.k to me.
'''
import queue
# All known airports (nodes of the flight graph).
airports = ["BGI", "CDG", "DEL", "DOH", "DSM", "EWR", "EYW", "HND", "ICN", "JFK", "LGA",
"LHR", "ORD", "SAN", "SFO", "SIN", "TLV", "BUD"]
# One-way legs: [from_airport, to_airport].
routes = [["DSM", "ORD"], ["ORD", "BGI"], ["BGI", "LGA"], ["SIN", "CDG"], ["CDG", "SIN"],
["CDG", "BUD"], ["DEL", "DOH"], ["DEL", "CDG"], ["TLV", "DEL"], ["EWR", "HND"],
["HND", "ICN"], ["HND", "JFK"], ["ICN", "JFK"], ["JFK", "LGA"], ["EYW", "LHR"],
["LHR", "SFO"], ["SFO", "SAN"], ["SFO", "DSM"], ["SAN", "EYW"], ["LGA", "EYW"]]
# Starting airport for every search below.
main = "LGA"
def done(moves, aim):
    """True once the last leg of `moves` ends at the target airport."""
    if moves == "":
        # The empty sentinel means no legs have been taken yet.
        return False
    return moves[-1][1] == aim
def valid(moves, put):
    """True when leg `put` departs from where `moves` currently ends."""
    if moves == "":
        # Empty sentinel: nothing to connect to yet.
        return False
    return moves[-1][1] == put[0]
def available_starts(start, pos):
    """Return every route in `pos` that departs from `start`."""
    return [leg for leg in pos if leg[0] == start]
#MAIN ALGORITHM
# NOTE(review): `[] """..."""` juxtaposes a list and a string literal --
# this line (and the similar one below) is a SyntaxError as written; the
# trailing strings were presumably meant to be # comments.
kurzeste_moeglichkeiten = [] """all shortest possibilities"""
for port in airports:
nums = queue.Queue()
nums.put("")
add = ""
start = main
if port != start:
anfaenge = available_starts(start, routes) """possible startings"""
for anfang in anfaenge:
anfang = [anfang]
nums.put(anfang)
while not done(add, port):
add = nums.get()
for got in routes:
if valid(add, got):
# NOTE(review): `t = add` does not copy the list -- t.append(got)
# mutates the very list that was just dequeued and re-queues it,
# so queued paths share state and grow without bound; this looks
# like the cause of the non-termination (a fresh `add + [got]`
# per branch would avoid the aliasing -- verify).
t = add
t.append(got)
nums.put(t)
kurzeste_moeglichkeiten.append(add)
for eine in kurzeste_moeglichkeiten:
print(eine)
'''
I've managed to represent your graph with the modules networkx and matplotlib. As you can see, the airports are split into 2 groups.
As mentioned, I do believe 2 of your routes, ["SIN", "CDG"] and ["CDG", "SIN"], are the reverse of each other and are not consistent with the rest of your data.
NOTE: the code I've used to show the graph:
import networkx as nx
import matplotlib.pyplot as plt
# Airport nodes and one-way legs to visualize.
airports = ["BGI", "CDG", "DEL", "DOH", "DSM", "EWR", "EYW", "HND", "ICN", "JFK", "LGA",
"LHR", "ORD", "SAN", "SFO", "SIN", "TLV", "BUD"]
legs = [["DSM", "ORD"], ["ORD", "BGI"], ["BGI", "LGA"], ["SIN", "CDG"], ["CDG", "SIN"],
["CDG", "BUD"], ["DEL", "DOH"], ["DEL", "CDG"], ["TLV", "DEL"], ["EWR", "HND"],
["HND", "ICN"], ["HND", "JFK"], ["ICN", "JFK"], ["JFK", "LGA"], ["EYW", "LHR"],
["LHR", "SFO"], ["SFO", "SAN"], ["SFO", "DSM"], ["SAN", "EYW"], ["LGA", "EYW"]]
main = "LGA"
# NOTE(review): nx.Graph() is undirected, so the one-way legs are drawn
# as plain edges; use nx.DiGraph() to show direction.
G = nx.Graph()
for node in airports:
G.add_node(node)
for leg in legs:
G.add_edge(leg[0], leg[1])
plt.subplot(121)
nx.draw(G, with_labels=True)

Visualizing modified SIR model

I tried to modify the SIR model from the eon package and did some changes to it. It has a new vaccination parameter attached to it with new parameters beta and omega and Vl and my code is-
def test_transmission(u, v, p):
return random.random()<p
# NOTE(review): `tmin` and `tmax` are read throughout but are neither
# parameters nor defined in this snippet -- unless they exist as globals
# this raises NameError (the answer below adds tmin=0, tmax=float('Inf')
# to the signature).
def discrete_SIR(G,
initial_infecteds,beta,
w,Vl,return_full_data=True):
# Discrete-time SIR with vaccination: per step each infected node tries to
# vaccinate (prob. w, while the Vl*N vaccination budget lasts) or infect
# (prob. beta) every susceptible neighbour; infecteds then recover.
# Accept a single node as well as a list of initial infecteds.
if G.has_node(initial_infecteds):
initial_infecteds=[initial_infecteds]
if return_full_data:
node_history = defaultdict(lambda : ([tmin], ['S']))
transmissions = []
for node in initial_infecteds:
node_history[node] = ([tmin], ['I'])
transmissions.append((tmin-1, None, node))
# NOTE(review): the lines below unconditionally rebuild node_history,
# clobbering the seeding done in the return_full_data branch above.
node_history = defaultdict(lambda : ([tmin], ['S']))
# transmissions = []
for node in initial_infecteds:
node_history[node] = ([tmin], ['I'])
#transmissions.append((tmin-1, None, node))
# Per-step time series: time, susceptible, infected, recovered, vaccinated.
N=G.order()
t = [tmin]
S = [N-len(initial_infecteds)]
I = [len(initial_infecteds)]
R = [0]
V = [0]
susceptible = defaultdict(lambda: True)
#above line is equivalent to u.susceptible=True for all nodes.
for u in initial_infecteds:
susceptible[u] = False
infecteds = set(initial_infecteds)
while infecteds and t[-1]<tmax :
new_infecteds = set()
vaccinated= set()
infector = {} #used for returning full data. a waste of time otherwise
for u in infecteds:
# print('u-->' +str(u))
for v in G.neighbors(u):
# print('v --> '+ str(v))
##vaccination
if len(vaccinated)+V[-1]< (Vl*N) : #check if vaccination over or not
#print(len(vaccinated),Vl*N)
#print("HI")
if susceptible[v] and test_transmission(u, v, w):
vaccinated.add(v)
susceptible[v] = False
# print('transmitting vaccination')
elif susceptible[v] and test_transmission(u,v,beta):
new_infecteds.add(v)
susceptible[v]=False
infector[v] = [u]
# print('transmitting infection')
else:
# print("BYE")
# Vaccination budget exhausted: only infection can happen now.
if susceptible[v] and test_transmission(u, v,beta):
new_infecteds.add(v)
susceptible[v] = False
infector[v] = [u]
#infector[v] = [u]
if return_full_data:
for v in infector.keys():
transmissions.append((t[-1], random.choice(infector[v]), v))
next_time = t[-1]+1
if next_time <= tmax:
for u in infecteds:
node_history[u][0].append(next_time)
node_history[u][1].append('R')
for v in new_infecteds:
node_history[v][0].append(next_time)
node_history[v][1].append('I')
# Advance the aggregate time series by one step.
infecteds = new_infecteds
R.append(R[-1]+I[-1])
V.append(len(vaccinated)+V[-1])
I.append(len(infecteds))
S.append(N-V[-1]-I[-1]-R[-1])
#S.append(S[-1]-V[-1]-I[-1])
t.append(t[-1]+1)
print(str(R[-1])+','+str(V[-1])+','+str(I[-1])+','+str(S[-1]))
# NOTE(review): scipy and EoN are used here but not imported in this
# snippet -- verify the enclosing module imports them.
if not return_full_data:
return scipy.array(t), scipy.array(S), scipy.array(I), \
scipy.array(R)
else:
return EoN.Simulation_Investigation(G, node_history, transmissions)
Now I want to run the visualizations on it like in the package EoN:
# Run the stock EoN discrete SIR on a small periodic grid and display it.
m=5
G=nx.grid_2d_graph(m,m,periodic=True)
# Seed the infection at the single centre node of the grid.
initial_infections = [(u,v) for (u,v) in G if u==int(m/2) and v==int(m/2)]
sim = EoN.basic_discrete_SIR(G,0.5,initial_infecteds = initial_infections,
return_full_data=True, tmax = 25)
# Grid coordinates double as plot positions.
pos = {node:node for node in G}
sim.set_pos(pos)
# NOTE(review): the argument is 0 although the trailing comment says 6.
sim.display(0, node_size = 40) #display time 6
# NOTE(review): savefig after show typically writes an empty figure --
# save before showing if the PNG matters.
plt.show()
plt.savefig('SIR_2dgrid.png')
What changes do I need to do in my code so that the display function works or do I need to make changes in the display function also?
Here's the output I now get:
You'll have to install EoN version 1.0.8rc3 or later, which is available on the github page (see installation instructions). At present pip will not work to install it. I want to make sure I haven't broken anything before I make it the default installed by pip.
Here's the code based on yours. You should look through the changes I've made. It's also worth looking at the examples I've put in the documentation (including an SIRV model where the vaccination rule is different than what you've got).
from collections import defaultdict
import EoN
import networkx as nx
import random
import matplotlib.pyplot as plt
def test_transmission(u, v, p):
return random.random()<p
def discrete_SIRV(G, initial_infecteds,beta,
w,Vl,tmin=0,tmax=float('Inf'), return_full_data=True):
# Discrete-time SIRV model: per step each infected node tries to vaccinate
# (prob. w, while the Vl*N budget lasts) or infect (prob. beta) each
# susceptible neighbour; infected nodes recover after one step.
# Accept a single node as well as a list of initial infecteds.
if G.has_node(initial_infecteds):
initial_infecteds=[initial_infecteds]
if return_full_data:
node_history = defaultdict(lambda : ([tmin], ['S']))
transmissions = []
for node in initial_infecteds:
node_history[node] = ([tmin], ['I'])
transmissions.append((tmin-1, None, node))
'''
node_history = defaultdict(lambda : ([tmin], ['S']))
# transmissions = []
for node in initial_infecteds:
node_history[node] = ([tmin], ['I'])
#transmissions.append((tmin-1, None, node))
'''
# Aggregate per-step series: time, S, I, R, V counts.
N=G.order()
t = [tmin]
S = [N-len(initial_infecteds)]
I = [len(initial_infecteds)]
R = [0]
V = [0]
susceptible = defaultdict(lambda: True)
#above line is equivalent to u.susceptible=True for all nodes.
for u in initial_infecteds:
susceptible[u] = False
infecteds = set(initial_infecteds)
while infecteds and t[-1]<tmax :
new_infecteds = set()
vaccinated= set()
infector = {} #used for returning full data. a waste of time otherwise
for u in infecteds:
# print('u-->' +str(u))
for v in G.neighbors(u):
# print('v --> '+ str(v))
##vaccination
if len(vaccinated)+V[-1]< (Vl*N) : #check if vaccination over or not
#print(len(vaccinated),Vl*N)
#print("HI")
if susceptible[v] and test_transmission(u, v, w):
vaccinated.add(v)
susceptible[v] = False
'''It's probably better to define a `new_vaccinated`
set and then do the `return_full_data` stuff later
where all the others are done.'''
if return_full_data:
node_history[v][0].append(t[-1]+1)
node_history[v][1].append('V')
# print('transmitting vaccination')
elif susceptible[v] and test_transmission(u,v,beta):
new_infecteds.add(v)
susceptible[v]=False
infector[v] = [u]
# print('transmitting infection')
else:
# print("BYE")
# Vaccination budget exhausted: only infection remains possible.
if susceptible[v] and test_transmission(u, v,beta):
new_infecteds.add(v)
susceptible[v] = False
infector[v] = [u]
#infector[v] = [u]
if return_full_data:
for v in infector.keys():
'''This random choice is no longer needed as you've taken out
the possibility of multiple nodes transmitting to `v` in a given
time step. Now only the first one encountered does it.'''
transmissions.append((t[-1], random.choice(infector[v]), v))
next_time = t[-1]+1
if next_time <= tmax:
for u in infecteds:
node_history[u][0].append(next_time)
node_history[u][1].append('R')
for v in new_infecteds:
node_history[v][0].append(next_time)
node_history[v][1].append('I')
# Advance the aggregate series by one step.
infecteds = new_infecteds
R.append(R[-1]+I[-1])
V.append(len(vaccinated)+V[-1])
I.append(len(infecteds))
S.append(N-V[-1]-I[-1]-R[-1])
#S.append(S[-1]-V[-1]-I[-1])
t.append(t[-1]+1)
print(str(R[-1])+','+str(V[-1])+','+str(I[-1])+','+str(S[-1]))
# NOTE(review): scipy is used below but not in this snippet's import
# list -- verify the enclosing module imports it before using
# return_full_data=False.
if not return_full_data:
return scipy.array(t), scipy.array(S), scipy.array(I), \
scipy.array(R)
else:
return EoN.Simulation_Investigation(G, node_history, transmissions, possible_statuses=['S', 'I', 'R', 'V'])
# Requires EoN >= 1.0.8rc3 for possible_statuses / custom color support.
print(EoN.__version__)
print("line above needs to be 1.0.8rc3 or greater or it will not work\n\n")
m=20
G=nx.grid_2d_graph(m,m,periodic=True)
# Seed the infection at the single centre node of the grid.
initial_infections = [(u,v) for (u,v) in G if u==int(m/2) and v==int(m/2)]
beta=0.8
Vl=0.3
w=0.1
sim = discrete_SIRV(G, initial_infections, beta, w, Vl, return_full_data=True)
# Grid coordinates double as plot positions.
pos = {node:node for node in G}
sim.set_pos(pos)
# Assign a distinct display colour to each of the four statuses.
sim.sim_update_colordict({'S': '#009a80','I':'#ff2000', 'R':'gray','V': '#5AB3E6'})
sim.display(6, node_size = 40) #display time 6
plt.savefig('SIRV_2dgrid.png')

How to write code in python that reads large dataset and converts to specific datatypes with good performance

My Python script reads data from an input stream. My aim is to tokenize the stream line by line. This allows me to recognize the fields in the incoming data. I then convert the fields to the required data types.
I am currently reading from sys.stdin. After performing some logic, I send the result to the std.out file.
My issue at the moment is that my code appears really slow. I profile the python script with line_profiler and 1 gigabyte of data took 2.5 hours. The profiler shows that a lot of time is spent in calling the conversion functions int, float and str. I definitely think there is a better way to write this code.
##IMPORT STATEMENTS
import time ,sys ,os
from datetime import datetime
from distutils.util import strtobool
# Keep a handle on the real stdout, then park stdout on stderr so stray
# prints don't corrupt the data stream; generateRow/main_func swap it
# back around each emitted row.
# NOTE(review): the name says "stderr" but it actually backs up stdout.
fpy_backup_stderr=sys.stdout
sys.stdout=sys.stderr
##UTILITY VARIABLES
# Field separator and sentinel strings of the row protocol.
fpy_sep=","
fpypy_output_row_start="fpy_pyout:"
fpy_carriagereturn=":fpy_py_cr:"
fpy_linefeed=":fpy_py_lf:"
fpy_null="PY_NULL"
fpy_end_row_signal="##"
fpy_debug_flag=False
def fpy_debug(msg):
    """Write `msg` to stderr when the module-level fpy_debug_flag is set.

    The original prepended an always-empty MSG_PREFIX and compared the
    flag with `== True`; both were dead weight and are removed.
    """
    if fpy_debug_flag:
        sys.stderr.write(msg + "\n")
####MAP OUTPUT FIELD TO PRECISION
# Maximum emitted length per output field; only the string column is
# truncated (see the astr[:int(prec)] slice in generateRow/main_func).
fpy_column_out_precision_dict ={
"p_string_out":100
}
##COLLECT THE CURRENT VALUE OF THE OUTPUT FIELDS
def fpy_collect_out_columns_asmap():
    """Snapshot the current output-field globals into a dict."""
    return {
        "p_bigint_out": p_bigint_out,
        "p_double_out": p_double_out,
        "p_int_out": p_int_out,
        "p_string_out": p_string_out,
    }
# Per-process output file (append mode).
# NOTE(review): the name `file` shadows the Python 2 builtin.
file=open("/export/home/devbld/ftpy.target.txt" + str(os.getpid()),"a+")
#profile
# Deliberately stubbed to a no-op (real write commented out) so profiling
# measures the conversion logic without disk I/O.
def fwrite ( elem):pass
#file.write(elem)
##GENERATE THE OUTPUT ROW
def generateRow():
    """Emit one output row through fwrite(), writing the NULL marker for
    any output-field global that is unset."""
    column_values = fpy_collect_out_columns_asmap()  # kept: result unused, as in original
    sys.stdout = fpy_backup_stderr  # restore the real stdout for the row
    fwrite(fpypy_output_row_start)
    fwrite(fpy_null if constant_out is None else str(constant_out))
    fwrite(fpy_sep)
    fwrite(fpy_null if p_bigint_out is None else str(p_bigint_out))
    fwrite(fpy_sep)
    # repr() keeps full float precision for the double column.
    fwrite(fpy_null if p_double_out is None else repr(p_double_out))
    fwrite(fpy_sep)
    fwrite(fpy_null if p_int_out is None else str(p_int_out))
    fwrite(fpy_sep)
    if p_string_out is None:
        fwrite(fpy_null)
    else:
        # Escape embedded newlines and clip to the configured precision.
        prec = fpy_column_out_precision_dict["p_string_out"]
        astr = str(p_string_out).replace("\n", fpy_linefeed)
        fwrite(astr[:int(prec)])
    fwrite("\n")
    sys.stdout = sys.stderr  # park stdout on stderr again
##MATERIALIZE THE INPUT ROW COLUMNS
def fpy_materialize_columns(fpy_row_arr):
    """Convert raw row fields into typed module-level globals.

    Args:
        fpy_row_arr: [bigint, double, int, string] raw values; a None
            entry leaves the corresponding global untouched.

    The original also initialised an unused `index` counter, removed here.
    """
    global p_bigint, p_double, p_int, p_string, constant
    v = fpy_row_arr[0]
    if v is not None:
        p_bigint = int(v)
    v = fpy_row_arr[1]
    if v is not None:
        p_double = float(v)
    v = fpy_row_arr[2]
    if v is not None:
        p_int = int(v)
    v = fpy_row_arr[3]
    if v is not None:
        p_string = v
# Wall-clock start for the elapsed-time report at the bottom of the file.
start= time.time()
#profile
def main_func():
# Read delimiter-separated rows from stdin, convert the four columns to
# their target types, and emit each row through fwrite().
# NOTE(review): this inlines near-copies of fpy_materialize_columns and
# generateRow; the per-row int/float/str calls here are exactly what the
# profiler flagged as the hot spot.
print("calling main_func")
fpy_row_container=""
# NOTE(review): readlines() loads the entire input stream into memory
# before processing -- iterating sys.stdin directly would stream it.
rows=map(str.rstrip,sys.stdin.readlines())
##for fpypy_line in sys.stdin:
for fpypy_line in rows:
temps=fpypy_line.rstrip()
# A trailing "##" marks a complete row; strip the marker off.
if temps.endswith(fpy_end_row_signal):
size= len(fpy_end_row_signal)
fpypy_line= temps[:-size]
fpy_row_raw=fpypy_line
##RESET THE INPUT FIELDS
p_bigint=None
p_double=None
p_int=None
p_string=None
constant=None
##RESET THE OUTPUT FIELDS
constant_out=None
p_bigint_out=None
p_double_out=None
p_int_out=None
p_string_out=None
# Split into raw fields and map the NULL marker to Python None.
fpy_row_arr=fpy_row_raw.split(fpy_sep)
for n,i in enumerate(fpy_row_arr):
if i == fpy_null:
fpy_row_arr[n]=None
# Typed conversion of each column (duplicate of fpy_materialize_columns,
# but into locals rather than module globals).
v = fpy_row_arr[0]
if v is not None:
p_bigint = int(v)
v = fpy_row_arr[1]
if v is not None:
p_double = float(v)
v = fpy_row_arr[2]
if v is not None:
p_int = int(v)
v = fpy_row_arr[3]
if v is not None:
p_string =v
#MAIN
p_bigint_out=p_bigint
p_double_out=p_double
p_int_out=p_int
p_string_out=p_string
constant_out=constant
# NOTE(review): column_values is built but never used afterwards.
column_values={
"p_bigint_out":p_bigint_out,
"p_double_out":p_double_out,
"p_int_out":p_int_out,
"p_string_out":p_string_out
}
# Inline copy of generateRow(), writing the row field by field.
sys.stdout=fpy_backup_stderr
fwrite(fpypy_output_row_start)
if constant_out is None:
fwrite(fpy_null)
else:
fwrite(str(constant_out))
fwrite(fpy_sep)
if p_bigint_out is None:
fwrite(fpy_null)
else:
fwrite(str(p_bigint_out))
fwrite(fpy_sep)
if p_double_out is None:
fwrite(fpy_null)
else:
fwrite(repr(p_double_out))
fwrite(fpy_sep)
if p_int_out is None:
fwrite(fpy_null)
else:
fwrite(str(p_int_out))
fwrite(fpy_sep)
if p_string_out is None:
fwrite(fpy_null)
else:
prec=fpy_column_out_precision_dict["p_string_out"]
astr=str(p_string_out).replace("\n",fpy_linefeed)
fwrite(astr[:int(prec)])
fwrite("\n")
sys.stdout=sys.stderr
fpy_row_container=""
else:
# Incomplete row: buffer the fragment until the end marker arrives.
print("wait why did i miss " + fpypy_line)
fpy_row_container=fpy_row_container + fpypy_line
# Drive the conversion, then report elapsed wall-clock time in minutes.
main_func()
end=time.time()
timeelapsed=(end-start)/60
sys.stderr.write("time elapsed in minutes " + str(timeelapsed))
file.close()
The results suggest to me that the bottleneck is not in your code at all. How much CPU does the script consume when you run it? If it's less than 100% on an individual core, your I/O channel is probably where time is spent, waiting for more data to arrive or for written data to be committed to disk.

Categories