I've a data file that looks like this:
58f0965a62d62099f5c0771d35dbc218 0.868632614612579 [0.028979932889342308, 0.004080114420503378, 0.03757167607545853] [-0.006008833646774292, -0.010409083217382431, 0.01565541699528694]
36f7859ce47417470bc28384694f0ac4 0.835115909576416 [0.026130573824048042, -0.00358427781611681, 0.06635218113660812] [-0.06970945745706558, 0.03816794604063034, 0.03491008281707764]
59f7d617bb662155b0d49ce3f27093ed 0.907200276851654 [0.009903069585561752, -0.009721670299768448, 0.0151780480518937] [-0.03264783322811127, 0.0035394825972616673, -0.05089104175567627]
where the columns are respectively
an md5 hash of the data point
a target float output
an array of floats that I want to read into a np.array object
another array of floats that I want to read into a np.array object
I've been reading the file as such to create a numpy array files for the two matrices of array of floats:
import numpy as np
from tqdm import tqdm
import pandas as pd
lol = []
with open('data.tsv') as fin:
for line in tqdm(fin):
md5hash, score, vector1, vector2 = line.strip().split('\t')
row = {'md5_hash': md5hash, 'score':float(score),
'vector1': np.array(eval(vector1)),
'vector2': np.array(eval(vector2))
}
lol.append(row)
df = pd.DataFrame(lol)
training_vector1 = np.array(list(df['vector1']))
# Save the training vectors.
np.save('vector1.npz', training_vector1)
training_vector2 = np.array(list(df['vector2']))
# Save the training vectors.
np.save('vector1.npz', training_vector2)
While this works for small dataset, the actual dataset has a lot more floats in the arrays and it's close to 200 million rows. Here's a sample of 100 rows https://gist.github.com/1f6f0b2501dc334db1e0038d36452f5d
How to efficiently read the array columns in the tsv file into a single npz files for each column efficiently?
First, a note on the overall problem.
Any approach that loads 200M rows similar to the sample input you provided would require some 1.1 TB of memory.
While this is possible, it is certainly not ideal.
Therefore, I would not recommend going forward with this, but rather look into approaches specifically designed for handling large dataset, e.g. HDF5.
Having said that, the problem at hand is not particular complex, but passing through pandas and eval() is probably neither desirable nor beneficial.
The same could be said for cut pre-processing into CSV files that are only marginally simpler to read.
Assuming that np.save() will be equally fast, regardless of how the array is produced, we could say that the following function replicates well the processing in OP:
def process_tsv_OP(filepath="100-translation.embedded-3.tsv"):
lol = []
with open(filepath, "r") as fin:
for line in fin:
md5hash, score, vector1, vector2 = line.strip().split('\t')
row = {'md5_hash': md5hash, 'score':float(score),
'vector1': np.array(eval(vector1)),
'vector2': np.array(eval(vector2))
}
lol.append(row)
df = pd.DataFrame(lol)
training_vector1 = np.array(list(df['vector1']))
training_vector2 = np.array(list(df['vector2']))
return training_vector1, training_vector2
This can be simplified by avoiding pandas and "evil-eval()" (and a number of copying around in memory):
def text2row(text):
text = text[1:-1]
return [float(x) for x in text.split(',')]
def process_tsv(filepath="100-translation.embedded-3.tsv"):
with open(filepath, "r") as in_file:
v1 = []
v2 = []
for line in in_file:
_, _, text_r1, text_r2 = line.strip().split('\t')
r1 = text2row(text_r1)
r2 = text2row(text_r2)
v1.append(r1)
v2.append(r2)
v1 = np.array(v1)
v2 = np.array(v2)
return v1, v2
It is easy to show that the two produce the same output:
def same_res(x, y):
return all(np.allclose(i, j) for i, j in zip(x, y))
same_res(process_tsv(), process_tsv_OP())
# True
but with substantially different timings:
%timeit process_tsv_OP()
# 1 loop, best of 5: 300 ms per loop
%timeit process_tsv()
# 10 loops, best of 5: 86.1 ms per loop
(on the sample input file obtained with: wget https://gist.githubusercontent.com/alvations/1f6f0b2501dc334db1e0038d36452f5d/raw/ee31c052a4dbda131df182f0237dbe6e5197dff2/100-translation.embedded-3.tsv)
Preprocessing the input with cut does not seem to be that beneficial:
!time cut -f3 100-translation.embedded-3.tsv | rev | cut -c2- | rev | cut -c2- > vector1.csv
# real 0m0.184s
# user 0m0.102s
# sys 0m0.233s
!time cut -f4 100-translation.embedded-3.tsv | rev | cut -c2- | rev | cut -c2- > vector2.csv
# real 0m0.208s
# user 0m0.113s
# sys 0m0.279s
%timeit np.genfromtxt('vector1.csv', delimiter=','); np.genfromtxt('vector2.csv', delimiter=',')
# 1 loop, best of 5: 130 ms per loop
and, while some time may be saved by using pd.read_csv():
%timeit pd.read_csv('vector1.csv').to_numpy(); pd.read_csv('vector2.csv').to_numpy()
# 10 loops, best of 5: 85.7 ms per loop
this seems to be even slower than the original approach on the provided dataset (although cut itself may scale better for larger inputs).
If you really want to stick to the npy file format for this, you may at least wish to append to your output in blocks.
While this is not supported well with NumPy alone, you could use NpyAppendArray (see also here).
The modified process_tsv() would look like:
import os
from npy_append_array import NpyAppendArray
def process_tsv_append(
in_filepath="100-translation.embedded-3.tsv",
out1_filepath="out1.npy",
out2_filepath="out2.npy",
append_every=10,
):
# clear output files
for filepath in (out1_filepath, out2_filepath):
if os.path.isfile(filepath):
os.remove(filepath)
with \
open(in_filepath, "r") as in_file, \
NpyAppendArray(out1_filepath) as npaa1, \
NpyAppendArray(out2_filepath) as npaa2:
v1 = []
v2 = []
for i, line in enumerate(in_file, 1):
_, _, text_r1, text_r2 = line.strip().split("\t")
r1 = text2row(text_r1)
r2 = text2row(text_r2)
v1.append(r1)
v2.append(r2)
if i % append_every == 0:
npaa1.append(np.array(v1))
npaa2.append(np.array(v2))
v1 = []
v2 = []
if len(v1) > 0: # assumes len(v1) == len(v2)
npaa1.append(np.array(v1))
npaa2.append(np.array(v2))
process_tsv_append()
v1 = np.load("out1.npy")
v2 = np.load("out2.npy")
same_res(process_tsv(), (v1, v2))
# True
All this can be speed up relatively blindly with Cython, but the speed-up seems to be marginal:
%%cython -c-O3 -c-march=native -a
#cython: language_level=3, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True, infer_types=True
import numpy as np
cpdef text2row_cy(text):
return [float(x) for x in text[1:-1].split(',')]
cpdef process_tsv_cy(filepath="100-translation.embedded-3.tsv"):
with open(filepath, "r") as in_file:
v1 = []
v2 = []
for line in in_file:
_, _, text_r1, text_r2 = line.strip().split('\t')
r1 = text2row_cy(text_r1)
r2 = text2row_cy(text_r2)
v1.append(r1)
v2.append(r2)
v1 = np.array(v1)
v2 = np.array(v2)
return v1, v2
print(same_res(process_tsv_cy(), process_tsv_OP()))
# True
%timeit process_tsv_cy()
# 10 loops, best of 5: 72.4 ms per loop
Similarly, pre-allocating the arrays does not seem to be beneficial:
def text2row_out(text, out):
for i, x in enumerate(text[1:-1].split(',')):
out[i] = float(x)
def process_tsv_alloc(filepath="100-translation.embedded-3.tsv"):
num_lines = open(filepath, "r").read().count("\n")
with open(filepath, "r") as in_file:
# num lines
num_lines = in_file.read().count("\n")
# num cols
in_file.seek(0)
line = next(in_file)
_, _, text_r1, text_r2 = line.strip().split('\t')
num_cols1 = len(text_r1.split(","))
num_cols2 = len(text_r2.split(","))
# populate arrays
v1 = np.empty((num_lines, num_cols1))
v2 = np.empty((num_lines, num_cols2))
in_file.seek(0)
for i, line in enumerate(in_file):
_, _, text_r1, text_r2 = line.strip().split('\t')
text2row_out(text_r1, v1[i])
text2row_out(text_r2, v2[i])
return v1, v2
print(same_res(process_tsv_alloc(), process_tsv_OP()))
%timeit process_tsv_alloc()
# 10 loops, best of 5: 110 ms per loop
A significant reduction in the running time can be obtained with Numba (and possibly with Cython too) by rewriting everything to be closer to C. In order to make our code compatible with -- and beneficial to have it accelerated by -- Numba, we need to make significant modifications:
open the file as bytes (no longer supporting UTF-8, which is not a significant issue for the problem at hand)
read and process the file in blocks, which needs to be sufficiently large, say in the order of 1M
write all string handling functions by hand, notably the string-to-float conversion
import numpy as np
import numba as nb
#nb.njit
def bytes2int(text):
c_min = ord("0")
c_max = ord("9")
n = len(text)
valid = n > 0
# determine sign
start = n - 1
stop = -1
sign = 1
if valid:
first = text[0]
if first == ord("+"):
stop = 0
elif first == ord("-"):
sign = -1
stop = 0
# parse rest
number = 0
j = 0
for i in range(start, stop, -1):
c = text[i]
if c_min <= c <= c_max:
number += (c - c_min) * 10 ** j
j += 1
else:
valid = False
break
return sign * number if valid else None
#nb.njit
def bytes2float_helper(text):
sep = ord(".")
c_min = ord("0")
c_max = ord("9")
n = len(text)
valid = n > 0
# determine sign
start = n - 1
stop = -1
sign = 1
if valid:
first = text[0]
if first == ord("+"):
stop = 0
elif first == ord("-"):
sign = -1
stop = 0
# parse rest
sep_pos = 0
number = 0
j = 0
for i in range(start, stop, -1):
c = text[i]
if c_min <= c <= c_max:
number += (c - c_min) * 10 ** j
j += 1
elif c == sep and sep_pos == 0:
sep_pos = j
else:
valid = False
break
return sign * number, sep_pos, valid
#nb.njit
def bytes2float(text):
exp_chars = b"eE"
exp_pos = -1
for exp_char in exp_chars:
for i, c in enumerate(text[::-1]):
if c == exp_char:
exp_pos = i
break
if exp_pos > -1:
break
if exp_pos > 0:
exp_number = bytes2int(text[-exp_pos:])
if exp_number is None:
exp_number = 0
number, sep_pos, valid = bytes2float_helper(text[:-exp_pos-1])
result = number / 10.0 ** (sep_pos - exp_number) if valid else None
else:
number, sep_pos, valid = bytes2float_helper(text)
result = number / 10.0 ** sep_pos if valid else None
return result
#nb.njit
def btrim(text):
space = ord(" ")
tab = ord("\t")
nl = ord("\n")
cr = ord("\r")
start = 0
stop = 0
for c in text:
if c == space or c == tab or c == nl or c == cr:
start += 1
else:
break
for c in text[::-1]:
if c == space:
stop += 1
else:
break
if start == 0 and stop == 0:
return text
elif stop == 0:
return text[start:]
else:
return text[start:-stop]
#nb.njit
def text2row_nb(text, sep, num_cols, out, curr_row):
last_i = 0
j = 0
for i, c in enumerate(text):
if c == sep:
x = bytes2float(btrim(text[last_i:i]))
out[curr_row, j] = x
last_i = i + 2
j += 1
x = bytes2float(btrim(text[last_i:]))
out[curr_row, j] = x
#nb.njit
def process_line(line, psep, sep, num_psep, num_cols1, num_cols2, out1, out2, curr_row):
if len(line) > 0:
psep_pos = np.empty(num_psep, dtype=np.int_)
j = 0
for i, char in enumerate(line):
if char == psep:
psep_pos[j] = i
j += 1
text2row_nb(line[psep_pos[-2] + 2:psep_pos[-1] - 1], sep, num_cols1, out1, curr_row)
text2row_nb(line[psep_pos[-1] + 2:-1], sep, num_cols2, out2, curr_row)
#nb.njit
def decode_block(block, psep, sep, num_lines, num_cols1, num_cols2, out1, out2, curr_row):
nl = ord("\n")
last_i = 0
i = j = 0
for c in block:
if c == nl:
process_line(block[last_i:i], psep, sep, 3, num_cols1, num_cols2, out1, out2, curr_row)
j += 1
last_i = i
curr_row += 1
if j >= num_lines:
break
i += 1
return block[i + 1:], curr_row
#nb.njit
def count_nl(block, start=0):
nl = ord("\n")
for c in block:
if c == nl:
start += 1
return start
def process_tsv_block(filepath="100-translation.embedded-3.tsv", size=2 ** 18):
with open(filepath, "rb") as in_file:
# count newlines
num_lines = 0
while True:
block = in_file.read(size)
if block:
num_lines = count_nl(block, num_lines)
else:
break
# count num columns
in_file.seek(0)
line = next(in_file)
_, _, text_r1, text_r2 = line.strip().split(b'\t')
num_cols1 = len(text_r1.split(b","))
num_cols2 = len(text_r2.split(b","))
# fill output arrays
v1 = np.empty((num_lines, num_cols1))
v2 = np.empty((num_lines, num_cols2))
in_file.seek(0)
remainder = b""
curr_row = 0
while True:
block = in_file.read(size)
if block:
block = remainder + block
num_lines = count_nl(block)
if num_lines > 0:
remainder, curr_row = decode_block(block, ord("\t"), ord(","), num_lines, num_cols1, num_cols2, v1, v2, curr_row)
else:
remainder = block
else:
num_lines = count_nl(remainder)
if num_lines > 0:
remainder, curr_row = decode_block(remainder, ord("\t"), ord(","), num_lines, num_cols1, num_cols2, v1, v2, curr_row)
break
return v1, v2
The prize for all this work is a mere ~2x speed up over process_tsv():
print(same_res(process_tsv_block(), process_tsv_OP()))
# True
%timeit process_tsv_block()
# 10 loops, best of 5: 48.8 ms per loop
Cut the 3rd column, remove the first and last square brackets
cut -f3 data.tsv | rev | cut -c2- | rev | cut -c2- > vector1.csv
Repeat the same for Vector 2
cut -f4 data.tsv | rev | cut -c2- | rev | cut -c2- > vector2.csv
Read the csv into numpy in Python save to npy file.
import numpy as np
np.save('vector1.npy', np.genfromtxt('vector1.csv', delimiter=','))
np.save('vector1.npy', np.genfromtxt('vector2.csv', delimiter=','))
The other answers are good, the version below is a variation that uses dask. Since the original data is in text format, let's use dask.bag API.
First, import modules and define a utility function:
from dask.array import from_delayed, from_npy_stack, to_npy_stack, vstack
from dask.bag import read_text
from numpy import array, nan, stack
def process_line(line):
"""Utility function adapted from the snippet in the question."""
md5hash, score, vector1, vector2 = line.strip().split("\t")
row = {
"md5_hash": md5hash,
"score": float(score),
"vector1": array(eval(vector1)),
"vector2": array(eval(vector2)),
}
return row
Next, create a bag:
bag = read_text("100-translation.embedded-3.tsv", blocksize="1mb").map(process_line)
Since the sample snippet is small, to simulate 'big data', let's pretend that we can load '1mb' at once. This should create 3 partitions in the bag.
Next, isolate the vectors/arrays and convert them to dask.arrays:
# create delayed versions of the arrays
a1 = bag.pluck("vector1").map_partitions(stack).to_delayed()
a2 = bag.pluck("vector2").map_partitions(stack).to_delayed()
# convert the delayed objects to dask array
A1 = vstack(
[from_delayed(a, shape=(nan, 768), dtype="float") for a in a1],
allow_unknown_chunksizes=True,
)
A2 = vstack(
[from_delayed(a, shape=(nan, 768), dtype="float") for a in a2],
allow_unknown_chunksizes=True,
)
Now, we can save the arrays as npy stacks:
to_npy_stack("_A1", A1)
to_npy_stack("_A2", A2)
Note that this processing is not ideal, since the workers will pass over the data twice (once for each array), but with the current API I couldn't think of a better way.
Furthermore, note that the npy stacks preserve the 'unknown' chunks as metadata, even though all the relevant information was computed. This is something that could be improved in dask codebase, but for now the easiest fix is to load the data again, compute chunks, rechunk (to get nice, grid-like structure) and save again:
# rechunk into regular-sized format
A1 = from_npy_stack("_A1")
A1.compute_chunk_sizes()
A1.rechunk(chunks=(40, 768))
to_npy_stack("A1_final", A1)
# rechunk into regular-sized format
A2 = from_npy_stack("_A2")
A2.compute_chunk_sizes()
A2.rechunk(chunks=(40, 768))
to_npy_stack("A2_final", A2)
Of course on the real dataset, you'd want to use bigger chunks. And the final save operation does not have to be to numpy stacks, depending on your interest this could now be stored as HDF5 or zarr array.
If the output format is changed to a raw binary file then the input file can be processed line by line without storing the complete result in RAM.
import numpy as np
fh_in = open('data.tsv')
fh_vec1 = open('vector1.bin', 'wb')
fh_vec2 = open('vector2.bin', 'wb')
linecount = 0
for line in fh_in:
hash_, score, vec1, vec2 = line.strip().split('\t')
np.fromstring(vec1.strip('[]'), sep=',').tofile(fh_vec1)
np.fromstring(vec2.strip('[]'), sep=',').tofile(fh_vec2)
linecount += 1
A raw binary file doesn't store any info about dtype, shape, or byte order.
For loading it back into an array you can use np.fromfile or np.memmap and then call .reshape(linecount, -1) on it.
I have an input file of about 10^5 rows.
Each row is a sequence of 24 bits, i.e.:
1 0 1 1 1 0 1 0 1 0 1 1 1 0 1 0 1 0 1 1 1 0 1 0
I need to compute the Hamming Distance for every pair of rows.
Here's my first implementation using SciPy hamming function:
from scipy.spatial.distance import hamming
with open('input.txt', 'r') as file:
reader = csv.reader(file, delimiter=' ')
nodes = {}
b = 24 # Number of bits
for nodeNum, node in enumerate(reader):
node[nodeNum] = [int(i) for i in node]
for u, uBits in nodes.items():
for v, vBits in nodes.items():
distance = hamming(uBits, vBits) * b
# Do stuff
Second implementation I came up with:
node[nodeNum] = sum([int(bit)*2**power for power, bit in enumerate(node)])
Here I only store the decimal value but I then have to manually count set bits resulting from each XOR operation:
def hamming(a, b):
N = a ^ b
distance = 0
ptr = 1
while N:
distance += ((N + 1) //
2 * ptr)
N -= (N + 1) // 2
ptr += 1
return distance
How can I improve my code (both in terms of memory usage and runtime, ideally)?
This may be the fastest you can do (watchout, for your data size it'd require to allocate 74.5 GiB of memory):
import numpy as np
nodes = []
with open('input.txt', 'r') as file:
reader = csv.reader(file, delimiter=' ')
for node in reader:
nodes.append([int(i) for i in node])
dists = 2 * np.inner(nodes-0.5, 0.5-nodes) + nodes.shape[1] / 2
Just for fun, here is a 40X faster version in Julia:
using LoopVectorization, Tullio
function hamming!(nodes,dists)
#tullio dists[i,j] = sum(nodes[i,k] ⊻ nodes[j,k])
end
n = 10^5
nodes = rand(Int8[0,1],n,24)
dists = Matrix{Int8}(undef,n,n)
#time hamming!(nodes,dists) # Run twice
# 1.886367 seconds (114 allocations: 6.594 KiB)
While we're at it, I invite you to enter the world of Julia. It offers similar speeds to C++ and a pleasant syntax similar to Python.
Why not just put the whole .csv into an array and then let scipy do all the work of computing pairwise distances?
import numpy as np
import pandas as pd
import scipy.spatial.distance
nodes = []
with open('input.txt', 'r') as file:
reader = csv.reader(file, delimiter=' ')
for node in reader:
nodes.append([int(i) for i in node])
nodes = np.array(nodes) # not strictly necessary
dists = scipy.spatial.distance.pdist(nodes, 'hamming')
I have a huge list (45M+ data poitns), with numerical values:
[78,0,5,150,9000,5,......,25,9,78422...]
I can easily get the maximum and minimum values, the number of these values, and the sum of them:
file_handle=open('huge_data_file.txt','r')
sum_values=0
min_value=None
max_value=None
for i,line in enumerate(file_handle):
value=int(line[:-1])
if min_value==None or value<min_value:
min_value=value
if max_value==None or value>max_value:
max_value=value
sum_values+=value
average_value=float(sum_values)/i
However, this is not what I need. I need a list of 10 numbers, where the number of data points between each two consecutive points is equal, for example
median points [0,30,120,325,912,1570,2522,5002,7025,78422]
and we have the number of data points between 0 and 30 or between 30 and 120 to be almost 4.5 million data points.
How can we do this?
=============================
EDIT:
I am well aware that we will need to sort the data. The problem is that I cannot fit all this data in one variable in memory, but I need to read it sequentially from a generator (file_handle)
If you are happy with an approximation, here is a great (and fairly easy to implement) algorithm for computing quantiles from stream data: "Space-Efficient Online Computation of Quantile Summaries" by Greenwald and Khanna.
The silly numpy approach:
import numpy as np
# example data (produced by numpy but converted to a simple list)
datalist = list(np.random.randint(0, 10000000, 45000000))
# converted back to numpy array (start here with your data)
arr = np.array(datalist)
np.percentile(arr, 10), np.percentile(arr, 20), np.percentile(arr, 30)
# ref:
# http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html
You can also hack something together where you just do like:
arr.sort()
# And then select the 10%, 20% etc value, add some check for equal amount of
# numbers within a bin and then calculate the average, excercise for reader :-)
The thing is that calling this function several times will slow it down, so really, just sort the array and then select the elements yourself.
As you said in the comments that you want a solution that can scale to larger datasets then can be stored in RAM, feed the data into an SQLlite3 database. Even if your data set is 10GB and you only have 8GB RAM a SQLlite3 database should still be able to sort the data and give it back to you in order.
The SQLlite3 database gives you a generator over your sorted data.
You might also want to look into going beyond python and take some other database solution.
Here's a pure-python implementation of the partitioned-on-disk sort. It's slow, ugly code, but it works and hopefully each stage is relatively clear (the merge stage is really ugly!).
#!/usr/bin/env python
import os
def get_next_int_from_file(f):
l = f.readline()
if not l:
return None
return int(l.strip())
MAX_SAMPLES_PER_PARTITION = 1000000
PARTITION_FILENAME = "_{}.txt"
# Partition data set
part_id = 0
eof = False
with open("data.txt", "r") as fin:
while not eof:
print "Creating partition {}".format(part_id)
with open(PARTITION_FILENAME.format(part_id), "w") as fout:
for _ in range(MAX_SAMPLES_PER_PARTITION):
line = fin.readline()
if not line:
eof = True
break
fout.write(line)
part_id += 1
num_partitions = part_id
# Sort each partition
for part_id in range(num_partitions):
print "Reading unsorted partition {}".format(part_id)
with open(PARTITION_FILENAME.format(part_id), "r") as fin:
samples = [int(line.strip()) for line in fin.readlines()]
print "Disk-Deleting unsorted {}".format(part_id)
os.remove(PARTITION_FILENAME.format(part_id))
print "In-memory sorting partition {}".format(part_id)
samples.sort()
print "Writing sorted partition {}".format(part_id)
with open(PARTITION_FILENAME.format(part_id), "w") as fout:
fout.writelines(["{}\n".format(sample) for sample in samples])
# Merge-sort the partitions
# NB This is a very inefficient implementation!
print "Merging sorted partitions"
part_files = []
part_next_int = []
num_lines_out = 0
# Setup data structures for the merge
for part_id in range(num_partitions):
fin = open(PARTITION_FILENAME.format(part_id), "r")
next_int = get_next_int_from_file(fin)
if next_int is None:
continue
part_files.append(fin)
part_next_int.append(next_int)
with open("data_sorted.txt", "w") as fout:
while part_files:
# Find the smallest number across all files
min_number = None
min_idx = None
for idx in range(len(part_files)):
if min_number is None or part_next_int[idx] < min_number:
min_number = part_next_int[idx]
min_idx = idx
# Now add that number, and move the relevent file along
fout.write("{}\n".format(min_number))
num_lines_out += 1
if num_lines_out % MAX_SAMPLES_PER_PARTITION == 0:
print "Merged samples: {}".format(num_lines_out)
next_int = get_next_int_from_file(part_files[min_idx])
if next_int is None:
# Remove this partition, it's now finished
del part_files[min_idx:min_idx + 1]
del part_next_int[min_idx:min_idx + 1]
else:
part_next_int[min_idx] = next_int
# Cleanup partition files
for part_id in range(num_partitions):
os.remove(PARTITION_FILENAME.format(part_id))
My code a proposal for finding the result without needing much space. In testing it found a quantile value in 7 minutes 51 seconds for a dataset of size 45 000 000.
from bisect import bisect_left
class data():
def __init__(self, values):
random.shuffle(values)
self.values = values
def __iter__(self):
for i in self.values:
yield i
def __len__(self):
return len(self.values)
def sortedValue(self, percentile):
val = list(self)
val.sort()
num = int(len(self)*percentile)
return val[num]
def init():
numbers = data([x for x in range(1,1000000)])
print(seekPercentile(numbers, 0.1))
print(numbers.sortedValue(0.1))
def seekPercentile(numbers, percentile):
lower, upper = minmax(numbers)
maximum = upper
approx = _approxPercentile(numbers, lower, upper, percentile)
return neighbor(approx, numbers, maximum)
def minmax(list):
minimum = float("inf")
maximum = float("-inf")
for num in list:
if num>maximum:
maximum = num
if num<minimum:
minimum = num
return minimum, maximum
def neighbor(approx, numbers, maximum):
dif = maximum
for num in numbers:
if abs(approx-num)<dif:
result = num
dif = abs(approx-num)
return result
def _approxPercentile(numbers, lower, upper, percentile):
middles = []
less = []
magicNumber = 10000
step = (upper - lower)/magicNumber
less = []
for i in range(1, magicNumber-1):
middles.append(lower + i * step)
less.append(0)
for num in numbers:
index = bisect_left(middles,num)
if index<len(less):
less[index]+= 1
summing = 0
for index, testVal in enumerate(middles):
summing += less[index]
if summing/len(numbers) < percentile:
print(" Change lower from "+str(lower)+" to "+ str(testVal))
lower = testVal
if summing/len(numbers) > percentile:
print(" Change upper from "+str(upper)+" to "+ str(testVal))
upper = testVal
break
precision = 0.01
if (lower+precision)>upper:
return lower
else:
return _approxPercentile(numbers, lower, upper, percentile)
init()
I edited my code a bit and I now think that this way works at least decently even when it's not optimal.
So, I have this challenge on CodeEval, but I seem don't know where to start, so I need some pointers (and answers if you can) to help me figure out this challenge.
DESCRIPTION:
There is a board (matrix). Every cell of the board contains one integer, which is 0 initially.
The next operations can be applied to the Query Board:
SetRow i x: it means that all values in the cells on row "i" have been change value to "x" after this operation.
SetCol j x: it means that all values in the cells on column "j" have been changed to value "x" after this operation.
QueryRow i: it means that you should output the sum of values on row "i".
QueryCol j: it means that you should output the sum of values on column "j".
The board's dimensions are 256x256
i and j are integers from 0 to 255
x is an integer from 0 to 31
INPUT SAMPLE:
Your program should accept as its first argument a path to a filename. Each line in this file contains an operation of a query. E.g.
SetCol 32 20
SetRow 15 7
SetRow 16 31
QueryCol 32
SetCol 2 14
QueryRow 10
SetCol 14 0
QueryRow 15
SetRow 10 1
QueryCol 2
OUTPUT SAMPLE:
For each query, output the answer of the query. E.g.
5118
34
1792
3571
I'm not that great on Python, but this challenge is pretty interesting, although I didn't have any clues on how to solve it. So, I need some help from you guys.
Thanks!
You could use a sparse matrix for this; addressed by (col, row) tuples as keys in a dictionary, to save memory. 64k cells is a big list otherwise (2MB+ on a 64-bit system):
matrix = {}
This is way more efficient, as the challenge is unlikely to set values for all rows and columns on the board.
Setting a column or row is then:
def set_col(col, x):
for i in range(256):
matrix[i, col] = x
def set_row(row, x):
for i in range(256):
matrix[row, i] = x
and summing a row or column is then:
def get_col(col):
return sum(matrix.get((i, col), 0) for i in range(256))
def get_row(row):
return sum(matrix.get((row, i), 0) for i in range(256))
WIDTH, HEIGHT = 256, 256
board = [[0] * WIDTH for i in range(HEIGHT)]
def set_row(i, x):
global board
board[i] = [x]*WIDTH
... implement each function, then parse each line of input to decide which function to call,
for line in inf:
dat = line.split()
if dat[0] == "SetRow":
set_row(int(dat[1]), int(dat[2]))
elif ...
Edit: Per Martijn's comments:
total memory usage for board is about 2.1MB. By comparison, after 100 random row/column writes, matrix is 3.1MB (although it tops out there and doesn't get any bigger).
yes, global is unnecessary when modifying a global object (just don't try to assign to it).
while dispatching from a dict is good and efficient, I did not want to inflict it on someone who is "not that great on Python", especially for just four entries.
For sake of comparison, how about
time = 0
WIDTH, HEIGHT = 256, 256
INIT = 0
rows = [(time, INIT) for _ in range(WIDTH)]
cols = [(time, INIT) for _ in range(HEIGHT)]
def set_row(i, x):
global time
time += 1
rows[int(i)] = (time, int(x))
def set_col(i, x):
global time
time += 1
cols[int(i)] = (time, int(x))
def query_row(i):
rt, rv = rows[int(i)]
total = rv * WIDTH + sum(cv - rv for ct, cv in cols if ct > rt)
print(total)
def query_col(j):
ct, cv = cols[int(j)]
total = cv * HEIGHT + sum(rv - cv for rt, rv in rows if rt > ct)
print(total)
ops = {
"SetRow": set_row,
"SetCol": set_col,
"QueryRow": query_row,
"QueryCol": query_col
}
inf = """SetCol 32 20
SetRow 15 7
SetRow 16 31
QueryCol 32
SetCol 2 14
QueryRow 10
SetCol 14 0
QueryRow 15
SetRow 10 1
QueryCol 2""".splitlines()
for line in inf:
line = line.split()
op = line.pop(0)
ops[op](*line)
which only uses 4.3k of memory for rows[] and cols[].
Edit2:
using your code from above for matrix, set_row, set_col,
import sys
for n in range(256):
set_row(n, 1)
print("{}: {}".format(2*(n+1)-1, sys.getsizeof(matrix)))
set_col(n, 1)
print("{}: {}".format(2*(n+1), sys.getsizeof(matrix)))
which returns (condensed:)
1: 12560
2: 49424
6: 196880
22: 786704
94: 3146000
... basically the allocated memory quadruples at each step. If I change the memory measure to include key-tuples,
def get_matrix_size():
return sys.getsizeof(matrix) + sum(sys.getsizeof(key) for key in matrix)
it increases more smoothly, but still takes a bit jump at the above points:
5 : 127.9k
6 : 287.7k
21 : 521.4k
22 : 1112.7k
60 : 1672.0k
61 : 1686.1k <-- approx expected size on your reported problem set
93 : 2121.1k
94 : 4438.2k