I'm currently working with DNA sequence data and I have run into a bit of a performance roadblock.
I have two lookup dictionaries/hashes (as RDDs) with DNA "words" (short sequences) as keys and a list of index positions as the value. One is for a shorter query sequence and the other for a database sequence. Creating the tables is pretty fast even with very, very large sequences.
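For illustration only (plain Python, not the Spark version shown further down), the shape of such a table for word length 3 would be:
# toy illustration of the lookup-table structure
def build_lookup(sequence, word_length=3):
    table = {}
    for i in range(len(sequence) - word_length + 1):
        word = sequence[i:i + word_length]
        table.setdefault(word, []).append(i)
    return table

# build_lookup("ACGTACG") -> {'ACG': [0, 4], 'CGT': [1], 'GTA': [2], 'TAC': [3]}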
For the next step, I need to pair these up and find "hits" (pairs of index positions for each common word).
I first join the lookup dictionaries, which is reasonably fast. However, I now need the pairs, so I have to flatmap twice, once to expand the list of indices from the query and the second time to expand the list of indices from the database. This isn't ideal, but I don't see another way to do it. At least it performs ok.
The output at this point is: (query_index, (word_length, diagonal_offset)), where the diagonal offset is the database_sequence_index minus the query sequence index.
However, I now need to find pairs of indices with the same diagonal offset (db_index - query_index) that are reasonably close together and join them (so that I increase the length of the word), but only as pairs (i.e. once I join one index with another, I don't want anything else to merge with it).
I do this with an aggregateByKey operation using a special object called SeedList().
PARALLELISM = 16  # I have 4 cores with hyperthreading

def generateHsps(query_lookup_table_rdd, database_lookup_table_rdd):
    global broadcastSequences

    def mergeValueOp(seedlist, (query_index, seed_length)):
        return seedlist.addSeed((query_index, seed_length))

    def mergeSeedListsOp(seedlist1, seedlist2):
        return seedlist1.mergeSeedListIntoSelf(seedlist2)

    hits_rdd = (query_lookup_table_rdd.join(database_lookup_table_rdd)
                .flatMap(lambda (word, (query_indices, db_indices)): [(query_index, db_indices) for query_index in query_indices], preservesPartitioning=True)
                .flatMap(lambda (query_index, db_indices): [(db_index - query_index, (query_index, WORD_SIZE)) for db_index in db_indices], preservesPartitioning=True)
                .aggregateByKey(SeedList(), mergeValueOp, mergeSeedListsOp, PARALLELISM)
                .map(lambda (diagonal, seedlist): (diagonal, seedlist.mergedSeedList))
                .flatMap(lambda (diagonal, seedlist): [(query_index, seed_length, diagonal) for query_index, seed_length in seedlist])
                )
    return hits_rdd
Here is the SeedList() class used as the aggregation object:
class SeedList():
    def __init__(self):
        self.unmergedSeedList = []
        self.mergedSeedList = []

    # Try to find a more efficient way to do this
    def addSeed(self, (query_index1, seed_length1)):
        for i in range(0, len(self.unmergedSeedList)):
            (query_index2, seed_length2) = self.unmergedSeedList[i]
            # print "Checking ({0}, {1})".format(query_index2, seed_length2)
            if min(abs(query_index2 + seed_length2 - query_index1), abs(query_index1 + seed_length1 - query_index2)) <= WINDOW_SIZE:
                self.mergedSeedList.append((min(query_index1, query_index2), max(query_index1 + seed_length1, query_index2 + seed_length2) - min(query_index1, query_index2)))
                self.unmergedSeedList.pop(i)
                return self
        self.unmergedSeedList.append((query_index1, seed_length1))
        return self

    def mergeSeedListIntoSelf(self, seedlist2):
        print "merging seed"
        for (query_index2, seed_length2) in seedlist2.unmergedSeedList:
            wasmerged = False
            for i in range(0, len(self.unmergedSeedList)):
                (query_index1, seed_length1) = self.unmergedSeedList[i]
                if min(abs(query_index2 + seed_length2 - query_index1), abs(query_index1 + seed_length1 - query_index2)) <= WINDOW_SIZE:
                    self.mergedSeedList.append((min(query_index1, query_index2), max(query_index1 + seed_length1, query_index2 + seed_length2) - min(query_index1, query_index2)))
                    self.unmergedSeedList.pop(i)
                    wasmerged = True
                    break
            if not wasmerged:
                self.unmergedSeedList.append((query_index2, seed_length2))
        return self
This is where the performance really breaks down for even sequences of moderate length.
Is there any better way to do this aggregation? My gut feeling says yes, but I can't come up with it.
I know this is a very long-winded and technical question, and I would really appreciate any insight even if there is no easy solution.
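One direction I have been toying with (a rough, untested sketch; it chains nearby seeds rather than strictly pairing them, so it is not an exact drop-in, and mergeSortedSeeds is a made-up name): group all seeds per diagonal, sort by query index, and merge in a single left-to-right pass instead of the quadratic scan in SeedList.
def mergeSortedSeeds(seeds, window_size):
    # seeds: iterable of (query_index, seed_length) sharing one diagonal
    merged = []
    for query_index, seed_length in sorted(seeds):
        if merged and query_index - (merged[-1][0] + merged[-1][1]) <= window_size:
            # extend the previous seed instead of starting a new one
            start, length = merged[-1]
            merged[-1] = (start, max(length, query_index + seed_length - start))
        else:
            merged.append((query_index, seed_length))
    return merged

# hits_rdd = (query_lookup_table_rdd.join(database_lookup_table_rdd)
#             ...same flatMaps as above...
#             .groupByKey(PARALLELISM)
#             .flatMap(lambda (diagonal, seeds):
#                      [(q, l, diagonal) for q, l in mergeSortedSeeds(seeds, WINDOW_SIZE)]))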
Edit: Here is how I am making the lookup tables:
def createLookupTable(sequence_rdd, sequence_name, word_length):
    global broadcastSequences
    blank_list = []

    def addItemToList(lst, val):
        lst.append(val)
        return lst

    def mergeLists(lst1, lst2):
        # print "Merging"
        return lst1 + lst2

    return (sequence_rdd
            .flatMap(lambda seq_len: range(0, seq_len - word_length + 1))
            .repartition(PARALLELISM)
            # .partitionBy(PARALLELISM)
            .map(lambda index: (str(broadcastSequences.value[sequence_name][index:index + word_length]), index), preservesPartitioning=True)
            .aggregateByKey(blank_list, addItemToList, mergeLists, PARALLELISM))
            # .map(lambda (word, indices): (word, sorted(indices))))
And here is the function that runs the whole operation:
def run(query_seq, database_sequence, translate_query=False):
    global broadcastSequences
    scoring_matrix = 'nucleotide' if isinstance(query_seq.alphabet, DNAAlphabet) else 'blosum62'
    sequences = {'query': query_seq,
                 'database': database_sequence}
    broadcastSequences = sc.broadcast(sequences)
    query_rdd = sc.parallelize([len(query_seq)])
    query_rdd.cache()
    database_rdd = sc.parallelize([len(database_sequence)])
    database_rdd.cache()
    query_lookup_table_rdd = createLookupTable(query_rdd, 'query', WORD_SIZE)
    query_lookup_table_rdd.cache()
    database_lookup_table_rdd = createLookupTable(database_rdd, 'database', WORD_SIZE)
    seeds_rdd = generateHsps(query_lookup_table_rdd, database_lookup_table_rdd)
    return seeds_rdd
Edit 2: I have tweaked things a bit and slightly improved performance by replacing:
.flatMap(lambda (word, (query_indices, db_indices)): [(query_index, db_indices) for query_index in query_indices], preservesPartitioning=True)
.flatMap(lambda (query_index, db_indices): [(db_index - query_index, (query_index, WORD_SIZE)) for db_index in db_indices], preservesPartitioning=True)
in hits_rdd with:
.flatMap(lambda (word, (query_indices, db_indices)): itertools.product(query_indices, db_indices))
.map(lambda (query_index, db_index): (db_index - query_index, (query_index, WORD_SIZE) ))
At least now I'm not burning up storage with intermediate data structures as much.
Let's forget about the technical details of what you're doing and think "functionally" about the steps involved, forgetting about the details of the implementation. Functional thinking like this is an important part of parallel data analysis; ideally, if we can break the problem up like this, we can reason more clearly about the steps involved and end up with clearer and often more concise code. Thinking in terms of a tabular data model, I would consider your problem to consist of the following steps:
1. Join your two datasets on the sequence column.
2. Create a new column delta containing the difference between the indices.
3. Sort by (either) index to make sure that the subsequences are in the correct order.
4. Group by delta and concatenate the strings in the sequence column, to obtain the full matches between your datasets.
For the first 3 steps, I think it makes sense to use DataFrames, since this data model fits the kind of processing that we're doing. (Actually I might use DataFrames for step 4 as well, except pyspark doesn't currently support custom aggregate functions for DataFrames, although Scala does.)
For the fourth step (which, if I understand correctly, is what you're really asking about in your question), it's a little tricky to think about how to do this functionally. However, I think an elegant and efficient solution is to use a reduce (also known as a fold); this pattern can be applied to any problem that you can phrase in terms of iteratively applying an associative binary function, that is, a function where the "grouping" of any 3 arguments doesn't matter (although the order certainly may matter). Symbolically, this is a function x,y -> f(x,y) where f(x, f(y, z)) = f(f(x, y), z). String (or, more generally, list) concatenation is just such a function.
Here's an example of how this might look in pyspark; hopefully you can adapt this to the details of your data:
#setup some sample data
query = [['abcd', 30] ,['adab', 34] ,['dbab',38]]
reference = [['dbab', 20], ['ccdd', 24], ['abcd', 50], ['adab',54], ['dbab',58], ['dbab', 62]]
#create data frames
query_df = sqlContext.createDataFrame(query, schema = ['sequence1', 'index1'])
reference_df = sqlContext.createDataFrame(reference, schema = ['sequence2', 'index2'])
#step 1: join
matches = query_df.join(reference_df, query_df.sequence1 == reference_df.sequence2)
#step 2: calculate delta column
matches_delta = matches.withColumn('delta', matches.index2 - matches.index1)
#step 3: sort by index
matches_sorted = matches_delta.sort('delta').sort('index2')
#step 4: convert back to rdd and reduce
#note that + is just string concatenation for strings
r = matches_sorted['delta', 'sequence1'].rdd
r.reduceByKey(lambda x, y : x + y).collect()
#expected output:
#[(24, u'dbab'), (-18, u'dbab'), (20, u'abcdadabdbab')]
Related
Is there an elegant way of using hypothesis to directly generate complex pandas data frames with internal row and column dependencies? Let's say I want columns such as:
[longitude][latitude][some-text-meta][some-numeric-meta][numeric-data][some-junk][numeric-data][…
Geographic coordinates can be individually picked at random, but sets must usually come from a general area (e.g. standard reprojections don't work if you have two points on opposite sides of the globe). It's easy to handle that by choosing an area with one strategy and columns of coordinates from inside that area with another. All good so far…
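For context, a stripped-down, self-contained version of that two-stage idea (the names here are made up for illustration, not my real strategies):
import hypothesis.strategies as st

@st.composite
def box_and_coords(draw, n=5):
    """Draw a small lon/lat bounding box, then n points inside it."""
    lon0 = draw(st.floats(-180, 170))
    lat0 = draw(st.floats(-85, 80))
    lons = st.floats(lon0, lon0 + 10)
    lats = st.floats(lat0, lat0 + 5)
    return [(draw(lons), draw(lats)) for _ in range(n)]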
@st.composite
def plaus_spamspam_arrs(
    draw,
    st_lonlat=plaus_lonlat_arr,
    st_values=plaus_val_arr,
    st_areas=plaus_area_arr,
    st_meta=plaus_meta_arr,
    bounds=ARR_LEN,
):
    """Returns plausible spamspamspam arrays"""
    size = draw(st.integers(*bounds))
    coords = draw(st_lonlat(size=size))
    values = draw(st_values(size=size))
    areas = draw(st_areas(size=size))
    meta = draw(st_meta(size=size))
    return PlausibleData(coords, values, areas, meta)
The snippet above makes clean numpy arrays of coordinated single-value data. But the numeric data in the columns example (n-columns interspersed with junk) can also have row-wise dependencies such as needing to be normalised to some factor involving a row-wise sum and/or something else chosen dynamically at runtime.
I can generate all these bits separately, but I can't see how to stitch them into a single data frame without using a clumsy concat-based technique that, I presume, would disrupt draw-based shrinking. Moreover, I need a solution that adapts beyond what's above, so a hack likely won't get me far…
Maybe there's something with builds? I just can't quite see how to do it. Thanks for sharing if you know! A short example as inspiration would likely be enough.
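To make the ask concrete, here is roughly the shape I have in mind, as a purely hypothetical sketch (normalised_frames is made up): draw the raw columns, build the frame, then apply the row-wise constraint inside the same composite so shrinking still flows through draw.
import pandas as pd
import hypothesis.strategies as st

@st.composite
def normalised_frames(draw, ncols=3, nrow_bounds=(2, 10)):
    """Hypothetical: numeric columns whose rows are normalised by their own sum."""
    nrows = draw(st.integers(*nrow_bounds))
    cols = {
        "v{}".format(i): draw(st.lists(st.floats(0.1, 100), min_size=nrows, max_size=nrows))
        for i in range(ncols)
    }
    df = pd.DataFrame(cols)
    # row-wise dependency: divide each row by its own sum
    return df.div(df.sum(axis=1), axis=0)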
Update
I can generate columns roughly as follows:
@st.composite
def plaus_df_inputs(
    draw, *, nrows=None, ncols=None, nrow_bounds=ARR_LEN, ncol_bounds=COL_LEN
):
    """Returns …"""
    box_lon, box_lat = draw(plaus_box_geo())
    ncols_jnk = draw(st.integers(*ncol_bounds)) if ncols is None else ncols
    ncols_val = draw(st.integers(*ncol_bounds)) if ncols is None else ncols
    keys_val = draw(plaus_smp_key_elm(size=ncols_val))
    nrows = draw(st.integers(*nrow_bounds)) if nrows is None else nrows
    cols = (
        plaus_df_cols_lonlat(lons=plaus_lon(box_lon), lats=plaus_lat(box_lat))
        + plaus_df_cols_meta()
        + plaus_df_cols_value(keys=keys_val)
        + draw(plaus_df_cols_junk(size=ncols_jnk))
    )
    random.shuffle(cols)
    return draw(st_pd.data_frames(cols, index=plaus_df_idx(size=nrows)))
where the sub-stats are things like
@st.composite
def plaus_df_cols_junk(
    draw, *, size=1, names=plaus_meta(), dtypes=plaus_dtype(), unique=False
):
    """Returns strategy for list of columns of plausible junk data."""
    result = set()
    for _ in range(size):
        result.add(draw(names.filter(lambda name: name not in result)))
    return [
        st_pd.column(name=result.pop(), dtype=draw(dtypes), unique=unique)
        for _ in range(size)
    ]
What I need is something more elegant that incorporates the row-based dependencies.
from hypothesis import strategies as st

@st.composite
def interval_sets(draw):
    # To create our interval sets, we'll draw from a strategy that shrinks well,
    # and then transform it into the format we want. More specifically, we'll use
    # a single lists() strategy so that the shrinker can delete chunks atomically,
    # and then rearrange the floats that we draw as part of this.
    base_elems = st.tuples(
        # Different float bounds to ensure we get at least one valid start and end.
        st.text(),
        st.floats(0, 1, exclude_max=True),
        st.floats(0, 1, exclude_min=True),
    )
    base = draw(st.lists(base_elems, min_size=1, unique_by=lambda t: t[0]))
    nums = sorted(sum((t[1:] for t in base), start=()))  # arrange our endpoints
    return [
        {"name": name, "start": start, "end": end, "size": end - start}
        for (name, _, _), start, end in zip(base, nums[::2], nums[1::2])
    ]
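A possible way to consume that strategy in a test (sketch):
from hypothesis import given

@given(interval_sets())
def test_intervals_are_well_formed(intervals):
    for iv in intervals:
        assert 0 <= iv["start"] <= iv["end"] <= 1
        assert iv["size"] == iv["end"] - iv["start"]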
I am looking for a memory-efficient way to create a column in a pyspark DF using a function that is taking as arguments values from 'adjacent' (i.e., when the dataset has been sorted on a particular column) records, which are permutations of a particular hash.
That is, for a particular dataframe, which has several 'hash_n' (up to ~five) columns, I am looking to sort on each hash column, and create a new column based on a function of the hash in 'this' column, as well as the next several (up to ~15) columns. The function is essentially comparing the 'similarity' of the hashes and appending the 'index' of the 'other' column if the similarity is above some threshold.
Initially I was doing this with a window function and a pyspark UDF, but having run into outOfMemory problems, I am now converting to RDD, creating a dictionary of 'nearby' RDDs by incrementing the index, unioning the resulting dictionary values, and applying the function to the union via reduceByKey. This approach seems to be an improvement, although I am still running into memory issues (yarn killing containers as result of executor using too much memory; have tried several settings without being able to get around this problem).
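For reference, the window-function version I started from looked roughly like this (heavily simplified; HSHTBL, '__index__' and the hash columns are as in the code below, and the UDF comparison is omitted):
from pyspark.sql import Window
from pyspark.sql import functions as F

# sketch for one hash column and one partition column
w = Window.partitionBy(partnCol).orderBy('hash_0')
df = HSHTBL
for b in range(1, B + 1):
    df = (df.withColumn('hash_0_lead_{0}'.format(b), F.lead('hash_0', b).over(w))
            .withColumn('idx_lead_{0}'.format(b), F.lead('__index__', b).over(w)))
# a UDF would then compare 'hash_0' with each 'hash_0_lead_b' column and
# collect the matching 'idx_lead_b' values above the similarity threshold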
Here is the relevant part of the code I am using. It is slightly more convoluted because it has to allow for either a single column on which to partition the data, 'partnCol', or a list of such columns, 'partnCols' (partitioning here refers to how the data is processed: we only compare hashes if they are in the same partition); in practice we always have at least one partition column. The parameters are nPerms, the number of times that the hash is permuted (so that we have a total of nPerms+1 hash columns), and B, the number of adjacent records to look at for matches.
HSHTBLDict = {}; rdd0Dict = {}; rddDictDict = {}
possMtchsDict = {}; mtchsDict = {}

for nn in range(nPerms + 1):
    if (partnCol):
        HSHTBLDict[nn] = _addNamedDFIndex(HSHTBL.orderBy(partnCol, 'hash_{0}'.format(nn)), 'newIdx')
        rdd0Dict[nn] = HSHTBLDict[nn].select('newIdx', partnCol, '__index__', 'hash_{0}'.format(nn)) \
            .orderBy(partnCol, 'hash_{0}'.format(nn)).rdd.map(tuple) \
            .map(lambda kv: ((kv[0], kv[1]), (kv[2:])))
    elif (partnCols):
        HSHTBLDict[nn] = _addNamedDFIndex(HSHTBL.orderBy(*(partnCols + ['hash_{0}'.format(nn)])), 'newIdx')
        rdd0Dict[nn] = HSHTBLDict[nn].select('newIdx', *(partnCols + ['__index__', 'hash_{0}'.format(nn)])) \
            .orderBy(*(partnCols + ['hash_{0}'.format(nn)])).rdd.map(tuple) \
            .map(lambda kv: ((kv[:len(partnCols) + 1]), (kv[len(partnCols) + 1:])))
    else:
        HSHTBLDict[nn] = _addNamedDFIndex(HSHTBL.orderBy('hash_{0}'.format(nn)), 'newIdx')
        rdd0Dict[nn] = HSHTBLDict[nn].select('newIdx', '__index__', 'hash_{0}'.format(nn)) \
            .orderBy('hash_{0}'.format(nn)).rdd.map(tuple) \
            .map(lambda kv: ((kv[0], ), (kv[1:])))
    rddDictDict[nn] = {}
    for b in range(1, B + 1):
        def funcPos(r, b=b):
            return r.map(lambda kv: (tuple([kv[0][x] + b if x == 0 else kv[0][x]
                                            for x in range(len(kv[0]))]), kv[1]))
        rddDictDict[nn][b] = funcPos(rdd0Dict[nn])
    possMtchsDict[nn] = sc.union([rdd0Dict[nn]] + rddDictDict[nn].values()) \
        .reduceByKey(lambda x, y: x + y, numPartitions=rddParts) \
        .mapValues(lambda v: tuple(v[i:i + 2] for i in range(0, len(v), 2)))
    mtchsDict[nn] = possMtchsDict[nn].mapValues(lambda v: tuple([tuple([v[x][0],
                        [p[0] for p in v if _hashSim(v[x][1], p[1]) > thr]])
                        for x in range(len(v))]))

# union together all combinations from all hash columns
Mtchs = sc.union(mtchsDict.values())
# map to rdd of (__index__, matchList) pairs
mls = Mtchs.flatMap(lambda kv: kv[1]).reduceByKey(lambda x, y: list(set(x + y)),
                                                  numPartitions=rddParts)
If anyone has any advice or ideas, I would be very happy to hear them.
I did try reducing the number of cores per executor, and I found that although the process was (comparatively) very slow, I could get it to complete. However, I have not yet found a combination of settings that works for the full range of nPerms / B that I would like to achieve; possibly it is necessary to rework the code / approach in order to do that. I would also like to better understand how the amount of memory that YARN allows for things like shuffles is determined. I have found that decreasing --executor-memory and increasing --conf spark.yarn.executor.memoryOverhead seems to help, but it is still not clear to me how one would calculate the amount of memory that will be available.
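For concreteness, the kind of settings I have been experimenting with, expressed as a SparkConf (the values are only examples from my experiments, not recommendations):
from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .set('spark.executor.memory', '6g')                  # example value only
        .set('spark.executor.cores', '2')                    # example value only
        .set('spark.yarn.executor.memoryOverhead', '2048'))  # MB; example value only
sc = SparkContext(conf=conf)
These correspond to the --executor-memory / --executor-cores / --conf spark.yarn.executor.memoryOverhead options of spark-submit.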
I give a lot of information on the methods that I used to write my code. If you just want to read my question, skip to the quotes at the end.
I'm working on a project that has a goal of detecting sub populations in a group of patients. I thought this sounded like the perfect opportunity to use association rule mining as I'm currently taking a class on the subject.
There are 42 variables in total. Of those, 20 are continuous and had to be discretized. For each variable, I used the Freedman-Diaconis rule to determine how many categories to divide a group into.
def Freedman_Diaconis(column_values):
    # sort the list first
    column_values[1].sort()
    first_quartile = int(len(column_values[1]) * .25)
    third_quartile = int(len(column_values[1]) * .75)
    fq_value = column_values[1][first_quartile]
    tq_value = column_values[1][third_quartile]
    iqr = tq_value - fq_value
    n_to_pow = len(column_values[1]) ** (-1.0 / 3)  # float exponent to avoid Python 2 integer division
    h = 2 * iqr * n_to_pow  # Freedman-Diaconis bin width
    retval = (column_values[1][-1] - column_values[1][0]) / h  # data range divided by bin width
    test = int(retval + 1)
    return test
From there I used min-max normalization
def min_max_transform(column_of_data, num_bins):
    min_max_normalizer = preprocessing.MinMaxScaler(feature_range=(1, num_bins))
    data_min_max = min_max_normalizer.fit_transform(column_of_data[1])
    data_min_max_ints = take_int(data_min_max)
    return data_min_max_ints
to transform my data, and then I simply took the integer portion to get the final categorization.
def take_int(list_of_float):
    ints = []
    for flt in list_of_float:
        asint = int(flt)
        ints.append(asint)
    return ints
I then also wrote a function that I used to combine this value with the variable name.
def string_transform(prefix, column, index):
    transformed_list = []
    transformed = ""
    if index < 4:
        for entry in column[1]:
            transformed = prefix + str(entry)
            transformed_list.append(transformed)
    else:
        prefix_num = prefix.split('x')
        for entry in column[1]:
            transformed = str(prefix_num[1]) + 'x' + str(entry)
            transformed_list.append(transformed)
    return transformed_list
This was done to differentiate variables that have the same value, but appear in different columns. For example, having a value of 1 for variable x14 means something different from getting a value of 1 in variable x20. The string transform function would create 14x1 and 20x1 for the previously mentioned examples.
After this, I wrote everything to a file in basket format
def create_basket(list_of_lists, headers):
    # for filename in os.listdir("."):
    #     if filename.e
    if not os.path.exists('baskets'):
        os.makedirs('baskets')
    down_length = len(list_of_lists[0])
    with open('baskets/dataset.basket', 'w') as basketfile:
        basket_writer = csv.DictWriter(basketfile, fieldnames=headers)
        for i in range(0, down_length):
            basket_writer.writerow({"trt": list_of_lists[0][i], "y": list_of_lists[1][i], "x1": list_of_lists[2][i],
                                    "x2": list_of_lists[3][i], "x3": list_of_lists[4][i], "x4": list_of_lists[5][i],
                                    "x5": list_of_lists[6][i], "x6": list_of_lists[7][i], "x7": list_of_lists[8][i],
                                    "x8": list_of_lists[9][i], "x9": list_of_lists[10][i], "x10": list_of_lists[11][i],
                                    "x11": list_of_lists[12][i], "x12": list_of_lists[13][i], "x13": list_of_lists[14][i],
                                    "x14": list_of_lists[15][i], "x15": list_of_lists[16][i], "x16": list_of_lists[17][i],
                                    "x17": list_of_lists[18][i], "x18": list_of_lists[19][i], "x19": list_of_lists[20][i],
                                    "x20": list_of_lists[21][i], "x21": list_of_lists[22][i], "x22": list_of_lists[23][i],
                                    "x23": list_of_lists[24][i], "x24": list_of_lists[25][i], "x25": list_of_lists[26][i],
                                    "x26": list_of_lists[27][i], "x27": list_of_lists[28][i], "x28": list_of_lists[29][i],
                                    "x29": list_of_lists[30][i], "x30": list_of_lists[31][i], "x31": list_of_lists[32][i],
                                    "x32": list_of_lists[33][i], "x33": list_of_lists[34][i], "x34": list_of_lists[35][i],
                                    "x35": list_of_lists[36][i], "x36": list_of_lists[37][i], "x37": list_of_lists[38][i],
                                    "x38": list_of_lists[39][i], "x39": list_of_lists[40][i], "x40": list_of_lists[41][i]})
and I used the apriori package in Orange to see if there were any association rules.
rules = Orange.associate.AssociationRulesSparseInducer(patient_basket, support=0.3, confidence=0.3)
print "%4s %4s %s" % ("Supp", "Conf", "Rule")
for r in rules:
    my_rule = str(r)
    split_rule = my_rule.split("->")
    if 'trt' in split_rule[1]:
        print 'treatment rule'
        print "%4.1f %4.1f %s" % (r.support, r.confidence, r)
Using this technique, I found quite a few association rules with my testing data.
THIS IS WHERE I HAVE A PROBLEM
When I read the notes for the training data, there is this note
...That is, the only
reason for the differences among observed responses to the same treatment across patients is
random noise. Hence, there is NO meaningful subgroup for this dataset...
My question is,
why do I get multiple association rules that would imply that there are subgroups, when according to the notes I shouldn't see anything?
I'm getting lift numbers that are above 2 as opposed to the 1 that you should expect if everything was random like the notes state.
Supp Conf Rule
0.3 0.7 6x0 -> trt1
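(For reference, the lift I am quoting is the standard support/confidence ratio; a quick sanity-check sketch:)
def lift(support_ab, support_a, support_b):
    # lift(A -> B) = support(A and B) / (support(A) * support(B))
    #              = confidence(A -> B) / support(B)
    return support_ab / (support_a * support_b)

# e.g. support(A and B) = 0.3, support(A) = 0.5, support(B) = 0.3  ->  lift = 2.0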
Even though my code runs, I'm not getting results anywhere close to what should be expected. This leads me to believe that I messed something up, but I'm not sure what it is.
After some research, I realized that my sample size is too small for the number of variables that I have. I would need a way larger sample size in order to really use the method that I was using. In fact, the method that I tried to use was developed with the assumption that it would be run on databases with hundreds of thousands or millions of rows.
I've got a question regarding Linear Searching in Python. Say I've got the base code of
for l in lines:
    for f in search_data:
        if my_search_function(l[1], [f[0], f[2]]):
            print "Found it!"
            break
in which we want to determine where in search_data exists the value stored in l[1]. Say my_search_function() looks like this:
def my_search_function(search_key, search_values):
    for s in search_values:
        if search_key in s:
            return True
    return False
Is there any way to increase the speed of processing? Binary Search would not work in this case, as lines and search_data are multidimensional lists and I need to preserve the indexes. I've tried an outside-in approach, i.e.
for line in lines:
    negative_index = -1
    positive_index = 0
    middle_element = len(search_data) / 2 if len(search_data) % 2 == 0 else (len(search_data) - 1) / 2
    found = False
    while positive_index < middle_element:
        # print str(positive_index)+","+str(negative_index)
        if my_search_function(line[1], [search_data[positive_index][0], search_data[negative_index][0]]):
            print "Found it!"
            break
        positive_index = positive_index + 1
        negative_index = negative_index - 1
However, I'm not seeing any speed increases from this. Does anyone have a better approach? I'm looking to cut the processing speed in half as I'm working with large amounts of CSV and the processing time for one file is > 00:15 which is unacceptable as I'm processing batches of 30+ files. Basically the data I'm searching on is essentially SKUs. A value from lines[0] could be something like AS123JK and a valid match for that value could be AS123. So a HashMap would not work here, unless there exists a way to do partial matches in a HashMap lookup that wouldn't require me breaking down the values like ['AS123', 'AS123J', 'AS123JK'], which is not ideal in this scenario. Thanks!
Binary Search would not work in this case, as lines and search_data are multidimensional lists and I need to preserve the indexes.
Regardless, it may be worth your while to extract the strings (along with some reference to the original data structure) into a flat list, sort it, and perform fast binary searches on it with help of the bisect module.
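A minimal sketch of that first idea with bisect (the flat list and the prefix_matches helper here are illustrative, not taken from your code):
import bisect

flat = sorted((s, idx) for idx, s in enumerate(['AS123JK', 'AS124', 'AS123']))
keys = [s for s, _ in flat]

def prefix_matches(key, keys, flat):
    """Return (string, original_index) pairs whose string starts with key."""
    start = bisect.bisect_left(keys, key)
    out = []
    while start < len(keys) and keys[start].startswith(key):
        out.append(flat[start])
        start += 1
    return out

# prefix_matches('AS123', keys, flat) -> [('AS123', 2), ('AS123JK', 0)]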
Or, instead of a large number of searches, also sort a combined list of all the search keys and traverse both lists in parallel, looking for matches. (This proceeds in a similar manner to the merge step in merge sort, without actually outputting a merged list.)
Code to illustrate the second approach:
lines = ['AS12', 'AS123', 'AS123J', 'AS123JK', 'AS124']
search_keys = ['AS123', 'AS125']

try:
    iter_keys = iter(sorted(search_keys))
    key = next(iter_keys)
    for line in sorted(lines):
        if line.startswith(key):
            print('Line {} matches {}'.format(line, key))
        else:
            while key < line[:len(key)]:
                key = next(iter_keys)
except StopIteration:  # all keys processed
    pass
It depends on the details of the problem.
For instance, if you search for complete words, you could create a hashtable of the searchable elements, and the final search would be a simple lookup.
Filling the hashtable is pseudo-linear.
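A tiny sketch of that idea for exact, complete-word matches (assuming the word to look up sits in column 0 of search_data, as in the question):
# build once (pseudo-linear), then each lookup is an average O(1) dict access
index_by_value = {}
for i, row in enumerate(search_data):
    index_by_value.setdefault(row[0], []).append(i)

# later, for each line:
#     hits = index_by_value.get(line[1], [])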
Ultimately, I broke down and implemented binary search on my multidimensional lists by sorting with the sorted() function, using a lambda as the key argument. Here is the first-pass code that I whipped up. It's not 100% efficient, but it's a vast improvement from where we were.
def binary_search(master_row, source_data, master_search_index, source_search_index):
    lower_bound = 0
    upper_bound = len(source_data) - 1
    found = False
    while lower_bound <= upper_bound and not found:
        middle_pos = (lower_bound + upper_bound) // 2
        if source_data[middle_pos][source_search_index] < master_row[master_search_index]:
            if search([source_data[middle_pos][source_search_index]], [master_row[master_search_index]]):
                return {"result": True, "index": middle_pos}
            lower_bound = middle_pos + 1
        elif source_data[middle_pos][source_search_index] > master_row[master_search_index]:
            if search([master_row[master_search_index]], [source_data[middle_pos][source_search_index]]):
                return {"result": True, "index": middle_pos}
            upper_bound = middle_pos - 1
        else:
            if len(source_data[middle_pos][source_search_index]) > 5:
                return {"result": True, "index": middle_pos}
            else:
                break
    return {"result": False, "index": -1}  # not found; keeps the calling code's results["result"] check working
and then where we actually make the Binary Search call
# where master_copy is the first multidimensional list, data_copy is the second
# the search columns are the columns we want to search against
for line in master_copy:
    for m in master_search_columns:
        found = False
        for d in data_search_columns:
            data_copy = sorted(data_copy, key=lambda x: x[d], reverse=False)
            results = binary_search(line, data_copy, m, d)
            found = results["result"]
            if found:
                line = update_row(line, data_copy[results["index"]], column_mapping)
                found_count = found_count + 1
                break
        if found:
            break
Here's the info for sorting a multidimensional list: Python Sort Multidimensional Array Based on 2nd Element of Subarray
I need some help getting my brain around designing an (efficient) Markov chain in Spark (via Python). I've written it as best I could, but the code I came up with doesn't scale. Basically, for the various map stages I wrote custom functions, and they work fine for sequences of a couple thousand, but when we get to 20,000+ (and I've got some up to 800k) things slow to a crawl.
For those of you not familiar with Markov models, this is the gist of it..
This is my data.. I've got the actual data (no header) in an RDD at this point.
ID, SEQ
500, HNL, LNH, MLH, HML
We look at sequences in tuples, so
(HNL, LNH), (LNH,MLH), etc..
And I need to get to this point.. where I return a dictionary (for each row of data) that I then serialize and store in an in-memory database.
{500:
{HNLLNH : 0.333},
{LNHMLH : 0.333},
{MLHHML : 0.333},
{LNHHNL : 0.000},
etc..
}
So in essence, each sequence is combined with the next (HNL,LNH become 'HNLLNH'), then for all possible transitions (combinations of sequences) we count their occurrence and then divide by the total number of transitions (3 in this case) and get their frequency of occurrence.
There were 3 transitions above, and one of those was HNLLNH.. So for HNLLNH, 1/3 = 0.333
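In plain Python, the per-row computation described above boils down to something like this toy sketch:
from collections import Counter

seq = ['HNL', 'LNH', 'MLH', 'HML']
pairs = [seq[i] + seq[i + 1] for i in range(len(seq) - 1)]  # ['HNLLNH', 'LNHMLH', 'MLHHML']
counts = Counter(pairs)
freqs = {k: float(v) / len(pairs) for k, v in counts.items()}
# {'HNLLNH': 0.333..., 'LNHMLH': 0.333..., 'MLHHML': 0.333...}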
As a side note, and I'm not sure if it's relevant, but the values for each position in a sequence are limited: 1st position (H/M/L), 2nd position (M/L), 3rd position (H/M/L).
What my code had previously done was to collect() the rdd, and map it a couple times using functions I wrote. Those functions first turned the string into a list, then merged list[1] with list[2], then list[2] with list[3], then list[3] with list[4], etc.. so I ended up with something like this..
[HNLLNH],[LNHMLH],[MHLHML], etc..
Then the next function created a dictionary out of that list, using the list item as a key and then counting the total occurrence of that key in the full list, divided by len(list) to get the frequency. I then wrapped that dictionary in another dictionary, along with its ID number (resulting in the 2nd code block, up above).
Like I said, this worked well for small-ish sequences, but not so well for lists with a length of 100k+.
Also, keep in mind, this is just one row of data. I have to perform this operation on anywhere from 10-20k rows of data, with rows of data varying between lengths of 500-800,000 sequences per row.
Any suggestions on how I can write pyspark code (using the API map/reduce/agg/etc.. functions) to do this efficiently?
EDIT
Code as follows.. Probably makes sense to start at the bottom. Please keep in mind I'm learning this (Python and Spark) as I go, and I don't do this for a living, so my coding standards are not great..
def f(x):
    # Custom RDD map function
    # Combines two separate transactions
    # into a single transition state
    cust_id = x[0]
    trans = ','.join(x[1])
    y = trans.split(",")
    s = ''
    for i in range(len(y) - 1):
        s = s + str(y[i] + str(y[i + 1])) + ","
    return str(cust_id + ',' + s[:-1])
def g(x):
    # Custom RDD map function
    # Calculates the transition state probabilities
    # by adding up state-transition occurrences
    # and dividing by total transitions
    cust_id = str(x.split(",")[0])
    trans = x.split(",")[1:]
    temp_list = []
    middle = int((len(trans[0]) + 1) / 2)
    for i in trans:
        temp_list.append((''.join(i)[:middle], ''.join(i)[middle:]))
    state_trans = {}
    for i in temp_list:
        # float() avoids Python 2 integer division returning 0
        state_trans[i] = float(temp_list.count(i)) / len(temp_list)
    my_dict = {}
    my_dict[cust_id] = state_trans
    return my_dict
def gen_tsm_dict_spark(lines):
    # Takes RDD/string input with format CUST_ID(or)PROFILE_ID,SEQ,SEQ,SEQ....
    # Returns RDD of dict with CUST_ID and tsm per customer
    # i.e. {cust_id : { ('NLN', 'LNN') : 0.33, ('HPN', 'NPN') : 0.66}
    # creates a tuple ([cust/profile_id], [SEQ,SEQ,SEQ])
    cust_trans = lines.map(lambda s: (s.split(",")[0], s.split(",")[1:]))
    with_seq = cust_trans.map(f)
    full_tsm_dict = with_seq.map(g)
    return full_tsm_dict

def main():
    result = gen_tsm_dict_spark(my_rdd)
    # Insert into DB
    for x in result.collect():
        for k, v in x.iteritems():
            db_insert(k, v)
You can try something like below. It depends heavily on toolz, but if you prefer to avoid external dependencies you can easily replace it with some standard Python libraries.
from __future__ import division
from collections import Counter
from itertools import product
from toolz.curried import sliding_window, map, pipe, concat
from toolz.dicttoolz import merge

# Generate all possible transitions
defaults = sc.broadcast(dict(map(
    lambda x: ("".join(concat(x)), 0.0),
    product(product("HNL", "NL", "HNL"), repeat=2))))

rdd = sc.parallelize(["500, HNL, LNH, NLH, HNL", "600, HNN, NNN, NNN, HNN, LNH"])

def process(line):
    """
    >>> process("000, HHH, LLL, NNN")
    ('000', {'LLLNNN': 0.5, 'HHHLLL': 0.5})
    """
    bits = line.split(", ")
    transactions = bits[1:]
    n = len(transactions) - 1
    frequencies = pipe(
        sliding_window(2, transactions),  # Get all transitions
        map(lambda p: "".join(p)),        # Join strings
        Counter,                          # Count
        lambda cnt: {k: v / n for (k, v) in cnt.items()}  # Get frequencies
    )
    return bits[0], frequencies

def store_partition(iter):
    for (k, v) in iter:
        db_insert(k, merge([defaults.value, v]))

rdd.map(process).foreachPartition(store_partition)
Since you know all possible transitions I would recommend using a sparse representation and ignore zeros. Moreover you can replace dictionaries with sparse vectors to reduce memory footprint.
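As a rough sketch of that sparse idea (assuming the defaults broadcast above as the fixed universe of transitions; to_sparse is a name made up here):
from pyspark.mllib.linalg import Vectors

all_keys = sorted(defaults.value.keys())
key_to_pos = {k: i for i, k in enumerate(all_keys)}

def to_sparse(frequencies):
    """frequencies: {'HNLLNH': 0.33, ...} -> SparseVector over all transitions."""
    items = sorted((key_to_pos[k], v) for k, v in frequencies.items())
    return Vectors.sparse(len(all_keys), items)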
You can achieve this result using pure PySpark; I did it that way myself.
To create frequencies, let's say you have already gotten this far and these are the input RDDs:
ID, SEQ
500, [HNL, LNH, MLH, HML ...]
and to get frequencies like (HNL, LNH), (LNH, MLH), ...:
inputRDD.map(lambda (k, list): get_frequencies(list)).flatMap(lambda x: x) \
    .reduceByKey(lambda v1, v2: v1 + v2)
def get_frequencies(states_list):
    """
    :param states_list: Its a list of Customer States.
    :return: State Frequencies List.
    """
    rest = []
    tuples_list = []
    for idx in range(0, len(states_list)):
        if idx + 1 < len(states_list):
            tuples_list.append((states_list[idx], states_list[idx + 1]))
    unique = set(tuples_list)
    for value in unique:
        rest.append((value, tuples_list.count(value)))
    return rest
and you will get results like
((HNL, LNH), 98), ((LNH, MLH), 458), ...
After this you may convert the result RDDs into DataFrames, or you can directly insert into the DB using the RDD's mapPartitions.
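For example, a minimal sketch of the mapPartitions-style insert (db_insert stands in for whatever database client you use, and resultRDD for the RDD produced above):
def insert_partition(rows):
    # one batch of (state_pair, count) records per partition
    for (pair, count) in rows:
        db_insert(pair, count)
    return []

resultRDD.mapPartitions(insert_partition).count()  # count() just forces evaluation
If you prefer DataFrames instead, toDF on an RDD of plain tuples also works, provided a SQLContext/SparkSession is active.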