I'm a beginner, trying to write code listing the most frequently overlapping ranges in a list of ranges.
So, input is various ranges (#1 through #7 in the example figure; https://prntscr.com/kj80xl) and I would like to find the most common range (in the example 3,000- 4,000 in 6 out of 7 - 86 %). Actually, I would like to find top 5 most frequent.
Not all ranges overlap. Ranges are always positive and given as integers with 1 distance (standard range).
What I have now is only code comparing one sequence to another and returning the overlap, but after that I'm stuck.
def range_overlap(range_x,range_y):
x = (range_x[0], (range_x[-1])+1)
y = (range_y[0], (range_y[-1])+1)
overlap = (max(x[0],y[0]),min(x[-1],(y[-1])))
if overlap[0] <= overlap[1]:
return range(overlap[0], overlap[1])
else:
return "Out of range"
I would be very grateful for any help.
Better solution
I came up with a simpler solution (at least IMHO) so here it is:
def get_abs_min(ranges):
return min([min(r) for r in ranges])
def get_abs_max(ranges):
return max([max(r) for r in ranges])
def count_appearances(i, ranges):
return sum([1 for r in ranges if i in r])
def create_histogram(ranges):
keys = [str(i) for i in range(len(ranges) + 1)]
histogram = dict.fromkeys(keys)
results = []
min = get_abs_min(range_list)
max = get_abs_max(range_list)
for i in range(min, max):
count = str(count_appearances(i, ranges))
if histogram[count] is None:
histogram[count] = dict(start=i, end=None)
elif histogram[count]['end'] is None:
histogram[count]['end'] = i
elif histogram[count]['end'] == i - 1:
histogram[count]['end'] = i
else:
start = histogram[count]['start']
end = histogram[count]['end']
results.append((range(start, end + 1), count))
histogram[count]['start'] = i
histogram[count]['end'] = None
for count, d in histogram.items():
if d is not None and d['start'] is not None and d['end'] is not None:
results.append((range(d['start'], d['end'] + 1), count))
return results
def main(ranges, top):
appearances = create_histogram(ranges)
return sorted(appearances, key=lambda t: t[1], reverse=True)[:top]
The idea here is as simple as iterating through a superposition of all the ranges and building a histogram of appearances (e.g. the number of original ranges this current i appears in)
After that just sort and slice according to the chosen size of the results.
Just call main with the ranges and the top number you want (or None if you want to see all results).
OLDER EDITS BELOW
I (almost) agree with #Kasramvd's answer.
here is my take on it:
from collections import Counter
from itertools import combinations
def range_overlap(x, y):
common_part = list(set(x) & set(y))
if common_part:
return range(common_part[0], common_part[-1] +1)
else:
return False
def get_most_common(range_list, top_frequent):
overlaps = Counter(range_overlap(i, j) for i, j in
combinations(list_of_ranges, 2))
return [(r, i) for (r, i) in overlaps.most_common(top_frequent) if r]
you need to input the range_list and the number of top_frequent you want.
EDIT
the previous answer solved this question for all 2's combinations over the range list.
This edit is tested against your input and results with the correct answer:
from collections import Counter
from itertools import combinations
def range_overlap(*args):
sets = [set(r) for r in args]
common_part = list(set(args[0]).intersection(*sets))
if common_part:
return range(common_part[0], common_part[-1] +1)
else:
return False
def get_all_possible_combinations(range_list):
all_combos = []
for i in range(2, len(range_list)):
all_combos.append(combinations(range_list, i))
all_combos = [list(combo) for combo in all_combos]
return all_combos
def get_most_common_for_combo(combo):
return list(filter(None, [range_overlap(*option) for option in combo]))
def get_most_common(range_list, top_frequent):
all_overlaps = []
combos = get_all_possible_combinations(range_list)
for combo in combos:
all_overlaps.extend(get_most_common_for_combo(combo))
return [r for (r, i) in Counter(all_overlaps).most_common(top_frequent) if r]
And to get the results just run get_most_common(range_list, top_frequent)
Tested on my machine (ubunut 16.04 with python 3.5.2) with your input range_list and top_frequent = 5 with the results:
[range(3000, 4000), range(2500, 4000), range(1500, 4000), range(3000, 6000), range(1, 4000)]
You can first change your function to return a valid range in both cases so that you can use it in a set of comparisons. Also, since Python's range objects are not already created iterables but smart objects that only get start, stop and step attributes of a range and create the range on-demand, you can do a little change on your function as well.
def range_overlap(range_x,range_y):
rng = range(max(range_x.start, range_y.start),
min(range_x.stop, range_y.stop)+1)
if rng.start < rng.stop:
return rng.start, rng.stop
Now, if you have a set of ranges and you want to compare all the pairs you can use itertools.combinations to get all the pairs and then using range_overlap and collections.Counter you can find the number of overlapped ranges.
from collections import Counter
from itertools import combinations
overlaps = Counter(range_overlap(i,j) for i, j in
combinations(list_of_ranges, 2))
Related
I am using Python 3.6 where you can nicely zip individual generators of one kind to get a multidimensional generator of the same kind. Take the following example, where get_random_sequence is a generator that yields an infinite random number sequence to simulate one individual asset at a stock market.
import random
from typing import Iterator
def get_random_sequence(start_value: float) -> Iterator[float]:
value = start_value
yield value
while True:
r = random.uniform(-.1, .1) * value
value = value + r
yield value
g = get_random_sequence(1.)
a = [next(g) for _ in range(5)]
print(a)
# [1.0, 1.015821868415922, 1.051470250712725, 0.9827564500218019, 0.9001851912863]
This generator can be easily extended with Python's zip function and yield from to generate successive asset values at a simulated market with an arbitrary number of assets.
def get_market(no_assets: int = 10) -> Iterator[Sequence[float]]:
rg = tuple(get_random_sequence(random.uniform(10., 60.)) for _ in range(no_assets))
yield from zip(*rg)
gm = get_market(2)
b = [next(gm) for _ in range(5)]
print(b)
# [(55.20719435959121, 54.15552382961163), (51.64409510285255, 53.6327489348457), (50.16517363363749, 52.92881727359184), (48.8692976247231, 52.7090801870517), (52.49414777987645, 49.733746261206036)]
What I like about this approach is the use of yield from to avoid a while True: loop in which a tuple of n assets would have to be constructed explicitly.
My question is: Is there a way to apply yield from in a similar manner when the zipped generators receive values over send()?
Consider the following generator that yields the ratio of successive values in an infinite sequence.
from typing import Optional, Generator
def ratio_generator() -> Generator[float, Optional[float], None]:
value_last = yield
value = yield
while True:
ratio = 0. if value_last == 0. else value / value_last
value_last = value
value = yield ratio
gr = ratio_generator()
next(gr) # move to the first yield
g = get_random_sequence(1.)
a = []
for _v in g:
_r = gr.send(_v)
if _r is None:
# two values are required for a ratio
continue
a.append(_r)
if len(a) >= 5:
break
print(a)
# [1.009041186223442, 0.9318419861800313, 1.0607677437816718, 0.9237896996817375, 0.9759635921282439]
The best way to "zip" this generator I could come up with, unfortunately, does not involve yield from at all... but instead the ugly while True: solution mentioned above.
def ratio_generator_multiple(no_values: int) -> Generator[Sequence[float], Optional[Sequence[float]], None]:
gs = tuple(ratio_generator() for _ in range(no_values))
for each_g in gs:
next(each_g)
values = yield
ratios = tuple(g.send(v) for g, v in zip(gs, values))
while True: # :(
values = yield None if None in ratios else ratios
ratios = tuple(g.send(v) for g, v in zip(gs, values))
rgm = ratio_generator_multiple(2)
next(rgm) # move to the first yield
gm = get_market(2)
b = []
for _v in gm:
_r = rgm.send(_v)
if _r is None:
# two values are required for a ratio
continue
b.append(_r)
if len(b) >= 5:
break
print(b)
# [(1.0684036496767984, 1.0531433541856687), (1.0279604693226763, 1.0649271401851732), (1.0469406709985847, 0.9350856571355237), (0.9818403001921499, 1.0344633443394962), (1.0380945284830183, 0.9081599684720663)]
Is there a way to do something like values = yield from zip(*(g.send(v) for g, v in zip(generators, values))) so I can still use yield from on the zipped generators without a while True:?
(The given example doesn't work, because it doesn't refresh the values on the right-hand side with the values on the left-hand side.)
I realize that this is more of an aesthetic problem. It would still be nice to know though...
import numpy as np
x = ([1,2,3,3])
y = ([1,2,3])
z = ([6,6,1,2,9,9])
(only positive values)
In each array i need to return the most common value, or, if values come up the same amount of times - return the minimum.
This is home assignment and I can't use anything but numpy.
outputs:
f(x) = 3,
f(y) = 1,
f(z) = 6
for a numpy exclusive solution something like this will work:
occurances = np.bincount(x)
print (np.argmax(occurances))
The above mentioned method won't work if there is a negative number in the list. So in order to account for such an occurrence kindly use:
not_required, counts = np.unique(x, return_counts=True)
x=np.array(x)
if (x >= 0).all():
print(not_required[np.argmax(counts)])
else:
print(not_required[np.argmax(counts)])
It's called a mode function. See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html
Without numpy
n_dict = {}
for k in x:
try:
n_dict[k] += 1
except KeyError:
n_dict[k] = 1
rev_n_dict = {}
for k in n_dict:
if n_dict[k] not in rev_n_dict:
rev_n_dict[n_dict[k]] = [k]
else:
rev_n_dict[n_dict[k]].append(k)
local_max = 0
for k in rev_n_dict:
if k > local_max:
local_max = k
if len(rev_n_dict[local_max]) > 0:
print (min(rev_n_dict[local_max]))
else:
print (rev_n_dict[local_max])
To add to the previous results, you could use a collections.Counter object:
my_array = [3,24,543,3,1,6,7,8,....,223213,13213]
from collections import Counter
my_counter = Counter( my_array)
most_common_value = my_counter.most_common(1)[0][0]
It is quite simple but certainly not pretty. I have used variable names that will be self explanatory along with the comments. Feel free to ask if there is a doubt.
import numpy as np
x=([6,6,1,2,9,9])
def tester(x):
not_required, counts = np.unique(x, return_counts=True)
x=np.array(x)
if (x >= 0).all():
highest_occurance=[not_required[np.argmax(counts)]]
number_of_counts=np.max(counts)
else:
highest_occurance=not_required[np.argmax(counts)]
number_of_counts=np.max(counts)
return highest_occurance,number_of_counts
most_abundant,first_test_counts=(tester(x))
new_x=[vals for vals in x if vals not in most_abundant]
second_most_abundant,second_test_counts=(tester(new_x))
if second_test_counts==first_test_counts:
print("Atleast two elements have the same number of counts",most_abundant," and", second_most_abundant, "have %s"%first_test_counts,"occurances")
else:
print("%s occurrs for the max of %s times"%(most_abundant,first_test_counts))
we can also loop it to check if there are more than two elements with the same occurrence, instead of using an if else for a specific case of only looking at two elements
I have data for businesses that has categories and review counts. I have grouped the categories together for each business and I want to separate out those businesses that have review counts that are above the median number of review counts within each category and those that are below the median number of review counts. Essentially, I need to return a Series of median values indexed by category and use that to find out if a given business is greater than the median for its category. I have to compare its review count to the median for its category.
My code is throwing errors and I can't figure out why. Suggestions? I've tried both of the below.
n = df.groupby('category')['review_count'].size()
def cats_median_split(n):
s = df.groupby('category')['review_count'].median()
if n > s:
return True
else:
return False
df.groupby('category')['review_count'].apply(cats_median_split)
OR:
n = df.groupby('category')['review_count'].sum()
def cats_median_split(n):
s = n.median()
if n > s:
return True
else:
return False
df.groupby('category')['review_count'].apply(cats_median_split)
If I understood correctly you wish to:
def median (seq, index=0):
customcmp = lambda x, y: cmp(x[index], y[index])
seq = sorted(seq, customcmp)
l = len(seq)
if l%2==0:
return (seq[l/2-1][index]+seq[l/2][index])/2.0
return seq[l/2][index]
def split (seq, index=0, trashhold=0):
left = []; right = []
for element in seq:
if element[index]<trashhold:
left.append(element)
else:
right.append(element)
return left, right
cats = [(123, 345), (99, 258), (9753, 36754), (234, 216), (123456, 76543)]
m = median(cats, 1)
split(cats, 0, m)
For median you better use numpy, but for smaller sequences this implementation will do.
This question already has answers here:
Union of multiple ranges
(5 answers)
Closed 7 years ago.
I'm trying to remove overlapping values from a collection of ranges.
The ranges are represented by a string like this:
499-505 100-115 80-119 113-140 500-550
I want the above to be reduced to two ranges: 80-140 499-550. That covers all the values without overlap.
Currently I have the following code.
cr = "100-115 115-119 113-125 80-114 180-185 500-550 109-120 95-114 200-250".split(" ")
ar = []
br = []
for i in cr:
(left,right) = i.split("-")
ar.append(left);
br.append(right);
inc = 0
for f in br:
i = int(f)
vac = []
jnc = 0
for g in ar:
j = int(g)
if(i >= j):
vac.append(j)
del br[jnc]
jnc += jnc
print vac
inc += inc
I split the array by - and store the range limits in ar and br. I iterate over these limits pairwise and if the i is at least as great as the j, I want to delete the element. But the program doesn't work. I expect it to produce this result: 80-125 500-550 200-250 180-185
For a quick and short solution,
from operator import itemgetter
from itertools import groupby
cr = "499-505 100-115 80-119 113-140 500-550".split(" ")
fullNumbers = []
for i in cr:
a = int(i.split("-")[0])
b = int(i.split("-")[1])
fullNumbers+=range(a,b+1)
# Remove duplicates and sort it
fullNumbers = sorted(list(set(fullNumbers)))
# Taken From http://stackoverflow.com/questions/2154249
def convertToRanges(data):
result = []
for k, g in groupby(enumerate(data), lambda (i,x):i-x):
group = map(itemgetter(1), g)
result.append(str(group[0])+"-"+str(group[-1]))
return result
print convertToRanges(fullNumbers)
#Output: ['80-140', '499-550']
For the given set in your program, output is ['80-125', '180-185', '200-250', '500-550']
Main Possible drawback of this solution: This may not be scalable!
Let me offer another solution that doesn't take time linearly proportional to the sum of the range sizes. Its running time is linearly proportional to the number of ranges.
def reduce(range_text):
parts = range_text.split()
if parts == []:
return ''
ranges = [ tuple(map(int, part.split('-'))) for part in parts ]
ranges.sort()
new_ranges = []
left, right = ranges[0]
for range in ranges[1:]:
next_left, next_right = range
if right + 1 < next_left: # Is the next range to the right?
new_ranges.append((left, right)) # Close the current range.
left, right = range # Start a new range.
else:
right = max(right, next_right) # Extend the current range.
new_ranges.append((left, right)) # Close the last range.
return ' '.join([ '-'.join(map(str, range)) for range in new_ranges ]
This function works by sorting the ranges, then looking at them in order and merging consecutive ranges that intersect.
Examples:
print(reduce('499-505 100-115 80-119 113-140 500-550'))
# => 80-140 499-550
print(reduce('100-115 115-119 113-125 80-114 180-185 500-550 109-120 95-114 200-250'))
# => 80-125 180-185 200-250 500-550
Instead of a complete shuffle, I am looking for a partial shuffle function in python.
Example : "string" must give rise to "stnrig", but not "nrsgit"
It would be better if I can define a specific "percentage" of characters that have to be rearranged.
Purpose is to test string comparison algorithms. I want to determine the "percentage of shuffle" beyond which an(my) algorithm will mark two (shuffled) strings as completely different.
Update :
Here is my code. Improvements are welcome !
import random
percent_to_shuffle = int(raw_input("Give the percent value to shuffle : "))
to_shuffle = list(raw_input("Give the string to be shuffled : "))
num_of_chars_to_shuffle = int((len(to_shuffle)*percent_to_shuffle)/100)
for i in range(0,num_of_chars_to_shuffle):
x=random.randint(0,(len(to_shuffle)-1))
y=random.randint(0,(len(to_shuffle)-1))
z=to_shuffle[x]
to_shuffle[x]=to_shuffle[y]
to_shuffle[y]=z
print ''.join(to_shuffle)
This is a problem simpler than it looks. And the language has the right tools not to stay between you and the idea,as usual:
import random
def pashuffle(string, perc=10):
data = list(string)
for index, letter in enumerate(data):
if random.randrange(0, 100) < perc/2:
new_index = random.randrange(0, len(data))
data[index], data[new_index] = data[new_index], data[index]
return "".join(data)
Your problem is tricky, because there are some edge cases to think about:
Strings with repeated characters (i.e. how would you shuffle "aaaab"?)
How do you measure chained character swaps or re arranging blocks?
In any case, the metric defined to shuffle strings up to a certain percentage is likely to be the same you are using in your algorithm to see how close they are.
My code to shuffle n characters:
import random
def shuffle_n(s, n):
idx = range(len(s))
random.shuffle(idx)
idx = idx[:n]
mapping = dict((idx[i], idx[i-1]) for i in range(n))
return ''.join(s[mapping.get(x,x)] for x in range(len(s)))
Basically chooses n positions to swap at random, and then exchanges each of them with the next in the list... This way it ensures that no inverse swaps are generated and exactly n characters are swapped (if there are characters repeated, bad luck).
Explained run with 'string', 3 as input:
idx is [0, 1, 2, 3, 4, 5]
we shuffle it, now it is [5, 3, 1, 4, 0, 2]
we take just the first 3 elements, now it is [5, 3, 1]
those are the characters that we are going to swap
s t r i n g
^ ^ ^
t (1) will be i (3)
i (3) will be g (5)
g (5) will be t (1)
the rest will remain unchanged
so we get 'sirgnt'
The bad thing about this method is that it does not generate all the possible variations, for example, it could not make 'gnrits' from 'string'. This could be fixed by making partitions of the indices to be shuffled, like this:
import random
def randparts(l):
n = len(l)
s = random.randint(0, n-1) + 1
if s >= 2 and n - s >= 2: # the split makes two valid parts
yield l[:s]
for p in randparts(l[s:]):
yield p
else: # the split would make a single cycle
yield l
def shuffle_n(s, n):
idx = range(len(s))
random.shuffle(idx)
mapping = dict((x[i], x[i-1])
for i in range(len(x))
for x in randparts(idx[:n]))
return ''.join(s[mapping.get(x,x)] for x in range(len(s)))
import random
def partial_shuffle(a, part=0.5):
# which characters are to be shuffled:
idx_todo = random.sample(xrange(len(a)), int(len(a) * part))
# what are the new positions of these to-be-shuffled characters:
idx_target = idx_todo[:]
random.shuffle(idx_target)
# map all "normal" character positions {0:0, 1:1, 2:2, ...}
mapper = dict((i, i) for i in xrange(len(a)))
# update with all shuffles in the string: {old_pos:new_pos, old_pos:new_pos, ...}
mapper.update(zip(idx_todo, idx_target))
# use mapper to modify the string:
return ''.join(a[mapper[i]] for i in xrange(len(a)))
for i in xrange(5):
print partial_shuffle('abcdefghijklmnopqrstuvwxyz', 0.2)
prints
abcdefghljkvmnopqrstuxwiyz
ajcdefghitklmnopqrsbuvwxyz
abcdefhwijklmnopqrsguvtxyz
aecdubghijklmnopqrstwvfxyz
abjdefgcitklmnopqrshuvwxyz
Evil and using a deprecated API:
import random
# adjust constant to taste
# 0 -> no effect, 0.5 -> completely shuffled, 1.0 -> reversed
# Of course this assumes your input is already sorted ;)
''.join(sorted(
'abcdefghijklmnopqrstuvwxyz',
cmp = lambda a, b: cmp(a, b) * (-1 if random.random() < 0.2 else 1)
))
maybe like so:
>>> s = 'string'
>>> shufflethis = list(s[2:])
>>> random.shuffle(shufflethis)
>>> s[:2]+''.join(shufflethis)
'stingr'
Taking from fortran's idea, i'm adding this to collection. It's pretty fast:
def partial_shuffle(st, p=20):
p = int(round(p/100.0*len(st)))
idx = range(len(s))
sample = random.sample(idx, p)
res=str()
samptrav = 1
for i in range(len(st)):
if i in sample:
res += st[sample[-samptrav]]
samptrav += 1
continue
res += st[i]
return res