Hi I'm trying to make a list of the maximum value of a unique string within a list.
example:
a = ['DS10.json', 'DS11.json', 'DT4.json', 'DT5.json', 'DT6.json', 'CJ6.json', 'CJ7.json']
should return me a list of the following:
['DS11.json', 'DT6.json', 'CJ7.json']
I have tried the following code:
def j(l):
p = []
for i in l:
digcode = i.split('.')[0]
if any(s.startswith(digcode[:2]) for s in p): #there exists prefex in list
if digcode[2:] > p[[n for n, l in enumerate(p) if l.startswith(digcode[:2])][0]][2:]:
p.pop([n for n, l in enumerate(p) if l.startswith(digcode[:2])][0])
p.append(digcode)
else:
pass
else:
p.append(digcode)
return p
But when I apply it to a larger sample it does not do an accurate job
>>> o = ['AS6.json', 'AS7.json', 'AS8.json', 'AS9.json', 'BS1.json', 'BS2.json', 'BS3.json', 'BS4.json', 'BS5.json', 'CS1.json', 'CS2.json', 'CS3.json', 'CS4.json', 'CS5.json', 'CS6.json', 'DS10.json', 'DS11.json', 'DS4.json', 'DS5.json', 'DS6.json', 'DS7.json', 'DS8.json', 'DS9.json', 'ES4.json', 'ES5.json', 'ES6.json', 'FS5.json', 'FS6.json', 'FS7.json', 'FS8.json', 'MS4.json', 'MS5.json', 'MS6.json', 'MS7.json', 'MS8.json', 'MS9.json', 'NR1.json', 'NR2.json', 'NR3.json', 'NR4.json', 'NR5.json', 'NR6.json', 'NR7.json', 'NR8.json', 'VR1.json', 'VR2.json', 'VR3.json', 'VR4.json', 'VR5.json', 'VR6.json', 'VR7.json', 'VR8.json', 'XS11.json', 'XS9.json']
>>> j(o)
['AS9', 'BS5', 'CS6', 'DS9', 'ES6', 'FS8', 'MS9', 'NR8', 'VR8', 'XS9']
which is incorrect as there is a XS11 and DS11 as an example.
I would appreciate if someone could help me rectify my problem or perhaps find a simpler solution to my problem. Thank you
You are making string comparisons; '9' is greater than '11' because the character '9' comes later in the alphabet. You'll have to convert those to integers first.
I'd use a dictionary to map prefixes to the maximum number:
def find_latest(lst):
prefixes = {}
for entry in lst:
code, value = entry[:2], int(entry.partition('.')[0][2:])
if value > prefixes.get(code, (float('-inf'), ''))[0]:
prefixes[code] = (value, entry)
return [entry for value, entry in prefixes.values()]
This is far more efficient as it doesn't loop over your whole input list each time; you are processing the list N^2 times (add one element and you are adding N tests to work through); it processes your list in N steps instead. So instead of 100 tests for 10 elements, this just executes 10 tests.
Demo:
>>> sample = ['AS6.json', 'AS7.json', 'AS8.json', 'AS9.json', 'BS1.json', 'BS2.json', 'BS3.json', 'BS4.json', 'BS5.json', 'CS1.json', 'CS2.json', 'CS3.json', 'CS4.json', 'CS5.json', 'CS6.json', 'DS10.json', 'DS11.json', 'DS4.json', 'DS5.json', 'DS6.json', 'DS7.json', 'DS8.json', 'DS9.json', 'ES4.json', 'ES5.json', 'ES6.json', 'FS5.json', 'FS6.json', 'FS7.json', 'FS8.json', 'MS4.json', 'MS5.json', 'MS6.json', 'MS7.json', 'MS8.json', 'MS9.json', 'NR1.json', 'NR2.json', 'NR3.json', 'NR4.json', 'NR5.json', 'NR6.json', 'NR7.json', 'NR8.json', 'VR1.json', 'VR2.json', 'VR3.json', 'VR4.json', 'VR5.json', 'VR6.json', 'VR7.json', 'VR8.json', 'XS11.json', 'XS9.json']
>>> def find_latest(lst):
... prefixes = {}
... for entry in lst:
... code, value = entry[:2], int(entry.partition('.')[0][2:])
... if value > prefixes.get(code, (float('-inf'), ''))[0]:
... prefixes[code] = (value, entry)
... return [entry for value, entry in prefixes.values()]
...
>>> find_latest(sample)
['FS8.json', 'VR8.json', 'AS9.json', 'MS9.json', 'BS5.json', 'CS6.json', 'XS11.json', 'NR8.json', 'DS11.json', 'ES6.json']
It looks as though your digcode[2:] values are being compared lexicographically (dictionary order), rather than numerically.
So 9 is considered to be "larger than" 11, because in a list of words, a word that began with "9" would come after a word that began with "11".
For comparison purposes you should convert digcode[2:] to a number i.e. int(digcode[2:])
if digcode[2:] > p[[n for n, l in enumerate(p) if l.startswith(digcode[:2])][0]][2:]:
to
if int(digcode[2:]) > int(p[[n for n, l in enumerate(p) if l.startswith(digcode[:2])][0]][2:]):
This gives:
>>> j(o)
['AS9', 'BS5', 'CS6', 'DS11', 'ES6', 'FS8', 'MS9', 'NR8', 'VR8', 'XS11']
Related
I have a list A. I want to generate a new list Anew with if-else loop such that an average is taken whenever there are two values. For example, A[2][0][0]and A[2][1][0] exist hence, average is taken, A[3][1][0] doesn't exist and no average is possible for this position in Anew.
A=[[[0.369577823]],
[[0.987884443]],
[[1.244425627], [1.058887146]],
[[3.707809378]],
[[0.476881021]],
[[0.304499065], [1.037728999]],
[[3.073758392], [5.032947535]]]
Anew=[[[A[0][0][0]],
[A[1][0][0]],
[(A[2][0][0]+A[2][1][0])/2],
[A[3][0][0]],
[A[4][0][0]],
[(A[5][0][0]+A[5][1][0])/2],
[(A[6][0][0]+A[6][1][0])/2]]]
The desired output is
[[[0.369577823],
[0.987884443],
[1.1516563865],
[3.707809378],
[0.476881021],
[0.671114032],
[4.0533529635]]]
Try this.
A=[[[0.369577823]], [[0.987884443]], [[1.244425627], [1.058887146]], [[3.707809378]], [[0.476881021]], [[0.304499065], [1.037728999]],
[[3.073758392], [5.032947535]]]
s2 = [[]]
for a in A:
s = []
for i in a:
s.append(sum(i))
s2[0].append([sum(s)/len(s)])
print(s2)
OUTPUT
[[[0.369577823], [0.987884443], [1.1516563865], [3.707809378], [0.476881021], [0.671114032], [4.0533529635]]]
all_numbers=[[[0.369577823]],
[[0.987884443]],
[[1.244425627], [1.058887146]],
[[3.707809378]],
[[0.476881021]],
[[0.304499065], [1.037728999]],
[[3.073758392], [5.032947535]]]
new_numbers = [[]]
for numbers in all_numbers:
if len(numbers) == 1:
new_numbers[0].append(numbers[0])
else:
new_add = 0
for number in numbers:
new_add += number[0]
new_add = new_add/len(numbers)
new_numbers[0].append([new_add])
print(new_numbers)
output
[[[0.369577823], [0.987884443], [1.1516563865], [3.707809378], [0.476881021], [0.671114032], [4.0533529635]]]
I am running my own little experiment and need a little help with the code.
I am creating a list that stores 100 sets in index locations 0-99, with each stored set storing random numbers ranging from 1 to 100 that came from a randomly generated list containing 100 numbers.
For each set of numbers, I use the set() command to filter out any duplicates before appending this set to a list...so basically I have a list of 100 sets which contain numbers between 1-100.
I wrote a little bit of code to check the length of each set - I noticed that my sets were often 60-69 elements in length! Basically, 1/3 of all numbers is a duplicate.
The code:
from random import randint
sets = []
#Generate list containing 100 sets of sets.
#sets contain numbers between 1 and 100 and no duplicates.
for i in range(0, 100):
nums = []
for x in range(1, 101):
nums.append(randint(1, 100))
sets.append(set(nums))
#print sizes of each set
for i in range(0, len(sets)):
print(len(sets[i]))
#I now want to create a final set
#using the data stored within all sets to
#see if there is any unique value.
So here is the bit I can't get my head around...I want to see if there is a unique number in all of those sets! What I can't work out is how I go about doing that.
I know I can directly compare a set with another set if they are stored in their own variables...but I can't work out an efficient way of looping through a list of sets and compare them all to create a new set which, I hope, might contain just one unique value!
I have seen this code in the documentation...
s.symmetric_difference_update(t)
But I can't work out how I might apply that to my code.
Any help would be greatly appreciated!!
You could use a Counter dict to count the occurrences keeping values that only have a value of 1 across all sets:
from collections import Counter
sets = [{randint(1, 100) for _ in range(100)} for i in range(100)]
from itertools import chain
cn = Counter(chain.from_iterable(sets))
unique = [k for k, v in cn.items() if v == 1] # use {} to get a set
print(unique)
For an element to only be unique to any set the count of the element must be 1 across all sets in your list.
If we use a simple example where we add a value definitely outside our range:
In [27]: from random import randint
In [28]: from collections import Counter
In [29]: from itertools import chain
In [30]: sets = [{randint(1, 100) for _ in range(100)} for i in range(0, 100)]+ [{1, 2, 102},{3,4,103}]
In [31]: cn = Counter(chain.from_iterable(sets))
In [32]: unique = [k for k, v in cn.items() if v == 1]
In [33]: print(unique)
[103, 102]
If you want to find the sets that contain any of those elements:
In [34]: for st in sets:
....: if not st.isdisjoint(unique):
....: print(st)
....:
set([1, 2, 102])
set([3, 4, 103])
For your edited part of the question you can still use a Counter dict using Counter.most_common to get the min and max occurrence:
from collections import Counter
cn = Counter()
identified_sets = 0
sets = ({randint(1, MAX) for _ in range(MAX)} for i in range(MAX))
for i, st in enumerate(sets):
cn.update(st)
if len(st) < 60 or len(st) > 70:
print("Set {} Set Length: {}, Duplicates discarded: {:.0f}% *****".
format(i, len(st), (float((MAX - len(st)))/MAX)*100))
identified_sets += 1
else:
print("Set {} Set Length: {}, Duplicates discarded: {:.0f}%".
format(i, len(st), (float((MAX - len(st)))/MAX)*100))
#print lowest fequency
comm = cn.most_common()
print("key {} : count {}".format(comm[-1][0],comm[-1][1]))
#print highest frequency
print("key {} : count {}".format(comm[0][0], comm[0][1]))
print("Count of identified sets: {}, {:.0f}%".
format(identified_sets, (float(identified_sets)/MAX)*100))
If you call random.seed(0) before you create the sets in this and your own code you will see they both return identical numbers.
well you can do:
result = set()
for s in sets:
result.symmetric_difference_update(s)
After looking through the comments I decided to do things a little differently to accomplish my goal. Essentially, I realised I just wanted to check the frequency of numbers generated by a random number generator after all duplicates have been removed. I thought I could do this by using sets to remove duplicates and then using a set to remove duplicates found in sets...but this actually doesn't work!!
I also noticed that with 100 sets containing a maximum 100 possible numbers, on average the number of duplicated numbers was around 30-40%. As you increase the maximum number of sets and, thus the maximum number of numbers generated, the % of duplicated numbers discarded decreases by a clear pattern.
After further investigation you can work out the % of discarded numbers - its all down to probability of hitting the same number once a number has been generated...
Anyway...thanks for the help!
The code updated:
from random import randint
sets = []
identified_sets = 0
MAX = 100
for i in range(0, MAX):
nums = []
for x in range(1, MAX + 1):
nums.append(randint(1, MAX))
nums.sort()
print("Set %i" % i)
print(nums)
print()
sets.append(set(nums))
for i in range(0, len(sets)):
#Only relevant when using MAX == 100
if len(sets[i]) < 60 or len(sets[i]) > 70:
print("Set %i Set Length: %i, Duplicates discarded: %.0f%s *****" %
(i, len(sets[i]), (float((MAX - len(sets[i])))/MAX)*100, "%"))
identified_sets += 1
else:
print("Set %i Set Length: %i, Duplicates discarded: %.0f%s" %
(i, len(sets[i]), (float((MAX - len(sets[i])))/MAX)*100, "%"))
#dictionary of numbers
count = {}
for i in range(1, MAX + 1):
count[i] = 0
#count occurances of numbers
for s in sets:
for e in s:
count[int(e)] += 1
#print lowest fequency
print("key %i : count %i" %
(min(count, key=count.get), count[min(count, key=count.get)]))
#print highest frequency
print("key %i : count %i" %
(max(count, key=count.get), count[max(count, key=count.get)]))
#print identified sets <60 and >70 in length as these appear less often
print("Count of identified sets: %i, %.0f%s" %
(identified_sets, (float(identified_sets)/MAX)*100, "%"))
You can keep the reversed matrix as well, which is a mapping from numbers to the set of set indexes where this number has places in. This mapping should be a dict (from numbers to sets) in gerenal, but a simple list of sets can do the trick here.
(We could use Counter too, instead of keeping the whole reversed matrix)
from random import randint
sets = [set() for _ in range(100)]
byNum = [set() for _ in range(100)]
#Generate list containing 100 sets of sets.
#sets contain numbers between 1 and 100 and no duplicates.
for setIndex in range(0, 100):
for numIndex in range(100):
num = randint(1, 100)
byNum[num].add(setIndex)
sets[setIndex].add(num)
#print sizes of each set
for setIndex, _set in enumerate(sets):
print(setIndex, len(_set))
#I now want to create a final set
#using the data stored within all sets to
#see if there is any unique value.
for num, setIndexes in enumerate(byNum)[1:]:
if len(setIndexes) == 100:
print 'number %s has appeared in all the random sets'%num
I am trying to append every possible two-digit combination to the end of each string in a list.
The strings are each eight characters, and the digits should replace the seventh and eighth characters.
I am using itertools.product() to generate these two-digit combos, but I am not sure how to then append these combinations to strings. I tried using join(), but that sandwiches the string between each of the two digits.
My next attempt is below but doesn't work because you cannot concatenate 'str' and 'itertools.product' objects.
for p in passwords:
candidates += list(p[:6] + itertools.product(string.digits, string.digits))
So, passwords looks like this
['american', 'japanese']
and the output should be
['americ00', 'americ01', 'americ02', …, 'japane98', 'japane99']
Since you're just counting, the product isn't necessary This could be abbreviated to:
lst = ['american', 'japanese']
candidates = ['{}{:02d}'.format(e[:6], i) for e in lst for i in range(100)]
Which is alanalgous to the loop
candidates = []
for e in lst:
for i in range(100):
candidates.append('{}{:02d}'.format(e[:6], i))
If really want product for one reason or another:
['{}{}{}'.format(e[:6], i, j) for e in lst for i, j in itertools.product(string.digits, string.digits)]
This can also generalize to a product with more arguments
[e[:6] + ''.join(pr) for e in lst for pr in itertools.product(string.digits, string.digits, string.digits)]
If I am understanding this right, you want to create a list of candidates for each password. in that case, you could use something like below, where the replacements are calculated only once, and all the candidates are comma separated values:
import itertools
import string
candidates = []
replacements = ['%s%s' % (x, y) for x, y in itertools.product(string.digits, string.digits)]
list_of_l = ['american', 'japanese']
for l in list_of_l:
candidates += [l[:6] + repl for repl in replacements]
>>> print candidates
['americ00', 'americ01', 'americ02', 'americ03', 'americ04', 'americ05', 'americ06', 'americ07', 'americ08', 'americ09', 'americ10',
'americ11', 'americ12', 'americ13', 'americ14', 'americ15', 'americ16', 'americ17', 'americ18', 'americ19', 'americ20', 'americ21',
'americ22', 'americ23', 'americ24', 'americ25', 'americ26', 'americ27', 'americ28', 'americ29', 'americ30', 'americ31', 'americ32',
'americ33', 'americ34', 'americ35', 'americ36', 'americ37', 'americ38', 'americ39', 'americ40', 'americ41', 'americ42', 'americ43',
'americ44', 'americ45', 'americ46', 'americ47', 'americ48', 'americ49', 'americ50', 'americ51', 'americ52', 'americ53', 'americ54',
'americ55', 'americ56', 'americ57', 'americ58', 'americ59', 'americ60', 'americ61', 'americ62', 'americ63', 'americ64', 'americ65',
'americ66', 'americ67', 'americ68', 'americ69', 'americ70', 'americ71', 'americ72', 'americ73', 'americ74', 'americ75', 'americ76',
'americ77', 'americ78', 'americ79', 'americ80', 'americ81', 'americ82', 'americ83', 'americ84', 'americ85', 'americ86', 'americ87',
'americ88', 'americ89', 'americ90', 'americ91', 'americ92', 'americ93', 'americ94', 'americ95', 'americ96', 'americ97', 'americ98',
'americ99', 'japane00', 'japane01', 'japane02', 'japane03', 'japane04', 'japane05', 'japane06', 'japane07', 'japane08', 'japane09',
'japane10', 'japane11', 'japane12', 'japane13', 'japane14', 'japane15', 'japane16', 'japane17', 'japane18', 'japane19', 'japane20',
'japane21', 'japane22', 'japane23', 'japane24', 'japane25', 'japane26', 'japane27', 'japane28', 'japane29', 'japane30', 'japane31',
'japane32', 'japane33', 'japane34', 'japane35', 'japane36', 'japane37', 'japane38', 'japane39', 'japane40', 'japane41', 'japane42',
'japane43', 'japane44', 'japane45', 'japane46', 'japane47', 'japane48', 'japane49', 'japane50', 'japane51', 'japane52', 'japane53',
'japane54', 'japane55', 'japane56', 'japane57', 'japane58', 'japane59', 'japane60', 'japane61', 'japane62', 'japane63', 'japane64',
'japane65', 'japane66', 'japane67', 'japane68', 'japane69', 'japane70', 'japane71', 'japane72', 'japane73', 'japane74', 'japane75',
'japane76', 'japane77', 'japane78', 'japane79', 'japane80', 'japane81', 'japane82', 'japane83', 'japane84', 'japane85', 'japane86',
'japane87', 'japane88', 'japane89', 'japane90', 'japane91', 'japane92', 'japane93', 'japane94', 'japane95', 'japane96', 'japane97',
'japane98', 'japane99']
Note that list is a python builtin, so it is not advised to use it as a variable name and hence I've used list_of_l to differentiate.
I created a dictionary myDict holding 10 million entries in the following form. Each entry in the dictionary represent {(id, age): code}
>>> myDict = {('1039', '68.0864'): '42731,42781,V4501',
('1039', '68.1704'): '4770,4778,V071',
('0845', '60.4476'): '2724,27800,4019',
('0983', '63.3936'): '41401,4168,4240,V1582,V7281'
}
A constant ageOffset is defined with value = 0.1
Given an (id,age) tuple, how can I fetch all values from myDict which have key (id, X) where:
age <= X <= age+ageOffset
I need to perform this fetch operation 20 billion times.
Examples:
1.
myTup = ('1039', '68.0')
the answer is: '42731,42781,V4501'
2.
myTup = ('0845', '60.0')
Ans : No value returned
Edit:
Can I create a sub-dictionary, on the basis of partial match on the first element of the Key. I mean, If first element of the tuple Key matched, then create a subdictionary. According to my data, this wont be longer than a couple of hundreds. And then perform linear range search comparing the second element in the tuple key and finding the corresponding values.
To do this operation 20 billion(!) times, you will have to preprocess your data a bit.
First, I would group by id:
def preprocess(data):
from collections import defaultdict # Python 2.5+ only
preprocessed = defaultdict(list)
# group by id
for (id, age), value in data.iteritems():
preprocessed[id].append((float(age), value))
# sort lists for binary search, see edit
for key, value in preprocessed.iteritems():
value.sort()
return preprocessed
Result should look like this:
>>> preprocess(myDict)
defaultdict(<type 'list'>, {
'0845': [(60.4476, '2724,27800,4019')],
'0983': [(63.3936, '41401,4168,4240,V1582,V7281')],
'1039': [(68.0864, '42731,42781,V4501'), (68.1704, '4770,4778,V071')]}
If relatively few items share the same id, thus resulting in short lists, you might get away with filtering the list.
def lookup(data, id, age, age_offset=0.1):
if id in data:
return [value for x, value in data[id] if age <= x <= age+age_offset]
else:
return None
lookup(preprocessed, '1039', 68.0) # Note that I use floats for age
['42731,42781,V4501']
However, if many items share the same id, you will have to traverse long lists, making the lookup relatively slow. In this case, you will have to apply further optimizations.
Edit: as suggested by #Andrey Petrov
from bisect import bisect_left
from itertools import islice, takewhile
def optimized_lookup(data, id, age, age_offset=0.1):
if id in data:
l = data[id]
idx = bisect_left(l, age)
return [a for a,v in takewhile(lambda (x, value): x <= age+age_offset, islice(l, idx, None))]
else:
return None
Here's a way to do it in numpy, and though I haven't tested it I'm pretty confident it will be vastly faster than looping over the dictionary. I replaced the dictionary structure with a Numpy record array, and used np.where to locate the rows where they match the parameters you gave.
import numpy as np
myDict = {('1039', '68.0864'): '42731,42781,V4501',
('1039', '68.1704'): '4770,4778,V071',
('0845', '60.4476'): '2724,27800,4019',
('0983', '63.3936'): '41401,4168,4240,V1582,V7281'
}
records=[]
for k,v in myDict.iteritems():
records.append([k[0], float(k[1]), v])
myArr = np.rec.fromrecords(records, formats='S10, f4, S100',
names="ID, Age, Code")
def findInMyArray(arr, requestedID, requestedAge, tolerance=0.1):
idx = np.where(((arr["Age"] - requestedAge) < tolerance) & (arr["ID"] == requestedID))
return idx
idx = findInMyArray(myArr, "1039", 68.0, tolerance=0.1)
print "The index found is: ", idx
print "The values are: ", myArr["Code"][idx[0]]
def getr(t):
id = float(t[0])
age = float(t[1])
os = 0.1
rs = []
correct_id=fixed[id]
for k in correct_id.keys():
if (k > age and k <= age + os):
rs.append(correct_id.get(k))
return rs
ct = {('1039', '68.0864'): '42731,42781,V4501',
('1039', '68.1704'): '4770,4778,V071',
('0845', '60.4476'): '2724,27800,4019',
('0983', '63.3936'): '41401,4168,4240,V1582,V7281' }
fixed={}
for k in ct:
if not(float(k[0]) in fixed):
fixed[float(k[0])]={}
fixed[float(k[0])][float(k[1])] = ct[k]
print "1"
myTup = ('1039', '68.0')
assert(getr(myTup) == ['42731,42781,V4501'])
#the answer is: '42731,42781,V4501'
print "2"
myTup = ('0845', '60.0')
assert(getr(myTup) == [])
#Ans : No value returned
Instead of a complete shuffle, I am looking for a partial shuffle function in python.
Example : "string" must give rise to "stnrig", but not "nrsgit"
It would be better if I can define a specific "percentage" of characters that have to be rearranged.
Purpose is to test string comparison algorithms. I want to determine the "percentage of shuffle" beyond which an(my) algorithm will mark two (shuffled) strings as completely different.
Update :
Here is my code. Improvements are welcome !
import random
percent_to_shuffle = int(raw_input("Give the percent value to shuffle : "))
to_shuffle = list(raw_input("Give the string to be shuffled : "))
num_of_chars_to_shuffle = int((len(to_shuffle)*percent_to_shuffle)/100)
for i in range(0,num_of_chars_to_shuffle):
x=random.randint(0,(len(to_shuffle)-1))
y=random.randint(0,(len(to_shuffle)-1))
z=to_shuffle[x]
to_shuffle[x]=to_shuffle[y]
to_shuffle[y]=z
print ''.join(to_shuffle)
This is a problem simpler than it looks. And the language has the right tools not to stay between you and the idea,as usual:
import random
def pashuffle(string, perc=10):
data = list(string)
for index, letter in enumerate(data):
if random.randrange(0, 100) < perc/2:
new_index = random.randrange(0, len(data))
data[index], data[new_index] = data[new_index], data[index]
return "".join(data)
Your problem is tricky, because there are some edge cases to think about:
Strings with repeated characters (i.e. how would you shuffle "aaaab"?)
How do you measure chained character swaps or re arranging blocks?
In any case, the metric defined to shuffle strings up to a certain percentage is likely to be the same you are using in your algorithm to see how close they are.
My code to shuffle n characters:
import random
def shuffle_n(s, n):
idx = range(len(s))
random.shuffle(idx)
idx = idx[:n]
mapping = dict((idx[i], idx[i-1]) for i in range(n))
return ''.join(s[mapping.get(x,x)] for x in range(len(s)))
Basically chooses n positions to swap at random, and then exchanges each of them with the next in the list... This way it ensures that no inverse swaps are generated and exactly n characters are swapped (if there are characters repeated, bad luck).
Explained run with 'string', 3 as input:
idx is [0, 1, 2, 3, 4, 5]
we shuffle it, now it is [5, 3, 1, 4, 0, 2]
we take just the first 3 elements, now it is [5, 3, 1]
those are the characters that we are going to swap
s t r i n g
^ ^ ^
t (1) will be i (3)
i (3) will be g (5)
g (5) will be t (1)
the rest will remain unchanged
so we get 'sirgnt'
The bad thing about this method is that it does not generate all the possible variations, for example, it could not make 'gnrits' from 'string'. This could be fixed by making partitions of the indices to be shuffled, like this:
import random
def randparts(l):
n = len(l)
s = random.randint(0, n-1) + 1
if s >= 2 and n - s >= 2: # the split makes two valid parts
yield l[:s]
for p in randparts(l[s:]):
yield p
else: # the split would make a single cycle
yield l
def shuffle_n(s, n):
idx = range(len(s))
random.shuffle(idx)
mapping = dict((x[i], x[i-1])
for i in range(len(x))
for x in randparts(idx[:n]))
return ''.join(s[mapping.get(x,x)] for x in range(len(s)))
import random
def partial_shuffle(a, part=0.5):
# which characters are to be shuffled:
idx_todo = random.sample(xrange(len(a)), int(len(a) * part))
# what are the new positions of these to-be-shuffled characters:
idx_target = idx_todo[:]
random.shuffle(idx_target)
# map all "normal" character positions {0:0, 1:1, 2:2, ...}
mapper = dict((i, i) for i in xrange(len(a)))
# update with all shuffles in the string: {old_pos:new_pos, old_pos:new_pos, ...}
mapper.update(zip(idx_todo, idx_target))
# use mapper to modify the string:
return ''.join(a[mapper[i]] for i in xrange(len(a)))
for i in xrange(5):
print partial_shuffle('abcdefghijklmnopqrstuvwxyz', 0.2)
prints
abcdefghljkvmnopqrstuxwiyz
ajcdefghitklmnopqrsbuvwxyz
abcdefhwijklmnopqrsguvtxyz
aecdubghijklmnopqrstwvfxyz
abjdefgcitklmnopqrshuvwxyz
Evil and using a deprecated API:
import random
# adjust constant to taste
# 0 -> no effect, 0.5 -> completely shuffled, 1.0 -> reversed
# Of course this assumes your input is already sorted ;)
''.join(sorted(
'abcdefghijklmnopqrstuvwxyz',
cmp = lambda a, b: cmp(a, b) * (-1 if random.random() < 0.2 else 1)
))
maybe like so:
>>> s = 'string'
>>> shufflethis = list(s[2:])
>>> random.shuffle(shufflethis)
>>> s[:2]+''.join(shufflethis)
'stingr'
Taking from fortran's idea, i'm adding this to collection. It's pretty fast:
def partial_shuffle(st, p=20):
p = int(round(p/100.0*len(st)))
idx = range(len(s))
sample = random.sample(idx, p)
res=str()
samptrav = 1
for i in range(len(st)):
if i in sample:
res += st[sample[-samptrav]]
samptrav += 1
continue
res += st[i]
return res