Optimizing python code - python

Any tips on optimizing this python code for finding next palindrome:
Input number can be of 1000000 digits
COMMENTS ADDED
#! /usr/bin/python
def inc(lst,lng):#this function first extract the left half of the string then
#convert it to int then increment it then reconvert it to string
#then reverse it and finally append it to the left half.
#lst is input number and lng is its length
if(lng%2==0):
olst=lst[:lng/2]
l=int(lng/2)
olst=int(olst)
olst+=1
olst=str(olst)
p=len(olst)
if l<p:
olst2=olst[p-2::-1]
else:
olst2=olst[::-1]
lst=olst+olst2
return lst
else:
olst=lst[:lng/2+1]
l=int(lng/2+1)
olst=int(olst)
olst+=1
olst=str(olst)
p=len(olst)
if l<p:
olst2=olst[p-3::-1]
else:
olst2=olst[p-2::-1]
lst=olst+olst2
return lst
t=raw_input()
t=int(t)
while True:
if t>0:
t-=1
else:
break
num=raw_input()#this is input number
lng=len(num)
lst=num[:]
if(lng%2==0):#this if find next palindrome to num variable
#without incrementing the middle digit and store it in lst.
olst=lst[:lng/2]
olst2=olst[::-1]
lst=olst+olst2
else:
olst=lst[:lng/2+1]
olst2=olst[len(olst)-2::-1]
lst=olst+olst2
if int(num)>=int(lst):#chk if lst satisfies criteria for next palindrome
num=inc(num,lng)#otherwise call inc function
print num
else:
print lst

I think most of the time in this code is spent converting strings to integers and back. The rest is slicing strings and bouncing around in the Python interpreter. What can be done about these three things? There are a few unnecessary conversions in the code, which we can remove. I see no way to avoid the string slicing. To minimize your time in the interpreter you just have to write as little code as possible :-) and it also helps to put all your code inside functions.
The code at the bottom of your program, which takes a quick guess to try and avoid calling inc(), has a bug or two. Here's how I might write that part:
def nextPal(num):
lng = len(num)
guess = num[:lng//2] + num[(lng-1)//2::-1] # works whether lng is even or odd
if guess > num: # don't bother converting to int
return guess
else:
return inc(numstr, n)
This simple change makes your code about 100x faster for numbers where inc doesn't need to be called, and about 3x faster for numbers where it does need to be called.
To do better than that, I think you need to avoid converting to int entirely. That means incrementing the left half of the number without using ordinary Python integer addition. You can use an array and carry out the addition algorithm "by hand":
import array
def nextPal(numstr):
# If we don't need to increment, just reflect the left half and return.
n = len(numstr)
h = n//2
guess = numstr[:n-h] + numstr[h-1::-1]
if guess > numstr:
return guess
# Increment the left half of the number without converting to int.
a = array.array('b', numstr)
zero = ord('0')
ten = ord('9') + 1
for i in range(n - h - 1, -1, -1):
d = a[i] + 1
if d == ten:
a[i] = zero
else:
a[i] = d
break
else:
# The left half was all nines. Carry the 1.
# Update n and h since the length changed.
a.insert(0, ord('1'))
n += 1
h = n//2
# Reflect the left half onto the right half.
a[n-h:] = a[h-1::-1]
return a.tostring()
This is another 9x faster or so for numbers that require incrementing.
You can make this a touch faster by using a while loop instead of for i in range(n - h - 1, -1, -1), and about twice as fast again by having the loop update both halves of the array rather than just updating the left-hand half and then reflecting it at the end.

You don't have to find the palindrome, you can just generate it.
Split the input number, and reflect it. If the generated number is too small, then increment the left hand side and reflect it again:
def nextPal(n):
ns = str(n)
oddoffset = 0
if len(ns) % 2 != 0:
oddoffset = 1
leftlen = len(ns) / 2 + oddoffset
lefts = ns[0:leftlen]
right = lefts[::-1][oddoffset:]
p = int(lefts + right)
if p < n:
## Need to increment middle digit
left = int(lefts)
left += 1
lefts = str(left)
right = lefts[::-1][oddoffset:]
p = int(lefts + right)
return p
def test(n):
print n
p = nextPal(n)
assert p >= n
print p
test(1234567890)
test(123456789)
test(999999)
test(999998)
test(888889)
test(8999999)

EDIT
NVM, just look at this page: http://thetaoishere.blogspot.com/2009/04/finding-next-palindrome-given-number.html

Using strings. n >= 0
from math import floor, ceil, log10
def next_pal(n):
# returns next palindrome, param is an int
n10 = str(n)
m = len(n10) / 2.0
s, e = int(floor(m - 0.5)), int(ceil(m + 0.5))
start, middle, end = n10[:s], n10[s:e], n10[e:]
assert (start, middle[0]) == (end[-1::-1], middle[-1]) #check that n is actually a palindrome
r = int(start + middle[0]) + 1 #where the actual increment occurs (i.e. add 1)
r10 = str(r)
i = 3 - len(middle)
if len(r10) > len(start) + 1:
i += 1
return int(r10 + r10[-i::-1])
Using log, more optized. n > 9
def next_pal2(n):
k = log10(n + 1)
l = ceil(k)
s, e = int(floor(l/2.0 - 0.5)), int(ceil(l/2.0 + 0.5))
mmod, emod = 10**(e - s), int(10**(l - e))
start, end = divmod(n, emod)
start, middle = divmod(start, mmod)
r1 = 10*start + middle%10 + 1
i = middle > 9 and 1 or 2
j = s - i + 2
if k == l:
i += 1
r2 = int(str(r1)[-i::-1])
return r1*10**j + r2

Related

How to write a program with unspecified number of if-else

I have a little problem which looks like a “combinatorics” problem.
We know that 1+2+3+…+k = (k^2 + k)/2; so, let’s take the set of numbers S = {1,2,3,4,…,(k^2 + k)/2} and divide this collection into k parts:
The 1st part has 1 element 1; the 2nd has 2 elements 2,3; the 3rd has 3 elements 4,5,6; …and so on…; the kth having k elements (k^2 – k + 2)/2,…,(k^2 + k)/2.
Then I have to draw a random integer in S, say i = random.randint(1, (k^2 + k)/2) and I have to do some operations according to the element that was drawn:
if i == 1:
`something`
else if 2 <= i <= 3:
`something else`
else if 4 <= i <= 6:
`something else`
…
else: # last line when `i` is in the last `kth` part
`something else`
The number k I have to use is variable, so I can't actually write the above program, because I don't know a priori where it should stop...
It seems to me that the best would be to define a function:
def cases(k):
i = random.randint(1, (k^2 + k)/2)
if i == 1:
`something`
else if 2 <= i <= 3:
… and so on…
But the problem remains: how could I write such a function without a specific k? There may be a trick in Python to do this, but I don't see how.
All ideas will be welcome.
Sorry for the indenting; I acknowledge it's wrong. Following the comment of Zach Munro, I allow myself an answer to better clarify the idea that I had in mind and which was not very clear.
Below is the kind of program I was thinking of; the something and something else are actually similar, for they use the same function, but with a different domain each time:
def cases(start, end, k):
delta = (end - start) / k
i = random.randint(1, (k**2 + k)/2)
if i == 1:
# random in the 1st part
x = random.uniform(start, start + 1*delta)
elif 2 <= i <= 3:
# random in the 2nd part
x = random.uniform(start + 1*delta, start + 2*delta)
elif 4 <= i <= 6:
# random in the 3rd part
x = random.uniform(start + 2*delta, start + 3*delta)
...
elif (k**2 – k + 2)/2 <= i <= (k**2 + k)/2:
# random in the kth part
x = random.uniform(start + (k-1)*delta, start + k*delta)
The question is always how to stop using ... in the program, but to make something that runs when the parameters are provided (start is the beginning of an interval, end is the end of the interval and k is in fact the number of parts into which we separate this interval).
Basically, we sample more and more as we move away from the origin of the interval.

Simple Fun #159: Middle Permutation

Task
You are given a string s. Every letter in s appears once.
Consider all strings formed by rearranging the letters in s. After ordering these strings in dictionary order, return the middle term. (If the sequence has a even length n, define its middle term to be the (n/2)th term.)
The problem
Hey guys. I am totaly stuck... I`ve got an algorithm to calculate the answer in O(n). All basic tests are passed. But I constantly fail all tests, where the lenght of string equals 23,24,25. Some scary stuff happens, always like this:
'noabcdefgijklymzustvpxwrq' should equal 'nmzyxwvutsrqpolkjigfedcba'
'lzyxvutsrqonmeijhkcafdgb' should equal 'lzyxvutsrqonmkjihgfedcba'
I mean that it goes in the right direction, but suddenly mistakes. Give me a hint what I should check or what thing to fix. Thanks a lot!
P.S. This execute middlePermutation in under 12000ms gave me the idea of solving
Code
import math
def middle_permutation(string):
ans, tmp = '', sorted(list(string))
dividend = math.factorial(len(tmp)) / 2
for i in range(len(tmp)):
perms = math.factorial(len(tmp)) / len(tmp)
if len(tmp) == 1:
ans += tmp[0]
break
letter = tmp[math.ceil(dividend / perms) - 1]
ans += letter
tmp.remove(letter)
dividend -= perms * (math.floor(dividend / perms))
print(len(string))
return ans
Here are some basic inputs
Test.describe("Basic tests")
Test.assert_equals(middle_permutation("abc"),"bac")
Test.assert_equals(middle_permutation("abcd"),"bdca")
Test.assert_equals(middle_permutation("abcdx"),"cbxda")
Test.assert_equals(middle_permutation("abcdxg"),"cxgdba")
Test.assert_equals(middle_permutation("abcdxgz"),"dczxgba")
You're not far from a good answer.
Because of 0 versus 1 indexing, you should start with
dividend = math.factorial(len(tmp)) // 2 - 1
and then you choose a slightly off letter, replace your code with
letter = tmp[dividend // perms]
Also as everything is integer here, it's better to use 'a // b' instead of math.floor(a / b).
All in all, here's a corrected version of your code:
def middle_permutation(string):
ans, tmp = '', sorted(list(string))
dividend = math.factorial(len(tmp)) // 2 - 1
for i in range(len(tmp)):
perms = math.factorial(len(tmp)) // len(tmp)
if len(tmp) == 1:
ans += tmp[0]
break
letter = tmp[dividend // perms]
ans += letter
tmp.remove(letter)
dividend -= perms * (dividend // perms)
return ans
and just for the beauty of it, a generalization:
def n_in_base(n, base):
r = []
for b in base:
r.append(n % b)
n //= b
return reversed(r)
def nth_permutation(s, n):
digits = n_in_base(n, range(1, len(s)+1))
alphabet = sorted(s)
return ''.join(alphabet.pop(ri) for ri in digits)
def middle_permutation(s):
return nth_permutation(s, math.factorial(len(s)) // 2 - 1)

Python: Streamlining Code for Brown Numbers

I was curious if any of you could come up with a more streamline version of code to calculate Brown numbers. as of the moment, this code can do ~650! before it moves to a crawl. Brown Numbers are calculated thought the equation n! + 1 = m**(2) Where M is an integer
brownNum = 8
import math
def squareNum(n):
x = n // 2
seen = set([x])
while x * x != n:
x = (x + (n // x)) // 2
if x in seen: return False
seen.add(x)
return True
while True:
for i in range(math.factorial(brownNum)+1,math.factorial(brownNum)+2):
if squareNum(i) is True:
print("pass")
print(brownNum)
print(math.factorial(brownNum)+1)
break
else:
print(brownNum)
print(math.factorial(brownNum)+1)
brownNum = brownNum + 1
continue
break
print(input(" "))
Sorry, I don't understand the logic behind your code.
I don't understand why you calculate math.factorial(brownNum) 4 times with the same value of brownNum each time through the while True loop. And in the for loop:
for i in range(math.factorial(brownNum)+1,math.factorial(brownNum)+2):
i will only take on the value of math.factorial(brownNum)+1
Anyway, here's my Python 3 code for a brute force search of Brown numbers. It quickly finds the only 3 known pairs, and then proceeds to test all the other numbers under 1000 in around 1.8 seconds on this 2GHz 32 bit machine. After that point you can see it slowing down (it hits 2000 around the 20 second mark) but it will chug along happily until the factorials get too large for your machine to hold.
I print progress information to stderr so that it can be separated from the Brown_number pair output. Also, stderr doesn't require flushing when you don't print a newline, unlike stdout (at least, it doesn't on Linux).
import sys
# Calculate the integer square root of `m` using Newton's method.
# Returns r: r**2 <= m < (r+1)**2
def int_sqrt(m):
if m <= 0:
return 0
n = m << 2
r = n >> (n.bit_length() // 2)
while True:
d = (n // r - r) >> 1
r += d
if -1 <= d <= 1:
break
return r >> 1
# Search for Browns numbers
fac = i = 1
while True:
if i % 100 == 0:
print('\r', i, file=sys.stderr, end='')
fac *= i
n = fac + 1
r = int_sqrt(n)
if r*r == n:
print('\nFound', i, r)
i += 1
You might want to:
pre calculate your square numbers, instead of testing for them on the fly
pre calculate your factorial for each loop iteration num_fac = math.factorial(brownNum) instead of multiple calls
implement your own, memoized, factorial
that should let you run to the hard limits of your machine
one optimization i would make would be to implement a 'wrapper' function around math.factorial that caches previous values of factorial so that as your brownNum increases, factorial doesn't have as much work to do. this is known as 'memoization' in computer science.
edit: found another SO answer with similar intention: Python: Is math.factorial memoized?
You should also initialize the square root more closely to the root.
e = int(math.log(n,4))
x = n//2**e
Because of 4**e <= n <= 4**(e+1) the square root will be between x/2 and x which should yield quadratic convergence of the Heron formula from the first iteration on.

Optimise the solution to Project Euler 12 (Python)

I have the following code for Project Euler Problem 12. However, it takes a very long time to execute. Does anyone have any suggestions for speeding it up?
n = input("Enter number: ")
def genfact(n):
t = []
for i in xrange(1, n+1):
if n%i == 0:
t.append(i)
return t
print "Numbers of divisors: ", len(genfact(n))
print
m = input("Enter the number of triangle numbers to check: ")
print
for i in xrange (2, m+2):
a = sum(xrange(i))
b = len(genfact(a))
if b > 500:
print a
For n, I enter an arbitrary number such as 6 just to check whether it indeed returns the length of the list of the number of factors.
For m, I enter entered 80 000 000
It works relatively quickly for small numbers. If I enter b > 50 ; it returns 28 for a, which is correct.
My answer here isn't pretty or elegant, it is still brute force. But, it simplifies the problem space a little and terminates successfully in less than 10 seconds.
Getting factors of n:
Like #usethedeathstar mentioned, it is possible to test for factors only up to n/2. However, we can do better by testing only up to the square root of n:
let n = 36
=> factors(n) : (1x36, 2x18, 3x12, 4x9, 6x6, 9x4, 12x3, 18x2, 36x1)
As you can see, it loops around after 6 (the square root of 36). We also don't need to explicitly return the factors, just find out how many there are... so just count them off with a generator inside of sum():
import math
def get_factors(n):
return sum(2 for i in range(1, round(math.sqrt(n)+1)) if not n % i)
Testing the triangular numbers
I have used a generator function to yield the triangular numbers:
def generate_triangles(limit):
l = 1
while l <= limit:
yield sum(range(l + 1))
l += 1
And finally, start testing:
def test_triangles():
triangles = generate_triangles(100000)
for i in triangles:
if get_factors(i) > 499:
return i
Running this with the profiler, it completes in less than 10 seconds:
$ python3 -m cProfile euler12.py
361986 function calls in 8.006 seconds
The BIGGEST time saving here is get_factors(n) testing only up to the square root of n - this makes it heeeaps quicker and you save heaps of memory overhead by not generating a list of factors.
As I said, it still isn't pretty - I am sure there are more elegant solutions. But, it fits the bill of being faster :)
I got my answer to run in 1.8 seconds with Python.
import time
from math import sqrt
def count_divisors(n):
d = {}
count = 1
while n % 2 == 0:
n = n / 2
try:
d[2] += 1
except KeyError:
d[2] = 1
for i in range(3, int(sqrt(n+1)), 2):
while n % i == 0 and i != n:
n = n / i
try:
d[i] += 1
except KeyError:
d[i] = 1
d[n] = 1
for _,v in d.items():
count = count * (v + 1)
return count
def tri_number(num):
next = 1 + int(sqrt(1+(8 * num)))
return num + (next/2)
def main():
i = 1
while count_divisors(i) < 500:
i = tri_number(i)
return i
start = time.time()
answer = main()
elapsed = (time.time() - start)
print("result %s returned in %s seconds." % (answer, elapsed))
Here is the output showing the timedelta and correct answer:
$ python ./project012.py
result 76576500 returned in 1.82238006592 seconds.
Factoring
For counting the divisors, I start by initializing an empty dictionary and a counter. For each factor found, I create key of d[factor] with value of 1 if it does not exist, otherwise, I increment the value d[factor].
For example, if we counted the factors 100, we would see d = {25: 1, 2: 2}
The first while loop, I factor out all 2's, dividing n by 2 each time. Next, I begin factoring at 3, skipping two each time (since we factored all even numbers already), and stopping once I get to the square root of n+1.
We stop at the square_root of n because if there's a pair of factors with one of the numbers bigger than square_root of n, the other of the pair has to be less than 10. If the smaller one doesn't exist, there is no matching larger factor.
https://math.stackexchange.com/questions/1343171/why-only-square-root-approach-to-check-number-is-prime
while n % 2 == 0:
n = n / 2
try:
d[2] += 1
except KeyError:
d[2] = 1
for i in range(3, int(sqrt(n+1)), 2):
while n % i == 0 and i != n:
n = n / i
try:
d[i] += 1
except KeyError:
d[i] = 1
d[n] = 1
Now that I have gotten each factor, and added it to the dictionary, we have to add the last factor (which is just n).
Counting Divisors
Now that the dictionary is complete, we loop through each of the items, and apply the following formula: d(n)=(a+1)(b+1)(c+1)...
https://www.wikihow.com/Determine-the-Number-of-Divisors-of-an-Integer
All this formula means is taking all of the counts of each factor, adding 1, then multiplying them together. Take 100 for example, which has factors 25, 2, and 2. We would calculate d(n)=(a+1)(b+1) = (1+1)(2+1) = (2)(3) = 6 total divisors
for _,v in d.items():
count = count * (v + 1)
return count
Calculate Triangle Numbers
Now, taking a look at tri_number(), you can see that I opted to calculate the next triangle number in a sequence without manually adding each whole number together (saving me millions of operations). Instead I used T(n) = n (n+1) / 2
http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/runsums/triNbProof.html
We are providing a whole number to the function as an argument, so we need to solve for n, which is going to be the whole number to add next. Once we have the next number (n), we simply add that single number to num and return
S=n(n+1)2
S=n2+n2
2S=n2+n
n2+n−2S=0
At this point, we use the quadratic formula for : ax2+bx+c=0.
n=−b±√b2−4ac / 2a
n=−1±√1−4(1)(−2S) / 2
n=−1±√1+8S / 2
https://socratic.org/questions/how-do-you-solve-for-n-in-s-n-n-1-2
So all tri_number() does is evaluate n=1+√1+8S / 2 (we ignore the negative equation here). The answer that is returned is the next triangle number in the sequence.
def tri_number(num):
next = 1 + int(sqrt(1+(8 * num)))
return num + (next/2)
Main Loop
Finally, we can look at main(). We start at whole number 1. We count the divisor of 1. If it is less than 500, we get the next triangle number, then try again and again until we get a number with > 500 divisors.
def main():
i = 1
while count_divisors(i) < 500:
i = tri_number(i)
return i
I am sure there are additional ways to optimize but I am not smart enough to understand those ways. If you find any better ways to optimize python, let me know! I originally solved project 12 in Golang, and that run in 25 milliseconds!
$ go run project012.go
76576500
2018/07/12 01:56:31 TIME: main() took 23.581558ms
one of the hints i can give is
def genfact(n):
t = []
for i in xrange(1, n+1):
if n%i == 0:
t.append(i)
return t
change that to
def genfact(n):
t=[]
for i in xrange(1,numpy.sqrt(n)+1):
if(n%i==0):
t.append(i)
t.apend(n/i)
since if a is a divisor than so is b=n/a, since a*b=a*n/b=n, That should help a part already (not sure if in your case a square is possible, but if so, add another case to exclude adding the same number twice)
You could devise a recursive thing too, (like if it is something like for 28, you get 1,28,2,14 and at the moment you are at knowing 14, you put in something to actually remember the divisors of 14 (memoize), than check if they are alraedy in the list, and if not, add them to the list, together with 28/d for each of the divisors of 14, and at the end just take out the duplicates
If you think my first answer is still not fast enough, ask for more, and i will check how it would be done to solve it faster with some more tricks (could probably make use of erastothenes sieve or so too, and some other tricks could be thought up as well if you would wish to really blow up the problem to huge proportions, like to check the first one with over 10k divisors or so)
while True:
c=0
n=1
m=1
for i in range(1,n+1):
if n%i==0:
c=c+1
m=m+1
n=m*(m+1)/2
if c>500:
break
print n
this is not my code but it is so optimized.
source: http://code.jasonbhill.com/sage/project-euler-problem-12/
import time
def num_divisors(n):
if n % 2 == 0: n = n / 2
divisors = 1
count = 0
while n % 2 == 0:
count += 1
n = n / 2
divisors = divisors * (count + 1)
p = 3
while n != 1:
count = 0
while n % p == 0:
count += 1
n = n / p
divisors = divisors * (count + 1)
p += 2
return divisors
def find_triangular_index(factor_limit):
n = 1
lnum, rnum = num_divisors(n), num_divisors(n + 1)
while lnum * rnum < 500:
n += 1
lnum, rnum = rnum, num_divisors(n + 1)
return n
start = time.time()
index = find_triangular_index(500)
triangle = (index * (index + 1)) / 2
elapsed = (time.time() - start)
print("result %s returned in %s seconds." % (triangle, elapsed))

Average of two strings in alphabetical/lexicographical order

Suppose you take the strings 'a' and 'z' and list all the strings that come between them in alphabetical order: ['a','b','c' ... 'x','y','z']. Take the midpoint of this list and you find 'm'. So this is kind of like taking an average of those two strings.
You could extend it to strings with more than one character, for example the midpoint between 'aa' and 'zz' would be found in the middle of the list ['aa', 'ab', 'ac' ... 'zx', 'zy', 'zz'].
Might there be a Python method somewhere that does this? If not, even knowing the name of the algorithm would help.
I began making my own routine that simply goes through both strings and finds midpoint of the first differing letter, which seemed to work great in that 'aa' and 'az' midpoint was 'am', but then it fails on 'cat', 'doggie' midpoint which it thinks is 'c'. I tried Googling for "binary search string midpoint" etc. but without knowing the name of what I am trying to do here I had little luck.
I added my own solution as an answer
If you define an alphabet of characters, you can just convert to base 10, do an average, and convert back to base-N where N is the size of the alphabet.
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def enbase(x):
n = len(alphabet)
if x < n:
return alphabet[x]
return enbase(x/n) + alphabet[x%n]
def debase(x):
n = len(alphabet)
result = 0
for i, c in enumerate(reversed(x)):
result += alphabet.index(c) * (n**i)
return result
def average(a, b):
a = debase(a)
b = debase(b)
return enbase((a + b) / 2)
print average('a', 'z') #m
print average('aa', 'zz') #mz
print average('cat', 'doggie') #budeel
print average('google', 'microsoft') #gebmbqkil
print average('microsoft', 'google') #gebmbqkil
Edit: Based on comments and other answers, you might want to handle strings of different lengths by appending the first letter of the alphabet to the shorter word until they're the same length. This will result in the "average" falling between the two inputs in a lexicographical sort. Code changes and new outputs below.
def pad(x, n):
p = alphabet[0] * (n - len(x))
return '%s%s' % (x, p)
def average(a, b):
n = max(len(a), len(b))
a = debase(pad(a, n))
b = debase(pad(b, n))
return enbase((a + b) / 2)
print average('a', 'z') #m
print average('aa', 'zz') #mz
print average('aa', 'az') #m (equivalent to ma)
print average('cat', 'doggie') #cumqec
print average('google', 'microsoft') #jlilzyhcw
print average('microsoft', 'google') #jlilzyhcw
If you mean the alphabetically, simply use FogleBird's algorithm but reverse the parameters and the result!
>>> print average('cat'[::-1], 'doggie'[::-1])[::-1]
cumdec
or rewriting average like so
>>> def average(a, b):
... a = debase(a[::-1])
... b = debase(b[::-1])
... return enbase((a + b) / 2)[::-1]
...
>>> print average('cat', 'doggie')
cumdec
>>> print average('google', 'microsoft')
jlvymlupj
>>> print average('microsoft', 'google')
jlvymlupj
It sounds like what you want, is to treat alphabetical characters as a base-26 value between 0 and 1. When you have strings of different length (an example in base 10), say 305 and 4202, your coming out with a midpoint of 3, since you're looking at the characters one at a time. Instead, treat them as a floating point mantissa: 0.305 and 0.4202. From that, it's easy to come up with a midpoint of .3626 (you can round if you'd like).
Do the same with base 26 (a=0...z=25, ba=26, bb=27, etc.) to do the calculations for letters:
cat becomes 'a.cat' and doggie becomes 'a.doggie', doing the math gives cat a decimal value of 0.078004096, doggie a value of 0.136390697, with an average of 0.107197397 which in base 26 is roughly "cumcqo"
Based on your proposed usage, consistent hashing ( http://en.wikipedia.org/wiki/Consistent_hashing ) seems to make more sense.
Thanks for everyone who answered, but I ended up writing my own solution because the others weren't exactly what I needed. I am trying to average app engine key names, and after studying them a bit more I discovered they actually allow any 7-bit ASCII characters in the names. Additionally I couldn't really rely on the solutions that converted the key names first to floating point, because I suspected floating point accuracy just isn't enough.
To take an average, first you add two numbers together and then divide by two. These are both such simple operations that I decided to just make functions to add and divide base 128 numbers represented as lists. This solution hasn't been used in my system yet so I might still find some bugs in it. Also it could probably be a lot shorter, but this is just something I needed to get done instead of trying to make it perfect.
# Given two lists representing a number with one digit left to decimal point and the
# rest after it, for example 1.555 = [1,5,5,5] and 0.235 = [0,2,3,5], returns a similar
# list representing those two numbers added together.
#
def ladd(a, b, base=128):
i = max(len(a), len(b))
lsum = [0] * i
while i > 1:
i -= 1
av = bv = 0
if i < len(a): av = a[i]
if i < len(b): bv = b[i]
lsum[i] += av + bv
if lsum[i] >= base:
lsum[i] -= base
lsum[i-1] += 1
return lsum
# Given a list of digits after the decimal point, returns a new list of digits
# representing that number divided by two.
#
def ldiv2(vals, base=128):
vs = vals[:]
vs.append(0)
i = len(vs)
while i > 0:
i -= 1
if (vs[i] % 2) == 1:
vs[i] -= 1
vs[i+1] += base / 2
vs[i] = vs[i] / 2
if vs[-1] == 0: vs = vs[0:-1]
return vs
# Given two app engine key names, returns the key name that comes between them.
#
def average(a_kn, b_kn):
m = lambda x:ord(x)
a = [0] + map(m, a_kn)
b = [0] + map(m, b_kn)
avg = ldiv2(ladd(a, b))
return "".join(map(lambda x:chr(x), avg[1:]))
print average('a', 'z') # m#
print average('aa', 'zz') # n-#
print average('aa', 'az') # am#
print average('cat', 'doggie') # d(mstr#
print average('google', 'microsoft') # jlim.,7s:
print average('microsoft', 'google') # jlim.,7s:
import math
def avg(str1,str2):
y = ''
s = 'abcdefghijklmnopqrstuvwxyz'
for i in range(len(str1)):
x = s.index(str2[i])+s.index(str1[i])
x = math.floor(x/2)
y += s[x]
return y
print(avg('z','a')) # m
print(avg('aa','az')) # am
print(avg('cat','dog')) # chm
Still working on strings with different lengths... any ideas?
This version thinks 'abc' is a fraction like 0.abc. In this approach space is zero and a valid input/output.
MAX_ITER = 10
letters = " abcdefghijklmnopqrstuvwxyz"
def to_double(name):
d = 0
for i, ch in enumerate(name):
idx = letters.index(ch)
d += idx * len(letters) ** (-i - 1)
return d
def from_double(d):
name = ""
for i in range(MAX_ITER):
d *= len(letters)
name += letters[int(d)]
d -= int(d)
return name
def avg(w1, w2):
w1 = to_double(w1)
w2 = to_double(w2)
return from_double((w1 + w2) * 0.5)
print avg('a', 'a') # 'a'
print avg('a', 'aa') # 'a mmmmmmmm'
print avg('aa', 'aa') # 'a zzzzzzzz'
print avg('car', 'duck') # 'cxxemmmmmm'
Unfortunately, the naïve algorithm is not able to detect the periodic 'z's, this would be something like 0.99999 in decimal; therefore 'a zzzzzzzz' is actually 'aa' (the space before the 'z' periodicity must be increased by one.
In order to normalise this, you can use the following function
def remove_z_period(name):
if len(name) != MAX_ITER:
return name
if name[-1] != 'z':
return name
n = ""
overflow = True
for ch in reversed(name):
if overflow:
if ch == 'z':
ch = ' '
else:
ch=letters[(letters.index(ch)+1)]
overflow = False
n = ch + n
return n
print remove_z_period('a zzzzzzzz') # 'aa'
I haven't programmed in python in a while and this seemed interesting enough to try.
Bear with my recursive programming. Too many functional languages look like python.
def stravg_half(a, ln):
# If you have a problem it will probably be in here.
# The floor of the character's value is 0, but you may want something different
f = 0
#f = ord('a')
L = ln - 1
if 0 == L:
return ''
A = ord(a[0])
return chr(A/2) + stravg_half( a[1:], L)
def stravg_helper(a, b, ln, x):
L = ln - 1
A = ord(a[0])
B = ord(b[0])
D = (A + B)/2
if 0 == L:
if 0 == x:
return chr(D)
# NOTE: The caller of helper makes sure that len(a)>=len(b)
return chr(D) + stravg_half(a[1:], x)
return chr(D) + stravg_helper(a[1:], b[1:], L, x)
def stravg(a, b):
la = len(a)
lb = len(b)
if 0 == la:
if 0 == lb:
return a # which is empty
return stravg_half(b, lb)
if 0 == lb:
return stravg_half(a, la)
x = la - lb
if x > 0:
return stravg_helper(a, b, lb, x)
return stravg_helper(b, a, la, -x) # Note the order of the args

Categories