I want to perform addition and multiplication in F_{2^8}
I currently have this code which seems to work for add but doesn't work for multiply; the issue seems to be that when I modulo by 100011011 (which represents x^8 + x^4 + x^3 + x + 1), it doesn't seem to do it. Another idea would be to use numpy.polynomial but it isn't as intuitive.
def toBinary(self, n):
return ''.join(str(1 & int(n) >> i) for i in range(8)[::-1])
def add(self, x, y):
"""
"10111001" + "10010100" = "00101101"
"""
if len(x)<8:
self.add('0'+x,y)
elif len(y)<8:
self.add(x,'0'+y)
try:
a = int(x,2); b = int(y,2)
z = int(x)+int(y)
s = ''
for i in str(z):
if int(i)%2 == 0:
s+='0'
else:
s+='1'
except:
return '00000000'
return s
def multiply(self, x, y):
"""
"10111001" * "10010100" = "10110010"
"""
if len(x)<8:
self.multiply('0'+x,y)
elif len(y)<8:
self.multiply(x,'0'+y)
result = '00000000'
result = '00000000'
while y!= '00000000' :
print(f'x:{x},y:{y},result:{result}')
if int(y[-1]) == 1 :
result = self.add(result ,x)
y = self.add(y, '00000001')
x = self.add(self.toBinary(int(x,2)<<1),'100011011')
y = self.toBinary(int(y,2)>>1) #b = self.multiply(b,inverse('00000010'))
return result
Python example for add (same as subtract), multiply, divide, and inverse. Assumes the input parameters are 8 bit values, and there is no check for divide by 0.
def add(x, y): # add is xor
return x^y
def sub(x, y): # sub is xor
return x^y
def mpy(x, y): # mpy two 8 bit values
p = 0b100011011 # mpy modulo x^8+x^4+x^3+x+1
m = 0 # m will be product
for i in range(8):
m = m << 1
if m & 0b100000000:
m = m ^ p
if y & 0b010000000:
m = m ^ x
y = y << 1
return m
def div(x, y): # divide using inverse
return mpy(x, inv(y)) # (no check for y = 0)
def inv(x): # x^254 = 1/x
p=mpy(x,x) # p = x^2
x=mpy(p,p) # x = x^4
p=mpy(p,x) # p = x^(2+4)
x=mpy(x,x) # x = x^8
p=mpy(p,x) # p = x^(2+4+8)
x=mpy(x,x) # x = x^16
p=mpy(p,x) # p = x^(2+4+8+16)
x=mpy(x,x) # x = x^32
p=mpy(p,x) # p = x^(2+4+8+16+32)
x=mpy(x,x) # x = x^64
p=mpy(p,x) # p = x^(2+4+8+16+32+64)
x=mpy(x,x) # x = x^128
p=mpy(p,x) # p = x^(2+4+8+16+32+64+128)
return p
print hex(add(0b01010101, 0b10101010)) # returns 0xff
print hex(mpy(0b01010101, 0b10101010)) # returns 0x59
print hex(div(0b01011001, 0b10101010)) # returns 0x55
For GF(2^n), both add and subtract are XOR. This means multiplies are carryless and divides are borrowless. The X86 has a carryless multiply for XMM registers, PCLMULQDQ. Divide by a constant can be done with carryless multiply by 2^64 / constant and using the upper 64 bits of the product. The inverse constant is generated using a loop for borrowless divide.
The reason for this is GF(2^n) elements are polynomials with 1 bit coefficients, (the coefficients are elements of GF(2)).
For GF(2^8), it would be simpler to generate exponentiate and log tables. Example C code:
#define POLY (0x11b)
/* all non-zero elements are powers of 3 for POLY == 0x11b */
typedef unsigned char BYTE;
/* ... */
static BYTE exp2[512];
static BYTE log2[256];
/* ... */
static void Tbli()
{
int i;
int b;
b = 0x01; /* init exp2 table */
for(i = 0; i < 512; i++){
exp2[i] = (BYTE)b;
b = (b << 1) ^ b; /* powers of 3 */
if(b & 0x100)
b ^= POLY;
}
log2[0] = 0xff; /* init log2 table */
for(i = 0; i < 255; i++)
log2[exp2[i]] = (BYTE)i;
}
/* ... */
static BYTE GFMpy(BYTE m0, BYTE m1) /* multiply */
{
if(0 == m0 || 0 == m1)
return(0);
return(exp2[log2[m0] + log2[m1]]);
}
/* ... */
static BYTE GFDiv(BYTE m0, BYTE m1) /* divide */
{
if(0 == m0)
return(0);
return(exp2[log2[m0] + 255 - log2[m1]]);
}
I created a Python package galois that extends NumPy arrays over finite fields. Working with GF(2^8) is quite easy, see my below example.
In [1]: import galois
In [2]: GF = galois.GF(2**8, irreducible_poly="x^8 + x^4 + x^3 + x + 1")
In [3]: print(GF.properties)
GF(2^8):
characteristic: 2
degree: 8
order: 256
irreducible_poly: x^8 + x^4 + x^3 + x + 1
is_primitive_poly: False
primitive_element: x + 1
# Your original values from your example
In [4]: a = GF(0b10111001); a
Out[4]: GF(185, order=2^8)
In [5]: b = GF(0b10010100); b
Out[5]: GF(148, order=2^8)
In [6]: c = a * b; c
Out[6]: GF(178, order=2^8)
# You can display the result as a polynomial over GF(2)
In [7]: GF.display("poly");
# This matches 0b10110010
In [8]: c
Out[8]: GF(x^7 + x^5 + x^4 + x, order=2^8)
You can work with arrays too.
In [12]: a = GF([1, 2, 3, 4]); a
Out[12]: GF([1, 2, 3, 4], order=2^8)
In [13]: b = GF([100, 110, 120, 130]); b
Out[13]: GF([100, 110, 120, 130], order=2^8)
In [14]: a * b
Out[14]: GF([100, 220, 136, 62], order=2^8)
It's open source, so you can review all the code. Here's a snippet of multiplication in GF(2^m). All of the inputs are integers. Here's how to perform the "polynomial multiplication" using integers with characteristic 2.
def _multiply_calculate(a, b, CHARACTERISTIC, DEGREE, IRREDUCIBLE_POLY):
"""
a in GF(2^m), can be represented as a degree m-1 polynomial a(x) in GF(2)[x]
b in GF(2^m), can be represented as a degree m-1 polynomial b(x) in GF(2)[x]
p(x) in GF(2)[x] with degree m is the irreducible polynomial of GF(2^m)
a * b = c
= (a(x) * b(x)) % p(x) in GF(2)
= c(x)
= c
"""
ORDER = CHARACTERISTIC**DEGREE
# Re-order operands such that a > b so the while loop has less loops
if b > a:
a, b = b, a
c = 0
while b > 0:
if b & 0b1:
c ^= a # Add a(x) to c(x)
b >>= 1 # Divide b(x) by x
a <<= 1 # Multiply a(x) by x
if a >= ORDER:
a ^= IRREDUCIBLE_POLY # Compute a(x) % p(x)
return c
The same example runs as follows.
In [72]: _multiply_calculate(0b10111001, 0b10010100, 2, 8, 0b100011011)
Out[72]: 178
In [73]: bin(_multiply_calculate(0b10111001, 0b10010100, 2, 8, 0b100011011))
Out[73]: '0b10110010'
Related
I'm trying to get the result in hex format, but I get the error "TypeError: 'float' object cannot be interpreted as an integer!"
39 d = chinese_remainder(a, n)
---> 40 number = hex(d)
41 print(number)
Code:
import functools
# Euclidean extended algorithm
def egcd(a, b):
if a == 0:
return b, 0, 1
else:
d, x, y = egcd(b % a, a)
return d, y - (b // a) * x, x
"""
Functions whcih calculate the CRT (
return x in ' x = a mod n'.
"""
def chinese_remainder(a, n):
modulus = functools.reduce(lambda a, b: a * b, n)
multipliers = []
for N_i in n:
N = modulus / N_i
gcd, inverse, y = egcd(N, N_i)
multipliers.append(inverse * N % modulus)
result = 0
for multi, a_i in zip(multipliers, a):
result = (result + multi * a_i) % modulus
return result
FN = 1184749
FM = 8118474
FL = 5386565
HN = 8686891
HM = 6036033
HK = 6029230
n = [FN, FM, FL]
a = [HN, HM, HK]
d = chinese_remainder(a, n)
number = hex(d)
print(number)
The result should be like this
FAB15A7AE056200F9
But it gives me
3.3981196080447865e + 19
How to fix this so that the result is in hex format ???
Normal division / operator returns float whereas you can use floor division // to get integers.
As others suggested, you have to use floor division to variable N like this N = modulus//N_i
I am still teaching some R mainly to myself (and to my students).
Here's an implementation of the Collatz sequence in R:
f <- function(n)
{
# construct the entire Collatz path starting from n
if (n==1) return(1)
if (n %% 2 == 0) return(c(n, f(n/2)))
return(c(n, f(3*n + 1)))
}
Calling f(13) I get
13, 40, 20, 10, 5, 16, 8, 4, 2, 1
However note that a vector is growing dynamically in size here. Such moves tend to be a recipe for inefficient code. Is there a more efficient version?
In Python I would use
def collatz(n):
assert isinstance(n, int)
assert n >= 1
def __colla(n):
while n > 1:
yield n
if n % 2 == 0:
n = int(n / 2)
else:
n = int(3 * n + 1)
yield 1
return list([x for x in __colla(n)])
I found a way to write into vectors without specifying their dimension a priori. Therefore a solution could be
collatz <-function(n)
{
stopifnot(n >= 1)
# define a vector without specifying the length
x = c()
i = 1
while (n > 1)
{
x[i] = n
i = i + 1
n = ifelse(n %% 2, 3*n + 1, n/2)
}
x[i] = 1
# now "cut" the vector
dim(x) = c(i)
return(x)
}
I was curious to see how a C++ implementation through Rcpp would compare to your two base R approaches. Here are my results.
First let's define a function collatz_Rcpp that returns the Hailstone sequence for a given integer n. The (non-recursive) implementation was adapted from Rosetta Code.
library(Rcpp)
cppFunction("
std::vector<int> collatz_Rcpp(int i) {
std::vector<int> v;
while(true) {
v.push_back(i);
if (i == 1) break;
i = (i % 2) ? (3 * i + 1) : (i / 2);
}
return v;
}
")
We now run a microbenchmark analysis using both your base R and the Rcpp implementation. We calculate the Hailstone sequences for the first 10000 integers
# base R implementation
collatz_R <- function(n) {
# construct the entire Collatz path starting from n
if (n==1) return(1)
if (n %% 2 == 0) return(c(n, collatz(n/2)))
return(c(n, collatz(3*n + 1)))
}
# "updated" base R implementation
collatz_R_updated <-function(n) {
stopifnot(n >= 1)
# define a vector without specifying the length
x = c()
i = 1
while (n > 1) {
x[i] = n
i = i + 1
n = ifelse(n %% 2, 3*n + 1, n/2)
}
x[i] = 1
# now "cut" the vector
dim(x) = c(i)
return(x)
}
library(microbenchmark)
n <- 10000
res <- microbenchmark(
baseR = sapply(1:n, collatz_R),
baseR_updated = sapply(1:n, collatz_R_updated),
Rcpp = sapply(1:n, collatz_Rcpp))
res
# expr min lq mean median uq max
# baseR 65.68623 73.56471 81.42989 77.46592 83.87024 193.2609
#baseR_updated 3861.99336 3997.45091 4240.30315 4122.88577 4348.97153 5463.7787
# Rcpp 36.52132 46.06178 51.61129 49.27667 53.10080 168.9824
library(ggplot2)
autoplot(res)
The (non-recursive) Rcpp implementation seems to be around 30% faster than the original (recursive) base R implementation. The "updated" (non-recursive) base R implementation is significantly slower than the original (recursive) base R approach (the microbenchmark takes around 10 minutes to finish on my MacBook Air due to baseR_updated).
How would I implement the Multiplicative Inverse in GF2^8 in Python 3?
My current functions look like this:
def gf_add(a, b):
return a ^ b
def gf_mul(a, b, mod=0x1B):
p = bytes(hex(0x00))
for i in range(8):
if (b & 1) != 0:
p ^= a
high_bit_set = bytes(a & 0x80)
a <<= 1
if high_bit_set != 0:
a ^= mod
b >>= 1
return p
Here is how I'd do it:
def gf_degree(a) :
res = 0
a >>= 1
while (a != 0) :
a >>= 1;
res += 1;
return res
def gf_invert(a, mod=0x1B) :
v = mod
g1 = 1
g2 = 0
j = gf_degree(a) - 8
while (a != 1) :
if (j < 0) :
a, v = v, a
g1, g2 = g2, g1
j = -j
a ^= v << j
g1 ^= g2 << j
a %= 256 # Emulating 8-bit overflow
g1 %= 256 # Emulating 8-bit overflow
j = gf_degree(a) - gf_degree(v)
return g1
The function gf_degree calculates the degree of the polynomial, and gf_invert, naturally, inverts any element of GF(2^8), except 0, of course.
The implementation of gf_invert follows a "text-book" algorithm on finding the multiplicative inverse of elements of a finite field.
Example
print(gf_invert(5)) # 82
print(gf_invert(1)) # 1
print(gf_invert(255)) # 28
Here is a live demo.
As mentioned in the comments you could also have used a logarithmic approach, or simply use brute force (trying every combination of multiplication).
You might look at my libgf2 module (which no one else actually uses) and use GF2Element:
from libgf2 import GF2Element
x = GF2Element(0x8, 0x11B)
x.inv
# find the inverse of x^3 in the quotient ring GF(2)[x]/p(x)
# where p(x) = x^8 + x^4 + x^3 + x + 1 (0x11B in bit vector format)
See this blog article for more details.
Note: libgf2 is in Python 2.7 so you'd have to port to Python 3, but it's a fairly small library.
I recently implemented Karatsuba Multiplication as a personal exercise. I wrote my implementation in Python following the pseudocode provided on wikipedia:
procedure karatsuba(num1, num2)
if (num1 < 10) or (num2 < 10)
return num1*num2
/* calculates the size of the numbers */
m = max(size_base10(num1), size_base10(num2))
m2 = m/2
/* split the digit sequences about the middle */
high1, low1 = split_at(num1, m2)
high2, low2 = split_at(num2, m2)
/* 3 calls made to numbers approximately half the size */
z0 = karatsuba(low1, low2)
z1 = karatsuba((low1+high1), (low2+high2))
z2 = karatsuba(high1, high2)
return (z2*10^(2*m2)) + ((z1-z2-z0)*10^(m2)) + (z0)
Here is my python implementation:
def karat(x,y):
if len(str(x)) == 1 or len(str(y)) == 1:
return x*y
else:
m = max(len(str(x)),len(str(y)))
m2 = m / 2
a = x / 10**(m2)
b = x % 10**(m2)
c = y / 10**(m2)
d = y % 10**(m2)
z0 = karat(b,d)
z1 = karat((a+b),(c+d))
z2 = karat(a,c)
return (z2 * 10**(2*m2)) + ((z1 - z2 - z0) * 10**(m2)) + (z0)
My question is about final merge of z0, z1, and z2.
z2 is shifted m digits over (where m is the length of the largest of two multiplied numbers).
Instead of simply multiplying by 10^(m), the algorithm uses *10^(2*m2)* where m2 is m/2.
I tried replacing 2*m2 with m and got incorrect results. I think this has to do with how the numbers are split but I'm not really sure what's going on.
Depending on your Python version you must or should replace / with the explicit floor division operator // which is the appropriate here; it rounds down ensuring that your exponents remain entire numbers.
This is essential for example when splitting your operands in high digits (by floor dividing by 10^m2) and low digits (by taking the residual modulo 10^m2) this would not work with a fractional m2.
It also explains why 2 * (x // 2) does not necessarily equal x but rather x-1 if x is odd.
In the last line of the algorithm 2 m2 is correct because what you are doing is giving a and c their zeros back.
If you are on an older Python version your code may still work because / used to be interpreted as floor division when applied to integers.
def karat(x,y):
if len(str(x)) == 1 or len(str(y)) == 1:
return x*y
else:
m = max(len(str(x)),len(str(y)))
m2 = m // 2
a = x // 10**(m2)
b = x % 10**(m2)
c = y // 10**(m2)
d = y % 10**(m2)
z0 = karat(b,d)
z1 = karat((a+b),(c+d))
z2 = karat(a,c)
return (z2 * 10**(2*m2)) + ((z1 - z2 - z0) * 10**(m2)) + (z0)
i have implemented the same idea but i have restricted to the 2 digit multiplication as the base case because i can reduce float multiplication in function
import math
def multiply(x,y):
sx= str(x)
sy= str(y)
nx= len(sx)
ny= len(sy)
if ny<=2 or nx<=2:
r = int(x)*int(y)
return r
n = nx
if nx>ny:
sy = sy.rjust(nx,"0")
n=nx
elif ny>nx:
sx = sx.rjust(ny,"0")
n=ny
m = n%2
offset = 0
if m != 0:
n+=1
offset = 1
floor = int(math.floor(n/2)) - offset
a = sx[0:floor]
b = sx[floor:n]
c = sy[0:floor]
d = sy[floor:n]
print(a,b,c,d)
ac = multiply(a,c)
bd = multiply(b,d)
ad_bc = multiply((int(a)+int(b)),(int(c)+int(d)))-ac-bd
r = ((10**n)*ac)+((10**(n/2))*ad_bc)+bd
return r
print(multiply(4,5))
print(multiply(4,58779))
print(int(multiply(4872139874092183,5977098709879)))
print(int(4872139874092183*5977098709879))
print(int(multiply(4872349085723098457,597340985723098475)))
print(int(4872349085723098457*597340985723098475))
print(int(multiply(4908347590823749,97098709870985)))
print(int(4908347590823749*97098709870985))
I tried replacing 2*m2 with m and got incorrect results. I think this has to do with how the numbers are split but I'm not really sure what's going on.
This goes to the heart of how you split your numbers for the recursive calls.
If you choose to use an odd n then n//2 will be rounded down to the nearest whole number, meaning your second number will have a length of floor(n/2) and you would have to pad the first with the floor(n/2) zeros.
Since we use the same n for both numbers this applies to both. This means if you stick to the original odd n for the final step, you would be padding the first term with the original n zeros instead of the number of zeros that would result from the combination of the first padding plus the second padding (floor(n/2)*2)
You have used m2 as a float. It needs to be an integer.
def karat(x,y):
if len(str(x)) == 1 or len(str(y)) == 1:
return x*y
else:
m = max(len(str(x)),len(str(y)))
m2 = m // 2
a = x // 10**(m2)
b = x % 10**(m2)
c = y // 10**(m2)
d = y % 10**(m2)
z0 = karat(b,d)
z1 = karat((a+b),(c+d))
z2 = karat(a,c)
return (z2 * 10**(2*m2)) + ((z1 - z2 - z0) * 10**(m2)) + (z0)
Your code and logic is correct, there is just issue with your base case. Since according to the algo a,b,c,d are 2 digit numbers you should modify your base case and keep the length of x and y equal to 2 in the base case.
I think it is better if you used math.log10 function to calculate the number of digits instead of converting to string, something like this :
def number_of_digits(number):
"""
Used log10 to find no. of digits
"""
if number > 0:
return int(math.log10(number)) + 1
elif number == 0:
return 1
else:
return int(math.log10(-number)) + 1 # Don't count the '-'
The base case if len(str(x)) == 1 or len(str(y)) == 1: return x*y is incorrect. If you run either of the python code given in answers against large integers, the karat() function will not produce the correct answer.
To make the code correct, you need to change the base case to if len(str(x) < 3 or len(str(y)) < 3: return x*y.
Below is a modified implementation of Paul Panzer's answer that correctly multiplies large integers.
def karat(x,y):
if len(str(x)) < 3 or len(str(y)) < 3:
return x*y
n = max(len(str(x)),len(str(y))) // 2
a = x // 10**(n)
b = x % 10**(n)
c = y // 10**(n)
d = y % 10**(n)
z0 = karat(b,d)
z1 = karat((a+b), (c+d))
z2 = karat(a,c)
return ((10**(2*n))*z2)+((10**n)*(z1-z2-z0))+z0
Anybody knows proper python implementation of TEA (Tiny Encryption Algorithm)? I tried the one I've found here: http://sysadminco.com/code/python-tea/ - but it does not seem to work properly.
It returns different results than other implementations in C or Java. I guess it's caused by completely different data types in python (or no data types in fact).
Here's the code and an example:
def encipher(v, k):
y=v[0];z=v[1];sum=0;delta=0x9E3779B9;n=32
w=[0,0]
while(n>0):
y += (z << 4 ^ z >> 5) + z ^ sum + k[sum & 3]
y &= 4294967295L # maxsize of 32-bit integer
sum += delta
z += (y << 4 ^ y >> 5) + y ^ sum + k[sum>>11 & 3]
z &= 4294967295L
n -= 1
w[0]=y; w[1]=z
return w
def decipher(v, k):
y=v[0]
z=v[1]
sum=0xC6EF3720
delta=0x9E3779B9
n=32
w=[0,0]
# sum = delta<<5, in general sum = delta * n
while(n>0):
z -= (y << 4 ^ y >> 5) + y ^ sum + k[sum>>11 & 3]
z &= 4294967295L
sum -= delta
y -= (z << 4 ^ z >> 5) + z ^ sum + k[sum&3]
y &= 4294967295L
n -= 1
w[0]=y; w[1]=z
return w
Python example:
>>> import tea
>>> key = [0xbe168aa1, 0x16c498a3, 0x5e87b018, 0x56de7805]
>>> v = [0xe15034c8, 0x260fd6d5]
>>> res = tea.encipher(v, key)
>>> "%X %X" % (res[0], res[1])
**'70D16811 F935148F'**
C example:
#include <unistd.h>
#include <stdio.h>
void encipher(unsigned long *const v,unsigned long *const w,
const unsigned long *const k)
{
register unsigned long y=v[0],z=v[1],sum=0,delta=0x9E3779B9,
a=k[0],b=k[1],c=k[2],d=k[3],n=32;
while(n-->0)
{
sum += delta;
y += (z << 4)+a ^ z+sum ^ (z >> 5)+b;
z += (y << 4)+c ^ y+sum ^ (y >> 5)+d;
}
w[0]=y; w[1]=z;
}
int main()
{
unsigned long v[] = {0xe15034c8, 0x260fd6d5};
unsigned long key[] = {0xbe168aa1, 0x16c498a3, 0x5e87b018, 0x56de7805};
unsigned long res[2];
encipher(v, res, key);
printf("%X %X\n", res[0], res[1]);
return 0;
}
$ ./tea
**D6942D68 6F87870D**
Please note, that both examples were run with the same input data (v and key), but results were different. I'm pretty sure C implementation is correct - it comes from a site referenced by wikipedia (I couldn't post a link to it because I don't have enough reputation points yet - some antispam thing)
I fixed it. Here is working TEA implementation in python:
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import sys
from ctypes import *
def encipher(v, k):
y = c_uint32(v[0])
z = c_uint32(v[1])
sum = c_uint32(0)
delta = 0x9e3779b9
n = 32
w = [0,0]
while(n>0):
sum.value += delta
y.value += ( z.value << 4 ) + k[0] ^ z.value + sum.value ^ ( z.value >> 5 ) + k[1]
z.value += ( y.value << 4 ) + k[2] ^ y.value + sum.value ^ ( y.value >> 5 ) + k[3]
n -= 1
w[0] = y.value
w[1] = z.value
return w
def decipher(v, k):
y = c_uint32(v[0])
z = c_uint32(v[1])
sum = c_uint32(0xc6ef3720)
delta = 0x9e3779b9
n = 32
w = [0,0]
while(n>0):
z.value -= ( y.value << 4 ) + k[2] ^ y.value + sum.value ^ ( y.value >> 5 ) + k[3]
y.value -= ( z.value << 4 ) + k[0] ^ z.value + sum.value ^ ( z.value >> 5 ) + k[1]
sum.value -= delta
n -= 1
w[0] = y.value
w[1] = z.value
return w
if __name__ == "__main__":
key = [1,2,3,4]
v = [1385482522,639876499]
enc = encipher(v,key)
print enc
print decipher(enc,key)
And a small sample:
>>> v
[1385482522, 639876499]
>>> tea.decipher(tea.encipher(v,key),key)
[1385482522L, 639876499L]
Since TEA is a block cipher and your v is a very small block, I'd guess there may be block padding differences, or as Wikipedia notes:
http://en.wikipedia.org/wiki/Tiny_Encryption_Algorithm:
Note that the reference implementation
is bound to a specific microprocessor
architecture meaning that byte order
considerations are important when
cyphertext is shared and processed on
different systems. The original paper
does not specify any details about
microprocessor architecture and so
anyone implementing a system using TEA
would need to make those
specifications for themselves.
I didn't inspect either implementation in detail. Your &= statements feel suspicious, too.
Tea is broken, do not use it.
XXTEA which is secure does not define endianess and stuff and you should should reinvent whell when you can use AES.
There is no point in using unsecure cryptography.
I strongy advice you to apply AES, it can be implemented in 8bit microcontolers whit just few kB of code
EDIT
Did you checked this code?
http://sysadminco.com/code/python-tea/