So I have two matrices, A and B, and I want to compute the min-plus product as given here: Min-plus matrix multiplication. For that I've implemented the following:
def min_plus_product(A,B):
B = np.transpose(B)
Y = np.zeros((len(B),len(A)))
for i in range(len(B)):
Y[i] = (A + B[i]).min(1)
return np.transpose(Y)
This works fine, but it is slow for big matrices. Is there a way to make it faster? I've heard that implementing it in C or using the GPU might be good options.
Here is an algorithm that saves a bit of time when the middle dimension is large enough and the entries are uniformly distributed. It exploits the fact that the smallest sum will typically come from two small terms.
import numpy as np
def min_plus_product(A,B):
B = np.transpose(B)
Y = np.zeros((len(B),len(A)))
for i in range(len(B)):
Y[i] = (A + B[i]).min(1)
return np.transpose(Y)
def min_plus_product_opt(A,B, chop=None):
if chop is None:
# not sure this is optimal
chop = int(np.ceil(np.sqrt(A.shape[1])))
B = np.transpose(B)
Amin = A.min(1)
Y = np.zeros((len(B),len(A)))
for i in range(len(B)):
o = np.argsort(B[i])
Y[i] = (A[:, o[:chop]] + B[i, o[:chop]]).min(1)
if chop < len(o):
idx = np.where(Amin + B[i, o[chop]] < Y[i])[0]
for j in range(chop, len(o), chop):
if len(idx) == 0:
break
x, y = np.ix_(idx, o[j : j + chop])
slmin = (A[x, y] + B[i, o[j : j + chop]]).min(1)
slmin = np.minimum(Y[i, idx], slmin)
Y[i, idx] = slmin
nidx = np.where(Amin[idx] + B[i, o[j + chop]] < Y[i, idx])[0]
idx = idx[nidx]
return np.transpose(Y)
A = np.random.random(size=(1000,1000))
B = np.random.random(size=(1000,2000))
print(np.allclose(min_plus_product(A,B), min_plus_product_opt(A,B)))
import time
t = time.time();min_plus_product(A,B);print('naive {}sec'.format(time.time()-t))
t = time.time();min_plus_product_opt(A,B);print('opt {}sec'.format(time.time()-t))
Sample output:
True
naive 7.794037580490112sec
opt 1.65810227394104sec
A possible simple route is to use numba.
from numba import autojit
import numpy as np
@autojit(nopython=True)
def min_plus_product(A,B):
n = A.shape[0]
C = np.zeros((n,n))
for i in range(n):
for j in range(n):
minimum = A[i,0]+B[0,j]
for k in range(1,n):
minimum = min(A[i,k]+B[k,j],minimum)
C[i,j] = minimum
return C
Timings on 1000x1000 A,B matrices are:
1 loops, best of 3: 4.28 s per loop for the original code
1 loops, best of 3: 2.32 s per loop for the numba code
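Note that autojit has since been removed from numba; a minimal sketch of the same kernel using the current njit decorator (assuming a reasonably recent numba install) would be:
from numba import njit
import numpy as np
@njit
def min_plus_product_nb(A, B):
    # same triple loop as above, compiled in nopython mode
    n = A.shape[0]
    C = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            minimum = A[i, 0] + B[0, j]
            for k in range(1, n):
                minimum = min(A[i, k] + B[k, j], minimum)
            C[i, j] = minimum
    return C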
Here is a succinct and fully numpy solution, without any python-based loops:
(np.expand_dims(a, 0) + np.expand_dims(b.T, 1)).min(axis=2).T
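For example, this reproduces the loop-based version above, at the cost of materializing the full (p, n, m) intermediate array, so memory use grows quickly with the shared dimension (a quick sanity check; the wrapper name is mine):
import numpy as np
def min_plus_product_bc(a, b):
    # broadcast a (n, m) against b.T (p, m) and reduce over the shared axis
    return (np.expand_dims(a, 0) + np.expand_dims(b.T, 1)).min(axis=2).T
A = np.random.random(size=(200, 300))
B = np.random.random(size=(300, 100))
print(np.allclose(min_plus_product(A, B), min_plus_product_bc(A, B)))  # True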
import numpy as np
from scipy.linalg import solve_triangular as triSolve
#O(n) per iteration, so overall O(nN), good for large SPD/SDD matrices
def GS_iter(A, b, N):
m = len(A)
L = np.tril(A)
P = L-A
print(P)
x = np.zeros(m)
print(x)
for k in range(N):
        x = triSolve(L, b + P@x, True)
return x
#examples
A = np.array([[10,2,3,1],[1,10,0,1],[0.2,1,10,2],[0.1,3,3,10]])
b = np.array([1,2,1,0])
x = GS_iter(A,b,50000)
ans = A@x - b
print(ans)
print(np.linalg.norm(ans))
Above is my Gauss-Seidel method in Python. For some reason it does not converge to the solution even after 50000 iterations, even when the matrix A is strictly diagonally dominant. Below is the same implementation in MATLAB, which works:
function x = gSeidel(A,B,N)
[n,~] = size(A);
L = tril(A);
P = L-A; %P = -U
x = zeros(n,1); %x_0
for k = 1:N
x = L\(B+P*x);
end
end
What mistake did I make? I think it is in the triSolve call, since if I replace it with a regular LU solver such as np.linalg.solve it works. Why doesn't the triangular solve behave as intended here?
The lower argument is the fourth parameter, so passing True as the third positional argument sets trans instead of lower.
Replace your line with x = triSolve(L, b + P@x, lower=True)
Signature:
triSolve(
a,
b,
trans=0,
lower=False,
unit_diagonal=False,
overwrite_b=False,
debug=None,
check_finite=True,
)
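For reference, a minimal sketch of the corrected iteration with the keyword fix applied (same example system as in the question):
import numpy as np
from scipy.linalg import solve_triangular as triSolve
def GS_iter(A, b, N):
    # Gauss-Seidel: x_{k+1} = L^{-1} (b + (L - A) x_k), with L the lower triangle of A
    L = np.tril(A)
    P = L - A
    x = np.zeros(len(A))
    for _ in range(N):
        x = triSolve(L, b + P @ x, lower=True)
    return x
A = np.array([[10, 2, 3, 1], [1, 10, 0, 1], [0.2, 1, 10, 2], [0.1, 3, 3, 10]])
b = np.array([1, 2, 1, 0])
x = GS_iter(A, b, 50)
print(np.linalg.norm(A @ x - b))  # essentially zero for this diagonally dominant system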
Can this be done without a loop?
import numpy as np
n = 10
x = np.random.random(n+1)
a, b = 0.45, 0.55
for i in range(n):
x = a*x[:-1] + b*x[1:]
I came across this setup in another question. There it was covered in somewhat obscure nomenclature. I guess it is related to the binomial options pricing model, but I don't quite understand the topic, to be honest. I was just intrigued by the formula and this iterative update / shrinking of x, and wondered whether it can be done without a loop. But I cannot wrap my head around it and I am not sure if this is even possible.
What makes me think that it might work is that this variation
n = 10
a, b = 0.301201, 0.59692
x0 = 123
x = x0
for i in range(n):
x = a*x + b*x
# ~42
is actually just x0*(a + b)**n
print(np.allclose(x, x0*(a + b)**n))
# True
You are calculating:
sum( a ** (n - i) * b ** i * x[i] * choose(n, i) for 0 <= i <= n)
[That's meant to be pseudocode, not Python.] I'm not sure of the best way to convert that into Numpy.
choose(n, i) is n!/ (i! (n-i)!), not the numpy choose function.
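One straightforward translation into NumPy/SciPy (a sketch using scipy.special.comb for the binomial coefficient; not necessarily the fastest way) would be:
import numpy as np
from scipy.special import comb
n = 10
x = np.random.random(n + 1)
a, b = 0.45, 0.55
i = np.arange(n + 1)
weights = comb(n, i) * a**(n - i) * b**i   # choose(n, i) * a^(n-i) * b^i
res = np.sum(weights * x)
# compare with the original loop
y = x.copy()
for _ in range(n):
    y = a * y[:-1] + b * y[1:]
print(np.allclose(res, y[0]))  # True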
Using @mathfux's comment, one can do
import numpy as np
from scipy.stats import binom
binomial = binom(p=p, n=n)   # here p plays the role of b from the question, n the number of iterations
pmf = binomial.pmf(np.arange(n + 1))
res = np.sum(x * pmf)
So
res = x.copy()
for i in range(n):
    res = p*res[1:] + (1-p)*res[:-1]
is just the expected value of x evaluated at a binomially distributed index (n trials, success probability p).
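A quick check against the question's loop, taking p = b = 0.55 so that a = 1 - p (a sketch):
import numpy as np
from scipy.stats import binom
n = 10
p = 0.55
x = np.random.random(n + 1)
pmf = binom(n=n, p=p).pmf(np.arange(n + 1))
res = np.sum(x * pmf)
y = x.copy()
for _ in range(n):
    y = (1 - p) * y[:-1] + p * y[1:]
print(np.allclose(res, y[0]))  # True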
I have a recommendation dataset that I have transformed into a matrix of the form:
item1 item2 item3 ...
user1 NaN 2.3 NaN
user2 1.7 3.4 NaN
user3 NaN 1.1 2.6
...
where NaN are items that the particular user has not reviewed yet. The above is in the form of a pandas dataframe. I want to construct an adjacency matrix from this, based on a predefined distance metric. I have a working function:
def compute_adjacency_matrix(reccomender_matrix):
# replace nan with 0
rec_num = reccomender_matrix.fillna(value=0)
# compute the distances between every two users
result = np.array([[compute_distance(li[2:], lj[2:]) for lj in rec_num.itertuples()] for li in rec_num.itertuples()])
adjacency_matrix = (result > 0.0).astype(int)
return adjacency_matrix
The problem is that, for large matrices, the line that computes result takes very long. What is the most efficient way of doing this that would scale to larger datasets?
EDIT: Here is the compute distance function:
def compute_distance(vec1, vec2):
    rez = sum(abs(vec1[(vec1 > 0) & (vec2 > 0)] - vec2[(vec1 > 0) & (vec2 > 0)]))
    norm = np.count_nonzero(vec1) if np.count_nonzero(vec1) < np.count_nonzero(vec2) else np.count_nonzero(vec2)
    norm_rez = rez / norm
    return norm_rez
So it looks like you want a mean absolute distance metric, although that's not exactly what you wrote (since you're normalizing not by the size of the intersection but the size of the smaller vector). If you want mean absolute distance, it's simply:
def compute_distance(vec1, vec2):
return np.nanmean(np.abs(vec1 - vec2))
You can then use that metric with scipy.spatial.distance.pdist and squareform
from scipy.spatial.distance import pdist, squareform
def compute_adjacency_matrix(reccomender_matrix):
result = squareform(pdist(reccomender_matrix.values.T, metric = compute_distance))
result = np.nan_to_num(result)
adjacency_matrix = (result > 0.0).astype(int)
return adjacency_matrix
As noted in my comment, I think you need to rethink your metrics and outputs. That code will make anyone who's recommended the same item adjacent, no matter what score they gave - unless they gave the same scores, in which case they won't be adjacent. Not sure that's what you want.
A slightly better method would be carrying through the nans and using them to make your adjacency matrix.
def compute_adjacency_matrix(reccomender_matrix):
result = squareform(pdist(reccomender_matrix.values.T, metric = compute_distance))
adjacency_matrix = np.logical_not(np.isnan(result)).astype(int)
return adjacency_matrix
If you don't need the distances, you can do it all with binary operations:
def adjacency(x, y):
return np.any(np.logical_and(x, y))
def compute_adjacency_matrix(reccomender_matrix):
return squareform(pdist(np.isfinite(reccomender_matrix.values.T),
metric = adjacency)).astype(int)
Finally, you can do it all with numba if that's all too slow:
import numba as nb
@nb.njit
def compute_adjacency_matrix(reccomender_matrix):
n, m = reccomender_matrix.shape
out = np.zeros((m, m))
count = np.zeros((m, m))
dists = np.zeros((m, m))
adj = np.zeros((m, m))
    for i in range(m):
for j in range(i + 1, m):
for k in range(n):
if not(np.isnan(reccomender_matrix[k, i]) or \
np.isnan(reccomender_matrix[k, j])):
out[i, j] += np.abs(reccomender_matrix[k, i] - reccomender_matrix[k, j])
count[i, j] += 1
for i in range(m):
for j in range(m):
if i == j:
dists[i, j] = 0.
elif i < j:
if count[i, j] != 0:
dists[i, j] = out[i, j] / count [i, j]
adj[i, j] = 1
else:
dists[i, j] = 0.
else:
dists[i, j] = dists[j, i]
adj[i, j] = adj[j, i]
return dists, adj
I am trying to optimize a snippet that gets called a lot (millions of times) so any type of speed improvement (hopefully removing the for-loop) would be great.
I am computing a correlation function of some j'th particle with all others
C_j(|r-r'|) = sqrt(E((s_j(r')-s_k(r))^2)) averaged over k.
My idea is to have a variable corrfun which bins data into bins (the r, defined elsewhere). I find which bin of r each s_k belongs to, and this is stored in ind, so ind[0] is the index of r (and thus of corrfun) to which the first point corresponds. Multiple points can fall into the same bin (in fact I want the bins to be big enough to contain multiple points), so I sum together all of the (s_j(r')-s_k(r))^2 and then divide by the number of points in that bin (stored in the variable rw). The code I ended up with is the following (np is numpy):
for k, v in enumerate(ind):
if j==k:
continue
corrfun[v] += (s[k]-s[j])**2
rw[v] += 1
rw2 = rw
rw2[rw < 1] = 1
corrfun = np.sqrt(np.divide(corrfun, rw2))
Note, the rw2 business was because I want to avoid divide by 0 problems but I do return the rw array and I want to be able to differentiate between the rw=0 and rw=1 elements. Perhaps there is a more elegant solution for this as well.
Is there a way to make the for-loop faster? While I would prefer not to include the self-interaction (j==k), I am OK with keeping it if that means a significantly faster calculation (the length of ind is ~1E6, so the self-interaction is probably insignificant anyway).
Thank you!
Ilya
Edit:
Here is the full code. Note, in the full code I am averaging over j as well.
import numpy as np
def twopointcorr(x,y,s,dr):
width = np.max(x)-np.min(x)
height = np.max(y)-np.min(y)
n = len(x)
maxR = np.sqrt((width/2)**2 + (height/2)**2)
r = np.arange(0, maxR, dr)
print(r)
corrfun = r*0
rw = r*0
print(maxR)
''' go through all points'''
for j in range(0, n-1):
hypot = np.sqrt((x[j]-x)**2+(y[j]-y)**2)
ind = [np.abs(r-h).argmin() for h in hypot]
for k, v in enumerate(ind):
if j==k:
continue
corrfun[v] += (s[k]-s[j])**2
rw[v] += 1
rw2 = rw
rw2[rw < 1] = 1
corrfun = np.sqrt(np.divide(corrfun, rw2))
return r, corrfun, rw
I test it the following way:
from twopointcorr import twopointcorr
import numpy as np
import matplotlib.pyplot as plt
import time
n=1000
x = np.random.rand(n)
y = np.random.rand(n)
s = np.random.rand(n)
print('running two point corr function')
start_time = time.time()
r,corrfun,rw = twopointcorr(x,y,s,0.1)
print("--- Execution time is %s seconds ---" % (time.time() - start_time))
fig1=plt.figure()
plt.plot(r, corrfun,'-x')
fig2=plt.figure()
plt.plot(r, rw,'-x')
plt.show()
Again, the main issue is that in the real dataset n~1E6. I can resample to make it smaller, of course, but I would love to actually crank through the dataset.
Here is code that uses broadcasting, hypot, round, and bincount to remove all the loops:
def twopointcorr2(x, y, s, dr):
width = np.max(x)-np.min(x)
height = np.max(y)-np.min(y)
n = len(x)
maxR = np.sqrt((width/2)**2 + (height/2)**2)
r = np.arange(0, maxR, dr)
osub = lambda x:np.subtract.outer(x, x)
ind = np.clip(np.round(np.hypot(osub(x), osub(y)) / dr), 0, len(r)-1).astype(int)
rw = np.bincount(ind.ravel())
rw[0] -= len(x)
corrfun = np.bincount(ind.ravel(), (osub(s)**2).ravel())
return r, corrfun, rw
to compare, I modified your code as follows:
def twopointcorr(x,y,s,dr):
width = np.max(x)-np.min(x)
height = np.max(y)-np.min(y)
n = len(x)
maxR = np.sqrt((width/2)**2 + (height/2)**2)
r = np.arange(0, maxR, dr)
corrfun = r*0
rw = r*0
for j in range(0, n):
hypot = np.sqrt((x[j]-x)**2+(y[j]-y)**2)
ind = [np.abs(r-h).argmin() for h in hypot]
for k, v in enumerate(ind):
if j==k:
continue
corrfun[v] += (s[k]-s[j])**2
rw[v] += 1
return r, corrfun, rw
and here is the code to check the results:
import numpy as np
n=1000
x = np.random.rand(n)
y = np.random.rand(n)
s = np.random.rand(n)
r1, corrfun1, rw1 = twopointcorr(x,y,s,0.1)
r2, corrfun2, rw2 = twopointcorr2(x,y,s,0.1)
assert np.allclose(r1, r2)
assert np.allclose(corrfun1, corrfun2)
assert np.allclose(rw1, rw2)
and the %timeit results:
%timeit twopointcorr(x,y,s,0.1)
%timeit twopointcorr2(x,y,s,0.1)
outputs:
1 loop, best of 3: 5.16 s per loop
10 loops, best of 3: 134 ms per loop
Your original code on my system runs in about 5.7 seconds. I fully vectorized the inner loop and got it to run in 0.39 seconds. Simply replace your "go through all points" loop with this:
points = np.column_stack((x,y))
hypots = scipy.spatial.distance.cdist(points, points)
inds = np.rint(hypots.clip(max=maxR) / dr).astype(int)
# go through all points
for j in range(n): # n.b. previously n-1, not sure why
ind = inds[j]
np.add.at(corrfun, ind, (s - s[j])**2)
np.add.at(rw, ind, 1)
rw[ind[j]] -= 1 # subtract self
The first observation was that your hypot code was computing 2D distances, so I replaced that with cdist from SciPy to do it all in a single call. The second was that the inner for loop was slow, and thanks to an insightful comment from @hpaulj I vectorized that as well using np.add.at().
Since you asked how to vectorize the inner loop as well, I did that later. It now takes 0.25 seconds to run, for a total speedup of over 20x. Here's the final code:
points = np.column_stack((x,y))
hypots = scipy.spatial.distance.cdist(points, points)
inds = np.rint(hypots.clip(max=maxR) / dr).astype(int)
sn = np.tile(s, (n,1)) # n copies of s
diffs = (sn - sn.T)**2 # squares of pairwise differences
np.add.at(corrfun, inds, diffs)
rw = np.bincount(inds.flatten(), minlength=len(r))
np.subtract.at(rw, inds.diagonal(), 1) # subtract self
This uses more memory but does produce a substantial speedup vs. the single-loop version above.
OK, so as it turns out, outer products are incredibly memory-expensive. However, using the answers from @HYRY and @JohnZwinck I was able to make code that is still roughly linear in n in memory and computes fast (0.5 seconds for the test case):
import numpy as np
def twopointcorr(x,y,s,dr,maxR=-1):
width = np.max(x)-np.min(x)
height = np.max(y)-np.min(y)
n = len(x)
if maxR < dr:
maxR = np.sqrt((width/2)**2 + (height/2)**2)
r = np.arange(0, maxR+dr, dr)
corrfun = r*0
rw = r*0
for j in range(0, n):
ind = np.clip(np.round(np.hypot(x[j]-x,y[j]-y) / dr), 0, len(r)-1).astype(int)
np.add.at(corrfun, ind, (s - s[j])**2)
np.add.at(rw, ind, 1)
rw[0] -= n
corrfun = np.sqrt(np.divide(corrfun, np.maximum(rw,1)))
r=np.delete(r,-1)
rw=np.delete(rw,-1)
corrfun=np.delete(corrfun,-1)
return r, corrfun, rw
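As an aside on the rw2 question above: instead of clamping the counts with np.maximum, np.divide can be told to skip the empty bins directly (a sketch of the equivalent line):
# divide only where rw > 0; empty bins stay at 0 instead of being clamped
corrfun = np.sqrt(np.divide(corrfun, rw, out=np.zeros_like(corrfun), where=rw > 0))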
I have a system of equations in the form of A*x = B where [A] is a tridiagonal coefficient matrix. Using the Numpy solver numpy.linalg.solve I can solve the system of equations for x.
See the example below of how I build the tridiagonal [A] matrix and the {B} vector, and solve for x:
# Solve system of equations with a tridiagonal coefficient matrix
# uses numpy.linalg.solve
# use Python 3 print function
from __future__ import print_function
from __future__ import division
# modules
import numpy as np
import time
ti = time.clock()
#---- Build [A] array and {B} column vector
m = 1000 # size of array, make this 8000 to see time benefits
A = np.zeros((m, m)) # pre-allocate [A] array
B = np.zeros((m, 1)) # pre-allocate {B} column vector
A[0, 0] = 1
A[0, 1] = 2
B[0, 0] = 1
for i in range(1, m-1):
A[i, i-1] = 7 # node-1
A[i, i] = 8 # node
A[i, i+1] = 9 # node+1
B[i, 0] = 2
A[m-1, m-2] = 3
A[m-1, m-1] = 4
B[m-1, 0] = 3
print('A \n', A)
print('B \n', B)
#---- Solve using numpy.linalg.solve
x = np.linalg.solve(A, B) # solve A*x = B for x
print('x \n', x)
#---- Elapsed time for each approach
print('NUMPY time', time.clock()-ti, 'seconds')
So my question relates to two sections of the above example:
Since I am dealing with a tridiagonal matrix for [A], also called a banded matrix, is there a more efficient way to solve the system of equations instead of using numpy.linalg.solve?
Also, is there a better way to create the tridiagonal matrix instead of using a for-loop?
The above example runs on Linux in about 0.08 seconds according to the time.clock() function.
The numpy.linalg.solve function works fine, but I'm trying to find an approach that takes advantage of the tridiagonal form of [A] in hopes of speeding up the solution even further and then apply that approach to a more complicated example.
There are two immediate performance improvements: (1) do not use a loop to build the matrix, and (2) use scipy.linalg.solve_banded().
I would write the code more like this:
import scipy.linalg as la
# Create arrays and set values
ab = np.zeros((3,m))
b = 2*np.ones(m)
ab[0] = 9
ab[1] = 8
ab[2] = 7
# Fix end points
ab[0,1] = 2
ab[1,0] = 1
ab[1,-1] = 4
ab[2,-2] = 3
b[0] = 1
b[-1] = 3
x = la.solve_banded((1, 1), ab, b)
There may be more elegant ways to construct the matrix, but this works.
Using %timeit in IPython, the original code took 112 ms for m=1000. This code takes 2.94 ms for m=10,000, an order of magnitude larger problem yet still almost two orders of magnitude faster! I did not have the patience to wait on the original code for m=10,000. Most of the time in the original may be in constructing the array; I did not test this. Regardless, for large arrays it is much more efficient to store only the non-zero values of the matrix.
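As a quick sanity check of the banded layout against the dense solve from the question (small m, rebuilt here only for the comparison; a sketch):
import numpy as np
import scipy.linalg as la
m = 8
A = np.zeros((m, m))
B = np.zeros(m)
A[0, 0], A[0, 1], B[0] = 1, 2, 1
for i in range(1, m - 1):
    A[i, i - 1], A[i, i], A[i, i + 1], B[i] = 7, 8, 9, 2
A[m - 1, m - 2], A[m - 1, m - 1], B[m - 1] = 3, 4, 3
ab = np.zeros((3, m))
b = 2 * np.ones(m)
ab[0], ab[1], ab[2] = 9, 8, 7
ab[0, 1], ab[1, 0], ab[1, -1], ab[2, -2] = 2, 1, 4, 3
b[0], b[-1] = 1, 3
print(np.allclose(la.solve_banded((1, 1), ab, b), np.linalg.solve(A, B)))  # True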
There is a scipy.sparse matrix type called scipy.sparse.dia_matrix which captures the structure of your matrix well (it will store 3 arrays, in "positions" 0 (diagonal), 1 (above) and -1 (below)). Using this type of matrix you can try scipy.sparse.linalg.lsqr for solving. If your problem has an exact solution, it will be found, otherwise it will find the solution in least squares sense.
from scipy import sparse
import scipy.sparse.linalg   # makes sparse.linalg available
A_sparse = sparse.dia_matrix(A)
ret_values = sparse.linalg.lsqr(A_sparse, B.ravel())   # B is the right-hand side from the question
x = ret_values[0]
However, this may not be completely optimal in terms of exploiting the tridiagonal structure; there may be a theoretical way of making this faster. What this conversion does do for you is cut down the matrix multiplication expenses to the essential: only the 3 bands are used. This, in combination with the iterative solver lsqr, should already yield a speedup.
Note: I am not proposing scipy.sparse.linalg.spsolve, because it converts your matrix to csr format. However, replacing lsqr with spsolve is worth a try, especially because spsolve can use UMFPACK; see the relevant doc on spsolve. Also, it may be of interest to take a look at this stackoverflow question and answer relating to UMFPACK.
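A minimal sketch of that spsolve variant (assuming A and B from the question are in scope; converting to CSC first, since spsolve prefers CSC/CSR input):
from scipy import sparse
from scipy.sparse.linalg import spsolve
A_csc = sparse.csc_matrix(A)      # or sparse.dia_matrix(A).tocsc()
x = spsolve(A_csc, B.ravel())     # B is the (m, 1) column vector from the question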
You could use scipy.linalg.solveh_banded.
EDIT: You CANNOT use the above, as your matrix is not symmetric (I had thought it was). However, as mentioned in the comments, the Thomas algorithm is great for this:
a = [7] * ( m - 2 ) + [3]
b = [1] + [8] * ( m - 2 ) + [4]
c = [2] + [9] * ( m - 2 )
d = [1] + [2] * ( m - 2 ) + [3]
# This is taken directly from the Wikipedia page also cited above
# this overwrites b and d
def TDMASolve(a, b, c, d):
    n = len(d)  # n is the number of rows; a and c have length n-1
    for i in range(n-1):
        d[i+1] -= 1. * d[i] * a[i] / b[i]
        b[i+1] -= 1. * c[i] * a[i] / b[i]
    for i in reversed(range(n-1)):
        d[i] -= d[i+1] * c[i] / b[i+1]
    return [d[i] / b[i] for i in range(n)]
This code is not optimized, nor does it use NumPy, but if I (or any of the other fine folks here) have time, I will edit it so that it does those things. It currently times at ~10 ms for m=10000.
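A quick check against numpy.linalg.solve (small m, rebuilding the dense matrix only for the comparison; list copies are passed because the solver overwrites b and d):
import numpy as np
m = 6
a = [7.0] * (m - 2) + [3.0]            # subdiagonal
b = [1.0] + [8.0] * (m - 2) + [4.0]    # main diagonal
c = [2.0] + [9.0] * (m - 2)            # superdiagonal
d = [1.0] + [2.0] * (m - 2) + [3.0]    # right-hand side
A_dense = np.diag(b) + np.diag(a, -1) + np.diag(c, 1)
x_ref = np.linalg.solve(A_dense, np.array(d))
x_tdma = TDMASolve(list(a), list(b), list(c), list(d))
print(np.allclose(x_tdma, x_ref))  # True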
This will probably help.
There is a function create_tridiagonal which will create a tridiagonal matrix, and another function, diagonal_form, which converts a square matrix into the diagonal ordered form required by SciPy's solve_banded function.
import numpy as np
def lu_decomp3(a):
"""
c,d,e = lu_decomp3(a).
LU decomposition of tridiagonal matrix a = [c\d\e]. On output
{c},{d} and {e} are the diagonals of the decomposed matrix a.
"""
n = np.diagonal(a).size
assert(np.all(a.shape ==(n,n))) # check if square matrix
d = np.copy(np.diagonal(a)) # without copy (assignment destination is read-only) error is raised
e = np.copy(np.diagonal(a, 1))
c = np.copy(np.diagonal(a, -1))
for k in range(1,n):
lam = c[k-1]/d[k-1]
d[k] = d[k] - lam*e[k-1]
c[k-1] = lam
return c,d,e
def lu_solve3(c,d,e,b):
"""
x = lu_solve(c,d,e,b).
Solves [c\d\e]{x} = {b}, where {c}, {d} and {e} are the
vectors returned from lu_decomp3.
"""
n = len(d)
y = np.zeros_like(b)
y[0] = b[0]
for k in range(1,n):
y[k] = b[k] - c[k-1]*y[k-1]
x = np.zeros_like(b)
x[n-1] = y[n-1]/d[n-1] # there is no x[n] out of range
for k in range(n-2,-1,-1):
x[k] = (y[k] - e[k]*x[k+1])/d[k]
return x
from scipy.sparse import diags
def create_tridiagonal(size = 4):
diag = np.random.randn(size)*100
diag_pos1 = np.random.randn(size-1)*10
diag_neg1 = np.random.randn(size-1)*10
a = diags([diag_neg1, diag, diag_pos1], offsets=[-1, 0, 1],shape=(size,size)).todense()
return a
a = create_tridiagonal(4)
b = np.random.randn(4)*10
print('matrix a is\n = {} \n\n and vector b is \n {}'.format(a, b))
c, d, e = lu_decomp3(a)
x = lu_solve3(c, d, e, b)
print("x from our function is {}".format(x))
print("check is answer correct ({})".format(np.allclose(np.dot(a, x), b)))
## Test Scipy
from scipy.linalg import solve_banded
def diagonal_form(a, upper = 1, lower= 1):
"""
a is a numpy square matrix
this function converts a square matrix to diagonal ordered form
returned matrix in ab shape which can be used directly for scipy.linalg.solve_banded
"""
n = a.shape[1]
assert(np.all(a.shape ==(n,n)))
ab = np.zeros((2*n-1, n))
for i in range(n):
ab[i,(n-1)-i:] = np.diagonal(a,(n-1)-i)
for i in range(n-1):
ab[(2*n-2)-i,:i+1] = np.diagonal(a,i-(n-1))
mid_row_inx = int(ab.shape[0]/2)
upper_rows = [mid_row_inx - i for i in range(1, upper+1)]
upper_rows.reverse()
upper_rows.append(mid_row_inx)
lower_rows = [mid_row_inx + i for i in range(1, lower+1)]
keep_rows = upper_rows+lower_rows
ab = ab[keep_rows,:]
return ab
ab = diagonal_form(a, upper=1, lower=1) # for tridiagonal matrix upper and lower = 1
x_sp = solve_banded((1,1), ab, b)
print("is our answer the same as scipy answer ({})".format(np.allclose(x, x_sp)))