String Matching Using TF-IDF, NGrams and Cosine Similarity in Python - python

I am working on my first major data science project. I am attempting to match names from a large list in one source against a cleansed dictionary in another, using this string matching blog as a guide.
I am attempting to use two different data sets. Unfortunately, I can't seem to get good results, and I think I am not applying this appropriately.
Code:
import pandas as pd, numpy as np, re, sparse_dot_topn.sparse_dot_topn as ct
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
df_dirty = {"name":["gogle","bing","amazn","facebook","fcbook","abbasasdfzz","zsdfzl","gogle","bing","amazn","facebook","fcbook","abbasasdfzz","zsdfzl"]}
df_clean = {"name":["google","bing","amazon","facebook"]}
print (df_dirty["name"])
print (df_clean["name"])
def ngrams(string, n=3):
    string = (re.sub(r'[,-./]|\sBD', r'', string)).upper()
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
    idx_dtype = np.int32
    nnz_max = M * ntop
    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)
    return csr_matrix((data, indices, indptr), shape=(M, N))
def get_matches_df(sparse_matrix, name_vector, top=5):
    non_zeros = sparse_matrix.nonzero()
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    if top:
        print(top)
        nr_matches = top
    else:
        print(sparsecols.size)
        nr_matches = sparsecols.size
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})
company_names = df_clean['name']
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 4, 0.8)
t = time.time()-t1
print("SELFTIMED:", t)
matches_df = get_matches_df(matches, company_names, top=4)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(matches_df)
The expected result is as follows:
gogle = google
amazn = amazon
fcbook = facebook

You can import the awesome_cossim_top function directly from the sparse_dot_topn library.
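For reference, a hedged sketch of that import; the exported name awesome_cossim_topn is an assumption about commonly used releases of the library, so check your installed version:
from sparse_dot_topn import awesome_cossim_topn  # exported name and signature may vary by version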
Replace the get_matches_df function with this:
def get_matches_df(sparse_matrix, A, B, top=100):
    non_zeros = sparse_matrix.nonzero()
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    for index in range(0, nr_matches):
        left_side[index] = A[sparserows[index]]
        right_side[index] = B[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})
Now you can execute your code as below:
df_dirty = {"name":["gogle","bing","amazn","facebook","fcbook","abbasasdfzz","zsdfzl"]}
df_clean = {"name":["google","bing","amazon","facebook"]}
print (df_dirty["name"])
print (df_clean["name"])
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix_clean = vectorizer.fit_transform(df_clean['name'])
tf_idf_matrix_dirty = vectorizer.transform(df_dirty['name'])
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0)
t = time.time()-t1
print("SELFTIMED:", t)
matches_df = get_matches_df(matches, df_dirty['name'], df_clean['name'], top=0)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(matches_df)
Basically, the example you found identifies duplicates within a single array, whereas you want to match across two sources.
Hope it helps!
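To make the cross-source idea concrete, here is a minimal dense sketch (not part of the original answer) that uses scikit-learn's built-in character n-gram analyzer and cosine_similarity instead of the custom ngrams()/sparse_dot_topn route above; it only suits toy-sized lists, while the sparse approach scales much better:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

clean = ["google", "bing", "amazon", "facebook"]
dirty = ["gogle", "amazn", "fcbook"]

# character trigrams via sklearn's built-in analyzer (the question's custom ngrams() would work too)
vec = TfidfVectorizer(min_df=1, analyzer='char_wb', ngram_range=(3, 3))
clean_tfidf = vec.fit_transform(clean)   # learn the vocabulary on the clean names only
dirty_tfidf = vec.transform(dirty)       # project the dirty names into the same space

sim = cosine_similarity(dirty_tfidf, clean_tfidf)   # rows: dirty names, columns: clean names
for name, row in zip(dirty, sim):
    print(name, "->", clean[row.argmax()], round(row.max(), 3))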

Related

Importing an array using Python

I have an array Pe with dimensions (2, 3, 3). I import Pe, and then I want the code to run the calculation for each Pe.Pe[i] and give me the visited indices. Right now I have to manually write Pe.Pe[0], Pe.Pe[1] in the line Visited_Indices, timestamps = iterate_array(0, 0, Pe.Pe[0], lambda x: x < 150). How do I loop over Pe.Pe[i] where i goes from 0 to 1?
Pe array is
import numpy as np
Pe = np.array([[[128.22918457, 168.52413295, 209.72343319],
                [129.01598287, 179.03716051, 150.68633749],
                [131.00688309, 187.42601593, 193.68172751]],
               [[ 87.70103267, 115.2603484 , 143.4381863 ],
                [ 88.23915528, 122.45062554, 103.06037156],
                [ 89.60081102, 128.18809696, 132.46662659]]])
print([Pe])
The code is
import numpy as np
import time
import Pe
def get_neighbor_indices(position, dimensions):
    '''
    dimensions is a shape of np.array
    '''
    i, j = position
    indices = [(i+1,j), (i-1,j), (i,j+1), (i,j-1)]
    return [
        (i,j) for i,j in indices
        if i >= 0 and i < dimensions[0]
        and j >= 0 and j < dimensions[1]
    ]

def iterate_array(init_i, init_j, arr, condition_func):
    '''
    arr is an instance of np.array
    condition_func is a function (value) => boolean
    '''
    indices_to_check = [(init_i, init_j)]
    checked_indices = set()
    result = []
    t0 = None
    t1 = None
    timestamps = []
    while indices_to_check:
        pos = indices_to_check.pop()
        if pos in checked_indices:
            continue
        item = arr[pos]
        checked_indices.add(pos)
        if condition_func(item):
            result.append(item)
            t1 = time.time()
            if t0 is None:
                t0 = t1
            timestamps.append(t1 - t0)
            indices_to_check.extend(
                get_neighbor_indices(pos, arr.shape)
            )
    return result, timestamps
Visited_Indices,timestamps=iterate_array(0,0, Pe.Pe[0], lambda x : x < 150)
out = list(zip(*np.where(np.isin(Pe, Visited_Indices))))
print("Visited =",[Visited_Indices])

I'm having trouble with content-based recommendation system prediction (NOT TF-IDF)

I keep getting the following error --> Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.
Can someone help me? I've been working on this code for a few days. My entire body of code is below.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IESEGRecSys.Functions import *
from sklearn.model_selection import train_test_split
from surprise import KNNBasic
from surprise import Dataset, Reader
user_artists = pd.read_table("user_artists.dat")
user_artists['ratings'] = 0
user_artists.loc[user_artists['weight'] <= user_artists['weight'].quantile(1), 'ratings'] = 5
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.8), 'ratings'] = 4
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.6), 'ratings'] = 3
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.4), 'ratings'] = 2
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.2), 'ratings'] = 1
data = user_artists[['userID','artistID','ratings']]
data.head()
data.shape
# train-test split
train, test = train_test_split(data, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data.shape)
print(train.shape)
print(test.shape)
tags = pd.read_table("tags.dat", encoding = 'unicode_escape')
user_taggedartists = pd.read_table("user_taggedartists.dat")
user_tag_merged = pd.merge(user_taggedartists, tags, on="tagID", how="inner")
user_tag_merged_updated = pd.merge(user_tag_merged, data, on=(["userID","artistID"]),how="inner")
movie=user_tag_merged_updated
movie
data2 = data[['userID','artistID','ratings']]
# train-test split
train, test2 = train_test_split(data2, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data2.shape)
print(train.shape)
print(test.shape)
data_pivot2 = data2.pivot_table(index='artistID', values='ratings', columns='userID').fillna(0)
data_pivot2.head()
movie2 = [['tagID','artistID','year']]
movie2 = user_tag_merged_updated.pivot_table(index='tagID', values='year', columns='userID').fillna(0)
movie2.head()
# Content based as a function
from numpy.linalg import norm
def simil_cosine(a, b):
    return np.dot(a, b)/(norm(a)*norm(b))

def ContentBased(content_data, test_data, NN):
    cdata = content_data.reset_index(drop=True).copy()
    # store user and item dimensions
    dim = cdata.shape[0]
    nr_user = cdata.shape[0]
    if test_data.shape[1] != dim:
        raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. Please make sure the columns of test and content match.'\
                        .format(test_data.shape[1], dim))
    # similarity matrices
    matrix = np.zeros(shape=(dim, dim), dtype=np.float)
    matrixNN = np.zeros(shape=(dim, dim), dtype=np.float)
    # compute similarity
    for i, row in cdata.iterrows():
        for j, col in cdata.iterrows():
            if i <= j: continue
            else: matrix[i][j] = simil_cosine(np.array(row), np.array(col))
    # copy values to other diagonal
    matrix = matrix + matrix.T - np.diag(np.diag(matrix))
    print('Similarity calculation done...')
    # mask all values that are not nearest neighbors
    cutoff = lambda x, cv: x if x >= cv else 0.0
    v_cutoff = np.vectorize(cutoff)
    for i in range(dim):
        crit_val = -np.sort(-matrix[i])[NN-1]
        matrixNN[i] = v_cutoff(matrix[i], crit_val)
    print('Nearest neighbor selection done...')
    # predict user-item ratings in test_data
    prediction = np.zeros(shape=(nr_user, dim), dtype=np.float)
    for i in range(nr_user):
        num = np.matmul(np.array(test_data.iloc[i,:]), matrixNN)
        denom = matrixNN.sum(axis=0)  # column sums
        prediction[i] = num/denom
    print('Prediction done...')
    # return DataFrame
    return pd.DataFrame(prediction, index=test_data.index, columns=test_data.columns)

cb_pred = ContentBased(movie2, data_pivot2, 10)
# Content Based as a Class
from numpy.linalg import norm
class ContentBased:
    def simil_cosine(self, a, b):
        return np.dot(a, b)/(norm(a)*norm(b))

    def __init__(self, NN):
        self.NN = NN

    def fit(self, content_data):
        cdata = content_data.reset_index(drop=True).copy()
        self.item_dim = cdata.shape[0]
        self.matrix = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
        self.matrixNN = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
        # compute similarity
        for i, row in cdata.iterrows():
            for j, col in cdata.iterrows():
                if i <= j: continue
                else: self.matrix[i][j] = self.simil_cosine(np.array(row), np.array(col))
        # copy values to other diagonal
        self.matrix = self.matrix + self.matrix.T - np.diag(np.diag(self.matrix))
        cutoff = lambda x, cv: x if x >= cv else 0.0
        v_cutoff = np.vectorize(cutoff)
        for i in range(self.item_dim):
            crit_val = -np.sort(-self.matrix[i])[self.NN-1]
            self.matrixNN[i] = v_cutoff(self.matrix[i], crit_val)

    def predict(self, test_data):
        if test_data.shape[1] != self.item_dim:
            raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. Please make sure the columns of test and content match.'\
                            .format(test_data.shape[1], self.item_dim))
I keep getting the following error --> Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.
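As a small diagnostic (not a fix from the thread), the shapes passed to ContentBased(movie2, data_pivot2, 10) can be checked directly against the constraint the function enforces: the test pivot must have exactly one column per row of the content data.
print(movie2.shape)        # content data: its rows are treated as the items
print(data_pivot2.shape)   # test data: its columns must correspond to those items
assert data_pivot2.shape[1] == movie2.shape[0], "test columns must match content rows"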

Numba function fails when imported from another script, but works well when compiled manually

Here is a reproducible example. The numba function is_in_set_pnb doesn't work when it is imported from the functions.py script, but it works just fine when it is defined in the same script.
functions.py script
import numpy as np
import numba as nb
import pandas as pd
# corrFilter function
def corrFilter(df, threshold):
    corr_matrix = df.corr().abs()
    flat_matrix = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
                              .stack()
                              .sort_values(ascending=False))
    flat_matrix = pd.DataFrame(flat_matrix).reset_index()
    flat_matrix.columns = ["var1", "var2", "correlation"]
    flat_matrix = flat_matrix.reindex(flat_matrix.correlation.sort_values(ascending=False).index).reset_index().drop(["index"], axis=1)
    filtered_matrix = flat_matrix[flat_matrix.correlation > threshold]
    pairs_to_remove = filtered_matrix[["var1", "var2"]].to_numpy()
    return pairs_to_remove

# Numba function / numpy.isin() improved version
@nb.jit(parallel=True)
def is_in_set_pnb(a, b):
    shape = a.shape
    a = a.ravel()
    n = len(a)
    result = np.full(n, False)
    set_b = set(b)
    for i in nb.prange(n):
        if a[i] in set_b:
            result[i] = True
    return result.reshape(shape)
main.py script
from sklearn import datasets
import pandas as pd
import numpy as np
import itertools
from functions import *
# Loading data
iris = datasets.load_iris()
data = iris.data
data = pd.DataFrame(data)
data.columns = ["var1", "var2", "var3", "var4"]
# Get pairs with a correlation higher than 0.5
remove_pairs = corrFilter(data, 0.5)
remove_pairs_check = np.sum(remove_pairs, axis=1)
# Remove triplets with pairs with a higher correlation than 0.5
triplets = [list(k) for k in itertools.combinations(data.columns, 3)]
array_triplets = np.vstack(triplets).astype(object)
n, d = array_triplets.shape
pair1 = np.sum(array_triplets[:,[0,1]], axis=1).reshape(n, 1)
pair2 = np.sum(array_triplets[:,[1,2]], axis=1).reshape(n, 1)
pair3 = np.sum(array_triplets[:,[0,2]], axis=1).reshape(n, 1)
array_triplets_check = np.concatenate((pair1, pair2, pair3), axis =1)
# Check if each pair is in the remove_pairs_check list
array_triplets_check_v1 = np.in1d(array_triplets_check, remove_pairs_check).reshape(n, d)
# Using the numba function from the "functions.py" script
# This fails!!
array_triplets_check_v2 = is_in_set_pnb(array_triplets_check, remove_pairs_check)
However, if we define the numba function within the main.py script:
import numba as nb
@nb.jit(parallel=True)
def is_in_set_pnb(a, b):
    shape = a.shape
    a = a.ravel()
    n = len(a)
    result = np.full(n, False)
    set_b = set(b)
    for i in nb.prange(n):
        if a[i] in set_b:
            result[i] = True
    return result.reshape(shape)
# This works properly!!!
array_triplets_check_v2 = is_in_set_pnb(array_triplets_check, remove_pairs_check)
What am I missing? I am using:
numba 0.50.1 py38h47e9c7a_0

Unique column of a sparse matrix in Python

What is a good way of identifying the unique columns of a sparse matrix represented in csc_matrix format and the times each column is repeated?
I have no information a priori about the elements of the matrix. It is the result of sampling the columns of another matrix with replacement, so I can have duplicated columns both because a column is sampled many times and because there are duplicated columns in the original matrix. Therefore I cannot apply numpy.unique to the indices of the sampled columns, and I don't think it is a good choice to convert the entire matrix to a dense format and then apply numpy.unique to it.
You can sort and group the columns by their number of nonzeros. Within each group, sort by indices and values, then split into blocks at the points where nothing changes:
import numpy as np
from scipy import sparse
def sparse_unique_columns(M):
    M = M.tocsc()
    m, n = M.shape
    if not M.has_sorted_indices:
        M.sort_indices()
    if not M.has_canonical_format:
        M.sum_duplicates()
    sizes = np.diff(M.indptr)
    idx = np.argsort(sizes)
    Ms = M @ sparse.csc_matrix((np.ones((n,)), idx, np.arange(n+1)), (n, n))
    ssizes = np.diff(Ms.indptr)
    ssizes[1:] -= ssizes[:-1]
    grpidx, = np.where(ssizes)
    grpidx = np.concatenate([grpidx, [n]])
    if ssizes[0] == 0:
        counts = [np.array([0, grpidx[0]])]
    else:
        counts = [np.zeros((1,), int)]
    ssizes = ssizes[grpidx[:-1]].cumsum()
    for i, ss in enumerate(ssizes):
        gil, gir = grpidx[i:i+2]
        pl, pr = Ms.indptr[[gil, gir]]
        dv = Ms.data[pl:pr].view(f'V{ss*Ms.data.dtype.itemsize}')
        iv = Ms.indices[pl:pr].view(f'V{ss*Ms.indices.dtype.itemsize}')
        idxi = np.lexsort((dv, iv))
        dv = dv[idxi]
        iv = iv[idxi]
        chng, = np.where(np.concatenate(
            [[True], (dv[1:] != dv[:-1]) | (iv[1:] != iv[:-1]), [True]]))
        counts.append(np.diff(chng))
        idx[gil:gir] = idx[gil:gir][idxi]
    counts = np.concatenate(counts)
    nu = counts.size - 1
    uniques = M @ sparse.csc_matrix((np.ones((nu,)), idx[counts[:-1].cumsum()],
                                     np.arange(nu + 1)), (n, nu))
    return uniques, idx, counts[1:]
a = np.random.uniform(0, 10, (1000, 200))
a[a>1] = 0
a = sparse.csc_matrix(a)
b = sparse.csc_matrix((np.ones(1000), np.random.randint(0, 200, (1000,)), np.arange(1001)))
c = a @ b
unq, idx, cnt = sparse_unique_columns(c)
unqd, idxd, cntd = np.unique(c.A, axis=1, return_counts=True, return_inverse=True)
from timeit import timeit
print('sparse:', timeit(lambda: sparse_unique_columns(c), number=1000), 'ms')
print('dense: ', timeit(lambda: np.unique(c.A, axis=1, return_counts=True), number=100)*10, 'ms')
Sample output:
sparse: 2.735588440205902 ms
dense: 49.32689592242241 ms

How to get the index of a list's items in another list?

Consider I have these lists:
l = [5,6,7,8,9,10,5,15,20]
m = [10,5]
I want to get the index of m in l. I used a list comprehension to do that:
[(i,i+1) for i,j in enumerate(l) if m[0] == l[i] and m[1] == l[i+1]]
Output : [(5,6)]
But if I have more numbers in m, I feel this is not the right way. So is there any easy approach in Python or with NumPy?
Another example:
l = [5,6,7,8,9,10,5,15,20,50,16,18]
m = [10,5,15,20]
The output should be:
[(5,6,7,8)]
The easiest way (using pure Python) would be to iterate over the items and check only whether the first item matches before comparing the full sublist. This avoids doing sublist comparisons when not needed. Depending on the contents of your l, this could outperform even the NumPy broadcasting solutions:
def func(haystack, needle):  # obviously needs a better name ...
    if not needle:
        return
    # just an optimization
    lengthneedle = len(needle)
    firstneedle = needle[0]
    for idx, item in enumerate(haystack):
        if item == firstneedle:
            if haystack[idx:idx+lengthneedle] == needle:
                yield tuple(range(idx, idx+lengthneedle))
>>> list(func(l, m))
[(5, 6, 7, 8)]
In case you're interested in speed, I checked the performance of the approaches (borrowing from my setup here):
import random
import numpy as np
# strided_app is from https://stackoverflow.com/a/40085052/
def strided_app(a, L, S):  # Window len = L, Stride len/stepsize = S
    nrows = ((a.size-L)//S)+1
    n = a.strides[0]
    return np.lib.stride_tricks.as_strided(a, shape=(nrows,L), strides=(S*n,n))

def pattern_index_broadcasting(all_data, search_data):
    n = len(search_data)
    all_data = np.asarray(all_data)
    all_data_2D = strided_app(np.asarray(all_data), n, S=1)
    return np.flatnonzero((all_data_2D == search_data).all(1))

# view1D is from https://stackoverflow.com/a/45313353/
def view1D(a, b):  # a, b are arrays
    a = np.ascontiguousarray(a)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(), b.view(void_dt).ravel()

def pattern_index_view1D(all_data, search_data):
    a = strided_app(np.asarray(all_data), L=len(search_data), S=1)
    a0v, b0v = view1D(np.asarray(a), np.asarray(search_data))
    return np.flatnonzero(np.in1d(a0v, b0v))

def find_sublist_indices(haystack, needle):
    if not needle:
        return
    # just an optimization
    lengthneedle = len(needle)
    firstneedle = needle[0]
    restneedle = needle[1:]
    for idx, item in enumerate(haystack):
        if item == firstneedle:
            if haystack[idx+1:idx+lengthneedle] == restneedle:
                yield tuple(range(idx, idx+lengthneedle))

def Divakar1(l, m):
    return np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))

def Divakar2(l, m):
    return np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))

def MSeifert(l, m):
    return list(find_sublist_indices(l, m))
# Timing setup
timings = {Divakar1: [], Divakar2: [], MSeifert: []}
sizes = [2**i for i in range(5, 20, 2)]

# Timing
for size in sizes:
    l = [random.randint(0, 50) for _ in range(size)]
    m = [random.randint(0, 50) for _ in range(10)]
    larr = np.asarray(l)
    marr = np.asarray(m)
    for func in timings:
        # first timings:
        # res = %timeit -o func(l, m)
        # second timings:
        if func is MSeifert:
            res = %timeit -o func(l, m)
        else:
            res = %timeit -o func(larr, marr)
        timings[func].append(res)
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure(1)
ax = plt.subplot(111)
for func in timings:
    ax.plot(sizes,
            [time.best for time in timings[func]],
            label=str(func.__name__))
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('size')
ax.set_ylabel('time [seconds]')
ax.grid(which='both')
ax.legend()
plt.tight_layout()
In case your l and m are lists, my function outperforms the NumPy solutions for all sizes.
But if you have these as NumPy arrays, you'll get faster results for large arrays (size > 1000 elements) when using Divakar's NumPy solutions.
You are basically looking for the starting indices of a list in another list.
Approach #1 : One approach would be to create sliding windows of the elements of the list being searched, giving us a 2D array, and then use NumPy broadcasting to compare the search list against each row of that 2D sliding-window array. Thus, one method would be -
# strided_app is from https://stackoverflow.com/a/40085052/
def strided_app(a, L, S ): # Window len = L, Stride len/stepsize = S
nrows = ((a.size-L)//S)+1
n = a.strides[0]
return np.lib.stride_tricks.as_strided(a, shape=(nrows,L), strides=(S*n,n))
def pattern_index_broadcasting(all_data, search_data):
n = len(search_data)
all_data = np.asarray(all_data)
all_data_2D = strided_app(np.asarray(all_data), n, S=1)
return np.flatnonzero((all_data_2D == search_data).all(1))
out = np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Sample runs -
In [340]: l = [5,6,7,8,9,10,5,15,20,50,16,18]
...: m = [10,5,15,20]
...:
In [341]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[341]: array([5, 6, 7, 8])
In [342]: l = [5,6,7,8,9,10,5,15,20,50,16,18,10,5,15,20]
...: m = [10,5,15,20]
...:
In [343]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[343]:
array([[ 5, 6, 7, 8],
[12, 13, 14, 15]])
Approach #2 : Another method would be to get the sliding windows and then take a row-wise scalar view into both the data to be searched and the data to search for, giving us 1D data to work with, like so -
# view1D is from https://stackoverflow.com/a/45313353/
def view1D(a, b): # a, b are arrays
a = np.ascontiguousarray(a)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel(), b.view(void_dt).ravel()
def pattern_index_view1D(all_data, search_data):
a = strided_app(np.asarray(all_data), L=len(search_data), S=1)
a0v, b0v = view1D(np.asarray(a), np.asarray(search_data))
return np.flatnonzero(np.in1d(a0v, b0v))
out = np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))
2020 Versions
In search of easier and more compact approaches, we could look into scikit-image's view_as_windows for getting sliding windows with a built-in. I am assuming arrays as inputs for less messy code. For lists as input, we have to use np.asarray() as shown earlier.
Approach #3 : Basically a derivative of pattern_index_broadcasting with view_as_windows for a one-liner, with a as the larger data array and b as the array to be searched for -
from skimage.util import view_as_windows
np.flatnonzero((view_as_windows(a,len(b))==b).all(1))[:,None]+np.arange(len(b))
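A quick usage sketch of that one-liner with the arrays from the earlier sample run (not part of the original answer):
import numpy as np
from skimage.util import view_as_windows

a = np.array([5, 6, 7, 8, 9, 10, 5, 15, 20, 50, 16, 18])
b = np.array([10, 5, 15, 20])
out = np.flatnonzero((view_as_windows(a, len(b)) == b).all(1))[:, None] + np.arange(len(b))
print(out)   # [[5 6 7 8]]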
Approach #4 : For a small number of matches of b in a, we could optimize by first looking for matches of b's first element, to reduce the dataset size for the subsequent searches -
mask = a[:-len(b)+1]==b[0]
mask[mask] = (view_as_windows(a,len(b))[mask]).all(1)
out = np.flatnonzero(mask)[:,None]+np.arange(len(b))
Approach #5 : For a small-sized b, we could simply run a loop over each of the elements in b and perform a bitwise AND-reduction -
mask = np.bitwise_and.reduce([a[i:len(a)-len(b)+1+i]==b[i] for i in range(len(b))])
out = np.flatnonzero(mask)[:,None]+np.arange(len(b))
Just making the point that @MSeifert's approach can, of course, also be implemented in NumPy:
def pp(h, n):
    nn = len(n)
    NN = len(h)
    c = (h[:NN-nn+1] == n[0]).nonzero()[0]
    if c.size == 0:
        return
    for i, l in enumerate(n[1:].tolist(), 1):
        c = c[h[i:][c] == l]
        if c.size == 0:
            return
    return np.arange(c[0], c[0]+nn)
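A usage sketch for pp; the inputs are assumed to be NumPy arrays, since the function slices and compares them element-wise and calls .tolist():
h = np.array([5, 6, 7, 8, 9, 10, 5, 15, 20, 50, 16, 18])
n = np.array([10, 5, 15, 20])
print(pp(h, n))   # [5 6 7 8]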
from collections import defaultdict

def get_data(l1, l2):
    d = defaultdict(list)
    [d[item].append(index) for index, item in enumerate(l1)]
    print(d)
This uses a defaultdict to store the indices of the elements from the other list.
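A hypothetical completion (not from the original answer) showing how that index map could be used to recover the positions of m in l:
from collections import defaultdict

def find_sublist_positions(l1, l2):
    d = defaultdict(list)                    # value -> all positions of that value in l1
    for index, item in enumerate(l1):
        d[item].append(index)
    return [tuple(range(i, i + len(l2)))     # only positions of l2[0] can start a match
            for i in d[l2[0]]
            if l1[i:i + len(l2)] == l2]

print(find_sublist_positions([5, 6, 7, 8, 9, 10, 5, 15, 20, 50, 16, 18], [10, 5, 15, 20]))  # [(5, 6, 7, 8)]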
