For loop only returning last item - python

import pandas as pd
import numpy as np
from scipy.spatial import distance  # assuming SciPy's distance module

# Create random df
df = pd.DataFrame(np.random.randint(1, 10, size=(100, 23)))
test = df[:50]

for i in range(len(test)):
    query_node = test.iloc[i]
    # Find the distance between this node and everyone else
    euclidean_distances = test.apply(lambda row: distance.euclidean(row, query_node), axis=1)
    # Create a new dataframe with distances.
    distance_frame = pd.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
    distance_frame.sort_values("dist", inplace=True)
    smallest_dist = [dist["idx"] for idx, dist in distance_frame.iloc[1:4].iterrows()]
I am stumped on this problem and wondering if anyone can see where I'm going wrong. I am trying to calculate the euclidean distance between each row and every other row. Then, I sort those distances and return the index positions of the "most similar" rows by minimum distance in the list smallest_dist.
The issue is that this only returns the most similar index positions of the last row: [6.0, 3.0, 4.0]
What I want for output is something like this:
Original ID    Matches
1              4,5,6
2              8,2,5
I've tried this but it gives the same result:
list_of_mins = []
for i in range(len(test)):
    query_node = test.iloc[i]
    # Find the distance between this node and everyone else
    euclidean_distances = test.apply(lambda row: distance.euclidean(row, query_node), axis=1)
    # Create a new dataframe with distances.
    distance_frame = pd.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
    distance_frame.sort_values("dist", inplace=True)
    smallest_dist_ixs = [dist["idx"] for idx, dist in distance_frame.iloc[1:4].iterrows()]

for i in range(len(test)):
    list_of_mins.append(smallest_dist_ixs)
Does anyone know what's causing this problem? Thank you!

I don't have the distance library available, so I changed it to a simple sum; it should work once you swap the distance calculation back in.
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randint(1, 10, size=(100, 23)))
test = df[:50]

dict_results = {'ids': [],
                'ids_min': []}
n_min = 2

for i in range(len(test)):
    query_node = test.iloc[i]
    # Find the distance between this node and everyone else
    euclidean_distances = test.apply(lambda row: np.sum(row), axis=1)
    # Create a new dataframe with distances.
    # print(euclidean_distances)
    distance_frame = pd.DataFrame(data={"dist": euclidean_distances,
                                        "idx": euclidean_distances.index})
    selected_min = distance_frame.sort_values("dist").head(n_min)
    dict_results['ids'].append(i)
    dict_results['ids_min'].append(', '.join(selected_min['idx'].astype('str')))

print(pd.DataFrame(dict_results))
I made a few changes to your code:
Added an n_min parameter to define how many elements you want in the second column (the number of indices of the closest rows).
Created a dict where the results are saved, which is then used to build the data frame you want.
Inside the loop, appended the results of each iteration to that dict.
After the loop, passing the dict to pd.DataFrame parses it the same way you were building distance_frame.
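For reference, a minimal sketch of swapping the placeholder back, assuming the distance module from the question is SciPy's scipy.spatial.distance:
from scipy.spatial import distance

# Replaces the np.sum(row) placeholder above with the original metric.
euclidean_distances = test.apply(
    lambda row: distance.euclidean(row, query_node), axis=1)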

What happens if you store the results in either the data frame or (for convenience of testing) a dictionary? For example:
df = pd.DataFrame(np.random.randint(1, 10, size=(100, 23)))
test = df[:50]
closest_nodes = {}

for i in range(len(test)):
    query_node = test.iloc[i]
    # Find the distance between this node and everyone else
    euclidean_distances = test.apply(lambda row: distance.euclidean(row, query_node), axis=1)
    # Create a new dataframe with distances.
    distance_frame = pd.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
    distance_frame.sort_values("dist", inplace=True)
    closest_nodes[i] = [dist["idx"] for idx, dist in distance_frame.iloc[1:4].iterrows()]
The thing I didn't see in your code was any storage step that puts the one result per test case into a permanent structure.
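If the goal is the two-column table shown in the question, one hedged sketch is to flatten the closest_nodes dictionary afterwards (the column names here are illustrative, not from the original post):
# Turn the closest_nodes dict into an "Original ID" / "Matches" table.
result = pd.DataFrame({
    "Original ID": list(closest_nodes.keys()),
    "Matches": [", ".join(str(int(i)) for i in ids) for ids in closest_nodes.values()],
})
print(result.head())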

Related

get the full row data from the found extrema

I am new to using pandas and I can't find a way to get the full row of the found extrema:
import pandas as pd
import numpy as np
from scipy.signal import argrelextrema

df = pd.read_csv('test.csv')
df['min'] = df.iloc[argrelextrema(df.Close.values, np.less_equal,
                                  order=10)[0]]['Close']
df['max'] = df.iloc[argrelextrema(df.Close.values, np.greater_equal,
                                  order=10)[0]]['Close']
# create lists for `min` and `max`
min_values_list = df['min'].dropna().tolist()
max_values_list = df['max'].dropna().tolist()
print(min_values_list, max_values_list)
It prints only the minima and maxima values, but I need the full row data of the found minima/maxima.
Example of data:
Datetime,Date,Open,High,Low,Close
2021-01-11 00:00:00+00:00,18638.0,1.2189176082611084,1.2199585437774658,1.2186205387115479,1.2192147970199585
If the list is required, then I would suggest:
def df_to_list_rowwise(df: pd.DataFrame) -> list:
    return [df.iloc[_, :].tolist() for _ in range(df.shape[0])]

df_min_values = df.iloc[argrelextrema(np.array(df.Close), np.less_equal)[0], :]
df_max_values = df.iloc[argrelextrema(np.array(df.Close), np.greater_equal)[0], :]
print(df_to_list_rowwise(df_min_values))
print(df_to_list_rowwise(df_max_values))
Would that help?
Try using df.dropna().index.tolist() instead of specifying the column: adding the column name returns just the value at that row and column, not the whole row.
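For example, a minimal sketch of that idea (keep the index positions of the extrema rather than their values, then recover the full rows with .loc) might look like:
# Hedged sketch: collect the extrema's index labels, then pull complete rows.
min_idx = df['min'].dropna().index.tolist()
max_idx = df['max'].dropna().index.tolist()
print(df.loc[min_idx])  # full rows at the local minima
print(df.loc[max_idx])  # full rows at the local maxima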

Problems with pd.merge

Hope you all are having an excellent week.
So, I was finishing a script that worked really well for a specific use case. The base is as follows:
Function cosine_similarity_join:
def cosine_similarity_join(a: pd.DataFrame, b: pd.DataFrame, col_name):
    a_len = len(a[col_name])
    # all of the "documents" in a 1D array
    corpus = np.concatenate([a[col_name].to_numpy(), b[col_name].to_numpy()])
    # vectorize the array
    tfidf, vectorizer = fit_vectorizer(corpus, 3)
    # in this matrix each row represents the str in a and the col is the str from b, value is the cosine similarity
    res = cosine_similarity(tfidf[:a_len], tfidf[a_len:])
    res_series = pd.DataFrame(res).stack().rename("score")
    res_series.index.set_names(['a', 'b'], inplace=True)
    # join scores to b
    b_scored = pd.merge(left=b, right=res_series, left_index=True, right_on='b').droplevel('b')
    # find the indices on which to match, (highest score in each row)
    best_match = np.argmax(res, axis=1)
    # Join the rest of
    res = pd.merge(left=a, right=b_scored, left_index=True, right_index=True, suffixes=('', '_Guess'))
    print(res)
    df = res.reset_index()
    df = df.iloc[df.groupby(by="RefCol")["score"].idxmax()].reset_index(drop=True)
    return df
This works like a charm when I do something like:
resulting_df = cosine_similarity_join(df1,df2,'My_col')
But in my case, I need something along the lines of:
big_df = pd.read_csv('some_really_big_df.csv')
some_other_df = pd.read_csv('some_other_small_df.csv')

counter = 0
size = 10000
total_size = len(big_df)

while counter <= total_size:
    small_df = big_df[counter:counter+size]
    resulting_df = cosine_similarity_join(small_df, some_other_df, 'My_col')
    counter += size
I have already traced the problem to one specific line in the function:
res = pd.merge(left=a, right=b_scored, left_index=True, right_index=True, suffixes=('', '_Guess'))
Basically, this res dataframe comes out empty and I just cannot understand why (when I replicate the values outside of the loop it works just fine)...
I have been looking at the problem for hours now and would gladly accept fresh eyes on the question.
Thank you all in advance!
Found the problem!
I just needed to reset the indexes for the join clause - once I create a new small df from the big df, the indexes remain equal to the slice of the big one, thus generating the problem when joining with another df!
So basically all I needed to do was:
while counter <= total_size:
    small_df = big_df[counter:counter+size]
    small_df = small_df.reset_index()
    resulting_df = cosine_similarity_join(small_df, some_other_df, 'My_col')
    counter += size
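One small follow-on note: reset_index(drop=True) discards the old index entirely instead of keeping it as a new column, which keeps a stray index column from sneaking into later merges. A minimal sketch:
# drop=True throws the old slice index away rather than storing it as a column.
small_df = big_df[counter:counter + size].reset_index(drop=True)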
I'll leave it here in case it helps someone :)
Cheers!

Adding empty rows in Pandas dataframe

I'd like to insert empty rows at regular intervals in my dataframe.
I have the following code, which does roughly what I want, but I'm struggling to adjust it to my needs:
s = pd.Series('', data_only_trades.columns)
f = lambda d: d.append(s, ignore_index=True)
set_rows = np.arange(len(data_only_trades)) // 4
empty_rows = data_only_trades.groupby(set_rows, group_keys=False).apply(f).reset_index(drop=True)
How can I adjust the code so that it adds two or more rows instead of one?
How can I set a starting point (e.g. it should start with row 5 -- do I have to use .loc in arange then)?
I also tried this code, but I struggled to set the starting row and to make the values blank (I got NaN):
df_new = pd.DataFrame()
for i, row in data_only_trades.iterrows():
    df_new = df_new.append(row)
    for _ in range(2):
        df_new = df_new.append(pd.Series(), ignore_index=True)
Thank you!
import numpy as np
v = np.ndarray(shape=(numberOfRowsYouWant,df.values.shape[1]), dtype=object)
v[:] = ""
pd.DataFrame(np.vstack((df.values, v)))
I think you can use NumPy for this.
But if you want to stay with your approach, simply convert the NaN values to "":
df.fillna("")

fastest way to iterate through uneven columns to find the existing value

I have 2 DataFrames of (df1) 35k and (df2) 76k rows, where I need to check whether the elements of df1["col1"] exist among the sub-elements of df2["col2"]. The code seems to work fine on the sample dataset provided below, but the runtime takes forever on the original one. Here is the for-loop code I used on the sample dataset:
import pandas as pd
post_token_list = [['wXrL3TbK'], ['wXmTQKw1'], ['wXvnlWej'], ['wXvXBjKp']]
tokens_list = [['wXv3qoPQ', 'wXvT7ylu', 'wXvnIJuH', 'wXvXH7vy', 'wXvDXSS1', 'wXvjVE1F', 'wXvPV6z1', 'wXvHF1uw',
'wXvH1q03', 'wXvnTlcr', 'wXvDEG9U', 'wXLfZtO6', 'wXvLDDDl', 'wXvHTgjk', 'wXvHDDr8', 'wXvPBLbu',
'wXvvxXHI', 'wXvPBFge', 'wXvLxSii', 'wXvDhk2h', 'wXv3Alan', 'wXvvQuKy', 'wXvvQ6LO', 'wXpHNjw9'],
['wXYr2lVk', 'wXXj7iDP', 'wXXXIsQr', 'wXQbXKz6', 'wXN3tMp1', 'wXMfZV5N', 'wXvnlWej', 'wXSDyEaW',
'wXQ7mM78', 'wXMPvojh', 'wXMjo-8G', 'wXLfZtO6', 'wXN3tMp1'],
['wXr_jZmX', 'wXr7D0AM', 'wXrzjhxL', 'wXrfjQNe', 'wXrnihqT', 'wXrjyqm5', 'wXr3CD4h', 'wXrnSZsy',
'wXrTieP7', 'wXLfZtO6', 'wXgHVwkc', 'wXdvewsV', 'wXrfxZeg', 'wXrLB7Zo', 'wXprtX71', 'wXrHhjtO',
'wXrzwKBt', 'wXqz-RlY', 'wXq_fp7F', 'wXq7Po7n', 'wXq7fC73', 'wXqzvRSW', 'wXqf_PQ3', 'wXML2vCd'],
['wXv3aQrv', 'wXvn6ONM', 'wXvfaG0M', 'wXvf6LIr', 'wXvjJBg_', 'wXvL6M-0', 'wXv7p2cd', 'wXv3poSs',
'wXvz5kUz', 'wXvrZz0_', 'wXv_YVCb', 'wXLfZtO6', 'wXvX5Hgi', 'wXvz3Ptg', 'wXvHJUU-', 'wXvr4fB7',
'wXvnlWej', 'wXv_YUrK', 'wXv7Id05', 'wXv7IYOV', 'wXvfYfLo', 'wXv7Y3AV', 'wXvT4_pE', 'wXvPovRt'],
['wXoDui-2', 'wXoT9yTg', 'wXmTQKw1', 'wXormLxu', 'wXMX-NNQ', 'wXo7kUfB', 'wXon0rt_', 'wXozT-3V',
'wXnvYjEc', 'wXnTn9D6', 'wXnLH7Cz', 'wXn_2HV_', 'wXnPGou9', 'wXnPVSNo', 'wXuG0sl3', 'wXnjAs7X',
'wXm38mLv', 'wXmnj5Oh', 'wXmfjQ2h', 'wXm_wXuD', 'wXlPOUmy', 'wXcfHkmx', 'wXQ_62cx', 'wXUD3qyx']]
df1 = pd.DataFrame({"col1": post_token_list})
df2 = pd.DataFrame({"col2": tokens_list})
query_bounce = []

def query_bounce_checker(dataset_clicked, dataset_loaded, col1, col2):
    for i in dataset_clicked[col1]:
        for j in i:
            [query_bounce.append(k) for k in dataset_loaded[col2] if j in k]
    return query_bounce

query_bounce_checker(df1, df2, "col1", "col2")
The i, j, and k values are used to access and compare the elements and sub-elements of the two respective columns.
Speed is a major factor for me, and the function written here is not fast enough for a dataset of this size.
If this is actually what you want, this should be pretty fast.
import numpy as np
np.intersect1d(np.hstack(df1.col1),np.hstack(df2.col2))
Output
array(['wXmTQKw1', 'wXvnlWej'], dtype='<U8')
I am not sure if this is what you want. If you just want to check which values in df1 also exist in df2, you can transform the two dataframes into arrays and use np.in1d() to do so.
Try this:
array1 = np.array((','.join(df1['col1'].apply(lambda x: ','.join(x)))).split(','))
array2 = np.array((','.join(df2['col2'].apply(lambda x: ','.join(x)))).split(','))
print(array1[np.in1d(array1,array2)])
Output:
['wXmTQKw1' 'wXvnlWej']
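For what it's worth, a plain-Python alternative sketch (not from either answer): flatten df2 into a set once so each membership test is O(1), then collect the df1 tokens that appear in it. Note that np.intersect1d returns a sorted, de-duplicated array, while np.in1d and this loop keep df1's original order.
# Hedged alternative sketch: one pass to build a set, one pass to collect hits.
wanted = {tok for sub in df2["col2"] for tok in sub}
hits = [tok for sub in df1["col1"] for tok in sub if tok in wanted]
print(hits)  # ['wXmTQKw1', 'wXvnlWej'] for the sample data above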

How to make a pandas dataframe from generated list data

I have a list of co-authors:
ten_author_pairs = [('creutzig', 'gao'),
('creutzig', 'linshaw'),
('gao', 'linshaw'),
('jing', 'zhang'),
('jing', 'liu'),
('zhang', 'liu'),
('jing', 'xu'),
('briant', 'einav'),
('chen', 'gao'),
('chen', 'jing')]
From here I can generate a list of negative examples, i.e. author pairs which are unconnected, using the following code:
#generating negative examples -
from itertools import combinations
elements = list(set([e for l in ten_author_pairs for e in l])) # find all unique elements
complete_list = list(combinations(elements, 2)) # generate all possible combinations
#convert to sets to negate the order
set1 = [set(l) for l in ten_author_pairs]
complete_set = [set(l) for l in complete_list]
# find sets in `complete_set` but not in `set1`
ten_unconnnected = [list(l) for l in complete_set if l not in set1]
print(len(ten_author_pairs))
print(len(ten_unconnnected))
Next, I want to implement a link prediction problem for which I want to obtain a dataframe as follows:
author-pair jaccard Resource_Allocation Adamic_Adar Preferential cn_soundarajan_hopcroft within_inter_cluster link
creutzig-linshaw 0.25 0.25 0.25 0.25 0.25 0.25 1
I can calculate these and output lists of scores using the networkx documentation, but I am not able to put them together into a table like the one shown above.
Like for the positive examples (the list mentioned above), I can generate a dataframe using:
df = pd.DataFrame(list, columns = ['u1', 'u2'])
and then make a graph with:
G = nx.from_pandas_edgelist(df, 'u1', 'u2', create_using = nx.Graph())
After which, say for the Jaccard index, I can apply:
nx.jaccard_coefficient(G)
which returns a list of node pairs with their Jaccard scores.
The 'link' column is generated with the logic: 1 for co-authors and 0 for pairs in the negative examples.
But I need all the respective scores together in a table, as mentioned.
Can anyone please help me with how to construct the above dataframe?
(The scores mentioned are just for example purposes, to indicate the kind of table I need.)
Oh, this has been a good two years, but I just stumbled upon this... In case I understood you correctly, building on your code:
from itertools import combinations
import pandas as pd
import networkx as nx
elements = list(set([e for l in ten_author_pairs for e in l]))
complete_list = list(combinations(elements, 2))
set1 = [set(l) for l in ten_author_pairs]
df = pd.DataFrame(set1, columns=["u1", "u2"])
G = nx.from_pandas_edgelist(df, "u1", "u2", create_using=nx.Graph())
Then defining the list of generators:
list_generators = [
    nx.jaccard_coefficient,
    nx.resource_allocation_index,
    nx.adamic_adar_index,
    nx.preferential_attachment,
]
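The get_df_network helper is not shown in the answer; a minimal sketch of what it presumably does (turn a generator's (u, v, score) triples into a DataFrame indexed by node pair) could be:
def get_df_network(generator, graph):
    # Assumed helper: evaluate a networkx link-prediction generator over the
    # graph and return its scores indexed by (node_0, node_1).
    rows = [(u, v, score) for u, v, score in generator(graph)]
    return (
        pd.DataFrame(rows, columns=["node_0", "node_1", generator.__name__])
        .set_index(["node_0", "node_1"])
    )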
Building the score dataframe:
dfx = pd.DataFrame()
for item_generator in list_generators:
    if dfx.shape[0]:
        dfx = dfx.merge(
            right=get_df_network(generator=item_generator, graph=G),
            left_index=True,
            right_index=True,
        )
    else:
        dfx = get_df_network(generator=item_generator, graph=G)
And finally merging in the link dataframe:
df_link = (
    pd.DataFrame(set1, columns=["node_0", "node_1"])
    .set_index(["node_0", "node_1"])
    .assign(link=[1] * len(set1))
)
dfx.merge(df_link, left_index=True, right_index=True, how="outer").fillna(0)
could do the job?
