Rank the row based on the similar text using python? - python

How do I rank the data frame based on row values? I.e., I have rows that contain text data and I want to rank them based on their similarity.
Expected output
I have tried the Levenshtein distance, but I am not sure how to do it for the whole table.
def bow(x=None):
    """Return a normalized bag-of-words key for *x*.

    Lower-cases the text, sorts its space-separated words, strips
    punctuation and digits, and appends a single trailing space
    (the original output format, used as a sort key downstream).

    Args:
        x: input string, or None (the default).

    Returns:
        The normalized string ending in one trailing space, or '' for None.
    """
    # Guard: the original default of None crashed immediately on x.lower().
    if x is None:
        return ''
    x = x.lower()
    # Sort the words so word order does not affect the key.
    words = x.split(' ')
    words.sort()
    x = ' '.join(words)
    # Remove punctuation and digits in a single pass.
    exclude = set('{}{}'.format(string.punctuation, string.digits))
    x = ''.join(ch for ch in x if ch not in exclude)
    # The original format always carries exactly one trailing space.
    x = '{} '.format(x.strip())
    return x
# Rank rows by similarity: compute a normalized bag-of-words key per row,
# sort by it so near-duplicates become neighbours, then measure the
# Levenshtein distance between each row's key and the previous row's key.
#intents = load_intents(export=True)
df['bow'] = df['name'].apply(lambda x: bow(x))
# After this sort, similar texts sit next to each other, so a small
# distance to the previous row flags a near-duplicate.
df.sort_values(by='bow',ascending=True,inplace=True)
last_bow = ''
recs = []
for idx,row in df.iterrows():
    record = {
        'name': row['name'],
        'bow': row['bow'],
        # ed.eval computes the edit (Levenshtein) distance — presumably the
        # `editdistance` package imported as `ed`; TODO confirm the import.
        'lev_distance': ed.eval(last_bow,row['bow'])
    }
    recs.append(record)
    last_bow = row['bow']
intents = pd.DataFrame(recs,columns=['name','bow','lev_distance'])
# Keep every row whose distance to its predecessor is within the threshold,
# and also include that predecessor (index - 1); de-duplicate the indices.
l = intents[intents['lev_distance'] <= lev_distance_range]
r = []
for x in l.index.values:
    r.append(x - 1)
    r.append(x)
r = list(set(r))
l = intents.iloc[r,:]

Using textdistance, you could try this:
import pandas as pd
import textdistance
# Demo frame of short strings to be grouped by textual similarity.
df = pd.DataFrame(
    {
        "text": [
            "Rahul dsa",
            "Rasul dsad",
            "Raul ascs",
            "shrez",
            "Indya",
            "Indi",
            "shez",
            "india",
            "kloa",
            "klsnsd",
        ],
    }
)
# For every row, collect the indices of all rows whose Jaro-Winkler
# similarity is >= 0.9 (each row always matches at least itself).
# Sorting by that list of indices places mutually-similar rows next to
# each other; the helper column is dropped once the order is fixed.
df = (
    df
    .assign(
        match=df["text"].map(
            lambda x: [
                i
                for i, text in enumerate(df["text"])
                if textdistance.jaro_winkler(x, text) >= 0.9
            ]
        )
    )
    .sort_values(by="match")
    .drop(columns="match")
)
print(df)
# Output
text
0 Rahul dsa
1 Rasul dsad
2 Raul ascs
3 shrez
6 shez
4 Indya
5 Indi
7 india
8 kloa
9 klsnsd

Related

How to change python string into pandas data frame?

Fellow developers on StackOverflow,
I have string data in
'key=apple; age=10; key=boy; age=3'
How can we convert it into a pandas data frame such that key and age are the column headers and all the values are in the columns?
key age
apple 10
boy 3
Try this:
import pandas as pd
# Parse 'key=...; age=...' fragments into two parallel column lists.
data = 'key=apple; age=10; key=boy; age=3'
words = data.split(";")
key = []
age = []
for word in words:
    # Route each 'name=value' fragment to the matching column list.
    value = word.split("=")[1]
    if "key" in word:
        key.append(value)
    else:
        age.append(value)
# Build the frame from the key column, then attach the age column.
df = pd.DataFrame({"key": key})
df["age"] = age
print(df)
You can try this:
import pandas as pd
str_stream = 'key=apple; age=10; key=boy; age=3'
lst_kv = str_stream.split(';')
# lst_kv => ['key=apple', ' age=10', ' key=boy', ' age=3']
# Pair up consecutive key/age fragments, two per record.
# FIX: step by 2 — the original range(len(lst_kv)//2) produced OVERLAPPING
# windows ([0:2] then [1:3]), which is why the posted output showed
# boy/10 instead of boy/3.
res = [{s.split('=')[0].strip(): s.split('=')[1] for s in lst_kv[i:i + 2]}
       for i in range(0, len(lst_kv), 2)
       ]
df = pd.DataFrame(res)
df
Output:
key age
0 apple 10
1 boy 10
More explanation for one line res :
# Expanded (loop) form of the pairing comprehension.
res = []
# FIX: step by 2 so each window holds exactly one key= and one age=
# fragment; the original range(len(lst_kv)//2) with lst_kv[i:i+2] produced
# overlapping windows and therefore repeated/wrong values.
for i in range(0, len(lst_kv), 2):
    dct_tmp = {}
    for s in lst_kv[i:i + 2]:
        kv = s.split('=')
        # Strip the leading space that split(';') leaves on the key.
        dct_tmp[kv[0].strip()] = kv[1]
    res.append(dct_tmp)
res
Output:
[{'key': 'apple', 'age': '10'}, {'age': '10', 'key': 'boy'}]

Additional columns added to saved CSV

I have following code which generate features from csv
def _series_stats(s):
    """Return [mean, median, stdev, raw_min, raw_max, abs_min, abs_max] for a Series.

    NOTE(review): statistics.stdev raises on fewer than two samples, exactly
    as the original per-axis code did — confirm every 10s window has >= 2 rows.
    """
    return [
        s.mean(),
        s.median(),
        statistics.stdev(s),
        s.min(),
        s.max(),
        s.abs().min(),
        s.abs().max(),
    ]


def gen_features_per_id(file_name, label):
    """Aggregate x/y/z features per 10-second window and write them to some.csv.

    Reads *file_name* (CSV with datetime,x,y,z columns, datetime in epoch
    seconds), buckets rows into 10-second windows, computes summary stats
    over the first 50 samples of each window, and writes one row per
    window to 'some.csv'.

    Args:
        file_name: path to the input CSV.
        label: class label copied into every output row.
    """
    df = pd.read_csv(file_name, delimiter=',')
    df['dt'] = pd.to_datetime(df['datetime'], unit='s')
    column_names = ['group_timestamp', 'label',
                    'x_mean', 'x_median', 'x_stdev', 'x_raw_min', 'x_raw_max', 'x_abs_min', 'x_abs_max',
                    'y_mean', 'y_median', 'y_stdev', 'y_raw_min', 'y_raw_max', 'y_abs_min', 'y_abs_max',
                    'z_mean', 'z_median', 'z_stdev', 'z_raw_min', 'z_raw_max', 'z_abs_min', 'z_abs_max']
    rows = []
    for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
        print(f'Start time {group_name} has {len(g)} records within 10 secs')
        row = [group_name, label]
        # Identical stats for each axis — one helper call per axis replaces
        # three copy-pasted blocks of 7 computations + 7 appends.
        for axis in ('x', 'y', 'z'):
            row.extend(_series_stats(g[axis].head(50)))
        rows.append(row)
    # FIX: build the frame ONCE from the collected rows with named columns.
    # The original group_df.append([row]) treated the list as positional
    # data, which added integer columns 0..22 alongside the named ones,
    # and to_csv ran inside the loop, rewriting the file every iteration.
    group_df = pd.DataFrame(rows, columns=column_names)
    group_df.to_csv("some.csv", index=False)
But the saved CSV file has additional columns added to the start of the CSV header, equal in number to the supplied columns.
Sample CSV
datetime,x,y,z,label
1493740845,0.0004,-0.0001,0.0045,bad
1493740846,0.0003,0.0002,0.0047,bad
1493740847,0.0005,0.0001,0.0049,bad
1493740848,0.0006,0.0004,0.005,bad
1493740849,0.0006,-0.0003,0.005,bad
1493740851,0.0001,-0.0003,0.0039,bad
1493740852,-0.0006,0.0003,0.0046,bad
1493740853,0.0001,0.0,0.0048,bad
Output:
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,group_timestamp,label,x_abs_max,x_abs_min,x_mean,x_median,x_raw_max,x_raw_min,x_stdev,y_abs_max,y_abs_min,y_mean,y_median,y_raw_max,y_raw_min,y_stdev,z_abs_max,z_abs_min,z_mean,z_median,z_raw_max,z_raw_min,z_stdev
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
How to fix this?
Additionally : If you can help me to simplify the code more.
The problem is that, for each iteration of the groupby loop, it is necessary to append the values to a row list and then append that row to a rows list outside the inner loop (building a nested list), so the nested list can be passed to the DataFrame constructor in the last step:
# Abbreviated corrected version of the loop ('....' stands for the
# unchanged per-axis computations from the question).
#added for nested lists (outside loops)
rows = []
df['dt'] = pd.to_datetime(df['datetime'], unit='s')
for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
    #added for row per loop
    row = []
    print(f'Start time {group_name} has {len(g)} records within 10 secs')
    group_timestamp = group_name
    label = label
    x = g['x'].head(50)
    x_mean = x.mean()
    ....
    row.append(z_abs_max)
    rows.append(row)
#DataFrame outside loops
group_df = pd.DataFrame(rows, columns=column_names)
print (group_df)
Your solution should be improved by GroupBy.agg:
#custom aggregate functions
def std_dev(x):
    # Sample standard deviation of one group's column.
    return statistics.stdev(x)
def abs_min(x):
    # Minimum of the absolute values.
    return x.abs().min()
def abs_max(x):
    # Maximum of the absolute values.
    return x.abs().max()
# Mix of built-in aggregation names (strings) and custom callables;
# callables are labelled by their function name in the result columns.
d = ['mean','median',std_dev, 'min','max', abs_min, abs_max]
cols = ['x','y','z']
#filtered first 50 rows
df[cols] = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].head(50)
#aggregate functions
group_df = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].agg(d)
# Flatten the (column, stat) MultiIndex into 'x_mean'-style names.
group_df.columns = group_df.columns.map('_'.join)
print (group_df)

Python - How to convert elements in Panda from list to string

My df needs to have raw text, but the results I am getting come inside brackets.
Here is my code:
# Scrape chat messages via Selenium; the class names are obfuscated CSS
# classes — presumably WhatsApp Web, subject to change (TODO confirm).
chatroom = driver.find_element_by_class_name('_1_q7u')
pos= 0
df = pd.DataFrame(columns=['Msgs','Time'], index=None)
chat = driver.find_element_by_class_name('_19vo_').text
for ol in chatroom.find_elements_by_class_name('message-in'):
    # Both comprehensions produce LISTS of texts — storing lists in the
    # cells is why the output shows [T] / [14:30] instead of plain strings.
    msgs = [k.text for k in ol.find_elements_by_class_name('_12pGw')]
    times = [k.text for k in ol.find_elements_by_class_name('_1RNhZ')]
    # df = [msgs,times]
    df.loc[pos] = [msgs, times]
    pos+=1
print(df)
Here is the output i have:
Msgs Time
0 [T] [14:30]
1 [Z] [14:36]
2 [Q] [14:37]
3 [R] [14:39]
Here is the output i want:
Msgs Time
0 T 14:30
1 Z 14:36
2 Q 14:37
3 R 14:39
chatroom = driver.find_element_by_class_name('_1_q7u')
pos= 0
df = pd.DataFrame(columns=['Msgs','Time'], index=None)
chat = driver.find_element_by_class_name('_19vo_').text
for ol in chatroom.find_elements_by_class_name('message-in'):
    # [0] unwraps the single-element list so each cell holds a plain string.
    # NOTE(review): raises IndexError if a message has no matching element.
    msgs = [k.text for k in ol.find_elements_by_class_name('_12pGw')][0]
    times = [k.text for k in ol.find_elements_by_class_name('_1RNhZ')][0]
    # df = [msgs,times]
    df.loc[pos] = [msgs, times]
    pos+=1
print(df)

Added a not desired column in csv

I have this code
from sklearn import tree
# Train a decision tree on the Titanic training set, predict on the test
# set, and save PassengerId + prediction to CSV.
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
# NOTE(review): chained assignment (df["col"][mask] = v) may warn or
# silently fail on a copy; prefer train.loc[mask, "col"] = v.
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
train["Embarked"] = train["Embarked"].fillna("S")
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
# Row 152 of the test set has a missing Fare; fill it with the median.
test.Fare[152] = test["Fare"].median()
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1
test["Embarked"] = test["Embarked"].fillna("S")
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values
my_prediction = my_tree_one.predict(test_features)
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId)
# NOTE(review): this frame has ONE unnamed data column plus the index, so
# passing two index_labels is what adds the stray '0' column header —
# the very problem the question asks about.
my_solution.to_csv("5.csv", index_label = ["PassangerId", "Survived"])
As you can see, I only want to save a CSV with two columns, but when I look at the file 5.csv, another column called 0 has been added. Does anybody know why?
You're seeing this behaviour because you're adding two index_labels when there is only one index.
You can instead name your one column as such:
my_solution.columns = ['Survived']
And then label your index like so:
my_solution.to_csv("5.csv", index_label=["PassengerId"])
Try this slightly optimized solution:
from sklearn import tree
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
cols = ["Pclass", "Sex", "Age", "Fare"]
mappings = {
    'Sex': {'male':0, 'female':1},
}
def cleanup(df, mappings=mappings):
    """Map non-numeric columns to numbers and fill NaNs with column means."""
    # map non-numeric columns
    for c in mappings.keys():
        df[c] = df[c].map(mappings[c])
    # replace NaN's with average value
    for c in df.columns[df.isnull().any()]:
        df[c].fillna(df[c].mean(), inplace=True)
    return df
# parse train data set
# FIX: the original called d.read_csv — `d` is an undefined name; it must
# be pd.read_csv, matching every other read in this answer.
train = cleanup(pd.read_csv(train_url, usecols=cols + ['Survived']))
my_tree_one = tree.DecisionTreeClassifier()
# Use the keyword axis=1 — the positional second argument to drop() is
# removed in modern pandas.
my_tree_one.fit(train.drop('Survived', axis=1), train['Survived'])
# parse test data set
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url, usecols=cols+['PassengerId'])
# pop() removes PassengerId before cleanup and keeps it for the output.
result = test.pop('PassengerId').to_frame('PassengerId')
test = cleanup(test)
result['Survived'] = my_tree_one.predict(test)
result.to_csv("5.csv", index=False)

Adding to JSON in Python and converting to an object

I have a JSON array shown below.
[
"3D3iAR9M4HDETajfD79gs9BM8qhMSq5izX",
"35xfg4UnpEJeHDo55HNwJbr1V3G1ddCuVA"
]
I would like to add a value in the form of the string (self.tx_amount_5) so I get a JSON OBJECT something like this:
{
"3D3iAR9M4HDETajfD79gs9BM8qhMSq5izX" : 100000
"35xfg4UnpEJeHDo55HNwJbr1V3G1ddCuVA" : 100000
}
The part of code that has generated the first JSON array is:
# Fetch the wallet balance (in satoshis) from the BlockCypher API.
r = requests.get('http://api.blockcypher.com/v1/btc/main/addrs/A/balance')
balance = r.json()['balance']
with open("Entries#x1.csv") as f,open("winningnumbers.csv") as nums:
    # NOTE(review): `imap` is Python-2 itertools.imap; use map() on Python 3.
    nums = set(imap(str.rstrip, nums))
    r = csv.reader(f)
    results = defaultdict(list)
    for row in r:
        # Count how many of the entry's numbers (columns 1..) are winning
        # numbers, and bucket the entry id (column 0) under that count.
        results[sum(n in nums for n in islice(row, 1, None))].append(row[0])
self.number_matched_0 = results[0]
self.number_matched_1 = results[1]
self.number_matched_2 = results[2]
self.number_matched_3 = results[3]
self.number_matched_4 = results[4]
self.number_matched_5 = results[5]
self.number_matched_5_json = json.dumps(self.number_matched_5, sort_keys = True, indent = 4)
print(self.number_matched_5_json)
# Split the prize pool: 0.1% / 10% / 40% of the balance shared evenly
# among the 3-, 4- and 5-number winners respectively.
# NOTE(review): tx_amount_* stays unset when nobody matched — a later read
# of self.tx_amount_5 would raise AttributeError; confirm callers guard this.
if len(self.number_matched_3) == 0:
    print('Nobody matched 3 numbers')
else:
    self.tx_amount_3 = int((balance*0.001)/ len(self.number_matched_3))
if len(self.number_matched_4) == 0:
    print('Nobody matched 4 numbers')
else:
    self.tx_amount_4 = int((balance*0.1)/ len(self.number_matched_4))
if len(self.number_matched_5) == 0:
    # FIX: this branch is the 5-match case — the original message said
    # '3 numbers' (copy-paste from the first branch).
    print('Nobody matched 5 numbers')
else:
    self.tx_amount_5 = int((balance*0.4)/ len(self.number_matched_5))
If I understand correctly, you can create the dictionary like this:
import json
# The two wallet addresses, as a JSON array literal.
s="""[
"3D3iAR9M4HDETajfD79gs9BM8qhMSq5izX",
"35xfg4UnpEJeHDo55HNwJbr1V3G1ddCuVA"
]"""
# Assign the same 5-match payout amount to every address.
d = dict.fromkeys(json.loads(s), self.tx_amount_5)
print(d)
which produces
{'3D3iAR9M4HDETajfD79gs9BM8qhMSq5izX': 100000,
'35xfg4UnpEJeHDo55HNwJbr1V3G1ddCuVA': 100000}

Categories