I have a dataframe with two columns. The df looks as follows:
Word Tag
0 Asam O
1 instruksi O
2 - O
3 instruksi X
4 bahasa Y
5 Instruksi P
6 - O
7 instruksi O
8 sebuah Q
9 satuan K
10 - L
11 satuan O
12 meja W
13 Tiap Q
14 - O
15 tiap O
16 karakter P
17 - O
18 ke O
19 - O
20 karakter O
and I'd like to merge the rows containing a dash - into a single row, so the output should be the following:
Word Tag
0 Asam O
1 instruksi-instruksi O
2 bahasa Y
3 Instruksi-instruksi P
4 sebuah Q
5 satuan-satuan K
6 meja W
7 Tiap-tiap Q
8 karakter-ke-karakter P
Any ideas? Thanks in advance. I have tried the answer from Jacob K; it works, but then I found that in my dataset there can be more than one - row in between. I have included that case in the expected output (see index 8).
Solution from Jacob K:
# Import packages
import pandas as pd
import numpy as np

# Get 'Word' and 'Tag' columns as numpy arrays (for easy indexing)
words = df.Word.to_numpy()
tags = df.Tag.to_numpy()

# Create empty lists for new columns in output dataframe
newWords = []
newTags = []

# Use while (rather than a for loop) since index i can change dynamically
i = 0  # To not cause any issues with the i-1 index
while (i < words.shape[0] - 1):
    if (words[i] == "-"):
        # Concatenate the strings above and below the "-"
        newWords.append(words[i-1] + "-" + words[i+1])
        newTags.append(tags[i-1])
        i += 2  # Don't repeat any concatenated values
    else:
        if (words[i+1] != "-"):
            # If there is no "-" next, append the regular word and tag values
            newWords.append(words[i])
            newTags.append(tags[i])
        i += 1  # Increment normally

# Create output dataframe output_df
d2 = {'Word': newWords, 'Tag': newTags}
output_df = pd.DataFrame(data=d2)
My approach with GroupBy.agg: build a grouping key in which a word and any adjacent - rows share the same block number, then join the words within each block:
#df['Word'] = df['Word'].str.replace(' ', '') #if necessary
blocks = df['Word'].shift().ne('-').mul(df['Word'].ne('-')).cumsum()
new_df = df.groupby(blocks, as_index=False).agg({'Word' : ''.join, 'Tag' : 'first'})
print(new_df)
Output
Word Tag
0 Asam O
1 instruksi-instruksi O
2 bahasa Y
3 Instruksi-instruksi P
4 sebuah Q
5 satuan-satuan K
6 meja W
7 Tiap-tiap Q
8 karakter-ke-karakter P
Blocks (Detail)
print(blocks)
0 1
1 2
2 2
3 2
4 3
5 4
6 4
7 4
8 5
9 6
10 6
11 6
12 7
13 8
14 8
15 8
16 9
17 9
18 9
19 9
20 9
Name: Word, dtype: int64
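To see how the grouping key works, here is a small sketch of the intermediate steps (the names not_dash and prev_not_dash are mine, for illustration): a new block starts only where both the current row and the previous one are not "-", and the cumulative sum turns those block starts into labels, so each dash and its two neighbours end up with the same label.
not_dash = df['Word'].ne('-')               # True where the row is not '-'
prev_not_dash = df['Word'].shift().ne('-')  # True where the previous row is not '-'
# .mul on booleans is a logical AND; cumsum numbers the resulting blocks
blocks = (prev_not_dash & not_dash).cumsum()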
This is a loop version:
import pandas as pd

# import data
DF = pd.read_csv("table.csv")

# create a new DF
newDF = pd.DataFrame()

# iterate through rows
for i in range(len(DF)-1):
    # prepare the previous row index (clamped so the first row has no negative index)
    prev = i-1
    if (prev < 0):
        prev = 0
    # copy the row if neither it nor its neighbours is '-'
    if (DF.loc[i+1, 'Word'] != '-'):
        if (DF.loc[i, 'Word'] != '-' and DF.loc[prev, 'Word'] != '-'):
            newDF = newDF.append(DF.loc[i, :])
    # unite the three rows if the middle one is '-'
    else:
        row = {'Tag': [DF.loc[i, 'Tag']], 'Word': [DF.loc[i, 'Word']+DF.loc[i+1, 'Word']+DF.loc[i+2, 'Word']]}
        newDF = newDF.append(pd.DataFrame(row))
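Note that DataFrame.append was removed in pandas 2.0; here is a sketch of the same loop collecting plain dicts in a list instead (the logic is otherwise unchanged):
rows = []
for i in range(len(DF)-1):
    prev = max(i-1, 0)
    if DF.loc[i+1, 'Word'] != '-':
        if DF.loc[i, 'Word'] != '-' and DF.loc[prev, 'Word'] != '-':
            rows.append({'Word': DF.loc[i, 'Word'], 'Tag': DF.loc[i, 'Tag']})
    else:
        rows.append({'Word': DF.loc[i, 'Word'] + DF.loc[i+1, 'Word'] + DF.loc[i+2, 'Word'],
                     'Tag': DF.loc[i, 'Tag']})
newDF = pd.DataFrame(rows)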
Related
So I have this dataframe:
import pandas as pd
d = {'id': [1,1,1,1,2,2,3,3,3,4,4,4,4],
'name':['ada','aad','ada','ada','dddd','fdd','ccc','cccd','ood','aaa','aaa','aar','rrp']
,'amount':[2,-12,12,-12,5,-5,2,3,-5,3,-10,10,-10]}
df1 = pd.DataFrame(d)
df1
id name amount
0 1 ada 2
1 1 aad -12
2 1 ada 12
3 1 ada -12
4 2 dddd 5
5 2 fdd -5
6 3 ccc 2
7 3 cccd 3
8 3 ood -5
9 4 aaa 3
10 4 aaa -10
11 4 aar 10
12 4 rrp -10
First I want to find the matching positive and negative amounts per id, which I do with this:
def match_pos_neg(df):
    return df[df["amount"].isin(-df["amount"])]

df1 = df1.groupby("id").apply(match_pos_neg).reset_index(0, drop=True)
df1
id name amount
1 1 aad -12
2 1 ada 12
3 1 ada -12
4 2 dddd 5
5 2 fdd -5
10 4 aaa -10
11 4 aar 10
12 4 rrp -10
Next I want to keep only the pairs of matching positive and negative amounts that also have the highest similarity in the string column 'name'. So if an id has two negative numbers matching a positive one, I want to isolate the pair with the highest similarity per id. My desired output is:
id name amount
2 1 ada 12
3 1 ada -12
4 2 dddd 5
5 2 fdd -5
10 4 aaa -10
11 4 aar 10
I guess I have to use some type of string similarity measure like SequenceMatcher or Jaccard, but I am not sure how to go about this. Any help on how to get my desired output would be very much appreciated.
You can try something like this. Note that you can change what information is printed as you wish; you just need to edit the return values of the function create_sim.
import pandas as pd
from operator import itemgetter

d = {'id': [1,1,1,1,2,2,3,3,3,4,4,4,4],
     'name': ['ada','aad','ada','ada','dddd','fdd','ccc','cccd','ood','aaa','aaa','aar','rrp'],
     'amount': [2,-12,12,-12,5,-5,2,3,-5,3,-10,10,-10]}
df1 = pd.DataFrame(d)

def match_pos_neg(df):
    return df[df["amount"].isin(-df["amount"])]

df1 = df1.groupby("id").apply(match_pos_neg).reset_index(0, drop=True)
print(df1)

def split(word):
    return [char for char in word]

def DistJaccard(str1, str2):
    # Jaccard similarity on the sets of characters of both strings
    l1 = set(split(str1))
    l2 = set(split(str2))
    return float(len(l1 & l2)) / len(l1 | l2)

def create_sim(df, idx):
    idx_id = df['id'].values[idx]
    idx_amount = df['amount'].values[idx]
    idx_name = df['name'].values[idx]
    df_t = df.loc[df['id'] == idx_id]
    pos = [i for i in list(df_t['amount']) if i > 0] or None
    neg = [i for i in list(df_t['amount']) if i < 0] or None
    if pos and neg:
        l = [x for x in list(df_t['amount']) if x == idx_amount * -1]
        if len(l) > 0:
            # restrict the candidates to rows of the same id with the negated amount
            df_t = df_t.loc[df_t['amount'] == idx_amount * -1]
            compare_list = list(df_t['name'])
            list_results = []
            for item in compare_list:
                sim = DistJaccard(idx_name, item)
                # keep the id as well, so the caller can locate the matched row
                list_results.append((item, sim, idx_id))
            return max(list_results, key=itemgetter(1))
    return None

count = 0
for index, row in df1.iterrows():
    res = create_sim(df1, count)
    if res:
        print(f"The most similar word of {row['name']} is {res[0]} with similarity of {res[1]}")
    else:
        print(f"No similar words of {row['name']}")
    count += 1
Edit:
In order to make a DF with the results you can change it to this:
count = 0
item1_id = []
item1_row = []
item1_name = []
item2_id = []
item2_row = []
item2_name = []
for index, row in df1.iterrows():
    res = create_sim(df1, count)
    item1_id.append(row['id'])
    item1_row.append(count)
    item1_name.append(row['name'])
    if res:
        row_idx = df1.loc[(df1['id'] == res[2]) & (df1['name'] == res[0]) & (df1['amount'] != row['amount']), "name"].index.tolist()
        item2_id.append(row['id'])
        item2_row.append(row_idx[0])
        item2_name.append(res[0])
    else:
        item2_id.append(None)
        item2_row.append(None)
        item2_name.append(None)
    count += 1

final = pd.DataFrame(item1_id, columns=['item 1 id'])
final['item 1 row'] = item1_row
final['item 1 name'] = item1_name
final['item 2 id'] = item2_id
final['item 2 row'] = item2_row
final['item 2 name'] = item2_name
print(final)
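If you would rather use the standard library, difflib's SequenceMatcher (which the asker mentioned) can replace the character-set Jaccard measure; a minimal sketch of a drop-in substitute for DistJaccard:
from difflib import SequenceMatcher

def DistSequenceMatcher(str1, str2):
    # ratio() returns a similarity in [0, 1]; unlike character-set
    # Jaccard it is sensitive to character order and repetition
    return SequenceMatcher(None, str1, str2).ratio()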
So I made this dataframe:
import numpy as np
import pandas as pd

alp = "abcdefghijklmnopqrstuvwxyz0123456789"
s = "carl"
# remove the letters of s from alp so each character appears exactly once
for i in s:
    alp = alp.replace(i, "")
jaa = s + alp
x = list(jaa)
array = np.array(x)
re = np.reshape(array, (6, 6))
dt = pd.DataFrame(re)
dt.columns = [1, 2, 3, 4, 5, 6]
dt.index = [1, 2, 3, 4, 5, 6]
dt
1 2 3 4 5 6
1 c a r l b d
2 e f g h i j
3 k m n o p q
4 s t u v w x
5 y z 0 1 2 3
6 4 5 6 7 8 9
I want to search for a value and print its row (index) and column. For example, for 'h' the output I want is 2, 4. Is there any way to get that output?
row, col = np.where(dt == "h")
print(dt.index[row[0]], dt.columns[col[0]])
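np.where returns the positions of every match, so if a value can occur more than once you can iterate over all of them; a small sketch:
row, col = np.where(dt == "h")
for r, c in zip(row, col):
    print(dt.index[r], dt.columns[c])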
Hello, I have a list of tuples such as:
indexes_to_delete = ((6,9),(20,22),(2,4))
and a sequence that I can open using Biopython:
Sequence1 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
and from the indexes_to_delete list I would like to remove the parts from:
6 to 9
20 to 22
and
2 to 4
so if I follow these coordinates I should have a new sequence:
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
so if I remove the coordinates I get:
A E J K L M N O P Q R S W X Y Z
1 5 10 11 12 13 14 15 16 17 18 19 23 24 25 26
indexes_to_delete = ((6,9),(20,22),(2,4))
Sequence1 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

s = ''.join(ch for i, ch in enumerate(Sequence1, 1)
            if not any(a <= i <= b for a, b in indexes_to_delete))
print(s)
Prints:
AEJKLMNOPQRSWXYZ
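If you also need the surviving 1-based coordinates (as in the expected output), the same condition can be reused; a small sketch:
kept = [(i, ch) for i, ch in enumerate(Sequence1, 1)
        if not any(a <= i <= b for a, b in indexes_to_delete)]
print(' '.join(ch for _, ch in kept))     # A E J K L M N O P Q R S W X Y Z
print(' '.join(str(i) for i, _ in kept))  # 1 5 10 11 12 13 14 15 16 17 18 19 23 24 25 26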
Here is another approach using several modules (intspan is a third-party package: pip install intspan).
from string import ascii_uppercase
from intspan import intspan
from operator import itemgetter
indexes_to_delete = ((6,9),(20,22),(2,4))

# add a dummy 'a' so counting begins with 1 for the uppercase letters
array = ['a'] + list(ascii_uppercase)

indexes_to_keep = intspan.from_ranges(indexes_to_delete).complement(low=1, high=26)
slice_of = itemgetter(*indexes_to_keep)

print(' '.join(slice_of(array)))
print(' '.join(map(str, indexes_to_keep)))
Prints:
A E J K L M N O P Q R S W X Y Z
1 5 10 11 12 13 14 15 16 17 18 19 23 24 25 26
def delete_indexes(sequence, indexes_to_delete):
    # first convert the sequence to a dictionary keyed by 1-based position
    seq_dict = {i+1: sequence[i] for i in range(len(sequence))}
    # collect all the keys that need to be removed
    keys_to_delete = []
    for index_range in indexes_to_delete:
        start, end = index_range
        keys_to_delete += range(start, end+1)
    if not keys_to_delete:
        return seq_dict
    # remove the keys from the original dictionary
    for key in keys_to_delete:
        seq_dict.pop(key)
    return seq_dict
You can use this function to get the new sequence.
new_sequence = delete_indexes(Sequence1, indexes_to_delete)
Of course, new_sequence is still a Python dictionary. You can convert it to a list, a str, or whatever you need. For example, to convert it into a str like the old Sequence1:
print(''.join(list(new_sequence.values())))
AEJKLMNOPQRSWXYZ
You can get their coordinates using new_sequence.keys().
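For instance, to print the coordinates in the same layout as above:
print(' '.join(map(str, new_sequence.keys())))  # 1 5 10 11 12 13 14 15 16 17 18 19 23 24 25 26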
A bit more readable version, using a for-else loop (the else branch runs only when the inner loop did not break):
indexes_to_delete = ((6,9),(20,22),(2,4))
Sequence1 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

newSequence1 = ""
for idx, char in enumerate(Sequence1):
    for startIndex, endIndex in indexes_to_delete:
        if startIndex <= idx+1 <= endIndex:
            break
    else:
        newSequence1 += char
print(newSequence1)
Prints: AEJKLMNOPQRSWXYZ
I'm trying to extract tables from log files which are in .txt format. The file is loaded using read_csv() from pandas.
The log file looks like this:
aaa
bbb
ccc
=====================
A B C D E F
=====================
1 2 3 4 5 6
7 8 9 1 2 3
4 5 6 7 8 9
1 2 3 4 5 6
---------------------
=====================
G H I J
=====================
1 3 4
5 6 7
---------------------
=====================
K L M N O
=====================
1 2 3
4 5 6
7 8 9
---------------------
xxx
yyy
zzz
Here are some points about the log file:
Files start and end with some lines of comment which can be ignored.
In the example above there are three tables.
Headers for each table are located between lines of "======..."
The end of each table is signified by a line of "------..."
My code as of now:
import pandas as pd
import itertools

df = pd.read_csv("xxx.txt", sep="\n", header=None)

# delimiters for header and end-of-table
h_dl = "=" * 21
r_dl = "-" * 21

for i in range(len(df.index)-2):
    # if block to find lines which are table headers & convert to list
    if (df.iloc[i].any() == h_dl) & (df.iloc[i+2].any() == h_dl):
        h = df.iloc[i+1].str.split().tolist()
        h = list(itertools.chain(*h))
        # while loop to find lines which are table rows & append to one list
        x = 3
        r = []
        while True:
            if df.iloc[i+x].any() == r_dl:
                break
            r.append(df.iloc[i+x].str.split().tolist())
            x += 1
        r = list(itertools.chain(*r))
        # create pandas dataframe with header and rows obtained above
        t = pd.DataFrame(data=r, columns=h)
This code returns AssertionError: 14 columns passed, passed data had 15 columns. I know this is because, for the table rows, I am using .str.split(), which by default splits on any whitespace. Since some columns have missing values, the number of header elements does not match the number of row elements for the second and third tables. I am struggling to get around this, since the number of whitespace characters signifying a missing value differs from table to table.
My question is: is there a way to account for missing values in some of the columns, so that the output is a DataFrame with null or NaN for missing values as appropriate?
Using Victor Ruiz's method, I added if conditions to handle different header sizes.
Description in code:
import re
import pandas as pd
import itertools

df = pd.read_csv("stack.txt", sep="\n", header=None)

# delimiters for header and end-of-table
h_dl = "=" * 21
r_dl = "-" * 21

for i in range(len(df.index)-2):
    # if block to find lines which are table headers & convert to list
    if (df.iloc[i].any() == h_dl) & (df.iloc[i+2].any() == h_dl):
        h = df.iloc[i+1].str.split().tolist()
        h = list(itertools.chain(*h))
        # get header string
        head = df.iloc[i+1].to_string()
        # get the space distance in the header
        space_range = 0
        for result in re.findall(r'([ ]*)', head):
            if len(result) > 0:
                space_range = len(result)
        x = 3
        r = []
        while True:
            if df.iloc[i+x].any() == r_dl:
                break
            # strip the line
            line = df.iloc[i+x].to_string()[5::]
            # collect items based on the distance between elements
            items = []
            for result in re.finditer(r'(\d+)([ ]*)', line):
                item, delimiter = result.groups()
                items.append(item)
                if len(delimiter) > space_range*2+1:
                    items.append('NaN')
                    items.append('NaN')
                if len(delimiter) < space_range*2+2 and len(delimiter) > space_range:
                    items.append('NaN')
            r.append([items])
            x += 1
        r = list(itertools.chain(*r))
        # create pandas dataframe with header and rows obtained above
        t = pd.DataFrame(data=r, columns=h)
Output:
A B C D E F
0 1 2 3 4 5 6
1 7 8 9 1 2 3
2 4 5 6 7 8 9
3 1 2 3 4 5 6
G H I J
0 1 NaN 3 4
1 5 NaN 6 7
K L M N O
0 1 NaN NaN 2 3
1 4 5 NaN NaN 6
2 7 8 NaN 9 None
Maybe this can help you.
Suppose we have the following line of text:
1          3    4
The problem is to identify how many spaces delimit two consecutive items when there is no missing value between them.
Let's say that 5 spaces is a delimiter, and more than 5 indicates a missing value.
You can use regex to parse the items:
from re import finditer
from math import nan

line = '1          3    4'
items = []
for result in finditer(r'(\d+)([ ]*)', line):
    item, delimiter = result.groups()
    items.append(item)
    if len(delimiter) > 5:
        items.append(nan)
print(items)
Output is:
['1', nan, '3', '4']
A more complex situation arises if two or more consecutive missing values can appear (the code above injects only one nan).
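A hedged sketch of that generalization, assuming columns have a fixed field width so the number of missing values can be estimated from the gap length (field_width = 6 is my assumption, not part of the original answer):
from re import finditer
from math import nan

line = '1                3    4'  # two values missing between '1' and '3'
field_width = 6  # assumed: one column occupies roughly 6 characters
items = []
for result in finditer(r'(\d+)([ ]*)', line):
    item, delimiter = result.groups()
    items.append(item)
    # every full field width of trailing spaces is read as one missing value
    items.extend([nan] * (len(delimiter) // field_width))
print(items)  # ['1', nan, nan, '3', '4'] for this line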
I have a dataframe like this:
ID Values
1 10, 11, 12, 13
2 14
3 15, 16, 17, 18
I want to create a new dataframe like this:
ID Col1 Col2
1 10 11
1 11 12
1 12 13
2 14
3 15 16
3 16 17
3 17 18
How can I do this?
Note: the values in the Values column of the input df are of str type.
Use a list comprehension with flattening and one small change to the linked chunks recipe (if i > 0: becomes if i == 2:) so that one-element values are handled correctly:
import pandas as pd
from collections import deque

# https://stackoverflow.com/a/36586925
def chunks(iterable, chunk_size=2, overlap=1):
    # we'll use a deque to hold the values because it automatically
    # discards any extraneous elements if it grows too large
    if chunk_size < 1:
        raise Exception("chunk size too small")
    if overlap >= chunk_size:
        raise Exception("overlap too large")
    queue = deque(maxlen=chunk_size)
    it = iter(iterable)
    i = 0
    try:
        # start by filling the queue with the first group
        for i in range(chunk_size):
            queue.append(next(it))
        while True:
            yield tuple(queue)
            # after yielding a chunk, get enough elements for the next chunk
            for i in range(chunk_size - overlap):
                queue.append(next(it))
    except StopIteration:
        # if the iterator is exhausted, yield any remaining elements
        i += overlap
        if i == 2:
            yield tuple(queue)[-i:]

L = [[x] + list(z) for x, y in zip(df['ID'], df['Values']) for z in chunks(y.split(', '))]
df = pd.DataFrame(L, columns=['ID','Col1','Col2']).fillna('')
print(df)
ID Col1 Col2
0 1 10 11
1 1 11 12
2 1 12 13
3 2 14
4 3 15 16
5 3 16 17
6 3 17 18
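To see what the modified generator yields on its own, a quick check on both input shapes (values taken from the sample df):
print(list(chunks(['10', '11', '12', '13'])))  # [('10', '11'), ('11', '12'), ('12', '13')]
print(list(chunks(['14'])))                    # [('14',)]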
I tried a slightly different approach: I created a function that returns the numbers in pairs from the initial comma-separated string.
def pairup(mystring):
    """Return a list of consecutive pairs from a comma-separated string."""
    mylist = [s.strip() for s in mystring.split(',')]
    if len(mylist) == 1:
        return [mylist]
    splitlist = []
    for index, item in enumerate(mylist):
        try:
            splitlist.append([mylist[index], mylist[index+1]])
        except IndexError:
            pass
    return splitlist
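A quick check of the helper on both input shapes (one value and several):
print(pairup('14'))              # [['14']]
print(pairup('10, 11, 12, 13'))  # [['10', '11'], ['11', '12'], ['12', '13']]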
Now let's create the new data frame.
# https://stackoverflow.com/a/39955283/3679377
new_df = df[['ID']].join(
    df.Values.apply(lambda x: pd.Series(pairup(x)))
      .stack()
      .apply(lambda x: pd.Series(x))
      .fillna("")
      .reset_index(level=1, drop=True),
    how='left').reset_index(drop=True)
new_df.columns = ['ID', 'Col 1', 'Col 2']
Here's the output of print(new_df):
ID Col 1 Col 2
0 1 10 11
1 1 11 12
2 1 12 13
3 2 14
4 3 15 16
5 3 16 17
6 3 17 18