Remove part of a string with coordinates in Python

Hello, I have a list of tuples such as:
indexes_to_delete=((6,9),(20,22),(2,4))
and a sequence that I can open using Biopython:
Sequence1 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
From the indexes_to_delete file I would like to remove the parts from:
6 to 9
20 to 22
and
2 to 4
so if I follow these coordinates I should get a new_sequence:
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
so if I remove those coordinates I get:
A E J K L M N O P Q R S W X Y Z
1 5 10 11 12 13 14 15 16 17 18 19 23 24 25 26

indexes_to_delete=((6,9),(20,22),(2,4))
Sequence1 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
s = ''.join(ch for i, ch in enumerate(Sequence1, 1) if not any(a <= i <= b for a, b in indexes_to_delete))
print(s)
Prints:
AEJKLMNOPQRSWXYZ
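If the sequence actually comes in as a Biopython object rather than a plain string, the same one-liner works once you turn it into text; a minimal sketch, assuming Biopython is installed (the variable names are just illustrative):
from Bio.Seq import Seq

Sequence1 = Seq("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
s = ''.join(ch for i, ch in enumerate(str(Sequence1), 1)
            if not any(a <= i <= b for a, b in indexes_to_delete))
print(s)  # AEJKLMNOPQRSWXYZ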

Here is another approach using several modules.
from string import ascii_uppercase
from intspan import intspan
from operator import itemgetter
indexes_to_delete=((6,9),(20,22),(2,4))
# add dummy 'a' so count begins with 1 for uppercase letters
array = ['a'] + list(ascii_uppercase)
indexes_to_keep = intspan.from_ranges(indexes_to_delete).complement(low = 1, high=26)
slice_of = itemgetter(*indexes_to_keep)
print(' '.join(slice_of(array)))
print(' '.join(map(str,indexes_to_keep)))
Prints:
A E J K L M N O P Q R S W X Y Z
1 5 10 11 12 13 14 15 16 17 18 19 23 24 25 26
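If you would rather not pull in intspan, the same complement can be computed with a plain set; a quick sketch reusing the array defined above:
to_delete = {i for a, b in indexes_to_delete for i in range(a, b + 1)}
indexes_to_keep = [i for i in range(1, 27) if i not in to_delete]
print(' '.join(array[i] for i in indexes_to_keep))
print(' '.join(map(str, indexes_to_keep)))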

def delete_indexes(sequence, indexes_to_delete):
    # first convert the sequence to a dictionary
    seq_dict = {i+1: sequence[i] for i in range(len(sequence))}
    # collect all the keys that need to be removed
    keys_to_delete = []
    for index_range in indexes_to_delete:
        start, end = index_range
        keys_to_delete += range(start, end+1)
    if not keys_to_delete:
        return seq_dict
    # remove the keys from the original dictionary
    for key in keys_to_delete:
        seq_dict.pop(key)
    return seq_dict
You can use this function to get the new sequence.
new_sequence = delete_indexes(Sequence1, indexes_to_delete)
Of course, new_sequence is still a Python dictionary. You can convert it to a list or str, or whatever you like. For example, to convert it into a str like the old Sequence1:
print(''.join(list(new_sequence.values())))
Out[7]:
AEJKLMNOPQRSWXYZ
You can get their coordinates using new_sequence.keys().
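For instance, a quick check of the remaining coordinates:
print(list(new_sequence.keys()))
# [1, 5, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23, 24, 25, 26]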

A bit more readable version:
indexes_to_delete=((6,9),(20,22),(2,4))
Sequence1 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
newSequence1 = ""
for idx, char in enumerate(Sequence1):
    for startIndex, endIndex in indexes_to_delete:
        if startIndex <= idx+1 <= endIndex:
            break
    else:
        newSequence1 += char
print(newSequence1)
Prints: AEJKLMNOPQRSWXYZ
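For completeness, here is one more sketch (not from the answers above) that cuts the ranges out with plain slicing; it assumes the 1-based ranges do not overlap, so processing them from right to left keeps the earlier coordinates valid:
def remove_ranges(seq, ranges):
    # delete the rightmost range first so earlier coordinates stay correct
    for start, end in sorted(ranges, reverse=True):
        seq = seq[:start - 1] + seq[end:]
    return seq

print(remove_ranges(Sequence1, indexes_to_delete))  # AEJKLMNOPQRSWXYZ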

Related

Indexing in Polybius cipher produce error in python

I'm making a polybius cipher. So I made a table to convert with the keyword tomato
alp = "abcdefghijklmnopqrstuvwxyz0123456789"
s = str(input("keyword: "))
for i in s:
alp = alp.replace(i,"")
s2 = "".join(dict.fromkeys(s))
jaa = s2+alp
x = list(jaa)
array = np.array(x)
re = np.reshape(array,(6,6))
dt = pd.DataFrame(re)
dt.columns = [1,2,3,4,5,6]
dt.index = [1,2,3,4,5,6]
dt
1 2 3 4 5 6
1 t o m a b c
2 d e f g h i
3 j k l n p q
4 r s u v w x
5 y z 0 1 2 3
6 4 5 6 7 8 9
I want to translate poly with this code
poly = '25 34 14 12 35 22 43 21 25 34 24 33 51 23 12 25 13 34 22'
a = poly.split(" ")
for i in range (len(a)):
    hur = a[i]
    w = dt._get_value(hur[0],hur[1])
    print(w)
But I get KeyError: '5'. I've tried to get the value with (2,5) and the output is good, but I can't run it with the indexing. Which part is missing?
It's because hur[0] and hur[1] are strings, not integers.
You need to do:
for hur in a:
    w = dt._get_value(int(hur[0]),int(hur[1]))
    print(w, end="") # end="" will print it as one text instead of over multiple lines
Note that your poly has a double space which will mess up the split method.
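For example, splitting on whitespace rather than on a single space skips the empty string that a double space would otherwise produce (a small illustration, not part of the original code):
poly = '25 34  14'          # note the double space
print(poly.split(" "))      # ['25', '34', '', '14']  (the empty string breaks the lookup)
print(poly.split())         # ['25', '34', '14']
# dt.loc[int(hur[0]), int(hur[1])] is a public alternative to the private dt._get_value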

How to combine some rows into a single row

Sorry, I should delete the old question, and create the new one.
I have a dataframe with two columns. The df looks as follows:
Word Tag
0 Asam O
1 instruksi O
2 - O
3 instruksi X
4 bahasa Y
5 Instruksi P
6 - O
7 instruksi O
8 sebuah Q
9 satuan K
10 - L
11 satuan O
12 meja W
13 Tiap Q
14 - O
15 tiap O
16 karakter P
17 - O
18 ke O
19 - O
20 karakter O
and I'd like to merge the rows that contain a dash - into one row, so the output should be the following:
Word Tag
0 Asam O
1 instruksi-instruksi O
2 bahasa Y
3 Instruksi-instruksi P
4 sebuah Q
5 satuan-satuan K
6 meja W
7 Tiap-tiap Q
8 karakter-ke-karakter P
Any ideas? Thanks in advance. I have tried the answer from Jacob K and it works, but then I found that in my dataset there can be more than one - row in between. I have put that case in the expected output, at index number 8.
Solution from Jacob K:
# Import packages
import pandas as pd
import numpy as np
# Get 'Word' and 'Tag' columns as numpy arrays (for easy indexing)
words = df.Word.to_numpy()
tags = df.Tag.to_numpy()
# Create empty lists for new colums in output dataframe
newWords = []
newTags = []
# Use while (rather than for loop) since index i can change dynamically
i = 0 # To not cause any issues with i-1 index
while (i < words.shape[0] - 1):
    if (words[i] == "-"):
        # Concatenate the strings above and below the "-"
        newWords.append(words[i-1] + "-" + words[i+1])
        newTags.append(tags[i-1])
        i += 2 # Don't repeat any concatenated values
    else:
        if (words[i+1] != "-"):
            # If there is no "-" next, append the regular word and tag values
            newWords.append(words[i])
            newTags.append(tags[i])
        i += 1 # Increment normally
# Create output dataframe output_df
d2 = {'Word': newWords, 'Tag': newTags}
output_df = pd.DataFrame(data=d2)
My approach with GroupBy.agg:
#df['Word'] = df['Word'].str.replace(' ', '') #if necessary
# a new block id starts on each row where neither it nor the previous row is '-'
blocks = df['Word'].shift().ne('-').mul(df['Word'].ne('-')).cumsum()
new_df = df.groupby(blocks, as_index=False).agg({'Word' : ''.join, 'Tag' : 'first'})
print(new_df)
Output
Word Tag
0 Asam O
1 instruksi-instruksi O
2 bahasa Y
3 Instruksi-instruksi P
4 sebuah Q
5 satuan-satuan K
6 meja W
7 Tiap-tiap Q
8 karakter-ke-karakter P
Blocks (Detail)
print(blocks)
0 1
1 2
2 2
3 2
4 3
5 4
6 4
7 4
8 5
9 6
10 6
11 6
12 7
13 8
14 8
15 8
16 9
17 9
18 9
19 9
20 9
Name: Word, dtype: int64
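To see how the blocks expression builds its group ids, here is a tiny illustration on a made-up series (not part of the original data):
w = pd.Series(['Asam', 'instruksi', '-', 'instruksi', 'bahasa'])
print(w.shift().ne('-'))                           # previous row is not '-'
print(w.ne('-'))                                   # current row is not '-'
print(w.shift().ne('-').mul(w.ne('-')).cumsum())   # a new id starts only when both hold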
This is a loop version:
import pandas as pd
# import data
DF = pd.read_csv("table.csv")
# creates a new DF
newDF = pd.DataFrame()
# iterate through rows
for i in range(len(DF)-1):
    # prepare the index of the previous row (special case for the first row)
    prev = i-1
    if (prev < 0):
        prev = 0
    # copy the row if neither it, the previous row, nor the next row is '-'
    if (DF.loc[i+1, 'Word'] != '-'):
        if (DF.loc[i, 'Word'] != '-' and DF.loc[prev, 'Word'] != '-'):
            newDF = newDF.append(DF.loc[i, :])
    # unite the three rows if the middle one is '-'
    else:
        row = {'Tag': [DF.loc[i, 'Tag']], 'Word': [DF.loc[i, 'Word']+DF.loc[i+1, 'Word']+DF.loc[i+2, 'Word']]}
        newDF = newDF.append(pd.DataFrame(row))
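Note that DataFrame.append was removed in pandas 2.0, so on a current pandas the loop above needs a small change; one workable sketch (same logic, same DF) is to collect the rows in a list and concatenate once at the end:
rows = []
for i in range(len(DF)-1):
    prev = max(i-1, 0)
    if DF.loc[i+1, 'Word'] != '-':
        if DF.loc[i, 'Word'] != '-' and DF.loc[prev, 'Word'] != '-':
            rows.append(DF.loc[[i], ['Word', 'Tag']])
    else:
        rows.append(pd.DataFrame({'Word': [DF.loc[i, 'Word'] + DF.loc[i+1, 'Word'] + DF.loc[i+2, 'Word']],
                                  'Tag': [DF.loc[i, 'Tag']]}))
newDF = pd.concat(rows, ignore_index=True)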

An algorithm for efficiently replacing an array elements with groups

There are N elements, each element has its own cost. And there are M groups. Each group includes several indices of elements from the array and has its own cost.
input for example
6
100 5
200 5
300 5
400 5
500 5
600 3
2
4 6
100 200 300 700
3 5
300 400 500
The first number N is the number of elements. The next N lines contain the index and cost of a particular item. Then comes the number M (the number of groups). After that come 2*M lines: for each group, one line with the number of elements in the group and the cost of the group itself, followed by one line with the indices of its elements.
I want to find the minimum cost for which can purchase all N items.
In the example, it is most advantageous to take both groups and purchase an element with the number 600 separately. The answer is 14. (6+5+3)
Here is my solution
from queue import PriorityQueue

N = int(input())
dct = {}
groups = PriorityQueue()
for i in range(N):
    a,c = [int(j) for j in input().split()]
    dct[a] = c
M = int(input())
for i in range(M):
    k,c = [int(j) for j in input().split()]
    s = 0
    tmp = []
    for j in input().split():
        j_=int(j)
        if j_ in dct:
            s+=dct[j_]
            tmp.append(j_)
    d = c-s
    if d<0:
        groups.put([d, c, tmp])
s = 0
while not groups.empty():
    #print(dct)
    #for i in groups.queue:
    #    print(i)
    g = groups.get()
    if g[0]>0:
        break
    #print('G',g)
    #print('-------')
    for i in g[2]:
        if i in dct:
            del(dct[i])
    s += g[1]
    groups_ = PriorityQueue()
    for i in range(len(groups.queue)):
        g_ = groups.get()
        s_ = 0
        tmp_ = []
        for i in g_[2]:
            if i in dct:
                s_+=dct[i]
                tmp_.append(i)
        d = g_[1]-s_
        groups_.put([d, g_[1], tmp_])
    groups = groups_
for i in dct:
    s+=dct[i]
print(s)
But it is not completely correct.
For example, for the following test it gives an answer of 162, but the correct answer is 160: it is most beneficial to take only the first and second groups and buy the element with index 0 separately.
20
0 24
1 32
2 33
3 57
4 57
5 50
6 50
7 41
8 2
9 73
10 81
11 73
12 55
13 3
14 54
15 43
16 98
17 8
18 41
19 97
5
17 61
17 9 11 15 1 13 14 7 20 2 3 16 12 5 8 4 6
13 75
20 15 5 9 10 11 7 8 18 2 4 19 16
10 96
3 9 4 18 11 6 8 5 2 14
9 92
18 1 6 9 19 8 4 16 10
19 77
14 17 18 3 2 4 7 6 8 9 10 20 13 12 15 19 1 16 5
I also tried brute-force search, but such a solution would be too slow
from itertools import combinations

N = int(input())
dct = {}
s = 0
for i in range(N):
    a,c = [int(j) for j in input().split()]
    dct[a] = c
    s += c
m = s
M = int(input())
groups = []
for i in range(M):
    k,c = [int(j) for j in input().split()]
    s = 0
    tmp = []
    for j in input().split():
        j_=int(j)
        if j_ in dct:
            s+=dct[j_]
            tmp.append(j_)
    groups.append( [c, tmp] )
for u in range(1,M+1):
    for i in list(combinations(groups, u)):
        s = 0
        tmp = dct.copy()
        for j in i:
            s += j[0]
            for t in j[1]:
                if t in tmp:
                    del(tmp[t])
        for j in tmp:
            s += tmp[j]
        #print(i,s)
        if s < m:
            m = s
print(m)
I think that this problem is solved with the help of dynamic programming. Perhaps this is some variation of the typical Knapsack problem. Tell me which algorithm is better to use.
The so-called set cover problem (which is NP-hard) seems to be a special case of your problem. Therefore, I am afraid there is no efficient algorithm that solves it.
As already stated, this is a hard problem for which no "efficient" algorithm exists.
You can approach this as a graph problem, where the nodes of the graph are all possible combinations of groups (where each element on its own is also a group). Two nodes u and v are connected with a directed edge when there is a group g such that the union of the keys in u and in g corresponds to the set of keys in v.
Then perform a Dijkstra search in this graph, starting from the node that represents the state where no groups are selected at all (cost 0, no keys). This search will minimise the cost, and you can use the extra optimisation that a group g is never considered twice in the same path. As soon as a state (node) is visited that covers all the keys, you can exit the algorithm -- typical for the Dijkstra algorithm -- as this represents the minimal cost to cover all the keys.
Such an algorithm is still quite costly, as at each addition of an edge to a path, a union of keys must be calculated. And quite some memory is needed to keep all states in the heap.
Here is a potential implementation:
from collections import namedtuple
import heapq

# Some named tuple types, to make the code more readable
Group = namedtuple("Group", "cost numtodo keys")
Node = namedtuple("Node", "cost numtodo keys nextgroupid")

def collectinput():
    inputNumbers = lambda: [int(j) for j in input().split()]
    groups = []
    keys = []
    N, = inputNumbers()
    for i in range(N):
        key, cost = inputNumbers()
        keys.append(key)
        # Consider these atomic keys also as groups (with one key)
        # The middle element of this tuple may seem superfluous, but it improves sorting
        groups.append(Group(cost, N-1, [key]))
    keys = set(keys)
    M, = inputNumbers()
    for i in range(M):
        cost = inputNumbers()[-1]
        groupkeys = [key for key in inputNumbers() if key in keys]
        groups.append(Group(cost, N-len(groupkeys), groupkeys))
    return keys, groups

def solve(keys, groups):
    N = len(keys)
    groups.sort() # sort by cost, if equal, by number of keys left
    # The starting node of the graph search
    heap = [Node(0, N, [], 0)]
    while len(heap):
        node = heapq.heappop(heap)
        if node.numtodo == 0:
            return node.cost
        for i in range(node.nextgroupid, len(groups)):
            group = groups[i]
            unionkeys = list(set(node.keys + group.keys))
            if len(unionkeys) > len(node.keys):
                heapq.heappush(heap, Node(node.cost + group.cost, N-len(unionkeys), unionkeys, i+1))

# Main
keys, groups = collectinput()
cost = solve(keys, groups)
print("solution: {}".format(cost))
This outputs 160 for the second problem you posted.
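Since the structure is essentially weighted set cover, another practical route (not shown in the answers above) is to hand the whole choice to an integer-programming solver; here is a sketch using the PuLP package (assuming pip install pulp, and with illustrative names):
import pulp

def min_total_cost(item_costs, group_offers):
    # item_costs: {index: cost}; group_offers: list of (group_cost, [indices])
    prob = pulp.LpProblem("cover", pulp.LpMinimize)
    buy_item = {i: pulp.LpVariable("item_%d" % i, cat="Binary") for i in item_costs}
    buy_group = [pulp.LpVariable("group_%d" % g, cat="Binary") for g in range(len(group_offers))]
    # objective: cost of individually bought items plus cost of chosen groups
    prob += (pulp.lpSum(c * buy_item[i] for i, c in item_costs.items())
             + pulp.lpSum(group_offers[g][0] * buy_group[g] for g in range(len(group_offers))))
    # every item must be bought on its own or covered by at least one chosen group
    for i in item_costs:
        covering = [buy_group[g] for g, (_, idxs) in enumerate(group_offers) if i in idxs]
        prob += buy_item[i] + pulp.lpSum(covering) >= 1
    prob.solve(pulp.PULP_CBC_CMD(msg=False))
    return int(pulp.value(prob.objective))

# first example from the question: answer should be 14
print(min_total_cost({100: 5, 200: 5, 300: 5, 400: 5, 500: 5, 600: 3},
                     [(6, [100, 200, 300, 700]), (5, [300, 400, 500])]))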

Extracting specific elements from a list of strings and creating a new list?

I am a beginner in python.
This my issue. I have a list as below
lst = ['UGAGGUAGUAGGUUGUAUAGUU', 'CUAUGCAAUUUUCUACCUUACC', 'UCCCUGAGACCUCAAGUGUGA',
'ACACCUGGGCUCUCCGGGUACC', 'CAUACUUCCUUACAUGCCCAUA', 'UGGAAUGUAAAGAAGUAUGUA',
'CAUCAAAGCGGUGGUUGAUGUG', 'UAUCACAGCCAGCUUUGAUGUGC', 'AGGCAGUGUGGUUAGCUGGUUG',
'ACGGCUACCUUCACUGCCACCC']
Now I need to extract the first letter from all 10 elements in lst and put them in a new list; similarly the second letter, third letter, and so forth, until the last letter is extracted from all ten elements and appended to the new list. The output has to look like this:
new_lst = ['UCUACUCUAA', 'GUCCAGAAGC', 'AACAUGUUGG', 'GUCCAACCCG', 'GGUCCAAAAC',
'UCGUUUACGU', 'AAAGUGAAUA', 'GAGGCUGGGC', 'UUAGCACCUC', 'AUCCUAGCGU', ..., 'C']
I tried this code:
new_lst = []
new_lst.append(''.join([x[i] for x in lst]))
The above code prints only the first 10 elements in the new_list because the index is from 0 to 9 (I misunderstood what index means).
Then I did the following
final= []
for j in range(1,len(lst),1):
    new_lst = []
    for x in lst:
        c = len(x)
        for i in range(1,c,1):
            while (i<len(x)):
                new_lst.append(x[i])
            else:
                new_lst.append("")
    final.append([new_lst])
print final
When I execute this code, it throws a memory error. The reason I checked the length is that the elements in lst are not all of the same length; when I was using a different piece of code it threw IndexError: string index out of range.
I first wanted to dissect the code, so I just used the following code:
lst2 = []
for x in lst:
    c = len (x)
    print c
    for i in range(0,c,1):
        print i,
        print x[i],
I got the following output:
22
0 U 1 G 2 A 3 G 4 G 5 U 6 A 7 G 8 U 9 A 10 G 11 G 12 U 13 U 14 G 15 U 16 A 17 U 18 A 19 G 20 U 21 U 22
0 C 1 U 2 A 3 U 4 G 5 C 6 A 7 A 8 U 9 U 10 U 11 U 12 C 13 U 14 A 15 C 16 C 17 U 18 U 19 A 20 C 21 C 21
0 U 1 C 2 C 3 C 4 U 5 G 6 A 7 G 8 A 9 C 10 C 11 U 12 C 13 A 14 A 15 G 16 U 17 G 18 U 19 G 20 A 22
0 A 1 C 2 A 3 C 4 C 5 U 6 G 7 G 8 G 9 C 10 U 11 C 12 U 13 C 14 C 15 G 16 G 17 G 18 U 19 A 20 C 21 C 22
0 C 1 A 2 U 3 A 4 C 5 U 6 U 7 C 8 C 9 U 10 U 11 A 12 C 13 A 14 U 15 G 16 C 17 C 18 C 19 A 20 U 21 A 21
0 U 1 G 2 G 3 A 4 A 5 U 6 G 7 U 8 A 9 A 10 A 11 G 12 A 13 A 14 G 15 U 16 A 17 U 18 G 19 U 20 A 22
0 C 1 A 2 U 3 C 4 A 5 A 6 A 7 G 8 C 9 G 10 G 11 U 12 G 13 G 14 U 15 U 16 G 17 A 18 U 19 G 20 U 21 G 23
0 U 1 A 2 U 3 C 4 A 5 C 6 A 7 G 8 C 9 C 10 A 11 G 12 C 13 U 14 U 15 U 16 G 17 A 18 U 19 G 20 U 21 G 22 C 22
0 A 1 G 2 G 3 C 4 A 5 G 6 U 7 G 8 U 9 G 10 G 11 U 12 U 13 A 14 G 15 C 16 U 17 G 18 G 19 U 20 U 21 G 22
0 A 1 C 2 G 3 G 4 C 5 U 6 A 7 C 8 C 9 U 10 U 11 C 12 A 13 C 14 U 15 G 16 C 17 C 18 A 19 C 20 C 21 C
As you can see above, the loop goes through the first element, but after extracting the first character from the first element in lst it moves on to the second character of that same element, whereas I wanted it to move on to the second element of lst. Also, the elements in the list have unequal lengths, so I am wondering if there is a way to avoid the IndexError: string index out of range?
I guess I am missing something; it might be too silly, sorry for being naive. If you could please suggest different methods to accomplish the job, it would be awesome. I checked online about using array from the module numpy, but is there a way to do this without numpy?
You can use itertools.zip_longest:
import itertools
[''.join(chars) for chars in itertools.zip_longest(*lst,fillvalue = '')]
output:
['UCUACUCUAA', 'GUCCAGAAGC', 'AACAUGUUGG', 'GUCCAACCCG', 'GGUCCAAAAC', 'UCGUUUACGU', 'AAAGUGAAUA', 'GAGGCUGGGC', 'UUAGCACCUC', 'AUCCUAGCGU', 'GUCUUAGAGU', 'GUUCAGUGUC', 'UCCUCAGCUA', 'UUACAAGUAC', 'GAACUGUUGU', 'UCGGGUUUCG', 'ACUGCAGGUC', 'UUGGCUAAGC', 'AUUUCGUUGA', 'GAGAAUGGUC', 'UCACUAUUUC', 'UCCAGGGC', 'C']
The built-in zip() as well as the itertools method zip_longest() in Python 3 (or, in Python 2, the itertools methods izip() and izip_longest()) are the tools of choice when you want to process two or more iterables (such as lists, strings, or generators) in parallel. To see the difference between zip() and zip_longest() consider the following:
for chars in zip('ABCD','EFG','HI'):
    print(chars)
print('')
for chars in itertools.zip_longest('ABCD','EFG','HI',fillvalue = ''):
    print(chars)
Output:
('A', 'E', 'H')
('B', 'F', 'I')

('A', 'E', 'H')
('B', 'F', 'I')
('C', 'G', '')
('D', '', '')
The first tuple produced is the tuple of the first elements, the second tuple produced is the tuple of the second elements, etc. zip() (or izip()) stops as soon as the shortest iterable is exhausted. In this case it can't return a tuple of the third characters since the 3rd input to zip lacks a third character. zip_longest() (or izip_longest()) allows a fillvalue to take the place of missing items in the shorter iterables once they are exhausted. Here I used the empty string since that disappears when the tuples are joined by ''.
In the above code I hardwired in 3 strings to zip_longest(). For your problem, you would have to explicitly enter 10 inputs, which would be tedious in the extreme, or use the unpacking operator *. If I have a list:
strings = ['ABCD','EFG', 'HI']
Then
for chars in itertools.zip_longest(*strings, fillvalue = ''):
is equivalent to
for chars in itertools.zip_longest('ABCD','EFG','HI',fillvalue = ''):
You will need to iterate through indices of the longest string:
lst = ['UGAGGUAGUAGGUUGUAUAGUU', 'CUAUGCAAUUUUCUACCUUACC',
'UCCCUGAGACCUCAAGUGUGA', 'ACACCUGGGCUCUCCGGGUACC',
'CAUACUUCCUUACAUGCCCAUA', 'UGGAAUGUAAAGAAGUAUGUA',
'CAUCAAAGCGGUGGUUGAUGUG', 'UAUCACAGCCAGCUUUGAUGUGC',
'AGGCAGUGUGGUUAGCUGGUUG', 'ACGGCUACCUUCACUGCCACCC']
max_len = max(len(x) for x in lst) # length of the longest string
new_lst = [ ''.join(x[i] for x in lst if i < len(x)) for i in range(max_len)]

Binning values into groups with a minimum size using pandas

I'm trying to bin a sample of observations into n discrete groups, then combine these groups until each subgroup has a minimum of 6 members. So far, I've generated bins, and grouped my DataFrame into them:
# df is a DataFrame containing 135 measurements
bins = np.linspace(df.heights.min(), df.heights.max(), 21)
grp = df.groupby(np.digitize(df.heights, bins))
grp.size()
1 4
2 1
3 2
4 3
5 2
6 8
7 7
8 6
9 19
10 12
11 13
12 12
13 7
14 12
15 12
16 2
17 3
18 6
19 3
21 1
So I can see that I need to combine groups 1 - 3, 3 - 5, and 16 - 21, while leaving the others intact, but I don't know how to do this programmatically.
You can do this:
df = pd.DataFrame(np.random.random_integers(1,200,135), columns=['heights'])
bins = np.linspace(df.heights.min(), df.heights.max(), 21)
grp = df.groupby(np.digitize(df.heights, bins))
sizes = grp.size()
def f(vals, max):
    sum = 0
    group = 1
    for v in vals:
        sum += v
        if sum <= max:
            yield group
        else:
            group +=1
            sum = v
            yield group
#I've changed 6 to 30 for the example because I don't have your original dataset
grp.size().groupby([g for g in f(sizes, 30)])
And if you do print grp.size().groupby([g for g in f(sizes, 30)]).cumsum() you will see that the cumulative sums are grouped as expected.
Also if you want to group the original values you can do something like:
import numpy as np
import pandas as pd

dat = np.random.random_integers(0,200,135)
dat = np.array([78,116,146,111,147,78,14,91,196,92,163,144,107,182,58,89,77,134,
                83,126,94,70,121,175,174,88,90,42,93,131,91,175,135,8,142,166,
                1,112,25,34,119,13,95,182,178,200,97,8,60,189,49,94,191,81,
                56,131,30,107,16,48,58,65,78,8,0,11,45,179,151,130,35,64,
                143,33,49,25,139,20,53,55,20,3,63,119,153,14,81,93,62,162,
                46,29,84,4,186,66,90,174,55,48,172,83,173,167,66,4,197,175,
                184,20,23,161,70,153,173,127,51,186,114,27,177,96,93,105,169,158,
                83,155,161,29,197,143,122,72,60])
df = pd.DataFrame({'heights':dat})
bins = np.digitize(dat,np.linspace(0,200,21))
grp = df.heights.groupby(bins)

m = 15 #you should put 6 here, the minimum
s = 0
c = 1
def f(x):
    global c,s
    res = pd.Series([c]*x.size,index=x.index)
    s += x.size
    if s>m:
        s = 0
        c += 1
    return res
g = grp.apply(f)
print df.groupby(g).size()

#another way of doing the same, just a matter of taste
m = 15 #you should put 6 here, the minimum
s = 0
c = 1
def f2(x):
    global c,s
    res = [c]*x.size #here is the main difference with f
    s += x.size
    if s>m:
        s = 0
        c += 1
    return res
g = grp.transform(f2) #call it this way
print df.groupby(g).size()
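A variation on the same idea without globals, as a rough sketch (the names merge_small_bins and min_size are mine, not from the answer above): walk the bin sizes in order, hand out a new label once the running count reaches the minimum, then relabel the original bins with that mapping.
def merge_small_bins(sizes, min_size=6):
    # sizes: the Series returned by grp.size(); returns {old_bin_id: merged_group_id}
    relabel = {}
    current, count = 1, 0
    for bin_id, n in sizes.items():
        relabel[bin_id] = current
        count += n
        if count >= min_size:   # enough members collected, start a new merged group
            current += 1
            count = 0
    return relabel              # note: the last merged group may still be undersized

mapping = merge_small_bins(grp.size(), min_size=6)
merged = df.heights.groupby(pd.Series(bins).map(mapping).values)
print(merged.size())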
