Find the indices and the number of occurrences of each character in a string - Python

A Char_Record is a 3 item list [char, total, pos_list] where
char is a one character string
total is a Nat representing the number of occurrences of char
pos_list is a list of Nat representing the indices of char
Using the function build_char_records() should produce a sorted list with every character represented (lowercase).
For example:
>>>build_char_records('Hi there bye')
[' ',2,[2,8]]
['b',1,[9]]
['e',3,[5,7,11]]
['h',2,[0,4]]
['i',1,[1]]
['r',1,[6]]
['t',1,[3]]
['y',1,[10]]
This is all I have written so far and I don't know how to continue. Could someone help, please? Thanks.
def build_char_records(s):
s=sorted(s)
a=[]
for i in range(len(s)):

I think that the other answers given thus far are better answers from an overall programming perspective, but based on your question I think this answer is appropriate for your skill level
def build_char_records(phrase):
    """Return the sorted list of Char_Records for ``phrase``.

    Each record is ``[char, total, pos_list]``: a lowercase character, the
    number of times it occurs, and the indices at which it occurs.

    >>> build_char_records('Aba')
    [['a', 2, [0, 2]], ['b', 1, [1]]]
    """
    phrase = phrase.lower()
    resultList = []
    for character in sorted(set(phrase)):  ## each unique character, in order
        ## one pass over the phrase collects every index of this character,
        ## so there is no need to chop the string up and re-adjust offsets
        locs = [index for index, current in enumerate(phrase)
                if current == character]
        resultList.append([character, len(locs), locs])
    return resultList  ## returns the list of lists


print(build_char_records('Hi there bye'))
This will print out [[' ', 2, [2, 8]], ['b', 1, [9]], ['e', 3, [5, 7, 11]], ['h', 2, [0, 4]], ['i', 1, [1]], ['r', 1, [6]], ['t', 1, [3]], ['y', 1, [10]]]
Here is a slightly shorter, cleaner version
def build_char_records(phrase):
    """Return the sorted list of ``[char, total, pos_list]`` records.

    The phrase is lowercased first; ``pos_list`` holds every index at which
    ``char`` occurs in the lowercased phrase.
    """
    phrase = phrase.lower()
    resultList = []
    for character in phrase:
        if character not in resultList:
            resultList.append(character)
    resultList.sort()
    for i, character in enumerate(resultList):
        num = phrase.count(character)
        locs = []
        start = 0  # where the next search begins in the full phrase
        for _ in range(num):
            # Searching from ``start`` returns absolute indices, so no
            # offset bookkeeping (the old undefined ``acc``) is needed.
            index = phrase.find(character, start)
            locs.append(index)
            start = index + 1
        resultList[i] = [character, num, locs]
    return resultList


print(build_char_records('Hi there bye'))

Using only list, this is what you can do:
def build_char_records(s):
    """Print and return the Char_Records of ``s`` using only lists.

    Records are ``[char, total, pos_list]`` with ``char`` lowercased, in
    order of first appearance. The list is returned as well as printed so
    callers can use (or sort) the result.
    """
    records = []  # Create a list to act as a dictionary
    for idx, char in enumerate(s):
        char = char.lower()
        for record in records:  # Try to find the record for this char
            if record[0] == char:  # We find it!
                current_record = record
                break  # Stop the search
        else:
            # The loop finished without ``break``: no record for this char yet
            current_record = [char, 0, []]
            records.append(current_record)
        current_record[1] += 1  # Increase the count by one
        current_record[2].append(idx)  # Remember the position of the character
    for value in records:  # Iterate over the Char_Records
        print(value)  # print() as a call works on Python 2 and 3
    return records
Or, if you need to sort it, you can do what @Dannnno said, or, as an example, it can be sorted in this way (although you might not have learned about lambda yet):
records.sort(key=lambda x: x[0])
Just put that before printing the records.
Or, you can do it using dict and list:
def build_char_records(s):
    """Print and return the Char_Records of ``s``, sorted by character.

    A dictionary maps each lowercased character to its
    ``[char, total, pos_list]`` record; the records are then emitted in
    sorted character order, as the task requires.
    """
    record = {}  # Maps a character to its Char_Record
    for idx, char in enumerate(s):  # Characters together with their position
        char = char.lower()  # Convert the character into lowercase
        # Fetch the record, creating it the first time the character is seen
        entry = record.setdefault(char, [char, 0, []])
        entry[1] += 1  # Increase the count by one
        entry[2].append(idx)  # Remember the position of the character
    ordered = [record[char] for char in sorted(record)]
    for value in ordered:  # Iterate over the Char_Records
        print(value)  # print() as a call works on Python 2 and 3
    return ordered

from collections import defaultdict
def build_char_records(s):
    """Return the sorted list of ``[char, total, pos_list]`` records for ``s``.

    Characters are lowercased before grouping and the records are sorted by
    character. Positions refer to indices in the original string.
    """
    positions = defaultdict(list)
    for i, c in enumerate(s):
        positions[c.lower()].append(i)
    # The count is simply the number of recorded positions.
    return [[c, len(positions[c]), positions[c]] for c in sorted(positions)]

Related

Count of sub-strings that contain character X at least once. E.g Input: str = “abcd”, X = ‘b’ Output: 6

This question was asked in an exam but my code (given below) passed just 2 cases out of 7 cases.
Input format: a single line of input, separated by a comma
Input: str = “abcd,b”
Output: 6
“ab”, “abc”, “abcd”, “b”, “bc” and “bcd” are the required sub-strings.
def slicing(s, k, n):
    """Return every window of length ``k`` taken from the first ``n`` characters of ``s``.

    Windows are listed left to right; the result is empty when ``k > n``.
    """
    window_count = n - k + 1
    return [s[start:start + k] for start in range(window_count)]
x, y = input().split(',')
n = len(x)
res1 = []
for i in range(1, n + 1):
res1 += slicing(x, i, n)
count = 0
for ele in res1:
if y in ele:
count += 1
print(count)
When the target string (ts) is found in the string S, you can compute the number of substrings containing that instance by multiplying the number of characters before the target by the number of characters after the target (plus one on each side).
This will cover all substrings that contain this instance of the target string leaving only the "after" part to analyse further, which you can do recursively.
def countsubs(S, ts):
    """Count the substrings of ``S`` that contain ``ts`` at least once.

    Each substring is attributed to the first occurrence of ``ts`` it
    contains: it may start anywhere after the previous occurrence's start
    and end anywhere from this occurrence's end onward.

    Raises ValueError when ``ts`` is empty. Runs iteratively, so strings
    with many occurrences do not hit the recursion limit.
    """
    if not ts:
        raise ValueError("empty separator")
    total = 0
    earliest_start = 0  # substrings counted next must start at or after this
    pos = S.find(ts)
    while pos != -1:
        starts = pos - earliest_start + 1      # choices for the left edge
        ends = len(S) - (pos + len(ts)) + 1    # choices for the right edge
        total += starts * ends
        # Later substrings must exclude this occurrence, i.e. start after it.
        earliest_start = pos + 1
        pos = S.find(ts, earliest_start)
    return total
print(countsubs("abcd","b")) # 6
This will work for single character and multi-character targets and will run much faster than checking all combinations of substrings one by one.
Here is a simple solution without recursion:
def my_function(s):
    """Report the substrings of the text in ``s`` that contain the target.

    ``s`` has the form ``"<text>,<target>"``. The split is done on the last
    comma so that the text itself may contain commas. Returns a summary
    string with the count and the matching substrings.
    """
    text, target = s.rsplit(',', 1)
    length = len(text)
    result = []
    for i in range(length):
        for j in range(i + 1, length + 1):
            ss = text[i:j]  # the substring starting at i and ending before j
            if target in ss:
                result.append(ss)
    return f'count = {len(result)}, substrings = {result}'
print(my_function("abcd,b"))
#count = 6, substrings = ['ab', 'abc', 'abcd', 'b', 'bc', 'bcd']
Here you go, this should help
from itertools import combinations
output = []
initial = input('Enter string and needed letter seperated by commas: ') #Asking for input
list1 = initial.split(',') #splitting the input into two parts i.e the actual text and the letter we want common in output
text = list1[0]
needle = list1[1] #the letter/phrase every reported substring must contain
#Substrings are contiguous slices text[i:j]; itertools.combinations would also
#produce non-contiguous picks such as "abd", which are not substrings.
final = [text[i:j] for i in range(len(text)) for j in range(i + 1, len(text) + 1)]
for i in final:
    if needle in i:
        output.append(i) #only keeping the results which have the required letter/phrase in it

How can I group a sorted list into tuples of start and endpoints of consecutive elements of this list?

Suppose that my sorted list is as such:
L = ["01-string","02-string","03-string","05-string","07-string","08-string"]
As you can see this list has been sorted. I now want the start and end points of each block of continuous strings in this list, for example, the output for this should be:
L_continuous = [("01-string", "03-string"),("05-string","05-string"),("07-string","08-string")]
So, just to clarify, I need a list of tuples and in each of these tuples I need the start and endpoint of each consecutive block in my list. So, for example, elements 0, 1 and 2 in my list are consecutive because 01,02,03 are consecutive numbers - so the start and endpoints would be "01-string" and "03-string".
The numbers 1-3 are consecutive so they form a block, whereas 5 does not have any consecutive numbers in the list so it forms a block by itself.
Not a one-liner, but something like this might work:
# Group a sorted list of "NN-string" items into (first, last) tuples, one per
# run of consecutive leading numbers.
L = ["01-string","02-string","03-string","05-string","07-string","08-string"]
counter = None  # number of the most recent item; None until the first item
firstString = ""
lastString = ""
L_continuous = list()
for item in L:
    currentNum = int(item[0:2])  # the two-digit numeric prefix
    if counter is None or currentNum > counter + 1:
        if counter is not None:
            # a gap: close the block that just ended
            L_continuous.append((firstString, lastString))
        # start a new block at this item
        firstString = item
        counter = currentNum
        lastString = item
    elif currentNum == counter + 1:
        # continuation of block
        lastString = item
        counter += 1
    else:
        print ('error - not sorted or unique numbers')
# add last block (nothing to add when L was empty)
if counter is not None:
    L_continuous.append((firstString, lastString))
print(L_continuous)
The first thing to do is extract an int from the string data, so that we can compare consecutive numbers:
def extract_int(s):
    """Return the integer that precedes the first '-' in ``s``."""
    prefix, _, _ = s.partition('-')
    return int(prefix)
Then a straightforward solution is to keep track of the last number seen, and emit a new block when it is not consecutive with the previous one. At the end of the loop, we need to emit a block of whatever is "left over".
def group_by_blocks(strs):
    """Return ``(first, last)`` tuples for each run of consecutive numbers.

    ``strs`` is a sorted list of strings whose leading number is read with
    ``extract_int``. An empty list yields an empty result.
    """
    if not strs:
        return []
    blocks = []
    first_s = last_s = strs[0]
    last_i = extract_int(last_s)
    for s in strs[1:]:
        i = extract_int(s)
        if i != last_i + 1:
            # Not consecutive: emit the finished block and start a new one.
            blocks.append((first_s, last_s))
            first_s = s
        last_i, last_s = i, s
    # Emit whatever block is still open at the end of the loop.
    blocks.append((first_s, last_s))
    return blocks
Example:
>>> group_by_blocks(L)
[('01-string', '03-string'), ('05-string', '05-string'), ('07-string', '08-string')]

extract substring pattern

I have long file like 1200 sequences
>3fm8|A|A0JLQ2
CFLVNLNADPALNELLVYYLKEHTLIGSANSQDIQLCGMGILPEHCIIDITSEGQVMLTP
QKNTRTFVNGSSVSSPIQLHHGDRILWGNNHFFRLNLP
>2ht9|A|A0JLT0
LATAPVNQIQETISDNCVVIFSKTSCSYCTMAKKLFHDMNVNYKVVELDLLEYGNQFQDA
LYKMTGERTVPRIFVNGTFIGGATDTHRLHKEGKLLPLVHQCYL
I want to extract every pattern that has a cysteine (C) in the middle, preceded by five characters and followed by five more, i.e. xxxxxCxxxxx.
the output should be like this:
QDIQLCGMGIL
ILPEHCIIDIT
TISDNCVVIFS
FSKTSCSYCTM
This is my program; it only gives the positions of C, which is not what I want:
pos=[]
def find(ch,string1):
for i in range(len(string1)):
if ch == string1[i]:
pos.append(i)
return pos
z=find('C','AWERQRTCWERTYCTAAAACTTCTTT')
print z
You need to return outside the loop; you are returning on the first match, so you only ever get a single index in your list:
def find(ch, string1):
    """Return the list of every index at which ``ch`` occurs in ``string1``."""
    matches = []
    for position in range(len(string1)):
        if string1[position] != ch:
            continue
        matches.append(position)
    return matches  # outside the loop, so every match is collected
You can also use enumerate with a list comp in place of your range logic:
def indexes(ch, s1):
    """Return the indices of ``ch`` in ``s1`` that have five characters on both sides."""
    last = len(s1) - 6  # highest index that still has five characters after it
    return [index for index, char in enumerate(s1)
            if char == ch and 5 <= index <= last]
Each index in the list comp is the character index and each char is the actual character so we keep each index where char is equal to ch.
If you want the five chars that are both sides:
In [24]: s="CFLVNLNADPALNELLVYYLKEHTLIGSANSQDIQLCGMGILPEHCIIDITSEGQVMLTP QKNTRTFVNGSSVSSPIQLHHGDRILWGNNHFFRLNLP"
In [25]: inds = indexes("C",s)
In [26]: [s[i-5:i+6] for i in inds]
Out[26]: ['QDIQLCGMGIL', 'ILPEHCIIDIT']
I added checking the index as we obviously cannot get five chars before C if the index is < 5 and the same from the end.
You can do it all in a single function, yielding a slice when you find a match:
def find(ch, s):
    """Yield each 11-character window of ``s`` whose middle character is ``ch``."""
    # Only positions with five characters on either side can be a centre.
    for centre in range(5, len(s) - 5):
        if s[centre] == ch:
            yield s[centre - 5:centre + 6]
Where, presuming the data in your question is actually two lines from your file like:
s="""">3fm8|A|A0JLQ2CFLVNLNADPALNELLVYYLKEHTLIGSANSQDIQLCGMGILPEHCIIDITSEGQVMLTPQKNTRTFVNGSSVSSPIQLHHGDRILWGNNHFFRLNLP
>2ht9|A|A0JLT0LATAPVNQIQETISDNCVVIFSKTSCSYCTMAKKLFHDMNVNYKVVELDLLEYGNQFQDALYKMTGERTVPRIFVNGTFIGGATDTHRLHKEGKLLPLVHQCY"""
Running:
for line in s.splitlines():
print(list(find("C" ,line)))
would output:
['0JLQ2CFLVNL', 'QDIQLCGMGIL', 'ILPEHCIIDIT']
['TISDNCVVIFS', 'FSKTSCSYCTM', 'TSCSYCTMAKK']
Which gives six matches, not four as your expected output suggests, so I presume you did not include all possible matches.
You can also speed up the code using str.find, starting at the last match index + 1 for each subsequent match
def find(ch, s):
    """Yield each 11-character window of ``s`` centred on an occurrence of ``ch``.

    Occurrences with fewer than five characters before or after them are
    skipped rather than ending the search.
    """
    last = len(s) - 6  # highest index that still has five characters after it
    # Start at index 5: an earlier match cannot have five characters before
    # it, and must not stop the search for the later ones.
    i = s.find(ch, 5)
    while i != -1 and i <= last:
        yield s[i - 5:i + 6]
        i = s.find(ch, i + 1)
Which will give the same output. Of course if the strings cannot overlap you can start looking for the next match much further in the string each time.
My solution is based on regex, and shows all possible solutions using regex and a while loop. Thanks to @Smac89 for improving it by transforming it into a generator:
import re
string = """CFLVNLNADPALNELLVYYLKEHTLIGSANSQDIQLCGMGILPEHCIIDITSEGQVMLTPQKNTRTFVNGSSVSSPIQLHHGDRILWGNNHFFRLNLP
LATAPVNQIQETISDNCVVIFSKTSCSYCTMAKKLFHDMNVNYKVVELDLLEYGNQFQDA LYKMTGERTVPRIFVNGTFIGGATDTHRLHKEGKLLPLVHQCYL"""
# Generator
def find_cysteine2(string):
    """Yield every ``xxxxxCxxxxx`` window in ``string``, overlapping ones included.

    A window is five word characters, a ``C``, then five more word characters.
    """
    pattern = re.compile(r'(\w{5}C\w{5})')
    # Where the next search starts; moving this forward replaces re-slicing
    # the string after every match, which copied the remainder each time.
    position = 0
    # Search repeatedly so that overlapping windows are found too
    while True:
        # Find a match at or after the current position
        data = pattern.search(string, position)
        # If there are no matches, stop the loop
        if not data:
            break
        # Collect the string
        yield data.group(1)
        # Resume one character past the start of this match so the
        # previous result is not returned again
        position = data.start() + 1
print [x for x in find_cysteine2(string)]
# ['QDIQLCGMGIL', 'ILPEHCIIDIT', 'TISDNCVVIFS', 'FSKTSCSYCTM', 'TSCSYCTMAKK']

Python "list index out of range" error

My goal is to have a list of lists, where each item in the outer list contains a word at its first index, and the number of times that word has been encountered at its second index. As an example, it should look like this:
[["test1",0],["test2",4],["test3",8]]
The only issue is that when I try to, for instance, access the word "test1" from the first inner-list, I get an index out of range error. Here is my code for how I am attempting to do this:
stemmedList = [[]]
f = open(a_document_name, 'r')
#read each line of file
fileLines = f.readlines()
for fileLine in fileLines:
#here we end up with stopList, a list of words
thisReview = Hw1.read_line(fileLine)['text']
tokenList = Hw1.tokenize(thisReview)
stopList = Hw1.stopword(tokenList)
#for each word in stoplist, compare to all terms in return list to
#see if it exists, if it does add one to its second parameter, else
#add it to the list as ["word", 0]
for word in stopList:
#if list not empty
if not len(unStemmedList) == 1: #for some reason I have to do this to see if list is empty, I'm assuming when it's empty it returns a length of 1 since I'm initializing it as a list of lists??
print "List not empty."
for innerList in unStemmedList:
if innerList[0] == word:
print "Adding 1 to [" + word + ", " + str(innerList[1]) + "]"
innerList[1] = (innerList[1] + 1)
else:
print "Adding [" + word + ", 0]"
unStemmedList.append([word, 0])
else:
print "List empty."
unStemmedList.append([word, 0])
print unStemmedList[len(unStemmedList)-1]
return stemmedList
The final output ends up being:
List is empty.
["test1",0]
List not empty"
Crash with list index out of range error which points to the line if innerList[0] == word
You have a = [[]]
Now, when you are appending to this list after encountering first word, you have
a = [ [], ['test', 0] ]
In the next iteration you are accessing the 0th element of an empty list which doesn't exist.
Assuming that stemmedList and unStemmedList are similar
stemmedList = [[]]
you have an empty list in your list of lists, it has no [0]. Instead just initialize it to:
stemmedList = []
Isn't this simpler?
counts = dict()  # maps each word to the number of times it has been seen


def plus1(key):
    """Increment the tally for ``key`` in ``counts``, starting from zero."""
    counts[key] = counts.get(key, 0) + 1


stoplist = "t1 t2 t1 t3 t1 t1 t2".split()
for word in stoplist:
    plus1(word)
counts
{'t2': 2, 't3': 1, 't1': 4}

How to compress a python list of strings in format string[number]/[number]

I have a list of strings ending with numbers. Want to sort them in python and then compress them if a range is formed.
Eg input string :
ABC1/3, ABC1/1, ABC1/2, ABC2/3, ABC2/2, ABC2/1
Eg output string:
ABC1/1-3, ABC2/1-3
How should I approach this problem with python?
There's no need to use a dict for this problem. You can simply parse the tokens into a list and sort it. By default Python sorts a list of lists by the individual elements of each list. After sorting the list of token pairs, you only need to iterate once and record the important indices. Try this:
# Data is a comma separated list of name/number pairs.
data = 'ABC1/3, ABC1/1, ABC1/2, ABC2/3, ABC2/2, ABC2/1'
# Split data on ', ' and split each token on '/'.
tokens = [token.split('/') for token in data.split(', ')]
# Convert token number to integer.
for token in tokens:
    token[1] = int(token[1])
# Sort pairs, automatically orders lists by items.
tokens.sort()
prev = 0  # Index of the first pair in the current run.
indices = []  # (start, end) index pairs, one per run, for output.
for index in range(1, len(tokens)):
    same_name = tokens[index][0] == tokens[prev][0]
    sequential = tokens[index][1] == tokens[index - 1][1] + 1
    # A run ends when the name changes or the number does not increase by one.
    if not (same_name and sequential):
        indices.append((prev, index - 1))
        prev = index
# After iterating the list, record the final run.
indices.append((prev, len(tokens) - 1))
# Build the ranges, then print them joined so there is no trailing comma.
ranges = []
for start, end in indices:
    if start == end:
        ranges.append('{0}/{1}'.format(tokens[start][0], tokens[start][1]))
    else:
        ranges.append('{0}/{1}-{2}'.format(
            tokens[start][0], tokens[start][1], tokens[end][1]))
print(', '.join(ranges))
# Output:
# ABC1/1-3, ABC2/1-3
I wanted to speedhack this problem, so here is an almost complete solution for you, based on my own make_range_string and a stolen natsort.
import re
from collections import defaultdict
def sortkey_natural(s):
    """Return a sort key for ``s`` in which digit runs compare as integers."""
    key = []
    for part in re.split(r'([0-9]+)', s):
        if re.match(r'[0-9]+$', part):
            key.append(int(part))
        else:
            key.append(part)
    return tuple(key)
def natsort(collection):
    """Return a new list of ``collection``'s items ordered by ``sortkey_natural``."""
    ordered = list(collection)
    ordered.sort(key=sortkey_natural)
    return ordered
def make_range_string(collection):
    """Return ``collection``'s integers as compressed ranges, e.g. ``'1-3, 5'``.

    The numbers are sorted and de-duplicated first, so a repeated value
    cannot split a range. An empty collection yields an empty string.
    """
    def format_range(start, end):
        # A single number stands alone; otherwise show both endpoints.
        return str(start) if start == end else "{}-{}".format(start, end)

    parts = []
    range_start = None
    previous = None
    for i in sorted(set(collection)):
        if previous is None:
            range_start = i
        elif i != previous + 1:
            # Gap found: close the current range and start a new one.
            parts.append(format_range(range_start, previous))
            range_start = i
        previous = i
    if previous is not None:
        parts.append(format_range(range_start, previous))
    return ', '.join(parts)
def make_ranges(strings):
    """Return ``(name, range_string)`` tuples for ``'name/number'`` strings.

    Numbers are grouped by the text before the first '/', names are ordered
    with ``natsort`` and each group is compressed by ``make_range_string``.
    """
    numbers_by_name = defaultdict(list)
    for entry in strings:
        name, _, number = entry.partition('/')
        numbers_by_name[name].append(int(number))
    return [(name, make_range_string(numbers_by_name[name]))
            for name in natsort(numbers_by_name)]
print(make_ranges(['ABC1/3', 'ABC1/1', 'ABC1/2', 'ABC2/5', 'ABC2/2', 'ABC2/1']))
The code prints a list of tuples:
[('ABC1', '1-3'), ('ABC2', '1-2, 5')]
I would start by splitting the strings, and using the part that you want to match on as a dictionary key.
strings = ['ABC1/3', 'ABC1/1', 'ABC1/2', 'ABC2/3', 'ABC2/2', 'ABC2/1']
d = {}  # maps each prefix to the list of numbers seen with it
for s in strings:
    a, b = s.split('/')
    # setdefault stores the new list in the dict; dict.get would only return
    # a throwaway default (and accepts no keyword arguments).
    # Numbers are kept as ints so the lists sort numerically.
    d.setdefault(a, []).append(int(b))
That collects the number parts into a list for each prefix. Then you can sort the lists and look for adjacent numbers to join.

Categories