Search Strings in a List with Loop Return Order - python

I'm very new to Python and I have a question.
I have a List that looks like this:
List = ["B-Guild","I-Guild","I-Guild","L-Guild","B-Gene","L-Gene","U-Car"]
All of the words with B-(I)-L belong to each other and I want to use a function to show that.
def combine(x):
    """Prefix every tagged word with the number of the group it belongs to.

    A group starts at a "B-" word, continues through "I-" words and ends
    with the next "L-" word; all of its members share one number.  A "U-"
    word is a group of its own.  Numbers start at 0 and grow by one for each
    finished group.

    Args:
        x: sequence of strings such as "B-Guild", "I-Guild", "L-Guild", "U-Car".

    Returns:
        A new list with the group number prepended to each word, e.g.
        ["0B-Guild", "0I-Guild", "0L-Guild", "1U-Car"].
    """
    foo = []
    group = 0
    group_open = False  # True while we are between a "B-" and its "L-"
    for word in x:
        if word.startswith("U-"):
            # A stray "U-" inside an unfinished group must not share that
            # group's number, so close the open group first.
            if group_open:
                group += 1
                group_open = False
            foo.append(str(group) + word)
            group += 1
        else:
            foo.append(str(group) + word)
            if word.startswith("L-"):
                group += 1
                group_open = False
            else:
                group_open = True
    return foo
List_New = combine(List)
Desired Output:
foo = ["0B-Guild","0I-Guild","0I-Guild","0L-Guild","1B-Gene","1L-Gene","2U-Car"]
Edit:
The output follows this logic: Every time a "B-" prefix appears, the words that follow are part of one "theme" until an "L-" prefix appears. These words must all have the same number in front of them so they can be grouped for further functions. "U-" prefixes don't follow that logic and just need a number in front of them to distinguish them from the other words. Think of it as a counter that groups these words into clusters.

def combine(some_list):
    """Yield each element prefixed with its running group number.

    The group number starts at 0.  Every "L-" or "U-" element closes the
    current group after being emitted; a "U-" element that arrives while a
    group already has members is first moved into a fresh group.
    """
    group_id = 0
    members = 0  # how many elements the current group has seen so far
    for item in some_list:
        members += 1
        is_unit = item.startswith('U-')
        if is_unit and members > 1:
            members = 1
            group_id += 1
        yield f'{group_id}{item}'
        if is_unit or item.startswith('L-'):
            # both L- and U- terminate the group they appear in
            members = 0
            group_id += 1
>>> List = ["B-Guild","I-Guild","I-Guild","L-Guild","B-Gene","L-Gene","U-Car"]
>>> print(list(combine(List)))
>>> List = ["B-Guild","I-Guild","I-Guild","L-Guild","B-Guild","L-Guild","U-Guild"]
>>> print(list(combine(List)))

Related

Using Python, how to print output string as -> aaa3bb2c1ddddd5 when Input string is aaabbcddddd

Using Python, how to print output string as -> aaa3bb2c1ddddd5 when Input string is aaabbcddddd
I want to concatenate actual character value and number of times a character is repeated in a string
def mycode(myString):
    """Return *myString* with the length of each run appended after the run.

    Consecutive equal characters form a run; the run is copied unchanged and
    followed by its length, so "aaabbcddddd" becomes "aaa3bb2c1ddddd5".
    An empty string yields an empty string.

    The length of the input is still printed, as in the original snippet.
    """
    # Local import so the snippet stays self-contained.
    from itertools import groupby

    lenstr = len(myString)
    print('length of string is ' + str(lenstr))

    pieces = []
    for _, run in groupby(myString):
        run_text = ''.join(run)
        pieces.append(run_text + str(len(run_text)))
    return ''.join(pieces)
If the string is always sorted and grouped together like that, then you can use a collections.Counter to do it.
from collections import Counter

# Only valid when equal characters are already grouped together in `inp`.
inp = "aaabbcddddd"
counter = Counter(inp)
out = "".join(f"{char * times}{times}" for char, times in counter.items())
Or in one line:
print(''.join(k * v + str(v) for k,v in Counter(inp).items()))
Output:
aaa3bb2c1ddddd5
Or you can do it manually:
# Run-length annotate `inp` by hand: copy characters through and emit the
# run length each time the character changes.
inp = "aaabbcddddd"
out = inp[0]
last = inp[0]
count = 1
for ch in inp[1:]:
    if ch != last:
        # the previous run just ended: write its length and start over
        out += str(count)
        count = 0
        last = ch
    count += 1
    out += ch
out += str(count)  # length of the final run
print(out)
Here is a one line solution using a regex replacement with callback:
import re  # the snippet calls re.sub, so the module must be imported

inp = "aaabbcddddd"
# Group 1 is a whole run of one repeated word character (group 2 is that
# character); the callback re-emits the run followed by its length.
output = re.sub(r'((\w)\2*)', lambda m: m.group(1) + str(len(m.group(1))), inp)
print(output) # aaa3bb2c1ddddd5
Another one-liner:
import itertools

test = 'aaabbcddddd'
# Materialise every run as a string first, then append its length.
runs = (''.join(members) for _, members in itertools.groupby(test))
out = ''.join(run + str(len(run)) for run in runs)
assert out == 'aaa3bb2c1ddddd5'
def char_counter_string(string):
    """Copy *string*, writing the length of each run right after the run.

    Note that an empty input produces "0", because the length of the
    (empty) final run is always appended.
    """
    pieces = []
    run_length = 0
    previous = None
    for current in string:
        if previous is not None and current != previous:
            # a new run begins: flush the length of the one that just ended
            pieces.append(str(run_length))
            run_length = 0
        pieces.append(current)
        run_length += 1
        previous = current
    pieces.append(str(run_length))
    return ''.join(pieces)
if __name__ == '__main__':
print(char_counter_string('aaabbcddddd'))
you can do like..
Code:
Time Complexity: O(n)
input_string = "aaabbcddddd"
chunks = []
count = 1
# Walk over neighbouring pairs; a pair of different characters marks the
# end of a run.
for previous, current in zip(input_string, input_string[1:]):
    if current == previous:
        count += 1
        continue
    chunks.append(previous * count + str(count))
    count = 1
# the last run is never followed by a differing character, so flush it here
chunks.append(input_string[-1] * count + str(count))
res = "".join(chunks)
print(res) #aaa3bb2c1ddddd5
Here's another way, ...
Full disclosure: ... as long as the run of characters is 10 or less, it will work. I.e., if there are 11 of anything in row, this won't work (the count will be wrong).
It's just a function wrapping a reduce.
from functools import reduce


def char_rep_count(in_string):
    """Return *in_string* with each run of equal characters followed by its length.

    Still a function wrapping a reduce, but the accumulator is a list of
    [character, count] pairs rather than the output string itself.  Keeping
    the count as an integer means runs of ten or more characters are
    handled correctly, and an empty input simply yields "".
    """
    def _extend(runs, char):
        # Grow the current run, or open a new one when the character changes.
        if runs and runs[-1][0] == char:
            runs[-1][1] += 1
        else:
            runs.append([char, 1])
        return runs

    runs = reduce(_extend, in_string, [])
    return "".join(char * times + str(times) for char, times in runs)
And here's some sample output:
print(char_rep_count("aaabbcdddd"))
aaa3bb2c1dddd4
I think this fulfils the brief and is also very fast:
s = 'aaabbcddddd'
def mycode(myString):
    """Return *myString* with every run of equal characters followed by its length.

    Falsy input (e.g. the empty string) is returned unchanged.
    """
    if not myString:
        return myString

    current = myString[0]
    parts = [current]
    run = 1
    for ch in myString[1:]:
        if ch == current:
            run += 1
        else:
            # the run ended: record its length before starting the next one
            parts.append(str(run))
            run = 1
        current = ch
        parts.append(ch)
    parts.append(str(run))
    return ''.join(parts)

Find the total number of occurrence of a string in a cyclic string

I'm currently learning Python and I'm stuck on this specific question.
Image
Here is my current code:
def count_cyclic_occurrences(tablet, word):
    """Count the start positions in the cyclic string *tablet* where *word* occurs.

    The text is treated as a ring: a match may run off the end of *tablet*
    and continue from its beginning.  Overlapping matches are all counted.

    Args:
        tablet: the text that is read cyclically.
        word: the string to look for.

    Returns:
        The number of indices i in range(len(tablet)) at which *word* can be
        read starting from tablet[i].  0 if either string is empty.
    """
    size = len(tablet)
    if not tablet or not word:
        return 0
    positions = 0
    for start in range(size):
        # The modulo wraps the read position back to the start of the text.
        if all(tablet[(start + offset) % size] == ch
               for offset, ch in enumerate(word)):
            positions += 1
    return positions


if __name__ == "__main__":
    # NOTE(review): the original snippet never assigned `tablet`; it is
    # assumed to be read from standard input before the word - confirm
    # against the task statement.
    tablet = input()
    word = input()
    print(count_cyclic_occurrences(tablet, word))
I would hope for a code that is really basic using the same concepts but please do answer.
Thank you!
I solved the problem with a different approach. A match that wraps around the end of the string can take at most len(fav_word) - 1 letters from the start of the string, because at least one of its letters must come from the end; a match made up entirely of letters from the start is already found without any wrapping.
So I build a new string by appending the first len(fav_word) - 1 characters of the original string to its end, and then find all occurrences of fav_word in the new string.
def find_all(a_str, sub):
    """Yield the index of every (possibly overlapping) occurrence of *sub* in *a_str*."""
    position = a_str.find(sub)
    while position != -1:
        yield position
        # resume one character further so overlapping matches are found too
        position = a_str.find(sub, position + 1)
# Each pair is (cyclic text, word to find); the printed counts are 3, 2, 6, 2.
for x, fav in (
    ("cabccabcab", "abc"),
    ("ababa", "aba"),
    ("aaaaaa", "aa"),
    ("abaaba", "aaba"),
):
    # Append the first len(fav) - 1 characters so wrapped matches become
    # ordinary substring matches.
    y = x + x[0:len(fav)-1]
    print(len(list(find_all(y, fav))))
def find_str(g,find):
    """Count the positions in *g* where *find* starts, wrapping around the end once.

    Only positions whose element equals find[0] are examined.  A candidate
    that fits inside *g* is compared directly; one that runs past the end is
    completed with elements from the start of *g*, using at most one full
    pass over *g*.
    """
    total = len(g)
    wanted = len(find)
    matches = 0
    for start, item in enumerate(g):
        if item != find[0]:
            continue
        if start + wanted <= total:
            # the candidate lies completely inside g
            if "".join(g[start:start + wanted]) == find:
                matches += 1
            continue
        # the candidate runs off the end: borrow the missing part from the front
        tail = "".join(g[start:])
        missing = wanted - len(tail)
        if 0 < missing <= total and tail + "".join(g[:missing]) == find:
            matches += 1
    return matches
print(find_str("abaaba","aaba"))
def split(word):
    """Return the items of *word* as a list (one entry per character for a string)."""
    return list(word)


x = "aaaaaa"
pattern = "aa"
mylist = split(x)
occurrences = 0
buffer = ""
ok = True
while ok:
    # consume the text one character at a time
    char = mylist.pop(0)
    buffer += char
    if buffer == pattern:
        occurrences += 1
        buffer = ""
    ok = len(mylist) != 0
print(occurrences)
output:3

Is there a way to speed up this for in python?

Is there a way to speed up this code in python? I need to run it with over 500k strings and it takes too long.
Each word need to be put inside the matching dictionary.
example_sent_words = list of 500k strings
EmojiPos = list of emoji
EmojiNeg = list of emoji
OthersEmoji = list of emoji
# One frequency table per emoji category.
emoji_pos = {}
emoji_neg = {}
emoji_others = {}
for w in example_sent_words:
    # Pick the table that matches the word's category; the categories are
    # tested in the same order as before (positive, negative, others).
    if w in s_EmojiPos:
        bucket = emoji_pos
    elif w in s_EmojiNeg:
        bucket = emoji_neg
    elif w in s_OthersEmoji:
        bucket = emoji_others
    else:
        continue
    remove_username_url.remove(w)
    bucket[w] = bucket.get(w, 0) + 1
edit: I wrote this as suggested:
s_AdditionalEmoji = set(AdditionalEmoji)
s_EmojiNeg = set(EmojiNeg)
s_EmojiPos = set(EmojiPos)
To cut down on memory usage and speed up checks as well as make sure to check every word, I would suggest:
# Count every emoji in example_sent_words by category and strip the emoji
# from the list, visiting each word exactly once.
#
# The earlier version always looked at example_sent_words[0] and only
# removed it when it was an emoji, so the first non-emoji word blocked all
# words after it.  It also paid O(n) for each pop(0) and for each
# list-membership test.
word_count = len(example_sent_words)

# Sets give O(1) membership tests; build them once, outside the loop.
pos_lookup = set(EmojiPos)
neg_lookup = set(EmojiNeg)
others_lookup = set(OthersEmoji)

remaining_words = []
for w in example_sent_words:
    if w in pos_lookup:
        emoji_pos[w] = emoji_pos.get(w, 0) + 1
    elif w in neg_lookup:
        emoji_neg[w] = emoji_neg.get(w, 0) + 1
    elif w in others_lookup:
        emoji_others[w] = emoji_others.get(w, 0) + 1
    else:
        remaining_words.append(w)

# Replace the contents in place so other references to the list see the
# emoji-free version, as they did with pop().
example_sent_words[:] = remaining_words

finding keywords frequency in a c code using python excluding comments

I want to create a program, that can count the frequency of keywords used in a C code, excluding the commented ones or inside printf command.
def counting(f, word):
    """Return how many whitespace-separated tokens of the text *f* equal *word*."""
    return sum(1 for token in f.split() if word == token)
# Read the keyword list and the source text; the `with` blocks make sure
# both files are closed again (the earlier version leaked the handles and
# reused the name `key` for both the file object and the loop variable).
with open('c_keywords.txt') as keyword_file:
    keyw = keyword_file.read().split()
with open('a1.cpp') as source_file:
    file = source_file.read()

# Report every keyword that occurs at least once in the source text.
for key in keyw:
    x = counting(file, key)
    if x != 0:
        print (key, ":", x)
Here is an example of how to do it with a textfile, you can edit the text.txt and use your C code file instead
# For every line of the file, report each ordered pair of distinct
# positions that hold the same word.
with open('text.txt', 'r') as doc:
    print('opened txt')
    for words in doc:
        wordlist = words.split()
        for numbers, outer_word in enumerate(wordlist):
            for inner_numbers, inner_word in enumerate(wordlist):
                if inner_numbers == numbers:
                    continue  # never compare a word with itself
                if outer_word == inner_word:
                    print('word: %s == %s' %(outer_word, inner_word))
Use:
# Count how many tokens of a C/C++ source file are keywords, ignoring lines
# that start with a printf call or a // comment.

# Load the keywords; `with` closes the file once it has been read.
with open('keywords_c.txt') as f:
    words = [a for x in f for a in x.split()]
print(words)

# A set gives exact, O(1) keyword tests.  The earlier `b in s` test was a
# substring check, so a token such as "i" was counted because it occurs
# inside "int".
keywords = set(words)

count = 0
with open('Simple_c.cpp') as cpp:
    for y in cpp:
        # Strip leading whitespace so indented printf / comment lines are
        # skipped as well.
        stripped = y.lstrip()
        if stripped.startswith(('printf', '//')):
            continue
        count += sum(1 for b in stripped.split() if b in keywords)
print(count)

Parsing Data from live website in Python Enumerate problem!

The following script is supposed to fetch a specific line number and parse it from a live website. It works for like 30 loops but then it seems like enumerate(f) stops working correctly... the "i" in the for loop seems to stop at line 130 instead of like 200 something. Could this be due to the website I'm trying to fetch data from or something else? Thanks!!
import sgmllib
class MyParser(sgmllib.SGMLParser):
    """SGML parser that records the ids of <div> elements and the text found inside them."""

    def parse(self, s):
        "Feed the string 's' to the parser and then close it."
        self.feed(s)
        self.close()

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        sgmllib.SGMLParser.__init__(self, verbose)
        # values of the "id" attribute of every div seen so far
        self.divs = []
        # text chunks collected while inside_div_element is set
        self.descriptions = []
        # 1 after a div carrying an id was opened, 0 after any div end tag
        self.inside_div_element = 0

    def start_div(self, attributes):
        "Process a div start tag and its 'attributes', recording the id if present."
        for name, value in attributes:
            if name == "id":
                self.divs.append(value)
                self.inside_div_element = 1

    def end_div(self):
        "Record the end of a div."
        self.inside_div_element = 0

    def handle_data(self, data):
        "Store the textual 'data' when it appears inside a tracked div."
        if self.inside_div_element:
            self.descriptions.append(data)

    def get_div(self):
        "Return the list of recorded div ids."
        return self.divs

    def get_descriptions(self, check):
        "Return the list of descriptions, dropping the first entry when 'check' is 1."
        if check == 1:
            # NOTE(review): this mutates the stored list and raises
            # IndexError when no description has been collected.
            self.descriptions.pop(0)
        return self.descriptions

    def rm_descriptions(self):
        "Remove the most recently added description (only one entry, not all of them)."
        self.descriptions.pop()
import urllib
import linecache
import sgmllib
# Polling script (Python 2): repeatedly downloads "SITE", parses one line of
# the response with MyParser and keeps tallies in the flat list `array`.
# NOTE(review): the indentation of this script was lost when it was pasted,
# so the nesting of the statements below cannot be read off the text; the
# comments describe individual statements only.
tempLine = ""
tempStr = " "
tempStr2 = ""
myparser = MyParser()
count = 0
user = ['']
oldUser = ['none']
oldoldUser = [' ']
# flat list; further down a string and a 0 are appended to it as a pair
array = [" ", 0]
index = 0
found = 0
k = 0
j = 0
posIndex = 0
a = 0
firstCheck = 0
fCheck = 0
# up to 1000 iterations; `a` is incremented once per download
while a < 1000:
print a
f = urllib.urlopen("SITE")
a = a+1
for i, line in enumerate(f):
# only the line with index 187 of the response is parsed
if i == 187:
print i
tempLine = line
print line
# the same parser object is reused for every download, so its
# `descriptions` list is never reset between iterations
myparser.parse(line)
if fCheck == 1:
# `is` tests object identity, not equality
result = oldUser[0] is oldUser[1]
u1 = oldUser[0]
u2 = oldUser[1]
tempStr = oldUser[1]
if u1 == u2:
result = 1
else:
result = user is oldUser
fCheck = 1
# get_descriptions returns the parser's own list, not a copy
user = myparser.get_descriptions(firstCheck)
tempStr = user[0]
firstCheck = 1
if result:
# adding 0 leaves the stored value unchanged
array[index+1] = array[index+1] +0
else:
j = 0
for z in array:
k = j+2
tempStr2 = user[0]
if k < len(array) and tempStr2 == array[k]:
array[j+3] = array[j+3] + 1
index = j+2
found = 1
break
j = j+1
if found == 0:
array.append(tempStr)
array.append(0)
# oldUser now refers to the same list object as user
oldUser = user
found = 0
print array
# stop reading the response once the line index exceeds 200
elif i > 200:
print "HERE"
break
print array
f.close()
Perhaps the number of lines on that web page are fewer than you think? What does this give you?:
print max(i for i, _ in enumerate(urllib.urlopen("SITE")))
Aside: Your indentation is stuffed after the while a < 1000: line. Excessive empty lines and one-letter names don't assist the understanding of your code.
enumerate is not broken. Instead of such speculation, inspect your data. Suggestion: replace
for i, line in enumerate(f):
by
lines = list(f)
print "=== a=%d linecount=%d === % (a, len(lines))
for i, line in enumerate(lines):
print " a=%d i=%d line=%r" % (a, i, line)
Examine the output carefully.

Categories