about list comprehension in python

I'm completely stuck with this code and don't have any idea why these two snippets' output is different.
answer code
str = 'I am an NLPer'
def ngram(n, lst):
    return list(zip(*[lst[i:] for i in range(n)]))
ngram(2, str)
output
[('I', ' '),
(' ', 'a'),
('a', 'm'),
('m', ' '),
(' ', 'a'),
('a', 'n'),
('n', ' '),
(' ', 'N'),
('N', 'L'),
('L', 'P'),
('P', 'e'),
('e', 'r')]
my code
def myngram(n):
    for i in range(n):
        return list(zip(*str[i:]))
myngram(2)
output
[('I', ' ', 'a', 'm', ' ', 'a', 'n', ' ', 'N', 'L', 'P', 'e', 'r')]
Any idea? I got another solution down below, but the one above is way more sophisticated.
str = 'I am an NLPer'
list = []
def ngram(n):
    for i in range(len(str) - 1):
        list.append((str[i], str[i+1]))
    return list
ngram(2)

You can break the function down to get a better understanding of it. For n=2,
[lst[i:] for i in range(n)]
gives a list of two shifted strings:
['I am an NLPer', ' am an NLPer']
Following how zip() works, zip(*...) then pairs those two shifted strings character by character, which produces the output you mention above.
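To make the difference concrete, here is a small sketch (my illustration; the names s and shifted are throwaway) showing what each version actually hands to zip:
s = 'I am an NLPer'
# Answer version: two shifted copies of the whole string, so zip pairs
# character i with character i+1, producing bigrams.
shifted = [s[i:] for i in range(2)]
print(shifted)                   # ['I am an NLPer', ' am an NLPer']
print(list(zip(*shifted))[:3])   # [('I', ' '), (' ', 'a'), ('a', 'm')]
# myngram returns on its first loop iteration, so it evaluates zip(*s[0:]),
# i.e. zip('I', ' ', 'a', ...): thirteen 1-character strings zipped together,
# which yields a single 13-tuple.
print(list(zip(*s)))             # [('I', ' ', 'a', 'm', ' ', 'a', 'n', ' ', 'N', 'L', 'P', 'e', 'r')]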

extracting characters from python string using variable index

I'm trying to split a string of letters and numbers into a list of tuples like this:
[(37, 'M'), (1, 'I'), (5, 'M'), (1, 'D'), (25, 'M'), (33, 'S')]
This is what is kind of working, but when I try to print "37" (print(cigar[d:pos])) it does not print the entire number, only 3.
# iterate through cigar sequence
print(cigar)
# count position in cigar sequence
pos = 0
# count position of last key
d = 0
splitCigar = []
for char in cigar:
    # print(cigar[pos])
    if char.isalpha() == False:
        print("first for-loop")
        print(cigar[d])
        print(cigar[pos])
        print(cigar[d:pos])
        num = cigar[d:pos]
        pos += 1
    if char.isalpha() == True:
        print("second for-loop")
        splitCigar.append((num, char))
        pos += 1
        d = pos
print(splitCigar)
The output of this code:
37M1I5M1D25M33S
first for-loop
3
3
first for-loop
3
7
3
second for-loop
<and so on...>
second for-loop
[('3', 'M'), ('', 'I'), ('', 'M'), ('', 'D'), ('2', 'M'), ('3', 'S')]
Solution using regexp:
import re
cigar = "37M1I5M1D25M33S"
digits = re.findall('[0-9]+', cigar)
chars = re.findall('[A-Z]+', cigar)
results = list(zip(digits, chars))
Everything printed so you can see what it does:
>>> print(digits)
['37', '1', '5', '1', '25', '33']
>>> print(chars)
['M', 'I', 'M', 'D', 'M', 'S']
>>> print(results)
[('37', 'M'), ('1', 'I'), ('5', 'M'), ('1', 'D'), ('25', 'M'), ('33', 'S')]
I hope this "functional" approach suits you.
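As a side note not from the original answer: the two findall calls can be collapsed into a single pattern with two groups, since re.findall returns tuples when the pattern contains more than one group:
import re

cigar = "37M1I5M1D25M33S"
# Each match yields a (digits, letter) pair in one pass.
results = re.findall(r'([0-9]+)([A-Z])', cigar)
print(results)
# [('37', 'M'), ('1', 'I'), ('5', 'M'), ('1', 'D'), ('25', 'M'), ('33', 'S')]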
The pyparsing library makes writing parsers more maintainable and readable.
If the format of the data changes, you can modify the parser without too much effort.
import pyparsing as pp

def make_grammar():
    # Number consists of several digits
    num = pp.Word(pp.nums).setName("Num")
    # Convert the num to int
    num = num.setParseAction(
        pp.pyparsing_common.convertToInteger)
    # 1 letter
    letter = pp.Word(pp.alphas, exact=1)\
        .setName("Letter")
    # 1 num followed by letter with possibly
    # some spaces in between
    package = pp.Group(num + letter)
    # 1 or more packages
    grammar = pp.OneOrMore(package)
    return grammar

def main():
    x = "37M1I5M1D25M33S"
    g = make_grammar()
    result = g.parseString(x, parseAll=True)
    print(result)
    # [[37, 'M'], [1, 'I'], [5, 'M'],
    #  [1, 'D'], [25, 'M'], [33, 'S']]
    # If you really want tuples:
    print([tuple(r) for r in result])

main()
Sounds like a job for itertools.groupby
import itertools

inp = '37M1I5M1D25M33S'
e = [''.join(g) for k, g in itertools.groupby(inp, key=lambda l: l.isdigit())]
print(e)
This will give you:
['37', 'M', '1', 'I', '5', 'M', '1', 'D', '25', 'M', '33', 'S']
Basically, groupby collects all consecutive elements that agree on the key function (.isdigit) into groups, and each of those groups is turned back into a string with ''.join.
Now, all you have to do is zip them together:
res = list(zip(e[::2], e[1::2]))
print(res)
That will give you
[('37', 'M'), ('1', 'I'), ('5', 'M'), ('1', 'D'), ('25', 'M'), ('33', 'S')]
If you want numbers instead of string representations, that's also super simple:
res = list(map(lambda l: (int(l[0]), l[1]), res))
Which yields
[(37, 'M'), (1, 'I'), (5, 'M'), (1, 'D'), (25, 'M'), (33, 'S')]
I'd say this is a pretty pythonic solution for your problem.
You can simply attain the desired output as follows:
cigar = '37M1I5M1D25M33S'
splitCigar = []
t = []
num = ''
for char in cigar:
    if char.isalpha() == False:
        num += char
    else:
        t.append(num)
        num = ''
        t.append(char)
        splitCigar.append(tuple(t))
        t = []
print(splitCigar)
Output:
[('37', 'M'), ('1', 'I'), ('5', 'M'), ('1', 'D'), ('25', 'M'), ('33', 'S')]

Generate all unique k-subsequences

I am trying to write a Python (at least initially) function to generate all subsequences of some length k (where k > 0). Since I only need unique subsequences, I am storing both the subsequences and partial subsequences in sets. The following, adapted from a colleague, is the best I could come up with. It seems...overly complex...and like I should be able to abuse itertools, or recursion, to do what I want to do. Can anyone do better?
from typing import Set, Tuple

def subsequences(string: str, k: int) -> Set[Tuple[str, ...]]:
    if len(string) < k:
        return set()
    start = tuple(string[:k])
    result = {start}
    prev_state = [start]
    curr_state = set()
    for s in string[k:]:
        for p in prev_state:
            for i in range(k):
                new = p[:i] + p[i + 1:] + (s,)
                curr_state.add(new)
        result.update(curr_state)
        prev_state = list(curr_state)
        curr_state.clear()
    return result
(For context, I am interested in induction of k-strictly piecewise languages, an efficiently learnable subclass of the regular languages, and the grammar can be characterized by all licit k-subsequences.
Ultimately I am also thinking about doing this in C++, where std::make_tuple isn't quite as powerful as Python tuple.)
You want the set of r-combinations from n items (without replacement), of which there are at most (n choose r).
Given
import itertools as it
import more_itertools as mit
Code
Option 1 - itertools.combinations
set(it.combinations("foo", 2))
# {('f', 'o'), ('o', 'o')}
set(it.combinations("foobar", 3))
# {('b', 'a', 'r'),
# ('f', 'a', 'r'),
# ('f', 'b', 'a'),
# ('f', 'b', 'r'),
# ('f', 'o', 'a'),
# ('f', 'o', 'b'),
# ('f', 'o', 'o'),
# ('f', 'o', 'r'),
# ('o', 'a', 'r'),
# ('o', 'b', 'a'),
# ('o', 'b', 'r'),
# ('o', 'o', 'a'),
# ('o', 'o', 'b'),
# ('o', 'o', 'r')}
Option 2 - more_itertools.distinct_combinations
list(mit.distinct_combinations("foo", 2))
# [('f', 'o'), ('o', 'o')]
list(mit.distinct_combinations("foobar", 3))
# [('f', 'o', 'o'),
# ('f', 'o', 'b'),
# ('f', 'o', 'a'),
# ('f', 'o', 'r'),
# ('f', 'b', 'a'),
# ('f', 'b', 'r'),
# ('f', 'a', 'r'),
# ('o', 'o', 'b'),
# ('o', 'o', 'a'),
# ('o', 'o', 'r'),
# ('o', 'b', 'a'),
# ('o', 'b', 'r'),
# ('o', 'a', 'r'),
# ('b', 'a', 'r')]
Both options yield the same (unordered) output. However:
Option 1 takes the set of all combinations (including duplicates)
Option 2 does not compute duplicate intermediates
Install more_itertools via > pip install more_itertools.
See also a rough implementation of itertools.combinations written in Python.
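As a quick sanity check (my addition; it assumes the subsequences function from the question is correct), the two approaches should agree on small inputs:
import itertools as it

# subsequences() is the function defined in the question above.
for word, k in [("foo", 2), ("foobar", 3)]:
    assert subsequences(word, k) == set(it.combinations(word, k))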

make a spark rdd from tuples list and use groupByKey

I have a list of tuples like below
ls=[('c', 's'),('c', 'm'), ('c', 'p'), ('h', 'bi'), ('h', 'vi'), ('n', 'l'), ('n', 'nc')]
I would like to use pyspark and groupByKey to produce:
nc = [['c', 's', 'm', 'p'], ['h', 'bi', 'vi'], ['n', 'l', 'nc']]
I don't know how to make a Spark RDD and use groupByKey.
I tried:
tem=ls.groupByKey()
'list' object has no attribute 'groupByKey'
You are getting that error because your object is a list and not an rdd. Python lists do not have a groupByKey() method (as the error states).
You can first convert your list to an rdd using sc.parallelize:
myrdd = sc.parallelize(ls)
# mapValues(list) turns the grouped ResultIterable values into plain lists
nc = myrdd.groupByKey().mapValues(list).collect()
print(nc)
#[('c', ['s', 'm', 'p']), ('h', ['bi', 'vi']), ('n', ['l', 'nc'])]
This returns a list of tuples where the first element is the key and the second element is a list of the values. If you wanted to flatten these tuples, you can use itertools.chain.from_iterable:
from itertools import chain
nc = [tuple(chain.from_iterable(v)) for v in nc]
print(nc)
#[('c', 's', 'm', 'p'), ('h', 'bi', 'vi'), ('n', 'l', 'nc')]
However, you can avoid Spark completely and achieve the desired result using itertools.groupby:
from itertools import groupby, chain
ls=[('c', 's'),('c', 'm'), ('c', 'p'), ('h', 'bi'), ('h', 'vi'), ('n', 'l'), ('n', 'nc')]
nc = [
    (key,) + tuple(chain.from_iterable(g[1:] for g in list(group)))
    for key, group in groupby(ls, key=lambda x: x[0])
]
print(nc)
#[('c', 's', 'm', 'p'), ('h', 'bi', 'vi'), ('n', 'l', 'nc')]
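One caveat worth adding that the original answer leaves implicit: itertools.groupby only merges consecutive items with the same key, so if the list of tuples is not already ordered by key, sort it first:
# Hypothetical unsorted input; a stable sort by key restores contiguous groups
# before the list is handed to groupby.
ls_unsorted = [('c', 's'), ('h', 'bi'), ('c', 'm'), ('n', 'l'), ('h', 'vi')]
ls_sorted = sorted(ls_unsorted, key=lambda x: x[0])
# [('c', 's'), ('c', 'm'), ('h', 'bi'), ('h', 'vi'), ('n', 'l')]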
As pault mentioned, the problem here is that Spark operates on specialised parallelized datasets, such as an RDD. To get the exact format you're after using groupByKey you'll need to do some funky stuff with lists:
ls = sc.parallelize(ls)
tem=ls.groupByKey().map(lambda x: ([x[0]] + list(x[1]))).collect()
print(tem)
#[['h', 'bi', 'vi'], ['c', 's', 'm', 'p'], ['n', 'l', 'nc']]
However, it's generally best to avoid groupByKey as it can result in a large number of shuffles. This problem could also be solved with reduceByKey using:
ls=[('c', 's'),('c', 'm'), ('c', 'p'), ('h', 'bi'), ('h', 'vi'), ('n', 'l'), ('n', 'nc')]
ls = sc.parallelize(ls)
tem=ls.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x,y: x + y).collect()
print(tem)
This will scale more effectively, but note that RDD operations can start to look a little cryptic when you need to manipulate list structure.
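For completeness (my addition, not part of the original answer), the reduceByKey result can be mapped into the same nested-list format shown earlier; the key order may vary because it depends on partitioning:
tem = (ls.map(lambda x: (x[0], [x[1]]))
         .reduceByKey(lambda x, y: x + y)
         .map(lambda x: [x[0]] + x[1])
         .collect())
print(tem)
# e.g. [['c', 's', 'm', 'p'], ['h', 'bi', 'vi'], ['n', 'l', 'nc']]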

How to get all permutations of string as list of strings (instead of list of tuples)?

The goal was to create a list of all possible combinations of certain letters in a word... Which is fine, except it now ends up as a list of tuples with too many quotes and commas.
import itertools
mainword = input(str("Enter a word: "))
n_word = int((len(mainword)))
outp = (list(itertools.permutations(mainword,n_word)))
What I want:
[yes, yse, eys, esy, sye, sey]
What I'm getting:
[('y', 'e', 's'), ('y', 's', 'e'), ('e', 'y', 's'), ('e', 's', 'y'), ('s', 'y', 'e'), ('s', 'e', 'y')]
Looks to me I just need to remove all the brackets, quotes, and commas.
I've tried:
def remove(old_list, val):
    new_list = []
    for items in old_list:
        if items != val:
            new_list.append(items)
    return new_list
    print(new_list)
where I just run the function a few times. But it doesn't work.
You can recombine those tuples with a comprehension like:
Code:
new_list = [''.join(d) for d in old_list]
Test Code:
data = [
('y', 'e', 's'), ('y', 's', 'e'), ('e', 'y', 's'),
('e', 's', 'y'), ('s', 'y', 'e'), ('s', 'e', 'y')
]
data_new = [''.join(d) for d in data]
print(data_new)
Results:
['yes', 'yse', 'eys', 'esy', 'sye', 'sey']
You need to call str.join() on your string tuples in order to convert it back to a single string. Your code can be simplified with list comprehension as:
>>> from itertools import permutations
>>> word = 'yes'
>>> [''.join(w) for w in permutations(word)]
['yes', 'yse', 'eys', 'esy', 'sye', 'sey']
OR you may also use map() to get the desired result as:
>>> list(map(''.join, permutations(word)))
['yes', 'yse', 'eys', 'esy', 'sye', 'sey']
You can use the join function. The code below works perfectly.
import itertools
mainword = input(str("Enter a word: "))
n_word = int((len(mainword)))
outp = (list(itertools.permutations(mainword,n_word)))
for i in range(len(outp)):
    outp[i] = ''.join(outp[i])
print(outp)
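One extra note (my addition): for words with repeated letters, itertools.permutations produces duplicate strings after joining; wrapping the result in a set removes them if you only want distinct words:
import itertools

# 'bee' has a repeated letter, so several permutations join to the same string.
distinct = sorted(set(''.join(p) for p in itertools.permutations('bee')))
print(distinct)
# ['bee', 'ebe', 'eeb']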

How to do Byte Pair Encoding bigram counting and replacements efficiently in Python?

In the Byte Pair Encoding algorithm, there's a replacement step where it changes the character strings delimited by spaces to bigrams.
I.e., given a list of str tuples as such:
[('t', 'h', 'i', 's', '\ue000'), ('c', 'o', 'r', 'p', 'u', 's', '\ue000'), ('i', 'n', '\ue000'), ('t', 'x', 't', 'f', 'i', 'l', 'e', '\ue000'), ('t', 'h', 'e', '\ue000'), ('s', 'e', 'n', 't', 'e', 'n', 'c', 'e', '\ue000'), ('b', 'a', 'r', '\ue000'), ('a', 'n', 'd', '\ue000'), ('i', 's', '\ue000'), ('f', 'o', 'o', '\ue000'), ('f', 'i', 'r', 's', 't', '\ue000'), ('a', '\ue000'), ('.', '\ue000')]
And a string tuple: ('i', 's')
How do I process the list such that it iterates through all the tuple keys and replaces ('i', 's') with ('is')? I.e., the output will look something like this:
[('t', 'h', 'is', '\ue000'), ('c', 'o', 'r', 'p', 'u', 's', '\ue000'), ('i', 'n', '\ue000'), ('t', 'x', 't', 'f', 'i', 'l', 'e', '\ue000'), ('t', 'h', 'e', '\ue000'), ('s', 'e', 'n', 't', 'e', 'n', 'c', 'e', '\ue000'), ('b', 'a', 'r', '\ue000'), ('a', 'n', 'd', '\ue000'), ('is', '\ue000'), ('f', 'o', 'o', '\ue000'), ('f', 'i', 'r', 's', 't', '\ue000'), ('a', '\ue000'), ('.', '\ue000')]
I've tried this:
>>> cin
[('t', 'h', 'i', 's', '\ue000'), ('c', 'o', 'r', 'p', 'u', 's', '\ue000'), ('i', 'n', '\ue000'), ('t', 'x', 't', 'f', 'i', 'l', 'e', '\ue000'), ('t', 'h', 'e', '\ue000'), ('s', 'e', 'n', 't', 'e', 'n', 'c', 'e', '\ue000'), ('b', 'a', 'r', '\ue000'), ('a', 'n', 'd', '\ue000'), ('i', 's', '\ue000'), ('f', 'o', 'o', '\ue000'), ('f', 'i', 'r', 's', 't', '\ue000'), ('a', '\ue000'), ('.', '\ue000')]
>>> [tuple(' '.join(i).replace(' '.join(qtuple), ''.join(qtuple)).split()) for i in cin]
[('t', 'h', 'is', '\ue000'), ('c', 'o', 'r', 'p', 'u', 's', '\ue000'), ('i', 'n', '\ue000'), ('t', 'x', 't', 'f', 'i', 'l', 'e', '\ue000'), ('t', 'h', 'e', '\ue000'), ('s', 'e', 'n', 't', 'e', 'n', 'c', 'e', '\ue000'), ('b', 'a', 'r', '\ue000'), ('a', 'n', 'd', '\ue000'), ('is', '\ue000'), ('f', 'o', 'o', '\ue000'), ('f', 'i', 'r', 's', 't', '\ue000'), ('a', '\ue000'), ('.', '\ue000')]
but is there a more efficient way than looping through each word, converting it to a string to do the replace, splitting it again, and then casting it back into a tuple?
Would regex replacement be faster? Is there a way to work with the list of tuples without dealing with strings?
I've tried this and it seems like replacing the string with str.replace is not the problem. It's really counting the bigrams and extracting them:
import io
from collections import Counter
import time

infile = 'big.txt' # comes from norvig.com/big.txt
n = 2

with io.open(infile, 'r', encoding='utf8') as fin:
    text = fin.read().lower().replace(u' ', u"\uE000")

for j in range(1, 6400):
    unused_char = unichr(ord(u'\uE001') + j)

    start = time.time()
    char_bigrams = zip(*[text[i:] for i in range(n)])
    bigram_time = time.time() - start

    start = time.time()
    most_freq_bigram = Counter(filter(lambda x: u"\uE000" not in x and '\n' not in x, char_bigrams)).most_common(1)[0][0]
    max_time = time.time() - start

    start = time.time()
    text = text.replace(''.join(most_freq_bigram), unused_char)
    replace_time = time.time() - start

    print j, ''.join(most_freq_bigram), most_freq_bigram, bigram_time, max_time, replace_time

print text
This is tested on norvig.com/big.txt
[out]:
1 th (u't', u'h') 0.896255016327 3.28389787674 0.0253069400787
2 e (u'\ue002', u'e') 1.47053217888 3.16544914246 0.0280749797821
3 in (u'i', u'n') 1.13404297829 3.10529899597 0.0245559215546
4 an (u'a', u'n') 1.20013689995 3.63801002502 0.0242891311646
5 er (u'e', u'r') 1.41387891769 3.13376092911 0.0237591266632
6 on (u'o', u'n') 1.22826981544 3.06997895241 0.0227301120758
7 re (u'r', u'e') 1.21916294098 2.97599196434 0.0238041877747
8 at (u'a', u't') 1.14608097076 2.97988891602 0.0226521492004
9 en (u'e', u'n') 1.20747494698 2.88649988174 0.019054889679
10 ed (u'e', u'd') 1.16296696663 2.8995718956 0.0198271274567
11 is (u'i', u's') 1.17692494392 3.02292394638 0.0228500366211
12 d (u'\ue005', u'd') 1.13779211044 2.85169506073 0.0229239463806
I've already experimented with scikit-learn's CountVectorizer and it didn't seem to be as fast as using zip; see Fast/Optimize N-gram implementations in python.
Also, without the filter operation in the Counter step, it took even longer. The Counter operation is taking 3 seconds per iteration =(
How else can this operation be optimized?
Counter(filter(lambda x: u"\uE000" not in x and '\n' not in x, char_bigrams)).most_common(1)[0][0]
Your original code:
[tuple(' '.join(i).replace(' '.join(qtuple), ''.join(qtuple)).split()) for i in cin]
I'll expand it so it's easier to see what's happening
result = []
qtuple = ("i", "s")
for i in cin:
    f = " ".join(qtuple)
    r = "".join(qtuple)
    word = ' '.join(i)
    word = word.replace(f, r)
    word = word.split()
    word = tuple(word)
    result.append(word)
print(result)
Look for things you can move outside of the loop.
We can precompute the replacements instead of computing them for each word
find = " ".join(qtuple)
replacement = "".join(qtuple)
result = []
# this will join and split each word once
for i in cin:
    word = " ".join(i)
    # if you had multiple replacements to do, they should be in an inner loop here
    word = word.replace(find, replacement)
    result.append(tuple(word.split(" ")))
print(result)
Perhaps someone else can speak to the relative efficiency of str.replace versus re.sub. Personally I tend to avoid regular expressions if a simple replace will do, just for readability.
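For what it's worth, a rough way to compare the two yourself is the timeit module (a sketch only; the sample text is made up and timings depend heavily on input size and pattern):
import re
import timeit

text = "i s " * 10000           # hypothetical sample text
pattern = re.compile("i s")

t_str = timeit.timeit(lambda: text.replace("i s", "is"), number=100)
t_re = timeit.timeit(lambda: pattern.sub("is", text), number=100)
print(t_str, t_re)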
Further efficiency gains can be realized by changing the data structure for the input. If the replacement symbols are single characters then we can use a string instead of a list of tuples and avoid any joins inside the loop.
result = []
replacements = [("\ue000", "X"), ("is", "Z")]
s = "".join(["".join(t) for t in cin])
for f, r in replacements:
    s = s.replace(f, r)
print(s)
# output: thZXcorpusXinXtxtfileXtheXsentenceXbarXandXZXfooXfirstXaX.X
I think the question needs some requirements added to it to explain why the chosen data structure is advantageous. From an efficiency point of view, and in the context of the byte pair encoding algorithm, a string makes a lot more sense to me.
If you keep your string tuple to length 2 you could use reduce like this:
def cons_2(word_list, t):
    j = ''.join(t)
    f = lambda acc, e: acc[:-1] + (j,) if (acc[-1] == t[0] and e == t[1]) else acc + (e,)
    return [reduce(f, i[1:], (i[0],)) for i in word_list]

print cons_2(cin, ('i', 's'))
No replacing is involved; f is applied to every element i, and the value of cin is not altered: instead a new list is made and returned.
Details:
reduce applies f to every element i and accumulates the result in the accumulator acc.
parameters of reduce:
f: the function to apply.
i[1:]: the sequence to iterate over, i.e. all the elements but the first.
(i[0],): the initial value of the accumulator, a tuple holding the first value of the input tuple i.
f is a lambda function taking the accumulator acc and the current element e as inputs:
If the last element of the accumulator equals the first element of the string tuple and the current element e equals the second element of the string tuple, then return the tuple acc[:-1] + (j,); otherwise continue with a normal concatenation: acc + (e,).
For string tuples of length > 2 the idea is the same, but we have to manage the tuple's length l.
def cons_n(word_list, t):
    l = len(t)
    j = ''.join(t)
    f = lambda acc, e: acc[:-l] + (j, e,) if acc[-l:] == t or acc[:l] == t else acc + (e,)
    return [reduce(f, i[l:], (i[:l])) for i in word_list]

print cons_n(cin, ('i', 's'))
This should work with n-length string tuples.
Details:
Same process as above but using l: reduce applies f to the rest of the elements i[l:] and the initial value of the accumulator is a tuple with the first l elements: (i[:l]).
Check backward and forward for l elements equal to the string tuple t; if true, add the tuple acc[:-l] + (j, e,), else continue with a normal concatenation: acc + (e,).
This is a functional approach: no data is modified, only generated, so it should be safe to have multiple processes running at the same time (in theory; I'm no expert on the Python interpreter).
If the code above is too weird for people not into functional programming, this is another approach:
def cons_n_iter(tuple_list, tuple_seq):
    jnt = ''.join(tuple_seq)
    lnt = len(tuple_seq)
    res = []
    for word in tuple_list:
        acc = (word[:lnt])
        for letter in word[lnt:]:
            if acc[-lnt:] == tuple_seq or acc[:lnt] == tuple_seq:
                acc = acc[:-lnt] + (jnt, letter,)
            else:
                acc += (letter,)
        res += (acc,)
    return res

print cons_n_iter(cin, ('i', 's'))
The logic is the same as the functional approach, same use of the accumulator. In this case the res accumulator is explicit because in the examples above reduce was taking care of it.
Is this what you need? Using re:
import re, ast

cin = [('t', 'h', 'i', 's', '\ue000'), ('c', 'i', 's', 'p')]
cin = re.sub(r"i'[,\s]+'s", r"is", str(cin))
cin = ast.literal_eval(cin)
