Split every line of the document into groups of n - python

def ngram(n, k, document):
    """Count the word n-grams that occur on the lines of a text file.

    Each line is split on whitespace and every run of ``n`` consecutive
    words on that line becomes one n-gram. N-grams never span two lines.

    Args:
        n: Number of words per n-gram; must be at least 1.
        k: Unused. Accepted only so three-argument calls keep working.
        document: Path of the text file to read.

    Returns:
        A dict mapping each n-gram (words joined by single spaces) to the
        number of times it was seen, so repeated n-grams can be compared.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    counts = {}
    # The context manager closes the file even if reading fails part-way.
    with open(document, 'r') as f:
        for line in f:
            words = line.split()
            # A line with fewer than n words yields an empty range.
            for start in range(len(words) - n + 1):
                gram = " ".join(words[start:start + n])
                counts[gram] = counts.get(gram, 0) + 1
    return counts
For example, for "I love the Python programming language" and n = 2 the groups
are "I love", "love the", "the Python", "Python programming", and "programming language";
I want to store in a list and then compare how many of them are same.

It's not entirely clear what you want returned. Assuming one line says:
I love the Python programming language
And that you want to do nothing inter-line.
from collections import deque
def linesplitter(line, n):
    """Print every run of ``n`` consecutive words of ``line``, one per row.

    Args:
        line: Text to split on whitespace.
        n: Number of words in each printed group.

    Nothing is printed when the line has fewer than ``n`` words.
    """
    # Bounded deque: once it holds n words, appending drops the oldest one.
    window = deque(maxlen=n)
    for word in line.split():
        window.append(word)
        if len(window) == n:
            # print(...) with one argument works on Python 2 and Python 3.
            print(" ".join(window))
with open(document) as f: # 'r' is implied
for line in f:
linesplitter(line, 2) # or any other length!
Output:
I love
love the
the Python
Python programming
programming language

You could adapt from one of the itertools recipes:
import itertools
def ngrams(N, k, filepath):
    """Return all word N-grams of a text file as a list of tuples.

    The file is treated as one continuous stream of whitespace-separated
    words, so N-grams may span line breaks.

    Args:
        N: Number of words per tuple.
        k: Unused. Kept so existing three-argument calls keep working.
        filepath: Path of the text file to read.

    Returns:
        A list of N-tuples of consecutive words, in file order.
    """
    with open(filepath) as infile:
        words = (word for line in infile for word in line.split())
        # tee() yields N independent views of the word stream; view number
        # i skips its first i words so zip() aligns them as sliding windows.
        shifted = [
            itertools.islice(view, offset, None)
            for offset, view in enumerate(itertools.tee(words, N))
        ]
        # zip() is lazy on Python 3: build the list while the file is still
        # open, otherwise the caller reads from a closed file.
        return list(zip(*shifted))
With a test file that looks like this:
I love
the
python programming language
Here's the output:
In [21]: ngrams(2, '', 'blah')
Out[21]:
[('I', 'love'),
('love', 'the'),
('the', 'python'),
('python', 'programming'),
('programming', 'language')]
In [22]: ngrams(3, '', 'blah')
Out[22]:
[('I', 'love', 'the'),
('love', 'the', 'python'),
('the', 'python', 'programming'),
('python', 'programming', 'language')]

Well, you can achieve this through a List Comprehension:
>>> [s1 + " " + s2 for s1, s2 in zip(s.split(), s.split()[1:])]
['I love', 'love the', 'the Python', 'Python programming', 'programming language']
You can also use the str.format function:
>>> ["{} {}".format(s1, s2) for s1, s2 in zip(s.split(), s.split()[1:])]
['I love', 'love the', 'the Python', 'Python programming', 'programming language']
The finalized version of the function:
from itertools import tee, islice
def ngram(n, s):
    """Return the word n-grams of ``s`` as a list of strings.

    Args:
        n: Number of words per n-gram.
        s: Text to split on whitespace.

    Returns:
        One string per n-gram, in order. Every word is followed by a single
        space, so each returned string ends with a trailing space.
    """
    template = "{} " * n
    # One independent view of the words per position; view i starts i words in.
    views = [islice(it, i, None) for i, it in enumerate(tee(s.split(), n))]
    # zip() stops at the shortest view, so only complete n-grams are produced.
    return [template.format(*window) for window in zip(*views)]
Demo:
>>> from splitting import ngram
>>> thing = 'I love the Python programming language'
>>> ngram(2, thing)
['I love ', 'love the ', 'the Python ', 'Python programming ', 'programming language ']
>>> ngram(3, thing)
['I love the ', 'love the Python ', 'the Python programming ', 'Python programming language ']
>>> ngram(4, thing)
['I love the Python ', 'love the Python programming ', 'the Python programming language ']
>>> ngram(1, thing)
['I ', 'love ', 'the ', 'Python ', 'programming ', 'language ']

Here is a "one-line" solution, using a list comprehension:
s = "I love the Python programming language"
def ngram(s, n):
    """Return the word n-grams of ``s``, each joined by single spaces.

    Args:
        s: Text to split on whitespace.
        n: Number of words per n-gram.

    Returns:
        A list of strings; empty when ``s`` has fewer than ``n`` words.
    """
    words = s.split()  # split once instead of once per offset
    # words[0:], words[1:], ... zipped together form the sliding windows.
    return [" ".join(gram) for gram in zip(*(words[offset:] for offset in range(n)))]
# Test
for i in range(1, 7):
print ngram(s, i)
Output:
['I', 'love', 'the', 'Python', 'programming', 'language']
['I love', 'love the', 'the Python', 'Python programming', 'programming language']
['I love the', 'love the Python', 'the Python programming', 'Python programming language']
['I love the Python', 'love the Python programming', 'the Python programming language']
['I love the Python programming', 'love the Python programming language']
['I love the Python programming language']
Note that no k parameter is needed.
Adapted to your case:
def ngram(document, n):
    """Print the list of word n-grams of every line of a text file.

    Args:
        document: Path of the text file to read.
        n: Number of words per n-gram.

    One list is printed per line of the file; a line with fewer than ``n``
    words prints an empty list.
    """
    with open(document) as f:
        for line in f:
            words = line.split()  # split once instead of once per offset
            grams = [" ".join(gram) for gram in zip(*(words[offset:] for offset in range(n)))]
            # print(...) with one argument works on Python 2 and Python 3.
            print(grams)

Related

How do I create a new list of tuples?

I have this homework problem, and I'm new to python. I have this list of tuples:
[('the, this is me', 'the night'), ('the night', 'me'), ('me', 'the store')]
My code doesn't work when I'm trying to write to target_bigrams with only the tuples that have "the" in position [0]. Please help.
target_bigrams = ()
bigrams_length = len(bigrams)
for i in range(bigrams_length):
if i[0] == target_word:
target_bigrams.append(i[0])
​
I think this is what you need;
bigrams = [('the, this is me', 'the night'), ('the night', 'me'), ('me', 'the store')]
target_word = 'the'
bigrams_length = len(bigrams)
# Collect the first element of every bigram whose first element begins with
# target_word; iterating the tuples directly avoids index bookkeeping.
target_bigrams = [first for first, _second in bigrams if first.startswith(target_word)]
The question is not clear, but I believe that you want to separate out the tuples which have "the" in the first position.
If that is the case, here is the sample code for your reference
lst = [('the, this is me', 'the night'), ('the night', 'me'), ('me', 'the store'), ('the', 'the store'),("the","How are you")]
target_word = "the"
# Keep only the tuples whose first element is exactly the target word.
target_list = [pair for pair in lst if pair[0] == target_word]
for pair in target_list:
    print(pair)
Output:
('the', 'the store')
('the', 'How are you')

How do I convert a list of strings to a proper sentence

How do I convert a list of strings to a proper sentence like this?
lst = ['eat', 'drink', 'dance', 'sleep']
string = 'I love'
output: "I love to eat, drink, dance and sleep."
Note: the "to" needs to be generated and not added manually to string
Thanks!
You can join all the verbs except the last with commas, and add the last with an and
def build(start, verbs):
    """Build a sentence such as "I love to eat, drink, dance and sleep.".

    Args:
        start: Opening of the sentence, e.g. ``'I love'``.
        verbs: Sequence of verbs to list after "to".

    Returns:
        ``start`` followed by "to", the verbs separated by commas with "and"
        before the last one, and a closing period. With a single verb no
        "and" is used; with no verbs the result is just ``start`` and a period.
    """
    if not verbs:
        return f"{start}."
    if len(verbs) == 1:
        # Joining an empty head would produce "to  and eat."
        return f"{start} to {verbs[0]}."
    return f"{start} to {', '.join(verbs[:-1])} and {verbs[-1]}."
string = 'I love'
lst = ['eat', 'drink', 'dance', 'sleep']
print(build(string, lst)) # I love to eat, drink, dance and sleep
lst = ['eat', 'drink', 'dance', 'sleep', 'run', 'walk', 'count']
print(build(string, lst)) # I love to eat, drink, dance, sleep, run, walk and count.
One option, using list to string joining:
import re

lst = ['eat', 'drink', 'dance', 'sleep']
string = 'I love'
output = string + ' to ' + ', '.join(lst)
# The negative lookahead matches only a ", " with no later comma after it,
# i.e. the final separator, which becomes " and ".
output = re.sub(r', (?!.*,)', ' and ', output)
print(output) # I love to eat, drink, dance and sleep
Note that the call to re.sub above selectively replaces the final comma with and.
Hey, you can combine the string elements of a list into a bigger string by doing the following:
# join() is a method of the separator string, not of the list.
verbs = ", ".join(lst[:-1]) # This will result in "eat, drink, dance"
verbs = verbs + " and " + lst[-1] # This will result in "eat, drink, dance and sleep"
string = string + ' to ' + verbs # This will result in "I love to eat, drink, dance and sleep"
print(string)

Python : Split string every three words

I've been searching around for a while now, but I can't seem to find the answer to this small problem.
I have this code that is supposed to split the string after every three words:
import re
def splitTextToTriplet(Text):
x = re.split('^((?:\S+\s+){2}\S+).*',Text)
return x
print(splitTextToTriplet("Do you know how to sing"))
Currently the output is as such:
['', 'Do you know', '']
But I am actually expecting this output:
['Do you know', 'how to sing']
And if I print(splitTextToTriplet("Do you know how to")), it should also output:
['Do you know', 'how to']
how can I change the regex so it produces the expected output?
I believe re.split might not be the best approach for this since look-behind cannot take variable-length patterns.
Instead, you could use str.split and then join back words together.
def splitTextToTriplet(string, size=3):
    """Split ``string`` into consecutive groups of ``size`` words.

    Args:
        string: Text to split on whitespace.
        size: Number of words per group; defaults to 3.

    Returns:
        A list of strings, each holding up to ``size`` words joined by single
        spaces. The last group is shorter when the word count is not a
        multiple of ``size``. Original whitespace, including line breaks,
        is not preserved.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be at least 1")
    words = string.split()
    return [' '.join(words[i: i + size]) for i in range(0, len(words), size)]
splitTextToTriplet("Do you know how to sing")
# ['Do you know', 'how to sing']
splitTextToTriplet("Do you know how to")
# ['Do you know', 'how to']
Although be advised that with this solution, if some of your white spaces are linebreaks, that information will be lost in the process.
I used re.findall to get the output you expected. To make the split function more generic, I replaced splitTextToTriplet with splitTextonWords, which takes numberOfWords as a parameter:
import re
def splitTextonWords(Text, numberOfWords=1):
    """Split ``Text`` into consecutive groups of ``numberOfWords`` words.

    Args:
        Text: Text to split; leading and trailing whitespace is ignored.
        numberOfWords: Maximum number of words per group; defaults to 1.

    Returns:
        A list of groups in order, each keeping the whitespace that separated
        its words in ``Text``. The last group may hold fewer words.
        ``None`` when ``numberOfWords`` is smaller than 1.
    """
    if numberOfWords < 1:
        return None
    # One word, then up to numberOfWords - 1 further words each preceded by
    # whitespace. Requiring whitespace between words keeps a word from being
    # split in two and lets a trailing one-character word form its own group.
    pattern = r'\S+(?:\s+\S+){0,' + str(numberOfWords - 1) + '}'
    return re.findall(pattern, Text)
print(splitTextonWords("Do you know how to sing", 3))
print(splitTextonWords("Do you know how to", 3))
print(splitTextonWords("Do you know how to sing how to dance how to", 3))
print(splitTextonWords("A sentence this code will fail at", 3))
print(splitTextonWords("A sentence this code will fail at ", 3))
print(splitTextonWords(" A sentence this code will fail at s", 3))
print(splitTextonWords(" A sentence this code will fail at s", 4))
print(splitTextonWords(" A sentence this code will fail at s", 2))
print(splitTextonWords(" A sentence this code will fail at s", 1))
print(splitTextonWords(" A sentence this code will fail at s", 0))
output:
['Do you know', 'how to sing']
['Do you know', 'how to']
['Do you know', 'how to sing', 'how to dance', 'how to']
['A sentence this', 'code will fail', 'at']
['A sentence this', 'code will fail', 'at']
['A sentence this', 'code will fail', 'at s']
['A sentence this code', 'will fail at s']
['A sentence', 'this code', 'will fail', 'at s']
['A', 'sentence', 'this', 'code', 'will', 'fail', 'at', 's']
None
Using the grouper itertools recipe:
import itertools
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx

    Args:
        iterable: Source of items.
        n: Number of items per chunk.
        fillvalue: Value used to pad the last chunk when it is incomplete.

    Returns:
        An iterator of ``n``-tuples.
    """
    # n references to ONE iterator: zip_longest takes an item from it n
    # times per output tuple, which cuts the input into consecutive chunks.
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)
See also the more_itertools third-party library that implements this recipe for you.
Code
def split_text_to_triplet(s):
    """Return strings of three words.

    The last string holds fewer than three words when the number of words
    in ``s`` is not a multiple of three.
    """
    # grouper() takes the iterable first and the chunk size second. It pads
    # the last chunk with None, which must be dropped before joining.
    return [
        " ".join(word for word in chunk if word is not None)
        for chunk in grouper(s.split(), 3)
    ]
split_text_to_triplet("Do you know how to sing")
# ['Do you know', 'how to sing']

Python - Iterate through a list of strings and group partial matching strings

So I have a list of strings as below:
list = ["I love cat", "I love dog", "I love fish", "I hate banana", "I hate apple", "I hate orange"]
How do I iterate through the list and group partially matching strings without being given keywords? The result should look like this:
list 1 = [["I love cat","I love dog","I love fish"],["I hate banana","I hate apple","I hate orange"]]
Thank you so much.
Sequence matcher will do the task for you. Tune the score ratio for better results.
Try this:
from difflib import SequenceMatcher
sentence_list = ["I love cat", "I love dog", "I love fish", "I hate banana", "I hate apple", "I hate orange"]


def group_by_similarity(sentences, threshold=0.5):
    """Group sentences by their similarity to the first sentence of a group.

    Each sentence is compared, using ``SequenceMatcher.ratio()``, with the
    first member of every existing group in order and joins the first group
    whose ratio is at least ``threshold``. If no group matches, the sentence
    starts a new group.

    Args:
        sentences: Iterable of strings to group.
        threshold: Minimum ratio for a sentence to join a group.

    Returns:
        A list of groups (lists of strings) in order of first appearance.
    """
    groups = []
    for sentence in sentences:
        for group in groups:
            score = SequenceMatcher(None, sentence, group[0]).ratio()
            if score >= threshold:
                # An exact repeat of the group's first sentence is recognised
                # as belonging here but is not stored a second time.
                if score != 1:
                    group.append(sentence)
                # Stop at the first match so the sentence cannot also open a
                # new group or be added to a later one.
                break
        else:
            groups.append([sentence])
    return groups


result = group_by_similarity(sentence_list)
Output:
[['I love cat', 'I love dog', 'I love fish'], ['I hate banana', 'I hate apple', 'I hate orange']]
Try building an inverse index, and then you can pick whichever keywords you like. This approach ignores word order:
index = {}
for sentence in sentence_list:
for word in set(sentence.split()):
index.setdefault(word, set()).add(sentence)
Or this approach, which keys the index by all possible full-word phrase prefixes:
index = {}
for sentence in sentence_list:
    number_of_words = len(sentence.split())
    # Cutting i words off the right leaves a prefix of number_of_words - i
    # words, so every proper full-word prefix becomes a key.
    for i in range(1, number_of_words):
        key_phrase = sentence.rsplit(maxsplit=i)[0]
        index.setdefault(key_phrase, set()).add(sentence)
And then if you want to find all of the sentences that contain a keyword (or start with a phrase, if that's your index):
match_sentences = index[key_term]
Or a given set of keywords:
from functools import reduce  # reduce is not a builtin on Python 3

# reduce(function, iterable, initial): start from the sentences of the first
# keyword and intersect with the sentences of each remaining keyword.
matching_sentences = reduce(lambda found, keyword: found & index[keyword], list_of_keywords[1:], index[list_of_keywords[0]])
Now you can generate a list grouped by pretty much any combination of terms or phrases by building a list comprehension using those indices to generate sentences. E.g., if you built the phrase prefix index and want everything grouped by the first two word phrase:
return [list(index[k]) for k in index if len(k.split()) == 2]
You can try this approach. Although it is not the best approach, it is helpful for understanding the problem in a more methodical way.
from itertools import groupby
my_list = ["I love cat","I love dog","I love fish","I hate banana","I hate apple","I hate orange"]
# groupby() only merges neighbouring items, so sort the word lists first to
# put sentences with the same leading words next to each other.
each_word = sorted([x.split() for x in my_list])
# I assumed the keywords would be everything except the last word
grouped = [list(value) for key, value in groupby(each_word, lambda x: x[:-1])]
# Turn every word list back into a sentence, keeping the grouping.
result = [[" ".join(words) for words in group] for group in grouped]
print(result)
Output:
[['I hate apple', 'I hate banana', 'I hate orange'], ['I love cat', 'I love dog', 'I love fish']]
Avoid words like list in naming your variables. Also list 1 is not a valid python variable.
Try this:
import sys
from itertools import groupby
#Assuming you group by the first two words in each string, e.g. 'I love', 'I hate'.
L = ["I love cat", "I love dog", "I love fish", "I hate banana", "I hate apple", "I hate orange"]
# groupby() only merges neighbouring items, so the list must be sorted first.
L = sorted(L)
# Slicing [:2] instead of indexing [0] and [1] keeps the key function from
# raising IndexError on a string that contains a single word.
result = [list(group) for key, group in groupby(L, lambda x: ' '.join(x.split(' ')[:2]))]
print(result)

Refactor python: replace list of words in list of strings

Still getting my head around python, I wonder if this function could be improved either in performance or readability?
def multi_replace_words(sentences, words, replace_str):
    """Replace all words in the sentences list with replace_str

    A word is replaced only where it stands between word boundaries, so
    'bad' is not replaced inside 'bady'. One punctuation character out of
    ; " , . that directly follows a replaced word is removed with it.

    ex. multi_replace_words(['bad a list', 'og bad', 'in bady there bad2', 'another one', 'and bad. two'], ['bad', 'bad2'], 'EX')
    >> ['EX a list', 'og EX', 'in bady there EX', 'another one', 'and EX two']

    Args:
        sentences: Iterable of strings to process.
        words: Words to replace; they are matched literally.
        replace_str: Replacement text, passed to ``re.sub`` as the
            replacement template.

    Returns:
        A new list with one processed string per input sentence.
    """
    if not words:
        # An empty alternation would match at every word boundary.
        return list(sentences)
    # Longest first, so a word that extends another one is tried before it.
    ordered = sorted(words, key=len, reverse=True)
    # re.escape() keeps characters such as "." or "+" in a word literal.
    alternatives = "|".join(re.escape(word) for word in ordered)
    # Compile once and reuse for every sentence.
    pattern = re.compile(r'\b(?:' + alternatives + r')\b[;",.]?')
    return [pattern.sub(replace_str, doc) for doc in sentences]
Thanks :)
Something like this:
In [86]: def func(lis,a,b):
strs= "|".join("({0}{1}{2})".format(r'\b',x,r'\b[;",.]?') for x in a)
for x in lis:
yield re.sub(strs,b,x)
....:
In [87]: lis
Out[87]: ['bad a list', 'og bad', 'in bady there bad2', 'another one', 'and bad. two']
In [88]: rep=['bad','bad2']
In [89]: st="EX"
In [90]: list(func(lis,rep,st))
Out[90]: ['EX a list', 'og EX', 'in bady there EX', 'another one', 'and EX two']
In [91]: rep=['in','two','a']
In [92]: list(func(lis,rep,st))
Out[92]: ['bad EX list', 'og bad', 'EX bady there bad2', 'another one', 'and bad. EX']
You could try to use replace(). It acts on a string and replaces all instances of a series of characters with another. An example from here shows how replace acts.
#!/usr/bin/python
# Named "text" so the builtin str type is not shadowed.
text = "this is string example....wow!!! this is really string"
# Without a count, every occurrence of "is" is replaced, also inside "this".
print(text.replace("is", "was"))
# The optional third argument limits the number of replacements, left to right.
print(text.replace("is", "was", 3))

Categories