Python - grouping words into sets of 3 - python

I'm trying to create a multidimensional array that contains the words in a string - the word before that word (unless at the beginning of the string, blank), the word, and the following word (unless at the end of the string, blank)
I've tried the following code:
def parse_group_words(text):
groups = []
words = re_sub("[^\w]", " ", text).split()
number_words = len(words)
for i in xrange(number_words):
print i
if i == 0:
groups[i][0] = ""
groups[i][1] = words[i]
groups[i][2] = words[i+1]
if i > 0 and i != number_words:
groups[i][0] = words[i-1]
groups[i][1] = words[i]
groups[i][2] = words[i+1]
if i == number_words:
groups[i][0] = words[i-1]
groups[i][1] = words[i]
groups[i][2] = ""
print groups
print parse_group_words("this is an example of text are you ready")
But I'm getting:
0
Traceback (most recent call last):
File "/home/akf/program.py", line 82, in <module>
print parse_group_words("this is an example of text are you ready")
File "/home/akf/program.py", line 69, in parse_group_words
groups[i][0] = ""
IndexError: list index out of range
Any idea how to fix this?

Here's a generic way to do it for arbitrary sized windows, using Python collections and itertools:
import re
import collections
import itertools
def window(seq, n=3):
d = collections.deque(maxlen=n)
for x in itertools.chain(('', ), seq, ('', )):
d.append(x)
if len(d) >= n:
yield tuple(d)
def windows(text, n=3):
return list(window((x.group() for x in re.finditer(r'\w+', text)), n=n))

What about...:
import itertools, re
def parse_group_words(text):
groups = []
words = re.finditer(r'\w+', text)
prv, cur, nxt = itertools.tee(words, 3)
next(cur); next(nxt); next(nxt)
for previous, current, thenext in itertools.izip(prv, cur, nxt):
# in Py 3, use `zip` in lieu of itertools.izip
groups.append([previous.group(0), current.group(0), thenext.group(0)])
print(groups)
parse_group_words('tanto va la gatta al lardo che ci lascia')
This is almost what you require -- it emits:
[['tanto', 'va', 'la'], ['va', 'la', 'gatta'], ['la', 'gatta', 'al'], ['gatta', 'al', 'lardo'], ['al', 'lardo', 'che'], ['lardo', 'che', 'ci'], ['che', 'ci', 'lascia']]
...missing the last-required group ['ci', 'lascia', ''].
To fix it, just before the print, you could add:
groups.append([groups[-1][1], groups[-1][2], ''])
This feels like a midly-unsavory hack -- I can't easily find an elegant way to have this last group "just emerge" from the general logic of the rest of the function.

Related

Python retrieving data from a block of lines containing specific characters and appending relevant data into separate lines

I am trying to create a program which selects specific information from a bulk paste, extract the relevant information and then proceed to paste said information into lines.
Here is some example data;
1. Track1 03:01
VOC:PersonA
LYR:LyrcistA
COM:ComposerA
ARR:ArrangerA
ARR:ArrangerB
2. Track2 04:18
VOC:PersonB
VOC:PersonC
LYR:LyrcistA
LYR:LyrcistC
COM:ComposerA
ARR:ArrangerA
I would like to have the output where the relevant data for the Track1 is grouped together in a single line, with semicolon joining identical information and " - " seperating between others.
LyrcistA - ComposerA - ArrangerA; ArrangerB
LyrcistA; LyrcistC - ComposerA - ArrangerA
I have not gotten very far despite my best efforts
while True:
YodobashiData = input("")
SplitData = YodobashiData.splitlines();
returns the following
['1. Track1 03:01']
['VOC:PersonA ']
['LYR:LyrcistA']
['COM:ComposerA']
['ARR:ArrangerA']
['ARR:ArrangerB']
[]
['2. Track2 04:18']
['VOC:PersonB']
['VOC:PersonC']
['LYR:LyrcistA']
['LYR:LyrcistC']
['COM:ComposerA']
['ARR:ArrangerA']
Whilst I have all the data now in separate lists, I have no idea how to identify and extract the information from the list I need from the ones I do not.
Also, it seems I need to have the while loop or else it will only return the first list and nothing else.
Here's a script that doesn't use regular expressions.
It assumes that header lines, and only the header lines, will always start with a digit, and that the overall structure of header line then credit lines is consistent. Empty lines are ignored.
Extraction and formatting of the track data are handled separately, so it's easier to change formats, or use the extracted data in other ways.
import collections
import unicodedata
data_from_question = """\
1. Track1 03:01
VOC:PersonA
LYR:LyrcistA
COM:ComposerA
ARR:ArrangerA
ARR:ArrangerB
2. Track2 04:18
VOC:PersonB
VOC:PersonC
LYR:LyrcistA
LYR:LyrcistC
COM:ComposerA
ARR:ArrangerA
"""
def prepare_data(data):
# The "colons" in the credits lines are actually
# "full width colons". Replace them (and other such characters)
# with their normal width equivalents.
# If full normalisation is undesirable then we could return
# data.replace('\N{FULLWIDTH COLON}', ':')
return unicodedata.normalize('NFKC', data)
def is_new_track(line):
return line[0].isdigit()
def parse_track_header(line):
id_, title, duration = line.split()
return {'id': id_.rstrip('.'), 'title': title, 'duration': duration}
def get_credit(line):
credit, _, name = line.partition(':')
return credit.strip(), name.strip()
def format_track_heading(track):
return 'id: {id} title: {title} length: {duration}'.format(**track)
def format_credits(track):
order = ['ARR', 'COM', 'LYR', 'VOC']
parts = ['; '.join(track[k]) for k in order]
return ' - '.join(parts)
def get_data():
# The data is expected to be a multiline string.
return data_from_question
def parse_data(data):
track = None
for line in filter(None, data.splitlines()):
if is_new_track(line):
if track:
yield track
track = collections.defaultdict(list)
header_data = parse_track_header(line)
track.update(header_data)
else:
role, name = get_credit(line)
track[role].append(name)
yield track
def report(tracks):
for track in tracks:
print(format_track_heading(track))
print(format_credits(track))
print()
def main():
data = get_data()
prepared_data = prepare_data(data)
tracks = parse_data(prepared_data)
report(tracks)
main()
Output:
id: 1 title: Track1 length: 03:01
ArrangerA; ArrangerB - ComposerA - LyrcistA - PersonA
id: 2 title: Track2 length: 04:18
ArrangerA - ComposerA - LyrcistA; LyrcistC - PersonB; PersonC
Here's another take on an answer to your question:
data = """
1. Track1 03:01
VOC:PersonA
LYR:LyrcistA
COM:ComposerA
ARR:ArrangerA
ARR:ArrangerB
2. Track2 04:18
VOC:PersonB
VOC:PersonC
LYR:LyrcistA
LYR:LyrcistC
COM:ComposerA
ARR:ArrangerA"""
import re
import collections
# Regular expression to pull apart the headline of each entry
headlinePattern = re.compile(r"(\d+)\.\s+(.*?)\s+(\d\d:\d\d)")
def main():
# break the data into lines
lines = data.strip().split("\n")
# while we have more lines...
while lines:
# The next line should be a title line
line = lines.pop(0)
m = headlinePattern.match(line)
if not m:
raise Exception("Unexpected data format")
id = m.group(1)
title = m.group(2)
length = m.group(3)
people = collections.defaultdict(list)
# Now read person lines until we hit a blank line or the end of the list
while lines:
line = lines.pop(0)
if not line:
break
# Break the line into label and name
label, name = re.split(r"\W+", line, 1)
# Add this entry to a map of lists, where the map's keys are the label and the
# map's values are all the people who had that label
people[label].append(name)
# Now we have everything for one entry in the data. Print everything we got.
print("id:", id, "title:", title, "length:", length)
print(" - ".join(["; ".join(person) for person in people.values()]))
# go on to the next entry...
main()
Result:
id: 1 title: Track1 length: 03:01
PersonA - LyrcistA - ComposerA - ArrangerA; ArrangerB
id: 2 title: Track2 length: 04:18
PersonB; PersonC - LyrcistA; LyrcistC - ComposerA - ArrangerA
You can just comment out the line that prints the headline info if you really just want the line with all of the people on it. Just replace the built in data with data = input("") if you want to read the data from a user prompt.
Assuming your data is in the format you specified in a file called tracks.txt, the following code should work:
import re
with open('tracks.txt') as fp:
tracklines = fp.read().splitlines()
def split_tracks(lines):
track = []
all_tracks = []
while True:
try:
if lines[0] != '':
track.append(lines.pop(0))
else:
all_tracks.append(track)
track = []
lines.pop(0)
except:
all_tracks.append(track)
return all_tracks
def gather_attrs(tracks):
track_attrs = []
for track in tracks:
attrs = {}
for line in track:
match = re.match('([A-Z]{3}):', line)
if match:
attr = line[:3]
val = line[4:].strip()
try:
attrs[attr].append(val)
except KeyError:
attrs[attr] = [val]
track_attrs.append(attrs)
return track_attrs
if __name__ == '__main__':
tracks = split_tracks(tracklines)
attrs = gather_attrs(tracks)
for track in attrs:
semicolons = map(lambda va: '; '.join(va), track.values())
hyphens = ' - '.join(semicolons)
print(hyphens)
The only thing you may have to change is the colon characters in your data - some of them are ASCII colons : and others are Unicode colons :, which will break the regex.
import re
list_ = data_.split('\n') # here data_ is your data
regObj = re.compile(rf'[A-Za-z]+(:|{chr(65306)})[A-Za-z]+')
l = []
pre = ''
for i in list_:
if regObj.findall(i):
if i[:3] != 'VOC':
if pre == i[:3]:
l.append('; ')
else:
l.append(' - ')
l.append(i[4:].strip())
else:
l.append(' => ')
pre = i[:3]
track_list = list(map(lambda item: item.strip(' - '), filter(lambda item: item, ''.join(l).split(' => '))))
print(track_list)
OUTPUT : list of result you want
['LyrcistA - ComposerA - ArrangerA; ArrangerB', 'LyrcistA; LyrcistC - ComposerA - ArrangerA']

Parsing blocks of text data with python itertools.groupby

I'm trying to parse a blocks of text in python 2.7 using itertools.groupby
The data has the following structure:
BEGIN IONS
TITLE=cmpd01_scan=23
RTINSECONDS=14.605
PEPMASS=694.299987792969 505975.375
CHARGE=2+
615.839727 1760.3752441406
628.788226 2857.6264648438
922.4323436 2458.0959472656
940.4432533 9105.5
END IONS
BEGIN IONS
TITLE=cmpd01_scan=24
RTINSECONDS=25.737
PEPMASS=694.299987792969 505975.375
CHARGE=2+
575.7636234 1891.1656494141
590.3553938 2133.4477539063
615.8339562 2433.4252929688
615.9032114 1784.0628662109
END IONS
I need to extract information from the line beigining with "TITLE=", "PEPMASS=","CHARGE=".
The code I'm using as follows:
import itertools
import re
data_file='Test.mgf'
def isa_group_separator(line):
return line=='END IONS\n'
regex_scan = re.compile(r'TITLE=')
regex_precmass=re.compile(r'PEPMASS=')
regex_charge=re.compile(r'CHARGE=')
with open(data_file) as f:
for (key,group) in itertools.groupby(f,isa_group_separator):
#print(key,list(group))
if not key:
precmass_match = filter(regex_precmass.search,group)
print precmass_match
scan_match= filter(regex_scan.search,group)
print scan_match
charge_match = filter(regex_charge.search,group)
print charge_match
However, the output only picks up the "PEPMASS=" line,and if 'scan_match' assignment is done before 'precmass_match', the "TITLE=" line is printed only;
> ['PEPMASS=694.299987792969 505975.375\n'] [] []
> ['PEPMASS=694.299987792969 505975.375\n'] [] []
can someone point out what I'm doing wrong here?
The reason for this is that group is an iterator and it runs only once.
Please find the modified script that does the job.
import itertools
import re
data_file='Test.mgf'
def isa_group_separator(line):
return line == 'END IONS\n'
regex_scan = re.compile(r'TITLE=')
regex_precmass = re.compile(r'PEPMASS=')
regex_charge = re.compile(r'CHARGE=')
with open(data_file) as f:
for (key, group) in itertools.groupby(f, isa_group_separator):
if not key:
g = list(group)
precmass_match = filter(regex_precmass.search, g)
print precmass_match
scan_match = filter(regex_scan.search, g)
print scan_match
charge_match = filter(regex_charge.search, g)
print charge_match
I might try to parse this way (without using groupby(
import re
file = """\
BEGIN IONS
TITLE=cmpd01_scan=23
RTINSECONDS=14.605
PEPMASS=694.299987792969 505975.375
CHARGE=2+
615.839727 1760.3752441406
628.788226 2857.6264648438
922.4323436 2458.0959472656
940.4432533 9105.5
END IONS
BEGIN IONS
TITLE=cmpd01_scan=24
RTINSECONDS=25.737
PEPMASS=694.299987792969 505975.375
CHARGE=2+
575.7636234 1891.1656494141
590.3553938 2133.4477539063
615.8339562 2433.4252929688
615.9032114 1784.0628662109
END IONS""".splitlines()
pat = re.compile(r'(TITLE|PEPMASS|CHARGE)=(.+)')
data = []
for line in file:
m = pat.match(line)
if m is not None:
if m.group(1) == 'TITLE':
data.append([])
data[-1].append(m.group(2))
print(data)
Prints:
[['cmpd01_scan=23', '694.299987792969 505975.375', '2+'], ['cmpd01_scan=24', '694.299987792969 505975.375', '2+']]

How do I raise the memory limit in python?

I have written a python (2.7) script but it use a lot of memory so I get a out of memory error. Is it possible to use memory?
My code (or the github):
from itertools import combinations
import numpy
# Find the unused members and put this in a other group
def findMembers(listIn,listMembers):
lengthlist2 = (len(listMembers)-len(listIn[0]))
group2 = [0] * lengthlist2 #making the other groups based on the length of the first group
for i in listIn:
wichRow = 0
for x in listMembers:
if not (x in i) :
group2[wichRow] = x
wichRow += 1
listIn.append(group2)
return listIn
#you give a list of members and the numbers of groups
#you get back all the possibilities of combinations
def findCombinations(listMembers,numbersOfGroups):
groupTemp = [] #list needed to save correctly all the combinations
group = [] #list needed for keep it simple
newGroup = [] #list that will be returned
for listPossibilities in combinations(listMembers,(len(listMembers)/numbersOfGroups)):
groupTemp.append(list(listPossibilities))
group.append(groupTemp) #saving all the possibilities
groupTemp = []
for k in group:
# place the unused members in group2
k = (findMembers(k,listMembers))
if numbersOfGroups > 2:
groupTemp = []
groupTemp = findCombinations(k[1],numbersOfGroups-1)
for i in groupTemp:
listTemp = []
listTemp.append(k[0])
listTemp.extend(i)
newGroup.append(listTemp)
else:
newGroup = group
return newGroup
# Calculate the happiness of the group
def findHappiness(tabel,listIn):
happiness = 0
for i in listIn:
for j in i:
for k in i:
happiness += tabel[j][k]
return happiness
def buildTabel(members): #build a random survey
tabel = numpy.random.random((members,members))
return tabel
def calculateHappiness(group):
print "Finding all the happiness: "
maxhappiness = 0
i = 0
for x in group:
happiness = findHappiness(tabel,x)
if happiness > maxhappiness:
maxhappiness = happiness
y = x
progress = int(round((((i)*1.0/(len(group)))*100.0)))
update_progress(progress)
i += 1
print "\n Best solution: ", y, " with: ", maxhappiness, " happiness"
def update_progress(progress):
print '\r[{0}] {1}%'.format('#'*(progress/5), progress),
if __name__ == "__main__":
members = 24 # members of the group
numbersOfGroups = 3
tabel = buildTabel(members) #preferences will be stored here
listMembers = (range(members)) #members of the group that need to be divided
print "Searching all the combinations..."
group = findCombinations(listMembers,numbersOfGroups) #find all the combinations (recursive)
print len(group)," combinations"
calculateHappiness(group) #calculate the most happiest group and print
the error:
Searching all the combinations...
Traceback (most recent call last):
File "main.py", line 75, in <module>
calculateHappiness(group) #calculate the most happiest group and print
File "main.py", line 38, in findCombinations
newGroup = group
MemoryError
I'm using windows 10 64bit with 6gb ram. Is it possible to use virtual ram or disk space of mine hard drive disk?

Index similar entries in Python

I have a column of data (easily imported from Google Docs thanks to gspread) that I'd like to intelligently align. I ingest entries into a dictionary. Input can include email, twitter handle or a blog URL. For example:
mike.j#gmail.com
#mikej45
j.mike#world.eu
_http://tumblr.com/mikej45
Right now, the "dumb" version is:
def NomineeCount(spreadsheet):
worksheet = spreadsheet.sheet1
nominees = worksheet.col_values(6) # F = 6
unique_nominees = {}
for c in nominees:
pattern = re.compile(r'\s+')
c = re.sub(pattern, '', c)
if unique_nominees.has_key(c) == True: # If we already have the name
unique_nominees[c] += 1
else:
unique_nominees[c] = 1
# Print out the alphabetical list of nominees with leading vote count
for w in sorted(unique_nominees.keys()):
print string.rjust(str(unique_nominees[w]), 2)+ " " + w
return nominees
What's an efficient(-ish) way to add in some smarts during the if process?
You can try with defaultdict:
from collections import defaultdict
unique_nominees = defaultdict(lambda: 0)
unique_nominees[c] += 1

Learning Python: Store values in dict from stdout

How can I do the following in Python:
I have a command output that outputs this:
Datexxxx
Clientxxx
Timexxx
Datexxxx
Client2xxx
Timexxx
Datexxxx
Client3xxx
Timexxx
And I want to work this in a dict like:
Client:(date,time), Client2:(date,time) ...
After reading the data into a string subject, you could do this:
import re
d = {}
for match in re.finditer(
"""(?mx)
^Date(.*)\r?\n
Client\d*(.*)\r?\n
Time(.*)""",
subject):
d[match.group(2)] = (match.group(1), match.group(2))
How about something like:
rows = {}
thisrow = []
for line in output.split('\n'):
if line[:4].lower() == 'date':
thisrow.append(line)
elif line[:6].lower() == 'client':
thisrow.append(line)
elif line[:4].lower() == 'time':
thisrow.append(line)
elif line.strip() == '':
rows[thisrow[1]] = (thisrow[0], thisrow[2])
thisrow = []
print rows
Assumes a trailing newline, no spaces before lines, etc.
What about using a dict with tuples?
Create a dictionary and add the entries:
dict = {}
dict['Client'] = ('date1','time1')
dict['Client2'] = ('date2','time2')
Accessing the entires:
dict['Client']
>>> ('date1','time1')

Categories