Get certain items from text by position, for loop, python - python

I'm trying to replicate something similar to dictionary like here: https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2
The code that is used there is
# Maps a section type to the slice of the raw filing text that holds it.
document = {}
# Create a loop to go through each section type and save only the 10-K section in the dictionary.
# doc_types, doc_start_is and doc_end_is are walked in lockstep, so entry k of
# each list describes the same section.
for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
    if doc_type == '10-K':
        document[doc_type] = raw_10k[doc_start:doc_end]
I'm trying to create a for loop.
I have a list of types and positions (item text starts/ends) like this:
zl = [[('Item1', 1263, 42004),
('Item2', 42026, 652819),
('Item3', 652841, 697154),
('Item4', 697176, 705235),
('Item5', 705257, 2378296)],
[('Item1', 1195, 21386),
('Item3', 21408, 268339),
('Other', 268361, 290688)],
[('Item1', 1195, 27776),
('Item2', 27798, 323951),
('Item5', 323973, 348032)]]
My loop:
final = []
for text in listoftexts:
for i in zl:
document = {}
for doc_type, doc_start, doc_end in i:
if doc_type == 'Item2':
document[doc_type] = text[doc_start:doc_end]
final.append(document)
The problem is that it seems to correctly extract only the very first text (doc_start = 42026, doc_end = 652819). All further texts (final[2], final[3]...) are not extracted correctly and seem random.
I'm not sure which part of the loop is incorrect.

Hopefully I understand your problem fully
For this snippet:
final = []
for text in listoftexts:
for i in zl:
document = {}
for doc_type, doc_start, doc_end in i:
if doc_type == 'Item2':
document[doc_type] = text[doc_start:doc_end]
final.append(document)
Indent the final.append(document) line one level further, so that it sits inside the if block and a document is appended to final only when an 'Item2' entry is found in the current sublist of zl.
final = []
# Pair each text with its own list of (type, start, end) entries. Nesting the
# two loops instead would apply every position list to every text, which
# slices most texts with offsets that belong to a different document.
# Assumes zl[k] holds the positions for listoftexts[k].
for text, positions in zip(listoftexts, zl):
    document = {}
    for doc_type, doc_start, doc_end in positions:
        if doc_type == 'Item2':
            document[doc_type] = text[doc_start:doc_end]
            # Append inside the if, so texts without an 'Item2' section do
            # not contribute an empty dict to the result.
            final.append(document)
With this code you will only extract two texts since "item2" is only in your list zl twice.

Related

How to merge common strings with different values between parenthesis in Python

I am processing some strings within lists that look like these:
['COLOR INCLUDES (40)', 'LONG_DESCRIPTION CONTAINS ("BLACK")', 'COLOR INCLUDES (38)']
['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)', 'COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)', 'COLOR INCLUDES (800)']
Thing is, I want to merge similar strings with their values into one, for each list. Expecting something like this:
['COLOR INCLUDES (40,38)', 'LONG_DESCRIPTION CONTAINS ("BLACK")']
['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)']
And some strings may have values without ():
['FAMILY EQUALS 1145']
What could be the more pythonic and fastest (lazy :P) way of doing this?
I have tried using regex to match strings until a "(" appears, but some strings don't have values between (), and can't find a fitting solution.
I have also tried the STree class from the suffix_trees library, which finds the LCS (Longest Common Substring) of a list of strings, but then I ran out of ideas about how to handle the values and the closing parenthesis:
from suffix_trees import STree
st = STree.STree(['COLOR INCLUDES(30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)',
'COLOR INCLUDES(30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)', 'COLOR INCLUDES (800)'])
st.lcs()
out: 'COLOR INCLUDES ('
EDIT: SOLVED
As @stef said in the answer, I broke the problem into smaller pieces and solved it with his help. Let me paste the class Rule_process and the result here:
class Rule_process:
    """Group rule strings by their trailing flag and merge values per rule name.

    ``rules`` holds ``"<rule>:<flag>"`` entries separated by ``"|||"``.
    ``rules_dict`` maps each flag (0, 1, 2, 4) to a list of rule strings, or
    to ``None`` while nothing has been stored under that flag.

    Typical use is ``append_rules()`` followed by ``split_rule()``; the final
    result is then available in ``rules_dict``.
    """

    # Last character of an entry that selects the rules_dict key.
    _FLAGS = ("0", "1", "2", "4")

    def __init__(self):
        self.rules = '(COLOR INCLUDES (40)) OR (LONG_DESCRIPTION CONTAINS ("BLACK")):1|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839):0|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839):0|||COLOR INCLUDES (40):1|||COLOR INCLUDES (800):0'
        self.rules_dict = {
            0: None,
            1: None,
            2: None,
            4: None,
        }

    @staticmethod
    def _clean_clause(clause):
        """Collapse every "))" to ")" and drop one leading "(" from *clause*."""
        clause = clause.replace("))", ")")
        return clause[1:] if clause.startswith("(") else clause

    @staticmethod
    def _parse_rule(rule):
        """Return ``(name, values)`` for one cleaned rule string.

        ``'COLOR INCLUDES (30,31)'``              -> ``('COLOR INCLUDES', [30, 31])``
        ``'LONG_DESCRIPTION CONTAINS ("BLACK")'`` -> ``('LONG_DESCRIPTION CONTAINS', ['"BLACK"'])``
        ``'FAMILY EQUALS 1145'``                  -> ``('FAMILY EQUALS', [1145])``

        Raises IndexError when the rule has neither a parenthesised value
        list nor any whitespace to split on.
        """
        stripped = rule.rstrip()
        if stripped.endswith(")") and "(" in stripped:
            # Split at the first "(" so the value list may contain spaces and
            # need not be preceded by one.
            name, _, raw_values = stripped.partition("(")
            name = name.rstrip()
        else:
            parts = rule.rsplit(maxsplit=1)
            name, raw_values = parts[0], parts[1]
        raw_values = raw_values.replace("(", "").replace(")", "")
        try:
            return name, [int(value) for value in raw_values.split(",")]
        except ValueError:
            # Not a list of integers: keep the text as one opaque value.
            return name, [raw_values]

    @staticmethod
    def _format_values(values):
        """Render integers as a list literal, anything else comma-separated."""
        if all(isinstance(value, int) for value in values):
            return str(values)
        return ", ".join(str(value) for value in values)

    def append_rules(self):
        """Distribute ``self.rules`` into ``self.rules_dict`` and return it.

        Each entry is filed under the key named by its last character, with
        the trailing ``":<flag>"`` removed; entries with any other ending are
        ignored. Every stored rule is then split on ``" OR "``, each clause is
        cleaned of its wrapping parentheses, and duplicates are dropped while
        keeping first-seen order.
        """
        grouped = {}
        for entry in self.rules.split("|||"):
            flag = entry[-1:]  # slice, so an empty entry does not raise
            if flag in self._FLAGS:
                grouped.setdefault(int(flag), []).append(entry[:-2])
        # Only flags that received entries overwrite what is already stored.
        for key, rules in grouped.items():
            self.rules_dict[key] = rules

        for key, rules in self.rules_dict.items():
            if rules is None:
                continue
            # Build a fresh list rather than editing the one being walked.
            clauses = []
            for rule in rules:
                for part in rule.split(" OR "):
                    clauses.append(self._clean_clause(part))
            self.rules_dict[key] = list(dict.fromkeys(clauses))
        return self.rules_dict

    def split_rule(self):
        """Merge the values of rules that share a name, per key.

        Replaces ``self.rules_dict`` with a dict of the same keys whose values
        are lists of ``"<name> <values>"`` strings, or ``None`` for keys that
        hold no rules. Integer values are rendered as a list literal such as
        ``[30, 31]``; other values are rendered as their text. Values are
        de-duplicated in first-seen order.
        """
        merged_rules = {}
        for key, rules in self.rules_dict.items():
            if not rules:
                merged_rules[key] = None
                continue
            values_by_name = {}
            for rule in rules:
                name, values = self._parse_rule(rule)
                values_by_name.setdefault(name, []).extend(values)
            merged_rules[key] = [
                name + " " + self._format_values(list(dict.fromkeys(values)))
                for name, values in values_by_name.items()
            ]
        self.rules_dict = merged_rules
And the result:
process = Rule_process()
process.append_rules()
process.split_rule()
process.rules_dict
OUT:
{0: ['COLOR INCLUDES [32, 33, 800, 99, 833, 838, 839, 74, 84, 85, 30, 823, 184, 409, 56, 93, 830, 31]'],
1: ['COLOR INCLUDES [40]', 'LONG_DESCRIPTION CONTAINS "BLACK"'],
2: None,
4: None}
Split this task into smaller, simpler tasks.
First task:
Write a function that takes a string and returns a pair (name, list_of_values) where name is the first part of the string and list_of_values is a python list of integers.
Hint: You can use '(' in s to test whether string s contains an opening parenthesis; you can use s.split() to split on whitespace or s.rsplit(maxsplit=1) to only split on the last whitespace; s.split('(') to split on opening parenthesis; and s.split(',') to split on comma.
Second task:
Write a function that takes a list of pairs (name, list_of_values) and merges the lists when the names are equal.
Hint: This is extremely easy in python using a dict with name as key and list_of_values as value. You can use if name in d: ... else: to test whether a name is already in the dict or not; or you can use d.get(name, []) or d.setdefault(name, []) to automatically add a name: [] entry in the dict when name is not already in the dict.
Third task:
Write a function to convert back, from the pairs (name, list_of_values) to the strings "name (value1, value2, ...)". This task is easier than the first task, so I suggest doing it first.
Hint: ' '.join(...) and ','.join(...) can both be useful.

How to generate a dictionary dynamically from a list in python?

I want to run a script which grabs all the titles of the files in a folder and collects them in a dictionary. I want the output structured like this:
{
1: {"title": "one"},
2: {"title": "two"},
...
}
I have tried the following, but how to add the "title"-part and make the dictionary dynamically?
from os import walk
mypath = '/Volumes/yahiaAmin-1'
filenames = next(walk(mypath), (None, None, []))[2] # [] if no file
courseData = {}
for index, x in enumerate(filenames):
# print(index, x)
# courseData[index]["title"].append(x)
# courseData[index].["tlt"].append(x)
courseData.setdefault(index).append(x)
print(courseData)
Assign the value dict directly to the index
courseData = {}
filenames = ["one", "two"]
# Number the entries from 1 and give each index its own {"title": ...} dict.
for index, x in enumerate(filenames, 1):
    courseData[index] = {"title": x}
print(courseData)
# {1: {'title': 'one'}, 2: {'title': 'two'}}
Note that using a dict whose keys are incrementing integers is generally pointless, as a list will do the same job.

Fill tables in a template Word with Python (DocxTemplate, Jinja2)

I am trying to fill a table in a Word document from Python with DocxTemplate, and I have some trouble doing it properly. I want to use 2 dictionaries to fill the data of 1 table, shown in the figure below.
Table to fill
The 2 dictionaries are filled in a loop, and I render the template document at the end.
The input used to build my dictionaries is a database extraction written in SQL.
My main issue arises when I want to fill the table with the data held in the 2 different dictionaries.
In the code below I give, as an example, the 2 dictionaries with values in them.
# -*- coding: utf8 -*-
#
#
from docxtpl import DocxTemplate
if __name__ == "__main__":
document = DocxTemplate("template.docx")
DicoOccuTable = {'`num_carnet_adresses`': '`annuaire_telephonique`\n`carnet_adresses`\n`carnet_adresses_complement',
'`num_eleve`': '`CFA_apprentissage_ctrl_coherence`\n`CFA_apprentissage_ctrl_examen`}
DicoChamp = {'`num_carnet_adresses`': 72, '`num_eleve`': 66}
template_values = {}
#
template_values["keys"] = [[{"name":cle, "occu":val} for cle,val in DicoChamp.items()],
[{"table":vals} for cles,vals in DicoOccuTable.items()]]
#
document.render(template_values)
document.save('output/' + nomTable.replace('`','') + '.docx')
As a result the two lines for the table are created but nothing is written within...
I would like to add that I have only been working with Python for 1 week, so I feel that I am not handling the different objects properly here.
If you have any suggestion to help me, I would appreciate it !
I put here the loop to create the dictionnaries, it may help you to understand why I coded it wrong :)
for c in ChampList:
with open("db_reference.sql", "r") as f:
listTable = []
line = f.readlines()
for l in line:
if 'CREATE TABLE' in l:
begin = True
linecreateTable = l
x = linecreateTable.split()
nomTable = x[2]
elif c in l and begin == True:
listTable.append(nomTable)
elif ') ENGINE=MyISAM DEFAULT CHARSET=latin1;' in l:
begin = False
nbreOccu=len(listTable)
Tables = "\n".join(listTable)
DicoChamp.update({c:nbreOccu})
DicoOccuTable.update({c:Tables})
# DicoChamp = {c:nbreOccu}
template_values = {}
Thank You very much !
Finally I found a solution for this problem. Here it is.
Instead of using 2 dictionaries, I created 1 dictionary with this structure:
Dico = { Champ : [Occu , Tables] }
The full code for creating the table is detailed below :
from docxtpl import DocxTemplate

document = DocxTemplate("template.docx")
template_values = {}
# Maps a field name (backticks removed) to [occurrence count, table names].
Context = {}

# Read the SQL dump once; its lines are the same for every field.
with open("db_reference.sql", "r") as g:
    ligne = g.readlines()

for c in ChampList:
    listTable = []
    OccuTables = []
    # No CREATE TABLE line seen yet, so a field match must not be counted.
    begin = False
    for li in ligne:
        if 'CREATE TABLE' in li:
            begin = True
            # The table name is the third whitespace-separated token.
            nomTable2 = li.split()[2]
        elif c in li and begin:
            listTable.append(nomTable2)
        elif ') ENGINE=MyISAM DEFAULT CHARSET=latin1;' in li:
            begin = False
        # NOTE(review): the snippet showed '#OLD_COLLATION_CONNECTION'. MySQL
        # user variables are written with '@', so '#' looks like a
        # transcription artifact - confirm against the actual dump footer.
        elif '/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;' in li:
            # This footer line is the trigger for recording the field.
            nbreOccu = len(listTable)
            inter = "\n".join(listTable)
            OccuTables.append(nbreOccu)
            OccuTables.append(inter)
            ChampNumPropre = c.replace('`', '')
            Context.update({ChampNumPropre: OccuTables})

# One template row per field: "label" is the field name, "cols" is
# [occurrence count, newline-joined table names].
template_values["keys"] = [{"label": cle, "cols": val} for cle, val in Context.items()]
#
document.render(template_values)
# NOTE(review): nomTable is not assigned anywhere in this snippet (the loop
# uses nomTable2) - confirm it is defined earlier in the full script.
document.save('output/' + nomTable.replace('`', '') + '.docx')
And I used a table with the following structure :
I hope you will find your answers here and good luck !

Aggregating values in one column by their corresponding value in another from two files

I have a question about summing the multiple values of duplicate keys into a single key holding the aggregate total. For example:
1:5
2:4
3:2
1:4
Very basic but I'm looking for an output that looks like:
1:9
2:4
3:2
In the two files I am using, I am dealing with a list of 51 users(column 1 of user_artists.dat) who have the artistID(column 2) and how many times that user has listened to that particular artist given by the weight(column 3).
I am attempting to aggregate the total times that artist has been played, across all users and display it in a format such as:
Britney Spears (289) 2393140. Any help or input would be so appreciated.
import codecs
#from collections import defaultdict
with codecs.open("artists.dat", encoding = "utf-8") as f:
artists = f.readlines()
with codecs.open("user_artists.dat", encoding = "utf-8") as f:
users = f.readlines()
artist_list = [x.strip().split('\t') for x in artists][1:]
user_stats_list = [x.strip().split('\t') for x in users][1:]
artists = {}
for a in artist_list:
artistID, name = a[0], a[1]
artists[artistID] = name
grouped_user_stats = {}
for u in user_stats_list:
userID, artistID, weight = u
grouped_user_stats[artistID] = grouped_user_stats[artistID].astype(int)
grouped_user_stats[weight] = grouped_user_stats[weight].astype(int)
for artistID, weight in u:
grouped_user_stats.groupby('artistID')['weight'].sum()
print(grouped_user_stats.groupby('artistID')['weight'].sum())
#if userID not in grouped_user_stats:
#grouped_user_stats[userID] = { artistID: {'name': artists[artistID], 'plays': 1} }
#else:
#if artistID not in grouped_user_stats[userID]:
#grouped_user_stats[userID][artistID] = {'name': artists[artistID], 'plays': 1}
#else:
#grouped_user_stats[userID][artistID]['plays'] += 1
#print('this never happens')
#print(grouped_user_stats)
how about:
import codecs
from collections import defaultdict

# read stuff
with codecs.open("artists.dat", encoding="utf-8") as f:
    artists = f.readlines()
with codecs.open("user_artists.dat", encoding="utf-8") as f:
    users = f.readlines()

# transform artist data in a dict with "artist id" as key and "artist name" as value
# (the first line of each file is skipped, presumably a header row)
artist_repo = dict(x.strip().split('\t')[:2] for x in artists[1:])
user_stats_list = [x.strip().split('\t') for x in users][1:]

grouped_user_stats = defaultdict(int)
for u in user_stats_list:
    # userID, artistID, weight = u
    # Key on the artist id (column 1), not the user id (column 0), so the
    # weights are summed per artist across all users.
    grouped_user_stats[u[1]] += int(u[2])

# extra: "fancying" the data transforming the keys of the dict in "<artist name> (artist id)" format
grouped_user_stats = dict(
    ("%s (%s)" % (artist_repo.get(k, "Unknown artist"), k), v)
    for k, v in grouped_user_stats.items()
)

# lastly print it
for k, v in grouped_user_stats.items():
    print("%s %s" % (k, v))

Learning Python: Store values in dict from stdout

How can I do the following in Python:
I have a command output that outputs this:
Datexxxx
Clientxxx
Timexxx
Datexxxx
Client2xxx
Timexxx
Datexxxx
Client3xxx
Timexxx
And I want to work this in a dict like:
Client:(date,time), Client2:(date,time) ...
After reading the data into a string subject, you could do this:
import re

# Maps the text captured on each Client line to a (date, time) tuple.
d = {}
# The pattern must be a raw string: in a normal string "\r" and "\n" turn into
# real whitespace characters, which verbose mode (?x) then ignores, so the
# pattern could never match across lines.
for match in re.finditer(
        r"""(?mx)
        ^Date(.*)\r?\n
        Client\d*(.*)\r?\n
        Time(.*)""",
        subject):
    # group(1) = date text, group(2) = client text, group(3) = time text
    d[match.group(2)] = (match.group(1), match.group(3))
How about something like:
# Maps each Client line to its (Date line, Time line) pair.
rows = {}
thisrow = []
for line in output.split('\n'):
    if line.lower().startswith(('date', 'client', 'time')):
        thisrow.append(line)
    elif line.strip() == '':
        # A blank line ends the current record; drop it if it is incomplete.
        thisrow = []
    if len(thisrow) == 3:
        # Store the record as soon as it is complete, so records need not be
        # separated by blank lines or followed by a trailing newline.
        # Assumes the lines arrive in Date, Client, Time order.
        rows[thisrow[1]] = (thisrow[0], thisrow[2])
        thisrow = []
print(rows)
Assumes a trailing newline, no spaces before lines, etc.
What about using a dict with tuples?
Create a dictionary and add the entries:
# NOTE: the name `dict` shadows the built-in type for the rest of the scope;
# a different name (for example `clients`) avoids that.
dict = {}
# Each key is a client name and each value a (date, time) tuple.
dict['Client'] = ('date1','time1')
dict['Client2'] = ('date2','time2')
Accessing the entries:
dict['Client']
>>> ('date1','time1')

Categories