How to merge common strings with different values between parenthesis in Python - python

I am processing some strings within lists that look like these:
['COLOR INCLUDES (40)', 'LONG_DESCRIPTION CONTAINS ("BLACK")', 'COLOR INCLUDES (38)']
['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)', 'COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)', 'COLOR INCLUDES (800)']
Thing is, I want to merge similar strings with their values into one, for each list. Expecting something like this:
['COLOR INCLUDES (40,38)', 'LONG_DESCRIPTION CONTAINS ("BLACK")']
['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)']
And some strings may have values without ():
['FAMILY EQUALS 1145']
What would be the most Pythonic and fastest (lazy :P) way of doing this?
I have tried using regex to match strings until a "(" appears, but some strings don't have values between (), and can't find a fitting solution.
I have also tried the STree class from the suffix_trees library, which finds the LCS (Longest Common Substring) of a list of strings, but then I ran out of ideas about how to handle the values and the closing parenthesis:
from suffix_trees import STree
st = STree.STree(['COLOR INCLUDES(30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)',
'COLOR INCLUDES(30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)', 'COLOR INCLUDES (800)'])
st.lcs()
out: 'COLOR INCLUDES ('
EDIT: SOLVED
As @stef suggested in the answer, I broke the problem into smaller pieces and solved it with his help. Here are the class Rule_process and the result:
class Rule_process:
    """Group tagged rules by key and merge the values of rules sharing a name.

    ``self.rules`` is one string holding rules separated by ``|||``.  Every
    rule ends with ``:<digit>``; the digit selects the bucket of
    ``self.rules_dict`` the rule belongs to.  Call :meth:`append_rules`
    first, then :meth:`split_rule`.
    """

    def __init__(self):
        self.rules = '(COLOR INCLUDES (40)) OR (LONG_DESCRIPTION CONTAINS ("BLACK")):1|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839):0|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839):0|||COLOR INCLUDES (40):1|||COLOR INCLUDES (800):0'
        # One bucket per supported tag; None means "no rule for this tag".
        self.rules_dict = {
            0: None,
            1: None,
            2: None,
            4: None,
        }

    def append_rules(self):
        """Fill ``self.rules_dict`` with the cleaned, de-duplicated rules.

        Each rule is filed under the key named by its last character (rules
        with any other last character are ignored), ``" OR "`` compounds are
        split into their parts, and the wrapping parentheses of each part are
        removed.  Duplicates are dropped while keeping first-seen order.

        Returns:
            ``self.rules_dict``; keys without any rule keep their old value.
        """
        buckets = {key: [] for key in self.rules_dict}
        tag_to_key = {str(key): key for key in buckets}
        for rule in self.rules.split("|||"):
            # rule[-1:] (not rule[-1]) so an empty chunk is skipped, not an error.
            key = tag_to_key.get(rule[-1:])
            if key is not None:
                buckets[key].append(rule[:-2])  # drop the trailing ":<digit>"
        for key, bucket in buckets.items():
            if bucket:
                self.rules_dict[key] = bucket

        for key in list(self.rules_dict):
            bucket = self.rules_dict[key]
            if bucket is None:
                continue
            # Build a fresh list instead of removing/extending the list that
            # is being indexed: the old in-place edits could skip rules or
            # raise IndexError once de-duplication shortened the list.
            cleaned = []
            for rule in bucket:
                for part in rule.split(' OR '):
                    # "(NAME (v))" -> "NAME (v)": collapse the doubled closing
                    # parenthesis, then drop the leading wrapper parenthesis.
                    part = part.replace("))", ")")
                    if part.startswith("("):
                        part = part[1:]
                    cleaned.append(part)
            self.rules_dict[key] = list(dict.fromkeys(cleaned))
        return self.rules_dict

    @staticmethod
    def _parse_rule(rule):
        """Split one rule into ``(name, values)``.

        ``'COLOR INCLUDES (30,31)'`` gives ``('COLOR INCLUDES', [30, 31])``.
        ``'LONG_DESCRIPTION CONTAINS ("BLACK")'`` gives
        ``('LONG_DESCRIPTION CONTAINS', ['"BLACK"'])``: a value that is not a
        comma-separated list of integers is kept as one string item.
        ``'FAMILY EQUALS 1145'`` (no parentheses) gives
        ``('FAMILY EQUALS', [1145])``.
        """
        if "(" in rule:
            name, _, value_text = rule.partition("(")
        else:
            pieces = rule.rsplit(maxsplit=1)
            name = pieces[0] if pieces else rule
            value_text = pieces[1] if len(pieces) == 2 else ""
        name = name.strip()
        value_text = value_text.replace("(", "").replace(")", "").strip()
        if not value_text:
            return name, []
        try:
            return name, [int(item) for item in value_text.split(",")]
        except ValueError:
            return name, [value_text]

    @staticmethod
    def _format_rule(name, values):
        """Render ``name`` and its merged ``values`` back into one string."""
        if not values:
            return name
        if all(isinstance(value, int) for value in values):
            return name + " " + str(values)
        return name + " " + ",".join(str(value) for value in values)

    def split_rule(self):
        """Merge, per key, the values of all rules that share the same name.

        Replaces ``self.rules_dict`` with a dict of the same keys whose values
        are lists of ``"<name> <values>"`` strings (``None`` where a key has
        no rules).  Integer values are rendered as a Python list, e.g.
        ``'COLOR INCLUDES [40]'``; other values are rendered verbatim, e.g.
        ``'LONG_DESCRIPTION CONTAINS "BLACK"'``.  When several rules are
        merged, duplicate values are dropped and the order of first
        appearance is kept.
        """
        merged_dict = dict.fromkeys(self.rules_dict)
        for key, rules in self.rules_dict.items():
            if rules is None:
                continue
            values_by_name = {}
            for rule in rules:
                name, values = self._parse_rule(rule)
                if name in values_by_name:
                    combined = values_by_name[name] + values
                    values_by_name[name] = list(dict.fromkeys(combined))
                else:
                    values_by_name[name] = values
            if values_by_name:
                merged_dict[key] = [
                    self._format_rule(name, values)
                    for name, values in values_by_name.items()
                ]
        self.rules_dict = merged_dict
And the result:
process = Rule_process()
process.append_rules()
process.split_rule()
process.rules_dict
OUT:
{0: ['COLOR INCLUDES [32, 33, 800, 99, 833, 838, 839, 74, 84, 85, 30, 823, 184, 409, 56, 93, 830, 31]'],
1: ['COLOR INCLUDES [40]', 'LONG_DESCRIPTION CONTAINS "BLACK"'],
2: None,
4: None}

Split this task into smaller, simpler tasks.
First task:
Write a function that takes a string and returns a pair (name, list_of_values) where name is the first part of the string and list_of_values is a python list of integers.
Hint: You can use '(' in s to test whether string s contains an opening parenthesis; you can use s.split() to split on whitespace or s.rsplit(maxsplit=1) to only split on the last whitespace; s.split('(') to split on opening parenthesis; and s.split(',') to split on comma.
Second task:
Write a function that takes a list of pairs (name, list_of_values) and merges the lists when the names are equal.
Hint: This is extremely easy in python using a dict with name as key and list_of_values as value. You can use if name in d: ... else: to test whether a name is already in the dict or not; or you can use d.get(name, []) or d.setdefault(name, []) to automatically add a name: [] entry in the dict when name is not already in the dict.
Third task:
Write a function to convert back, from the pairs (name, list_of_values) to the strings "name (value1, value2, ...)". This task is easier than the first task, so I suggest doing it first.
Hint: ' '.join(...) and ','.join(...) can both be useful.

Related

Get certain items from text by position, for loop, python

I'm trying to replicate something similar to dictionary like here: https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2
The code that is used there is
document = {}
# Create a loop to go through each section type and save only the 10-K section in the dictionary
for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
if doc_type == '10-K':
document[doc_type] = raw_10k[doc_start:doc_end]
I'm trying to create a for loop.
I have a list of types and positions (item text starts/ends) like this:
zl = [[('Item1', 1263, 42004),
('Item2', 42026, 652819),
('Item3', 652841, 697154),
('Item4', 697176, 705235),
('Item5', 705257, 2378296)],
[('Item1', 1195, 21386),
('Item3', 21408, 268339),
('Other', 268361, 290688)],
[('Item1', 1195, 27776),
('Item2', 27798, 323951),
('Item5', 323973, 348032)]]
My loop:
final = []
for text in listoftexts:
for i in zl:
document = {}
for doc_type, doc_start, doc_end in i:
if doc_type == 'Item2':
document[doc_type] = text[doc_start:doc_end]
final.append(document)
The problem is that it seems to correctly extract only the very first text (doc_start = 42026, doc_end = 652819). All further texts (final[2], final[3]...) are not extracted correctly and seem random.
I'm not sure which part of the loop is incorrect.
Hopefully I understand your problem fully
For this snippet:
final = []
for text in listoftexts:
for i in zl:
document = {}
for doc_type, doc_start, doc_end in i:
if doc_type == 'Item2':
document[doc_type] = text[doc_start:doc_end]
final.append(document)
Indent the final.append(document) line one more level, so that it sits inside the if block and appending to final only happens when the key "Item2" is present in the current sub-list of zl.
final = []
# Assumes zl[k] holds the (type, start, end) triples of listoftexts[k] --
# TODO confirm.  Pairing them with zip() slices each text only with its own
# offsets; nesting the two loops applied every offset list to every text.
for text, sections in zip(listoftexts, zl):
    for doc_type, doc_start, doc_end in sections:
        if doc_type == 'Item2':
            document = {doc_type: text[doc_start:doc_end]}
            # Append inside the if, so texts without an 'Item2' section
            # contribute nothing (instead of an empty dict).
            final.append(document)
With this code you will only extract two texts since "item2" is only in your list zl twice.

how to handle date & time when splitting string on ":"

I am processing a text file, reading line by line splitting it, and inserting it into a database.
each line goes like
3530000000000:100000431506294:Jean:Camargo:male::::Kefron:6/4/2018 12:00:00 AM::11/19
The problem is that it also splits the date-time and as a result it populates the wrong information in the database like in the image below.
my code goes like:
# The 13 column values are passed as query parameters instead of being pasted
# into the SQL text, so a quote inside a field can neither break nor alter the
# statement.
# NOTE(review): "%s" is the placeholder style of MySQL drivers such as
# mysql.connector / PyMySQL; sqlite3 would need "?" -- confirm against mydb.
sql = ("insert into dataleads values "
       "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
with open(filename, encoding="utf-8") as f:
    counter = 0
    for line in f:
        # NOTE: a plain split(':') also cuts the time of day inside the
        # date field ("12:00:00 AM"); this is the problem described in the
        # surrounding text, so fields from index 9 on are shifted.
        data = line.split(':')
        id = str(counter)
        Phonenumber = data[0].strip()
        profileID = data[1].strip()
        firstname = data[2].strip()
        secondname = data[3].strip()
        gender = data[4].strip()
        LocationWhereLive = data[5].strip()
        LocationWhereFrom = data[6].strip()
        RelationshipStatus = data[7].strip()
        whereWork = data[8].strip()
        AccountCreationDate = data[9].strip()
        Email = data[10].strip()
        Birthdate = data[11].strip()
        mycursor = mydb.cursor()
        # Execute once; the old code executed the statement and then passed
        # execute()'s return value to execute() a second time.
        mycursor.execute(sql, (
            id, Phonenumber, profileID, firstname, secondname, gender,
            LocationWhereLive, LocationWhereFrom, RelationshipStatus,
            whereWork, AccountCreationDate, Email, Birthdate,
        ))
        mydb.commit()
        counter += 1
As an alternative to splitting on whitespace, you can also leverage the maxsplit argument of the split and rsplit methods:
def make_list(s, sep=":", leading=9, trailing=2):
    """Split *s* on *sep* while keeping one separator-containing field whole.

    The first *leading* fields are split off from the left and the last
    *trailing* fields from the right; whatever remains in between is returned
    as a single field even if it contains *sep* (e.g. a ``12:00:00`` time).

    Args:
        s: The line to split.
        sep: Field separator.
        leading: Number of fields before the protected field.
        trailing: Number of fields after the protected field.

    Returns:
        A list of ``leading + 1 + trailing`` strings when *s* contains enough
        separators (fewer otherwise).
    """
    before = s.split(sep, maxsplit=leading)  # last item: protected field + tail
    after = before[-1].rsplit(sep, maxsplit=trailing)  # peel the tail off the right
    return [*before[:-1], *after]
s = "3530000000000:100000431506294:Jean:Camargo:male::::Kefron:6/4/2018 12:00:00 AM::11/19"
make_list(s)
Out:
['3530000000000',
'100000431506294',
'Jean',
'Camargo',
'male',
'',
'',
'',
'Kefron',
'6/4/2018 12:00:00 AM',
'',
'11/19']
As mentioned in the comments, you can split with the whitespace:
s = "3530000000000:100000431506294:Jean:Camargo:male::::Kefron:6/4/2018 12:00:00 AM::11/19"
# With no argument, split() cuts on runs of any whitespace character.
split_s = s.split()
# The three pieces printed below are, in order:
#   "3530000000000:100000431506294:Jean:Camargo:male::::Kefron:6/4/2018"
#   "12:00:00"
#   "AM::11/19"
for position in (0, 1, 2):
    print(split_s[position])
To deal with the original file, you can split this in a loop with knowledge of the count of fields, rather than trying to use how many separator characters there are
collection = []
_line = line # keep a backup of the line to compare and count blocks
for field_index in range(12):
if field_index < 8: # get the first 8 fields (or some set)
prefix, _line = _line.split(":", 1) # only split once!
collection.append(prefix)
continue
if field_index == 9: # match date field _line from regex
if _line.startswith("::"): # test if field was omitted
_line = _line[1:] # truncate the first character
continue
r"^\d+/..." # TODO regex for field
continue
...
This can be tuned or adapted to handle any field which can:
- be absent
- itself contain the separator (thanks)
However, if you can instead take a moment to explain (nicely) to the author of this file that the format is problematic, and why, they may rewrite the file in a form that is easier for you to use, or give you the input files it was generated from so that you do not have to munge its output any further.
Specifically, the tool could either
use a separator unavailable in the resulting data (such as | or ##SEPARATOR##)
escape the fields or swap their separators to another character before writing (.replace(":", "-"))
An alternative solution is to match the field in the line first and transform it, allowing you to deal with the field on its own (perhaps transforming it back via a regex or .replace())
line = re.sub(r"(\d\d?):(\d\d):(\d\d) (AM|PM)", r"\1-\2-\3-\4", line)
# now split out line on :
>>> line = "3530000000000:100000431506294:Jean:Camargo:male::::Kefron:6/4/2018 12:00:00 AM::11/19"
>>> re.sub(r"(\d\d?):(\d\d):(\d\d) (AM|PM)", r"\1-\2-\3-\4", line).split(":")
['3530000000000', '100000431506294', 'Jean', 'Camargo', 'male', '', '', '', 'Kefron', '6/4/2018 12-00-00-AM', '', '11/19']
The structure is the same; you only need to join the pieces of the split date-time back together again:
counter = 0
line = "3530000000000:100000431506294:Jean:Camargo:male::::Kefron:6/4/2018 12:00:00 AM::11/19"
data = line.split(':')
id = str(counter)
# Strip every raw piece once, then hand the pieces out by position.
_stripped = [piece.strip() for piece in data]
(Phonenumber, profileID, firstname, secondname, gender,
 LocationWhereLive, LocationWhereFrom, RelationshipStatus,
 whereWork) = _stripped[:9]
# The date-time was cut into three pieces at its own colons (indices 9-11);
# gluing them back with ':' restores it.
AccountCreationDate = ':'.join([_stripped[9], _stripped[10], _stripped[11]])
# Everything after the date-time is therefore shifted by two positions.
Email = _stripped[12]
Birthdate = _stripped[13]

Parsing Erlang data to Python dictionary

I have an erlang script from which I would like to get some data and store it in python dictionary.
It is easy to parse the script to get string like this:
{userdata,
[{tags,
[#dt{number=111},
#mp{id='X23.W'}]},
{log,
'LG22'},
{instruction,
"String that can contain characters like -, _ or numbers"}
]
}.
desired result:
userdata = {"tags": {"dt": {"number": 111}, "mp": {"id": "X23.W"}},
"log": "LG22",
"instruction": "String that can contain characters like -, _ or numbers"}
# "#" mark for data in "tags" is not required in this structure.
# Also value for "tags" can be any iterable structure: tuple, list or dictionary.
But I am not sure how to transfer this data into a python dictionary. My first idea was to use json.loads but it requires many modifications (putting words into quotes marks, replacing "," with ":" and many more).
Moreover, keys in userdata are not limited to some pool. In this case, there are 'tags', 'log' and 'instruction', but there can be many more eg. 'slogan', 'ids', etc.
Also, I am not sure about the order. I assume that the keys can appear in random order.
My code (it is not working for id='X23.W' so I removed '.' from input):
import re
import json
in_ = """{userdata, [{tags, [#dt{number=111}, #mp{id='X23W'}]}, {log, 'LG22'}, {instruction, "String that can contain characters like -, _ or numbers"}]}"""
buff = in_.replace("{userdata, [", "")[:-2]
re_helper = re.compile(r"(#\w+)")
buff = re_helper.sub(r'\1:', buff)
partition = buff.partition("instruction")
section_to_replace = partition[0]
replacer = re.compile(r"(\w+)")
match = replacer.sub(r'"\1"', section_to_replace)
buff = ''.join([match, '"instruction"', partition[2]])
buff = buff.replace("#", "")
buff = buff.replace('",', '":')
buff = buff.replace("}, {", "}, \n{")
buff = buff.replace("=", ":")
buff = buff.replace("'", "")
temp = buff.split("\n")
userdata = {}
buff = temp[0][:-2]
buff = buff.replace("[", "{")
buff = buff.replace("]", "}")
userdata .update(json.loads(buff))
for i, v in enumerate(temp[1:]):
v = v.strip()
if v.endswith(","):
v = v[:-1]
userdata .update(json.loads(v))
print(userdata)
Output:
{'tags': {'dt': {'number': '111'}, 'mp': {'id': 'X23W'}}, 'instruction': 'String that can contain characters like -, _ or numbers', 'log': 'LG22'}
import json
import re

in_ = """{userdata, [{tags, [#dt{number=111}, #mp{id='X23.W'}]}, {log, 'LG22'}, {instruction, "String that can contain characters like -, _ or numbers"}]}"""

# Regex rewrites applied in order to turn the Erlang term into JSON text.
_rewrites = (
    (r"\{(\w+),", r'{"\1":'),      # {name,      ->  {"name":
    (r"\[(#.*?)\]", r'{\1}'),      # [#a{..}, #b{..}]  ->  {#a{..}, #b{..}}
    (r'#(\w+)', r'"\1":'),         # #name       ->  "name":
    (r'{(\w+)=', r'{"\1":'),       # {field=     ->  {"field":
)
json_text = in_
for pattern, replacement in _rewrites:
    json_text = re.sub(pattern, replacement, json_text)
# JSON only accepts double-quoted strings.
json_text = json_text.replace('\'', '"')
result = json.loads(json_text)
print(result)
Produces:
{'userdata': [{'tags': {'dt': {'number': 111}, 'mp': {'id': 'X23.W'}}}, {'log': 'LG22'}, {'instruction': 'String that can contain characters like -, _ or numbers'}]}

Python - using regex to search for a string then append each match to a new list

I am currently trying to extract some data from a file which has this kind of format:
#12 = ADVANCED_FACE ( 'NONE', ( #194 ), #326, .F. ) ;
...
#159 = EDGE_LOOP ( 'NONE', ( #21, #124, #264, #145 ) ) ;
...
#194 = FACE_OUTER_BOUND ( 'NONE', #159, .T. ) ;
...
#326 = PLANE ( 'NONE', #352 ) ;
The following is the method I am currently using:
faces_txt = re.findall(r'#(\d+) = ADVANCED_FACE.*;', text)
faces = [int(face) for face in faces_txt]
print('Face IDs = ', faces)
Which outputs:
Face IDs = [12, 73, 99, 131, 181, 214, 244, 273, 330, 358]
What would I do if I wanted to create a new list (named sequentially like "Face1, Face2, Face3...") for each match of the "ADVANCED_FACE" rather than appending all of these values to the same list?
Without having more details on exact output you want and maybe some more input, it sounds like you want a list of lists or a dictionary of lists.
faces_txt = re.findall(r'#(\d+) = ADVANCED_FACE.*;', text)
faces = [int(face) for face in faces_txt]
print('Face IDs = ', faces)
# List of lists: one single-element list per match, in match order.
LoL = [[faceId] for faceId in faces]
# Dictionary of lists: a separate list per match, keyed "Face0", "Face1", ...
DoL = {"Face{}".format(i): [faceId] for i, faceId in enumerate(faces)}
print(LoL)
print(DoL)
print(DoL['Face0'])

Learning Python: Store values in dict from stdout

How can I do the following in Python:
I have a command output that outputs this:
Datexxxx
Clientxxx
Timexxx
Datexxxx
Client2xxx
Timexxx
Datexxxx
Client3xxx
Timexxx
And I want to work this in a dict like:
Client:(date,time), Client2:(date,time) ...
After reading the data into a string subject, you could do this:
import re
d = {}
# The pattern must be a raw string: in a normal string "\r" and "\n" become
# real CR/LF characters, which verbose mode (x) discards as layout
# whitespace, so the pattern could never match across the three lines.
for match in re.finditer(
        r"""(?mx)
        ^Date(.*?)\r?\n      # group 1: rest of the Date line
        Client\d*(.*?)\r?\n  # group 2: rest of the Client line
        Time([^\r\n]*)       # group 3: rest of the Time line
        """,
        subject):
    # The value is (date, time): groups 1 and 3, not group 2 twice.
    d[match.group(2)] = (match.group(1), match.group(3))
How about something like:
rows = {}
thisrow = []
for line in output.splitlines():
    tag = line[:6].lower()
    # Collect the Date / Client / Time lines of the current record; any other
    # line (including blank ones) is ignored.
    if tag.startswith(('date', 'time')) or tag == 'client':
        thisrow.append(line)
    if len(thisrow) == 3:
        # Record complete: key on the Client line, keep (Date line, Time line).
        # Flushing here, rather than on a blank line, also handles records
        # that follow each other without a separator and a missing final
        # newline.
        rows[thisrow[1]] = (thisrow[0], thisrow[2])
        thisrow = []
print(rows)
Assumes a trailing newline, no spaces before lines, etc.
What about using a dict with tuples?
Create a dictionary and add the entries:
dict = {}
dict['Client'] = ('date1','time1')
dict['Client2'] = ('date2','time2')
Accessing the entries:
dict['Client']
>>> ('date1','time1')

Categories