Regex through line of text with changing pattern - python

Expanding on "Proper way to search through line of text": re.findall() and re.search() both don't fully work for this.
If I have the following line of text:
txt = "Name : 'red' Wire : 'R' Wire: 'B' Name : 'blue' Name : 'orange' Name: 'yellow' Wire : 'Y'"
I am trying to parse this line of text and get every Wire/Name pair to populate a dataframe. The issue is that the order of the Wire/Name pairs on the line is variable.
for line in txt.splitlines():
    line = line.strip()
    pairs = re.findall(r'Name *: *\'(?P<name>\w+)\' Wire *: *\'(?P<wire>\w+)\'', line)
    if pairs:
        for name, wire in pairs:
            df = df.append({'Name': name, 'Wire': wire}, ignore_index=True)
The problem with this approach is that it misses the Blue/B pair, resulting in the following dataframe.
Name Wire
red R
yellow Y
The dataframe I am trying to achieve is
Name Wire
red R
blue B
yellow Y
Is it possible to handle the variation in the text pattern?

Can you just take one name/wire pair at a time and build up the pieces as you go? I created a Pair class with some helper functions:
txt = "Name : 'red' Wire : 'R' Wire: 'B' Name : 'blue' Name : 'orange' Name: 'yellow' Wire : 'Y'"
regex = r'((?P<name>Name)|(?P<wire>Wire))\s*?:\s*?\'(?P<value>\w+\')'
pat = re.compile(regex)
class Pair:
name = ''
wire = ''
def populated(self):
return self.name and self.wire
def to_dict(self):
return dict(name=self.name, wire=self.wire)
def __str__(self):
return f'{self.name} {self.wire}'
current_pair = Pair()
all_pairs = []
for x in pat.finditer(txt):
if x.group('name'):
current_pair.name = x.group('value')
elif x.group('wire'):
current_pair.wire = x.group('value')
if current_pair.populated():
all_pairs.append(current_pair)
current_pair = Pair()
for p in all_pairs:
print(p)
You could alter this code to keep track of the incomplete pairs (i.e. 'orange') and decide what to do with those later.
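To get the DataFrame the question asks for, you could then build it from the completed pairs in one go (a minimal sketch; a list of dicts is used here because DataFrame.append is deprecated in recent pandas versions):

import pandas as pd

# Build the frame from the completed pairs collected above;
# the column names match the question's desired output.
df = pd.DataFrame([{'Name': p.name, 'Wire': p.wire} for p in all_pairs])
print(df)  # now contains red/R, blue/B and yellow/Y - the blue/B pair is no longer missed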

Related

How to merge common strings with different values between parenthesis in Python

I am processing some strings within lists that look like these:
['COLOR INCLUDES (40)', 'LONG_DESCRIPTION CONTAINS ("BLACK")', 'COLOR INCLUDES (38)']
['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)', 'COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)', 'COLOR INCLUDES (800)']
Thing is, I want to merge similar strings with their values into one, for each list. Expecting something like this:
['COLOR INCLUDES (40,38)', 'LONG_DESCRIPTION CONTAINS ("BLACK")']
['COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)']
And some strings may have values without ():
['FAMILY EQUALS 1145']
What would be the most pythonic and fastest (lazy :P) way of doing this?
I have tried using regex to match strings until a "(" appears, but some strings don't have values between (), and I can't find a fitting solution.
I have also tried the STree function from the suffix_trees lib, which finds the longest common substring in a list of strings, but then I ran out of ideas about handling the values and the closing parenthesis:
from suffix_trees import STree

st = STree.STree(['COLOR INCLUDES(30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)',
                  'COLOR INCLUDES(30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839)',
                  'COLOR INCLUDES (800)'])
st.lcs()
# out: 'COLOR INCLUDES ('
EDIT: SOLVED
As @stef said in the answer, I broke the problem into smaller pieces and solved it with his help. Let me paste the Rule_process class and the result here:
import re

class Rule_process:
    def __init__(self):
        self.rules = '(COLOR INCLUDES (40)) OR (LONG_DESCRIPTION CONTAINS ("BLACK")):1|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839):0|||COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,409,800,823,830,833,838,839):0|||COLOR INCLUDES (40):1|||COLOR INCLUDES (800):0'
        self.rules_dict = {
            0: None,
            1: None,
            2: None,
            4: None,
        }

    def append_rules(self):
        rules = self.rules.split("|||")
        values_0 = []
        values_1 = []
        values_2 = []
        values_4 = []
        for rule in range(len(rules)):
            if rules[rule][-1] == '0':
                rules[rule] = rules[rule][:-2]
                # self.rules_dict[0].append(rules[rule])
                values_0.append(rules[rule])
            elif rules[rule][-1] == '1':
                rules[rule] = rules[rule][:-2]
                # self.rules_dict[1].append(rules[rule])
                values_1.append(rules[rule])
            elif rules[rule][-1] == '2':
                rules[rule] = rules[rule][:-2]
                # self.rules_dict[2].append(rules[rule])
                values_2.append(rules[rule])
            elif rules[rule][-1] == '4':
                rules[rule] = rules[rule][:-2]
                # self.rules_dict[4].append(rules[rule])
                values_4.append(rules[rule])
        if values_0 != []:
            self.rules_dict[0] = values_0
        if values_1 != []:
            self.rules_dict[1] = values_1
        if values_2 != []:
            self.rules_dict[2] = values_2
        if values_4 != []:
            self.rules_dict[4] = values_4

        regex = r'^\('
        # for rules in self.rules_dict.values():
        for key in self.rules_dict.keys():
            if self.rules_dict[key] is not None:
                for rule in range(len(self.rules_dict[key])):
                    new_rule = self.rules_dict[key][rule].split(' OR ')
                    if len(new_rule) > 1:
                        joined_rule = []
                        for r in new_rule:
                            r = r.replace("))", ")")
                            r = re.sub(regex, "", r)
                            joined_rule.append(r)
                        self.rules_dict[key].remove(self.rules_dict[key][rule])
                        self.rules_dict[key].extend(joined_rule)
                        self.rules_dict[key] = list(set(self.rules_dict[key]))
                    else:
                        new_rule = [r.replace("))", ")") for r in new_rule]
                        new_rule = [re.sub(regex, "", r) for r in new_rule]
                        new_rule = ", ".join(new_rule)
                        self.rules_dict[key][rule] = new_rule
                        self.rules_dict[key] = list(set(self.rules_dict[key]))
        return self.rules_dict

    def split_rule(self):
        # COLOR INCLUDES (30,31,32,33) -> name = 'COLOR INCLUDES', values = [30, 31, 32, 33]
        # LONG_DESCRIPTION CONTAINS ("BLACK") -> name = 'LONG_DESCRIPTION CONTAINS', values = '"BLACK"'
        new_dict = {
            0: None,
            1: None,
            2: None,
            4: None,
        }
        for key in self.rules_dict.keys():
            pql_dict = {}
            if self.rules_dict[key] is not None:
                for rule in range(len(self.rules_dict[key])):
                    # self.rules_dict[key][rule] -> COLOR INCLUDES (30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839)
                    rule = self.rules_dict[key][rule]
                    name = rule.rsplit(maxsplit=1)[0]  # -> COLOR INCLUDES
                    values_as_str = rule.rsplit(maxsplit=1)[1].replace("(", "")
                    values_as_str = values_as_str.replace(")", "")  # -> 30,31,32,33,56,74,84,85,93,99,184,800,823,830,833,838,839
                    try:
                        values = list(map(int, values_as_str.split(",")))  # [30, 31, 32, ..., 839]
                    except ValueError:
                        values = values_as_str  # '"BLACK"'
                    if name in pql_dict.keys():
                        pql_dict[name] = pql_dict[name] + values
                        pql_dict[name] = list(set(pql_dict[name]))
                    else:
                        pql_dict.setdefault(name, values)
                # pql_dict = {'COLOR INCLUDES': [32, 33, 800, 99, 833, 838, 839, 74, 84, 85, 30, 823, 184, 409, 56, 93, 830, 31]}
                for name in pql_dict.keys():
                    values = pql_dict[name]
                    joined_rule = name + " " + str(values)
                    if new_dict[key] is not None:
                        new_dict[key] = new_dict[key] + [joined_rule]
                    else:
                        new_dict[key] = [joined_rule]
        self.rules_dict = new_dict
And the result:
process = Rule_process()
process.append_rules()
process.split_rule()
process.rules_dict
OUT:
{0: ['COLOR INCLUDES [32, 33, 800, 99, 833, 838, 839, 74, 84, 85, 30, 823, 184, 409, 56, 93, 830, 31]'],
1: ['COLOR INCLUDES [40]', 'LONG_DESCRIPTION CONTAINS "BLACK"'],
2: None,
4: None}
Split this task into smaller, simpler tasks.
First task:
Write a function that takes a string and returns a pair (name, list_of_values) where name is the first part of the string and list_of_values is a python list of integers.
Hint: You can use '(' in s to test whether string s contains an opening parenthesis; you can use s.split() to split on whitespace or s.rsplit(maxsplit=1) to only split on the last whitespace; s.split('(') to split on opening parenthesis; and s.split(',') to split on comma.
Second task:
Write a function that takes a list of pairs (name, list_of_values) and merges the lists when the names are equal.
Hint: This is extremely easy in python using a dict with name as key and list_of_values as value. You can use if name in d: ... else: to test whether a name is already in the dict or not; or you can use d.get(name, []) or d.setdefault(name, []) to automatically add a name: [] entry in the dict when name is not already in the dict.
Third task:
Write a function to convert back, from the pairs (name, list_of_values) to the strings "name (value1, value2, ...)". This task is easier than the first task, so I suggest doing it first.
Hint: ' '.join(...) and ','.join(...) can both be useful.
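A minimal sketch of the three helpers described above (the names parse_rule, merge_rules and format_rule are my own, and it assumes each value list is either all integers or a single quoted string, as in the examples):

def parse_rule(s):
    """Task 1: 'COLOR INCLUDES (30,31)' -> ('COLOR INCLUDES', [30, 31])."""
    if '(' in s:
        name, _, values = s.partition('(')
        values = values.rstrip(')')
    else:
        name, _, values = s.rpartition(' ')  # e.g. 'FAMILY EQUALS 1145'
    try:
        parsed = [int(v) for v in values.split(',')]
    except ValueError:
        parsed = [values]  # e.g. '"BLACK"'
    return name.strip(), parsed

def merge_rules(pairs):
    """Task 2: merge value lists that share the same name, keeping order."""
    merged = {}
    for name, values in pairs:
        merged.setdefault(name, [])
        merged[name].extend(v for v in values if v not in merged[name])
    return merged

def format_rule(name, values):
    """Task 3: ('COLOR INCLUDES', [40, 38]) -> 'COLOR INCLUDES (40,38)'."""
    return '{} ({})'.format(name, ','.join(str(v) for v in values))

rules = ['COLOR INCLUDES (40)', 'LONG_DESCRIPTION CONTAINS ("BLACK")', 'COLOR INCLUDES (38)']
merged = merge_rules(parse_rule(r) for r in rules)
print([format_rule(n, v) for n, v in merged.items()])
# ['COLOR INCLUDES (40,38)', 'LONG_DESCRIPTION CONTAINS ("BLACK")']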

Python retrieving data from a block of lines containing specific characters and appending relevant data into separate lines

I am trying to create a program which selects specific information from a bulk paste, extracts the relevant pieces, and then joins that information into lines.
Here is some example data;
1. Track1 03:01
VOC:PersonA
LYR:LyrcistA
COM:ComposerA
ARR:ArrangerA
ARR:ArrangerB
2. Track2 04:18
VOC:PersonB
VOC:PersonC
LYR:LyrcistA
LYR:LyrcistC
COM:ComposerA
ARR:ArrangerA
I would like an output where the relevant data for each track is grouped together on a single line, with a semicolon joining entries of the same type and " - " separating the different types.
LyrcistA - ComposerA - ArrangerA; ArrangerB
LyrcistA; LyrcistC - ComposerA - ArrangerA
I have not gotten very far despite my best efforts
while True:
    YodobashiData = input("")
    SplitData = YodobashiData.splitlines()
returns the following
['1. Track1 03:01']
['VOC:PersonA ']
['LYR:LyrcistA']
['COM:ComposerA']
['ARR:ArrangerA']
['ARR:ArrangerB']
[]
['2. Track2 04:18']
['VOC:PersonB']
['VOC:PersonC']
['LYR:LyrcistA']
['LYR:LyrcistC']
['COM:ComposerA']
['ARR:ArrangerA']
Whilst I now have all the data in separate lists, I have no idea how to identify the lines I need, separate them from the ones I do not, and extract the relevant information from them.
Also, it seems I need to have the while loop or else it will only return the first list and nothing else.
Here's a script that doesn't use regular expressions.
It assumes that header lines, and only the header lines, will always start with a digit, and that the overall structure of header line then credit lines is consistent. Empty lines are ignored.
Extraction and formatting of the track data are handled separately, so it's easier to change formats, or use the extracted data in other ways.
import collections
import unicodedata

data_from_question = """\
1. Track1 03:01
VOC:PersonA
LYR:LyrcistA
COM:ComposerA
ARR:ArrangerA
ARR:ArrangerB
2. Track2 04:18
VOC:PersonB
VOC:PersonC
LYR:LyrcistA
LYR:LyrcistC
COM:ComposerA
ARR:ArrangerA
"""

def prepare_data(data):
    # The "colons" in the credits lines are actually
    # "full width colons". Replace them (and other such characters)
    # with their normal width equivalents.
    # If full normalisation is undesirable then we could return
    # data.replace('\N{FULLWIDTH COLON}', ':')
    return unicodedata.normalize('NFKC', data)

def is_new_track(line):
    return line[0].isdigit()

def parse_track_header(line):
    id_, title, duration = line.split()
    return {'id': id_.rstrip('.'), 'title': title, 'duration': duration}

def get_credit(line):
    credit, _, name = line.partition(':')
    return credit.strip(), name.strip()

def format_track_heading(track):
    return 'id: {id} title: {title} length: {duration}'.format(**track)

def format_credits(track):
    order = ['ARR', 'COM', 'LYR', 'VOC']
    parts = ['; '.join(track[k]) for k in order]
    return ' - '.join(parts)

def get_data():
    # The data is expected to be a multiline string.
    return data_from_question

def parse_data(data):
    track = None
    for line in filter(None, data.splitlines()):
        if is_new_track(line):
            if track:
                yield track
            track = collections.defaultdict(list)
            header_data = parse_track_header(line)
            track.update(header_data)
        else:
            role, name = get_credit(line)
            track[role].append(name)
    yield track

def report(tracks):
    for track in tracks:
        print(format_track_heading(track))
        print(format_credits(track))
        print()

def main():
    data = get_data()
    prepared_data = prepare_data(data)
    tracks = parse_data(prepared_data)
    report(tracks)

main()
Output:
id: 1 title: Track1 length: 03:01
ArrangerA; ArrangerB - ComposerA - LyrcistA - PersonA
id: 2 title: Track2 length: 04:18
ArrangerA - ComposerA - LyrcistA; LyrcistC - PersonB; PersonC
Here's another take on an answer to your question:
data = """
1. Track1 03:01
VOC:PersonA
LYR:LyrcistA
COM:ComposerA
ARR:ArrangerA
ARR:ArrangerB
2. Track2 04:18
VOC:PersonB
VOC:PersonC
LYR:LyrcistA
LYR:LyrcistC
COM:ComposerA
ARR:ArrangerA"""
import re
import collections
# Regular expression to pull apart the headline of each entry
headlinePattern = re.compile(r"(\d+)\.\s+(.*?)\s+(\d\d:\d\d)")
def main():
# break the data into lines
lines = data.strip().split("\n")
# while we have more lines...
while lines:
# The next line should be a title line
line = lines.pop(0)
m = headlinePattern.match(line)
if not m:
raise Exception("Unexpected data format")
id = m.group(1)
title = m.group(2)
length = m.group(3)
people = collections.defaultdict(list)
# Now read person lines until we hit a blank line or the end of the list
while lines:
line = lines.pop(0)
if not line:
break
# Break the line into label and name
label, name = re.split(r"\W+", line, 1)
# Add this entry to a map of lists, where the map's keys are the label and the
# map's values are all the people who had that label
people[label].append(name)
# Now we have everything for one entry in the data. Print everything we got.
print("id:", id, "title:", title, "length:", length)
print(" - ".join(["; ".join(person) for person in people.values()]))
# go on to the next entry...
main()
Result:
id: 1 title: Track1 length: 03:01
PersonA - LyrcistA - ComposerA - ArrangerA; ArrangerB
id: 2 title: Track2 length: 04:18
PersonB; PersonC - LyrcistA; LyrcistC - ComposerA - ArrangerA
You can just comment out the line that prints the headline info if you really just want the line with all of the people on it. Just replace the built in data with data = input("") if you want to read the data from a user prompt.
Assuming your data is in the format you specified in a file called tracks.txt, the following code should work:
import re

with open('tracks.txt') as fp:
    tracklines = fp.read().splitlines()

def split_tracks(lines):
    track = []
    all_tracks = []
    while True:
        try:
            if lines[0] != '':
                track.append(lines.pop(0))
            else:
                all_tracks.append(track)
                track = []
                lines.pop(0)
        except IndexError:
            all_tracks.append(track)
            return all_tracks

def gather_attrs(tracks):
    track_attrs = []
    for track in tracks:
        attrs = {}
        for line in track:
            match = re.match('([A-Z]{3}):', line)
            if match:
                attr = line[:3]
                val = line[4:].strip()
                try:
                    attrs[attr].append(val)
                except KeyError:
                    attrs[attr] = [val]
        track_attrs.append(attrs)
    return track_attrs

if __name__ == '__main__':
    tracks = split_tracks(tracklines)
    attrs = gather_attrs(tracks)
    for track in attrs:
        semicolons = map(lambda va: '; '.join(va), track.values())
        hyphens = ' - '.join(semicolons)
        print(hyphens)
The only thing you may have to change is the colon characters in your data - some of them are ASCII colons (:) and others are fullwidth colons (：), which will break the regex.
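One way to guard against that (a small sketch, reusing the normalisation idea from the first answer) is to fold the fullwidth characters before matching:

import unicodedata

def normalize_colons(text):
    # NFKC normalisation folds fullwidth forms such as '：' into their
    # ASCII equivalents, so a plain ':' in the regex matches everywhere.
    return unicodedata.normalize('NFKC', text)

print(normalize_colons('VOC：PersonA'))  # -> VOC:PersonA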
import re

list_ = data_.split('\n')  # here data_ is your data
regObj = re.compile(rf'[A-Za-z]+(:|{chr(65306)})[A-Za-z]+')
l = []
pre = ''
for i in list_:
    if regObj.findall(i):
        if i[:3] != 'VOC':
            if pre == i[:3]:
                l.append('; ')
            else:
                l.append(' - ')
            l.append(i[4:].strip())
        else:
            l.append(' => ')
        pre = i[:3]

track_list = list(map(lambda item: item.strip(' - '), filter(lambda item: item, ''.join(l).split(' => '))))
print(track_list)
OUTPUT: the list of results you want
['LyrcistA - ComposerA - ArrangerA; ArrangerB', 'LyrcistA; LyrcistC - ComposerA - ArrangerA']

Add gender column searching rows that contains info from 2 tables

I have a df that contains some emails:
Email
jonathat0420#email.com
12alexander#email.com
14abcdjoanna#email.com
maria44th#email.com
mikeasddf#email.com
I need to add a second column with the gender.
I will have 2 lists:
male_names = ['john', 'alex']
female_names = ['maria', 'joanna']
My output should look like that:
Email Gender
jonathat0420#email.com 1
12alexander#email.com 1
14abcdjoanna#email.com 2
maria44th#email.com 2
mikeasddf#email.com
I need to search each email for the names in the lists and, if a name is found, add a number: 1 for males, 2 for females, and leave the cell empty for emails with no match.
Can anybody help me with this?
You could simply use a map, like this:
def isinlist(email, names):
    for name in names:
        if name in email:
            return True
    return False

df.loc[:, 'Gender'] = df.Email.map(lambda x: 1 if isinlist(x, male_names) else (2 if isinlist(x, female_names) else None))
However, there are going to be a lot of ambiguous cases that risk being classified erroneously - e.g., "alexandra#email.com" would be classified as male, since alex is in the list of male names.
Maybe you could implement a slightly more complex "best match" logic like this?
def maxmatchlen(email, names):  # = length of longest name from list that is contained in the email
    return max([len(name) for name in names if name in email] + [0])  # append a 0 to avoid empty lists

def f(email, male_names=male_names, female_names=female_names):
    male_maxmatchlen = maxmatchlen(email, male_names)
    female_maxmatchlen = maxmatchlen(email, female_names)
    if male_maxmatchlen > female_maxmatchlen:
        return 1
    elif female_maxmatchlen > male_maxmatchlen:
        return 2
    else:  # ambiguous case
        return None

df.loc[:, 'Gender'] = df.Email.map(f)
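For reference, a quick way to try this out (a sketch using the question's sample data; it assumes male_names, female_names and f from above are already defined):

import pandas as pd

df = pd.DataFrame({'Email': [
    'jonathat0420#email.com',
    '12alexander#email.com',
    '14abcdjoanna#email.com',
    'maria44th#email.com',
    'mikeasddf#email.com',
]})

df.loc[:, 'Gender'] = df.Email.map(f)
print(df)
# Note: with the lists from the question, 'jonathat0420#email.com' also comes out
# as None, because neither 'john' nor 'alex' is literally a substring of it.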
It looks like you first must determine if the email contains a name. You can loop through both male and female. That will determine if the name is "in" the email. Then you could make a list or a dictionary of these.
#!/usr/bin/env python3

def get_emails(filepath):
    """Open the data file and read the lines - return a list."""
    with open(filepath, "r") as f:
        email_list = f.readlines()
    for email in email_list:
        print(f'Email = {email}')
    print(f'The total number of emails = {len(email_list)}')
    return email_list

def find_names(email_list):
    """Loop through the email list and see if each one contains a male or
    female name - return a dictionary of tuples."""
    male_names = ['john', 'alex', 'mike', 'jonathat']
    female_names = ['maria', 'joanna']
    name_dict = {}
    for email in email_list:
        for name_f in female_names:
            if name_f in email:
                data = (name_f, 2)  # 2 = female, as in the question
                name_dict[email] = data
                print(f"{email} is for {name_f} and is female {data[1]}")
                break
        for name_m in male_names:
            if name_m in email:
                data = (name_m, 1)  # 1 = male, as in the question
                name_dict[email] = data
                print(f"{email} is for {name_m} and is male {data[1]}")
                break
    return name_dict

if __name__ == '__main__':
    your_Datafile = r"D:\Share\email.txt"
    email_list = get_emails(your_Datafile)
    my_dictionary = find_names(email_list)
    print(my_dictionary)
    for email, data in my_dictionary.items():
        print(data[0], data[1], email)

Check string for specific format of substring, how to..?

Two strings. My item's name:
Parfume name EDT 50ml
And the competitor's item name:
Parfume another name EDP 60ml
And I have a long list of these names in one column and the competitors' names in another column, and I want to keep only those rows of the dataframe that have the same amount of ml in both my name and the competitor's name, no matter what the rest of the strings looks like. So how do I find a substring ending with 'ml' inside a bigger string? I could simply do
"**ml" in competitors_name
to see if they both contain the same amount of ml.
Thank you
UPDATE
'ml' is not always at the end of the string. It might look like this:
Parfume yet another great name 60ml EDP
Try this:
import re

def same_measurement(my_item, competitor_item, unit="ml"):
    matcher = re.compile(r".*?(\d+){}".format(unit))
    my_match = matcher.match(my_item)
    competitor_match = matcher.match(competitor_item)
    return my_match and competitor_match and my_match.group(1) == competitor_match.group(1)

my_item = "Parfume name EDT 50ml"
competitor_item = "Parfume another name EDP 50ml"
assert same_measurement(my_item, competitor_item)

my_item = "Parfume name EDT 50ml"
competitor_item = "Parfume another name EDP 60ml"
assert not same_measurement(my_item, competitor_item)
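Since the goal is to keep only the matching rows of a dataframe, you could apply the function row-wise and filter on the result (a sketch; the column names mine and competitor are placeholders for your own):

import pandas as pd

df = pd.DataFrame({
    'mine': ['Parfume name EDT 50ml', 'Parfume name EDT 50ml'],
    'competitor': ['Parfume another name EDP 50ml', 'Parfume another name EDP 60ml'],
})

mask = df.apply(lambda row: bool(same_measurement(row['mine'], row['competitor'])), axis=1)
filtered = df[mask]
print(filtered)  # keeps only the first row, where both names say 50ml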
You could use Python's re module to select the 'xxml' values for each of your data rows and then do some logic to check whether they match.
import re

data_rows = [["Parfume name EDT", "Parfume another name EDP 50ml"]]

for data_pairs in data_rows:
    my_ml = None
    comp_ml = None

    # Check for my ml matches and set value
    my_ml_matches = re.search(r'(\d{1,3}[Mm][Ll])', data_pairs[0])
    if my_ml_matches is not None:
        my_ml = my_ml_matches[0]
    else:
        print("my_ml has no ml")

    # Check for comp ml matches and set value
    comp_ml_matches = re.search(r'(\d{1,3}[Mm][Ll])', data_pairs[1])
    if comp_ml_matches is not None:
        comp_ml = comp_ml_matches[0]
    else:
        print("comp_ml has no ml")

    # Print outputs
    if (my_ml is not None) and (comp_ml is not None):
        if my_ml == comp_ml:
            print("my_ml: {0} == comp_ml: {1}".format(my_ml, comp_ml))
        else:
            print("my_ml: {0} != comp_ml: {1}".format(my_ml, comp_ml))
Where data_rows = each row in the data set
Where data_pairs = {your_item_name, competitor_item_name}
You could use a lambda function to do that.
import pandas as pd
import re

d = {
    'Us': ['Parfume one 50ml', 'Parfume two 100ml'],
    'Competitor': ['Parfume uno 50ml', 'Parfume dos 200ml'],
}
df = pd.DataFrame(data=d)

df['Eq'] = df.apply(lambda x: 'Yes' if re.search(r'(\d+)ml', x['Us']).group(1) == re.search(r'(\d+)ml', x['Competitor']).group(1) else "No", axis=1)
Result:
It doesn't matter whether 'ml' is at the end or in the middle of the string.
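For reference, running the snippet should produce a frame roughly like this:

                  Us         Competitor   Eq
0   Parfume one 50ml   Parfume uno 50ml  Yes
1  Parfume two 100ml  Parfume dos 200ml   No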

How do I Use re() in Python and Return Capture Groups within an "If" Statement?

Although I've been using Perl for many years, I've always had trouble with anything more than fairly basic use of regular expressions in the language. The situation is only worse now that I'm trying to learn Python... and the use of re is even more unclear to me.
I'm trying to check for a match when a substring is in a string, using re, and I am also using capture groups to extract some info from the matching process. However, I can't get things to work in a couple of contexts: when making a re call and assigning the returned values all within an "if" statement, and when handling the situation where the .groups items are not defined on the match object (when a match is not made).
So, what follows are examples of what I'm trying to do coded in Perl and Python, with their respective outputs.
I'd appreciate any pointers on how I might better approach the problem using Python.
Perl Code:
use strict;
use warnings;

my ($idx, $dvalue);
while (my $rec = <DATA>) {
    chomp($rec);
    if ( ($idx, $dvalue) = ($rec =~ /^XA([0-9]+)=(.*?)!/) ) {
        printf(" Matched:\n");
        printf(" rec: >%s<\n", $rec);
        printf(" index = >%s< value = >%s<\n", $idx, $dvalue);
    } elsif ( ($idx, $dvalue) = ($rec =~ /^PZ([0-9]+)=(.*?[^#])!/) ) {
        printf(" Matched:\n");
        printf(" rec: >%s<\n", $rec);
        printf(" index = >%s< value = >%s<\n", $idx, $dvalue);
    } else {
        printf("\n Unknown Record format, \\%s\\\n\n", $rec);
    }
}
close(DATA);
exit(0);

__DATA__
DUD=ABC!QUEUE=D23!
XA32=7!P^=32!
PZ112=123^!PQ=ABC!
Perl Output:
Unknown Record format, \DUD=ABC!QUEUE=D23!\
Matched:
rec: >XA32=7!P^=32!<
index = >32< value = >7<
Matched:
rec: >PZ112=123^!PQ=ABC!<
index = >112< value = >123^<
Python Code:
import re

string = 'XA32=7!P^=32!'

with open('data.dat', 'r') as fh:
    for rec in fh:
        orec = ' rec: >' + rec.rstrip('\n') + '<'
        print(orec)

        # always using 'string' at least lets this program run
        (index, dvalue) = re.search(r'^XA([0-9]+)=(.*?[^#])!', string).groups()

        # The following works when there is a match... but fails with an error when
        # a match is NOT found, viz:-
        # ...
        # (index, dvalue) = re.search(r'^XA([0-9]+)=(.*?[^#])!', rec).groups()
        #
        # Traceback (most recent call last):
        #   File "T:\tmp\a.py", line 13, in <module>
        #     (index, dvalue) = re.search(r'^XA([0-9]+)=(.*?[^#])!', rec).groups()
        # AttributeError: 'NoneType' object has no attribute 'groups'
        #
        buf = ' index = >' + index + '<' + ' value = >' + dvalue + '<'
        print(buf)

exit(0)
data.dat contents:
DUD=ABC!QUEUE=D23!
XA32=7!P^=32!
PZ112=123^!PQ=ABC!
Python Output:
rec: >DUD=ABC!QUEUE=D23!<
index = >32< value = >7<
rec: >XA32=7!P^=32!<
index = >32< value = >7<
rec: >PZ112=123^!PQ=ABC!<
index = >32< value = >7<
Another development: Some more code to help me understand this better... but I'm unsure about when/how to use the match.group() or match.groups() ...
Python Code:
import re

rec = 'XA22=11^!S^=64!ABC=0,0!PX=0!SP=12B!'
print("rec = >{}<".format(rec))

# ----
index = 0; dvalue = 0; x = 0
match = re.match(r'XA([0-9]+)=(.*?[^#])!(.*?)!', rec)
if match:
    (index, dvalue, x) = match.groups()
print("3 (): index = >{}< value = >{}< x = >{}<".format(index, dvalue, x))

# ----
index = 0; dvalue = 0; x = 0
match = re.match(r'XA([0-9]+)=(.*?[^#])!', rec)
if match:
    (index, dvalue) = match.groups()
print("2 (): index = >{}< value = >{}< x = >{}<".format(index, dvalue, x))

# ----
index = 0; dvalue = 0; x = 0
match = re.match(r'XA([0-9]+)=', rec)
if match:
    #(index) = match.groups()   # Why doesn't this work like above examples!?
    (index, ) = match.groups()  # ...and yet this works!?
                                # Does match.groups ALWAYS return a tuple!?
    #(index) = match.group(1)   # This also works; 0 = entire matched string?
print("1 (): index = >{}< value = >{}< x = >{}<".format(index, dvalue, x))

# ----
index = 0; dvalue = 0; x = 0
match = re.search(r'S\^=([0-9]+)!', rec)
if match:
    (index, ) = match.groups()  # Returns tuple(?!)
print("1 (): index = >{}< value = >{}< x = >{}<".format(index, dvalue, x))
Again, I'd appreciate any thoughts on which is the 'preferred' way, or whether there's another way to deal with the groups.
You need to check for a match first, then use the groups. I.e.
compile the regexes (optional for most cases nowadays, according to the documentation)
apply each regex to the string to generate a match object
match() only matches at the beginning of a string, i.e. with an implicit ^ anchor
search() matches anywhere in the string
check if the match object is valid
extract the groups
skip to next loop iteration
# works with Python 2 and Python 3
import re

with open('dummy.txt', 'r') as fh:
    for rec in fh:
        orec = ' rec: >' + rec.rstrip('\n') + '<'
        print(orec)

        match = re.match(r'XA([0-9]+)=(.*?[^#])!', rec)
        if match:
            (index, dvalue) = match.groups()
            print(" index = >{}< value = >{}<".format(index, dvalue))
            continue

        match = re.match(r'PZ([0-9]+)=(.*?[^#])!', rec)
        if match:
            (index, dvalue) = match.groups()
            print(" index = >{}< value = >{}<".format(index, dvalue))
            continue

        print(" Unknown Record format")
Output:
$ python dummy.py
rec: >DUD=ABC!QUEUE=D23!<
Unknown Record format
rec: >XA32=7!P^=32!<
index = >32< value = >7<
rec: >PZ112=123^!PQ=ABC!<
index = >112< value = >123^<
But I'm wondering why you don't simplify your Perl & Python code to just use a single regex instead? E.g.:
match = re.match(r'(?:XA|PZ)([0-9]+)=(.*?[^#])!', rec)
if match:
    (index, dvalue) = match.groups()
    print(" index = >{}< value = >{}<".format(index, dvalue))
else:
    print(" Unknown Record format")
