I have wiki text like this:
data = """
{{hello}}
{{hello world}}
{{hello much { }}
{{a {{b}}}}
{{a
td {
}
{{inner}}
}}
"""
and I want to extract the macros inside it. A macro is text enclosed between {{ and }}, so I tried using nestedExpr:
from pyparsing import *
import pprint
def getMacroCandidates(txt):
    candidates = []

    def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.copy()):
        if opener == closer:
            raise ValueError("opening and closing strings cannot be the same")
        if content is None:
            if isinstance(opener, str) and isinstance(closer, str):
                if ignoreExpr is not None:
                    content = (Combine(OneOrMore(~ignoreExpr +
                                                 ~Literal(opener) + ~Literal(closer) +
                                                 CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1))
                                       ).setParseAction(lambda t: t[0]))
        ret = Forward()
        ret <<= Group(opener + ZeroOrMore(ignoreExpr | ret | content) + closer)
        ret.setName('nested %s%s expression' % (opener, closer))
        return ret

    # use {}'s for nested lists
    macro = nestedExpr("{{", "}}")
    # print(( (nestedItems+stringEnd).parseString(data).asList() ))
    for toks, preloc, nextloc in macro.scanString(data):
        print(toks)
    return candidates
data = """
{{hello}}
{{hello world}}
{{hello much { }}
{{a {{b}}}}
{{a
td {
}
{{inner}}
}}
"""
getMacroCandidates(data)
Which gives me the tokens, with the whitespace removed:
[['{{', 'hello', '}}']]
[['{{', 'hello', 'world', '}}']]
[['{{', 'hello', 'much', '{', '}}']]
[['{{', 'a', ['{{', 'b', '}}'], '}}']]
[['{{', 'a', 'td', '{', '}', ['{{', 'inner', '}}'], '}}']]
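As an aside (this is not part of the original attempt): if the goal is the raw macro text rather than the split token lists, pyparsing's originalTextFor can wrap the nested expression and return each match verbatim:
from pyparsing import nestedExpr, originalTextFor

macro = originalTextFor(nestedExpr("{{", "}}"))
for toks, start, end in macro.scanString(data):
    print(toks[0])   # e.g. '{{hello}}', '{{a {{b}}}}', ...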
You can just replace the braces and use shlex:
data = """
{{hello}}
{{hello world}}
{{hello much { }}
{{a {{b}}}}
{{a
td {
}
{{inner}}
}}
"""
import shlex
data1= data.replace("{{",'"')
data2 = data1.replace("}}",'"')
data3= data2.replace("}"," ")
data4= data3.replace("{"," ")
data5= ' '.join(data4.split())
print(shlex.split(data5.replace("\n"," ")))
Output:
This returns all the tokens, with braces and whitespace removed and extra blank lines dropped as well:
['hello', 'hello world', 'hello much ', 'a b', 'a td inner ']
PS: This can be combined into a single expression; multiple expressions are used above for readability.
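For reference, here is the same chain collapsed into one expression (equivalent to the steps above):
import shlex

print(shlex.split(' '.join(
    data.replace("{{", '"').replace("}}", '"').replace("}", " ").replace("{", " ").split()
)))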
I have a file that is filled with cultures, and each culture has a unique set of surnames for the people of that culture. The problem is that there are numerous files, each with hundreds, if not thousands, of names, so instead of gathering all of these files by hand I would like to automate the task with Python and regex.
Here is an example of the file's contents:
###Myrman###
360 = { # DUPLICATE §§§§§§
name="of Myr"
culture = myrman
}
300507 = {
name = "of Myr"
culture = myrman
}
300525 = {
name = "Trellos"
culture = myrman
}
300534 = {
name = "Uteuran"
culture = myrman
}
##Lysene##
1386 = {
name="Ormollen"
culture = lysene
coat_of_arms = {
template = 0
layer = {
texture = 14
texture_internal = 9
emblem = 0
color = 0
color = 0
color = 0
}
}
}
300505 = {
name = "of Lys"
culture = lysene
}
300523 = {
name = "Lohar"
culture = lysene
}
300532 = {
name = "Assadyrn"
culture = lysene
}
So as you can see, there are two cultures here, each with different surnames for the people of the respective culture. I want to take all of these names and sort them into different groups, separated by commas and quotes. Here is an example of what I want to happen:
Myrman: ["of Myr", "of Myr", "Trellos", "Uteuran"]
Lysene: ["Ormollen", "of Lys", "Lohar", "Assadyrn"]
How would I go about doing this with Python and its regex library?
Ooh, a parser problem! Let's use the lark parser generator to figure this out.
First, let's create a syntax for our file - this is cobbled together based on the JSON parser example:
import lark
parser = lark.Lark(r"""
start: (term)*
term: key "=" value "\n"
key: CNAME | SIGNED_NUMBER
value: CNAME | SIGNED_NUMBER | ESCAPED_STRING | map
map: "{" (term)* "}"
%import common.CNAME
%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS
%ignore WS
%ignore /#.*/
""")
Pretty straightforward; the file is a list of terms, which are key-values, where the key may be a name or a number, and the value can be a name, number, string, or a map, which is a brace-enclosed list of terms.
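If you want to eyeball what Lark produces before transforming it, Tree.pretty() gives an indented dump (data here is the file contents, which get loaded a bit further down):
tree = parser.parse(data)
print(tree.pretty())   # indented, human-readable view of the raw parse tree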
Then, let's write a transformer to transform the Lark parse tree to a dict:
class TreeTransformer(lark.Transformer):
    def start(self, items):
        return dict(items)

    def term(self, items):
        return (items[0], items[1])

    def CNAME(self, item):
        return item.value

    def SIGNED_NUMBER(self, item):
        return int(item.value)

    def ESCAPED_STRING(self, item):
        return item.value[1:-1]

    def map(self, items):
        return dict(items)

    def key(self, item):
        return item[0]

    def value(self, item):
        return item[0]
Could probably be more terse, but this works.
Let's run it against the data:
from pathlib import Path
from pprint import pprint
data = Path("./so75472097-data.txt").read_text()
tree = parser.parse(data)
res = TreeTransformer().transform(tree)
pprint(res)
The output is
{360: {'culture': 'myrman', 'name': 'of Myr'},
1386: {'coat_of_arms': {'layer': {'color': 0,
'emblem': 0,
'texture': 14,
'texture_internal': 9},
'template': 0},
'culture': 'lysene',
'name': 'Ormollen'},
300505: {'culture': 'lysene', 'name': 'of Lys'},
300507: {'culture': 'myrman', 'name': 'of Myr'},
300523: {'culture': 'lysene', 'name': 'Lohar'},
300525: {'culture': 'myrman', 'name': 'Trellos'},
300532: {'culture': 'lysene', 'name': 'Assadyrn'},
300534: {'culture': 'myrman', 'name': 'Uteuran'}}
-- that looks promising!
Then, it's just a matter of dict traversal:
from collections import defaultdict
names_by_culture = defaultdict(list)
for info in res.values():
    names_by_culture[info["culture"]].append(info["name"])
pprint(dict(names_by_culture))
... and hey voilà!
{'lysene': ['Ormollen', 'of Lys', 'Lohar', 'Assadyrn'],
'myrman': ['of Myr', 'of Myr', 'Trellos', 'Uteuran']}
Now, all you have to do is wrap that bad boy into a function and call it on all of your files.
(EDIT, now that I read the latest comment and know what to google: you could just use the ClauseWizard library instead of writing the parser yourself, but this was more fun!)
EDIT 2
As discussed in the comments, a grammar and transformer that's also fine with "basically anything" for unquoted values:
parser = lark.Lark(r"""
start: (term)*
term: key "=" value "\n"
key: KEYNAME | SIGNED_NUMBER
value: VALUENAME | SIGNED_NUMBER | ESCAPED_STRING | map
map: "{" (term)* "}"
VALUENAME: /[a-zA-Z][^\s=]*/
KEYNAME: /[a-zA-Z][-a-zA-Z0-9_]*/
%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS
%ignore WS
%ignore /#.*/
""")
from operator import itemgetter, attrgetter
class TreeTransformer(lark.Transformer):
    start = dict
    map = dict
    key = itemgetter(0)
    value = itemgetter(0)
    VALUENAME = attrgetter("value")
    KEYNAME = attrgetter("value")
    term = tuple

    def SIGNED_NUMBER(self, item):
        return int(item.value)

    def ESCAPED_STRING(self, item):
        return item.value[1:-1]
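Usage is unchanged: feed the parse tree to the transformer exactly as before.
tree = parser.parse(data)
res = TreeTransformer().transform(tree)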
Since the file is well structured, you can just use a regex with the right pattern and process the tuples it returns accordingly.
import re

# `a` holds the file contents
result = re.findall(r'name[ ]*=[ ]*"([A-Za-z ]+)"\n[ ]+culture[ ]*=[ ]*([A-Za-z]+)', a)

names_by_culture = {}
for i in result:
    name = i[0]
    culture = i[1]
    try:
        names_by_culture[culture].append(name)
    except KeyError:
        names_by_culture[culture] = []
        names_by_culture[culture].append(name)
print(names_by_culture)
Output:
{'myrman': ['of Myr', 'of Myr', 'Trellos', 'Uteuran'],
'lysene': ['Ormollen', 'of Lys', 'Lohar', 'Assadyrn']}
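For what it's worth, the same grouping can also be written with collections.defaultdict; this is a minor variation on the loop above rather than part of the original answer:
from collections import defaultdict

names_by_culture = defaultdict(list)
for name, culture in result:
    names_by_culture[culture].append(name)
print(dict(names_by_culture))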
I'm looking to ignore characters in-between words using NLTK word_tokenize.
If I have a sentence:
test = 'Should I trade on the S&P? This works with a phone number 333-445-6635 and email test#testing.com'
The word_tokenize method is splitting the S&P into
'S','&','P','?'
Is there a way to have this library ignore punctuation between words or letters?
Expected output: 'S&P','?'
Let me know how this works with your sentences.
I added an additional test with a bunch of punctuation.
The regular expression is, in the final portion, modified from the WordPunctTokenizer regexp.
from nltk.tokenize import RegexpTokenizer
punctuation = r'[]!"$%&\'()*+,./:;=##?[\\^_`{|}~-]?'
tokenizer = RegexpTokenizer(r'\w+' + punctuation + r'\w+?|[^\s]+?')
# result:
In [156]: tokenizer.tokenize(test)
Out[156]: ['Should', 'I', 'trade', 'on', 'the', 'S&P', '?']
# additional test:
In [225]: tokenizer.tokenize('"I am tired," she said.')
Out[225]: ['"', 'I', 'am', 'tired', ',', '"', 'she', 'said', '.']
Edit: the requirements changed a bit so we can slightly modify PottsTweetTokenizer for this purpose.
import re
import html.entities

emoticon_string = r"""
(?:
[<>]?
[:;=8] # eyes
[\-o\*\']? # optional nose
[\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
|
[\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
[\-o\*\']? # optional nose
[:;=8] # eyes
[<>]?
)"""
# Twitter symbols/cashtags: # Added by awd, 20140410.
# Based upon Twitter's regex described here: <https://blog.twitter.com/2013/symbols-entities-tweets>.
cashtag_string = r"""(?:\$[a-zA-Z]{1,6}([._][a-zA-Z]{1,2})?)"""
# The components of the tokenizer:
regex_strings = (
# Phone numbers:
r"""
(?:
(?: # (international)
\+?[01]
[\-\s.]*
)?
(?: # (area code)
[\(]?
\d{3}
[\-\s.\)]*
)?
\d{3} # exchange
[\-\s.]*
\d{4} # base
)"""
,
# Emoticons:
emoticon_string
,
# HTML tags:
r"""(?:<[^>]+>)"""
,
# URLs:
r"""(?:http[s]?://t.co/[a-zA-Z0-9]+)"""
,
# Twitter username:
r"""(?:#[\w_]+)"""
,
# Twitter hashtags:
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
,
# Twitter symbols/cashtags:
cashtag_string
,
# email addresses
r"""(?:[\w.+-]+#[\w-]+\.(?:[\w-]\.?)+[\w-])""",
# Remaining word types:
r"""
(?:[a-z][^\s]+[a-z]) # Words with punctuation (modification here).
|
(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
|
(?:[\w_]+) # Words without apostrophes or dashes.
|
(?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
(?:\S) # Everything else that isn't whitespace.
"""
)
word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
# The emoticon and cashtag strings get their own regex so that we can preserve case for them as needed:
emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)
cashtag_re = re.compile(cashtag_string, re.VERBOSE | re.I | re.UNICODE)
# These are for regularizing HTML entities to Unicode:
html_entity_digit_re = re.compile(r"&#\d+;")
html_entity_alpha_re = re.compile(r"&\w+;")
amp = "&"
class CustomTweetTokenizer(object):
    def __init__(self, *, preserve_case: bool=False):
        self.preserve_case = preserve_case

    def tokenize(self, tweet: str) -> list:
        """
        Argument: tweet -- any string object.
        Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=True
        """
        # Fix HTML character entities:
        tweet = self._html2unicode(tweet)
        # Tokenize:
        matches = word_re.finditer(tweet)
        if self.preserve_case:
            return [match.group() for match in matches]
        return [self._normalize_token(match.group()) for match in matches]

    @staticmethod
    def _normalize_token(token: str) -> str:
        if emoticon_re.search(token):
            # Avoid changing emoticons like :D into :d
            return token
        if token.startswith('$') and cashtag_re.search(token):
            return token.upper()
        return token.lower()

    @staticmethod
    def _html2unicode(tweet: str) -> str:
        """
        Internal method that seeks to replace all the HTML entities in
        tweet with their corresponding unicode characters.
        """
        # First the digits:
        ents = set(html_entity_digit_re.findall(tweet))
        if len(ents) > 0:
            for ent in ents:
                entnum = ent[2:-1]
                try:
                    entnum = int(entnum)
                    tweet = tweet.replace(ent, chr(entnum))
                except ValueError:
                    pass
        # Now the alpha versions:
        ents = set(html_entity_alpha_re.findall(tweet))
        ents = filter((lambda x: x != amp), ents)
        for ent in ents:
            entname = ent[1:-1]
            try:
                tweet = tweet.replace(ent, chr(html.entities.name2codepoint[entname]))
            except KeyError:
                pass
        tweet = tweet.replace(amp, " and ")
        return tweet
To test it out:
tknzr = CustomTweetTokenizer(preserve_case=True)
tknzr.tokenize(test)
# result:
['Should',
'I',
'trade',
'on',
'the',
'S&P',
'?',
'This',
'works',
'with',
'a',
'phone',
'number',
'333-445-6635',
'and',
'email',
'test#testing.com']
Following up on @mechanical_meat's answer,
There's a twitter text tokenizer in NLTK
Most probably, it's derived from the PottsTweetTokenizer at https://github.com/nltk/nltk/blob/develop/nltk/tokenize/casual.py
from nltk.tokenize import TweetTokenizer
tt = TweetTokenizer()
text = 'Should I trade on the S&P? This works with a phone number 333-445-6635 and email test#testing.com'
print(tt.tokenize(text))
[out]:
['Should', 'I', 'trade', 'on', 'the', 'S', '&', 'P', '?', 'This', 'works', 'with', 'a', 'phone', 'number', '333-445-6635', 'and', 'email', 'test#testing.com']
But that doesn't solve the S&P problem!!
So you can try the Multi-Word Expression approach, see https://stackoverflow.com/a/55644296/610569
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import MWETokenizer
def multiword_tokenize(text, mwe, tokenize_func=word_tokenize):
    # Initialize the MWETokenizer
    protected_tuples = [tokenize_func(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text.
    tokenized_text = tokenizer.tokenize(tokenize_func(text))
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
text = 'Should I trade on the S&P? This works with a phone number 333-445-6635 and email test#testing.com'
mwe = ['S&P']
tt = TweetTokenizer()
print(multiword_tokenize(text, mwe, tt.tokenize))
[out]:
['Should', 'I', 'trade', 'on', 'the', 'S&P', '?', 'This', 'works', 'with', 'a', 'phone', 'number', '333-445-6635', 'and', 'email', 'test#testing.com']
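Note that MWETokenizer joins each protected expression with '_' by default, which is why the helper above splits the underscores back out. If keeping the merged token is all you need, a more direct use (relying on the tokenisation shown in the question) could look like this:
from nltk import word_tokenize
from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer([('S', '&', 'P')], separator='')
print(tokenizer.tokenize(word_tokenize('Should I trade on the S&P?')))
# ['Should', 'I', 'trade', 'on', 'the', 'S&P', '?']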
I'm scanning a ".twig" (PHP template) file and trying to capture the property names of an object.
The twig file contains lines (strings) like these:
{{ product.id }}
{{ product.parentProductId }}
{{ product.countdown.startDate | date('Y/m/d H:i:s') }}
{{ product.countdown.endDate | date('Y/m/d H:i:s') }}
{{ product.countdown.expireDate | date('Y/m/d H:i:s') }}
{{ product.primaryImage.originalUrl }}
{{ product.image(1).originalUrl }}
{{ product.image(1).thumbUrl }}
{{ product.priceWithTax(preferences.default_currency) | money }}
The things I want to capture are:
.id
.parentProductId
.countdown
.startDate
.endDate
.expireDate
.primaryImage
.originalUrl
.image(1)
.originalUrl
.thumbUrl
.priceWithTax(preferences.default_currency)
Basically, I'm trying to figure out the properties of the product object. I have the following pattern, but it doesn't capture chained properties. For example,
"{{.+?product(\.[a-zA-Z]+(?:\(.+?\)){,1})++.+?}}" captures only .startDate, but it should capture both .countdown and .startDate seperately. Is this not possible, or am I missing something?
regex101
I could capture ("{{.+?product((?:\.[a-zA-Z]+(?:\(.+?\)){,1})+).+?}}") it as a whole (.countdown.startDate) and later check/split it, but this sounds troublesome.
If you want to handle it with a single regex, you might want to use the PyPi regex module:
import regex
s = """{{ product.id }}
{{ product.parentProductId }}
{{ product.countdown.startDate | date('Y/m/d H:i:s') }}
{{ product.primaryImage.originalUrl }}
{{ product.image(1).originalUrl }}
{{ product.priceWithTax(preferences.default_currency) | money }}"""
rx = r'{{[^{}]*product(\.[a-zA-Z]+(?:\([^()]+\))?)*[^{}]*}}'
l = [m.captures(1) for m in regex.finditer(rx, s)]
print([item for sublist in l for item in sublist])
# => ['.id', '.parentProductId', '.countdown', '.startDate', '.primaryImage', '.originalUrl', '.image(1)', '.originalUrl', '.priceWithTax(preferences.default_currency)']
See the Python demo
The {{[^{}]*product(\.[a-zA-Z]+(?:\([^()]+\))?)*[^{}]*}} regex will match
{{ - {{ substring
[^{}]* - 0+ chars other than { and }
product - the substring product
(\.[a-zA-Z]+(?:\([^()]+\))?)* - Capturing group 1: zero or more sequences of
\. - a dot
[a-zA-Z]+ - 1+ ASCII letters
(?:\([^()]+\))? - an optional sequence of (, 1+ chars other than ( and ) and then )
[^{}]* - 0+ chars other than { and }
}} - a }} substring.
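The key point is that the regex module remembers every repetition of a repeated capturing group, whereas re only keeps the last one. A tiny illustration, separate from the answer's code:
import regex

m = regex.search(r'(\.\w+)+', '.countdown.startDate')
print(m.group(1))     # '.startDate'  (re-style behaviour: only the last repetition)
print(m.captures(1))  # ['.countdown', '.startDate']  (every repetition)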
If you are only limited to re, you will need to capture all the properties into 1 capturing group (wrap this (\.[a-zA-Z]+(?:\([^()]+\))?)* with (...)) and then run a regex based post-process to split by . not inside parentheses:
import re
rx = r'{{[^{}]*product((?:\.[a-zA-Z]+(?:\([^()]+\))?)*)[^{}]*}}'
l = re.findall(rx, s)
res = []
for m in l:
    res.extend([".{}".format(n) for n in filter(None, re.split(r'\.(?![^()]*\))', m))])
print(res)
# => ['.id', '.parentProductId', '.countdown', '.startDate', '.primaryImage', '.originalUrl', '.image(1)', '.originalUrl', '.priceWithTax(preferences.default_currency)']
See this Python demo
Try this one; it captures everything in your requirement:
^{{ product(\..*?[(][^\d\/]+[)]).*?}}|^{{ product(\..*?)(\..*?)?(?= )
Demo and explanation at regex101.
I've decided to stick to re (instead of regex, as suggested by Victor) and this is what I ended up with:
import re, json
file = open("test.twig", "r", encoding="utf-8")
content = file.read()
file.close()
patterns = {
    "template" : r"{{[^{}]*product((?:\.[a-zA-Z]+(?:\([^()]+\))?)*)[^{}]*}}",
    "prop"     : r"^[^\.]+$",                 # .id
    "subprop"  : r"^[^\.()]+(\.[^\.]+)+$",    # .countdown.startDate
    "itemprop" : r"^[^\.]+\(\d+\)\.[^\.]+$",  # .image(1).originalUrl
    "method"   : r"^[^\.]+\(.+\)$",           # .priceWithTax(preferences.default_currency)
}

temp_re = re.compile(patterns["template"])
matches = temp_re.findall(content)
product = {}

for match in matches:
    match = match[1:]
    if re.match(patterns["prop"], match):
        product[match] = match
    elif re.match(patterns["subprop"], match):
        match = match.split(".")
        if match[0] not in product:
            product[match[0]] = []
        if match[1] not in product[match[0]]:
            product[match[0]].append(match[1])
    elif re.match(patterns["itemprop"], match):
        match = match.split(".")
        array = re.sub(r"\(\d+\)", "(i)", match[0])
        if array not in product:
            product[array] = []
        if match[1] not in product[array]:
            product[array].append(match[1])
    elif re.match(patterns["method"], match):
        product[match] = match

props = json.dumps(product, indent=4)
print(props)
Example output:
{
    "id": "id",
    "parentProductId": "parentProductId",
    "countdown": [
        "startDate",
        "endDate",
        "expireDate"
    ],
    "primaryImage": [
        "originalUrl"
    ],
    "image(i)": [
        "originalUrl",
        "thumbUrl"
    ],
    "priceWithTax(preferences.default_currency)": "priceWithTax(preferences.default_currency)"
}
There are lots of articles about parsing XML with ElementTree. I've gone through a bunch of them and read through the docs, but I can't come up with a solution that works for me. I'm trying to supplement info that's created by another app in an .nfo file, but I need to preserve the conventions in the file.
Here is an example of how the file is laid out
<title>
<name>Test Name</name>
<alt name />
<file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>
<file local="" type="excel">http://filestore/file2.xls</file>
<file local="C:\file\file3.xls" type="excel" />
<file local="" type="ppt" />
</title>
Note: elements are not closed properly, e.g. <alt name /> should be <alt name></alt name>.
This is what I'm running...
import xml.etree.ElementTree as ET
tree = ET.parse('file.nfo')
root = tree.getroot()
The error I'm getting is...
xml.etree.ElementTree.ParseError: not well-formed (invalid token):
I've tried...
myparser = ET.XMLParser(encoding='UTF-8')
tree = ET.parse('file.nfo', myparser)
Also tried xmlparser and opening with codecs, but I'm pretty sure it's the formatting. I'm guessing the immediate issue is the non-escaped >, but I suspect ET needs proper opening/closing tags?
I'm sure I could open this file and go through it with regex, but I was hoping to use ElementTree.
The end goal is to have the details from the nfo as a dictionary that looks like...
dict = {'title': [{'name': 'Test Name',
'alt name': '',
'file': [{'local': 'C:\file\file1.doc', 'type': 'word', 'url': 'http://filestore/file1.doc'},
{'local': '', 'type': 'excel', 'url': 'http://filestore/file2.xls'},
{'local': 'C:\file\file3.xls', 'type': 'excel', 'url': ''},
{'local': '', 'type': 'ppt', 'url': ''}]
}]}
I'm sure there is a better (more Pythonic) way to do this, but I'm pretty new to Python.
Any help would be appreciated.
EDIT: I'm also trying to avoid using 3rd party libraries if possible
So I ended up creating a custom parser of sorts; it's not ideal, but it works. It was suggested to me that lxml and html.parser may parse malformed XML better, but I just went with this.
I'm also still very interested in any feedback, whether on this or on using any other method.
import re
def merge_dicts(*dict_args):
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result

def make_dict(str_arg, op):
    result = {}
    result = dict(s.split(op) for s in str_arg.split(","))
    return result
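# e.g. make_dict('type="word"', '=') -> {'type': '"word"'}
# (the surrounding quotes stay in the value)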
'''
Samples
lst = r' <name>Test Name</name>'
lst = r' <alt name />'
lst = r' <file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>'
lst = r' <file local="" type="excel">http://filestore/file2.xls</file>'
lst = r' <file local="C:\file\file3.xls" type="excel" />'
lst = r' <file local="" type="ppt" />'
'''
def match_pattern(file_str):
    #<description>desc blah</description>'
    pattern1 = r'''(?x)
        ^
        \s*                                       # cut leading whitespace
        (?P<whole_thing>
          < (?P<tag_open> (\w+?|\w*\s\w+?)+) \b   # word boundary, so we can
          >                                       # skip attributes
          (?P<tag_body> .+? )                     # insides
          </ (?P<tag_close> (\w+?|\w*\s\w+?)+) >  # closing tag, nothing interesting
        )
        $'''
    #<alt name />
    pattern2 = r'''(?x)
        ^
        \s*
        (?P<whole_thing>
          < (?P<tag_open> (\w+?|\w*\s\w+?)+) \b
          \s/>
        )
        $'''
    #<file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>'
    pattern3 = r'''(?x)
        ^
        \s*
        (?P<whole_thing>
          < (?P<tag_open> (\w+?|\w*\s\w+!=?)+) \b
          \s
          (?P<tag_attrib1> (\w*\=.*?))            # 1st attribute
          \s
          (?P<tag_attrib2> (\w*\=.*))             # 2nd attribute
          .*? >
          (?P<tag_body> .+? )
          </ (?P<tag_close> (\w+?|\w*\s\w+?)+) >
        )
        $'''
    #<file local="" type="ppt" />
    pattern4 = r'''(?x)
        ^
        \s*
        (?P<whole_thing>
          < (?P<tag_open> (\w+?|\w*\s\w+!=?)+) \b
          \s
          (?P<tag_attrib1> (\w*\=.*?))            # 1st attribute
          \s
          (?P<tag_attrib2> (\w*\=.*))             # 2nd attribute
          \s/>
        )
        $'''

    pat_str = 'pattern'
    pat_val = 1
    return_dict = {}

    while (pat_val <= 4):
        pattern = pat_str + str(pat_val)
        matchObj = re.match(eval(pattern), file_str, re.L|re.M)
        if matchObj:
            #for k, v in matchObj.groupdict().items():
            #    print('matchObj.group({!r}) == {!r}'.format(k, v))
            if pat_val == 1:
                body = matchObj.group('tag_body')
                return_dict = {matchObj.group('tag_open'): body}
            elif pat_val == 2:
                return_dict = {matchObj.group('tag_open'): ''}
            elif pat_val == 3:
                attr1 = make_dict(matchObj.group('tag_attrib1'), '=')
                attr2 = make_dict(matchObj.group('tag_attrib2'), '=')
                body = {'url': matchObj.group('tag_body')}
                attrib = merge_dicts(attr1, attr2, body)
                return_dict = {matchObj.group('tag_open'): attrib}
            elif pat_val == 4:
                attr1 = make_dict(matchObj.group('tag_attrib1'), '=')
                attr2 = make_dict(matchObj.group('tag_attrib2'), '=')
                body = {'url': ''}
                attrib = merge_dicts(attr1, attr2, body)
                return_dict = {matchObj.group('tag_open'): attrib}
            return return_dict
        else:
            pat_val = pat_val + 1
            if pat_val > 4:
                print("No match!!")
#print(match_pattern(lst))
def in_file(file):
    result = {}
    with open(file, "r") as file:
        data = (file.read().splitlines())
        for d in data:
            if data.index(d) == 0 or data.index(d) == len(data)-1:
                if data.index(d) == 0:
                    print(re.sub('<|/|>', '', d))
            elif d:
                lst = []
                dct = {}
                if 'file' in match_pattern(d).keys():
                    for i in match_pattern(d).items():
                        if 'file' in result.keys():
                            lst = result['file']
                            lst.append(i[1])
                            dct = {i[0]: lst}
                            result = merge_dicts(result, dct)
                            #print(result['file'])
                        else:
                            dct = {i[0]: [i[1]]}
                            result = merge_dicts(result, dct)
                else:
                    result = merge_dicts(result, match_pattern(d))
                    print('else', match_pattern(d))
    return result
print(in_file('C:\\test.nfo'))
NOTE: I dropped the top most dictionary from the original post
There are four keywords: title, blog, tags, state
Excess keyword occurrences are being removed from their respective matches.
Example: the line "blog: blog state title tags and" returns "state title tags and" instead of "blog state title tags and".
The sub function should be matching .+ after it sees blog:, so I don't know why it treats blog as an exception to .+
Regex:
re.sub(r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s).+(\n|$))', matcher, a)
Code:
def n15():
    import re
    a = """blog: blog: fooblog
state: private
title: this is atitle bun
and text"""
    kwargs = {}

    def matcher(string):
        v = string.group(1).replace(string.group(2), '').replace(string.group(3), '').replace(string.group(4), '').replace(string.group(5), '')
        if string.group(3) == 'title':
            kwargs['title'] = v
        elif string.group(3) == 'blog':
            kwargs['blog_url'] = v
        elif string.group(3) == 'tags':
            kwargs['comma_separated_tags'] = v
        elif string.group(3) == 'state':
            kwargs['post_state'] = v
        return ''

    a = re.sub(r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s).+(\n|$))', matcher, a)
    a = a.replace('\n', '<br />')
    a = a.replace('\r', '')
    a = a.replace('"', r'\"')
    a = '<p>' + a + '</p>'
    kwargs['body'] = a
    print kwargs
Output:
{'body': '<p>and text</p>', 'post_state': 'private', 'blog_url': 'foo', 'title': 'this is a bun'}
Edit:
Desired Output:
{'body': '<p>and text</p>', 'post_state': 'private', 'blog_url': 'fooblog', 'title': 'this is atitle bun'}
replace(string.group(3), '')
is replacing all occurrences of 'blog' with ''.
Rather than try to replace all the other parts of the matched string, which will be hard to get right, I suggest capturing the string you actually want in the original match:
r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s)(.+)(\n|$))'
which has () around the .+ to capture that part of the string, then
v = string.group(5)
at the start of matcher.
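Putting that together, here is a minimal sketch (Python 3) of the suggested fix, trimmed down from the question's n15(); the duplicated "blog:" prefix from the original input is left out so the captured value is unambiguous:
import re

a = """blog: fooblog
state: private
title: this is atitle bun
and text"""

kwargs = {}

def matcher(string):
    key = string.group(3)
    v = string.group(5)   # the captured value -- no piecewise stripping needed
    if key == 'title':
        kwargs['title'] = v
    elif key == 'blog':
        kwargs['blog_url'] = v
    elif key == 'tags':
        kwargs['comma_separated_tags'] = v
    elif key == 'state':
        kwargs['post_state'] = v
    return ''

a = re.sub(r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s)(.+)(\n|$))', matcher, a)
kwargs['body'] = '<p>' + a.replace('\n', '<br />') + '</p>'
print(kwargs)
# {'blog_url': 'fooblog', 'post_state': 'private', 'title': 'this is atitle bun',
#  'body': '<p>and text</p>'}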