How to remove N/A values from a file in Apache Beam - python

Just looking for advice. If I have a file like the one below, what function do I use in Apache Beam to remove any N/A value? I tried Filter, but it removes the whole row, whereas I just want to remove that 'cell' if it's N/A. I first read the file in and then split the rows using a Split DoFn class called via ParDo; now I want to remove any N/A values.
Example file
Start_loc, Loc_2, loc_3, loc_4, end_loc
Loc 1, loc 2, N/A, loc 3, loc 4
Loc 1, N/A, N/A, N/A, loc 2
Any suggestions?

I had tried the below before seeing these answers. It works; I am just aware it might not be the 'right' way of doing it. Any feedback on the below?
from datetime import datetime

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

class Split(beam.DoFn):
    def process(self, element):
        """
        Splits each row on commas and returns a dictionary representing the
        row.
        """
        Start_loc, loc_1, loc_2, loc_3, loc_4, End_loc = element.split(",")
        return [{
            'Start_loc': Start_loc,
            'Loc_1': loc_1,
            'Loc_2': loc_2,
            'Loc_3': loc_3,
            'Loc_4': loc_4,
            'End_loc': End_loc
        }]
class CleanFile(beam.DoFn):
    def process(self, element):
        for k, v in list(element.items()):
            if v == 'N/A':
                element[k] = None
        return [{
            'Start_loc': element['Start_loc'],
            'Loc_1': element['Loc_1'],
            'Loc_2': element['Loc_2'],
            'Loc_3': element['Loc_3'],
            'Loc_4': element['Loc_4'],
            'End_loc': element['End_loc']
        }]
class CombineColumns(beam.DoFn):
    def process(self, element):
        """
        Prepares each row to be written in the csv
        """
        other_loc = ''
        for k, v in list(element.items()):
            if v is not None and k != 'Start_loc' and k != 'End_loc' and other_loc == '':
                other_loc = '"' + v
            elif v is not None and k != 'Start_loc' and k != 'End_loc':
                other_loc = other_loc + ',' + v
        other_loc = other_loc + '"'
        return [{
            'Start_loc': element['Start_loc'],
            'Other_loc': other_loc,
            'End_loc': element['End_loc']
        }]
class WriteToCSV(beam.DoFn):
    def process(self, element):
        """
        Prepares each row to be written in the csv
        """
        result = [
            "{},{},{}".format(
                element['Start_loc'],
                element['Other_loc'],
                element['End_loc']
            )
        ]
        return result
def process_file():
    pipeline_options = PipelineOptions()
    user_options = pipeline_options.view_as(UserOptions)  # UserOptions: custom options class defined elsewhere
    tstmp = datetime.now().strftime("%Y%m%d%H")
    input = user_options.input
    output = user_options.output
    with beam.Pipeline(options=pipeline_options) as p:
        rows = (p
                | 'Read from a File' >> beam.io.ReadFromText(input, skip_header_lines=1)
                | beam.ParDo(Split()))
        clean = rows | beam.ParDo(CleanFile())
        formatted = clean | beam.ParDo(CombineColumns())
        (formatted
         | beam.ParDo(WriteToCSV())
         | beam.io.WriteToText(output, file_name_suffix=".csv",
                               header='Start_loc,Other_loc,End_loc'))

I tried it out using the Create method to create a PCollection and then applied a ParDo on it to get the desired result. I have assumed that you want to replace the N/A string with an empty string.
The DoFn reads the string, splits it on the delimiter ",", and replaces "N/A" with "" before adding each field to a list. After that it returns the list values joined by the delimiter.
import apache_beam as beam

class FilterFn(beam.DoFn):
    def __init__(self, delimiter, filter_item):
        self.delimiter = delimiter
        self.filter_item = filter_item

    def process(self, text):
        a_list = []
        for word in text.split(self.delimiter):
            if word.strip() == self.filter_item:  # Replace the condition and output to your requirement
                a_list.append("")
            else:
                a_list.append(word)
        # print(",".join(a_list))
        yield self.delimiter.join(a_list)

with beam.Pipeline() as pipeline:
    plants = (
        pipeline
        | 'Create Dummy Input' >> beam.Create(["Loc 1,loc 2,N/A,loc 3,loc 4"])
        | 'Split words and Remove N/A' >> beam.ParDo(FilterFn(',', 'N/A'))
        | beam.Map(print)  # Do further processing
    )
The output string that I get after this ParDo is:
Loc 1,loc 2,,loc 3,loc 4
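The same replacement can also be written as a single beam.Map, which avoids defining a DoFn class for a one-to-one transform. A minimal sketch of that variant (same dummy input as above; adjust the delimiter and filter value as needed):

import apache_beam as beam

def drop_na(line, delimiter=',', filter_item='N/A'):
    # Replace each N/A cell with an empty string, keeping the row shape.
    return delimiter.join('' if cell.strip() == filter_item else cell
                          for cell in line.split(delimiter))

with beam.Pipeline() as pipeline:
    (pipeline
     | 'Create Dummy Input' >> beam.Create(["Loc 1,loc 2,N/A,loc 3,loc 4"])
     | 'Remove N/A' >> beam.Map(drop_na)
     | beam.Map(print))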

Related

How to reduce called parameters in methods?

I have a class, and in that class I have a method that calls multiple other methods.
The problem I am facing is that the method that calls the other methods passes a duplicate parameter.
And so when I call the method that wraps the others, one of the inner calls returns an empty list: [].
So this is the method that calls the multiple methods:
def show_extracted_data_from_file(self, file_name):
    self.extractingText.extract_text_from_image(file_name)
    total_fruit = self.filter_verdi_total_number_fruit()
    fruit_name = self.filter_verdi_fruit_name()
    fruit_total_cost = self.filter_verdi_total_fruit_cost(file_name)
    return "\n".join("{} \t {} \t {}".format(a, b, c)
                     for a, b, c in zip(total_fruit, fruit_name, fruit_total_cost))
and this is the method: filter_verdi_total_fruit_cost:
def filter_verdi_total_fruit_cost(self, file_name):
    locale.setlocale(locale.LC_ALL, locale='Dutch')
    self.extractingText.extract_text_from_image(file_name)
    return [
        locale.atof(items[-1]) for items in (
            token.split() for token in file_name.split('\n')
        ) if len(items) > 2 and items[1] in self.extractingText.list_fruit
    ]
this method returns the following data:
[123.2, 2772.0, 46.2, 577.5, 69.3, 3488.16, 137.5, 500.0, 1000.0, 2000.0, 1000.0, 381.25]
You can see that I am passing file_name twice.
And so when I call the method show_extracted_data_from_file in views.py:
if uploadfile.image.path.endswith('.pdf'):
    content = filter_text.show_extracted_data_from_file(uploadfile.image.path)
    print(content)
it produces an empty list: []
Question: how can I reduce the parameter file_name so that it returns the correct results?
These are the two other methods that I am calling in the combined method:
def filter_verdi_total_number_fruit(self):
    regex = r"(\d*(?:\.\d+)*)\s*\W+(?:" + '|'.join(re.escape(word)
            for word in self.extractingText.list_fruit) + ')'
    return re.findall(regex, self.extractingText.text_factuur_verdi[0])

def filter_verdi_fruit_name(self):
    regex = r"(?:\d*(?:\.\d+)*)\s*\W+(" + '|'.join(re.escape(word)
            for word in self.extractingText.list_fruit) + ')'
    return re.findall(regex, self.extractingText.text_factuur_verdi[0])
So this is the other class:
class ExtractingTextFromFile:
    def extract_text_from_image(self, filename):
        self.text_factuur_verdi = []
        pdf_file = wi(filename=filename, resolution=300)
        all_images = pdf_file.convert('jpeg')
        for image in all_images.sequence:
            image = wi(image=image)
            image = image.make_blob('jpeg')
            image = Image.open(io.BytesIO(image))
            text = pytesseract.image_to_string(image, lang='eng')
            self.text_factuur_verdi.append(text)
        return self.text_factuur_verdi

    def __init__(self):
        # instance variables:
        self.text_factuur_verdi = []
        self.list_fruit = ['Appels', 'Ananas', 'Peen Waspeen',
                           'Tomaten Cherry', 'Sinaasappels',
                           'Watermeloenen', 'Rettich', 'Peren', 'Peen',
                           'Mandarijnen', 'Meloenen', 'Grapefruit', 'Rettich']
@AndrewRyan has the right idea.
I presume calling extract_text_from_image just populates self.extractingText.text_factuur_verdi.
There are two routes you can go; from what you are commenting you'll probably just go with #1, but I gave #2 as another option in case you'd ever want to call filter_verdi_total_fruit_cost by itself.
Path 1, Just remove it.
Note: filter_verdi_total_fruit_cost is only called from show_extracted_data_from_file.
def show_extracted_data_from_file(self, file_name):
    # extract text
    # Note: stores data in `self.extractingText.text_factuur_verdi`
    self.extractingText.extract_text_from_image(file_name)
    total_fruit = self.filter_verdi_total_number_fruit()
    fruit_name = self.filter_verdi_fruit_name()
    fruit_total_cost = self.filter_verdi_total_fruit_cost()
    return "\n".join("{} \t {} \t {}".format(a, b, c)
                     for a, b, c in zip(total_fruit, fruit_name, fruit_total_cost))

def filter_verdi_total_fruit_cost(self):
    # Note: `self.extractingText.text_factuur_verdi` should already be populated
    locale.setlocale(locale.LC_ALL, locale='Dutch')
    return [
        locale.atof(items[-1]) for items in (
            token.split() for token in self.extractingText.text_factuur_verdi[0].split('\n')
        ) if len(items) > 2 and items[1] in self.extractingText.list_fruit
    ]
Path 2: check whether the text has already been extracted; if not, extract it first.
Note: useful if you ever want to call filter_verdi_total_fruit_cost by itself.
def show_extracted_data_from_file(self, file_name):
    # extract text
    # Note: stores data in `self.extractingText.text_factuur_verdi`
    self.extractingText.extract_text_from_image(file_name)
    total_fruit = self.filter_verdi_total_number_fruit()
    fruit_name = self.filter_verdi_fruit_name()
    fruit_total_cost = self.filter_verdi_total_fruit_cost(file_name)
    return "\n".join("{} \t {} \t {}".format(a, b, c)
                     for a, b, c in zip(total_fruit, fruit_name, fruit_total_cost))

def filter_verdi_total_fruit_cost(self, file_name):
    locale.setlocale(locale.LC_ALL, locale='Dutch')
    if not getattr(self.extractingText, 'text_factuur_verdi', None):
        # text hasn't been extracted yet.. extract it
        self.extractingText.extract_text_from_image(file_name)
    return [
        locale.atof(items[-1]) for items in (
            token.split() for token in self.extractingText.text_factuur_verdi[0].split('\n')
        ) if len(items) > 2 and items[1] in self.extractingText.list_fruit
    ]

MapReduce on several columns

I am new to MapReduce and am having trouble writing a MapReduce job that works with several columns of a CSV file at once.
I would like to find, for each garment group, the most frequent product, the second most frequent section and the most frequent department, so that the output follows the schema: garment_group, product, section, department. This is based on the articles.csv dataset from Kaggle.
So far I can only find the most frequent product for each garment group and do not understand how to incorporate the other columns. This is my code:
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.util import log_to_stream, log_to_null
from mr3px.csvprotocol import CsvProtocol
import csv
import logging

log = logging.getLogger(__name__)
class MyMRJob1(MRJob):
    OUTPUT_PROTOCOL = CsvProtocol  # write output as CSV

    @classmethod
    def set_up_logging(cls, quiet=False, verbose=False, stream=None):
        log_to_stream(name='mrjob', debug=verbose, stream=stream)
        log_to_stream(name='__main__', debug=verbose, stream=stream)

    def mapper(self, _, line):
        result = next(csv.reader([line], quotechar=None))  # extract columns from line
        garment_group_name = result[23]
        prod_name = result[2]
        #section_name = result[21]
        #department_name = result[15]
        # skip sparse entries and the header row
        if prod_name == "prod_name" or prod_name == "" or garment_group_name == "":
            return
        yield (garment_group_name, prod_name), 1

    def reducer(self, garmetProd, valuelist):
        garmet, prod = garmetProd
        output = sum(valuelist)
        yield None, (garmet, prod, output)

    def mapper_top(self, _, line):
        result = line  # input from the last round is already a list of strings
        garmet = result[0]
        prod = result[1]
        nProd = result[2]
        yield garmet, (prod, nProd)

    def reducer_top(self, garmet, values):
        mostProduct = ""  # most frequent product per garment group
        maxBought = 0     # max number of times bought
        for (prod, nProds) in values:
            if int(nProds) > maxBought:
                maxBought = int(nProds)
                mostProduct = prod
        if maxBought > 0:
            # CsvProtocol needs a None key for output
            yield None, (garmet, mostProduct)

    def steps(self):
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(mapper=self.mapper_top,
                   reducer=self.reducer_top)
        ]

if __name__ == '__main__':
    MyMRJob1.run()
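To bring in the section and department columns, one option is to emit a tagged count per column in the first mapper and then rank each tag per garment group in the second reducer. The sketch below is only an outline of that idea (column indices as in the question, rank 2 for the section as asked; untested against the real articles.csv, and it emits one line per garment group and column rather than one combined CSV row):

from mrjob.job import MRJob
from mrjob.step import MRStep
import csv

class TopPerColumn(MRJob):
    # (tag, column index, which rank we want: 1 = most frequent, 2 = second)
    COLUMNS = [('product', 2, 1), ('section', 21, 2), ('department', 15, 1)]

    def mapper(self, _, line):
        row = next(csv.reader([line]))
        garment = row[23]
        if not garment or row[2] == "prod_name":  # skip header and sparse rows
            return
        for tag, idx, _rank in self.COLUMNS:
            if row[idx]:
                yield (garment, tag, row[idx]), 1

    def reducer_count(self, key, counts):
        garment, tag, value = key
        yield (garment, tag), (sum(counts), value)

    def reducer_rank(self, key, counted):
        garment, tag = key
        wanted = {t: r for t, _i, r in self.COLUMNS}[tag]
        ranked = sorted(counted, reverse=True)  # highest count first
        if len(ranked) >= wanted:
            yield garment, (tag, ranked[wanted - 1][1])

    def steps(self):
        return [MRStep(mapper=self.mapper, reducer=self.reducer_count),
                MRStep(reducer=self.reducer_rank)]

A final step keyed on the garment group alone could then stitch the three tagged winners into a single garment_group, product, section, department row.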

More idiomatic way of extracting column values and assigning them back to a DataFrame (Pandas)

I have an existing routine that runs just fine, but since I'm new to Python I find my code ugly and am looking for ways to improve it.
My program works like this: I have a class that takes a complete address string which I need to process. The class has 4 attributes: address, state, city and zipcode.
This is the class:
class Address:
    def __init__(self, fulladdress):
        self.fulladdress = fulladdress.split(",")
        self.address = self.get_address()
        self.city = self.get_city()
        stateandzip = str(self.fulladdress[-1]).strip()
        self.statezip = stateandzip.split(" ")
        self.state = self.get_state()
        self.zipcode = self.get_zipcode()

    def get_address(self):
        len_address = len(self.fulladdress)
        if len_address == 3:
            return self.fulladdress[0].strip()
        elif len_address == 4:
            return self.fulladdress[0].strip() + ", " + self.fulladdress[1].strip()
        elif len_address > 5:
            temp_address = self.fulladdress[0]
            for ad in self.fulladdress[0:-3]:
                temp_address = temp_address + ", " + ad.strip()
            return temp_address
        else:
            return ''

    def get_city(self):
        if len(self.fulladdress) > 0:
            address = self.fulladdress[-2]
            return address.strip()
        else:
            return ''

    def get_state(self):
        if len(self.fulladdress) > 0:
            return self.statezip[0]
        else:
            return ''

    def get_zipcode(self):
        if len(self.fulladdress) > 0:
            return self.statezip[1]
        else:
            return ''
Now my existing routine needs to append the results back to my DataFrame based on the address column. To parse the address data I use df.iterrows(), since I don't know how to use the Address class with the df.apply method.
Here is the routine:
import pandas as pd
import datahelper as dh
import address as ad

# Find the header name of the Address column
address_header = dh.findcolumn('Address', list(df.columns))
header_loc = df.columns.get_loc(address_header)
address = []
city = []
state = []
zipcode = []
for index, row in df.iterrows():
    if not row[address_header]:
        address.append('')
        city.append('')
        state.append('')
        zipcode.append('')
        continue
    # extract details from the address
    address_data = ad.Address(row[address_header])
    address.append(address_data.address)
    city.append(address_data.city)
    state.append(address_data.state)
    zipcode.append(address_data.zipcode)
df[address_header] = address
df.insert(header_loc + 1, 'City', city)
df.insert(header_loc + 2, 'State', state)
df.insert(header_loc + 3, 'Zip Code', zipcode)
I would really appreciate it if someone could point me in the right direction. Thank you in advance.
By the way, dh is a datahelper module where I put all my helper functions.
def findcolumn(searchstring, list):
    if searchstring in list:
        return searchstring
    else:
        try:
            return [i for i in list if searchstring in i][0]
        except (ValueError, IndexError):
            return None
And here is my desired output given the sample data from Address column.
df = pd.DataFrame({'Address': ['Rubin Center Dr Ste, Fort Mill, SC 29708', 'Miami, FL 33169']})
Output should be:
Address             | City      | State | Zip Code
--------------------+-----------+-------+---------
Rubin Center Dr Ste | Fort Mill | SC    | 29708
                    | Miami     | FL    | 33169
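For what it's worth, the Address class can be used with apply by wrapping it in a function that returns a pd.Series; pandas then expands the Series into columns. A sketch under the same assumptions as the routine above (the address module imported as ad, and a column literally named 'Address'):

import pandas as pd
import address as ad

def parse_address(value):
    # Empty cells become four empty strings, mirroring the iterrows loop.
    if not value:
        return pd.Series(['', '', '', ''],
                         index=['Address', 'City', 'State', 'Zip Code'])
    data = ad.Address(value)
    return pd.Series([data.address, data.city, data.state, data.zipcode],
                     index=['Address', 'City', 'State', 'Zip Code'])

parsed = df['Address'].apply(parse_address)
df[['Address', 'City', 'State', 'Zip Code']] = parsed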

openpyxl read tables from existing data book example?

In the openpyxl documentation there is an example of how to place a table into a workbook, but there are no examples of how to find the tables of an existing workbook. I have an xlsx file that has named tables in it, and I want to open the file, find all of the tables and parse them. I cannot find any documentation on how to do this. Can anyone help?
In the meantime I worked it out and wrote the following class to work with openpyxl:
class NamedArray(object):
    ''' Excel Named range object
        Reproduces the named range feature of Microsoft Excel
        Assumes a definition in the form <Worksheet PinList!$A$6:$A$52 provided by openpyxl
        Written for use with, and initialised by, the get_names function
        After initialisation the named array can be used in the same way as in VBA in Excel
        Written for openpyxl version 2.4.1, may not work with earlier versions
    '''
    C_CAPS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def __init__(self, wb, named_range_raw):
        ''' Initialise a NamedArray object from the named_range_raw information in the given workbook
        '''
        self.sheet, cellrange_str = str(named_range_raw).split('!')
        self.sheet = self.sheet.replace("'", '')  # remove the single quotes if they exist
        self.loc = wb[self.sheet]
        if ':' in cellrange_str:
            self.has_range = True
            self.has_value = False
            lo, hi = cellrange_str.split(':')
            self.ad_lo = lo.replace('$', '')
            self.ad_hi = hi.replace('$', '')
        else:
            self.has_range = False
            self.has_value = True
            self.ad_lo = cellrange_str.replace('$', '')
            self.ad_hi = self.ad_lo
        self.row = self.get_row(self.ad_lo)
        self.max_row = self.get_row(self.ad_hi)
        self.rows = self.max_row - self.row + 1
        self.min_col = self.col_to_n(self.ad_lo)
        self.max_col = self.col_to_n(self.ad_hi)
        self.cols = self.max_col - self.min_col + 1

    def size_of(self):
        ''' Returns two-dimensional size of named space
        '''
        return self.cols, self.rows

    def value(self, row=1, col=1):
        ''' Returns the value at row, col
        '''
        assert row <= self.rows, 'invalid row number given'
        assert col <= self.cols, 'invalid column number given'
        return self.loc.cell(self.n_to_col(self.min_col + col - 1) + str(self.row + row - 1)).value

    def __str__(self):
        ''' printed description of named space
        '''
        locs = 's ' + self.ad_lo + ':' + self.ad_hi if self.has_range else ' ' + self.ad_lo
        return 'named range' + str(self.size_of()) + ' in sheet ' + self.sheet + ' # location' + locs

    def __contains__(self, val):
        rval = False
        for row in range(1, self.rows + 1):
            for col in range(1, self.cols + 1):
                if self.value(row, col) == val:
                    rval = True
        return rval

    def vlookup(self, key, col):
        ''' excel style vlookup function
        '''
        assert col <= self.cols, 'invalid column number given'
        rval = None
        for row in range(1, self.rows + 1):
            if self.value(row, 1) == key:
                rval = self.value(row, col)
                break
        return rval

    def hlookup(self, key, row):
        ''' excel style hlookup function
        '''
        assert row <= self.rows, 'invalid row number given'
        rval = None
        for col in range(1, self.cols + 1):
            if self.value(1, col) == key:
                rval = self.value(row, col)
                break
        return rval

    @classmethod
    def get_row(cls, ad):
        ''' get row number from cell string
            Cell string is assumed to be in excel format i.e. "ABC123" where row is 123
        '''
        row = 0
        for l in ad:
            if l in "1234567890":
                row = row * 10 + int(l)
        return row

    @classmethod
    def col_to_n(cls, ad):
        ''' find column number from xl address
            Cell string is assumed to be in excel format i.e. "ABC123" where column is ABC
            column number is the 1-based integer representation i.e. (A)*26*26 + (B)*26 + (C)
        '''
        n = 0
        for l in ad:
            if l in cls.C_CAPS:
                n = n * 26 + cls.C_CAPS.find(l) + 1
        return n

    @classmethod
    def n_to_col(cls, n):
        ''' make xl column address from column number
        '''
        ad = ''
        while n > 0:
            n, rem = divmod(n - 1, 26)
            ad = cls.C_CAPS[rem] + ad
        return ad


def get_names(workbook, filt='', debug=False):
    ''' Create a structure containing all of the names in the given workbook
        filt is an optional parameter used to create a subset of names starting with filt
        useful for IO_ring_spreadsheet as all names start with 'n_'
        if present, filt characters are stripped off the front of the name
    '''
    named_ranges = workbook.defined_names.definedName
    name_list = {}
    for named_range in named_ranges:
        name = named_range.name
        if named_range.attr_text.startswith('#REF'):
            print('WARNING: named range "', name, '" is undefined')
        elif filt == '' or name.startswith(filt):
            name_list[name[len(filt):]] = NamedArray(workbook, named_range.attr_text)
    if debug:
        with open("H:\\names.txt", 'w') as log:
            for item in name_list:
                print(item, '=', name_list[item])
                log.write(item.ljust(30) + ' = ' + str(name_list[item]) + '\n')
    return name_list
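A quick usage sketch for the class above (the workbook file and the range name here are made up for illustration):

from openpyxl import load_workbook

wb = load_workbook('io_ring.xlsx', data_only=True)  # hypothetical file
names = get_names(wb, filt='n_')                    # strips the 'n_' prefix
pins = names['PinList']                             # hypothetical named range
print(pins.size_of())
print(pins.vlookup('PIN_7', 2))                     # hypothetical lookup key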
I agree that the documentation does not really help, and the public API also seems to have only the add_table() method.
But then I found openpyxl issue 844, asking for a better interface, and it shows that a worksheet has a _tables property.
This is enough to get a list of all tables in a file, together with some basic properties:
from openpyxl import load_workbook

wb = load_workbook(filename='test.xlsx')
for ws in wb.worksheets:
    print("Worksheet %s includes %d tables:" % (ws.title, len(ws._tables)))
    for tbl in ws._tables:
        print(" : " + tbl.displayName)
        print("  - name  = " + tbl.name)
        print("  - type  = " + (tbl.tableType if isinstance(tbl.tableType, str) else 'n/a'))
        print("  - range = " + tbl.ref)
        print("  - #cols = %d" % len(tbl.tableColumns))
        for col in tbl.tableColumns:
            print("    : " + col.name)
Note that the if/else construct is required for the tableType, since it can return NoneType (for standard tables), which is not convertible to str.
Building on @MichalKaut's answer, I created a simple function that returns a dictionary with all tables in a given workbook. It also puts each table's data into a Pandas DataFrame.
from openpyxl import load_workbook
import pandas as pd

def get_all_tables(filename):
    """ Get all tables from a given workbook. Returns a dictionary of tables.
        Requires a filename, which includes the file path and filename. """
    # Load the workbook, from the filename, setting read_only to False
    wb = load_workbook(filename=filename, read_only=False, keep_vba=False,
                       data_only=True, keep_links=False)
    # Initialize the dictionary of tables
    tables_dict = {}
    # Go through each worksheet in the workbook
    for ws_name in wb.sheetnames:
        print("")
        print(f"worksheet name: {ws_name}")
        ws = wb[ws_name]
        print(f"tables in worksheet: {len(ws.tables)}")
        # Get each table in the worksheet
        for tbl in ws.tables.values():
            print(f"table name: {tbl.name}")
            # First, add some info about the table to the dictionary
            tables_dict[tbl.name] = {
                'table_name': tbl.name,
                'worksheet': ws_name,
                'num_cols': len(tbl.tableColumns),
                'table_range': tbl.ref}
            # Grab the 'data' from the table
            data = ws[tbl.ref]
            # Now convert the table 'data' to a Pandas DataFrame
            # First get a list of all rows, including the first header row
            rows_list = []
            for row in data:
                # Get a list of all columns in each row
                cols = []
                for col in row:
                    cols.append(col.value)
                rows_list.append(cols)
            # Create a pandas dataframe from the rows_list.
            # The first row is the column names
            df = pd.DataFrame(data=rows_list[1:], index=None, columns=rows_list[0])
            # Add the dataframe to the dictionary of tables
            tables_dict[tbl.name]['dataframe'] = df
    return tables_dict

# File location:
file = r"C:\Users\sean\spreadsheets\full_of_tables.xlsx"
# Run the function to return a dictionary of all tables in the Excel workbook
tables_dict = get_all_tables(filename=file)
The answer to this has changed.
ws objects now contain a tables accessor that acts as a dictionary. An updated answer is:
tmp = [ws.tables for ws in wb.worksheets]
tbls = [{v.name:v} for t in tmp for v in t.values()]
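Since the tables accessor is dict-like, a single table can also be fetched by name (sheet and table names below are made up):

ws = wb["Sheet1"]          # hypothetical sheet name
tbl = ws.tables["Table1"]  # hypothetical table name
print(tbl.ref)             # its cell range, e.g. "A1:D10"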
I'm not sure what you mean by parsing but read-support for worksheet tables has been possible since version 2.4.4. If you have questions about the details then I suggest you ask your question on the openpyxl mailing list as that is a more suitable place for this kind of discussion.
I don't think this is possible. It seems to work similarly to images; if you read and save a file with a table, it will get stripped.

Get correct brace grouping from string

I have files with incorrect JSON that I want to start fixing by getting it into properly grouped chunks.
The brace grouping {{ {} {} } } {{}} {{{}}} should already be correct
How can I grab all the top-level braces, correctly grouped, as separate strings?
If you don't want to install any extra modules, a simple function will do:
def top_level(s):
    depth = 0
    start = -1
    for i, c in enumerate(s):
        if c == '{':
            if depth == 0:
                start = i
            depth += 1
        elif c == '}' and depth:
            depth -= 1
            if depth == 0:
                yield s[start:i+1]

print(list(top_level('{{ {} {} } } {{}} {{{}}}')))
Output:
['{{ {} {} } }', '{{}}', '{{{}}}']
It will skip invalid braces but could be easily modified to report an error when they are spotted.
Using the third-party regex module (its (?R) construct recursively matches the whole pattern, and ++ is a possessive quantifier that prevents backtracking):
In [1]: import regex
In [2]: braces = regex.compile(r"\{(?:[^{}]++|(?R))*\}")
In [3]: braces.findall("{{ {} {} } } {{}} {{{}}}")
Out[3]: ['{{ {} {} } }', '{{}}', '{{{}}}']
pyparsing can be really helpful here. It will handle pathological cases where you have braces inside strings, etc. It might be a little tricky to do all of this work yourself, but fortunately, somebody (the author of the library) has already done the hard stuff for us.... I'll reproduce the code here to prevent link-rot:
# jsonParser.py
#
# Implementation of a simple JSON parser, returning a hierarchical
# ParseResults object supporting both list- and dict-style data access.
#
# Copyright 2006, by Paul McGuire
#
# Updated 8 Jan 2007 - fixed dict grouping bug, and made elements and
# members optional in array and object collections
#
json_bnf = """
object
    { members }
    {}
members
    string : value
    members , string : value
array
    [ elements ]
    []
elements
    value
    elements , value
value
    string
    number
    object
    array
    true
    false
    null
"""

from pyparsing import *

TRUE = Keyword("true").setParseAction(replaceWith(True))
FALSE = Keyword("false").setParseAction(replaceWith(False))
NULL = Keyword("null").setParseAction(replaceWith(None))

jsonString = dblQuotedString.setParseAction(removeQuotes)
jsonNumber = Combine(Optional('-') + ('0' | Word('123456789', nums)) +
                     Optional('.' + Word(nums)) +
                     Optional(Word('eE', exact=1) + Word(nums + '+-', nums)))

jsonObject = Forward()
jsonValue = Forward()
jsonElements = delimitedList(jsonValue)
jsonArray = Group(Suppress('[') + Optional(jsonElements) + Suppress(']'))
jsonValue << (jsonString | jsonNumber | Group(jsonObject) | jsonArray | TRUE | FALSE | NULL)
memberDef = Group(jsonString + Suppress(':') + jsonValue)
jsonMembers = delimitedList(memberDef)
jsonObject << Dict(Suppress('{') + Optional(jsonMembers) + Suppress('}'))

jsonComment = cppStyleComment
jsonObject.ignore(jsonComment)

def convertNumbers(s, l, toks):
    n = toks[0]
    try:
        return int(n)
    except ValueError:
        return float(n)

jsonNumber.setParseAction(convertNumbers)
Phew! That's a lot ... Now how do we use it? The general strategy here will be to scan the string for matches and then slice those matches out of the original string. Each scan result is a tuple of the form (lex-tokens, start_index, stop_index). For our use, we don't care about the lex-tokens, just the start and stop. We could do string[result[1]:result[2]] and it would work. We can also do string[slice(*result[1:])] -- take your pick.
results = jsonObject.scanString(testdata)  # testdata holds the string to be fixed
for result in results:
    print('*' * 80)
    print(testdata[slice(*result[1:])])
