Parsing relatively structured text files in python and inserting in mongodb - python

Testbed: ABC123
Image : FOOBAR
Keyword: heredity
Date : 6/27
Other : XYZ suite crash
Suite : XYZ, crash post XYZ delivery
Failure:
Reason :
Known :
Failure:
Reason :
Known :
Type :
Notes :
Testbed: ABC456
Image : FOOBAR
Keyword: isolate
Date :6/27
Other : 3 random failures in 3 different test suites
Suite : LMO Frag
Failure: jumbo_v4_to_v6
Reason : ?
Known : ?
Type :
Notes :
Suite : XYZ suite
Failure: XYZ_v4_to_v4v
Reason : failed to receive expected packets
Known : ?
Type :
Notes :
Suite : RST
Failure: RST_udp_v4_to_v6
Reason : failed to receive expected packets
Known : ?
Type :
Notes :
Image : BARFOO
Keyword: repugnat
Date : 6/26
Other :
Suite : PQR test
Failure: unable to destroy flow - flow created without ppx flow id
Reason : SCRIPT issue
Known : maybe?
Type : embtest
Notes :
Suite : UVW suite
Failure: 8 failures in UVW duplicate - interworking cases not working!
Reason : ?
Known : ?
Type :
Notes :
I am trying to create documents of the type
{
"_id" : "xxxxxxxxxxxxx",
"platform" : "ABC123",
"image" : "FOOBAR",
"keyword" : "parricide",
"suite" : [
{
"name" : "RST (rst_only_v6v_to_v6)",
"notes" : "",
"failure" : "flow not added properly",
"reason" : "EMBTEST script issue",
"known" : "yes?",
"type" : ""
}
]
}
Where each document is unique based on the testbed, platform and image.
I have tried using regex and came up with something of this format, but it is prone to human error when the structured text is created, in which case this would fail due to its dependencies:
# Parse the structured report line by line, grouping suites under one
# document per (testbed, image, keyword) and flushing to the DB whenever
# that composite key changes.
# NOTE(review): indentation was lost in the original paste; the key/rollover
# logic is grouped under the Keyword branch (the last header tag, so all
# three key parts are known there) — confirm against the working copy.
for line in content:  # renamed from 'iter', which shadows the builtin
    if re.match(r"\s*testbed", line, re.IGNORECASE):
        # split(':', 1) keeps values that themselves contain ':' intact
        # (plain split(':')[1] silently truncates them).
        testbed = line.split(':', 1)[1].strip()
        if result_doc['platform'] is None:
            result_doc['platform'] = testbed
    elif re.match(r"\s*image", line, re.IGNORECASE):
        image = line.split(':', 1)[1].strip()
        if result_doc['image'] is None:
            result_doc['image'] = image
    elif re.match(r"\s*keyword", line, re.IGNORECASE):
        keyword = line.split(':', 1)[1].strip()
        if result_doc['keyword'] is None:
            result_doc['keyword'] = keyword
        key = str(testbed) + '-' + str(image) + '-' + str(keyword)
        if prev_key is None:
            prev_key = key
        if key != prev_key:  # key changed: flush current doc, start a new one
            self.insert(result_doc)
            prev_key = key
            result_doc = self.getTemplate("result")  # assign new document template
            result_doc['platform'] = testbed
            result_doc['image'] = image
            result_doc['keyword'] = keyword
        result_doc['_id'] = key
    elif re.match(r"\s*suite", line, re.IGNORECASE):
        suitename = line.split(':', 1)[1].strip()
    elif re.match(r"\s*failure", line, re.IGNORECASE):
        suitefailure = line.split(':', 1)[1].strip()
        result_suite = self.getTemplate("suite")  # assign new suite template
        result_suite['name'] = suitename
        result_suite['failure'] = suitefailure
    elif re.match(r"\s*reason", line, re.IGNORECASE):
        result_suite['reason'] = line.split(':', 1)[1].strip()
    elif re.match(r"\s*known", line, re.IGNORECASE):
        result_suite['known'] = line.split(':', 1)[1].strip()
    elif re.match(r"\s*type", line, re.IGNORECASE):
        result_suite['type'] = line.split(':', 1)[1].strip()
    elif re.match(r"\s*notes", line, re.IGNORECASE):
        # "Notes" is the last tag of a suite stanza, so the suite record
        # is complete and can be attached to the document here.
        result_suite['notes'] = line.split(':', 1)[1].strip()
        result_doc['suite'].append(result_suite)
self.insert(result_doc)  # flush the last document
Is there a better way to do this than match on the next tag to create a new document??
Thanks

Yes there is definitely a better, more robust way to do this. One would use a hash table, or python "dictionary," to store the key value pairings provided in an input file and do some formatting to print them out in the desired output format.
# Predefine some constants / inputs
testbed_dict = {"_id": "xxxxxxxxxxxxx", "platform": "ABC456"}
inputFile = "ABC456.txt"

with open(inputFile, "r") as infh:
    inputLines = infh.readlines()

# Use enumerate rather than list.index(): index() always returns the FIRST
# matching line, so two byte-identical "Image : ..." lines would produce
# duplicate start indices.
image_start_indices = [i for i, x in enumerate(inputLines)
                       if x.split(":")[0].strip() == "Image"]
image_end_indices = [i - 1 for i in image_start_indices[1:]]
image_end_indices.append(len(inputLines) - 1)
# Materialize as lists: in Python 3 zip() returns a one-shot iterator, and
# suite_start_stops is re-scanned once per image below.
image_start_stops = list(zip(image_start_indices, image_end_indices))
suite_start_indices = [i for i, x in enumerate(inputLines) if x.split(":")[0].strip() == "Suite"]
suite_end_indices = [i + 1 for i, x in enumerate(inputLines) if x.split(":")[0].strip() == "Notes"]
suite_start_stops = list(zip(suite_start_indices, suite_end_indices))

for image_start_index, image_stop_index in image_start_stops:
    suiteCount = 1
    image_suite_indices, suites, image_dict = [], [], {}
    # Collect the suite spans that overlap this image's span.
    for start, stop in suite_start_stops:
        if start >= image_stop_index or image_start_index >= stop:
            continue
        image_suite_indices.append((start, stop))
    suites = [inputLines[x:y] for x, y in image_suite_indices]
    # Header lines sit between the "Image" line and the first suite.
    header_end_index = min([x for x, y in image_suite_indices])
    for line in inputLines[image_start_index:header_end_index]:
        if line.strip() == "":
            continue
        # partition() splits on the first ':' only, so values containing a
        # ':' are no longer truncated, and colon-less lines don't crash.
        key, _, value = line.partition(":")
        image_dict[key.strip().lower()] = value.strip()
    for suite in suites:
        suite_dict = {}
        for line in suite:
            if line.strip() == "":
                continue
            key, _, value = line.partition(":")
            suite_dict[key.strip().lower()] = value.strip()
        image_dict["suite " + str(suiteCount)] = suite_dict
        suiteCount += 1
    # Write one pseudo-JSON report per image, named after the image.
    with open(image_dict["image"] + ".txt", "w") as outfh:
        outfh.write('{\n')
        for key, value in testbed_dict.items():  # items(): iteritems() is Python 2 only
            outfh.write('\t"' + key + '" : "' + value + '"\n')
        for key, value in image_dict.items():
            if 'suite' in key:
                continue
            outfh.write('\t"' + key + '" : "' + value + '",\n')
        for key, value in image_dict.items():
            if 'suite' not in key:
                continue
            outfh.write('\t"suite" : [\n\t\t{\n')
            for suitekey, suitevalue in value.items():
                outfh.write('\t\t\t"' + suitekey + '" : "' + str(suitevalue) + '",\n')
            outfh.write("\t\t}\n")
            outfh.write("\t],\n")
        outfh.write('}\n')
The above code expects to be run in the same directory as an input file (i.e. ' inputFile = "ABC456.txt" '), and writes a variable number of output files depending on how many "images" are present in the input -- in the case of your ABC456 the outputs written would be "FOOBAR.txt" and "BARFOO.txt". For example, if "ABC456.txt" contains the text contents of the section "Testbed: ABC456" in your question above, then the outputs will be the following.
BARFOO.txt
{
"platform" : "ABC456"
"_id" : "xxxxxxxxxxxxx"
"keyword" : "repugnat",
"image" : "BARFOO",
"other" : "",
"date" : "6/26",
"suite" : [
{
"notes" : "",
"failure" : "8 failures in UVW duplicate - interworking cases not working!",
"reason" : "?",
"known" : "?",
"suite" : "UVW suite",
"type" : "",
}
],
"suite" : [
{
"notes" : "",
"failure" : "unable to destroy flow - flow created without ppx flow id",
"reason" : "SCRIPT issue",
"known" : "maybe?",
"suite" : "PQR test",
"type" : "embtest",
}
],
}
FOOBAR.txt
{
"platform" : "ABC456"
"_id" : "xxxxxxxxxxxxx"
"keyword" : "isolate",
"image" : "FOOBAR",
"other" : "3 random failures in 3 different test suites",
"date" : "6/27",
"suite" : [
{
"notes" : "",
"failure" : "RST_udp_v4_to_v6",
"reason" : "failed to receive expected packets",
"known" : "?",
"suite" : "RST",
"type" : "",
}
],
"suite" : [
{
"notes" : "",
"failure" : "XYZ_v4_to_v4v",
"reason" : "failed to receive expected packets",
"known" : "?",
"suite" : "XYZ suite",
"type" : "",
}
],
"suite" : [
{
"notes" : "",
"failure" : "jumbo_v4_to_v6",
"reason" : "?",
"known" : "?",
"suite" : "LMO Frag",
"type" : "",
}
],
}
The code above works but has some caveats -- it doesn't preserve ordering of the lines, but assuming you're just sticking this JSON into mongoDB certainly ordering doesn't matter. Also you would need to modify it to handle some redundancies -- if the "Suite" line has redundant info nested under it (e.g. multiple "Failure" lines, like in your ABC123 example) all but one is ignored. Hopefully you get a chance to look through the code, figure out how it's working, and modify it to meet whatever your needs are.
Cheers.

Related

Removing items from JSON using Python loop

how do I iterate over the data and keep object keys that have the string "Java" in the value and remove keys with the string "Javascript" in the value? In addition to the iterations I already have in my code. For example:
this key has the word 'Java' in the value.
"value" : "A vulnerability in the encryption implementation of EBICS messages in the open source librairy ebics-java/ebics-java-client allows an attacker sniffing network traffic to decrypt EBICS payloads. This issue affects: ebics-java/ebics-java-client versions prior to 1.2."
the current code below iterates thru other JSON items (that are also needed), but not the Java/Javascript issue.
from encodings import utf_8  # NOTE(review): unused in this snippet
import json
from zipfile import ZipFile
from urllib.request import urlretrieve
from io import BytesIO  # NOTE(review): also unused here
import os

# Download the yearly NVD CVE feed and unpack it into the working directory.
url = "https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2022.json.zip"
urlretrieve(url, "nvdcve-1.1-2022.json.zip")
with ZipFile('nvdcve-1.1-2022.json.zip', 'r') as archive:  # don't shadow builtin zip()
    archive.extractall('.')
# Open with an explicit encoding to avoid platform-default decode errors.
with open('nvdcve-1.1-2022.json', encoding='utf-8') as x:
    data = json.load(x)
# Function to sort entries without rewriting code; parameters are passed
# into the function so it can serve directly as a sort key.
def base_score(metric):
    """Sort key for one CVE_Items element: (CVSSv3 base score, CVE ID).

    Items with no baseMetricV3 impact data get score 0, so they group
    together and auto-sort by ID; ties on score also fall back to the ID
    string so the ordering is deterministic.
    """
    cve_id = metric['cve']['CVE_data_meta']['ID']  # hoisted: used by both branches
    if 'baseMetricV3' not in metric['impact']:
        # No values = score 0 so it will auto sort by ID
        return (0, cve_id)
    # Sorts by ID if two or more base scores are equal
    return (metric['impact']['baseMetricV3']['cvssV3']['baseScore'], cve_id)
#direct python to open json file using specific encoding to avoid encoding error

def _remove_if_exists(path):
    """Delete *path* if present, otherwise report it (dedupes the repeated if/else cleanup)."""
    if os.path.exists(path):
        os.remove(path)
    else:
        print("The file does not exist")

for CVE_Item in data['CVE_Items']:
    for node in CVE_Item['configurations']['nodes']:
        # Removes items while iterating through them: slice assignment
        # filters the existing list object in place.
        node['cpe_match'][:] = [item for item in node['cpe_match'] if item['vulnerable']]
        # Also check children objects for vulnerable
        if node['children']:
            for children_node in node['children']:
                children_node['cpe_match'][:] = [item for item in children_node['cpe_match'] if item['vulnerable']]

# Sorts data in descending order using reverse
data['CVE_Items'].sort(reverse=True, key=base_score)

# Write file to current working directory
with open('sorted_nvdcve-1.1-2022.json', 'w') as new_file:
    new_file.write(json.dumps(data, indent=4))

# Clean up the downloaded archive and the extracted feed.
_remove_if_exists('nvdcve-1.1-2022.json.zip')
_remove_if_exists('nvdcve-1.1-2022.json')
here is the link to the original JSON file (too large to post entire text here):
https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2022.json.zip
the key 'value' is located in the 'description' list.
here is a sample of the JSON text:
{
"CVE_data_type" : "CVE",
"CVE_data_format" : "MITRE",
"CVE_data_version" : "4.0",
"CVE_data_numberOfCVEs" : "15972",
"CVE_data_timestamp" : "2022-11-01T07:00Z",
"CVE_Items" : [ {
"cve" : {
"data_type" : "CVE",
"data_format" : "MITRE",
"data_version" : "4.0",
"CVE_data_meta" : {
"ID" : "CVE-2022-0001",
"ASSIGNER" : "secure#intel.com"
},
"problemtype" : {
"problemtype_data" : [ {
"description" : [ {
"lang" : "en",
"value" : "NVD-CWE-noinfo"
} ]
} ]
},
"references" : {
"reference_data" : [ {
"url" : "https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00598.html",
"name" : "https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00598.html",
"refsource" : "MISC",
"tags" : [ "Vendor Advisory" ]
}, {
"url" : "http://www.openwall.com/lists/oss-security/2022/03/18/2",
"name" : "[oss-security] 20220318 Xen Security Advisory 398 v2 - Multiple speculative security issues",
"refsource" : "MLIST",
"tags" : [ "Mailing List", "Third Party Advisory" ]
}, {
"url" : "https://www.oracle.com/security-alerts/cpujul2022.html",
"name" : "N/A",
"refsource" : "N/A",
"tags" : [ "Patch", "Third Party Advisory" ]
}, {
"url" : "https://security.netapp.com/advisory/ntap-20220818-0004/",
"name" : "https://security.netapp.com/advisory/ntap-20220818-0004/",
"refsource" : "CONFIRM",
"tags" : [ "Third Party Advisory" ]
} ]
},
"description" : {
"description_data" : [ {
"lang" : "en",
"value" : "JavaScript sharing of branch predictor selectors between contexts in some Intel(R) Processors may allow an authorized user to potentially enable information disclosure via local access."
} ]
}
Add this inside the for CVE_Item loop.
# Keep only description entries whose text contains "Java" but not
# "JavaScript" ('Java' is a prefix of 'JavaScript', hence the second
# clause). Both tests are case-sensitive substring checks.
# NOTE(review): fragment — assumes CVE_Item is bound by the enclosing loop.
CVE_Item['cve']['description']['description_data'] = [
d for d in CVE_Item['cve']['description']['description_data']
if 'Java' in d['value'] and 'JavaScript' not in d['value']]
The modified loop looks like:
for CVE_Item in data['CVE_Items']:
    # Drop description entries mentioning "JavaScript"; keep those mentioning
    # plain "Java". NOTE(review): the substring tests are case-sensitive, so
    # a lowercase "Javascript" in the feed would NOT be filtered — confirm
    # the desired casing against the data.
    CVE_Item['cve']['description']['description_data'] = [
        d for d in CVE_Item['cve']['description']['description_data']
        if 'Java' in d['value'] and 'JavaScript' not in d['value']]
    for node in CVE_Item['configurations']['nodes']:
        # Removes items while iterating through them (slice assignment
        # filters the list in place)
        node['cpe_match'][:] = [item for item in node['cpe_match'] if item['vulnerable']]
        # Also check children objects for vulnerable
        if node['children']:
            for children_node in node['children']:
                children_node['cpe_match'][:] = [item for item in children_node['cpe_match'] if item['vulnerable']]

Auto increment pymongo

I am trying to auto increment a field in my mongo collection. The field is an 'id' field and it contains the 'id' of each document. For example. 1, 2, 3 etc.
What I want to happen is insert a new document and take the 'id' from the last document and add 1 to it so that the new document is lastID + 1.
The way I have written the code makes it so that it gets the last document and adds 1 to the last document and then updates it. So if the last id is 5, then the new document will have 5 and the document that I was incrementing on now has the new 'id' of 6.
I am not sure how to get round this so any help would be appreciated.
Code
# NOTE(review): this is the problematic version described above —
# find_one_and_update returns the document BEFORE the $inc by default, so
# the existing max document gets bumped to max+1 while the newly inserted
# document reuses the old max (exactly the symptom reported).
last_id = pokemons.find_one({}, sort=[( 'id', -1)])
last_pokemon = pokemons.find_one_and_update({'id' : last_id['id']}, {'$inc': {'id': 1}}, sort=[( 'id', -1)])
# Build the new document; 'id' and 'num' both come from the pre-increment
# snapshot returned above.
new_pokemon = {
"name" : name, "avg_spawns" : avg_spawns, "candy" : candy, "img" : img_link, "weaknesses" : [], "type" : [], "candy_count" : candy_count,
"egg" : egg, "height" : height, "multipliers" : [], "next_evolution" : [], "prev_evolution" : [],
"spawn_chance" : spawn_chance, "spawn_time" : spawn_time, "weight" : weight, "id" : last_pokemon['id'], "num" : last_pokemon['id'],
}
pokemons.insert_one(new_pokemon)
The variables in new_pokemon don't matter as I am just having issues with the last_pokemon part
The find_one command in MongoDB doesn't support sort functionality. You have to make use of the normal find command with the limit parameter set to 1.
# Fetch only the current maximum id (projection {"id": 1}), sorted
# descending and limited to one document.
cursor = pokemons.find({}, {"id": 1}, sort=[('id', -1)]).limit(1)
try:
    last_id = cursor.next()
    last_id["id"] += 1
except StopIteration:
    # Empty collection: start numbering at 1 instead of crashing
    # (the original comment itself warned next() would error here).
    last_id = {"id": 1}
# NOTE(review): read-then-insert is racy under concurrent writers; a
# separate counters document updated with $inc is the robust pattern.
new_pokemon = {
    "name" : name, "avg_spawns" : avg_spawns, "candy" : candy, "img" : img_link, "weaknesses" : [], "type" : [], "candy_count" : candy_count,
    "egg" : egg, "height" : height, "multipliers" : [], "next_evolution" : [], "prev_evolution" : [],
    "spawn_chance" : spawn_chance, "spawn_time" : spawn_time, "weight" : weight, "id" : last_id['id'], "num" : last_id['id'],
}
pokemons.insert_one(new_pokemon)

Eliminate keys from list of dict python

i am pulling out information from this websites API:
https://financialmodelingprep.com/
to be specific i need the data from the income statements:
https://financialmodelingprep.com/developer/docs/#Company-Financial-Statements
what i get back from the API is a list, which contains 36 dictionarys with the following Data:
[ {
"date" : "2019-09-28",
"symbol" : "AAPL",
"fillingDate" : "2019-10-31 00:00:00",
"acceptedDate" : "2019-10-30 18:12:36",
"period" : "FY",
"revenue" : 260174000000,
"costOfRevenue" : 161782000000,
"grossProfit" : 98392000000,
"grossProfitRatio" : 0.378178,
"researchAndDevelopmentExpenses" : 16217000000,
"generalAndAdministrativeExpenses" : 18245000000,
"sellingAndMarketingExpenses" : 0.0,
"otherExpenses" : 1807000000,
"operatingExpenses" : 34462000000,
"costAndExpenses" : 196244000000,
"interestExpense" : 3576000000,
"depreciationAndAmortization" : 12547000000,
"ebitda" : 81860000000,
"ebitdaratio" : 0.314636,
"operatingIncome" : 63930000000,
"operatingIncomeRatio" : 0.24572,
"totalOtherIncomeExpensesNet" : 422000000,
"incomeBeforeTax" : 65737000000,
"incomeBeforeTaxRatio" : 0.252666,
"incomeTaxExpense" : 10481000000,
"netIncome" : 55256000000,
"netIncomeRatio" : 0.212381,
"eps" : 2.97145,
"epsdiluted" : 2.97145,
"weightedAverageShsOut" : 18595652000,
"weightedAverageShsOutDil" : 18595652000,
"link" : "https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/0000320193-19-000119-index.html",
"finalLink" : "https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/a10-k20199282019.htm"
}, ...
]
What i dont need in the dictionary are the keys:
fillingDate, acceptedDate, link, finalLink
I managed to remove them, but my problem is that now that piece of code i wrote spits out those dictionaries way too often, and i am not able to understand why...
Here is what i tried:
import requests
import json
# NOTE(review): a live API key is embedded in this URL — never commit real
# credentials; load them from config or the environment instead.
url = "https://financialmodelingprep.com/api/v3/income-statement/AAPL?apikey=b60bb3d1967bb15bfb9daaa4426e77dc"
response = requests.get(url)
data = response.text
dataList = json.loads(data)
# Keys to strip from every record; the dict values are unused, the dict is
# effectively a set of key names.
entriesToRemove = {
'fillingDate' : 0,
'acceptedDate' : 0,
'link' : 0,
'finalLink' : 0
}
removedEntries = []
# NOTE(review): a single newDict is reused across all records, so each
# iteration overwrites the previous record's fields.
newDict = {}
for index in range(len(dataList)):
for key in dataList[index]:
newDict[key] = dataList[index].get(key)
if key in entriesToRemove:
removedEntries = newDict.pop(key)
# NOTE(review): this print sits inside the per-key loop, so newDict is
# dumped once per key of every record — the "printed way too often"
# symptom described above.
print(json.dumps(newDict, indent=4))
Thanks in advance
OP:
for each key in the dictionary, the dictionary gets printed a new time.
Reason:
for index in range(len(dataList)):
for key in dataList[index]:
newDict[key] = dataList[index].get(key)
if key in entriesToRemove:
removedEntries = newDict.pop(key)
print(json.dumps(newDict, indent=4)) # notice this line
The reason why the dictionary is printed for each key is because you have a print(json.dumps(newDict, indent=4)) statement inside the loop for each key-val iteration over the dictionary.
To eradicate the highlighted keys from a list of dict, you could iterate over the list and create another list of dict without the unnecessary keys:
# Sample record list (one income-statement dict per element).
s = [ {
    "date" : "2019-09-28",
    "symbol" : "AAPL",
    "fillingDate" : "2019-10-31 00:00:00",
    "acceptedDate" : "2019-10-30 18:12:36",
    "period" : "FY",
    "revenue" : 260174000000,
    "costOfRevenue" : 161782000000,
    "grossProfit" : 98392000000,
    "grossProfitRatio" : 0.378178,
    "researchAndDevelopmentExpenses" : 16217000000,
    "generalAndAdministrativeExpenses" : 18245000000,
    "sellingAndMarketingExpenses" : 0.0,
    "otherExpenses" : 1807000000,
    "operatingExpenses" : 34462000000,
    "costAndExpenses" : 196244000000,
    "interestExpense" : 3576000000,
    "depreciationAndAmortization" : 12547000000,
    "ebitda" : 81860000000,
    "ebitdaratio" : 0.314636,
    "operatingIncome" : 63930000000,
    "operatingIncomeRatio" : 0.24572,
    "totalOtherIncomeExpensesNet" : 422000000,
    "incomeBeforeTax" : 65737000000,
    "incomeBeforeTaxRatio" : 0.252666,
    "incomeTaxExpense" : 10481000000,
    "netIncome" : 55256000000,
    "netIncomeRatio" : 0.212381,
    "eps" : 2.97145,
    "epsdiluted" : 2.97145,
    "weightedAverageShsOut" : 18595652000,
    "weightedAverageShsOutDil" : 18595652000,
    "link" : "https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/0000320193-19-000119-index.html",
    "finalLink" : "https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/a10-k20199282019.htm"
}
]
res = []
ignored_keys = ['fillingDate', 'acceptedDate', 'link', 'finalLink']
# Build ONE filtered dict per record. (Appending {k: v} inside the key
# loop — the previous version — produced a separate one-pair dict for
# every field instead of a cleaned copy of each record.)
for dd in s:
    res.append({k: v for k, v in dd.items() if k not in ignored_keys})
print(res)
EDIT:
one-liner:
# One cleaned dict per record. (The previous flat dict comprehension merged
# every record into a single dict, letting later records clobber earlier
# ones' keys.)
print([{k: v for k, v in dd.items() if k not in ignored_keys} for dd in s])

Extract values from oddly-nested Python

I must be really slow because I spent a whole day googling and trying to write Python code to simply list the "code" values only, so my output will be Service1, Service2, Service4. I have extracted json values before from complex json or dict structures. But now I must have hit a mental block.
This is my json structure.
# Sample input: "offers" maps service names to metadata dicts. Note the
# key ("Service3") and its "code" ("Service4") can differ, so the desired
# output must read the "code" values, not the keys.
myjson='''
{
"formatVersion" : "ABC",
"publicationDate" : "2017-10-06",
"offers" : {
"Service1" : {
"code" : "Service1",
"version" : "1a1a1a1a",
"index" : "1c1c1c1c1c1c1"
},
"Service2" : {
"code" : "Service2",
"version" : "2a2a2a2a2",
"index" : "2c2c2c2c2c2"
},
"Service3" : {
"code" : "Service4",
"version" : "3a3a3a3a3a",
"index" : "3c3c3c3c3c3"
}
}
}
'''
#convert above string to json
# NOTE(review): requires `import json`, which is not shown in this snippet.
somejson = json.loads(myjson)
# This prints the entire "offers" sub-dict, not the individual "code" values.
print(somejson["offers"]) # I tried so many variations to no avail.
Or, if you want the "code" stuffs :
>>> [s['code'] for s in somejson['offers'].values()]
['Service1', 'Service2', 'Service4']
somejson["offers"] is a dictionary. It seems you want to print its keys.
In Python 2:
print(somejson["offers"].keys())
In Python 3:
print([x for x in somejson["offers"].keys()])
In Python 3, keys() returns a view object rather than a list, so if you need an actual list you must wrap it — either with the comprehension above or simply list(somejson["offers"].keys()).
This should probably do the trick , if you are not certain about the number of Services in the json.
import json

myjson='''
{
"formatVersion" : "ABC",
"publicationDate" : "2017-10-06",
"offers" : {
"Service1" : {
"code" : "Service1",
"version" : "1a1a1a1a",
"index" : "1c1c1c1c1c1c1"
},
"Service2" : {
"code" : "Service2",
"version" : "2a2a2a2a2",
"index" : "2c2c2c2c2c2"
},
"Service3" : {
"code" : "Service4",
"version" : "3a3a3a3a3a",
"index" : "3c3c3c3c3c3"
}
}
}
'''
#convert above string to json
somejson = json.loads(myjson)
# Without knowing the Services in advance: iterate the "offers" dict
# (iterating a dict yields its keys directly — no .keys() needed) and
# print each service's "code" value.
offers = somejson["offers"]
for service in offers:
    print(offers[service]["code"])

Deep check for two python dictionaries and get the difference in report form

Say There are two dictionaries in python -
Dict1
# Reference ("expected") dictionary for the deep-diff example: nested dicts,
# a list of child dicts, and a "Movies" sub-dict absent from mydict2.
mydict1 = {
"Person" :
{
"FName" : "Rakesh",
"LName" : "Roshan",
"Gender" : "Male",
"Status" : "Married",
"Age" : "60",
"Children" :
[
{
"Fname" : "Hrithik",
"Lname" : "Roshan",
"Gender" : "Male",
"Status" : "Married",
"Children" : ["Akram", "Kamal"],
},
{
"Fname" : "Pinky",
"Lname" : "Roshan",
"Gender" : "Female",
"Status" : "Married",
"Children" : ["Suzan", "Tina", "Parveen"]
}
],
"Movies" :
{
"The Last Day" :
{
"Year" : 1990,
"Director" : "Mr. Kapoor"
},
"Monster" :
{
"Year" : 1991,
"Director" : "Mr. Khanna"
}
}
}
}
Dict2
# "Actual" dictionary for the deep-diff example: differs from mydict1 in
# one Lname, the nested Children lists, and the missing Age/Movies keys.
mydict2 = {
"Person" :
{
"FName" : "Rakesh",
"LName" : "Roshan",
"Gender" : "Male",
"Status" : "Married",
"Children" :
[
{
"Fname" : "Hrithik",
"Lname" : "Losan",
"Gender" : "Male",
"Status" : "Married",
"Children" : ["Akram", "Ajamal"],
},
{
"Fname" : "Pinky",
"Lname" : "Roshan",
"Gender" : "Female",
"Status" : "Married",
"Children" : ["Suzan", "Tina"]
}
]
}
}
I want to compare two dictionaries and print the difference in report format as below -
MISMATCH 1
==========
MATCH DICT KEY : Person >> Children >> LName
EXPECTED : Roshan
ACTUAL : Losan
MISMATCH 2
==========
MATCH LIST ITEM : Person >> Children >> Children
EXPECTED : Kamal
ACTUAL : Ajamal
MISMATCH 3
==========
MATCH LIST ITEM : Person >> Children >> Children
EXPECTED : Parveen
ACTUAL : NOT_FOUND
MISMATCH 4
==========
MATCH DICT KEY : Person >> Age
EXPECTED : 60
ACTUAL : NOT_FOUND
MISMATCH 5
==========
MATCH DICT KEY : Person >> Movies
EXPECTED : { Movies : {<COMPLETE DICT>} }
ACTUAL : NOT_FOUND
I tried the Python module called datadiff, which does not give me a pretty output in a dictionary format. To generate the report I have to traverse the dictionary and find the '+' and '-' keys. If the dictionary is too complex, then it's hard to traverse.
UPDATE: I've updated the code to deal with lists in a more appropriate way. I've also commented the code to make it more clear if you need to change it.
This answer is not 100% general right now, but it can be expanded upon easily to fit what you need.
# Print one mismatch record: the key path (when known), then the expected
# and actual values. Python 2 (print statements).
# NOTE(review): mutable default `path=[]` — harmless here since path is
# never mutated, but a None default would be safer.
# NOTE(review): the label is always "MATCH LIST ITEM", even for dict-key
# mismatches (the desired report above distinguishes "MATCH DICT KEY").
def print_error(exp, act, path=[]):
if path != []:
print 'MATCH LIST ITEM: %s' % '>>'.join(path)
print 'EXPECTED: %s' % str(exp)
print 'ACTUAL: %s' % str(act)
print ''
def copy_append(lst, item):
    """Return a shallow copy of *lst* with str(item) appended.

    The input list is never mutated — used to extend the diff path without
    sharing state between recursion branches.
    """
    return lst + [str(item)]
def deep_check(comp, compto, path=[], print_errors=True):
"""Recursively diff `comp` (expected) against `compto` (actual).

Returns the total mismatch count; that count is reused to pair up dicts
inside lists by minimum error. `path` accumulates the key trail for the
report. Python 2 only (iteritems; see the `< None` comparison note below).
NOTE(review): mutable default `path=[]` is safe only because copy_append
returns a copy instead of mutating.
"""
# Total number of errors found, is needed for when
# testing the similarity of dicts
errors = 0
if isinstance(comp, list):
# If the types are not the same then it is probably a critical error
# return a number to represent how important this is
if not isinstance(compto, list):
if print_errors:
print_error(comp, 'NOT_LIST', path)
return 1
# We don't want to destroy the original lists
comp_copy = comp[:]
compto_copy = compto[:]
# Remove items that are both is comp and compto
# and find items that are only in comp
for item in comp_copy[:]:
try:
compto_copy.remove(item)
# Only is removed if the item is in compto_copy
comp_copy.remove(item)
except ValueError:
# dicts need to be handled differently
if isinstance(item, dict):
continue
if print_errors:
print_error(item, 'NOT_FOUND', path)
errors += 1
# Find non-dicts that are only in compto
for item in compto_copy[:]:
if isinstance(item, dict):
continue
compto_copy.remove(item)
if print_errors:
print_error('NOT_FOUND', item, path)
errors += 1
# Now both copies only have dicts
# This is the part that compares dicts with the minimum
# errors between them, it is expensive since each dict in comp_copy
# has to be compared against each dict in compto_copy
for c in comp_copy:
lowest_errors = None
lowest_value = None
for ct in compto_copy:
errors_in = deep_check(c, ct, path, print_errors=False)
# Get and store the minimum errors
# NOTE(review): `errors_in < lowest_errors` with lowest_errors=None only
# works in Python 2 (int/None compare); Python 3 raises TypeError here.
if errors_in < lowest_errors or lowest_errors is None:
lowest_errors = errors_in
lowest_value = ct
if lowest_errors is not None:
errors += lowest_errors
# Has to have print_errors passed incase the list of dicts
# contains a list of dicts
deep_check(c, lowest_value, path, print_errors)
compto_copy.remove(lowest_value)
return errors
if not isinstance(compto, dict):
# If the types are not the same then it is probably a critical error
# return a number to represent how important this is
if print_errors:
# NOTE(review): path argument omitted here, so this mismatch is
# reported without its key trail.
print_error(comp, 'NOT_DICT')
return 1
# Keys present in `compto` but missing from `comp`.
for key, value in compto.iteritems():
try:
comp[key]
except KeyError:
if print_errors:
print_error('NO_KEY', key, copy_append(path, key))
errors += 1
# Keys in `comp`: recurse into containers, compare scalars directly.
for key, value in comp.iteritems():
try:
tovalue = compto[key]
except KeyError:
if print_errors:
print_error(value, 'NOT_FOUND', copy_append(path, key))
errors += 1
continue
if isinstance(value, (list, dict)):
errors += deep_check(value, tovalue, copy_append(path, key), print_errors)
else:
if value != tovalue:
if print_errors:
print_error(value, tovalue, copy_append(path, key))
errors += 1
return errors
With your dicts as input I get:
MATCH LIST ITEM: Person>>Age
EXPECTED: 60
ACTUAL: NOT_FOUND
MATCH LIST ITEM: Person>>Movies
EXPECTED: {'The Last Day': {'Director': 'Mr. Kapoor', 'Year': 1990}, 'Monster': {'Director': 'Mr. Khanna', 'Year': 1991}}
ACTUAL: NOT_FOUND
MATCH LIST ITEM: Person>>Children>>Lname
EXPECTED: Roshan
ACTUAL: Losan
MATCH LIST ITEM: Person>>Children>>Children
EXPECTED: Kamal
ACTUAL: NOT_FOUND
MATCH LIST ITEM: Person>>Children>>Children
EXPECTED: NOT_FOUND
ACTUAL: Ajamal
MATCH LIST ITEM: Person>>Children>>Children
EXPECTED: Parveen
ACTUAL: NOT_FOUND
The way lists are compared has been updated so that these two lists:
['foo', 'bar']
['foo', 'bing', 'bar']
Will only raise an error about 'bing' not being in the first list. With string values the value can either be in the list or not, but an issue arises when you are comparing a list of dicts. You'll end up with dicts from the list that do not match to varying degrees, and knowing what dicts to compare from those is not straight forward.
My implementation solves this by assuming that pairs of dicts that create the lowest number of errors are the ones that need to be compared together. For example:
# Example "expected" input for the list-of-dicts pairing behavior: the two
# member dicts appear in the opposite order in test2.
test1 = {
"Name": "Org Name",
"Members":
[
{
"Fname": "foo",
"Lname": "bar",
"Gender": "Neuter",
"Roles": ["President", "Vice President"]
},
{
"Fname": "bing",
"Lname": "bang",
"Gender": "Neuter",
"Roles": ["President", "Vice President"]
}
]
}
# Example "actual" input: same members reordered, each with a changed
# Gender value — exercises the minimum-error dict pairing.
test2 = {
"Name": "Org Name",
"Members":
[
{
"Fname": "bing",
"Lname": "bang",
"Gender": "Male",
"Roles": ["President", "Vice President"]
},
{
"Fname": "foo",
"Lname": "bar",
"Gender": "Female",
"Roles": ["President", "Vice President"]
}
]
}
Produces this output:
MATCH LIST ITEM: Members>>Gender
EXPECTED: Neuter
ACTUAL: Female
MATCH LIST ITEM: Members>>Gender
EXPECTED: Neuter
ACTUAL: Male

Categories