Parsing file via 'iter' excludes a string - python

This is driving me crazy, help is appreciated. Here's what I'm trying to do:
With a JSON file as input, find two consecutive lines that look like:
{
"description"
Then, if that condition is found, insert additional JSON above this point. Following code almost works except that for some reason that I can't figure out, one line is getting skipped.
Code:
# Copy file.in to file.out, inserting a "fields"/"project" JSON stanza just
# before a standalone "description" line that directly follows a '{' line.
with open('file.in',encoding="utf8") as in_file:
    with open('file.out','w',encoding="utf8") as out_file:
        tag_to_check_line1 = '{'
        tag_to_check_line2 = '"description"'
        tag_to_check_not_line2 = ',"description"'
        irofile = iter(in_file)
        for line in irofile:
            if tag_to_check_line1 in line:
                out_file.write(line)
                # manually advance the iterator past the line after '{'
                line = next(irofile)
                if tag_to_check_line2 in line and tag_to_check_not_line2 not in line:
                    out_file.write('\n')
                    out_file.write('"fields": {\n')
                    out_file.write('"project":\n')
                    out_file.write('{\n')
                    out_file.write('"key": "GID"\n')
                    out_file.write('},\n')
                    out_file.write(line)
            # BUG (explained in the answer below): when the '{' check matches
            # but the "description" check fails, the line consumed by next()
            # is never written -- an inner else writing `line` is missing.
            else:
                out_file.write(line)
Input data looks like:
{
"description": "<p>The description is here.</p>",
"customfield_16818": "REQ-7591",
"customfield_16819": "GID-1214020",
"customfield_16815":{"self":"https://jira.com/rest/api/2/customFieldOption/20685","value":"No","id":"20685"},
"summary": "MySQL Redundancy",
"customfield_16816": "0",
"customfield_16817": "0",
"tag": "tagtext"
}
The resulting output looks right, except that the "summary" tag is missing:
{
"fields": {
"project":
{
"key": "GID"
},
"description": "<p>The description is here.</p>",
"customfield_16818": "REQ-7591",
"customfield_16819": "GID-1214020",
"customfield_16815":{"self":"https://jira.com/rest/api/2/customFieldOption/20685","value":"No","id":"20685"},
"customfield_16816": "0",
"customfield_16817": "0",
"tag": "tagtext"
}
So the question is: Why is the "summary" tag missing?

You're missing an else: clause. Can be fixed as indicated below:
# Corrected version: the inner else ensures the line consumed by next() is
# always written out, even when it is not a standalone "description" line.
with open('file.in',encoding="utf8") as in_file:
    with open('file.out', 'w', encoding="utf8") as out_file:
        tag_to_check_line1 = '{'
        tag_to_check_line2 = '"description"'
        tag_to_check_not_line2 = ',"description"'
        irofile = iter(in_file)
        for line in irofile:
            if tag_to_check_line1 in line:
                out_file.write(line)
                # NOTE(review): if the file ends right after a '{' line,
                # this next() raises StopIteration -- confirm inputs always
                # have a line following '{', or pass a default: next(irofile, '')
                line = next(irofile)
                if tag_to_check_line2 in line and tag_to_check_not_line2 not in line:
                    out_file.write('\n')
                    out_file.write('"fields": {\n')
                    out_file.write('"project":\n')
                    out_file.write('{\n')
                    out_file.write('"key": "GID"\n')
                    out_file.write('},\n')
                    out_file.write(line)
                else: # ADD THESE
                    out_file.write(line) # TWO LINES
            else:
                out_file.write(line)

Related

Handling keys inside a key from .json file

I am trying to print an specific key from a JSON file without success.
.json file is:
{
"testing": [
{
"_name": "teste",
"_profile_telphone": "212331233",
"_age": "21"
}
]
}
the function I'm using is:
def load(self):
    """Read profile_<name>.json for the profile named in the entry widget
    and print the value stored under that profile name.

    Shows a message box and refocuses the entry if no name was given.
    """
    filename = self.profile_entry.get()
    if filename == "":
        msg = "Insert a profile name"
        messagebox.showinfo("Profile name is empty", msg)
        self.profile_entry.focus()
    else:
        with open('profile_'+filename+'.json', 'r') as outfile:
            data = json.load(outfile)
            print(data[filename])
            # NOTE(review): redundant -- the with-statement already closes
            # the file when the block exits.
            outfile.close()
with print(data[filename]) I can print entire
{
"_name": "teste",
"_profile_telphone": "212331233",
"_age": "21"
}
But how can I print only name or age for example?
data[filename] is a list of JSON objects — you have a JSON array there. In order to parse it, you can iterate over it with a for statement:
# Iterate the JSON array stored under the profile name; each element is a
# dict, so individual fields are printed by key.
# (Fixed: the original used typographic quotes ‘...’, which are a Python
# SyntaxError; string literals need straight quotes.)
for element in data[filename]:
    print(element['_name'])
    print(element['_profile_telphone'])
    print(element['_age'])

Remove first comma starting from end of file

I'm using Python3 to edit a JSON file by cutting lines 0 - 51 but because I am cutting lines from the file, the last line has a comma which breaks the JSON. What I'm trying to do is go to the last line and remove the comma.
with open(f'test.json', 'r') as f:
line = f.readlines()
cut = line[0:51]
with open(f'test.json', 'w') as file:
close = "]"
for item in cut:
file.write(item)
line.translate(',')[0:-1] #go to last line and remove ,
file.write(close)
JSON example:
[
{"title": "title 1", "description": "description1", "date": "1:15pm Jul 23, 2020", "image": "ImageURL", "url": "link"},
{"title": "title 2", "description": "description 2", "date": "12:30am Jul 23, 2020", "image": "ImageURL", "url": "link"},
]
You might do something like this (see the inline comments for details):
# Keep only the first 51 lines of test.json, drop the trailing comma from
# the last kept line, and close the JSON array again.
with open('test.json', 'r') as src:
    kept = src.readlines()[0:51]
# strip both the comma and the newline at the right edge of the last line
kept[-1] = kept[-1].rstrip(',\n')
with open('test.json', 'w') as dst:
    dst.writelines(kept)
    # put the closing ] on its own line, just for the look
    dst.write('\n]')
By the way, you don't need to put that f in front of 'test.json', like f'test.json'. It is perfectly fine to just use 'test.json'.

Parse JSON structures in a txt file containing JSON and text structures

I have a txt file with json structures. the problem is the file does not only contain json structures but also raw text like log error:
2019-01-18 21:00:05.4521|INFO|Technical|Batch Started|
2019-01-18 21:00:08.8740|INFO|Technical|Got Entities List from 20160101 00:00 :
{
"name": "1111",
"results": [{
"filename": "xxxx",
"numberID": "7412"
}, {
"filename": "xgjhh",
"numberID": "E52"
}]
}
2019-01-18 21:00:05.4521|INFO|Technical|Batch Started|
2019-01-18 21:00:08.8740|INFO|Technical|Got Entities List from 20160101 00:00 :
{
"name": "jfkjgjkf",
"results": [{
"filename": "hhhhh",
"numberID": "478962"
}, {
"filename": "jkhgfc",
"number": "12544"
}]
}
I read the .txt file but trying to patch the jason structures I have an error:
IN :
import json
with open("data.txt", "r", encoding="utf-8", errors='ignore') as f:
json_data = json.load(f)
OUT : json.decoder.JSONDecodeError: Extra data: line 1 column 5 (char 4)
I would like to parce json and save as csv file.
A more general solution to parsing a file with JSON objects mixed with other content without any assumption of the non-JSON content would be to split the file content into fragments by the curly brackets, start with the first fragment that is an opening curly bracket, and then join the rest of fragments one by one until the joined string is parsable as JSON:
# Split the raw text on curly brackets (re.split with a capture group keeps
# the brackets themselves as fragments), then greedily re-join fragments
# starting at each '{' until the accumulated string parses as JSON.
import re
fragments = iter(re.split('([{}])', f.read()))
while True:
    try:
        # skip ahead to the next opening brace
        while True:
            candidate = next(fragments)
            if candidate == '{':
                break
        # grow the candidate one fragment at a time until it is valid JSON
        while True:
            candidate += next(fragments)
            try:
                print(json.loads(candidate))
                break
            except json.decoder.JSONDecodeError:
                # not yet a complete object -- keep appending
                pass
    except StopIteration:
        # ran out of fragments: no more JSON objects in the input
        break
This outputs:
{'name': '1111', 'results': [{'filename': 'xxxx', 'numberID': '7412'}, {'filename': 'xgjhh', 'numberID': 'E52'}]}
{'name': 'jfkjgjkf', 'results': [{'filename': 'hhhhh', 'numberID': '478962'}, {'filename': 'jkhgfc', 'number': '12544'}]}
This solution will strip out the non-JSON content and wrap the remaining JSON structures in a single containing JSON structure. This should do the job for you. I'm posting this as is for expediency, then I'll edit my answer for a clearer explanation. I'll edit this first bit when I've done that:
import json

with open("data.txt", "r", encoding="utf-8", errors='ignore') as f:
    # Keep only non-log lines (those without |INFO|); blank lines become
    # '-split_here-' markers, so splitting on the marker yields one string
    # per JSON object.
    # (Fixed: the original compared with `item.strip() is not ''` -- an
    # identity comparison against a string literal, which is a SyntaxWarning
    # on modern Python; use != instead.)
    cleaned = ''.join([item.strip() if item.strip() != '' else '-split_here-'
                       for item in f.readlines() if '|INFO|' not in item]).split('-split_here-')

# NOTE(review): json.dumps(...) of a *string* followed by json.loads simply
# round-trips that string, so json_data ends up a str (as shown in the
# Output below), not a parsed dict; drop the dumps/loads pair and call
# json.loads on the assembled string directly if a dict is wanted.
json_data = json.loads(json.dumps(('{"entries":[' + ''.join([entry + ', ' for entry in cleaned])[:-2] + ']}')))
Output:
{"entries":[{"name": "1111","results": [{"filename": "xxxx","numberID": "7412"}, {"filename": "xgjhh","numberID": "E52"}]}, {"name": "jfkjgjkf","results": [{"filename": "hhhhh","numberID": "478962"}, {"filename": "jkhgfc","number": "12544"}]}]}
What's going on here?
In the cleaned = ... line, we're using a list comprehension that creates a list of the lines in the file (f.readlines()) that do not contain the string |INFO| and adds the string -split_here- to the list whenever there's a blank line (where .strip() yields '').
Then, we're converting that list of lines (''.join()) into a string.
Finally we're converting that string (.split('-split_here-') into a list of lists, separating the JSON structures into their own lists, marked by blank lines in data.txt.
In the json_data = ... line, we're appending a ', ' to each of the JSON structures using a list comprehension.
Then, we convert that list back into a single string, stripping off the last ', ' (.join()[:-2] — the [:-2] slices off the last two characters of the string).
We then wrap the string with '{"entries":[' and ']}' to make the whole thing a valid JSON structure, and feed it to json.dumps and json.loads to clean any encoding and load your data as a python object.
You could do one of several things:
On the Command Line, remove all lines where, say, "|INFO|Technical|" appears (assuming this appears in every line of raw text):
sed -i '' -e '/\|INFO\|Technical/d' yourfilename (if on Mac),
sed -i '/\|INFO\|Technical/d' yourfilename (if on Linux).
Move these raw lines into their own JSON fields
Use the "text structures" as a delimiter between JSON objects.
Iterate over the lines in the file, saving them to a buffer until you encounter a line that is a text line, at which point parse the lines you've saved as a JSON object.
import re
import json
def is_text(line):
    """Return a truthy re.Match if *line* is a log/text line, i.e. it starts
    with a date and time in "YYYY-MM-DD HH:MM:SS" format; None otherwise."""
    # some lines start with a leading |, remove it before matching
    line = line.lstrip('|')
    # raw string: "\d" in a plain literal is an invalid escape sequence and
    # warns (eventually errors) on modern Python
    return re.match(r"^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})", line)
json_objects = []
with open("data.txt") as f:
    # buffer for the lines of the JSON object currently being accumulated
    json_lines = []
    for line in f:
        if not is_text(line):
            # part of a JSON object -- keep buffering
            json_lines.append(line)
        else:
            # a text/log line ends the current JSON object (if any)
            # if there's multiple text lines in a row json_lines will be empty
            if json_lines:
                json_objects.append(json.loads("".join(json_lines)))
                json_lines = []
# we still need to parse the remaining object in json_lines
# if the file doesn't end in a text line
if json_lines:
    json_objects.append(json.loads("".join(json_lines)))
print(json_objects)
Repeating logic in the last two lines is a bit ugly, but you need to handle the case where the last line in your file is not a text line, so when you're done with the for loop you need parse the last object sitting in json_lines if there is one.
I'm assuming there's never more than one JSON object between text lines and also my regex expression for a date will break in 8,000 years.
You could count curly brackets in your file to find beginning and ending of your jsons, and store them in list, here found_jsons.
import json

open_chars = 0       # number of currently-unclosed '{' braces
saved_content = []   # lines belonging to the JSON object being accumulated
found_jsons = []     # completed, parsed JSON objects

# `content` is the full text of the mixed log/JSON file, read earlier.
# NOTE(review): assumes '{'/'}' never occur inside JSON string values;
# such braces would skew the count -- verify against real input.
for i in content.splitlines():
    # count opening braces before the membership test so a line that both
    # opens and closes an object (a one-line JSON) is still captured
    open_chars += i.count('{')
    if open_chars:
        saved_content.append(i)
    open_chars -= i.count('}')
    # balance back to zero: the buffered lines form one complete document
    if open_chars == 0 and saved_content:
        found_jsons.append(json.loads('\n'.join(saved_content)))
        saved_content = []

for i in found_jsons:
    print(json.dumps(i, indent=4))
Output
{
"results": [
{
"numberID": "7412",
"filename": "xxxx"
},
{
"numberID": "E52",
"filename": "xgjhh"
}
],
"name": "1111"
}
{
"results": [
{
"numberID": "478962",
"filename": "hhhhh"
},
{
"number": "12544",
"filename": "jkhgfc"
}
],
"name": "jfkjgjkf"
}

Create JSON array from python list

I am currently trying to write something to convert an Ansible inventory file into a JSON array which would allow it to be pulled into awx/tower however I have struggled to build a brand new array from the current inventory file format. I am avoiding use of any of the Ansible python API modules as there is no guarantee that future updates won't break these. One solution I found no longer works as there appears to be a change to the Ansible InventoryParser python module so I'm trying to come up with a Python 2.7 solution.
Example inventory file;
[test]
host1
host2
[test1]
host3
host4
The [] signify groups and the other entries are the hosts which will be the key:value relationship. I have converted this to a list in python and I am then attempting to format this into a key:value setup using the [] as where to split the key from the values.
both = []
f = open(filename, "r")
line = f.readline().strip()
while line:
both.append(line)
line = f.readline().strip()
f.close()
start = '['
end = ']'
json_dict = {'all': [dict(item.split(start)[1].split(end)[0] for item in
both)]}
print json.dumps(json_dict)
Unfortunately this returns the error:
ValueError: dictionary update sequence element #0 has length 4; 2 is required
Although truth be told I'm not sure this will return what I am looking for regardless.
Hoping someone can point me in the right direction or highlight where I've gone wrong so far.
Cheers
EDIT: Adding some code for what output is actually expected;
{
[test]: {
'hosts': ['host1', 'host2'],
},
[test1]: {
'hosts': ['host3', 'host4'],
}
}
A more detailed output example of what I'm trying to achieve;
{
"databases" : {
"hosts" : [ "host1.example.com", "host2.example.com" ],
"vars" : {
"a" : true
}
},
"webservers" : [ "host2.example.com", "host3.example.com" ],
"atlanta" : {
"hosts" : [ "host1.example.com", "host4.example.com",
"host5.example.com" ],
"vars" : {
"b" : false
},
},
"marietta" : [ "host6.example.com" ],
"5points" : [ "host7.example.com" ]
}
So we have a key which holds the group names and within that there are key:value pairs for hosts and vars.
After some more study I am closer to the output I desire with the following code;
both = {}
group = None
with open(filename, "r") as f:
line = f.readline().strip()
while line:
if line.startswith('#') or line.startswith(';') or len(line) == 0:
continue
if line.startswith("["):
# is a group
group = line
both[group] = {}
elif not line.startswith("["):
host = line
both[group][host] = {}
line = f.readline().strip()
f.close()
return both
Which returns the following which isn't quite what I'm after but I feel like I am making progress;
{
"[test2]": {
"host1": {},
"host2": {}
},
"[test3]": {
"host3": {}
},
"[test]": {
"host4": {},
"host5": {}
}
}
This may help you.
import json

both = {}
start = '['
end = ']'
with open(filename, "r") as f:
    line = f.readline().strip()
    while line:
        if start in line or end in line:
            # group header like "[test]": keep just the name inside brackets
            line = line.split(start)[1].split(end)[0]
        # NOTE(review): indentation in the original post is ambiguous; as
        # reconstructed here every line (group name or host) becomes a key
        # mapped to itself, so hosts are NOT nested under their group --
        # confirm against the expected output.
        both[line] = line
        line = f.readline().strip()
json_dict = {'all': [both]}
print(json.dumps(json_dict))

Python Help needed : Unable to read contents of a tsv file and populate in the dictionary as desired

What i have to parse :
I have a tsv file that looks like this :
https://i.stack.imgur.com/yxsXD.png
What is the end goal:
My goal is to read the tsv file and populate the contents of the csv file in a dictionary and nested lists without using csv parser.
In the end the in_memory_table structure would look
like this ( of course with more than two rows ):
{
"header": [
"STATION",
"STATION_ID",
"ELEVATION",
"LAT",
"LONG",
"DATE",
"MNTH_MIN",
"MNTH_MAX"
],
"rows": [
[
"Tukwila",
"12345afbl",
"10",
"47.5463454",
"-122.34234234",
"2016-01-01",
"10",
"41"
],
[
"Tukwila",
"12345afbl",
"10",
"47.5463454",
"-122.34234234",
"2016-02-01",
"5",
"35"
],
]
}
My code looks like this:
in_memory_table = {
'header': [],
'rows': [] }
with open('fahrenheit_monthly_readings.tsv') as f:
in_file = f.readlines()
i = 0
for line in in_file:
temp_list = [line.split('\t')]
if (i == 0):
in_memory_table['header']= line
elif(i != 0):
in_memory_table['rows'].append(line)
i += 1
print("\n",in_memory_table)
Output of the code:
C:\Users\svats\AppData\Local\Programs\Python\Python36-32\python.exe C:/Users/svats/PycharmProjects/BrandNew/module4_lab2/module4_lab2.py
{'header': 'STATION\tSTATION_ID\tELEVATION\tLAT\tLONG\tDATE\tMNTH_MIN\tMNTH_MAX\n', 'rows': ['Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-01-01\t10\t41\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-02-01\t5\t35\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-03-01\t32\t47\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-04-01\t35\t49\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-05-01\t41\t60\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-06-01\t50\t72\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-07-01\t57\t70\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-08-01\t68\t79\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-09-01\t55\t71\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-10-01\t47\t77\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-11-01\t32\t66\n', 'Tukwila\t12345afbl\t10\t47.5463454\t-122.34234234\t2016-12-01\t27\t55\n']}
Help needed:
i am very close towards getting the solution
I have 2 questions :
1. how to get rid of the \t in the o/p?
2. My o/p is little different from the desired o/p. how do i get it ?
If you rewrite your code as:
for line in in_file:
print('repr(line) before :', repr(line) )
temp_list = [line.split()]
#line = line.split()
print('temp_list :',temp_list)
print('repr(line) after :', repr(line) )
print(' %s -----------------' % i)
if ........
and de-comment the line #line = line.split()
you'll understand the reason of the bad result you obtain.
The reason is that line.split() doesn't change the object of name line ,
it creates a new object (the list you want) to which name line must be re-assigned if you want this name to refer to the obtained list.
Note that the method str.split([sep[, maxsplit]]) has a different algorithm according if parameter sep is None or not None, see documentation https://docs.python.org/2/library/stdtypes.html#str.split for this point
.
That said, there's a better way.
# Build the table in one pass over the file: the first line is the header,
# every remaining line is a data row (split on whitespace, including tabs).
with open('fahrenheit_monthly_readings.tsv', 'r') as tsv:
    header_fields = next(tsv).split()
    data_rows = [record.split() for record in tsv]
in_memory_table = {'header': header_fields, 'rows': data_rows}
or
with open('fahrenheit_monthly_readings.tsv','r') as f:
    # first line is the header row; split() breaks it on whitespace/tabs
    in_memory_table = {'header':next(f).split()}
    # remaining lines each become one row, split the same way
    in_memory_table['rows'] = list(map(str.split, f))

Categories