Python Avro, how to write data to a modified schema?

I'm new to Avro. I can already perform basic tasks like reading data from data.avro, and now I want to write data back to data.avro.
My problem is that appending fails with: ...is not an example of the schema...
I don't understand where my mistake is and I'd appreciate your help:
from avro import schema, datafile, io
from avro.datafile import DataFileWriter, DataFileReader
from avro.io import DatumWriter, DatumReader

OUTFILE_NAME = '4_2.avro'

SCHEMA_STR = """{
    "namespace": "advdaba",
    "type": "record",
    "name": "Conference",
    "fields": [
        { "name": "name", "type": "string" },
        { "name": "time", "type": "long" },
        { "name": "location", "type": "string" },
        { "name": "speakers", "type": {"type": "array", "items": "string"} },
        { "name": "participants", "type": {"type": "array", "items": "string"} },
        { "name": "seating", "type": {"type": "map", "values": "int"} }
    ]
}"""
SCHEMA = schema.parse(SCHEMA_STR)

# read data written with the old schema
reader = DataFileReader(open("example.avro", "rb"), DatumReader())
#data = []
for example in reader:
    print(example)
    #data.append(example)
reader.close()
# generate data for new schema
data = {
    'name': 'Foo',
    'time': 25612345,
    'location': 'Berne',
    'speakers': ['Jean', 'Elton'],
    'participants': ['John', 'Michel', 'Jacques'],
    'seating': [{'John': 1}, {'Michel': 2}, {'Jacques': 3}]
}

rec_writer = io.DatumWriter(SCHEMA)
df_writer = datafile.DataFileWriter(
    open(OUTFILE_NAME, 'wb'),
    rec_writer,
    writers_schema=SCHEMA,
    codec='deflate'
)
df_writer.append(data)
df_writer.close()
From what I understand, it should even be possible to write the old and new data to the same .avro file.
[EDIT]
After debugging, the problem comes from seating: the schema declares it as a map, but I was passing a list of dicts.
The solution is
'seating': {"John": 1, "Michel": 2, "Jacques": 3}

Related

Python parse json object within object within object

I've been successful in reading and parsing one JSON format with Python, including normalising and exploding arrays. However, I have a second JSON format which I'm struggling with, and I'm making little headway.
I need to pull out the 'Entities' and 'Transitions' within the 'data' object.
My json structure is:
{
  "model": {
    "id": "639b2970ac4d16767484b2bd",
    "name": "TestImport",
    "description": "",
    "type": "LineageModel",
    "referenceModelType": null,
    "owner": {
      "id": "639b2904ac4d167674849e1e",
      "name": "xyz",
      "firstName": null,
      "lastName": null
    },
    "members": {
      "totalUsers": 1,
      "totalGroups": 0,
      "users": {
        "Owner": [
          {
            "joinTime": "2022-12-15T14:04:32.076Z",
            "modificationTime": "2022-12-15T14:04:32.076Z",
            "email": "sxyz",
            "username": null,
            "hasPendingSiteInvite": false,
            "isDisabled": false,
            "id": "639b2904ac4d167674849e1e",
            "name": "xyz",
            "firstName": null,
            "lastName": null
          }
        ]
      },
      "groups": {}
    },
    "$type": "ModelInformation"
  },
  "data": {
    "version": "simple",
    "roots": [
      "8a44e4d6-4062-4e1c-a46c-c7787cab4405",
      "d1494635-9005-4337-8eab-227265b29332"
    ],
    "entities": {
      "29380f60-620e-4314-9969-4ad6fe5bbea6": {
        "name": "Element",
        "children": [],
        "id": "29380f60-620e-4314-9969-4ad6fe5bbea6",
        "properties": {}
      },
      "86361ab4-6002-4f3b-b6ca-7e35acd69f9b": {
        "name": "Application",
        "children": [
          "29380f60-620e-4314-9969-4ad6fe5bbea6"
        ],
        "id": "86361ab4-6002-4f3b-b6ca-7e35acd69f9b",
        "properties": {}
      },
      "223d9749-feb2-425d-b512-17b5322cda96": {
        "name": "S_Group",
        "children": [
          "86361ab4-6002-4f3b-b6ca-7e35acd69f9b"
        ],
        "id": "223d9749-feb2-425d-b512-17b5322cda96",
        "properties": {}
      }
    },
    "transitions": {
      "c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5": {
        "source": "29380f60-620e-4314-9969-4ad6fe5bbea6",
        "target": "040677a5-820f-4d17-ae50-1296c0e36273",
        "id": "c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5",
        "properties": {}
      }
    },
    "metadata": {
      "queries": "{\"version\":\"3\"}",
      "views": "{\"views\":[{\"name\":\"MainView\",\"description\":\"\",\"applyOnLoad\":true,\"view\":{\"version\":20,\"zoom\":{\"scale\":1,\"layerWidth\":250,\"layerSpacing\":40},\"collapsed\":{\"expanded\":[\"961847ad-1693-47b5-aa29-88ee07700b5e\",\"0da88727-e272-44a4-adeb-5a0465970490\",\"223d9749-feb2-425d-b512-17b5322cda96\",\"86361ab4-6002-4f3b-b6ca-7e35acd69f9b\"],\"collapsed\":[]},\"trace\":{\"enabled\":true,\"lock\":null,\"highlightedTraceDepth\":1,\"isHighlightedTraceDepthAll\":true,\"isTraversable\":true},\"selection\":[],\"queries\":{\"styled\":[],\"filtered\":[],\"filterType\":{\"6eecd266-2c2e-46d1-a00a-ceec18a87cdb\":\"show\"},\"expandedModules\":[]},\"settings\":{\"hideEmptyContainers\":false,\"hideFilteredLayers\":false,\"expandFilteredEntities\":false,\"portHintsEnabled\":true,\"autoBundleTransitions\":1,\"autoStyleTransitions\":1,\"autoHideTransitions\":1,\"maxSpanningTransitionDepth\":10,\"rootEntityType\":\"Layer\"}},\"options\":{\"zoom\":true,\"collapsed\":true,\"trace\":true,\"selection\":true,\"queries\":true,\"settings\":true},\"id\":\"VIEW-tcPIi3jP\"}]}"
    },
    "queries": [
      {
        "id": "9eb5c7fe-8d4c-4d94-b1a7-fee08bb2f663",
        "name": "TestID",
        "description": "",
        "displayRules": "[{\"id\":\"e119af2d-5061-43f3-901a-4c70f402bb5d\",\"type\":\"PROPERTY\",\"staticColour\":\"#E47833\",\"dynamicColouring\":false,\"cls\":\"dr-19\",\"property\":\"TestID\",\"prefix\":\"\",\"suffix\":\"\",\"center\":false,\"left\":false}]",
        "querySource": "not isEmpty(TestID)",
        "modulePath": "Uncategorised",
        "importedFromModel": null
      }
    ],
    "propertyDefinitions": {
      "propertyDefinitionsId": "2ab6a0c3-8d24-4235-9783-e241437bf860",
      "modelId": "639b2970ac4d16767484b2bd",
      "properties": {
        "TestID": {
          "type": "Number",
          "defaultValue": null,
          "options": [],
          "optionInfos": {}
        }
      }
    }
  },
  "importedModels": {},
  "importedModelsForQueries": {},
  "propertyDefinitionsForImports": {},
  "templateCollections": {}
}
I have been using the following to convert a JSON file to a data frame:
import pandas as pd

fInput = 'filepath to json file'
with open(fInput, encoding='utf-16') as inputfile:
    df = pd.read_json(inputfile)

fOutput = 'output file path'
df.to_csv(fOutput, encoding='utf-16', index=False)
I then normalise columns using the following, which works for other JSON formats:
pd.json_normalize(df['column'])
and I explode arrays in the columns using:
df2 = pd.DataFrame([(d, tup.id) for tup in df.itertuples() for d in tup.columnName])
What I can't work out is how to pull into a data frame the 'entities' object from the 'data' object. Once I can get to that then I should be able to parse the content.
I had got to:
df = df["data"]
df = df["entities"]
When I print that it looks promising, but if I try to output it to CSV it fails with "'dict' object has no attribute 'to_csv'", so I'm going wrong somewhere. The traceback for the error is:
AttributeError
Input [48], in <cell line: 14>()
     12 df = df["entities"]
     13 print(df)
---> 14 df.to_csv(fOutput, encoding='utf-16', index=false)
AttributeError: 'dict' object has no attribute 'to_csv'
Any pointers appreciated.
Couldn't you just do this? I saved the JSON data you gave in a file named data.txt.
import json

with open('data.txt', 'r') as file:
    data = json.load(file)

entities = data['data']['entities']
transitions = data['data']['transitions']
print(f'Entities: {entities}\nTransitions: {transitions}')
Output
Entities: {'29380f60-620e-4314-9969-4ad6fe5bbea6': {'name': 'Element', 'children': [], 'id': '29380f60-620e-4314-9969-4ad6fe5bbea6', 'properties': {}}, '86361ab4-6002-4f3b-b6ca-7e35acd69f9b': {'name': 'Application', 'children': ['29380f60-620e-4314-9969-4ad6fe5bbea6'], 'id': '86361ab4-6002-4f3b-b6ca-7e35acd69f9b', 'properties': {}}, '223d9749-feb2-425d-b512-17b5322cda96': {'name': 'S_Group', 'children': ['86361ab4-6002-4f3b-b6ca-7e35acd69f9b'], 'id': '223d9749-feb2-425d-b512-17b5322cda96', 'properties': {}}}
Transitions: {'c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5': {'source': '29380f60-620e-4314-9969-4ad6fe5bbea6', 'target': '040677a5-820f-4d17-ae50-1296c0e36273', 'id': 'c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5', 'properties': {}}}
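If a DataFrame is still the goal, one way (a sketch, assuming pandas is available) is to load each dict with its GUID keys as the index, since each entity/transition is itself a flat object:

import pandas as pd

# one row per entity or transition, keyed by its GUID
entities_df = pd.DataFrame.from_dict(entities, orient='index')
transitions_df = pd.DataFrame.from_dict(transitions, orient='index')
entities_df.to_csv('entities.csv', encoding='utf-16', index=False)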

Avro schema not respecting alias in schema definition

Avro schema schema.avsc:
{
  "namespace": "standard",
  "type": "record",
  "name": "agent",
  "aliases": ["agents"],
  "fields": [
    {
      "name": "id",
      "type": ["string", "null"]
    },
    {
      "name": "name",
      "type": ["string", "null"],
      "aliases": ["title", "nickname"]
    }
  ]
}
Python script main.py:
from fastavro import writer, reader
from fastavro.schema import load_schema
import jsonlines

schema = load_schema('schema.avsc')
avro_data = 'agent.avro'
data = jsonlines.open('data.jsonl')

with open(avro_data, 'wb') as fout:
    writer(fout, schema, data, validator=True)

with open(avro_data, 'rb') as fin:
    for i in reader(fin, schema):
        print(i)
When my json lines data.jsonl file looks like this:
{"id":"1","name":"foo"}
{"id":"2","name":"bar"}
My python script returns:
{'id': '1', 'name': 'foo'}
{'id': '2', 'name': 'bar'}
However, if my json lines data.jsonl file looks like this:
{"id":"1","title":"foo"}
{"id":"2","title":"bar"}
My python script returns:
{'id': '1', 'name': None}
{'id': '2', 'name': None}
Any idea why the name column isn't respecting the aliases attribute I've defined in the avro schema file for that particular field?
Aliases are used when you have data written with an old schema that you want to read with a new schema. Your example uses only one schema, so the aliases never come into play.
Let's use the following two schemas in an example. Here's an "old" schema which uses the title field:
old_schema.avsc
{
  "namespace": "standard",
  "type": "record",
  "name": "agent",
  "aliases": ["agents"],
  "fields": [
    {
      "name": "id",
      "type": ["string", "null"]
    },
    {
      "name": "title",
      "type": ["string", "null"]
    }
  ]
}
And a new schema where we want the new name field to be an alias of the old title field:
new_schema.avsc
{
  "namespace": "standard",
  "type": "record",
  "name": "agent",
  "aliases": ["agents"],
  "fields": [
    {
      "name": "id",
      "type": ["string", "null"]
    },
    {
      "name": "name",
      "type": ["string", "null"],
      "aliases": ["title"]
    }
  ]
}
If we use your second data.jsonl which looks like this:
{"id":"1","title":"foo"}
{"id":"2","title":"bar"}
Then we can use a slightly modified version of your main.py so that the data is written with the old schema and then the new schema is passed to the reader so that the aliases are respected:
from fastavro import writer, reader
from fastavro.schema import load_schema
import jsonlines

old_schema = load_schema('old_schema.avsc')
new_schema = load_schema('new_schema.avsc')
avro_data = 'agent.avro'
data = jsonlines.open('data.jsonl')

# Data is written with the old schema
with open(avro_data, 'wb') as fout:
    writer(fout, old_schema, data, validator=True)

# And read with the new schema
with open(avro_data, 'rb') as fin:
    for i in reader(fin, new_schema):
        print(i)
Now the output is correct:
{'id': '1', 'name': 'foo'}
{'id': '2', 'name': 'bar'}

CSV to Avro with Python: Avro Schema Issue

I am trying to serialise my CSV file into Avro and then iterate through each row and send it to Kafka. Currently I get an issue where the data being sent through doesn't match my schema, but I am unsure as to why.
Below is the code that reads the CSV, serialises its rows and outputs them to a file in Avro format.
import os, csv, avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
from kafka import KafkaProducer
from collections import namedtuple

output_loc = '{}/avro.avro'.format(os.path.dirname(__file__))
CSV = '{}/oscar_age_male.csv'.format(os.path.dirname(__file__))

fields = ("Index", "Year", "Age", "Name", "Movie")
csv_record = namedtuple('csv_record', fields)

def read_csv(path):
    with open(path, 'rU') as data:
        data.readline()
        reader = csv.reader(data, delimiter=",")
        for row in map(csv_record._make, reader):
            print(row)
            yield row

def parse_schema(path='{}/schema.avsc'.format(os.path.dirname(__file__))):
    with open(path, 'r') as data:
        return avro.schema.parse(data.read())

def serilialise_records(records, outpath=output_loc):
    schema = parse_schema()
    with open(outpath, 'w') as out:
        writer = DataFileWriter(out, DatumWriter(), schema)
        for record in records:
            record = dict((f, getattr(record, f)) for f in record._fields)
            writer.append(record)

serilialise_records(read_csv(CSV))
and here is the error I receive:
raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {'Index': '1', 'Year': '1928', 'Age': '44', 'Name': ' "Emil Jannings"', 'Movie': ' "The Last Command The Way of All Flesh"'} is not an example of the schema {
  "type": "record",
  "name": "Test",
  "namespace": "avro_schema_test",
  "fields": [
    {
      "type": "int",
      "name": "Index"
    },
    {
      "type": "int",
      "name": "Year"
    },
    {
      "type": "int",
      "name": "Age"
    },
    {
      "type": "string",
      "name": "Name"
    },
    {
      "type": "string",
      "name": "Movie"
    }
  ]
}
my Avro schema is:
{
  "type": "record",
  "namespace": "avro_schema_test",
  "name": "Test",
  "fields": [
    {"name": "Index", "type": "int"},
    {"name": "Year", "type": "int"},
    {"name": "Age", "type": "int"},
    {"name": "Name", "type": "string"},
    {"name": "Movie", "type": "string"}
  ]
}
Once issue is resolved I will iterate through my avro file and send records to Kafka.
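The error message itself suggests the likely cause: csv.reader yields every field as a string ('1', '1928', '44', ...), while the schema declares Index, Year and Age as int. A possible fix, sketched here as an untested assumption rather than a confirmed answer, is to cast each field to its schema type before appending (the quote stripping is likewise guessed from the Name value shown in the datum):

# sketch: cast CSV string fields to the types the schema expects
def to_datum(record):
    return {
        'Index': int(record.Index),
        'Year': int(record.Year),
        'Age': int(record.Age),
        # the datum shows values like ' "Emil Jannings"'; strip the stray
        # whitespace and quotes left over from the CSV parse
        'Name': record.Name.strip().strip('"'),
        'Movie': record.Movie.strip().strip('"'),
    }

def serilialise_records(records, outpath=output_loc):
    schema = parse_schema()
    with open(outpath, 'wb') as out:  # binary mode for the Avro container
        writer = DataFileWriter(out, DatumWriter(), schema)
        for record in records:
            writer.append(to_datum(record))
        writer.close()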

Python - Parse complex JSON with objectpath

I need to parse a Terraform state file, written in JSON format. I have to extract two pieces of data, the resource name and its id. This is an example file:
{
  "version": 1,
  "serial": 1,
  "modules": [
    {
      "path": [
        "root"
      ],
      "outputs": {},
      "resources": {
        "aws_security_group.vpc-xxxxxxx-test-1": {
          "type": "aws_security_group",
          "primary": {
            "id": "sg-xxxxxxxxxxxxxx",
            "attributes": {
              "description": "test-1",
              "name": "test-1"
            }
          }
        },
        "aws_security_group.vpc-xxxxxxx-test-2": {
          "type": "aws_security_group",
          "primary": {
            "id": "sg-yyyyyyyyyyyy",
            "attributes": {
              "description": "test-2",
              "name": "test-2"
            }
          }
        }
      }
    }
  ]
}
For each resource I need to export its key and the value of its id; in this case, aws_security_group.vpc-xxxxxxx-test-1 sg-xxxxxxxxxxxxxx and aws_security_group.vpc-xxxxxxx-test-2 sg-yyyyyyyyyyyy.
I have tried to write this in Python:
#!/usr/bin/python3.6
import json
import objectpath

with open('file.json') as json_file:
    data = json.load(json_file)

json_tree = objectpath.Tree(data['modules'])
result = tuple(json_tree.execute('$..resources[0]'))
result is
('aws_security_group.vpc-xxxxxxx-test-1', 'aws_security_group.vpc-xxxxxxx-test-2')
That's OK, but I can't extract the id. Any help is appreciated; other methods are welcome too.
Thanks
I don't know objectpath, but I think you need:
json_tree.execute('$..resources[0]..primary.id')
or even just
json_tree.execute('$..resources[0]..id')
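Since other methods are welcome too: the same pairs can be extracted with plain dict traversal, no objectpath needed. A minimal sketch, assuming the structure shown above, where each resource's id lives under primary:

import json

with open('file.json') as json_file:
    data = json.load(json_file)

# walk modules -> resources, pairing each resource key with its primary id
pairs = [
    (name, resource['primary']['id'])
    for module in data['modules']
    for name, resource in module['resources'].items()
]
for name, res_id in pairs:
    print(name, res_id)
# aws_security_group.vpc-xxxxxxx-test-1 sg-xxxxxxxxxxxxxx
# aws_security_group.vpc-xxxxxxx-test-2 sg-yyyyyyyyyyyy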

How to modify nested JSON with python

I need to update (CRUD) a nested JSON file using Python: to be able to call Python functions that update/delete/create entries and write the result back to the JSON file.
Here is a sample file.
I am looking at the remap library but not sure if this will work.
{
  "groups": [
    {
      "name": "group1",
      "properties": [
        {
          "name": "Test-Key-String",
          "value": {
            "type": "String",
            "encoding": "utf-8",
            "data": "value1"
          }
        },
        {
          "name": "Test-Key-Integer",
          "value": {
            "type": "Integer",
            "data": 1000
          }
        }
      ],
      "groups": [
        {
          "name": "group-child",
          "properties": [
            {
              "name": "Test-Key-String",
              "value": {
                "type": "String",
                "encoding": "utf-8",
                "data": "value1"
              }
            },
            {
              "name": "Test-Key-Integer",
              "value": {
                "type": "Integer",
                "data": 1000
              }
            }
          ]
        }
      ]
    },
    {
      "name": "group2",
      "properties": [
        {
          "name": "Test-Key2-String",
          "value": {
            "type": "String",
            "encoding": "utf-8",
            "data": "value2"
          }
        }
      ]
    }
  ]
}
I feel like I'm missing something in your question. In any event, what I understand is that you want to read a json file, edit the data as a python object, then write it back out with the updated data?
Read the JSON file:
import json

with open("data.json") as f:
    data = json.load(f)
That gives you a dictionary (matching the format you've shown) that you can manipulate however you want. Assuming you want to write it back out:
with open("data.json", "w") as f:
    json.dump(data, f)
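To make the update part concrete, here is a rough sketch of helpers for that structure; find_group and set_property are hypothetical names written against the sample file above, not part of any library:

def find_group(groups, name):
    # depth-first search for a group by name, recursing into nested "groups"
    for group in groups:
        if group.get('name') == name:
            return group
        found = find_group(group.get('groups', []), name)
        if found:
            return found
    return None

def set_property(group, prop_name, new_data):
    # update: overwrite the "data" of a named property, if present
    for prop in group.get('properties', []):
        if prop['name'] == prop_name:
            prop['value']['data'] = new_data
            return True
    return False

child = find_group(data['groups'], 'group-child')
set_property(child, 'Test-Key-Integer', 2000)

with open("data.json", "w") as f:
    json.dump(data, f, indent=2)

Delete and create follow the same pattern: remove or append entries in the "properties" and "groups" lists before dumping.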
