Avro schema not respecting alias in schema definition - python

Avro schema schema.avsc:
{
"namespace": "standard",
"type": "record",
"name": "agent",
"aliases":["agents"],
"fields": [
{
"name": "id",
"type": ["string", "null"]
},
{
"name": "name",
"type": ["string", "null"],
"aliases":["title", "nickname"]
}
]
}
Python script main.py:
from fastavro import writer, reader
from fastavro.schema import load_schema
import jsonlines

schema = load_schema('schema.avsc')
avro_data = 'agent.avro'
data = jsonlines.open('data.jsonl')

with open(avro_data, 'wb') as fout:
    writer(fout, schema, data, validator=True)

with open(avro_data, 'rb') as fin:
    for i in reader(fin, schema):
        print(i)
When my json lines data.jsonl file looks like this:
{"id":"1","name":"foo"}
{"id":"2","name":"bar"}
My python script returns:
{'id': '1', 'name': 'foo'}
{'id': '2', 'name': 'bar'}
However, if my json lines data.jsonl file looks like this:
{"id":"1","title":"foo"}
{"id":"2","title":"bar"}
My python script returns:
{'id': '1', 'name': None}
{'id': '2', 'name': None}
Any idea why the name column isn't respecting the aliases attribute I've defined in the avro schema file for that particular field?

Aliases are used when you have data written with an old schema that you want to read with a new schema. Your example uses only one schema, so the aliases never come into play.
Let's use the following two schemas in an example. Here's an "old" schema which uses the title field:
old_schema.avsc
{
"namespace": "standard",
"type": "record",
"name": "agent",
"aliases":["agents"],
"fields": [
{
"name": "id",
"type": ["string", "null"]
},
{
"name": "title",
"type": ["string", "null"]
}
]
}
And a new schema where we want the new name field to be an alias of the old title field:
new_schema.avsc
{
"namespace": "standard",
"type": "record",
"name": "agent",
"aliases":["agents"],
"fields": [
{
"name": "id",
"type": ["string", "null"]
},
{
"name": "name",
"type": ["string", "null"],
"aliases":["title"]
}
]
}
If we use your second data.jsonl which looks like this:
{"id":"1","title":"foo"}
{"id":"2","title":"bar"}
Then we can use a slightly modified version of your main.py: the data is written with the old schema, and the new schema is passed to the reader so that the aliases are applied:
from fastavro import writer, reader
from fastavro.schema import load_schema
import jsonlines

old_schema = load_schema('old_schema.avsc')
new_schema = load_schema('new_schema.avsc')
avro_data = 'agent.avro'
data = jsonlines.open('data.jsonl')

# Data is written with the old schema
with open(avro_data, 'wb') as fout:
    writer(fout, old_schema, data, validator=True)

# And read with the new schema
with open(avro_data, 'rb') as fin:
    for i in reader(fin, new_schema):
        print(i)
Now the output is correct:
{'id': '1', 'name': 'foo'}
{'id': '2', 'name': 'bar'}

Related

Python parse json object within object within object

I've been successful in reading in and parsing a JSON string using Python, including normalising and exploding arrays. However, I have a second JSON format that I'm struggling with and making little headway on.
I need to pull out the 'Entities' and 'Transitions' within the 'data' object.
My json structure is:
{
"model": {
"id": "639b2970ac4d16767484b2bd",
"name": "TestImport",
"description": "",
"type": "LineageModel",
"referenceModelType": null,
"owner": {
"id": "639b2904ac4d167674849e1e",
"name": "xyz",
"firstName": null,
"lastName": null
},
"members": {
"totalUsers": 1,
"totalGroups": 0,
"users": {
"Owner": [
{
"joinTime": "2022-12-15T14:04:32.076Z",
"modificationTime": "2022-12-15T14:04:32.076Z",
"email": "sxyz",
"username": null,
"hasPendingSiteInvite": false,
"isDisabled": false,
"id": "639b2904ac4d167674849e1e",
"name": "xyz",
"firstName": null,
"lastName": null
}
]
},
"groups": {}
},
"$type": "ModelInformation"
},
"data": {
"version": "simple",
"roots": [
"8a44e4d6-4062-4e1c-a46c-c7787cab4405",
"d1494635-9005-4337-8eab-227265b29332"
],
"entities": {
"29380f60-620e-4314-9969-4ad6fe5bbea6": {
"name": "Element",
"children": [],
"id": "29380f60-620e-4314-9969-4ad6fe5bbea6",
"properties": {}
},
"86361ab4-6002-4f3b-b6ca-7e35acd69f9b": {
"name": "Application",
"children": [
"29380f60-620e-4314-9969-4ad6fe5bbea6"
],
"id": "86361ab4-6002-4f3b-b6ca-7e35acd69f9b",
"properties": {}
},
"223d9749-feb2-425d-b512-17b5322cda96": {
"name": "S_Group",
"children": [
"86361ab4-6002-4f3b-b6ca-7e35acd69f9b"
],
"id": "223d9749-feb2-425d-b512-17b5322cda96",
"properties": {}
}
},
"transitions": {
"c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5": {
"source": "29380f60-620e-4314-9969-4ad6fe5bbea6",
"target": "040677a5-820f-4d17-ae50-1296c0e36273",
"id": "c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5",
"properties": {}
}
},
"metadata": {
"queries": "{\"version\":\"3\"}",
"views": "{\"views\":[{\"name\":\"MainView\",\"description\":\"\",\"applyOnLoad\":true,\"view\":{\"version\":20,\"zoom\":{\"scale\":1,\"layerWidth\":250,\"layerSpacing\":40},\"collapsed\":{\"expanded\":[\"961847ad-1693-47b5-aa29-88ee07700b5e\",\"0da88727-e272-44a4-adeb-5a0465970490\",\"223d9749-feb2-425d-b512-17b5322cda96\",\"86361ab4-6002-4f3b-b6ca-7e35acd69f9b\"],\"collapsed\":[]},\"trace\":{\"enabled\":true,\"lock\":null,\"highlightedTraceDepth\":1,\"isHighlightedTraceDepthAll\":true,\"isTraversable\":true},\"selection\":[],\"queries\":{\"styled\":[],\"filtered\":[],\"filterType\":{\"6eecd266-2c2e-46d1-a00a-ceec18a87cdb\":\"show\"},\"expandedModules\":[]},\"settings\":{\"hideEmptyContainers\":false,\"hideFilteredLayers\":false,\"expandFilteredEntities\":false,\"portHintsEnabled\":true,\"autoBundleTransitions\":1,\"autoStyleTransitions\":1,\"autoHideTransitions\":1,\"maxSpanningTransitionDepth\":10,\"rootEntityType\":\"Layer\"}},\"options\":{\"zoom\":true,\"collapsed\":true,\"trace\":true,\"selection\":true,\"queries\":true,\"settings\":true},\"id\":\"VIEW-tcPIi3jP\"}]}"
},
"queries": [
{
"id": "9eb5c7fe-8d4c-4d94-b1a7-fee08bb2f663",
"name": "TestID",
"description": "",
"displayRules": "[{\"id\":\"e119af2d-5061-43f3-901a-4c70f402bb5d\",\"type\":\"PROPERTY\",\"staticColour\":\"#E47833\",\"dynamicColouring\":false,\"cls\":\"dr-19\",\"property\":\"TestID\",\"prefix\":\"\",\"suffix\":\"\",\"center\":false,\"left\":false}]",
"querySource": "not isEmpty(TestID)",
"modulePath": "Uncategorised",
"importedFromModel": null
}
],
"propertyDefinitions": {
"propertyDefinitionsId": "2ab6a0c3-8d24-4235-9783-e241437bf860",
"modelId": "639b2970ac4d16767484b2bd",
"properties": {
"TestID": {
"type": "Number",
"defaultValue": null,
"options": [],
"optionInfos": {}
}
}
}
},
"importedModels": {},
"importedModelsForQueries": {},
"propertyDefinitionsForImports": {},
"templateCollections": {}
}
I have been using the following to convert a JSON file to a data frame:
fInput = 'filepath to json file'
with open(fInput, encoding='utf-16') as inputfile:
    df = pd.read_json(inputfile)
fOutput = 'output file path'
df.to_csv(fOutput, encoding='utf-16', index=False)
I then normalise columns using the following, which works for other JSON formats:
pd.json_normalize(df['column'])
and I explode arrays in the columns using:
df2 = pd.DataFrame([(d, tup.id) for tup in df.itertuples() for d in tup.columnName])
What I can't work out is how to pull the 'entities' object out of the 'data' object and into a data frame. Once I can get to that, I should be able to parse the content.
I had got to:
df = df["data"]
df = df["entities"]
When I print that, it looks promising, but if I try to output to CSV it fails with "'dict' object has no attribute", so I'm going wrong somewhere. The traceback for the error is:
AttributeError
Input [48], in <cell line: 14>()
12 df = df["entities"]
13 print(df)
14 df.to_csv(fOutput, encoding='utf-16', index=false)
AttributeError: 'dict' object has no attribute 'to_csv'
Any pointers appreciated.
Couldn't you just do this?
I saved the json data you gave in a file named data.txt
import json

with open('data.txt', 'r') as file:
    data = json.load(file)

entities = data['data']['entities']
transitions = data['data']['transitions']
print(f'Entities: {entities}\nTransitions: {transitions}')
Output
Entities: {'29380f60-620e-4314-9969-4ad6fe5bbea6': {'name': 'Element', 'children': [], 'id': '29380f60-620e-4314-9969-4ad6fe5bbea6', 'properties': {}}, '86361ab4-6002-4f3b-b6ca-7e35acd69f9b': {'name': 'Application', 'children': ['29380f60-620e-4314-9969-4ad6fe5bbea6'], 'id': '86361ab4-6002-4f3b-b6ca-7e35acd69f9b', 'properties': {}}, '223d9749-feb2-425d-b512-17b5322cda96': {'name': 'S_Group', 'children': ['86361ab4-6002-4f3b-b6ca-7e35acd69f9b'], 'id': '223d9749-feb2-425d-b512-17b5322cda96', 'properties': {}}}
Transitions: {'c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5': {'source': '29380f60-620e-4314-9969-4ad6fe5bbea6', 'target': '040677a5-820f-4d17-ae50-1296c0e36273', 'id': 'c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5', 'properties': {}}}
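If the end goal is still a data frame (and eventually a CSV), here is a minimal sketch that builds one row per entity from the nested dict, reusing the data.txt file from the snippet above; the output file names are just placeholders:
import json
import pandas as pd

with open('data.txt', 'r') as file:
    data = json.load(file)

# One row per entity / transition; the GUID keys become the index
entities_df = pd.DataFrame.from_dict(data['data']['entities'], orient='index')
transitions_df = pd.DataFrame.from_dict(data['data']['transitions'], orient='index')

entities_df.to_csv('entities.csv', index_label='entity_id')
transitions_df.to_csv('transitions.csv', index_label='transition_id')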

Python Avro, how to write data to a modified schema?

I'm new to Avro and I'm trying to perform basic tasks like reading data from data.avro, and now I want to write data to data.avro.
My problem is: ...is not an example of the schema...
I don't understand where my mistake is, and I'd appreciate your help:
from avro import schema, datafile, io
from avro.datafile import DataFileWriter, DataFileReader
from avro.io import DatumWriter, DatumReader
OUTFILE_NAME = '4_2.avro'
SCHEMA_STR = """{
"namespace": "advdaba",
"type": "record",
"name": "Conference",
"fields": [
{ "name": "name", "type": "string" },
{ "name": "time", "type": "long" },
{ "name": "location", "type": "string" },
{ "name": "speakers", "type": {"type":"array","items":"string"} },
{ "name": "participants", "type": {"type": "array", "items": "string"} },
{ "name": "seating", "type": {"type": "map", "values": "int"} }
]
}"""
SCHEMA = schema.parse(SCHEMA_STR)
# read data written with the old schema
reader = DataFileReader(open("example.avro", "rb"), DatumReader())
#data = []
for example in reader:
    print(example)
    #data.append(example)
reader.close()
# generate data for new schema
data = {
    'name': 'Foo',
    'time': 25612345,
    'location': 'Berne',
    'speakers': ['Jean', 'Elton'],
    'participants': ['John', 'Michel', 'Jacques'],
    'seating': [{'John': 1}, {'Michel': 2}, {'Jacques': 3}]
}
rec_writer = io.DatumWriter(SCHEMA)
df_writer = datafile.DataFileWriter(
    open(OUTFILE_NAME, 'wb'),
    rec_writer,
    writers_schema=SCHEMA,
    codec='deflate'
)
df_writer.append(data)
df_writer.close()
From what I understand, it should even be possible to write both the old and the new data to the same .avro file.
[EDIT]
After debugging, the problem turned out to be seating: the schema declares it as a map, which corresponds to a single Python dict rather than a list of dicts. The solution is:
'seating': {"John": 1, "Michel": 2, "Jacques": 3}
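For completeness, a minimal sketch (reusing SCHEMA, OUTFILE_NAME and the imports above) that writes the corrected record and reads it back; appending the records read from example.avro to the same writer should also work, as long as they match SCHEMA:
corrected = {
    'name': 'Foo',
    'time': 25612345,
    'location': 'Berne',
    'speakers': ['Jean', 'Elton'],
    'participants': ['John', 'Michel', 'Jacques'],
    'seating': {'John': 1, 'Michel': 2, 'Jacques': 3}  # map -> one dict
}

df_writer = datafile.DataFileWriter(
    open(OUTFILE_NAME, 'wb'),
    io.DatumWriter(SCHEMA),
    writers_schema=SCHEMA,
    codec='deflate'
)
df_writer.append(corrected)
df_writer.close()

# Read it back to check
df_reader = datafile.DataFileReader(open(OUTFILE_NAME, 'rb'), io.DatumReader())
for rec in df_reader:
    print(rec)
df_reader.close()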

CSV to Avro with Python: Avro Schema Issue

I am trying to serialise my CSV file into Avro and then iterate through each row and send it to a Kafka consumer. Currently I get an issue where the data being sent through doesn't match my schema, but I am unsure as to why.
Below is the code that reads the CSV, serialises its rows, and outputs them to a file in Avro format.
import os, csv, avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
from kafka import KafkaProducer
from collections import namedtuple

output_loc = '{}/avro.avro'.format(os.path.dirname(__file__))
CSV = '{}/oscar_age_male.csv'.format(os.path.dirname(__file__))
fields = ("Index", "Year", "Age", "Name", "Movie")
csv_record = namedtuple('csv_record', fields)

def read_csv(path):
    with open(path, 'rU') as data:
        data.readline()
        reader = csv.reader(data, delimiter=",")
        for row in map(csv_record._make, reader):
            print(row)
            yield row

def parse_schema(path='{}/schema.avsc'.format(os.path.dirname(__file__))):
    with open(path, 'r') as data:
        return avro.schema.parse(data.read())

def serilialise_records(records, outpath=output_loc):
    schema = parse_schema()
    with open(outpath, 'w') as out:
        writer = DataFileWriter(out, DatumWriter(), schema)
        for record in records:
            record = dict((f, getattr(record, f)) for f in record._fields)
            writer.append(record)

serilialise_records(read_csv(CSV))
and here is the error I receive:
raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {'Index': '1', 'Year': '1928', 'Age': '44', 'Name': ' "Emil Jannings"', 'Movie': ' "The Last Command The Way of All Flesh"'} is not an example of the schema {
"type": "record",
"name": "Test",
"namespace": "avro_schema_test",
"fields": [
{
"type": "int",
"name": "Index"
},
{
"type": "int",
"name": "Year"
},
{
"type": "int",
"name": "Age"
},
{
"type": "string",
"name": "Name"
},
{
"type": "string",
"name": "Movie"
}
]
}
my Avro schema is:
{
"type": "record",
"namespace": "avro_schema_test",
"name": "Test",
"fields": [
{"name": "Index", "type": "int"},
{"name": "Year", "type": "int"},
{"name": "Age", "type": "int"},
{"name": "Name", "type": "string"},
{"name": "Movie", "type": "string"}
]
}
Once the issue is resolved, I will iterate through my Avro file and send the records to Kafka.
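A likely cause, judging from the error message: csv.reader yields every field as a string ('1', '1928', ' "Emil Jannings"', ...), while the schema declares Index, Year and Age as int. A minimal sketch of a conversion step (to_datum and INT_FIELDS are hypothetical names, assuming the column layout above) that could be applied before writer.append:
INT_FIELDS = ("Index", "Year", "Age")  # columns the schema declares as int

def to_datum(record):
    # Convert the namedtuple to a dict, casting the integer columns and
    # stripping the stray whitespace/quotes left over from the CSV.
    datum = dict((f, getattr(record, f)) for f in record._fields)
    for f in INT_FIELDS:
        datum[f] = int(datum[f])
    for f in ("Name", "Movie"):
        datum[f] = datum[f].strip().strip('"')
    return datum

# inside serilialise_records:
#     writer.append(to_datum(record))
On Python 3 the output file would also need to be opened in binary mode ('wb'), since DataFileWriter writes bytes.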

CSV file to JSON for nested array generic template using python (for csv to mongodb insert)

I want to create a JSON file from a CSV file using a generic Python script.
I found the hone package on GitHub, but some of the functionality is missing from that code.
csv to json
I want the code to be a generic CSV-to-JSON template.
[
{
"birth": {
"day": "7",
"month": "May",
"year": "1985"
},
"name": "Bob",
"reference": "TRUE",
"reference name": "Smith"
}
]
It only handles the above type of JSON.
[
{
"Type": "AwsEc2Instance",
"Id": "i-cafebabe",
"Partition": "aws",
"Region": "us-west-2",
"Tags": {
"billingCode": "Lotus-1-2-3",
"needsPatching": "true"
},
"Details": {
"AwsEc2Instance": {
"Type": "i3.xlarge",
"ImageId": "ami-abcd1234",
"IpV4Addresses": [ "54.194.252.215", "192.168.1.88" ],
"IpV6Addresses": [ "2001:db812341a2b::123" ],
"KeyName": "my_keypair",
"VpcId": "vpc-11112222",
"SubnetId": "subnet-56f5f633",
"LaunchedAt": "2018-05-08T16:46:19.000Z"
}
}
}
]
I want to handle nested arrays [] and objects {} as well.
I have done something like this before; the code below can be modified, as I have not seen your dataset.
import json
import pandas as pd

dataframe = pd.read_excel('dataframefilepath', encoding='utf-8', header=0)

'''Adding to list to finally save it as JSON'''
df = []
for (columnName, columnData) in dataframe.iteritems():
    if dataframe.columns.get_loc(columnName) > 0:
        for indata, rwdata in dataframe.iterrows():
            for insav, rwsave in df_to_Save.iterrows():
                if rwdata.Selected_Prediction == rwsave.Selected_Prediction:
                    df_to_Save.loc[insav, 'Value_to_Save'] = rwdata[dataframe.columns.get_loc(columnName)]
        df.append(df_to_Save.set_index('Selected_Prediction').T.to_dict('record'))
df = eval(df)

'''Saving in JSON format'''
path_to_save = '\\your path'
with open(path_to_save, 'w') as json_file:
    json.dump(df, json_file)
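One common way to get nesting for the first example is a dotted-header convention: if the CSV headers are written like birth.day, birth.month, birth.year (an assumption about the input, not something stated in the question), they can be split on '.' and rebuilt into nested objects. A minimal sketch:
import csv
import json

def set_nested(record, dotted_key, value):
    # "birth.day" -> record["birth"]["day"] = value
    keys = dotted_key.split('.')
    node = record
    for key in keys[:-1]:
        node = node.setdefault(key, {})
    node[keys[-1]] = value

def csv_to_nested_json(csv_path, json_path):
    rows = []
    with open(csv_path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            record = {}
            for column, value in row.items():
                set_nested(record, column, value)
            rows.append(record)
    with open(json_path, 'w', encoding='utf-8') as out:
        json.dump(rows, out, indent=2)
Arrays would still need an extra convention (for example, repeated columns such as IpV4Addresses.0, IpV4Addresses.1), which is part of what makes a fully generic template hard.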

JSON Schema Generator Python

I am using this resource to generate the schema https://github.com/wolverdude/GenSON/
I have the below JSON File
{
'name':'Sam',
},
{
'name':'Jack',
}
so on ...
I am wondering how to iterate over a large JSON file. I want to parse each JSON object and pass it to GenSON to generate a schema:
{
"$schema": "http://json-schema.org/schema#",
"type": "object",
"properties": {
"name": {
"type": [
"string"
]
}
},
"required": [
"name"
]
}
I think you should:
import json
from genson import SchemaBuilder

builder = SchemaBuilder()
with open(filename, 'r') as f:
    datastore = json.load(f)

builder.add_object(datastore)
builder.to_schema()
Where filename is your file path.
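If the large file is actually one JSON object per line (JSON Lines), a minimal sketch that feeds the objects to the builder one at a time instead of loading the whole file at once (data.jsonl is an assumed file name):
import json
from genson import SchemaBuilder

builder = SchemaBuilder()
with open('data.jsonl', 'r') as f:
    for line in f:
        line = line.strip()
        if line:
            builder.add_object(json.loads(line))

print(builder.to_schema())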
