Validating Avro schema that is referencing another schema - python

I am using the Python 3 avro_validator library.
The schema I want to validate references other schemas in separate Avro files. The files are in the same folder. How do I compile all the referenced schemas using the library?
Python code as follows:
from avro_validator.schema import Schema
schema_file = 'basketEvent.avsc'
schema = Schema(schema_file)
parsed_schema = schema.parse()
data_to_validate = {"test": "test"}
parsed_schema.validate(data_to_validate)
The error I get back:
ValueError: Error parsing the field [contentBasket]: The type [ContentBasket] is not recognized by Avro
Example Avro file(s) are below:
basketEvent.avsc
{
"type": "record",
"name": "BasketEvent",
"doc": "Indicates that a user action has taken place with a basket",
"fields": [
{
"default": "basket",
"doc": "Restricts this event to having type = basket",
"name": "event",
"type": {
"name": "BasketEventType",
"symbols": ["basket"],
"type": "enum"
}
},
{
"default": "create",
"doc": "What is being done with the basket. Note: create / delete / update will always follow a product event",
"name": "action",
"type": {
"name": "BasketEventAction",
"symbols": ["create","delete","update","view"],
"type": "enum"
}
},
{
"default": "ContentBasket",
"doc": "The set of values that are specific to a Basket event",
"name": "contentBasket",
"type": "ContentBasket"
},
{
"default": "ProductDetail",
"doc": "The set of values that are specific to a Product event",
"name": "productDetail",
"type": "ProductDetail"
},
{
"default": "Timestamp",
"doc": "The time stamp for the event being sent",
"name": "timestamp",
"type": "Timestamp"
}
]
}
contentBasket.avsc
{
"name": "ContentBasket",
"type": "record",
"doc": "The set of values that are specific to a Basket event",
"fields": [
{
"default": [],
"doc": "A range of details about product / basket availability",
"name": "availability",
"type": {
"type": "array",
"items": "Availability"
}
},
{
"default": [],
"doc": "A range of care pland applicable to the basket",
"name": "carePlan",
"type": {
"type": "array",
"items": "CarePlan"
}
},
{
"default": "Category",
"name": "category",
"type": "Category"
},
{
"default": "",
"doc": "Unique identfier for this basket",
"name": "id",
"type": "string"
},
{
"default": "Price",
"doc": "Overall pricing info about the basket as a whole - individual product pricings will be dealt with at a product level",
"name": "price",
"type": "Price"
}
]
}
availability.avsc
{
"name": "Availability",
"type": "record",
"doc": "A range of values relating to the availability of a product",
"fields": [
{
"default": [],
"doc": "A list of offers associated with the overall basket - product level offers will be dealt with on an individual product basis",
"name": "shipping",
"type": {
"type": "array",
"items": "Shipping"
}
},
{
"default": "",
"doc": "The status of the product",
"name": "stockStatus",
"type": {
"name": "StockStatus",
"symbols": ["in stock","out of stock",""],
"type": "enum"
}
},
{
"default": "",
"doc": "The ID for the store when the stock can be collected, if relevant",
"name": "storeId",
"type": "string"
},
{
"default": "",
"doc": "The status of the product",
"name": "type",
"type": {
"name": "AvailabilityType",
"symbols": ["collection","shipping",""],
"type": "enum"
}
}
]
}
maxDate.avsc
{
"type": "record",
"name": "MaxDate",
"doc": "Indicates the timestamp for latest day a delivery should be made",
"fields": [
{
"default": "Timestamp",
"doc": "The time stamp for the delivery",
"name": "timestamp",
"type": "Timestamp"
}
]
}
minDate.avsc
{
"type": "record",
"name": "MinDate",
"doc": "Indicates the timestamp for earliest day a delivery should be made",
"fields": [
{
"default": "Timestamp",
"doc": "The time stamp for the delivery",
"name": "timestamp",
"type": "Timestamp"
}
]
}
shipping.avsc
{
"name": "Shipping",
"type": "record",
"doc": "A range of values relating to shipping a product for delivery",
"fields": [
{
"default": "MaxDate",
"name": "maxDate",
"type": "MaxDate"
},
{
"default": "MinDate",
"name": "minDate",
"type": "minDate"
},
{
"default": 0,
"doc": "Revenue generated from shipping - note, once a specific shipping object is selected, the more detailed revenye data sits within the one of object in pricing - this is more just to define if shipping is free or not",
"name": "revenue",
"type": "int"
},
{
"default": "",
"doc": "The shipping supplier",
"name": "supplier",
"type": "string"
}
]
}
timestamp.avsc
{
"name": "Timestamp",
"type": "record",
"doc": "Timestamp for the action taking place",
"fields": [
{
"default": 0,
"name": "timestampMs",
"type": "long"
},
{
"default": "",
"doc": "Timestamp converted to a string in ISO format",
"name": "isoTimestamp",
"type": "string"
}
]
}

I'm not sure if that library supports what you are trying to do, but fastavro should.
If you put the first schema in a file called BasketEvent.avsc and the second schema in a file called ContentBasket.avsc then you can do the following:
from fastavro.schema import load_schema
from fastavro import validate
schema = load_schema("BasketEvent.avsc")
validate({"test": "test"}, schema)
Note that when I tried to do this I got an error of fastavro._schema_common.UnknownType: Availability because it seems that there are other referenced schemas that you haven't posted here.

Related

A Python script that can navigate a .json and export a .csv based on a search term

I want to take a .json of "_PRESET..." items and their "code-state"s with "actions" that contain other "code-state"s, "appearance"s, and "switch"s and turn it into a .csv produced from the actions under a given "_PRESET...", including the "code-state"s and the "actions" listed under their individual entries.
This would allow a user to enter the "_PRESET..." name and receive a 3-column .csv file containing each action's "type", name, and "value". There are of course ways to export the entire .json easily, but I can't fathom a way to navigate it in the way that is needed.
For example, a user enters "_PRESET_Config_A" for
input.json:
{
"abc_data": {
"_PRESET_Config_A": {
"properties": {
"category": "configuration",
"name": "_PRESET_Config_A",
"collection": null,
"description": ""
},
"actions": {
"EN-R9": {
"type": "code_state",
"value": "on"
}
}
},
"PN4FP": {
"properties": {
"category": "uncategorized",
"name": "PN4FP",
"collection": null,
"description": ""
},
"actions": {
"E_xxxxxx_Default": {
"type": "appearance",
"value": "M_Red"
}
}
},
"HEDIS": {
"properties": {
"category": "uncategorized",
"name": "HEDIS",
"collection": null,
"description": ""
},
"actions": {
"E_xxxxxx_Default": {
"type": "appearance",
"value": "M_Purple"
}
}
},
"_PRESET_Config_B": {
"properties": {
"category": "configuration",
"name": "_PRESET_Config_A",
"collection": null,
"description": ""
},
"actions": {
"HEDIS": {
"type": "code_state",
"value": "on"
}
}
},
"EN-R9": {
"properties": {
"category": "uncategorized",
"name": "EN-R9",
"collection": null,
"description": ""
},
"actions": {
"PN4FP": {
"type": "code_state",
"value": "on"
},
"switch_StorageBin": {
"type": "switch",
"value": "00_w_Storage_Bin_R9"
}
}
}
}
}
Desired output.csv
type,name,value
code_state,EN-R9,on
code_state,PN4FP,on
appearance,E_xxxxxx_Default,M_Red
switch,switch_StorageBin,00_w_Storage_Bin_R9

Parsing Multiple AVRO (avsc files) which refer each other using python (fastavro)

I have an Avro schema which is currently in a single avsc file like below. Now I want to move the address record to a different common avsc file which should be referenced from many other avsc files. So Customer and address will be separate avsc files. How can I separate them and have the customer avsc file reference the address avsc file? Also, how can both files be processed using Python? I am currently using fastavro in Python 3 to process the single avsc file but am open to using any other utility in Python 3 or PySpark.
File name - customer_details.avsc
[
{
"type": "record",
"namespace": "com.company.model",
"name": "AddressRecord",
"fields": [
{
"name": "streetaddress",
"type": "string"
},
{
"name": "city",
"type": "string"
},
{
"name": "state",
"type": "string"
},
{
"name": "zip",
"type": "string"
}
]
},
{
"namespace": "com.company.model",
"type": "record",
"name": "Customer",
"fields": [
{
"name": "firstname",
"type": "string"
},
{
"name": "lastname",
"type": "string"
},
{
"name": "email",
"type": "string"
},
{
"name": "phone",
"type": "string"
},
{
"name": "address",
"type": {
"type": "array",
"items": "com.company.model.AddressRecord"
}
}
]
}
]
import fastavro
s1 = fastavro.schema.load_schema('customer_details.avsc')
How can I split the schema into different files so that the address record file can be referenced from other avsc files? Then how would I process multiple avsc files using fastavro (Python) or any other Python utility?
To do this, the schema for the AddressRecord should be in a file called com.company.model.AddressRecord.avsc with the following contents:
{
"type": "record",
"namespace": "com.company.model",
"name": "AddressRecord",
"fields": [
{
"name": "streetaddress",
"type": "string"
},
{
"name": "city",
"type": "string"
},
{
"name": "state",
"type": "string"
},
{
"name": "zip",
"type": "string"
}
]
}
The Customer schema doesn't necessarily need a special naming convention since it is the top level schema, but it's probably a good idea to follow the same convention. So it would be in a file named com.company.model.Customer.avsc with the following contents:
{
"namespace": "com.company.model",
"type": "record",
"name": "Customer",
"fields": [
{
"name": "firstname",
"type": "string"
},
{
"name": "lastname",
"type": "string"
},
{
"name": "email",
"type": "string"
},
{
"name": "phone",
"type": "string"
},
{
"name": "address",
"type": {
"type": "array",
"items": "com.company.model.AddressRecord"
}
}
]
}
The files must be in the same directory.
Then you should be able to do fastavro.schema.load_schema('com.company.model.Customer.avsc')

Validate Json schema with repeating json response in Python

I am getting None when I try to validate my JSON response against my JSON schema using jsonschema.validate, while it shows as matched on https://www.jsonschemavalidator.net/
Json Schema
{
"KPI": [{
"KPIDefinition": {
"id": {
"type": "string"
},
"name": {
"type": "string"
},
"version": {
"type": "number"
},
"description": {
"type": "string"
},
"datatype": {
"type": "string"
},
"units": {
"type": "string"
}
},
"KPIGroups": [{
"id": {
"type": "number"
},
"name": {
"type": "string"
}
}]
}],
"response": [{
"Description": {
"type": "string"
}
}]
}
JSON Response
JSON Response
{
"KPI": [
{
"KPIDefinition": {
"id": "2",
"name": "KPI 2",
"version": 1,
"description": "This is KPI 2",
"datatype": "1",
"units": "perHour"
},
"KPIGroups": [
{
"id": 7,
"name": "Group 7"
}
]
},
{
"KPIDefinition": {
"id": "3",
"name": "Parameter 3",
"version": 1,
"description": "This is KPI 3",
"datatype": "1",
"units": "per Hour"
},
"KPIGroups": [
{
"id": 7,
"name": "Group 7"
}
]
}
],
"response": [
{
"Description": "RECORD Found"
}
]
}
Code
json_schema2 = {"KPI":[{"KPIDefinition":{"id_new":{"type":"number"},"name":{"type":"string"},"version":{"type":"number"},"description":{"type":"string"},"datatype":{"type":"string"},"units":{"type":"string"}},"KPIGroups":[{"id":{"type":"number"},"name":{"type":"string"}}]}],"response":[{"Description":{"type":"string"}}]}
json_resp = {"KPI":[{"KPIDefinition":{"id":"2","name":"Parameter 2","version":1,"description":"This is parameter 2 definition version 1","datatype":"1","units":"kN"},"KPIGroups":[{"id":7,"name":"Group 7"}]},{"KPIDefinition":{"id":"3","name":"Parameter 3","version":1,"description":"This is parameter 3 definition version 1","datatype":"1","units":"kN"},"KPIGroups":[{"id":7,"name":"Group 7"}]}],"response":[{"Description":"RECORD FETCHED"}]}
print(jsonschema.validate(instance=json_resp, schema=json_schema2))
Validation is not being done correctly, I changed the dataType and key name in my response but still, it is not raising an exception or error.
jsonschema.validate(..) is not supposed to return anything.
Your schema object and the JSON object are both okay and validation has passed if it didn't raise any exceptions -- which seems to be the case here.
That being said, you should wrap your call within a try-except block so as to be able to catch validation errors.
Something like:
try:
jsonschema.validate(...)
print("Validation passed!")
except ValidationError:
print("Validation failed")
# similarly catch SchemaError too if needed.
Update: Your schema is invalid. As it stands, it will validate almost all inputs. A schema JSON should be an object (dict) that should have fields like "type" and based on the type, it might have other required fields like "items" or "properties". Please read up on how to write JSONSchema.
Here's a schema I wrote for your JSON:
{
"type": "object",
"required": [
"KPI",
"response"
],
"properties": {
"KPI": {
"type": "array",
"items": {
"type": "object",
"required": ["KPIDefinition","KPIGroups"],
"properties": {
"KPIDefinition": {
"type": "object",
"required": ["id","name"],
"properties": {
"id": {"type": "string"},
"name": {"type": "string"},
"version": {"type": "integer"},
"description": {"type": "string"},
"datatype": {"type": "string"},
"units": {"type": "string"},
},
"KPIGroups": {
"type": "array",
"items": {
"type": "object",
"required": ["id", "name"],
"properties": {
"id": {"type": "integer"},
"name": {"type": "string"}
}
}
}
}
}
}
},
"response": {
"type": "array",
"items": {
"type": "object",
"required": ["Description"],
"properties": {
"Description": {"type": "string"}
}
}
}
}
}

JSON schema testing

I am developing a json schema and I am trying to test if files validate against it properly. Still new to the whole json schema world (since today), apologies if my terminology is not correct.
I have different types of files, and they will differ with regards to their biomaterial_type. Each of them should be tested for "#/definitions/basic", some of them for "#/definitions/donor", and they all will have unique fields to test for.
Here is a (shortened) example containing one biomaterial_type:
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"basic": {
"type": "object",
"description": "Objects shared across all samples",
"properties": {
"sample_ontology_uri" : {
"type": "array", "minItems": 1,
"items": {
"type": "string",
"format": "uri",
"description": "(Ontology: EFO) links to sample ontology information."}},
"disease_ontology_uri" : {
"type": "array", "minItems": 1,
"items": {
"type": "string",
"format": "uri",
"description": "(Ontology: NCIM)"}},
"disease" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "Free form field "}},
"biomaterial_provider" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "The name of the company, laboratory or person that provided the biological material."}},
"biomaterial_type" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "The type of the biosample used (Cell Line, Primary Cell, Primary Cell Culture, Primary Tissue)",
"enum":["Cell Line", "Primary Cell", "Primary Cell Culture", "Primary Tissue"]}},
"treatment" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "Any artificial modification (differentiation, activation, genome editing, etc)."}},
"biological_replicates": {
"type": "array",
"items": {
"type": "string",
"description": "List of biological replicate sample accessions"}}
},
"required": ["sample_ontology_curie", "disease_ontology_curie", "disease", "biomaterial_provider", "biomaterial_type", "treatment", "biological_replicates"]
},
"donor": {
"type": "object",
"description": "Additional set of properties for samples coming from a donor.",
"properties": {
"donor_id" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "An identifying designation for the donor that provided the cells/tissues."}},
"donor_age" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"description": "The age of the donor that provided the cells/tissues. NA if not available. If over 90 years enter as 90+. If entering a range of ages use the format “{age}-{age}”.",
"oneOf": [
{ "type": "number" },
{ "type": "string", "enum": ["90+", "NA"] },
{ "type": "string", "format": "uri" }
]
}},
"donor_age_unit" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "The unit of measurement used to describe the age of the sample (year, month, week, day)",
"enum": ["year", "month", "week", "day"]}},
"donor_life_stage": {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "The stage or phase of the donor when the sample was taken (embryonic, fetal, postnatal, newborn, child, adult, unknown)",
"enum": ["embryonic", "fetal", "postnatal", "newborn", "child", "adult", "unknown"]}},
"donor_health_status" : {
"type": "array", "minItems": 1, "maxItems": 1, "items": {
"type": "string",
"description": "The health status of the donor that provided the primary cell. NA if not available."}},
"donor_health_status_ontology_uri" : {
"type": "array", "minItems": 1,
"items": {
"type": "string",
"format": "uri",
"description": "(Ontology: NCIM) "}},
"donor_sex" : {"type": "array", "minItems": 1, "maxItems": 1, "items": {"type": "string", "enum": ["Male", "Female", "Unknown", "Mixed"], "description": "'Male', 'Female', 'Unknown', or 'Mixed' for pooled samples."}},
"donor_ethnicity" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "The ethnicity of the donor that provided the primary cell. NA if not available. If dealing with small/vulnerable populations consider identifiability issues."}}
},
"required": ["donor_id", "donor_age", "donor_age_unit", "donor_life_stage", "donor_health_status_uri", "donor_health_status", "donor_sex", "donor_ethnicity"]
}
},
"type" : "object",
"if":
{"properties":
{ "biomaterial_type": {"const": "Primary Tissue"}},
"required": ["biomaterial_type"] },
"then": {
"allOf": [
{ "$ref": "#/definitions/donor" },
{
"properties": {
"tissue_type" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "The type of tissue."}},
"tissue_depot" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "Details about the anatomical location from which the primary tissue was collected."}},
"collection_method" : {
"type": "array", "minItems": 1, "maxItems": 1,
"items": {
"type": "string",
"description": "The protocol for collecting the primary tissue."}}
},
"required": ["tissue_type", "tissue_depot", "collection_method"]
}
]
}
}
Additional biomaterial_type will be added via additional if-conditions.
Here is an example JSON:
{
"SAMPLE_SET": {
"SAMPLE": [
{
"TITLE": "Homo sapiens male embryo (108 days) small intestine tissue",
"SAMPLE_NAME": {
"TAXON_ID": "9606",
"SCIENTIFIC_NAME": "Homo sapiens",
"COMMON_NAME": "human"
},
"SAMPLE_ATTRIBUTES": {
"SAMPLE_ATTRIBUTE": [
{
"TAG": "SAMPLE_ONTOLOGY_URI",
"VALUE": "http://purl.obolibrary.org/obo/UBERON:0002108"
},
{
"TAG": "DISEASE_ONTOLOGY_URI",
"VALUE": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=C115935"
},
{
"TAG": "DISEASE",
"VALUE": "Healthy"
},
{
"TAG": "BIOMATERIAL_PROVIDER",
"VALUE": "Ian Glass at Congenital Defects Lab, University of Washington"
},
{
"TAG": "BIOMATERIAL_TYPE",
"VALUE": "Primary Tissue"
},
{
"TAG": "TISSUE_TYPE",
"VALUE": "small intestine"
},
{
"TAG": "TISSUE_DEPOT",
"VALUE": "Ian Glass at Congenital Defects Lab, University of Washington"
},
{
"TAG": "COLLECTION_METHOD",
"VALUE": "unknown"
},
{
"TAG": "DONOR_ID",
"VALUE": "ENCDO119ASK"
},
{
"TAG": "DONOR_AGE",
"VALUE": "NA"
},
{
"TAG": "DONOR_AGE_UNIT",
"VALUE": "day"
},
{
"TAG": "DONOR_LIFE_STAGE",
"VALUE": "embryonic"
},
{
"TAG": "DONOR_HEALTH_STATUS_ONTOLOGY_URI",
"VALUE": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=C115935"
},
{
"TAG": "DONOR_HEALTH_STATUS",
"VALUE": "Healthy"
},
{
"TAG": "DONOR_SEX",
"VALUE": "Male"
},
{
"TAG": "DONOR_ETHNICITY",
"VALUE": "NA"
}
]
},
"_accession": "ENCBS054KUO",
"_center_name": "ENCODE"
},
]
}
}
I am trying to test if the schema makes sense using jsonschema with Python:
import json
import jsonschema
from jsonschema import validate
data = ''
schema = ''
with open('data.json', 'r') as file:
data = file.read()
with open(schema.json, 'r') as file:
schema = file.read()
try:
jsonschema.validate(json.loads(data), json.loads(schema))
print('ok')
except jsonschema.ValidationError as e:
print (e.message)
except jsonschema.SchemaError as e:
print (e)
I always get "ok", even if I provide json data with errors.
Is the problem with my Python script or with my schema?
Thanks for any pointers.

avro-python3 doesn't provide schema evolution?

I am trying to recreate a schema evolution case with avro-python3 (backward compatibility).
I have two schemas:
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
schema_v1 = avro.schema.Parse("""
{
"type": "record",
"namespace": "com.example",
"name": "CustomerV1",
"fields": [
{ "name": "first_name", "type": "string", "doc": "First Name of Customer" },
{ "name": "last_name", "type": "string", "doc": "Last Name of Customer" },
{ "name": "age", "type": "int", "doc": "Age at the time of registration" },
{ "name": "height", "type": "float", "doc": "Height at the time of registration in cm" },
{ "name": "weight", "type": "float", "doc": "Weight at the time of registration in kg" },
{ "name": "automated_email", "type": "boolean", "default": true, "doc": "Field indicating if the user is enrolled in marketing emails" }
]
}
""")
schema_v2 = avro.schema.Parse("""
{
"type": "record",
"namespace": "com.example",
"name": "CustomerV2",
"fields": [
{ "name": "first_name", "type": "string", "doc": "First Name of Customer" },
{ "name": "last_name", "type": "string", "doc": "Last Name of Customer" },
{ "name": "age", "type": "int", "doc": "Age at the time of registration" },
{ "name": "height", "type": "float", "doc": "Height at the time of registration in cm" },
{ "name": "weight", "type": "float", "doc": "Weight at the time of registration in kg" },
{ "name": "phone_number", "type": ["null", "string"], "default": null, "doc": "optional phone number"},
{ "name": "email", "type": "string", "default": "missing#example.com", "doc": "email address"}
]
}
""")
The second schema doesn't have automated_email field but has two additional fields: phone_number and email.
According to avro schema evolution rules if I write an avro record with schema_v1:
writer = DataFileWriter(open("customer_v1.avro", "wb"), DatumWriter(), schema_v1)
writer.append({
"first_name": "John",
"last_name": "Doe",
"age" : 34,
"height": 178.0,
"weight": 75.0,
"automated_email": True
})
writer.close()
... I can read it with schema_v2, provided there are default values for the non-existing fields:
reader = DataFileReader(open("customer_v1.avro", "rb"), DatumReader(reader_schema=schema_v2))
for field in reader:
print(field)
reader.close()
But I get the following error
SchemaResolutionException: Schemas do not match.
I know this works in Java. This is an example from a video course.
Is there a way to make it work in python?
fastavro, an alternative python implementation, handles this just fine.
The code to write with the first schema is here:
s1 = {
"type": "record",
"namespace": "com.example",
"name": "CustomerV1",
"fields": [
{"name": "first_name", "type": "string", "doc": "First Name of Customer"},
{"name": "last_name", "type": "string", "doc": "Last Name of Customer"},
{"name": "age", "type": "int", "doc": "Age at the time of registration"},
{
"name": "height",
"type": "float",
"doc": "Height at the time of registration in cm",
},
{
"name": "weight",
"type": "float",
"doc": "Weight at the time of registration in kg",
},
{
"name": "automated_email",
"type": "boolean",
"default": True,
"doc": "Field indicating if the user is enrolled in marketing emails",
},
],
}
record = {
"first_name": "John",
"last_name": "Doe",
"age": 34,
"height": 178.0,
"weight": 75.0,
"automated_email": True,
}
import fastavro
with open("test.avro", "wb") as fp:
fastavro.writer(fp, fastavro.parse_schema(s1), [record])
And to read with the second schema:
s2 = {
"type": "record",
"namespace": "com.example",
"name": "CustomerV2",
"fields": [
{"name": "first_name", "type": "string", "doc": "First Name of Customer"},
{"name": "last_name", "type": "string", "doc": "Last Name of Customer"},
{"name": "age", "type": "int", "doc": "Age at the time of registration"},
{
"name": "height",
"type": "float",
"doc": "Height at the time of registration in cm",
},
{
"name": "weight",
"type": "float",
"doc": "Weight at the time of registration in kg",
},
{
"name": "phone_number",
"type": ["null", "string"],
"default": None,
"doc": "optional phone number",
},
{
"name": "email",
"type": "string",
"default": "missing#example.com",
"doc": "email address",
},
],
}
import fastavro
with open("test.avro", "rb") as fp:
for record in fastavro.reader(fp, fastavro.parse_schema(s2)):
print(record)
The output has the new fields as expected:
{'first_name': 'John', 'last_name': 'Doe', 'age': 34, 'height': 178.0, 'weight': 75.0, 'phone_number': None, 'email': 'missing#example.com'}
If you change the second schema from CustomerV2 to CustomerV1 it works with avro-python3 version 1.10.0.

Categories