Read and write schema when using the Python avro library

The Avro specification allows the read schema to differ from the write schema, provided the two are compatible. The specification further allows aliases to cater for differences between the read and write schemas. The following Python 2.7 code tries to illustrate this.
import uuid
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

write_schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}

writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(write_schema))
writer.append({"name": "Alyssa", "favorite_number": 256})
writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
writer.close()

read_schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "first_name", "type": "string", "aliases": ["name"]},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}

# 1. open avro and extract passport + data
reader = DataFileReader(open("users.avro", "rb"), DatumReader(write_schema, read_schema))
reader.close()
This code fails with the following error message:
/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 /Users/simonshapiro/python_beam/src/avrov_test.py
Traceback (most recent call last):
  File "/Users/simonshapiro/python_beam/src/avrov_test.py", line 67, in <module>
    writer.append({"name": "Alyssa", "favorite_number": 256})
  File "/Library/Python/2.7/site-packages/avro/datafile.py", line 196, in append
    self.datum_writer.write(datum, self.buffer_encoder)
  File "/Library/Python/2.7/site-packages/avro/io.py", line 768, in write
    if not validate(self.writers_schema, datum):
  File "/Library/Python/2.7/site-packages/avro/io.py", line 103, in validate
    schema_type = expected_schema.type
AttributeError: 'dict' object has no attribute 'type'

Process finished with exit code 1
When it is run without a different read schema, using this line instead
reader = DataFileReader(open("users.avro", "rb"), DatumReader())
it works fine.

Well, after some more work I discovered that the schemas were not set up correctly: they have to be parsed into avro.schema.Schema objects rather than passed around as plain dicts, and the writer's schema belongs on the DataFileWriter, not the DatumWriter. This code works as intended:
import uuid
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

write_schema = avro.schema.parse(json.dumps({
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}))

writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(), write_schema)
writer.append({"name": "Alyssa", "favorite_number": 256})
writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
writer.close()

read_schema = avro.schema.parse(json.dumps({
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "first_name", "type": "string", "default": "", "aliases": ["name"]},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}))

# 1. open avro and extract passport + data
reader = DataFileReader(open("users.avro", "rb"), DatumReader(write_schema, read_schema))
new_schema = reader.get_meta("avro.schema")
users = []
for user in reader:
    users.append(user)
reader.close()
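A quick way to inspect what came back (my addition, not in the original post): new_schema holds the writer's schema that was embedded in the file, as JSON text, and users is a list of plain Python dicts decoded through the read schema.

print(new_schema)   # JSON text of the schema the file was written with
for user in users:
    print(user)     # one dict per record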

Related

Python Avro, how to write data to a modified schema?

I'm new to Avro and I'm trying to perform basic tasks, such as reading data from data.avro; now I want to write data to data.avro.
My problem is: ...is not an example of the schema...
I don't understand where my mistake is and I'd appreciate your help:
from avro import schema, datafile, io
from avro.datafile import DataFileWriter, DataFileReader
from avro.io import DatumWriter, DatumReader

OUTFILE_NAME = '4_2.avro'

SCHEMA_STR = """{
    "namespace": "advdaba",
    "type": "record",
    "name": "Conference",
    "fields": [
        { "name": "name", "type": "string" },
        { "name": "time", "type": "long" },
        { "name": "location", "type": "string" },
        { "name": "speakers", "type": {"type": "array", "items": "string"} },
        { "name": "participants", "type": {"type": "array", "items": "string"} },
        { "name": "seating", "type": {"type": "map", "values": "int"} }
    ]
}"""
SCHEMA = schema.parse(SCHEMA_STR)

# read data written with the old schema
reader = DataFileReader(open("example.avro", "rb"), DatumReader())
#data = []
for example in reader:
    print(example)
    #data.append(example)
reader.close()

# generate data for the new schema
data = {
    'name': 'Foo',
    'time': 25612345,
    'location': 'Berne',
    'speakers': ['Jean', 'Elton'],
    'participants': ['John', 'Michel', 'Jacques'],
    'seating': [{'John': 1}, {'Michel': 2}, {'Jacques': 3}]
}

rec_writer = io.DatumWriter(SCHEMA)
df_writer = datafile.DataFileWriter(
    open(OUTFILE_NAME, 'wb'),
    rec_writer,
    writers_schema=SCHEMA,
    codec='deflate'
)
df_writer.append(data)
df_writer.close()
From what I understand, it should even be possible to write the old and the new data into the same .avro file.
[EDIT]
After debugging, the problem turned out to be the seating field: an Avro map must be written as a single dict, not as a list of one-entry dicts. The solution is
'seating': {"John": 1, "Michel": 2, "Jacques": 3}
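To make it concrete, the corrected datum (my sketch, reusing the names from the post) looks like this; an Avro "array" corresponds to a Python list and a "map" to a Python dict:

data = {
    'name': 'Foo',
    'time': 25612345,
    'location': 'Berne',
    'speakers': ['Jean', 'Elton'],                     # array -> list
    'participants': ['John', 'Michel', 'Jacques'],     # array -> list
    'seating': {'John': 1, 'Michel': 2, 'Jacques': 3}  # map -> dict
}
df_writer.append(data)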

CSV to Avro with Python: Avro Schema Issue

I am trying to serialise my CSV file into Avro and then iterate through each row and send it to a Kafka consumer. Currently I get an issue where the data being sent through doesn't match my schema, but I am unsure as to why.
Below is the code that reads the CSV, serialises its rows and outputs them to a file in Avro format.
import os, csv, avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
from kafka import KafkaProducer
from collections import namedtuple

output_loc = '{}/avro.avro'.format(os.path.dirname(__file__))
CSV = '{}/oscar_age_male.csv'.format(os.path.dirname(__file__))
fields = ("Index", "Year", "Age", "Name", "Movie")
csv_record = namedtuple('csv_record', fields)

def read_csv(path):
    with open(path, 'rU') as data:
        data.readline()
        reader = csv.reader(data, delimiter=",")
        for row in map(csv_record._make, reader):
            print(row)
            yield row

def parse_schema(path='{}/schema.avsc'.format(os.path.dirname(__file__))):
    with open(path, 'r') as data:
        return avro.schema.parse(data.read())

def serilialise_records(records, outpath=output_loc):
    schema = parse_schema()
    with open(outpath, 'w') as out:
        writer = DataFileWriter(out, DatumWriter(), schema)
        for record in records:
            record = dict((f, getattr(record, f)) for f in record._fields)
            writer.append(record)

serilialise_records(read_csv(CSV))
and here is the error I receive:
raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {'Index': '1', 'Year': '1928', 'Age': '44', 'Name': ' "Emil Jannings"', 'Movie': ' "The Last Command The Way of All Flesh"'} is not an example of the schema {
  "type": "record",
  "name": "Test",
  "namespace": "avro_schema_test",
  "fields": [
    {
      "type": "int",
      "name": "Index"
    },
    {
      "type": "int",
      "name": "Year"
    },
    {
      "type": "int",
      "name": "Age"
    },
    {
      "type": "string",
      "name": "Name"
    },
    {
      "type": "string",
      "name": "Movie"
    }
  ]
}
my Avro schema is:
{
    "type": "record",
    "namespace": "avro_schema_test",
    "name": "Test",
    "fields": [
        {"name": "Index", "type": "int"},
        {"name": "Year", "type": "int"},
        {"name": "Age", "type": "int"},
        {"name": "Name", "type": "string"},
        {"name": "Movie", "type": "string"}
    ]
}
Once issue is resolved I will iterate through my avro file and send records to Kafka.
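The traceback already shows the mismatch: csv.reader yields every column as a string, so the datum carries '1', '1928', '44' where the schema declares Index, Year and Age as int. A hedged sketch of a fix (mine, reusing the names from the code above) casts the numeric columns before appending:

# Sketch: cast the numeric CSV columns so each datum matches the
# "int" fields of the schema; csv.reader always yields strings.
def to_datum(record):
    d = dict((f, getattr(record, f)) for f in record._fields)
    for key in ('Index', 'Year', 'Age'):
        d[key] = int(d[key])
    # the sample row also carries stray spaces/quotes, e.g. ' "Emil Jannings"'
    d['Name'] = d['Name'].strip().strip('"')
    d['Movie'] = d['Movie'].strip().strip('"')
    return d

# inside serilialise_records: writer.append(to_datum(record)), followed by
# writer.close() so the file is flushed; under Python 3 the output file
# should also be opened in binary mode ('wb').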

Why is this datum not an example of the avro schema in python?

I'm having some trouble decoding an Avro message from Kafka in Python using kafka-python. To boil it down, I'm focusing on just decoding the message using the avro package. I have written a test with the schema and example from the official avro docs: https://avro.apache.org/docs/current/gettingstartedpython.html.
from avro.io import DatumWriter, DatumReader, BinaryEncoder, BinaryDecoder
import avro.schema
from io import BytesIO

schema = avro.schema.parse("""
{
    "type": "record",
    "name": "User",
    "namespace": "example.avro",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}
""")

wb = BytesIO()
encoder = BinaryEncoder(wb)
writer = DatumWriter(schema)
writer.write('{"name":"Alyssa","favorite_number":256,"favorite_color":"blue"}', encoder)

rb = BytesIO(wb.getvalue())
decoder = BinaryDecoder(rb)
reader = DatumReader(schema)
msg = reader.read(decoder)
print(msg)
I'm receiving an error that the datum {"name":"Alyssa","favorite_number":256,"favorite_color":"blue"} is not an example of the schema. What am I doing wrong given that this schema and datum come straight from the official Avro docs for Python?
Traceback (most recent call last):
  File "main.py", line 36, in <module>
    writer.write('{"name":"Alyssa","favorite_number":256,"favorite_color":"blue"}', encoder)
  File "/opt/virtualenvs/python3/lib/python3.8/site-packages/avro/io.py", line 979, in write
    raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {"name":"Alyssa","favorite_number":256,"favorite_color":"blue"} is not an example of the schema {
  "type": "record",
  "name": "User",
  "namespace": "example.avro",
  "fields": [
    {
      "type": "string",
      "name": "name"
    },
    {
      "type": [
        "int",
        "null"
      ],
      "name": "favorite_number"
    },
    {
      "type": [
        "string",
        "null"
      ],
      "name": "favorite_color"
    }
  ]
}
You currently have
writer.write('{"name":"Alyssa","favorite_number":256,"favorite_color":"blue"}', encoder)
So the datum you are providing is a string. If you change it to a dictionary like this:
writer.write({"name":"Alyssa","favorite_number":256,"favorite_color":"blue"}, encoder)
Then it works.
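The underlying rule: DatumWriter.write validates the datum against the schema, and a record schema expects a Python dict mapping field names to values, not the JSON text of one. If the payload arrives as a JSON string (for example, off a Kafka topic), parse it first; a minimal sketch reusing the writer and encoder from above:

import json

payload = '{"name":"Alyssa","favorite_number":256,"favorite_color":"blue"}'
writer.write(json.loads(payload), encoder)  # a dict, not a str, goes to write()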

Unable to register schema using register() in python

I am trying to register a schema to the Confluent schema registry using Python.
from schema_registry.client import SchemaRegistryClient

subject_name = "new-schema"
schema_url = "https://{{ schemaRegistry }}:8081"
sr = SchemaRegistryClient(schema_url)

schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "user",
    "fields": [
        {"name": "fname", "type": "string"},
        {"name": "favorite_number", "type": "int"}
    ]
}

my_schema = sr.register(subject_name, schema)
I am getting the error:
AttributeError: 'dict' object has no attribute 'name'
This is a valid Avro schema, yet I still get the error. What am I missing here?
Any help would be appreciated.
Instead of a dict, try passing a schema_registry.client.schema.AvroSchema:
from schema_registry.client import SchemaRegistryClient, schema

schema_ = schema.AvroSchema({
    "namespace": "example.avro",
    "type": "record",
    "name": "user",
    "fields": [
        {"name": "fname", "type": "string"},
        {"name": "favorite_number", "type": "int"}
    ]
})
my_schema = sr.register(subject_name, schema_)
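That also explains the original error: the client expects a schema object exposing attributes such as name, and a plain dict has no such attribute, hence AttributeError: 'dict' object has no attribute 'name'. As a quick sanity check (my sketch; as far as I can tell register() returns the id the registry assigned, but verify against the library's docs):

# Assumption: register() returns the numeric schema id from the registry.
schema_id = sr.register(subject_name, schema_)
print(schema_id)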

How to nest records in an Avro schema?

I'm trying to get Python to parse Avro schemas such as the following...
from avro import schema

mySchema = """
{
    "name": "person",
    "type": "record",
    "fields": [
        {"name": "firstname", "type": "string"},
        {"name": "lastname", "type": "string"},
        {
            "name": "address",
            "type": "record",
            "fields": [
                {"name": "streetaddress", "type": "string"},
                {"name": "city", "type": "string"}
            ]
        }
    ]
}"""
parsedSchema = schema.parse(mySchema)
...and I get the following exception:
avro.schema.SchemaParseException: Type property "record" not a valid Avro schema: Could not make an Avro Schema object from record.
What am I doing wrong?
According to other sources on the web, I would rewrite your address field definition like this:
mySchema = """
{
    "name": "person",
    "type": "record",
    "fields": [
        {"name": "firstname", "type": "string"},
        {"name": "lastname", "type": "string"},
        {
            "name": "address",
            "type": {
                "type": "record",
                "name": "AddressUSRecord",
                "fields": [
                    {"name": "streetaddress", "type": "string"},
                    {"name": "city", "type": "string"}
                ]
            }
        }
    ]
}"""
Whenever we give a field a named complex type, the field needs to be written as:
"name": "some_name",
"type": {
    "name": "CodeClassName",
    "type": "record/enum/array"
}
However, if the type is a union, then we do not need the extra wrapping object; the union is given directly as the field's type:
"name": "some_name",
"type": [
    {
        "name": "CodeClassName1",
        "type": "record",
        "fields": ...
    },
    {
        "name": "CodeClassName2",
        "type": "record",
        "fields": ...
    }
]
Hope this clarifies further!
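Putting it together, a self-contained sketch (mine, not from the original answer) that checks the corrected schema parses cleanly:

from avro import schema

fixedSchema = """
{
    "name": "person",
    "type": "record",
    "fields": [
        {"name": "firstname", "type": "string"},
        {"name": "lastname", "type": "string"},
        {"name": "address", "type": {
            "type": "record",
            "name": "AddressUSRecord",
            "fields": [
                {"name": "streetaddress", "type": "string"},
                {"name": "city", "type": "string"}
            ]
        }}
    ]
}"""
parsedSchema = schema.parse(fixedSchema)  # no SchemaParseException this time
print(parsedSchema.name)                  # -> person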
