CSV to Avro with Python: Avro Schema Issue

I am trying to serialise my CSV file into Avro and then iterate through each row and send it to a Kafka consumer. Currently I get an error because the data being sent through doesn't match my schema, but I am unsure as to why.
Below is the code that reads the CSV, serialises its rows, and writes them out to a file in Avro format.
import os, csv, avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
from kafka import KafkaProducer
from collections import namedtuple

output_loc = '{}/avro.avro'.format(os.path.dirname(__file__))
CSV = '{}/oscar_age_male.csv'.format(os.path.dirname(__file__))
fields = ("Index", "Year", "Age", "Name", "Movie")
csv_record = namedtuple('csv_record', fields)

def read_csv(path):
    with open(path, 'rU') as data:
        data.readline()
        reader = csv.reader(data, delimiter=",")
        for row in map(csv_record._make, reader):
            print(row)
            yield row

def parse_schema(path='{}/schema.avsc'.format(os.path.dirname(__file__))):
    with open(path, 'r') as data:
        return avro.schema.parse(data.read())

def serilialise_records(records, outpath=output_loc):
    schema = parse_schema()
    with open(outpath, 'w') as out:
        writer = DataFileWriter(out, DatumWriter(), schema)
        for record in records:
            record = dict((f, getattr(record, f)) for f in record._fields)
            writer.append(record)

serilialise_records(read_csv(CSV))
and here is the error I receive:
raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {'Index': '1', 'Year': '1928', 'Age': '44', 'Name': ' "Emil Jannings"', 'Movie': ' "The Last Command The Way of All Flesh"'} is not an example of the schema {
  "type": "record",
  "name": "Test",
  "namespace": "avro_schema_test",
  "fields": [
    {
      "type": "int",
      "name": "Index"
    },
    {
      "type": "int",
      "name": "Year"
    },
    {
      "type": "int",
      "name": "Age"
    },
    {
      "type": "string",
      "name": "Name"
    },
    {
      "type": "string",
      "name": "Movie"
    }
  ]
}
My Avro schema is:
{
    "type": "record",
    "namespace": "avro_schema_test",
    "name": "Test",
    "fields": [
        {"name": "Index", "type": "int"},
        {"name": "Year", "type": "int"},
        {"name": "Age", "type": "int"},
        {"name": "Name", "type": "string"},
        {"name": "Movie", "type": "string"}
    ]
}
Once the issue is resolved I will iterate through my Avro file and send the records to Kafka.
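From the error, every value in the datum arrives as a Python string, because csv.reader yields strings, while Index, Year and Age are declared as int in the schema (and Name/Movie still carry stray quotes and whitespace). A minimal sketch of a conversion I could apply before appending, reusing the field names above, would be:

def to_schema_types(record):
    # csv.reader yields strings; cast the numeric fields and strip the
    # stray quotes/whitespace so the datum matches the int/string schema
    row = dict((f, getattr(record, f)) for f in record._fields)
    for field in ("Index", "Year", "Age"):
        row[field] = int(row[field])
    for field in ("Name", "Movie"):
        row[field] = row[field].strip().strip('"')
    return row

With that applied inside serilialise_records (and the output file opened in 'wb' mode and the writer closed afterwards), the datum should validate against the schema.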

Related

Convert unformatted JSON file to CSV

I am trying to convert a JSON file into CSV. The issue is that the JSON file is not formatted uniformly.
{
    "attributes": {
        "type": "Lead",
        "url": "xyz"
    },
    "FirstName": "Bradford",
    "LastName": "Cosenza",
    "School_District__c": "Ross County",
    "Status": "Open",
    "CreatedDate": "2022-12-21T16:34:35.000+0000",
    "Email": "something#something.com",
    "Lead_ID__c": "00Q3b0000212gxh",
    "Id": "00Q3b0000212gxhEAA"
},
{
    "attributes": {
        "type": "Lead",
        "url": "xyz"
    },
    "FirstName": "Bradford",
    "LastName": "Cosenza",
    "School_District__c": "Ross County",
    "Status": "Open",
    "CreatedDate": "2020-03-31T23:25:03.000+0000",
    "Verification_Status__c": "Invalid",
    "Verification_Date__c": "2022-08-05",
    "Email": "something#something.com",
    "Lead_ID__c": "00Q3b00001t0uNf",
    "Id": "00Q3b00001t0uNfEAI"
},
That is a snippet from the JSON file; note that Verification_Status__c and Verification_Date__c are missing from the 2nd entry.
I used this code:
import json
import csv

# Open the JSON file & load its data
with open('duplicate.json') as dat_file:
    data = json.load(dat_file)
stud_data = data['records']

# Opening a CSV file for writing in write mode
data_file = open('data_file.csv', 'w')
csv_writer = csv.writer(data_file)
count = 0
for cnt in stud_data:
    if count == 0:
        header = cnt.keys()
        csv_writer.writerow(header)
        count += 1
    csv_writer.writerow(cnt.values())
data_file.close()
but I am getting scrambled data in the CSV file.
You can use a csv.DictWriter if the keys appear in a different order or are missing from some records.
If there are nested objects in the JSON, they need to be flattened before being written to the CSV output.
The DictWriter needs the full set of keys when it is created, so you can either define a fixed list up front or make two passes over the data: the first pass collects the full set of keys and the second writes the CSV file (a sketch of the two-pass variant appears after the output below).
import json
import csv

data = """{
"records": [
    {
        "attributes": {
            "type": "Lead",
            "url": "xyz"
        },
        "FirstName": "Bradford",
        "LastName": "Cosenza",
        "School_District__c": "Ross County",
        "Status": "Open",
        "CreatedDate": "2022-12-21T16:34:35.000+0000",
        "Email": "something#something.com",
        "Lead_ID__c": "00Q3b0000212gxh",
        "Id": "00Q3b0000212gxhEAA"
    },
    {
        "attributes": {
            "type": "Lead",
            "url": "xyz"
        },
        "FirstName": "Bradford",
        "LastName": "Cosenza",
        "School_District__c": "Ross County",
        "Status": "Open",
        "CreatedDate": "2020-03-31T23:25:03.000+0000",
        "Verification_Status__c": "Invalid",
        "Verification_Date__c": "2022-08-05",
        "Email": "something#something.com",
        "Lead_ID__c": "00Q3b00001t0uNf",
        "Id": "00Q3b00001t0uNfEAI"
    }
]}"""

# full set of keys in JSON for the CSV columns
keys = ["Id",
        "FirstName",
        "LastName",
        "School_District__c",
        "Status",
        "CreatedDate",
        "Verification_Status__c",
        "Verification_Date__c",
        "Email",
        "Lead_ID__c",
        "type",
        "url"
        ]
Next, convert the data to a list of dictionary objects and write the output to the CSV file.
# Open the JSON file & load its data
# use json.loads() to load from string or json.load() to load from file
data = json.loads(data)
stud_data = data['records']

# Opening a CSV file for writing in write mode
with open('data_file.csv', 'w', newline='') as data_file:
    csv_writer = csv.DictWriter(data_file, fieldnames=keys)
    csv_writer.writeheader()
    for row in stud_data:
        # flatten the sub-elements in the attributes object
        attrs = row.pop("attributes", None)
        if attrs:
            for k, v in attrs.items():
                row[k] = v
        csv_writer.writerow(row)
Output:
Id,FirstName,LastName,School_District__c,Status,CreatedDate,Verification_Status__c,Verification_Date__c,Email,Lead_ID__c,type,url
00Q3b0000212gxhEAA,Bradford,Cosenza,Ross County,Open,2022-12-21T16:34:35.000+0000,,,something#something.com,00Q3b0000212gxh,Lead,xyz
00Q3b00001t0uNfEAI,Bradford,Cosenza,Ross County,Open,2020-03-31T23:25:03.000+0000,Invalid,2022-08-05,something#something.com,00Q3b00001t0uNf,Lead,xyz
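If a fixed key list is not practical, the two-pass variant mentioned above could look roughly like this (a sketch, assuming stud_data has just been loaded from data['records'] and not yet modified):

# First pass: collect the full set of keys across all records,
# including the keys nested inside the "attributes" object.
keys = []
for row in stud_data:
    for k in list(row.keys()) + list(row.get("attributes", {}).keys()):
        if k != "attributes" and k not in keys:
            keys.append(k)

# Second pass: flatten each record and write the CSV with the discovered columns.
with open('data_file.csv', 'w', newline='') as data_file:
    csv_writer = csv.DictWriter(data_file, fieldnames=keys)
    csv_writer.writeheader()
    for row in stud_data:
        attrs = row.pop("attributes", None)
        if attrs:
            row.update(attrs)
        csv_writer.writerow(row)

Keys missing from a record, such as Verification_Status__c, are simply left blank by the DictWriter, which avoids the column misalignment seen with csv.writer.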

Python Avro, how to write data to a modified schema?

I'm new to Avro and I'm trying to perform basic tasks, like reading data from data.avro, and now I want to write data to data.avro.
My problem is: ...is not an example of the schema...
I don't understand where my mistake is and I'd appreciate your help:
from avro import schema, datafile, io
from avro.datafile import DataFileWriter, DataFileReader
from avro.io import DatumWriter, DatumReader

OUTFILE_NAME = '4_2.avro'

SCHEMA_STR = """{
    "namespace": "advdaba",
    "type": "record",
    "name": "Conference",
    "fields": [
        { "name": "name", "type": "string" },
        { "name": "time", "type": "long" },
        { "name": "location", "type": "string" },
        { "name": "speakers", "type": {"type": "array", "items": "string"} },
        { "name": "participants", "type": {"type": "array", "items": "string"} },
        { "name": "seating", "type": {"type": "map", "values": "int"} }
    ]
}"""

SCHEMA = schema.parse(SCHEMA_STR)

# read data written with the old schema
reader = DataFileReader(open("example.avro", "rb"), DatumReader())
# data = []
for example in reader:
    print(example)
    # data.append(example)
reader.close()

# generate data for new schema
data = {
    'name': 'Foo',
    'time': 25612345,
    'location': 'Berne',
    'speakers': ['Jean', 'Elton'],
    'participants': ['John', 'Michel', 'Jacques'],
    'seating': [{'John': 1}, {'Michel': 2}, {'Jacques': 3}]
}

rec_writer = io.DatumWriter(SCHEMA)
df_writer = datafile.DataFileWriter(
    open(OUTFILE_NAME, 'wb'),
    rec_writer,
    writers_schema=SCHEMA,
    codec='deflate'
)
df_writer.append(data)
df_writer.close()
From what I understand, it should even be possible to write the old and new data to the same .avro file.
[EDIT]
After debugging, the problem turned out to be the seating field.
The solution is
'seating': {"John": 1, "Michel": 2, "Jacques": 3}

Avro schema not respecting alias in schema definition

Avro schema schema.avsc:
{
    "namespace": "standard",
    "type": "record",
    "name": "agent",
    "aliases": ["agents"],
    "fields": [
        {
            "name": "id",
            "type": ["string", "null"]
        },
        {
            "name": "name",
            "type": ["string", "null"],
            "aliases": ["title", "nickname"]
        }
    ]
}
Python script main.py:
from fastavro import writer, reader
from fastavro.schema import load_schema
import jsonlines

schema = load_schema('schema.avsc')
avro_data = 'agent.avro'
data = jsonlines.open('data.jsonl')

with open(avro_data, 'wb') as fout:
    writer(fout, schema, data, validator=True)

with open(avro_data, 'rb') as fin:
    for i in reader(fin, schema):
        print(i)
When my json lines data.jsonl file looks like this:
{"id":"1","name":"foo"}
{"id":"2","name":"bar"}
My python script returns:
{'id': '1', 'name': 'foo'}
{'id': '2', 'name': 'bar'}
However, if my json lines data.jsonl file looks like this:
{"id":"1","title":"foo"}
{"id":"2","title":"bar"}
My python script returns:
{'id': '1', 'name': None}
{'id': '2', 'name': None}
Any idea why the name column isn't respecting the aliases attribute I've defined in the avro schema file for that particular field?
Aliases are used when you have data written with an old schema that you want to read with a new schema. Your example only uses one schema, so the alias has nothing to resolve against.
Let's use the following two schemas in an example. Here's an "old" schema which uses the title field:
old_schema.avsc
{
    "namespace": "standard",
    "type": "record",
    "name": "agent",
    "aliases": ["agents"],
    "fields": [
        {
            "name": "id",
            "type": ["string", "null"]
        },
        {
            "name": "title",
            "type": ["string", "null"]
        }
    ]
}
And a new schema where we want the new name field to be an alias of the old title field:
new_schema.avsc
{
    "namespace": "standard",
    "type": "record",
    "name": "agent",
    "aliases": ["agents"],
    "fields": [
        {
            "name": "id",
            "type": ["string", "null"]
        },
        {
            "name": "name",
            "type": ["string", "null"],
            "aliases": ["title"]
        }
    ]
}
If we use your second data.jsonl which looks like this:
{"id":"1","title":"foo"}
{"id":"2","title":"bar"}
Then we can use a slightly modified version of your main.py: the data is written with the old schema, and the new schema is passed to the reader so that the aliases are respected:
from fastavro import writer, reader
from fastavro.schema import load_schema
import jsonlines

old_schema = load_schema('old_schema.avsc')
new_schema = load_schema('new_schema.avsc')
avro_data = 'agent.avro'
data = jsonlines.open('data.jsonl')

# Data is written with the old schema
with open(avro_data, 'wb') as fout:
    writer(fout, old_schema, data, validator=True)

# And read with the new schema
with open(avro_data, 'rb') as fin:
    for i in reader(fin, new_schema):
        print(i)
Now the output is correct:
{'id': '1', 'name': 'foo'}
{'id': '2', 'name': 'bar'}

Why is this datum not an example of the avro schema in python?

I'm having some trouble decoding an Avro message from Kafka in Python using kafka-python. To boil it down, I'm focusing on just decoding the message using the avro package. I have written a test with the schema and example from the official avro docs: https://avro.apache.org/docs/current/gettingstartedpython.html.
from avro.io import DatumWriter, DatumReader, BinaryEncoder, BinaryDecoder
import avro.schema
from io import BytesIO

schema = avro.schema.parse("""
{
    "type": "record",
    "name": "User",
    "namespace": "example.avro",
    "fields": [
        {
            "name": "name",
            "type": "string"
        },
        {
            "name": "favorite_number",
            "type": [
                "int",
                "null"
            ]
        },
        {
            "name": "favorite_color",
            "type": [
                "string",
                "null"
            ]
        }
    ]
}
""")

wb = BytesIO()
encoder = BinaryEncoder(wb)
writer = DatumWriter(schema)
writer.write('{"name":"Alyssa","favorite_number":256,"favorite_color":"blue"}', encoder)

rb = BytesIO(wb.getvalue())
decoder = BinaryDecoder(rb)
reader = DatumReader(schema)
msg = reader.read(decoder)
print(msg)
I'm receiving an error that the datum {"name":"Alyssa","favorite_number":256,"favorite_color":"blue"} is not an example of the schema. What am I doing wrong given that this schema and datum come straight from the official Avro docs for Python?
Traceback (most recent call last):
  File "main.py", line 36, in <module>
    writer.write('{"name":"Alyssa","favorite_number":256,"favorite_color":"blue"}', encoder)
  File "/opt/virtualenvs/python3/lib/python3.8/site-packages/avro/io.py", line 979, in write
    raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {"name":"Alyssa","favorite_number":256,"favorite_color":"blue"} is not an example of the schema {
  "type": "record",
  "name": "User",
  "namespace": "example.avro",
  "fields": [
    {
      "type": "string",
      "name": "name"
    },
    {
      "type": [
        "int",
        "null"
      ],
      "name": "favorite_number"
    },
    {
      "type": [
        "string",
        "null"
      ],
      "name": "favorite_color"
    }
  ]
}
You currently have
writer.write('{"name":"Alyssa","favorite_number":256,"favorite_color":"blue"}', encoder)
So the datum you are providing is a single string, not a record. If you change it to a dictionary like this:
writer.write({"name":"Alyssa","favorite_number":256,"favorite_color":"blue"}, encoder)
Then it works.
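With the dictionary datum, the decode side then yields the record back as a Python dict; a quick sanity check (a sketch reusing the names from the snippet above) might be:

# Re-run the write with a dict datum, then decode it back.
wb = BytesIO()
encoder = BinaryEncoder(wb)
writer = DatumWriter(schema)
writer.write({"name": "Alyssa", "favorite_number": 256, "favorite_color": "blue"}, encoder)

rb = BytesIO(wb.getvalue())
msg = DatumReader(schema).read(BinaryDecoder(rb))
print(msg)  # expected: {'name': 'Alyssa', 'favorite_number': 256, 'favorite_color': 'blue'}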

Read and write schema when using the python avro library

The Avro specification allows the write and read schemas to differ, provided they are compatible. The specification further allows aliases to cater for differences between the read and write schemas. The following Python 2.7 code tries to illustrate this.
import uuid
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

write_schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}

writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(write_schema))
writer.append({"name": "Alyssa", "favorite_number": 256})
writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
writer.close()

read_schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "first_name", "type": "string", "aliases": ["name"]},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}

# 1. open avro and extract passport + data
reader = DataFileReader(open("users.avro", "rb"), DatumReader(write_schema, read_schema))
reader.close()
This code fails with the following error message:
/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 /Users/simonshapiro/python_beam/src/avrov_test.py
Traceback (most recent call last):
  File "/Users/simonshapiro/python_beam/src/avrov_test.py", line 67, in <module>
    writer.append({"name": "Alyssa", "favorite_number": 256})
  File "/Library/Python/2.7/site-packages/avro/datafile.py", line 196, in append
    self.datum_writer.write(datum, self.buffer_encoder)
  File "/Library/Python/2.7/site-packages/avro/io.py", line 768, in write
    if not validate(self.writers_schema, datum):
  File "/Library/Python/2.7/site-packages/avro/io.py", line 103, in validate
    schema_type = expected_schema.type
AttributeError: 'dict' object has no attribute 'type'

Process finished with exit code 1
When it is run without a different read schema, using this line
reader = DataFileReader(open("users.avro", "rb"), DatumReader())
it works fine.
Well, after some more work I discovered that the schemas were not set up correctly: they need to be parsed with avro.schema.parse rather than passed in as plain dicts, and the aliased first_name field needs a default. This code works as intended:
import uuid
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

write_schema = avro.schema.parse(json.dumps({
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}))

writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(), write_schema)
writer.append({"name": "Alyssa", "favorite_number": 256})
writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
writer.close()

read_schema = avro.schema.parse(json.dumps({
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "first_name", "type": "string", "default": "", "aliases": ["name"]},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]}
    ]
}))

# 1. open avro and extract passport + data
reader = DataFileReader(open("users.avro", "rb"), DatumReader(write_schema, read_schema))
new_schema = reader.get_meta("avro.schema")
users = []
for user in reader:
    users.append(user)
reader.close()
