I'm working on the PoC about Amundsen Lyft for introducing data catalog.
So, I use the file 'sample_mysql_loader.py' and insert database information.
OS: Ubuntu 18.04
Python: 3.6.9
pip: 21.3.1
elasticsearch: 8.0.0
neo4j: 3.5.26
I installed Amundsen Lyft using quickstart on Docker
import logging
import sys
import textwrap
import uuid
from elasticsearch import Elasticsearch
from pyhocon import ConfigFactory
from sqlalchemy.ext.declarative import declarative_base
from databuilder.extractor.mysql_metadata_extractor import MysqlMetadataExtractor
from databuilder.extractor.neo4j_extractor import Neo4jExtractor
from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor
from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor
from databuilder.job.job import DefaultJob
from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader
from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader
from databuilder.publisher import neo4j_csv_publisher
from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher
from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher
from databuilder.task.task import DefaultTask
from databuilder.transformer.base_transformer import NoopTransformer
import pymysql
pymysql.install_as_MySQLdb()
es_host = None
neo_host = None
if len(sys.argv) > 1:
es_host = sys.argv[1]
if len(sys.argv) > 2:
neo_host = sys.argv[2]
es = Elasticsearch([
{'host': es_host or 'localhost'},
])
DB_FILE = '/tmp/test.db'
SQLITE_CONN_STRING = 'sqlite:////tmp/test.db'
Base = declarative_base()
NEO4J_ENDPOINT = f'bolt://{neo_host or "localhost"}:7687'
neo4j_endpoint = NEO4J_ENDPOINT
neo4j_user = 'neo4j'
neo4j_password = 'test'
LOGGER = logging.getLogger(__name__)
# todo: connection string needs to change
def connection_string():
***user = 'user_name'
password='password'
host = 'host'
port = 'port'
db = 'db_name'***
return "mysql://%s:%s#%s:%s/%s" % (user, ***password,*** host, port, db)
def run_mysql_job():
where_clause_suffix = textwrap.dedent("""
where c.table_schema = '***db_name***'
""")
tmp_folder = '/var/tmp/amundsen/table_metadata'
node_files_folder = f'{tmp_folder}/nodes/'
relationship_files_folder = f'{tmp_folder}/relationships/'
job_config = ConfigFactory.from_dict({
f'extractor.mysql_metadata.{MysqlMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
f'extractor.mysql_metadata.{MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
f'extractor.mysql_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag', # should use unique tag here like {ds}
})
job = DefaultJob(conf=job_config,
task=DefaultTask(extractor=MysqlMetadataExtractor(), loader=FsNeo4jCSVLoader()),
publisher=Neo4jCsvPublisher())
return job
def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
elasticsearch_doc_type_key='table',
model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
cypher_query=None,
elasticsearch_mapping=None):
"""
:param elasticsearch_index_alias: alias for Elasticsearch used in
amundsensearchlibrary/search_service/config.py as an index
:param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
`table_search_index`
:param model_name: the Databuilder model class used in transporting between Extractor and Loader
:param cypher_query: Query handed to the `Neo4jSearchDataExtractor` class, if None is given (default)
it uses the `Table` query baked into the Extractor
:param elasticsearch_mapping: Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
if None is given (default) it uses the `Table` query baked into the Publisher
"""
# loader saves data to this location and publisher reads it from here
extracted_search_data_path = '/var/tmp/amundsen/search_data.json'
task = DefaultTask(loader=FSElasticsearchJSONLoader(),
extractor=Neo4jSearchDataExtractor(),
transformer=NoopTransformer())
# elastic search client instance
elasticsearch_client = es
# unique name of new index in Elasticsearch
elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
job_config = ConfigFactory.from_dict({
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name,
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}':
elasticsearch_client,
f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}':
elasticsearch_new_index_key,
f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}':
elasticsearch_doc_type_key,
f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}':
elasticsearch_index_alias,
})
# only optionally add these keys, so need to dynamically `put` them
if cypher_query:
job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}',
cypher_query)
if elasticsearch_mapping:
job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
elasticsearch_mapping)
job = DefaultJob(conf=job_config,
task=task,
publisher=ElasticsearchPublisher())
return job
if __name__ == "__main__":
# Uncomment next line to get INFO level logging
# logging.basicConfig(level=logging.INFO)
loading_job = run_mysql_job()
loading_job.launch()
job_es_table = create_es_publisher_sample_job(
elasticsearch_index_alias='table_search_index',
elasticsearch_doc_type_key='table',
model_name='databuilder.models.table_elasticsearch_document.TableESDocument')
job_es_table.launch()
But, I got the error message below. How can I connect my database to Amundsen Lyft?
elasticsearch.exceptions.RequestError: RequestError(400, 'mapper_parsing_exception', 'Root mapping definition has unsupported parameters: [table : {properties={schema={analyzer=simple, type=text, fields={raw={type=keyword}}}, cluster={type=text}, description={analyzer=simple, type=text}, display_name={type=keyword}, column_descriptions={analyzer=simple, type=text}, programmatic_descriptions={analyzer=simple, type=text}, tags={type=keyword}, badges={type=keyword}, database={analyzer=simple, type=text, fields={raw={type=keyword}}}, total_usage={type=long}, name={analyzer=simple, type=text, fields={raw={type=keyword}}}, last_updated_timestamp={format=epoch_second, type=date}, unique_usage={type=long}, column_names={analyzer=simple, type=text, fields={raw={type=keyword}}}, key={type=keyword}}}]')
Related
When I tried to run my multiprocessing script in terminal, I keep getting this error message:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
This is my script:
import time
from multiprocessing import Pool
from multiprocessing import freeze_support
import getpass
import jaydebeapi
import pandas as pd
import numpy as np
from multiprocessing import Process, freeze_support, set_start_method
def test(first_evnt, last_evnt):
PML_loan_Query = "select b.id_lpa_alt_loan from udbadm.pml_lst_cmpltd_trans_mtch a join udbadm.lpa_altv_loan_idtn b on a.id_evnt = b.id_evnt where b.cd_lpa_alt_loan_idtn = 'HewlettPackardGeneratedTransaction' and a.id_evnt BETWEEN ? AND ?"
PML.execute(PML_loan_Query,(first_evnt, last_evnt))
loan_records = PML.fetchall()
df = pd.DataFrame()
for x in loan_records:
# Populating the ODS table
#borr_query = "SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(100)) AS cd_idx, CAST(rate_curr_int AS INT) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS INT) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS INT) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS INT) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
borr_query = 'SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(10)) AS cd_idx, CAST(rate_curr_int AS VARCHAR(10)) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS VARCHAR(10)) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS VARCHAR(10)) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS VARCHAR(10)) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus IN (?)'
#borr_query = "SELECT DISTINCT nbr_aus FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
ODS.execute(borr_query, x)
#ODS.execute(ODS_list)
ODS_records = ODS.fetchall()
ODS_records = df.append(pd.DataFrame(ODS_records, columns = ['nbr_aus', 'cd_idx', 'rate_curr_int', 'rate_gr_mrtg_mrgn', 'rate_loln_max_cap', 'rate_perdc_cap']))
return ODS_records
if __name__ == '__main__':
freeze_support()
pw = getpass.getpass(prompt="Password", stream=False)
# establishing database to the ODS database
ODS = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver','jdbc:db2://he3qlxvtdbs351.fhlmc.com:50001/DB2QLTY', ['f408195', pw],'C:/JDBC/db2jcc.jar')
# Allows SQL statements between the ODS database
ODS = ODS.cursor()
# creating the password needed to establish PML database connection
pw_2 = getpass.getpass(prompt="Password", stream=False)
# establishing database to the PML database
PML = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver','jdbc:db2://he3qlxvtdbs957.fhlmc.com:50001/PMLFDB2', ['f408195', pw_2],'C:/JDBC/db2jcc.jar')
# Allows SQL statements between the PML database
PML = PML.cursor()
first_evnt = 155643917
last_evnt = 155684481
p = Pool()
result = p.map(test, [first_evnt, last_evnt])
print(result)
p.close()
p.join()
I have been trying to patch the list_blobs() function of ContainerClient, have not been able to do this successfully, this code outputs a MagicMock() function - but the function isn't patched as I would expect it to be (Trying to patch with a list ['Blob1', 'Blob2'].
#################Script File
import sys
from datetime import datetime, timedelta
import pyspark
import pytz
import yaml
# from azure.storage.blob import BlobServiceClient, ContainerClient
from pyspark.dbutils import DBUtils as dbutils
import azure.storage.blob
# Open Config
def main():
spark_context = pyspark.SparkContext.getOrCreate()
spark_context.addFile(sys.argv[1])
stream = None
stream = open(sys.argv[1], "r")
config = yaml.load(stream, Loader=yaml.FullLoader)
stream.close()
account_key = dbutils.secrets.get(scope=config["Secrets"]["Scope"], key=config["Secrets"]["Key Name"])
target_container = config["Storage Configuration"]["Container"]
target_account = config["Storage Configuration"]["Account"]
days_history_to_keep = config["Storage Configuration"]["Days History To Keep"]
connection_string = (
"DefaultEndpointsProtocol=https;AccountName="
+ target_account
+ ";AccountKey="
+ account_key
+ ";EndpointSuffix=core.windows.net"
)
blob_service_client: azure.storage.blob.BlobServiceClient = (
azure.storage.blob.BlobServiceClient.from_connection_string(connection_string)
)
container_client: azure.storage.blob.ContainerClient = (
blob_service_client.get_container_client(target_container)
)
blobs = container_client.list_blobs()
print(blobs)
print(blobs)
utc = pytz.UTC
delete_before_date = utc.localize(
datetime.today() - timedelta(days=days_history_to_keep)
)
for blob in blobs:
if blob.creation_time < delete_before_date:
print("Deleting Blob: " + blob.name)
container_client.delete_blob(blob, delete_snapshots="include")
if __name__ == "__main__":
main()
#################Test File
import unittest
from unittest import mock
import DeleteOldBlobs
class DeleteBlobsTest(unittest.TestCase):
def setUp(self):
pass
#mock.patch("DeleteOldBlobs.azure.storage.blob.ContainerClient")
#mock.patch("DeleteOldBlobs.azure.storage.blob.BlobServiceClient")
#mock.patch("DeleteOldBlobs.dbutils")
#mock.patch("DeleteOldBlobs.sys")
#mock.patch('DeleteOldBlobs.pyspark')
def test_main(self, mock_pyspark, mock_sys, mock_dbutils, mock_blobserviceclient, mock_containerclient):
# mock setup
config_file = "Delete_Old_Blobs_UnitTest.yml"
mock_sys.argv = ["unused_arg", config_file]
mock_dbutils.secrets.get.return_value = "A Secret"
mock_containerclient.list_blobs.return_value = ["ablob1", "ablob2"]
# execute test
DeleteOldBlobs.main()
# TODO assert actions taken
# mock_sys.argv.__get__.assert_called_with()
# dbutils.secrets.get(scope=config['Secrets']['Scope'], key=config['Secrets']['Key Name'])
if __name__ == "__main__":
unittest.main()
Output:
<MagicMock name='BlobServiceClient.from_connection_string().get_container_client().list_blobs()' id='1143355577232'>
What am I doing incorrectly here?
I'm not able to execute your code in this moment, but I have tried to simulate it. To do this I have created the following 3 files in the path: /<path-to>/pkg/sub_pkg1 (where pkg and sub_pkg1 are packages).
File ContainerClient.py
def list_blobs(self):
return "blob1"
File DeleteOldBlobs.py
from pkg.sub_pkg1 import ContainerClient
# Open Config
def main():
blobs = ContainerClient.list_blobs()
print(blobs)
print(blobs)
File DeleteBlobsTest.py
import unittest
from unittest import mock
from pkg.sub_pkg1 import DeleteOldBlobs
class DeleteBlobsTest(unittest.TestCase):
def setUp(self):
pass
def test_main(self):
mock_containerclient = mock.MagicMock()
with mock.patch("DeleteOldBlobs.ContainerClient.list_blobs", mock_containerclient.list_blobs):
mock_containerclient.list_blobs.return_value = ["ablob1", "ablob2"]
DeleteOldBlobs.main()
if __name__ == '__main__':
unittest.main()
If you execute the test code you obtain the output:
['ablob1', 'ablob2']
['ablob1', 'ablob2']
This output means that the function list_blobs() is mocked by mock_containerclient.list_blobs.
I don't know if the content of this post can be useful for you, but I'm not able to simulate better your code in this moment.
I hope you can inspire to my code to find your real solution.
The structure of the answer didn't match my solution, perhaps both will work but it was important for me to patch pyspark even though i never call it, or exceptions would get thrown when my code tried to interact with spark.
Perhaps this will be useful to someone:
#mock.patch("DeleteOldBlobs.azure.storage.blob.BlobServiceClient")
#mock.patch("DeleteOldBlobs.dbutils")
#mock.patch("DeleteOldBlobs.sys")
#mock.patch('DeleteOldBlobs.pyspark')
def test_list_blobs_called_once(self, mock_pyspark, mock_sys, mock_dbutils, mock_blobserviceclient):
# mock setup
config_file = "Delete_Old_Blobs_UnitTest.yml"
mock_sys.argv = ["unused_arg", config_file]
account_key = 'Secret Key'
mock_dbutils.secrets.get.return_value = account_key
bsc_mock: mock.Mock = mock.Mock()
container_client_mock = mock.Mock()
blob1 = Blob('newblob', datetime.today())
blob2 = Blob('oldfile', datetime.today() - timedelta(days=20))
container_client_mock.list_blobs.return_value = [blob1, blob2]
bsc_mock.get_container_client.return_value = container_client_mock
mock_blobserviceclient.from_connection_string.return_value = bsc_mock
# execute test
DeleteOldBlobs.main()
#Assert Results
container_client_mock.list_blobs.assert_called_once()
am trying to read data from Azure event hub and store this dataframe to Mysql table in spark streaming mode.
below is the my pyspark code
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from datetime import datetime as dt
from pyspark.sql import DataFrameWriter
try:
session = SparkSession.builder.master("local").appName("dataingestion").config("")
spark = session.getOrCreate()
print("Successfully build spark session : ")
except:
print("Fail to build spark session : ")
raise Exception
startoffset = "#latest"
positionKey1 = {
"ehName": eventhubName,
"partitionId": 0
}
endTime = dt.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
eventPosition1 = {
"offset": startoffset,
"seqNo": 0,
"enqueuedTime": endTime,
"isInclusive": True
}
positionMap = {
json.dumps(positionKey1) : eventPosition1
}
# Place the map into the main Event Hub config dictionary
ehreadConf = {}
ehreadConf["eventhubs.startingPositions"] = json.dumps(positionMap)
ehreadConf['eventhubs.connectionString'] = event_hub_sas_key
ehreadConf['eventhubs.consumerGroup'] = "$Default"
ehreadConf['eventhubs.maxEventsPerTrigger'] = 5000
try:
inputStream = spark.readStream.format("eventhubs").options(**ehreadConf).load()
print("Successfully connected the event hub : ")
print("Check streaming is started or not : ", inputStream.isStreaming)
print("Schema of inputStream : ", inputStream.printSchema())
except Exception:
print("Fail to connect with Azure event hub : ")
raise Exception
inputStream = inputStream.withColumn("body", inputStream["body"].cast("string"))
db_target_properties = {"user": username, "password": password, "driver": driver}
def foreach_batch_function(sparkDf,epoach_id):
sparkDf.write.option("batchsize","5000").jdbc(url=url, table="demo",properties=db_target_properties, mode="append")
pass
query = sparkDf.writeStream.outputMode("append").format("com.microsoft.sqlserver.jdbc.spark").option("truncate", 'false').option(
"checkpointLocation", "dbfs:/FileStore/lambda-StreamJob/AzureSql/checkpoint").trigger(processingTime='8 seconds').foreachBatch(foreach_batch_function).start()
query.awaitTermination()
spark.stop()
but am unable to store this spark dataframe into mysql table.
am getting an error like data source jdbc dose not support spark streaming.
py4j.protocol.Py4JJavaError: An error occurred while calling o68.start.
: java.lang.UnsupportedOperationException: Data source jdbc does not support streamed writing>
Like the error notes, support for writing from a stream to a JDBC sink is not present in Spark Yet (I guess).
Instead of writing stream directly into mysq, you can do a forEachBatch in the streamDf and do the write.jdbc operation.
server_name = "jdbc:mysql://localhost:3306"
database_name = "eventhub"
jdbcurl = server_name + "/" + database_name
table_name = "stream_cdr_data"
db_properties = {"user":""user"", "password":"data#123"}
def write_to_mysql(df, epoch_id):
dfwriter = df.write.mode("append")
dfwriter.jdbc(url=jdbcurl, table=table_name, properties=db_properties) # if this is not working use below
#df.write.jdbc(url=jdbcurl, table=table_name, properties=db_properties, mode="append")
pass
query = sparkDf.writeStream.outputMode("append").foreachBatch(write_to_mysql).start()
query.awaitTermination()
For below Issue (OP asked in below comment section)
request/expected seqNo xxxx is less than the received seqNo xxxx. The earliest seqNo is yyyy and the last seqNo is yyyy
Take a look at the FAQ !!!!!
I am using confluent-kafka and I need to serialize my keys as strings and produce some messages. I have a working code for the case where I retrieve the schema from the schema registry and use it to produce a message. The problem is that it fails when I am trying to read the schema from a local file instead.
The code below is the working one for the schema registry:
import argparse
from confluent_kafka.schema_registry.avro import AvroSerializer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka import SerializingProducer
import avro.schema
SCHEMA_HOST = '192.168.40.10'
TOPIC = 'my_topic'
SCHEMA = 'path/to/schema.avsc'
# Just parse argumments
parser = argparse.ArgumentParser(description="Avro Kafka Generator")
parser.add_argument('--schema_registry_host', default=SCHEMA_HOST, help="schema registry host")
parser.add_argument('--schema', type=str, default=SCHEMA, help="schema to produce under")
parser.add_argument('--topic', type=str, default=TOPIC, help="topic to publish to")
parser.add_argument('--frequency', type=float, default=1.0, help="number of message per second")
args = parser.parse_args()
# Actual code
schema_registry_conf = {'url': "http://{}:8081".format(SCHEMA_HOST)}
schema_registry_client = SchemaRegistryClient(schema_registry_conf)
schema = schema_registry_client.get_latest_version(subject_name=TOPIC + "-value")
# schema = schema_registry_client.get_schema(schema.schema_id)
schema = schema_registry_client.get_schema(schema.schema_id)
schema_str = schema.schema_str
pro_conf = {"auto.register.schemas": False}
avro_serializer = AvroSerializer(schema_registry_client=schema_registry_client, schema_str=schema_str, conf=pro_conf)
conf = {'bootstrap.servers': "{}:9095".format(args.schema_registry_host),
'schema.registry.url': "http://{}:8081".format(args.schema_registry_host)}
# avro_producer = AvroProducer(conf, default_value_schema=value_schema)
producer_conf = {'bootstrap.servers': "{}:9095".format(SCHEMA_HOST),
'key.serializer': StringSerializer('utf_8'),
'value.serializer': avro_serializer}
avro_producer = SerializingProducer(producer_conf)
But when I try to use a variation for the local file it fails:
# Read schema from local file
value_schema = avro.schema.Parse(open(args.schema, "r").read())
schema_str = open(args.schema, "r").read().replace(' ', '').replace('\n', '')
pro_conf = {"auto.register.schemas": True}
avro_serializer = AvroSerializer(schema_registry_client=schema_registry_client, schema_str=schema_str, conf=pro_conf)
this part is common to both versions:
producer_conf = {'bootstrap.servers': "{}:9095".format(SCHEMA_HOST),
'key.serializer': StringSerializer('utf_8'),
'value.serializer': avro_serializer}
avro_producer = SerializingProducer(producer_conf)
avro_producer.produce(topic=args.topic, value=message)
The error I am getting is the following;
KafkaError{code=_VALUE_SERIALIZATION,val=-161,str="'RecordSchema'
object has no attribute 'lookup_schema'"}
Obviously, it's not the best approach and I guess if it worked the code seems ugly and error prone. But it doesn't even work so, I need some help on how I could read a local schema and use the AvroSerializer afterwards.
Getting "configparser.NoSectionError: No section: 'database'" error while running from unnitest file.
.py file
import psycopg2 as pg2
from configparser import ConfigParser
parser = ConfigParser()
configFilePath = r'dev.ini'
parser.read(configFilePath)
print('1')
conn = pg2.connect(user=parser.get('database', 'user'),
password=parser.get('database', 'password'),
host=parser.get('database', 'host'),
port=parser.get('database', 'port'),
database=parser.get('database', 'database'))
print(conn.get_dsn_parameters(), "\n")
print('2')
def get_data_from_query(query):
"""return query output"""
cur = conn.cursor()
cur.execute(query)
data_from_query = cur.fetchall()
return data_from_query
def economical_bowlers(year,data_table_1="matches",data_table_2="deliveries"):
query = """select c.bowler , sum(c.sum*6/c.count)
from (select a.bowler ,sum(a.total_runs - a.bye_runs - a.legbye_runs ),count(a.ball)
from {2} a inner join {1} b
on a.match_id=b.id
where b.season='{0}'
group by a.bowler)
c group by c.bowler;""".format(year,data_table_1,data_table_2)
data = get_data_from_query(query)
return dict(data)
.py unittest file
import unittest
import os
import sys
sys.path.insert(2, os.path.join(os.getcwd(), '..'))
import economical_bowlers
from Extractor import extractor
import ipl_project_sql
class Economical_bowlers(unittest.TestCase):
def test_economical_bowlers(self):
expected_matches_data_of_all_teams = {'AD Russell': 10, 'P Kumar': 6}
calculated_matches_data_of_all_teams = ipl_project_sql.economical_bowlers('2015', data_table_1='matches',
data_table_2='deliveries')
self.assertEqual(expected_matches_data_of_all_teams,
calculated_matches_data_of_all_teams)
if __name__ == '__main__':
unittest.main()
Both files work perfectly when are independent.The second file just test the first files output.I just want to remove error.
I had one config file for encryption which needed to be in safe folder as that of test folder.