Getting "configparser.NoSectionError: No section: 'database'" error while running from unnitest file.
.py file
import psycopg2 as pg2
from configparser import ConfigParser
parser = ConfigParser()
configFilePath = r'dev.ini'
parser.read(configFilePath)
print('1')
conn = pg2.connect(user=parser.get('database', 'user'),
password=parser.get('database', 'password'),
host=parser.get('database', 'host'),
port=parser.get('database', 'port'),
database=parser.get('database', 'database'))
print(conn.get_dsn_parameters(), "\n")
print('2')
def get_data_from_query(query):
"""return query output"""
cur = conn.cursor()
cur.execute(query)
data_from_query = cur.fetchall()
return data_from_query
def economical_bowlers(year,data_table_1="matches",data_table_2="deliveries"):
query = """select c.bowler , sum(c.sum*6/c.count)
from (select a.bowler ,sum(a.total_runs - a.bye_runs - a.legbye_runs ),count(a.ball)
from {2} a inner join {1} b
on a.match_id=b.id
where b.season='{0}'
group by a.bowler)
c group by c.bowler;""".format(year,data_table_1,data_table_2)
data = get_data_from_query(query)
return dict(data)
.py unittest file
import unittest
import os
import sys
sys.path.insert(2, os.path.join(os.getcwd(), '..'))
import economical_bowlers
from Extractor import extractor
import ipl_project_sql
class Economical_bowlers(unittest.TestCase):
def test_economical_bowlers(self):
expected_matches_data_of_all_teams = {'AD Russell': 10, 'P Kumar': 6}
calculated_matches_data_of_all_teams = ipl_project_sql.economical_bowlers('2015', data_table_1='matches',
data_table_2='deliveries')
self.assertEqual(expected_matches_data_of_all_teams,
calculated_matches_data_of_all_teams)
if __name__ == '__main__':
unittest.main()
Both files work perfectly when are independent.The second file just test the first files output.I just want to remove error.
I had one config file for encryption which needed to be in safe folder as that of test folder.
Related
When I tried to run my multiprocessing script in terminal, I keep getting this error message:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
This is my script:
import time
from multiprocessing import Pool
from multiprocessing import freeze_support
import getpass
import jaydebeapi
import pandas as pd
import numpy as np
from multiprocessing import Process, freeze_support, set_start_method
def test(first_evnt, last_evnt):
PML_loan_Query = "select b.id_lpa_alt_loan from udbadm.pml_lst_cmpltd_trans_mtch a join udbadm.lpa_altv_loan_idtn b on a.id_evnt = b.id_evnt where b.cd_lpa_alt_loan_idtn = 'HewlettPackardGeneratedTransaction' and a.id_evnt BETWEEN ? AND ?"
PML.execute(PML_loan_Query,(first_evnt, last_evnt))
loan_records = PML.fetchall()
df = pd.DataFrame()
for x in loan_records:
# Populating the ODS table
#borr_query = "SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(100)) AS cd_idx, CAST(rate_curr_int AS INT) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS INT) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS INT) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS INT) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
borr_query = 'SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(10)) AS cd_idx, CAST(rate_curr_int AS VARCHAR(10)) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS VARCHAR(10)) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS VARCHAR(10)) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS VARCHAR(10)) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus IN (?)'
#borr_query = "SELECT DISTINCT nbr_aus FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
ODS.execute(borr_query, x)
#ODS.execute(ODS_list)
ODS_records = ODS.fetchall()
ODS_records = df.append(pd.DataFrame(ODS_records, columns = ['nbr_aus', 'cd_idx', 'rate_curr_int', 'rate_gr_mrtg_mrgn', 'rate_loln_max_cap', 'rate_perdc_cap']))
return ODS_records
if __name__ == '__main__':
freeze_support()
pw = getpass.getpass(prompt="Password", stream=False)
# establishing database to the ODS database
ODS = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver','jdbc:db2://he3qlxvtdbs351.fhlmc.com:50001/DB2QLTY', ['f408195', pw],'C:/JDBC/db2jcc.jar')
# Allows SQL statements between the ODS database
ODS = ODS.cursor()
# creating the password needed to establish PML database connection
pw_2 = getpass.getpass(prompt="Password", stream=False)
# establishing database to the PML database
PML = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver','jdbc:db2://he3qlxvtdbs957.fhlmc.com:50001/PMLFDB2', ['f408195', pw_2],'C:/JDBC/db2jcc.jar')
# Allows SQL statements between the PML database
PML = PML.cursor()
first_evnt = 155643917
last_evnt = 155684481
p = Pool()
result = p.map(test, [first_evnt, last_evnt])
print(result)
p.close()
p.join()
I have been trying to patch the list_blobs() function of ContainerClient, have not been able to do this successfully, this code outputs a MagicMock() function - but the function isn't patched as I would expect it to be (Trying to patch with a list ['Blob1', 'Blob2'].
#################Script File
import sys
from datetime import datetime, timedelta
import pyspark
import pytz
import yaml
# from azure.storage.blob import BlobServiceClient, ContainerClient
from pyspark.dbutils import DBUtils as dbutils
import azure.storage.blob
# Open Config
def main():
spark_context = pyspark.SparkContext.getOrCreate()
spark_context.addFile(sys.argv[1])
stream = None
stream = open(sys.argv[1], "r")
config = yaml.load(stream, Loader=yaml.FullLoader)
stream.close()
account_key = dbutils.secrets.get(scope=config["Secrets"]["Scope"], key=config["Secrets"]["Key Name"])
target_container = config["Storage Configuration"]["Container"]
target_account = config["Storage Configuration"]["Account"]
days_history_to_keep = config["Storage Configuration"]["Days History To Keep"]
connection_string = (
"DefaultEndpointsProtocol=https;AccountName="
+ target_account
+ ";AccountKey="
+ account_key
+ ";EndpointSuffix=core.windows.net"
)
blob_service_client: azure.storage.blob.BlobServiceClient = (
azure.storage.blob.BlobServiceClient.from_connection_string(connection_string)
)
container_client: azure.storage.blob.ContainerClient = (
blob_service_client.get_container_client(target_container)
)
blobs = container_client.list_blobs()
print(blobs)
print(blobs)
utc = pytz.UTC
delete_before_date = utc.localize(
datetime.today() - timedelta(days=days_history_to_keep)
)
for blob in blobs:
if blob.creation_time < delete_before_date:
print("Deleting Blob: " + blob.name)
container_client.delete_blob(blob, delete_snapshots="include")
if __name__ == "__main__":
main()
#################Test File
import unittest
from unittest import mock
import DeleteOldBlobs
class DeleteBlobsTest(unittest.TestCase):
def setUp(self):
pass
#mock.patch("DeleteOldBlobs.azure.storage.blob.ContainerClient")
#mock.patch("DeleteOldBlobs.azure.storage.blob.BlobServiceClient")
#mock.patch("DeleteOldBlobs.dbutils")
#mock.patch("DeleteOldBlobs.sys")
#mock.patch('DeleteOldBlobs.pyspark')
def test_main(self, mock_pyspark, mock_sys, mock_dbutils, mock_blobserviceclient, mock_containerclient):
# mock setup
config_file = "Delete_Old_Blobs_UnitTest.yml"
mock_sys.argv = ["unused_arg", config_file]
mock_dbutils.secrets.get.return_value = "A Secret"
mock_containerclient.list_blobs.return_value = ["ablob1", "ablob2"]
# execute test
DeleteOldBlobs.main()
# TODO assert actions taken
# mock_sys.argv.__get__.assert_called_with()
# dbutils.secrets.get(scope=config['Secrets']['Scope'], key=config['Secrets']['Key Name'])
if __name__ == "__main__":
unittest.main()
Output:
<MagicMock name='BlobServiceClient.from_connection_string().get_container_client().list_blobs()' id='1143355577232'>
What am I doing incorrectly here?
I'm not able to execute your code in this moment, but I have tried to simulate it. To do this I have created the following 3 files in the path: /<path-to>/pkg/sub_pkg1 (where pkg and sub_pkg1 are packages).
File ContainerClient.py
def list_blobs(self):
return "blob1"
File DeleteOldBlobs.py
from pkg.sub_pkg1 import ContainerClient
# Open Config
def main():
blobs = ContainerClient.list_blobs()
print(blobs)
print(blobs)
File DeleteBlobsTest.py
import unittest
from unittest import mock
from pkg.sub_pkg1 import DeleteOldBlobs
class DeleteBlobsTest(unittest.TestCase):
def setUp(self):
pass
def test_main(self):
mock_containerclient = mock.MagicMock()
with mock.patch("DeleteOldBlobs.ContainerClient.list_blobs", mock_containerclient.list_blobs):
mock_containerclient.list_blobs.return_value = ["ablob1", "ablob2"]
DeleteOldBlobs.main()
if __name__ == '__main__':
unittest.main()
If you execute the test code you obtain the output:
['ablob1', 'ablob2']
['ablob1', 'ablob2']
This output means that the function list_blobs() is mocked by mock_containerclient.list_blobs.
I don't know if the content of this post can be useful for you, but I'm not able to simulate better your code in this moment.
I hope you can inspire to my code to find your real solution.
The structure of the answer didn't match my solution, perhaps both will work but it was important for me to patch pyspark even though i never call it, or exceptions would get thrown when my code tried to interact with spark.
Perhaps this will be useful to someone:
#mock.patch("DeleteOldBlobs.azure.storage.blob.BlobServiceClient")
#mock.patch("DeleteOldBlobs.dbutils")
#mock.patch("DeleteOldBlobs.sys")
#mock.patch('DeleteOldBlobs.pyspark')
def test_list_blobs_called_once(self, mock_pyspark, mock_sys, mock_dbutils, mock_blobserviceclient):
# mock setup
config_file = "Delete_Old_Blobs_UnitTest.yml"
mock_sys.argv = ["unused_arg", config_file]
account_key = 'Secret Key'
mock_dbutils.secrets.get.return_value = account_key
bsc_mock: mock.Mock = mock.Mock()
container_client_mock = mock.Mock()
blob1 = Blob('newblob', datetime.today())
blob2 = Blob('oldfile', datetime.today() - timedelta(days=20))
container_client_mock.list_blobs.return_value = [blob1, blob2]
bsc_mock.get_container_client.return_value = container_client_mock
mock_blobserviceclient.from_connection_string.return_value = bsc_mock
# execute test
DeleteOldBlobs.main()
#Assert Results
container_client_mock.list_blobs.assert_called_once()
I'm working on the PoC about Amundsen Lyft for introducing data catalog.
So, I use the file 'sample_mysql_loader.py' and insert database information.
OS: Ubuntu 18.04
Python: 3.6.9
pip: 21.3.1
elasticsearch: 8.0.0
neo4j: 3.5.26
I installed Amundsen Lyft using quickstart on Docker
import logging
import sys
import textwrap
import uuid
from elasticsearch import Elasticsearch
from pyhocon import ConfigFactory
from sqlalchemy.ext.declarative import declarative_base
from databuilder.extractor.mysql_metadata_extractor import MysqlMetadataExtractor
from databuilder.extractor.neo4j_extractor import Neo4jExtractor
from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor
from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor
from databuilder.job.job import DefaultJob
from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader
from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader
from databuilder.publisher import neo4j_csv_publisher
from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher
from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher
from databuilder.task.task import DefaultTask
from databuilder.transformer.base_transformer import NoopTransformer
import pymysql
pymysql.install_as_MySQLdb()
es_host = None
neo_host = None
if len(sys.argv) > 1:
es_host = sys.argv[1]
if len(sys.argv) > 2:
neo_host = sys.argv[2]
es = Elasticsearch([
{'host': es_host or 'localhost'},
])
DB_FILE = '/tmp/test.db'
SQLITE_CONN_STRING = 'sqlite:////tmp/test.db'
Base = declarative_base()
NEO4J_ENDPOINT = f'bolt://{neo_host or "localhost"}:7687'
neo4j_endpoint = NEO4J_ENDPOINT
neo4j_user = 'neo4j'
neo4j_password = 'test'
LOGGER = logging.getLogger(__name__)
# todo: connection string needs to change
def connection_string():
***user = 'user_name'
password='password'
host = 'host'
port = 'port'
db = 'db_name'***
return "mysql://%s:%s#%s:%s/%s" % (user, ***password,*** host, port, db)
def run_mysql_job():
where_clause_suffix = textwrap.dedent("""
where c.table_schema = '***db_name***'
""")
tmp_folder = '/var/tmp/amundsen/table_metadata'
node_files_folder = f'{tmp_folder}/nodes/'
relationship_files_folder = f'{tmp_folder}/relationships/'
job_config = ConfigFactory.from_dict({
f'extractor.mysql_metadata.{MysqlMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
f'extractor.mysql_metadata.{MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
f'extractor.mysql_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag', # should use unique tag here like {ds}
})
job = DefaultJob(conf=job_config,
task=DefaultTask(extractor=MysqlMetadataExtractor(), loader=FsNeo4jCSVLoader()),
publisher=Neo4jCsvPublisher())
return job
def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
elasticsearch_doc_type_key='table',
model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
cypher_query=None,
elasticsearch_mapping=None):
"""
:param elasticsearch_index_alias: alias for Elasticsearch used in
amundsensearchlibrary/search_service/config.py as an index
:param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
`table_search_index`
:param model_name: the Databuilder model class used in transporting between Extractor and Loader
:param cypher_query: Query handed to the `Neo4jSearchDataExtractor` class, if None is given (default)
it uses the `Table` query baked into the Extractor
:param elasticsearch_mapping: Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
if None is given (default) it uses the `Table` query baked into the Publisher
"""
# loader saves data to this location and publisher reads it from here
extracted_search_data_path = '/var/tmp/amundsen/search_data.json'
task = DefaultTask(loader=FSElasticsearchJSONLoader(),
extractor=Neo4jSearchDataExtractor(),
transformer=NoopTransformer())
# elastic search client instance
elasticsearch_client = es
# unique name of new index in Elasticsearch
elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
job_config = ConfigFactory.from_dict({
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name,
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}':
elasticsearch_client,
f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}':
elasticsearch_new_index_key,
f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}':
elasticsearch_doc_type_key,
f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}':
elasticsearch_index_alias,
})
# only optionally add these keys, so need to dynamically `put` them
if cypher_query:
job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}',
cypher_query)
if elasticsearch_mapping:
job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
elasticsearch_mapping)
job = DefaultJob(conf=job_config,
task=task,
publisher=ElasticsearchPublisher())
return job
if __name__ == "__main__":
# Uncomment next line to get INFO level logging
# logging.basicConfig(level=logging.INFO)
loading_job = run_mysql_job()
loading_job.launch()
job_es_table = create_es_publisher_sample_job(
elasticsearch_index_alias='table_search_index',
elasticsearch_doc_type_key='table',
model_name='databuilder.models.table_elasticsearch_document.TableESDocument')
job_es_table.launch()
But, I got the error message below. How can I connect my database to Amundsen Lyft?
elasticsearch.exceptions.RequestError: RequestError(400, 'mapper_parsing_exception', 'Root mapping definition has unsupported parameters: [table : {properties={schema={analyzer=simple, type=text, fields={raw={type=keyword}}}, cluster={type=text}, description={analyzer=simple, type=text}, display_name={type=keyword}, column_descriptions={analyzer=simple, type=text}, programmatic_descriptions={analyzer=simple, type=text}, tags={type=keyword}, badges={type=keyword}, database={analyzer=simple, type=text, fields={raw={type=keyword}}}, total_usage={type=long}, name={analyzer=simple, type=text, fields={raw={type=keyword}}}, last_updated_timestamp={format=epoch_second, type=date}, unique_usage={type=long}, column_names={analyzer=simple, type=text, fields={raw={type=keyword}}}, key={type=keyword}}}]')
My first File "sk_read_write.py" is as follows:
from spark_conn import *
from Table_structure import *
class read_write1:
def sk_read_write1(self,schema,spark):
df3 = spark.read.option("multiline", "true").option("mode", "PERMISSIVE").schema(schema).json(
"C:\\Users\\komu0\\Desktop\\Read\\*.json")
print(Spark_connect_1.connection())
df3.write.format('jdbc').options( url= Spark_connect_1.connection+str(connection._port),
driver='com.mysql.cj.jdbc.Driver',
dbtable='sparktable',
user=connection._username,
password=Spark_connect_1.connection._pass).\
mode('append').save()
My Other file is spark_conn.py:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,BooleanType,DoubleType
from aws_config import *
from Table_structure import*
class Spark_connect_1:
dict1 = get_secret()
def connection(self):
dict1 = get_secret()
_username = dict1['username']
_pass = dict1['password']
_host = dict1['host']
_port = dict1['port']
_dbname = dict1['dbname']
spark = SparkSession.builder \
.master("local[1]") \
.appName("JSON_MYSQL") \
.getOrCreate()
return spark
I want to use the variable in " _port" into sk_read_write file.
I have tried to import spark_conn into sk_read_write file and use
"Spark_connect_1.connection._port" (to get port name) but not working, please suggest how to proceed
You can access the port name by using a class variable for _port
Example
base_file.py
class ABC:
_port = "Some value"
def sample(self):
ABC._port = "another value"
print("Value of port is {}".format(ABC._port))
test_file.py
from base_file import ABC
#before changing value:
before = ABC()
before.sample()
I want to do incremental import from user_location_history and after incremental import want to save the last id of in the user_location_updated,so that it can get automated for future.
#!/usr/bin/python
import subprocess
import time
import subprocess
import MySQLdb
import datetime
import sys
import pytz
import os
from subprocess import call
def get_mysql_cursor():
conn_1 = MySQLdb.connect(user='db', passwd='bazookadb', host='10.216.204.20', db='bazooka')
conn_2 = MySQLdb.connect(user='db', passwd='bazookadb', host='10.216.204.7', db='bazooka')
#print conn_1,conn_2
return conn_1.cursor(),conn_2.cursor()
def get_records():
cur_1,cur_2 = get_mysql_cursor()
cur_1.execute("select updated from user_location_updated")
cur_2.execute("select max(moving_date) from user_location_history")
return cur_1.fetchone(),cur_2.fetchone()
def update_records(update_date):
cur_1,cur_2 = get_mysql_cursor()
print update_date
query = "update user_location_updated set updated = '"+str(update_date)+"' where id='1' "
print query
result = cur_1.execute(query)
print result
result = get_records()
update_result = update_records(result[1][0])
print result[0][0]
print result[1][0]
sqoopcom = "sqoop import --connect jdbc:mysql://10.216.204.7:3306/bazooka --username db --password bazookadb --fields-terminated-by , --escaped-by \\ --enclosed-by '\"' --table user_location_history -m 1 --hive-delims-replacement ' ' --as-textfile --incremental append --check-column moving_date --last-value 2016-08-04 19:00:36 --target-dir hdfs://example:9000/user/bigdata/sqoopip --verbose"
#os.system(sqoopcom)
exec (sqoopcom)
----but this code is giving error
Wrap --last-value in single quotes.
Use --last-value '2016-08-04 19:00:36'