My first File "sk_read_write.py" is as follows:
from spark_conn import *
from Table_structure import *
class read_write1:
    """Reads JSON files with a fixed schema and appends them to MySQL via JDBC."""

    def sk_read_write1(self,schema,spark):
        # Load every JSON file in the Read folder; PERMISSIVE keeps malformed
        # records instead of failing the whole read.
        df3 = spark.read.option("multiline", "true").option("mode", "PERMISSIVE").schema(schema).json(
            "C:\\Users\\komu0\\Desktop\\Read\\*.json")
        # NOTE(review): connection is an instance method, so calling it on the
        # class with no instance raises TypeError (missing `self`).
        print(Spark_connect_1.connection())
        # NOTE(review): the options below look broken —
        #  * `Spark_connect_1.connection` is a function object, so
        #    `connection + str(...)` raises TypeError (the method is never called);
        #  * `connection._port` / `connection._username` reference a name
        #    `connection` that is not defined in this module;
        #  * `Spark_connect_1.connection._pass` reads an attribute off the
        #    method object, which does not exist.
        # The credentials should be exposed as class attributes on
        # Spark_connect_1 and read from there.
        df3.write.format('jdbc').options( url= Spark_connect_1.connection+str(connection._port),
            driver='com.mysql.cj.jdbc.Driver',
            dbtable='sparktable',
            user=connection._username,
            password=Spark_connect_1.connection._pass).\
            mode('append').save()
My Other file is spark_conn.py:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,BooleanType,DoubleType
from aws_config import *
from Table_structure import*
class Spark_connect_1:
    """Builds a local SparkSession and exposes DB credentials as class attributes.

    Fix: the original ``connection()`` assigned the credentials to *local*
    variables, so other modules could never reach ``_port`` etc.  They are now
    stored on the class, so callers can read ``Spark_connect_1._port`` after
    ``connection()`` has run.
    """

    # Fetched once at class-creation time (kept from the original code).
    dict1 = get_secret()

    # Placeholders until connection() populates them.
    _username = None
    _pass = None
    _host = None
    _port = None
    _dbname = None

    def connection(self):
        """Load DB credentials into class attributes and return a SparkSession."""
        dict1 = get_secret()
        Spark_connect_1._username = dict1['username']
        Spark_connect_1._pass = dict1['password']
        Spark_connect_1._host = dict1['host']
        Spark_connect_1._port = dict1['port']
        Spark_connect_1._dbname = dict1['dbname']
        # NOTE(review): SparkSession is not imported explicitly in this
        # snippet — presumably pulled in by one of the wildcard imports; confirm.
        spark = SparkSession.builder \
            .master("local[1]") \
            .appName("JSON_MYSQL") \
            .getOrCreate()
        return spark
I want to use the variable "_port" in the sk_read_write file.
I have tried importing spark_conn into the sk_read_write file and using
"Spark_connect_1.connection._port" (to get the port), but it is not working — please suggest how to proceed.
You can access the port name by using a class variable for _port
Example
base_file.py
class ABC:
    # Shared, class-level attribute; every instance sees the same value.
    _port = "Some value"

    def sample(self):
        """Overwrite the shared port value on the class, then report it."""
        ABC._port = "another value"
        message = "Value of port is {}".format(ABC._port)
        print(message)
test_file.py
from base_file import ABC

# Calling sample() mutates the class-level ABC._port ("Some value" ->
# "another value"), so the change is visible to every importer of base_file.
#before changing value:
before = ABC()
before.sample()
Related
I have been trying to patch the list_blobs() function of ContainerClient, but have not been able to do so successfully: this code outputs a MagicMock object, and the function isn't patched as I would expect it to be (I am trying to patch it so that it returns the list ['Blob1', 'Blob2']).
#################Script File
import sys
from datetime import datetime, timedelta
import pyspark
import pytz
import yaml
# from azure.storage.blob import BlobServiceClient, ContainerClient
from pyspark.dbutils import DBUtils as dbutils
import azure.storage.blob
# Open Config
def main():
    """Delete blobs older than the configured retention from an Azure container.

    Reads a YAML config (path passed as argv[1]), fetches the storage account
    key from a Databricks secret scope, then deletes every blob whose
    creation_time predates the cutoff.
    """
    spark_context = pyspark.SparkContext.getOrCreate()
    spark_context.addFile(sys.argv[1])
    # Context manager replaces the manual open/close (and the redundant
    # `stream = None`) so the file is closed even if YAML parsing raises.
    with open(sys.argv[1], "r") as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)
    account_key = dbutils.secrets.get(scope=config["Secrets"]["Scope"], key=config["Secrets"]["Key Name"])
    target_container = config["Storage Configuration"]["Container"]
    target_account = config["Storage Configuration"]["Account"]
    days_history_to_keep = config["Storage Configuration"]["Days History To Keep"]
    connection_string = (
        "DefaultEndpointsProtocol=https;AccountName="
        + target_account
        + ";AccountKey="
        + account_key
        + ";EndpointSuffix=core.windows.net"
    )
    blob_service_client: azure.storage.blob.BlobServiceClient = (
        azure.storage.blob.BlobServiceClient.from_connection_string(connection_string)
    )
    container_client: azure.storage.blob.ContainerClient = (
        blob_service_client.get_container_client(target_container)
    )
    blobs = container_client.list_blobs()
    print(blobs)  # the original printed this twice; once is enough
    utc = pytz.UTC
    # Cutoff must be timezone-aware because blob.creation_time is aware.
    delete_before_date = utc.localize(
        datetime.today() - timedelta(days=days_history_to_keep)
    )
    for blob in blobs:
        if blob.creation_time < delete_before_date:
            print("Deleting Blob: " + blob.name)
            container_client.delete_blob(blob, delete_snapshots="include")


if __name__ == "__main__":
    main()
#################Test File
import unittest
from unittest import mock
import DeleteOldBlobs
class DeleteBlobsTest(unittest.TestCase):
    """Unit tests for DeleteOldBlobs.main with its external services patched out."""

    def setUp(self):
        pass

    # Fix: the decorators were garbled to '#mock.patch(...)' in the paste —
    # they must be '@mock.patch(...)' for the patches to apply at all.
    # Decorators apply bottom-up, so the bottom patch ('pyspark') maps to the
    # first mock argument.
    @mock.patch("DeleteOldBlobs.azure.storage.blob.ContainerClient")
    @mock.patch("DeleteOldBlobs.azure.storage.blob.BlobServiceClient")
    @mock.patch("DeleteOldBlobs.dbutils")
    @mock.patch("DeleteOldBlobs.sys")
    @mock.patch('DeleteOldBlobs.pyspark')
    def test_main(self, mock_pyspark, mock_sys, mock_dbutils, mock_blobserviceclient, mock_containerclient):
        # mock setup
        config_file = "Delete_Old_Blobs_UnitTest.yml"
        mock_sys.argv = ["unused_arg", config_file]
        mock_dbutils.secrets.get.return_value = "A Secret"
        # NOTE(review): main() gets its client via
        # BlobServiceClient.from_connection_string(...).get_container_client(...),
        # so setting list_blobs on the ContainerClient *class* mock never
        # reaches the instance main() actually uses — that is why the patch
        # appears not to take effect.
        mock_containerclient.list_blobs.return_value = ["ablob1", "ablob2"]
        # execute test
        DeleteOldBlobs.main()
        # TODO assert actions taken
        # mock_sys.argv.__get__.assert_called_with()
        # dbutils.secrets.get(scope=config['Secrets']['Scope'], key=config['Secrets']['Key Name'])


if __name__ == "__main__":
    unittest.main()
Output:
<MagicMock name='BlobServiceClient.from_connection_string().get_container_client().list_blobs()' id='1143355577232'>
What am I doing incorrectly here?
I'm not able to execute your code at the moment, but I have tried to simulate it. To do this I created the following 3 files in the path /<path-to>/pkg/sub_pkg1 (where pkg and sub_pkg1 are packages).
File ContainerClient.py
def list_blobs(self):
    """Stand-in for ContainerClient.list_blobs; always returns the same name."""
    blob_name = "blob1"
    return blob_name
File DeleteOldBlobs.py
from pkg.sub_pkg1 import ContainerClient
# Open Config
def main():
    """Fetch the blob listing once and echo it twice, as the real script does."""
    listing = ContainerClient.list_blobs()
    for _ in range(2):
        print(listing)
File DeleteBlobsTest.py
import unittest
from unittest import mock
from pkg.sub_pkg1 import DeleteOldBlobs
class DeleteBlobsTest(unittest.TestCase):
    """main() should print the patched blob list instead of the real one."""

    def setUp(self):
        pass

    def test_main(self):
        # Configure the fake before installing it; the patch swaps the real
        # list_blobs for our mock only inside the with-block.
        fake_client = mock.MagicMock()
        fake_client.list_blobs.return_value = ["ablob1", "ablob2"]
        with mock.patch("DeleteOldBlobs.ContainerClient.list_blobs", fake_client.list_blobs):
            DeleteOldBlobs.main()


if __name__ == '__main__':
    unittest.main()
If you execute the test code you obtain the output:
['ablob1', 'ablob2']
['ablob1', 'ablob2']
This output means that the function list_blobs() is mocked by mock_containerclient.list_blobs.
I don't know whether the content of this post will be useful for you, as I'm not able to simulate your code more faithfully at the moment.
I hope it can inspire you to find your real solution.
The structure of that answer didn't match my solution — perhaps both will work — but it was important for me to patch pyspark even though I never call it directly; otherwise exceptions would be thrown when my code tried to interact with Spark.
Perhaps this will be useful to someone:
# Fix: decorators restored from the garbled '#mock.patch' lines in the paste;
# they apply bottom-up, so 'pyspark' maps to the first mock argument.
@mock.patch("DeleteOldBlobs.azure.storage.blob.BlobServiceClient")
@mock.patch("DeleteOldBlobs.dbutils")
@mock.patch("DeleteOldBlobs.sys")
@mock.patch('DeleteOldBlobs.pyspark')
def test_list_blobs_called_once(self, mock_pyspark, mock_sys, mock_dbutils, mock_blobserviceclient):
    """main() should list the container's blobs exactly once."""
    # mock setup
    config_file = "Delete_Old_Blobs_UnitTest.yml"
    mock_sys.argv = ["unused_arg", config_file]
    account_key = 'Secret Key'
    mock_dbutils.secrets.get.return_value = account_key
    # Wire the full call chain main() uses:
    # from_connection_string() -> service client -> get_container_client()
    # -> container client -> list_blobs().
    bsc_mock: mock.Mock = mock.Mock()
    container_client_mock = mock.Mock()
    # NOTE(review): `Blob`, `datetime` and `timedelta` are not defined in this
    # snippet — presumably imported elsewhere in the full test module; confirm.
    blob1 = Blob('newblob', datetime.today())
    blob2 = Blob('oldfile', datetime.today() - timedelta(days=20))
    container_client_mock.list_blobs.return_value = [blob1, blob2]
    bsc_mock.get_container_client.return_value = container_client_mock
    mock_blobserviceclient.from_connection_string.return_value = bsc_mock
    # execute test
    DeleteOldBlobs.main()
    # Assert Results
    container_client_mock.list_blobs.assert_called_once()
I'm trying to use aws encryption sdk to encrypt pyspark columns but running into this error. Here's the code
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, lit
from pyspark.context import SparkContext
import aws_encryption_sdk
from aws_encryption_sdk import CommitmentPolicy
# AWS Encryption SDK client; this commitment policy requires key commitment
# on both encrypt and decrypt.
client = aws_encryption_sdk.EncryptionSDKClient(
    commitment_policy=CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT
)
# NOTE(review): `key_arn` is not defined anywhere in this snippet — it must be
# set before this line runs; confirm against the full script.
kms_kwargs = dict(key_ids=[key_arn])
# `global` at module level is a no-op; kept as in the original.
global master_key_provider
master_key_provider = aws_encryption_sdk.StrictAwsKmsMasterKeyProvider(**kms_kwargs)
# NOTE(review): `spark` is likewise undefined here — a SparkSession is assumed.
df = spark.read.csv('test.csv', inferSchema = True, header = True)


def encrypt_string(text):
    # Encrypt one value with the KMS-backed master key provider.
    # NOTE(review): `client` and `master_key_provider` are captured from the
    # driver process; they are generally not picklable, which is a common
    # cause of UDF serialization errors with this SDK — confirm against the
    # actual error message (not shown in the post).
    encrypted_text, encryptor_header = client.encrypt(
        source=text, key_provider=master_key_provider
    )
    return encrypted_text


udf_encrypt = udf(lambda text: encrypt_string(text))


def spark_encrypt(df, colmn):
    # Add a "segment_encrypt" column holding the encrypted value of *colmn*.
    return df.withColumn("segment_encrypt", udf_encrypt(col(colmn)))


df_out = spark_encrypt(df, "segment")
Is there a way to resolve this ?
I find it pretty confusing using a broadcasted variable inside a UDF from an imported function. Say I make a broadcasted variable inside an imported function from the main file. It works if I have a UDF defined inside the function (second_func) but not outside (third_func).
Why is this happening?
Are UDFs advised being defined inside the function that calls it?
# test_utils.py
from pyspark.sql import types as T
from pyspark.sql import functions as F
# Fix: '#F.udf' in the paste is a garbled '@F.udf' decorator — restored.
@F.udf(T.StringType())
def do_smth_out():
    """UDF returning broadcasted.value["a"].

    NOTE(review): `broadcasted` is not defined at module scope in test_utils,
    so this UDF fails at execution time when invoked via third_func — which is
    the behavior the question asks about.
    """
    return broadcasted.value["a"]
def second_func(spark, df):
    """Broadcast a dict and apply a UDF defined in the same closure (works).

    The inner UDF captures `broadcasted` from this function's scope, so the
    name resolves when the UDF runs.
    """

    # Fix: '#F.udf' restored to the '@F.udf' decorator garbled in the paste.
    @F.udf(T.StringType())
    def do_smth_in():
        # Closure capture of `broadcasted` from the enclosing scope.
        return broadcasted.value["a"]

    data = {"a": "c"}
    sc = spark.sparkContext
    broadcasted = sc.broadcast(data)
    return df.withColumn("a", do_smth_in())
def third_func(spark, df):
    """Broadcast a dict, then apply the module-level UDF do_smth_out."""
    payload = {"a": "c"}
    # Local name; do_smth_out looks `broadcasted` up in ITS module globals,
    # not here (the point of the question's third trial).
    broadcasted = spark.sparkContext.broadcast(payload)
    return df.withColumn("a", do_smth_out())
# main.py
from pyspark.sql import types as T
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from test_utils import first_func, second_func
# Fix: '#F.udf' restored to the '@F.udf' decorator garbled in the paste.
@F.udf(T.StringType())
def do_smth():
    # Reads the broadcast variable defined at module level in main.py below.
    return broadcasted.value["a"]
if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .getOrCreate()
    sc = spark.sparkContext
    columns = ["language","users_count"]
    data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
    df = sc.parallelize(data).toDF(columns)
    # The broadcast lives in main.py's module globals, so only UDFs defined
    # in THIS module can resolve the name `broadcasted` when they run.
    broadcasted = sc.broadcast({"a": "c"})
    print("First trial")
    df.withColumn("a", do_smth()).show()
    # Works
    print("Second trial")
    second_func(spark, df).show()
    # Works
    print("Third trial")
    third_func(spark, df).show()
    # Doesn't work: third_func applies do_smth_out from test_utils, which
    # looks up `broadcasted` in test_utils' globals — presumably failing with
    # a NameError at execution time; confirm with the actual traceback.
Getting "configparser.NoSectionError: No section: 'database'" error while running from unnitest file.
.py file
import psycopg2 as pg2
from configparser import ConfigParser
import os

parser = ConfigParser()
# Fix: anchor dev.ini to this file's directory instead of the current working
# directory.  When the module is imported from the test folder, the relative
# path 'dev.ini' is not found; ConfigParser.read() silently ignores missing
# files, so every parser.get() then raises NoSectionError('database').
configFilePath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dev.ini')
parser.read(configFilePath)
print('1')
# Module-level connection shared by the query helpers below.
conn = pg2.connect(user=parser.get('database', 'user'),
                   password=parser.get('database', 'password'),
                   host=parser.get('database', 'host'),
                   port=parser.get('database', 'port'),
                   database=parser.get('database', 'database'))
print(conn.get_dsn_parameters(), "\n")
print('2')
def get_data_from_query(query):
    """Run *query* on the module-level connection and return all rows.

    Fix: the cursor was never closed; release it even if execute/fetch raises.
    """
    cur = conn.cursor()
    try:
        cur.execute(query)
        data_from_query = cur.fetchall()
        return data_from_query
    finally:
        cur.close()
def economical_bowlers(year,data_table_1="matches",data_table_2="deliveries"):
    """Return a {bowler: rate} dict for the given season.

    The inner query sums runs conceded (excluding byes/leg-byes) and counts
    balls per bowler; the outer query scales by 6/ball-count.  Placeholder
    mapping: {0}=year, {1}=data_table_1, {2}=data_table_2.
    NOTE(review): arguments are interpolated straight into the SQL string —
    only safe for trusted inputs.
    """
    query = """select c.bowler , sum(c.sum*6/c.count)
from (select a.bowler ,sum(a.total_runs - a.bye_runs - a.legbye_runs ),count(a.ball)
from {2} a inner join {1} b
on a.match_id=b.id
where b.season='{0}'
group by a.bowler)
c group by c.bowler;""".format(year,data_table_1,data_table_2)
    data = get_data_from_query(query)
    return dict(data)
.py unittest file
import unittest
import os
import sys
# Make the parent directory importable so the modules under test resolve.
# NOTE(review): inserting at index 2 (after two existing entries) is unusual —
# confirm the intent; index 0 is the common choice.
sys.path.insert(2, os.path.join(os.getcwd(), '..'))
import economical_bowlers
from Extractor import extractor
import ipl_project_sql
class Economical_bowlers(unittest.TestCase):
    """Checks economical_bowlers against a known 2015 result."""

    def test_economical_bowlers(self):
        expected = {'AD Russell': 10, 'P Kumar': 6}
        actual = ipl_project_sql.economical_bowlers(
            '2015', data_table_1='matches', data_table_2='deliveries')
        self.assertEqual(expected, actual)


if __name__ == '__main__':
    unittest.main()
Both files work perfectly when run independently. The second file just tests the first file's output. I just want to remove the error.
I had one config file for encryption, which needed to be in the same folder as the test folder.
Python calling the Ansible API through Celery returns None; I have been searching for a few days. It works well when I call the deploy function without Celery, but with Celery my call into the Ansible API returns None.
Steps to reproduce:
1.tasks.py
from celery import shared_task
from .deploy_tomcat2 import django_process
# Fix: '#shared_task' restored to the '@shared_task' decorator garbled in the
# paste; without it Celery never registers this function as a task.
@shared_task
def deploy(jira_num):
    """Celery task: run the Ansible-based tomcat deploy for *jira_num*."""
    #return 'hello world {0}'.format(jira_num)
    #rdb.set_trace()
    return django_process(jira_num)
2.deploy_tomcat2.py
from .AnsibleApi import CallApi
def django_process(jira_num):
    """Run the tomcat deploy when *jira_num* is all digits.

    Returns CallApi.run_task()'s result dict, or None when jira_num is not
    numeric (the original fell through implicitly; made explicit here).
    """
    # Fixed deployment parameters for the target host.
    server = '10.10.10.30'
    name = 'abc'
    port = 11011
    code = 'efs'
    jdk = '1.12.13'
    jvm = 'xxxx'
    # Idiom fix: call isdigit() on the instance rather than via str.isdigit().
    if jira_num.isdigit():
        call = CallApi(server, name, port, code, jdk, jvm)
        return call.run_task()
    return None
3.AnsibleApi.py
#!/usr/bin/env python
import logging
from .Logger import Logger
from django.conf import settings
from collections import namedtuple
from ansible.parsing.dataloader import DataLoader
from ansible.vars import VariableManager
from ansible.inventory import Inventory
from ansible.playbook.play import Play
from ansible.executor.task_queue_manager import TaskQueueManager
from ansible.plugins.callback import CallbackBase
# Module-level logger writing to /tmp/auto_deploy_tomcat.log at INFO level.
Log = Logger('/tmp/auto_deploy_tomcat.log',logging.INFO)
class ResultCallback(CallbackBase):
    """Collects per-host results from an Ansible run into three dicts."""

    def __init__(self, *args, **kwargs):
        super(ResultCallback, self).__init__(*args, **kwargs)
        # One bucket per outcome, keyed by host name.
        self.host_ok = {}
        self.host_unreachable = {}
        self.host_failed = {}

    def v2_runner_on_unreachable(self, result):
        host = result._host.get_name()
        self.host_unreachable[host] = result

    def v2_runner_on_ok(self, result, *args, **kwargs):
        host = result._host.get_name()
        self.host_ok[host] = result

    def v2_runner_on_failed(self, result, *args, **kwargs):
        host = result._host.get_name()
        self.host_failed[host] = result
class CallApi(object):
    """Drives an Ansible play that deploys tomcat to a single host."""

    # Class-level defaults pulled from Django settings.
    user = settings.SSH_USER
    ssh_private_key_file = settings.SSH_PRIVATE_KEY_FILE
    # NOTE(review): this class-level callback is shadowed by the instance
    # attribute assigned in __init__ — confirm which one is intended.
    results_callback = ResultCallback()
    Options = namedtuple('Options',
                         ['connection', 'module_path', 'private_key_file', 'forks', 'become', 'become_method',
                          'become_user', 'check'])

    def __init__(self,ip,name,port,code,jdk,jvm):
        # Deployment parameters for the target host.
        self.ip = ip
        self.name = name
        self.port = port
        self.code = code
        self.jdk = jdk
        self.jvm = jvm
        self.results_callback = ResultCallback()
        self.results_raw = {}

    def _gen_user_task(self):
        """Build the task list: copy the deploy script, then echo the result."""
        tasks = []
        deploy_script = 'autodeploy/tomcat_deploy.sh'
        dst_script = '/tmp/tomcat_deploy.sh'
        cargs = dict(src=deploy_script, dest=dst_script, owner=self.user, group=self.user, mode='0755')
        # NOTE(review): `args` is only consumed by the commented-out 'command'
        # tasks below, so the script is copied but never actually executed.
        args = "%s %s %d %s %s '%s'" % (dst_script, self.name, self.port, self.code, self.jdk, self.jvm)
        tasks.append(dict(action=dict(module='copy', args=cargs),register='shell_out'))
        tasks.append(dict(action=dict(module='debug', args=dict(msg='{{shell_out}}'))))
        # tasks.append(dict(action=dict(module='command', args=args)))
        # tasks.append(dict(action=dict(module='command', args=args), register='result'))
        # tasks.append(dict(action=dict(module='debug', args=dict(msg='{{result.stdout}}'))))
        self.tasks = tasks

    def _set_option(self):
        """Assemble loader/inventory/options and load the play for self.ip."""
        self._gen_user_task()
        self.variable_manager = VariableManager()
        self.loader = DataLoader()
        self.options = self.Options(connection='smart', module_path=None, private_key_file=self.ssh_private_key_file, forks=None,
                                    become=True, become_method='sudo', become_user='root', check=False)
        self.inventory = Inventory(loader=self.loader, variable_manager=self.variable_manager, host_list=[self.ip])
        self.variable_manager.set_inventory(self.inventory)
        play_source = dict(
            name = "auto deploy tomcat",
            hosts = self.ip,
            remote_user = self.user,
            gather_facts='no',
            tasks = self.tasks
        )
        self.play = Play().load(play_source, variable_manager=self.variable_manager, loader=self.loader)

    def run_task(self):
        """Run the play and collect per-host results into self.results_raw."""
        self.results_raw = {'success':{}, 'failed':{}, 'unreachable':{}}
        tqm = None
        # NOTE(review): remote-debugger breakpoint left in — this blocks the
        # Celery worker waiting for a telnet connection; remove for production.
        from celery.contrib import rdb;rdb.set_trace()
        #import pdb;pdb.set_trace()
        self._set_option()
        try:
            tqm = TaskQueueManager(
                inventory=self.inventory,
                variable_manager=self.variable_manager,
                loader=self.loader,
                options=self.options,
                passwords=None,
                stdout_callback=self.results_callback,
            )
            result = tqm.run(self.play)
        finally:
            # Always clean up the task queue manager's worker processes.
            if tqm is not None:
                tqm.cleanup()
        # Bucket the callback's collected results by outcome.
        for host, result in self.results_callback.host_ok.items():
            self.results_raw['success'][host] = result._result
        for host, result in self.results_callback.host_failed.items():
            self.results_raw['failed'][host] = result._result
        for host, result in self.results_callback.host_unreachable.items():
            self.results_raw['unreachable'][host]= result._result
        Log.info("result is :%s" % self.results_raw)
        return self.results_raw
4.celery worker
celery -A jira worker -Q queue.ops.deploy -n "deploy.%h" -l info
5.produce msg:
deploy.apply_async(args=['150'], queue='queue.ops.deploy', routing_key='ops.deploy')
It seems OK.
The only question is: is None really what the deploy task returns?
It would be better if you could post your Celery worker log.
There are two methods to solve this problem by disabling the assert:
1. Where Celery starts, set `export PYTHONOPTIMIZE=1`, OR start Celery with the parameter `-O OPTIMIZATION`.
2. Disable it in the Python multiprocessing package, process.py, line 102:
assert not _current_process._config.get('daemon'), \
'daemonic processes are not allowed to have children'