I am trying to create an Azure Batch job with a task that uses output_files as a task parameter.
tasks = list()
command_task = (r"cmd /c dir")
# Not providing actual property values for security purposes
containerName = r'ContainerName'
azureStorageAccountName = r'AccountName'
azureStorageAccountKey = r'AccountKey'
sas_Token = generate_account_sas(account_name=azureStorageAccountName, account_key=azureStorageAccountKey, resource_types=ResourceTypes(object=True), permission=AccountSasPermissions(read=True, write=True), expiry=datetime.datetime.utcnow() + timedelta(hours=1))
url = f"https://{azureStorageAccountName}.blob.core.windows.net/{containerName}?{sas_Token}"
output_file = batchmodels.OutputFile(
    file_pattern=r"..\std*.txt",
    destination=batchmodels.OutputFileDestination(
        container=batchmodels.OutputFileBlobContainerDestination(container_url=url),
        path="abc"),
    upload_options='taskCompletion')

tasks.append(batchmodels.TaskAddParameter(
    id='Task1',
    display_name='Task1',
    command_line=command_task,
    user_identity=user,
    output_files=[output_file]))
batch_service_client.task.add_collection(job_id, tasks)
On debugging this code I get an exception. But on removing the output_files parameter, everything works fine and the job is created with the task.
I had missed the OutputFileUploadOptions object while creating the OutputFile object; upload_options needs to be an OutputFileUploadOptions instance rather than a plain string:
output_file = batchmodels.OutputFile(
    file_pattern=r"..\std*.txt",
    destination=batchmodels.OutputFileDestination(
        container=batchmodels.OutputFileBlobContainerDestination(container_url=url),
        path="abc"),
    upload_options=batchmodels.OutputFileUploadOptions('taskCompletion'))
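For completeness, here is a sketch of the same OutputFile written with the keyword form of the argument; the upload_condition parameter name and the OutputFileUploadCondition enum are my assumption about the azure-batch models and may differ across SDK versions:

# Sketch only; assumes azure.batch.models exposes OutputFileUploadCondition
# and that `url` is the container SAS URL built above.
output_file = batchmodels.OutputFile(
    file_pattern=r"..\std*.txt",
    destination=batchmodels.OutputFileDestination(
        container=batchmodels.OutputFileBlobContainerDestination(container_url=url),
        path="abc"),
    upload_options=batchmodels.OutputFileUploadOptions(
        upload_condition=batchmodels.OutputFileUploadCondition.task_completion))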
I am new to Azure and dealing with all these paths is proving to be extremely challenging. I am trying to create a pipeline that contains a dataprep.py step and an AutoML step. What I want to do is (after passing the input to the dataprep block and performing several operations on it) save the resulting TabularDataset in the datastore and have it as an output, so that I can reuse it in my train block.
My dataprep.py file
# ----- dataprep stuff and imports -----
parser = argparse.ArgumentParser()
parser.add_argument("--month_train", required=True)
parser.add_argument("--year_train", required=True)
parser.add_argument('--output_path', dest = 'output_path', required=True)
args = parser.parse_args()
run = Run.get_context()
ws = run.experiment.workspace
datastore = ws.get_default_datastore()
name_dataset_input = 'Customer_data_'+str(args.year_train)
name_dataset_output = 'DATA_PREP_'+str(args.year_train)+'_'+str(args.month_train)
# get the input dataset by name
ds = Dataset.get_by_name(ws, name_dataset_input)
df = ds.to_pandas_dataframe()
# apply is one of my dataprep functions that I defined earlier
df = apply(df, args.month_train)
# this is where I am having issues: I want to save this in the datastore but also have it as an output
ds = Dataset.Tabular.register_pandas_dataframe(df, args.output_path ,name_dataset_output)
The pipeline step definition:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
prepped_data_path = OutputFileDatasetConfig(name="output_path", destination = (datastore, 'managed-dataset/{run-id}/{output-name}'))
dataprep_step = PythonScriptStep(
    name="dataprep",
    script_name="dataprep.py",
    compute_target=compute_target,
    runconfig=aml_run_config,
    arguments=["--output_path", prepped_data_path, "--month_train", month_train, "--year_train", year_train],
    allow_reuse=True
)
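For reference, a minimal sketch of one way to wire this up, assuming the OutputFileDatasetConfig above is what arrives in dataprep.py as --output_path (the parquet file name is an arbitrary choice):

# inside dataprep.py: write the prepared dataframe into the mounted output path
import os

os.makedirs(args.output_path, exist_ok=True)
df.to_parquet(os.path.join(args.output_path, "prepped.parquet"))

On the pipeline side, prepped_data_path.read_parquet_files().register_on_complete(name='DATA_PREP_...') should, if I am reading the azureml-data API correctly, register the result as a TabularDataset while still letting the next step consume it as an input.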
I am using confluent-kafka and I need to serialize my keys as strings and produce some messages. I have a working code for the case where I retrieve the schema from the schema registry and use it to produce a message. The problem is that it fails when I am trying to read the schema from a local file instead.
The code below is the working one for the schema registry:
import argparse
from confluent_kafka.schema_registry.avro import AvroSerializer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka import SerializingProducer
import avro.schema
SCHEMA_HOST = '192.168.40.10'
TOPIC = 'my_topic'
SCHEMA = 'path/to/schema.avsc'
# Just parse arguments
parser = argparse.ArgumentParser(description="Avro Kafka Generator")
parser.add_argument('--schema_registry_host', default=SCHEMA_HOST, help="schema registry host")
parser.add_argument('--schema', type=str, default=SCHEMA, help="schema to produce under")
parser.add_argument('--topic', type=str, default=TOPIC, help="topic to publish to")
parser.add_argument('--frequency', type=float, default=1.0, help="number of message per second")
args = parser.parse_args()
# Actual code
schema_registry_conf = {'url': "http://{}:8081".format(SCHEMA_HOST)}
schema_registry_client = SchemaRegistryClient(schema_registry_conf)
schema = schema_registry_client.get_latest_version(subject_name=TOPIC + "-value")
# schema = schema_registry_client.get_schema(schema.schema_id)
schema = schema_registry_client.get_schema(schema.schema_id)
schema_str = schema.schema_str
pro_conf = {"auto.register.schemas": False}
avro_serializer = AvroSerializer(schema_registry_client=schema_registry_client, schema_str=schema_str, conf=pro_conf)
conf = {'bootstrap.servers': "{}:9095".format(args.schema_registry_host),
'schema.registry.url': "http://{}:8081".format(args.schema_registry_host)}
# avro_producer = AvroProducer(conf, default_value_schema=value_schema)
producer_conf = {'bootstrap.servers': "{}:9095".format(SCHEMA_HOST),
'key.serializer': StringSerializer('utf_8'),
'value.serializer': avro_serializer}
avro_producer = SerializingProducer(producer_conf)
But when I try to use a variation for the local file it fails:
# Read schema from local file
value_schema = avro.schema.Parse(open(args.schema, "r").read())
schema_str = open(args.schema, "r").read().replace(' ', '').replace('\n', '')
pro_conf = {"auto.register.schemas": True}
avro_serializer = AvroSerializer(schema_registry_client=schema_registry_client, schema_str=schema_str, conf=pro_conf)
This part is common to both versions:
producer_conf = {'bootstrap.servers': "{}:9095".format(SCHEMA_HOST),
'key.serializer': StringSerializer('utf_8'),
'value.serializer': avro_serializer}
avro_producer = SerializingProducer(producer_conf)
avro_producer.produce(topic=args.topic, value=message)
The error I am getting is the following:
KafkaError{code=_VALUE_SERIALIZATION,val=-161,str="'RecordSchema' object has no attribute 'lookup_schema'"}
Obviously this is not the best approach, and even if it worked the code would be ugly and error-prone. But it doesn't even work, so I need some help on how I could read a local schema and use the AvroSerializer afterwards.
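For what it's worth, a minimal sketch of the local-file variant (an untested suggestion, not a verified fix): AvroSerializer expects the schema as a plain JSON string, the same form as schema_str retrieved from the registry, so the raw file contents can be passed directly, without avro.schema.Parse or whitespace stripping:

# Read the schema exactly as it appears on disk; AvroSerializer takes the
# schema as a JSON string, not a parsed avro.schema.RecordSchema object.
with open(args.schema, "r") as f:
    schema_str = f.read()

pro_conf = {"auto.register.schemas": True}
avro_serializer = AvroSerializer(schema_registry_client=schema_registry_client, schema_str=schema_str, conf=pro_conf)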
I've heard that Python multi-threading is a bit tricky, and I am not sure of the best way to implement what I need. Let's say I have a function called IO_intensive_function that makes an API call which may take a while to get a response.
Say the process of queuing jobs can look something like this:
import _thread  # "thread" was renamed "_thread" in Python 3

for job_args in jobs:
    _thread.start_new_thread(IO_intensive_function, (job_args,))

Would IO_intensive_function now just execute its task in the background and allow me to queue up more jobs?
I also looked at this question, which seems like the approach is to just do the following:
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(2)
results = pool.map(IO_intensive_function, jobs)
As I don't need those tasks to communicate with each other, the only goal is to send my API requests as fast as possible. Is this the most efficient way? Thanks.
Edit:
The way I am making the API request is through a Thrift service.
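For reference, here is a minimal concurrent.futures sketch of the pattern being asked about; the function body and the jobs list are placeholders standing in for the question's IO_intensive_function and its arguments:

import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def IO_intensive_function(job_args):
    # stand-in for the blocking API call described in the question
    time.sleep(1)
    return job_args

jobs = [{"id": n} for n in range(20)]  # hypothetical job arguments

# Threads fit here because the work is I/O-bound: each thread spends most of
# its time waiting on the network, so the GIL is not the bottleneck.
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(IO_intensive_function, job) for job in jobs]
    for future in as_completed(futures):
        result = future.result()  # re-raises any exception from the worker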
I had to create code to do something similar recently. I've tried to make it generic below. Note I'm a novice coder, so please forgive any inelegance. What you may find valuable, however, is some of the error processing I found it necessary to embed to capture disconnects, etc.
I also found it valuable to perform the JSON processing in a threaded manner. You have the threads working for you, so why go "serial" again for a processing step when you can extract the info in parallel?
It is possible I mis-coded something in making it generic. Please don't hesitate to ask follow-ups and I will clarify.
import time
import requests
from multiprocessing.dummy import Pool as ThreadPool
from src_code.config import Config

with open(Config.API_PATH + '/api_security_key.pem') as f:
    my_key = f.read().rstrip("\n")

base_url = "https://api.my_api_destination.com/v1"
headers = {"Authorization": "Bearer %s" % my_key}

itm = list()
itm.append(base_url)
itm.append(headers)

def call_API(call_var):
    base_url = call_var[0]
    headers = call_var[1]
    call_specific_tag = call_var[2]
    endpoint = f'/api_path/{call_specific_tag}'

    # retry the request up to 3 times, pausing between failed attempts
    connection_tries = 0
    for i in range(3):
        try:
            dat = requests.get((base_url + endpoint), headers=headers).json()
        except:
            connection_tries += 1
            print(f'Call for {call_specific_tag} failed after {i} attempt(s). Pausing for 240 seconds.')
            time.sleep(240)
        else:
            break

    tag = list()
    vars_to_capture_01 = list()
    vars_to_capture_02 = list()
    connection_tries = 0

    try:
        if 'record_id' in dat:
            vars_to_capture_01.append(dat['record_id'])
            vars_to_capture_02.append(dat['second_item_of_interest'])
        else:
            vars_to_capture_01.append(call_specific_tag)
            print(f'Call specific tag {call_specific_tag} is unavailable. Successful pull.')
            vars_to_capture_02.append(-1)
    except:
        print(f'{call_specific_tag} is unavailable. Unsuccessful pull.')
        vars_to_capture_01.append(call_specific_tag)
        vars_to_capture_02.append(-1)
        time.sleep(240)

    pack = list()
    pack.append(vars_to_capture_01)
    pack.append(vars_to_capture_02)
    return pack

vars_to_capture_01 = list()
vars_to_capture_02 = list()

# all_tags is the full list of call-specific tags to pull, defined elsewhere
i = 0
max_i = len(all_tags)
while i < max_i:
    ind_rng = range(i, min((i + 10), (max_i)), 1)
    itm_lst = (itm.copy())
    call_var = [itm_lst + [all_tags[q]] for q in ind_rng]
    # packed = call_API(call_var[0])  # for testing the function without pooling
    pool = ThreadPool(len(call_var))
    packed = pool.map(call_API, call_var)
    pool.close()
    pool.join()
    for pack in packed:
        try:
            vars_to_capture_01.append(pack[0][0])
        except:
            print(f'Unpacking error for {all_tags[i]}.')
        vars_to_capture_02.append(pack[1][0])
    i += 10  # advance to the next batch of tags
For network API requests you can use asyncio. Have a look at this article https://realpython.com/python-concurrency/#asyncio-version for an example of how to implement it.
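For example, a minimal asyncio sketch (assuming the API can be reached over plain HTTP and that aiohttp is installed; the URLs are placeholders):

import asyncio
import aiohttp

async def fetch(session, url):
    # one request; the event loop runs the other requests while this one waits
    async with session.get(url) as resp:
        return await resp.json()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        # schedule all requests concurrently and collect the responses
        return await asyncio.gather(*(fetch(session, url) for url in urls))

urls = [f"https://api.example.com/items/{n}" for n in range(100)]  # placeholder endpoints
results = asyncio.run(main(urls))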
I store QueryText within a pandas dataframe. Once I've loaded all the queries into it, I want to conduct an analysis against each query. Currently, I have ~50k to evaluate, so doing them one by one will take a long time.
So, I wanted to implement concurrent.futures. How do I take the individual QueryText stored within fullAnalysis, pass it to concurrent.futures, and return the output as a variable?
Here is my entire code:
import pandas as pd
import time
import gensim
import sys
import warnings
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
fullAnalysis = pd.DataFrame()
def fetch_data(jFile='ProcessingDetails.json'):
    print("Fetching data...please wait")

    # read JSON file for latest dictionary file name
    baselineDictionaryFileName = 'Dictionary/Dictionary_05-03-2020.json'

    # copy data to pandas dataframe
    labelled_data = pd.read_json(baselineDictionaryFileName)

    # Add two more columns to get the most similar text and score
    labelled_data['SimilarText'] = ''
    labelled_data['SimilarityScore'] = float()

    print("Data fetched from " + baselineDictionaryFileName + " and there are " + str(labelled_data.shape[0]) + " rows to be evaluated")
    return labelled_data

def calculateScore(inputFunc):
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    model = gensim.models.Word2Vec.load('w2v_model_bigdata')
    inp = inputFunc
    print(inp)
    out = dict()
    strEvaluation = inp.split("most_similar ", 1)[1]

    # while inp != 'quit':
    split_inp = inp.split()
    try:
        if split_inp[0] == 'help':
            pass
        elif split_inp[0] == 'similarity' and len(split_inp) >= 3:
            pass
        elif split_inp[0] == 'most_similar' and len(split_inp) >= 2:
            for pair in model.most_similar(positive=[split_inp[1]]):
                out.update({pair[0]: pair[1]})
    except KeyError as ke:
        # print(str(ke) + "\n")
        inp = input()

    return out

def main():
    with ThreadPoolExecutor(max_workers=5) as executor:
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            # for item in executor.map(calculateScore, arg):
            output = executor.map(calculateScore, arg)
    return output

if __name__ == "__main__":
    fullAnalysis = fetch_data()
    results = main()
    print(f'results: {results}')
The Python Global Interpreter Lock or GIL allows only one thread to hold control of the Python interpreter. Since your function calculateScore might be cpu-bound and requires the interpreter to execute its byte code, you may be gaining little by using threading. If, on the other hand, it were doing mostly I/O operations, it would be giving up the GIL for most of its running time allowing other threads to run. But that does not seem to be the case here. You probably should be using the ProcessPoolExecutor from concurrent.futures (try it both ways and see):
def main():
    with ProcessPoolExecutor(max_workers=None) as executor:
        the_futures = {}
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures[future] = i  # map future to request
        for future in as_completed(the_futures):  # results as they become available, not necessarily in the order of submission
            i = the_futures[future]  # the original index
            result = future.result()  # the result
If you omit the max_workers parameter (or specify a value of None) from the ProcessPoolExecutor constructor, the default will be the number of processors you have on your machine (not a bad default). There is no point in specifying a value larger than the number of processors you have.
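If you want to check what that default would be on your machine, a quick sketch:

import os

# ProcessPoolExecutor(max_workers=None) typically defaults to this value
print(os.cpu_count())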
If you do not need to tie the future back to the original request, then the_futures can just be a list to which you append each future. But simplest yet is to not even bother with the as_completed method:
def main():
    with ProcessPoolExecutor(max_workers=5) as executor:
        the_futures = []
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures.append(future)
    # wait for the completion of all the results and return them all:
    results = [f.result() for f in the_futures]  # results in creation order
    return results
It should be mentioned that the code that launches the ProcessPoolExecutor functions should be in a block governed by if __name__ == '__main__':. If it isn't, you will get into a recursive loop, with each subprocess launching the ProcessPoolExecutor. But that appears to be the case here. Perhaps you meant to use the ProcessPoolExecutor all along?
Also:
I don't know what the line ...
model = gensim.models.Word2Vec.load('w2v_model_bigdata')
... in the function calculateScore does. It may be the one I/O-bound statement. But it appears to be something that does not vary from call to call. If that is the case, and model is not being modified in the function, shouldn't this statement be moved out of the function and computed just once? Then the function would clearly run faster (and be clearly CPU-bound).
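For illustration, a minimal sketch of that refactor, based on the question's calculateScore and assuming the model is only read, never modified, inside the function:

import gensim

# Load the Word2Vec model once, when the module is imported, so each worker
# process pays the loading cost a single time instead of on every call.
model = gensim.models.Word2Vec.load('w2v_model_bigdata')

def calculateScore(inputFunc):
    out = dict()
    split_inp = inputFunc.split()
    if split_inp[0] == 'most_similar' and len(split_inp) >= 2:
        # the preloaded model is reused here on every call
        for pair in model.most_similar(positive=[split_inp[1]]):
            out.update({pair[0]: pair[1]})
    return out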
Also:
The exception block ...
except KeyError as ke:
#print(str(ke) + "\n")
inp = input()
... is puzzling. You are inputting a value that will never be used right before returning. If this is to pause execution, there is no error message being output.
With Booboo's assistance, I was able to update the code to include ProcessPoolExecutor. Here is my updated code; overall, processing has been sped up by more than 60%.
I did run into a processing issue and found this topic, BrokenProcessPool, which addresses it.
output = {}
thePool = {}

def main(labelled_data, dictionaryRevised):
    args = sys.argv[1:]

    with ProcessPoolExecutor(max_workers=None) as executor:
        for i in range(len(labelled_data)):
            text = labelled_data['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            output = winprocess.submit(executor, calculateScore, arg)
            thePool[output] = i  # original index for future to request
        for output in as_completed(thePool):  # results as they become available, not necessarily in the order of submission
            i = thePool[output]  # the original index
            text = labelled_data['QueryText'][i]
            result = output.result()  # the result
            maximumKey = max(result.items(), key=operator.itemgetter(1))[0]
            maximumValue = result.get(maximumKey)
            labelled_data['SimilarText'][i] = maximumKey
            labelled_data['SimilarityScore'][i] = maximumValue
    return labelled_data, dictionaryRevised

if __name__ == "__main__":
    start = time.perf_counter()
    print("Starting to evaluate Query Text for labelling...")
    output_Labelled_Data, output_dictionary_revised = preProcessor()
    output, dictionary = main(output_Labelled_Data, output_dictionary_revised)
    finish = time.perf_counter()
    print(f'Finished in {round(finish-start, 2)} second(s)')
I am copying the contents of multiple containers in Azure Storage Explorer and writing them to a bunch of new containers, and I want to know the most efficient way to do this.
The existing containers are called cycling-input-1, cycling-input-2, ..., and the contents are written to new containers called cycling-output-1, cycling-output-2, etc. The containers all hold the same type of content (JPEGs).
The for loop below creates a new container (cycling-output) with the required suffix and then copies the blobs from the relevant cycling-input container into it. I have about 30 containers, each with thousands of images, so I am not sure this is the best way to do it (it's slow). Is there a better way to do it?
from azure.storage.blob.baseblobservice import BaseBlobService

account_name = 'name'
account_key = 'key'

# connect to the storage account
blob_service = BaseBlobService(account_name=account_name, account_key=account_key)

# get a list of the containers that need to be processed
cycling_containers = blob_service.list_containers(prefix='cycling-input')

# check the list of containers
for c in cycling_containers:
    print(c.name)

# copy across the blobs from existing containers to new containers with a prefix cycling-output
prefix_of_new_container = 'cycling-output-'

for c in cycling_containers:
    contname = c.name
    generator = blob_service.list_blobs(contname)
    container_index = ''.join(filter(str.isdigit, contname))
    for blob in generator:
        flag_of_new_container = blob_service.create_container("%s%s" % (prefix_of_new_container, container_index))
        blob_service.copy_blob("%s%s" % (prefix_of_new_container, container_index), blob.name, "https://%s.blob.core.windows.net/%s/%s" % (account_name, contname, blob.name))
The simple way is to use the multiprocessing module to copy the blobs of all containers in parallel to their new containers, named by replacing input with output.
Here is my sample code for reference.

from azure.storage.blob.baseblobservice import BaseBlobService
import multiprocessing

account_name = '<your account name>'
account_key = '<your account key>'

blob_service = BaseBlobService(
    account_name=account_name,
    account_key=account_key
)

cycling_containers = blob_service.list_containers(prefix='cycling-input')

def putBlobCopyTriples(queue, num_of_workers):
    for c in cycling_containers:
        container_name = c.name
        new_container_name = container_name.replace('input', 'output')
        blob_service.create_container(new_container_name)
        for blob in blob_service.list_blobs(container_name):
            blob_url = "https://%s.blob.core.windows.net/%s/%s" % (account_name, container_name, blob.name)
            queue.put((new_container_name, blob.name, blob_url))
    for i in range(num_of_workers):
        queue.put((None, None, None))

def copyWorker(lock, queue, sn):
    while True:
        with lock:
            (new_container_name, blob_name, new_blob_url) = queue.get()
        if new_container_name == None:
            break
        print(sn, new_container_name, blob_name, new_blob_url)
        blob_service.copy_blob(new_container_name, blob_name, new_blob_url)

if __name__ == '__main__':
    num_of_workers = 4  # the number of workers you want; for example, 4 is my cpu core count

    lock = multiprocessing.Lock()
    queue = multiprocessing.Queue()

    multiprocessing.Process(target=putBlobCopyTriples, args=(queue, num_of_workers)).start()
    workers = [multiprocessing.Process(target=copyWorker, args=(lock, queue, i)) for i in range(num_of_workers)]
    for p in workers:
        p.start()
Note: Apart from the CPU core count of your environment, the copy speed is limited by your I/O bandwidth. More workers is not always better; it is recommended that the number be equal to or less than your CPU count or hyper-threading count.
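If it helps, a tiny sketch for deriving the worker count from the machine itself rather than hard-coding it (an illustrative choice, not part of the original answer):

import multiprocessing

# use the CPU count as an upper bound for the number of copy workers
num_of_workers = multiprocessing.cpu_count()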