Read laz files stored on IBM COS - python

I have a problem with reading laz files that are stored in IBM Cloud Object Storage. I built the pywren-ibm library with all of its requirements (pdal being one of them) with Docker, and then deployed it to IBM Cloud Functions as an action. The error that appears is "Unable to open stream for 'Colorea.laz' with error 'No such file or directory'". How can I read the files with pdal in an IBM Cloud Function?
Here is some of the code:
import pywren_ibm_cloud as pywren
import pdal
import json

def manip_data(bucket, key, data_stream):
    data = data_stream.read()
    cr_json = {
        "pipeline": [
            {
                "type": "readers.las",
                "filename": f"{key}"
            },
            {
                "type": "filters.range",
                "limits": "Classification[9:9]"
            }
        ]
    }
    pipeline = pdal.Pipeline(json.dumps(cr_json, indent=4))
    pipeline.validate()
    pipeline.loglevel = 8
    n_points = pipeline.execute()

bucketname = 'The bucket name'
pw = pywren.ibm_cf_executor(runtime='ammarokran/pywren-pdal:1.0')
pw.map(manip_data, bucketname, chunk_size=None)
print(pw.get_result())
The code is run from a local PC in a Jupyter notebook.

You'll need to specify some credentials and the correct endpoint for the bucket holding the files you're trying to access. Not totally sure how that works with a custom runtime, but typically you can just pass a config object in the executor.
import pywren_ibm_cloud as pywren

config = {'pywren': {'storage_bucket': 'BUCKET_NAME'},
          'ibm_cf': {'endpoint': 'HOST',
                     'namespace': 'NAMESPACE',
                     'api_key': 'API_KEY'},
          'ibm_cos': {'endpoint': 'REGION_ENDPOINT',
                      'api_key': 'API_KEY'}}

pw = pywren.ibm_cf_executor(config=config)
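Beyond the credentials, note that readers.las opens a path on the worker's local filesystem, so passing the object key alone won't resolve inside the action. A common workaround (a sketch only, not tested against this custom runtime) is to write the bytes you already get from data_stream to a temporary file and point the pipeline at that path:

import os
import json
import tempfile
import pdal

def manip_data(bucket, key, data_stream):
    # Persist the COS object locally so PDAL can open it as a regular file
    tmp_path = os.path.join(tempfile.gettempdir(), os.path.basename(key))
    with open(tmp_path, 'wb') as f:
        f.write(data_stream.read())

    cr_json = {
        "pipeline": [
            {"type": "readers.las", "filename": tmp_path},
            {"type": "filters.range", "limits": "Classification[9:9]"}
        ]
    }
    pipeline = pdal.Pipeline(json.dumps(cr_json))
    pipeline.validate()
    return pipeline.execute()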

Related

How to create tags on an Azure disk using python?

I want to add or create a new tag on an Azure Disk using Python but am not able to. Can anyone please help me with the Python SDK/code for this?
for disk in compute_client.disks.list():
    if disk.as_dict()["name"] == "test_disk_rohit":
        tags = target_disk.tags["DetachedTime"] = datetime.now()
        compute_client.disks.begin_create_or_update(resrc, disk.as_dict()["name"], disk)
This is what I tried in order to add/create a new tag for my Azure disk called "test_disk_rohit".
Can anyone help me with this?
Instead of using begin_create_or_update you can use create_or_update.
With the below code snippet I was able to create/update the tags on the disk.
from azure.common.credentials import ServicePrincipalCredentials
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.compute import ComputeManagementClient
from azure.mgmt.compute.models import DiskCreateOption

AZURE_TENANT_ID = '<Tenant ID>'
AZURE_CLIENT_ID = '<Client ID>'
AZURE_CLIENT_SECRET = '<Client Secret>'
AZURE_SUBSCRIPTION_ID = '<Sub_ID>'

credentials = ServicePrincipalCredentials(client_id=AZURE_CLIENT_ID, secret=AZURE_CLIENT_SECRET, tenant=AZURE_TENANT_ID)
resource_client = ResourceManagementClient(credentials, AZURE_SUBSCRIPTION_ID)
compute_client = ComputeManagementClient(credentials, AZURE_SUBSCRIPTION_ID)

Diskdetails = compute_client.disks.create_or_update(
    '<ResourceGroupName>',
    '<Disk Name>',
    {
        'location': 'eastasia',
        'creation_data': {
            'create_option': DiskCreateOption.copy,
            'source_resource_id': '<Source Resource ID>'
        },
        "tags": {
            "tagtest": "testtagGanesh"
        },
    }
)
disk_resource = Diskdetails.result()

# get Disk details
disk = compute_client.disks.get('<ResourceGroupName>', '<Disk Name>')
print(disk.sku)
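If the goal is only to add a tag to an existing disk (as in the question) rather than create a new one, a minimal sketch with the same compute_client would be to read the disk, set the tag, and push the object back; the resource group and disk names below are the question's placeholders, and round-tripping the full Disk object this way is an assumption I haven't verified against every disk configuration:

from datetime import datetime

disk = compute_client.disks.get('<ResourceGroupName>', 'test_disk_rohit')
disk.tags = disk.tags or {}
disk.tags['DetachedTime'] = str(datetime.now())  # tag values must be strings
compute_client.disks.create_or_update('<ResourceGroupName>', disk.name, disk)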

GCP Dataflow Error: "Failure getting groups, quitting"

I have a batch Dataflow pipeline that reads a csv file from a cloud storage bucket folder, processes the data and writes to a new file in the same bucket subfolder, and is triggered by a cloud function when a new file is uploaded to cloud storage. The pipeline runs fine and produces the desired output when testing with a small csv file (25 rows) but fails to write the output file when processing larger files, producing the error: "Failure getting groups, quitting".
Full error from logs explorer:
{
  "insertId": "s=1f9f52b3276640528b537fd9e09a6c74;i=29b;b=715c0571349543b08fc296a56da392cb;m=b2fd5f;t=5d66d3d6020cf;x=8fb1cd537c367ea3",
  "jsonPayload": {
    "message": "Failure getting groups, quitting"
  },
  "resource": {
    "type": "dataflow_step",
    "labels": {
      "project_id": "my-project",
      "job_name": "Generate Clutch Product Code URLs - d29c0a",
      "job_id": "2022-01-25_11_38_17-5732726158246265518",
      "region": "us-central1",
      "step_id": ""
    }
  },
  "timestamp": "2022-01-25T19:39:13.042639Z",
  "severity": "ERROR",
  "labels": {
    "dataflow.googleapis.com/log_type": "system",
    "compute.googleapis.com/resource_name": "generateclutchproductcode-01251138-5h0y-harness-63q4",
    "compute.googleapis.com/resource_type": "instance",
    "dataflow.googleapis.com/job_name": "Generate Clutch Product Code URLs - d29c0a",
    "compute.googleapis.com/resource_id": "3115486816356921127",
    "dataflow.googleapis.com/region": "us-central1",
    "dataflow.googleapis.com/job_id": "2022-01-25_11_38_17-5732726158246265518"
  },
  "logName": "projects/my-project/logs/dataflow.googleapis.com%2Fsystem",
  "receiveTimestamp": "2022-01-25T19:39:23.792851821Z"
}
In addition to the error, I also get the following warning (which may or may not be related):
Discarding unparseable args: ['--beam_plugins=apache_beam.io.filesystem.FileSystem', '--beam_plugins=apache_beam.io.hadoopfilesystem.HadoopFileSystem', '--beam_plugins=apache_beam.io.localfilesystem.LocalFileSystem', '--beam_plugins=apache_beam.io.gcp.gcsfilesystem.GCSFileSystem', '--beam_plugins=apache_beam.io.aws.s3filesystem.S3FileSystem', '--beam_plugins=apache_beam.io.azure.blobstoragefilesystem.BlobStorageFileSystem', '--pipeline_type_check', '--pipelineUrl=gs://my-project-dataflows/Templates/staging/beamapp-user-0125193126-815021.1643139086.815242/pipeline.pb', '--gcpTempLocation=gs://dataflow-staging-us-central1-883825732987/temp', '--autoscalingAlgorithm=NONE', '--numWorkers=2', '--direct_runner_use_stacked_bundle', '--templateLocation=gs://my-project-dataflows/Templates/Generate_Clutch_Product_Codes.py', '--maxNumWorkers=0', '--dataflowJobId=2022-01-25_11_38_17-5732726158246265518', '--job_server_timeout=60']
My pipeline code:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.io import WriteToText
import logging
import traceback
import csv
import sys
from cryptography.fernet import Fernet
from csv import reader, DictReader, DictWriter
import google.auth
from google.cloud import storage


class CustomOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--fernet_key', type=str, help='Fernet secret used to serialize product codes')
        parser.add_argument('--bucket', type=str, help='Cloud Storage bucket containing relevant files')
        parser.add_argument('--input_file', type=str, help='File containing product codes to convert')
        parser.add_argument('--output_file', type=str, help='Destination of the new file')


def generate_product_code_urls_pipeline(project, env, region):
    options = PipelineOptions(
        streaming=False,
        project=project,
        region=region,
        staging_location=f'gs://my-project-{env}-dataflows/Templates/staging',
        temp_location=f'gs://my-project-{env}-dataflows/Templates/temp',
        template_location=f'gs://my-project-{env}-dataflows/Templates/Generate_Clutch_Product_Codes.py',
        subnetwork=f'https://www.googleapis.com/compute/v1/projects/{project}/regions/us-central1/subnetworks/{env}-private'
    )
    custom_options = options.view_as(CustomOptions)
    custom_options.view_as(SetupOptions).save_main_session = True
    logging.info(f'Custom Options: {custom_options}')

    # Transform function
    def genURLs(code):
        from cryptography.fernet import Fernet
        f = Fernet(custom_options.fernet_key)
        encoded = code.encode()
        encrypted = f.encrypt(encoded)
        decrypted = f.decrypt(encrypted.decode().encode())
        decoded = decrypted.decode()
        if code != decoded:
            logging.info(f'Original product code {code}, and decoded code {decoded} do not match')
            sys.exit(1)
        url = 'https://my-url.com/scan?code=' + encrypted.decode()
        return url

    class UpdateMetadata(beam.DoFn):
        def __init__(self, bucket_name):
            self.bucket_name = bucket_name

        def start_bundle(self):
            from google.cloud import storage
            self.client = storage.Client()

        def process(self, urls):
            logging.info(f'Updating object metadata...')
            bucket = self.client.bucket(self.bucket_name)
            blob = bucket.get_blob(custom_options.output_file)
            blob.content_type = 'text/csv'
            blob.patch()
    # End function

    p = beam.Pipeline(options=options)
    (p | 'Read Input CSV' >> beam.io.ReadFromText(f'gs://{custom_options.bucket}/{custom_options.input_file}', skip_header_lines=1)
       | 'Map Codes' >> beam.Map(genURLs)
       | 'Write PCollection to Bucket' >> WriteToText(f'gs://{custom_options.bucket}/{custom_options.output_file}', num_shards=1, shard_name_template='', header='URL')
       | 'Update Object Metadata' >> beam.ParDo(UpdateMetadata(custom_options.bucket)))
    p.run()


# Pipeline execution
try:
    region = 'us-central1'
    env = 'dev'
    cred, project = google.auth.default()
    generate_product_code_urls_pipeline(project, env, region)
    logging.info('\n PIPELINE FINISHED \n')
except (KeyboardInterrupt, SystemExit):
    raise
except:
    logging.error('\n PIPELINE FAILED')
    traceback.print_exc()
What's more, the job graph shows that all steps were successfully completed. It seems like it could be an issue with the workers writing the file to the desired location, but that's my best guess as I've had trouble finding information about this error. Any further info or suggestions would be a huge help and very appreciated.

How to get pod volume list using python?

My pod has a volume defined as:
"volumes": [
{
"name": "configs",
"secret": {
"defaultMode": 420,
"secretName": "some_secret"
}
},
....]
I want to be able to read it using Python as V1Volume.
Tried to do:
from kubernetes import client, config

config.load_incluster_config()
spec = client.V1PodSpec()
But I'm stuck as it gives me
raise ValueError("Invalid value for `containers`, must not be `None`")
and I'm not sure how to continue. How can I get the volumes from the V1PodSpec?
You get the error because you initialise V1PodSpec without any arguments. V1PodSpec is used to create pods, not to read them.
To read pod spec from Kubernetes:
from kubernetes import client,config
config.load_kube_config()
# or
# config.load_incluster_config()
core_api = client.CoreV1Api()
response = core_api.read_namespaced_pod(name="debug-pod", namespace='dev')
# access volumes in the returned response
type(response.spec.volumes[0])
# returns:
# <class 'kubernetes.client.models.v1_volume.V1Volume'>
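Each entry in response.spec.volumes is a V1Volume, so the secret-backed volume from the question can be read off directly (a small sketch assuming the same response object):

for v in response.spec.volumes:
    if v.secret is not None:
        # e.g. 'configs', 'some_secret', 420
        print(v.name, v.secret.secret_name, v.secret.default_mode)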

Uploading a Video to Azure Media Services with Python SDKs

I am currently looking for a way to upload a video to Azure Media Services (AMS v3) via the Python SDKs. I have followed its instructions and am able to connect to AMS successfully.
Example
credentials = AdalAuthentication(
    context.acquire_token_with_client_credentials,
    RESOURCE,
    CLIENT,
    KEY)

client = AzureMediaServices(credentials, SUBSCRIPTION_ID)  # Successful
I can also successfully get the details of all the videos uploaded via its portal:
for data in client.assets.list(RESOUCE_GROUP_NAME, ACCOUNT_NAME).get(0):
    print(f'Asset_name: {data.name}, file_name: {data.description}')

# Asset_name: 4f904060-d15c-4880-8c5a-xxxxxxxx, file_name: 夢想全紀錄.mp4
# Asset_name: 8f2e5e36-d043-4182-9634-xxxxxxxx, file_name: an552Qb_460svvp9.webm
# Asset_name: aef495c1-a3dd-49bb-8e3e-xxxxxxxx, file_name: world_war_2.webm
# Asset_name: b53d8152-6ecd-41a2-a59e-xxxxxxxx, file_name: an552Qb_460svvp9.webm - Media Encoder Standard encoded
However, when I tried to use the following method, it failed, since I have no idea what to pass as parameters - Link to Python SDKs
create_or_update(resource_group_name, account_name, asset_name,
                 parameters, custom_headers=None, raw=False, **operation_config)
Therefore, I would like to ask questions as follows (everything is done via Python SDKs):
What kind of parameters does it expect?
Can a video be uploaded directly to AMS, or should it be uploaded to Blob Storage first?
Should an Asset contain only one video, or are multiple files fine?
The documentation for the REST version of that method is at https://learn.microsoft.com/en-us/rest/api/media/assets/createorupdate. This is effectively the same as the Python parameters.
Videos are stored in Azure Storage for Media Services. This is true for input assets, the assets that are encoded, and any streamed content. It all is in Storage but accessed by Media Services. You do need to create an asset in Media Services which creates the Storage container. Once the Storage container exists you upload via the Storage APIs to that Media Services created container.
Technically multiple files are fine, but doing that raises a number of issues you may not expect. I'd recommend using 1 input video = 1 Media Services asset. On the encoding output side there will be more than one file in the asset. Encoding output contains one or more videos, manifests, and metadata files.
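To make that flow concrete, here is a condensed sketch (my illustration, not code from the answer; client is the AzureMediaServices client from the question, and the asset, file, and resource-group names are placeholders, using the same azure-mgmt-media and azure-storage-blob packages that appear in the answers below):

import datetime as dt
from azure.mgmt.media.models import Asset, AssetContainerPermission
from azure.storage.blob import ContainerClient

# 1. Creating the asset provisions a storage container for it
client.assets.create_or_update(RESOURCE_GROUP, ACCOUNT_NAME, 'my-input-asset', Asset())

# 2. Ask Media Services for a writable SAS URL to that container
sas = client.assets.list_container_sas(
    RESOURCE_GROUP, ACCOUNT_NAME, 'my-input-asset',
    permissions=AssetContainerPermission.read_write,
    expiry_time=dt.datetime.now(dt.timezone.utc) + dt.timedelta(hours=4))

# 3. Upload the source video with the Storage SDK
container = ContainerClient.from_container_url(sas.asset_container_sas_urls[0])
with open('myvideo.mp4', 'rb') as f:
    container.upload_blob(name='myvideo.mp4', data=f)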
I have found my own way to work around this using the Python SDKs and REST; however, I am not quite sure it's proper.
Log-In to Azure Media Services and Blob Storage via Python packages
import adal
from msrestazure.azure_active_directory import AdalAuthentication
from msrestazure.azure_cloud import AZURE_PUBLIC_CLOUD
from azure.mgmt.media import AzureMediaServices
from azure.mgmt.media.models import MediaService
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
Create Assets for an original file and an encoded one by passing these parameters. Example of the original file Asset creation:
asset_name = 'asset-myvideo'
asset_properties = {
    'properties': {
        'description': 'Original File Description',
        'storageAccountName': "storage-account-name"
    }
}
client.assets.create_or_update(RESOUCE_GROUP_NAME, ACCOUNT_NAME, asset_name, asset_properties)
Upload a video to the Blob Storage derived from the created original asset
current_container = [data.container for data in client.assets.list(RESOUCE_GROUP_NAME, ACCOUNT_NAME).get(0) if data.name == asset_name][0]  # Get Blob Storage location

file_name = "myvideo.mp4"
blob_client = blob_service_client.get_blob_client(container=current_container, blob=file_name)
with open('original_video.mp4', 'rb') as data:
    blob_client.upload_blob(data)
print(f'Video uploaded to {current_container}')
And after that, I do Transform, Job, and Streaming Locator to get the video Streaming Link successfully.
I was able to get this to work with the newer python SDK. The python documentation is mostly missing, so I constructed this mainly from the python SDK source code and the C# examples.
azure-storage-blob==12.3.1
azure-mgmt-media==2.1.0
azure-mgmt-resource==9.0.0
adal~=1.2.2
msrestazure~=0.6.3
0) Import a lot of stuff
from azure.mgmt.media.models import Asset, Transform, Job, \
    BuiltInStandardEncoderPreset, TransformOutput, \
    JobInputAsset, JobOutputAsset, AssetContainerSas, AssetContainerPermission, \
    StreamingLocator, StreamingEndpoint
import adal
from msrestazure.azure_active_directory import AdalAuthentication
from msrestazure.azure_cloud import AZURE_PUBLIC_CLOUD
from azure.mgmt.media import AzureMediaServices
from azure.storage.blob import BlobServiceClient, ContainerClient
import datetime as dt
import time

LOGIN_ENDPOINT = AZURE_PUBLIC_CLOUD.endpoints.active_directory
RESOURCE = AZURE_PUBLIC_CLOUD.endpoints.active_directory_resource_id

# AzureSettings is a custom NamedTuple
1) Log in to AMS:
def get_ams_client(settings: AzureSettings) -> AzureMediaServices:
    context = adal.AuthenticationContext(LOGIN_ENDPOINT + '/' +
                                         settings.AZURE_MEDIA_TENANT_ID)
    credentials = AdalAuthentication(
        context.acquire_token_with_client_credentials,
        RESOURCE,
        settings.AZURE_MEDIA_CLIENT_ID,
        settings.AZURE_MEDIA_SECRET
    )
    return AzureMediaServices(credentials, settings.AZURE_SUBSCRIPTION_ID)
2) Create an input and output asset
input_asset = create_or_update_asset(
    input_asset_name, "My Input Asset", client, azure_settings)
output_asset = create_or_update_asset(
    output_asset_name, "My Output Asset", client, azure_settings)
3) Get the Container Name. (Most documentation refers to BlockBlobService, which seems to have been removed from the SDK.)
def get_container_name(client: AzureMediaServices, asset_name: str, settings: AzureSettings):
    expiry_time = dt.datetime.now(dt.timezone.utc) + dt.timedelta(hours=4)
    container_list: AssetContainerSas = client.assets.list_container_sas(
        resource_group_name=settings.AZURE_MEDIA_RESOURCE_GROUP_NAME,
        account_name=settings.AZURE_MEDIA_ACCOUNT_NAME,
        asset_name=asset_name,
        permissions=AssetContainerPermission.read_write,
        expiry_time=expiry_time
    )
    sas_uri: str = container_list.asset_container_sas_urls[0]
    container_client: ContainerClient = ContainerClient.from_container_url(sas_uri)
    return container_client.container_name
4) Upload a file to the input asset container:
def upload_file_to_asset_container(
        container: str, local_file, uploaded_file_name, settings: AzureSettings):
    blob_service_client = BlobServiceClient.from_connection_string(settings.AZURE_MEDIA_STORAGE_CONNECTION_STRING)
    blob_client = blob_service_client.get_blob_client(container=container, blob=uploaded_file_name)
    with open(local_file, 'rb') as data:
        blob_client.upload_blob(data)
5) Create a transform (in my case, using the adaptive streaming preset):
def get_or_create_transform(
        client: AzureMediaServices,
        transform_name: str,
        settings: AzureSettings):
    transform_output = TransformOutput(preset=BuiltInStandardEncoderPreset(preset_name="AdaptiveStreaming"))
    transform: Transform = client.transforms.create_or_update(
        resource_group_name=settings.AZURE_MEDIA_RESOURCE_GROUP_NAME,
        account_name=settings.AZURE_MEDIA_ACCOUNT_NAME,
        transform_name=transform_name,
        outputs=[transform_output]
    )
    return transform
6) Submit the Job
def submit_job(
        client: AzureMediaServices,
        settings: AzureSettings,
        input_asset: Asset,
        output_asset: Asset,
        transform_name: str,
        correlation_data: dict) -> Job:
    job_input = JobInputAsset(asset_name=input_asset.name)
    job_outputs = [JobOutputAsset(asset_name=output_asset.name)]
    job: Job = client.jobs.create(
        resource_group_name=settings.AZURE_MEDIA_RESOURCE_GROUP_NAME,
        account_name=settings.AZURE_MEDIA_ACCOUNT_NAME,
        job_name=f"test_job_{UNIQUENESS}",
        transform_name=transform_name,
        parameters=Job(input=job_input,
                       outputs=job_outputs,
                       correlation_data=correlation_data)
    )
    return job
7) Then I get the URLs after the Event Grid has told me the job is done:
# side-effect warning: this starts the streaming endpoint $$$
def get_urls(client: AzureMediaServices, output_asset_name: str,
             locator_name: str, settings: AzureSettings):
    try:
        locator: StreamingLocator = client.streaming_locators.create(
            resource_group_name=settings.AZURE_MEDIA_RESOURCE_GROUP_NAME,
            account_name=settings.AZURE_MEDIA_ACCOUNT_NAME,
            streaming_locator_name=locator_name,
            parameters=StreamingLocator(
                asset_name=output_asset_name,
                streaming_policy_name="Predefined_ClearStreamingOnly"
            )
        )
    except Exception as ex:
        print("ignoring existing")

    streaming_endpoint: StreamingEndpoint = client.streaming_endpoints.get(
        resource_group_name=settings.AZURE_MEDIA_RESOURCE_GROUP_NAME,
        account_name=settings.AZURE_MEDIA_ACCOUNT_NAME,
        streaming_endpoint_name="default")

    if streaming_endpoint:
        if streaming_endpoint.resource_state != "Running":
            client.streaming_endpoints.start(
                resource_group_name=settings.AZURE_MEDIA_RESOURCE_GROUP_NAME,
                account_name=settings.AZURE_MEDIA_ACCOUNT_NAME,
                streaming_endpoint_name="default"
            )

    paths = client.streaming_locators.list_paths(
        resource_group_name=settings.AZURE_MEDIA_RESOURCE_GROUP_NAME,
        account_name=settings.AZURE_MEDIA_ACCOUNT_NAME,
        streaming_locator_name=locator_name
    )
    return [f"https://{streaming_endpoint.host_name}{path.paths[0]}" for path in paths.streaming_paths]

How to upload a local CSV to google big query using python

I'm trying to upload a local CSV to google big query using python
def uploadCsvToGbq(self, table_name):
    load_config = {
        'destinationTable': {
            'projectId': self.project_id,
            'datasetId': self.dataset_id,
            'tableId': table_name
        }
    }
    load_config['schema'] = {
        'fields': [
            {'name': 'full_name', 'type': 'STRING'},
            {'name': 'age', 'type': 'INTEGER'},
        ]
    }
    load_config['sourceFormat'] = 'CSV'

    upload = MediaFileUpload('sample.csv',
                             mimetype='application/octet-stream',
                             # This enables resumable uploads.
                             resumable=True)
    start = time.time()
    job_id = 'job_%d' % start
    # Create the job.
    result = bigquery.jobs.insert(
        projectId=self.project_id,
        body={
            'jobReference': {
                'jobId': job_id
            },
            'configuration': {
                'load': load_config
            }
        },
        media_body=upload).execute()
    return result
When I run this it throws an error like:
"NameError: global name 'MediaFileUpload' is not defined"
Is any module needed? Please help.
One of the easiest methods to upload a CSV file to GBQ is through pandas. Just import the CSV file into pandas (pd.read_csv()), then load from pandas into GBQ (df.to_gbq(full_table_id, project_id=project_id)).
import pandas as pd
import csv
df=pd.read_csv('/..localpath/filename.csv')
df.to_gbq(full_table_id, project_id=project_id)
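Here full_table_id is the 'dataset.table' string that to_gbq expects, with the project passed separately, e.g. (placeholder names):

project_id = 'my-project'
full_table_id = 'my_dataset.new_table'  # dataset.table; the project goes in project_id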
Or you can use the client API:
from google.cloud import bigquery
import pandas as pd
df=pd.read_csv('/..localpath/filename.csv')
client = bigquery.Client()
dataset_ref = client.dataset('my_dataset')
table_ref = dataset_ref.table('new_table')
client.load_table_from_dataframe(df, table_ref).result()
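If you'd rather not go through pandas at all, the same client library can load the CSV file directly; a short sketch with the same placeholder dataset, table, and path as above, skipping the header row:

from google.cloud import bigquery

client = bigquery.Client()
table_ref = client.dataset('my_dataset').table('new_table')

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1  # don't load the header row as data
job_config.autodetect = True      # infer the schema from the file

with open('/..localpath/filename.csv', 'rb') as f:
    load_job = client.load_table_from_file(f, table_ref, job_config=job_config)
load_job.result()  # wait for the load to finish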
pip install --upgrade google-api-python-client
Then on top of your python file write:
from googleapiclient.http import MediaFileUpload
But take care, you are missing some parentheses. Better write:
result = bigquery.jobs().insert(projectId=PROJECT_ID, body={'jobReference': {'jobId': job_id},'configuration': {'load': load_config}}, media_body=upload).execute(num_retries=5)
And by the way, you are going to upload all your CSV rows, including the top one that defines columns.
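If you don't want that header row loaded as data, the load configuration accepts a skipLeadingRows field:

load_config['skipLeadingRows'] = 1  # skip the CSV header row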
The class MediaFileUpload is in http.py. See https://google-api-python-client.googlecode.com/hg/docs/epy/apiclient.http.MediaFileUpload-class.html
