How to handle incoming PubSub messages in Python?

I have created a Cloud Compute Engine instance on Debian, and have successfully created a PUSH subscription to a topic with
from google.cloud import pubsub_v1

project_id = "censored"
topic_name = "censored"
subscription_name = "censored"
endpoint = "https://censored.appspot.com/pubsub/push?token=censored"

def create_push_subscription(project_id,
                             topic_name,
                             subscription_name,
                             endpoint):
    """Create a new push subscription on the given topic."""
    # [START pubsub_create_push_subscription]
    subscriber = pubsub_v1.SubscriberClient()
    topic_path = subscriber.topic_path(project_id, topic_name)
    subscription_path = subscriber.subscription_path(
        project_id, subscription_name)

    push_config = pubsub_v1.types.PushConfig(
        push_endpoint=endpoint)

    subscription = subscriber.create_subscription(
        subscription_path, topic_path, push_config)

    print('Push subscription created: {}'.format(subscription))
    print('Endpoint for subscription is: {}'.format(endpoint))
    # [END pubsub_create_push_subscription]

create_push_subscription(project_id, topic_name, subscription_name, endpoint)
but I'm not sure how exactly incoming messages arrive. I have found this sample code to parse messages, but I'm not sure how to get it to monitor in the background and 'activate' whenever incoming messages arrive.
import argparse
import base64
import json
import sys
import time

from google.cloud import pubsub_v1

def summarize(message):
    # [START parse_message]
    data = message.data.decode('utf-8')
    attributes = message.attributes

    name = attributes['name']
    time_created = attributes['timeCreated']
    bucket_id = attributes['bucketId']
    object_id = attributes['objectId']
    generation = attributes['objectGeneration']
    description = (
        '\tName: {name}\n'
        '\tTime Created: {time_created}\n'
        '\tBucket ID: {bucket_id}\n'
        '\tObject ID: {object_id}\n'
        '\tGeneration: {generation}\n'
    ).format(
        name=name,
        time_created=time_created,
        bucket_id=bucket_id,
        object_id=object_id,
        generation=generation
    )

    if 'overwroteGeneration' in attributes:
        description += '\tOverwrote generation: %s\n' % (
            attributes['overwroteGeneration'])
    if 'overwrittenByGeneration' in attributes:
        description += '\tOverwritten by generation: %s\n' % (
            attributes['overwrittenByGeneration'])

    payload_format = attributes['payloadFormat']
    if payload_format == 'JSON_API_V1':
        object_metadata = json.loads(data)
        name = object_metadata['name']
        time_created = object_metadata['timeCreated']
        size = object_metadata['size']
        content_type = object_metadata['contentType']
        metageneration = object_metadata['metageneration']
        description += (
            '\tName: {name}\n'
            '\tTime Created: {time_created}\n'
            '\tContent type: {content_type}\n'
            '\tSize: {object_size}\n'
            '\tMetageneration: {metageneration}\n'
        ).format(
            name=name,
            time_created=time_created,
            content_type=content_type,
            object_size=size,
            metageneration=metageneration
        )
    return description
    print('Note for developer: If BucketId and ObjectId listed, utf encoding.')
    print('If not, JSON_V1 encoding. Adjust accordingly.')
    # [END parse_message]

while True:
    print("signpost 1")
    summarize(message)
    print("signpost 2")
    time.sleep(10)
    print("signpost 3")
For example, this code will return
NameError: name 'message' is not defined
which is expected...
Could someone please help me set it up properly?
I know it's different in PULL because then the message will be defined during the pull, but I'd like to keep it as PUSH, if possible.

You need to create a long-running process which is either able to continuously poll for new messages (pull subscription) or have a reachable endpoint to receive new messages (push subscription).
See the example here: https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/pubsub/cloud-client/subscriber.py, as well as the differences between push and pull here: https://cloud.google.com/pubsub/docs/subscriber
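With a push subscription there is no polling loop to write: Pub/Sub delivers each message as an HTTPS POST to the endpoint you registered, so the "background monitoring" piece is just a small web server handling that route. Below is a minimal sketch of such an endpoint; Flask is my own illustrative choice, and only the /pubsub/push path and token parameter mirror the endpoint above — everything else is an assumption, not code from the post.

# Minimal push-endpoint sketch (Flask). Pub/Sub POSTs a JSON envelope to this
# URL; the message payload is base64-encoded under message.data.
import base64
import json

from flask import Flask, request

app = Flask(__name__)
PUBSUB_VERIFICATION_TOKEN = 'censored'  # must match the token in the endpoint URL

@app.route('/pubsub/push', methods=['POST'])
def pubsub_push():
    # Reject calls that don't carry the expected token.
    if request.args.get('token') != PUBSUB_VERIFICATION_TOKEN:
        return 'Invalid token', 403

    envelope = json.loads(request.data.decode('utf-8'))
    message = envelope['message']
    data = base64.b64decode(message.get('data', '')).decode('utf-8')
    attributes = message.get('attributes', {})

    # Hand the decoded payload to your own processing, e.g. a summarize()-style helper.
    print('Received message with attributes {} and data {}'.format(attributes, data))

    # Any 2xx response acknowledges the message.
    return '', 204

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)

Note that a push delivery is a JSON envelope with a base64-encoded data field and an attributes dict, not a pubsub_v1 message object, so summarize() would need to read from the decoded envelope rather than from message.data / message.attributes.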


How to poll and keep track of external job status through Airflow?

I am working on polling boto3 to check the status of a SageMaker Autopilot job using Airflow. I am using a PythonSensor to wait for the status to return Completed for both JobStatus and JobSecondaryStatus, then end the entire pipeline. These are the values that they can contain which I made enums of in the code:
'AutoMLJobStatus': 'Completed'|'InProgress'|'Failed'|'Stopped'|'Stopping',
'AutoMLJobSecondaryStatus': 'Starting'|'AnalyzingData'|'FeatureEngineering'|'ModelTuning'|'MaxCandidatesReached'|'Failed'|'Stopped'|'MaxAutoMLJobRuntimeReached'|'Stopping'|'CandidateDefinitionsGenerated'|'GeneratingExplainabilityReport'|'Completed'|'ExplainabilityError'|'DeployingModel'|'ModelDeploymentError'
_sagemaker_job_status takes automl_job_name through XCom from an upstream task, and it is passed successfully. With this job name I can call describe_auto_ml_job() to get the status through AutoMLJobStatus and AutoMLJobSecondaryStatus.
The main point of this is messaging through Slack to see all the unique stages the job goes through. Currently, I am trying to save all the unique job statuses to a set and then checking that set before sending a message with the job statuses in it.
But every time _sagemaker_job_status is poked, the values of the set seem to be the same, so a Slack message is sent on every poke; I logged the sets and both are empty. Below this I made a simpler example that worked.
import airflow
from airflow import DAG
from airflow.exceptions import AirflowFailException
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.python import PythonSensor
import boto3

def _sagemaker_job_status(templates_dict, **context):
    """
    Checks the SageMaker AutoMLJobStatus and AutoMLJobSecondaryStatus
    for updates and when both are complete the entire process is marked as
    successful
    """
    automl_job_name = templates_dict.get("automl_job_name")
    if not automl_job_name:
        error_message = "AutoMLJobName was not passed from upstream"
        print(error_message)
        task_fail_slack_alert(
            context=context,
            extra_message=error_message,
        )
    client = boto3.client("sagemaker", "us-east-1")
    response = client.describe_auto_ml_job(
        AutoMLJobName=automl_job_name,
    )
    job_status = response.get("AutoMLJobStatus")
    secondary_job_status = response.get("AutoMLJobSecondaryStatus")
    past_job_statuses = set()
    past_secondary_job_statuses = set()
    print(f"Past Job Statuses : {past_job_statuses}")
    print(f"Past Secondary Job Statuses : {past_secondary_job_statuses}")
    # If the job status has not been already seen
    if (
        job_status not in past_job_statuses
        and secondary_job_status not in past_secondary_job_statuses
    ):
        message = f"""
        JobStatus : {job_status}
        JobSecondaryStatus : {secondary_job_status}
        """
        print(message)
        task_success_slack_alert(
            context=context,
            extra_message=message,
        )
        past_job_statuses.add(job_status)
        past_secondary_job_statuses.add(secondary_job_status)
    # If the main job fails
    if job_status == JobStatus.Failed.value:
        error_message = "SageMaker Autopilot Job Failed!"
        task_fail_slack_alert(
            context=context,
            extra_message=error_message,
        )
        raise AirflowFailException(error_message)
    return (
        job_status == JobStatus.Completed.value
        and secondary_job_status == JobSecondaryStatus.Completed.value
    )

args = {
    "owner": "Yudhiesh",
    "start_date": airflow.utils.dates.days_ago(1),
    "schedule_interval": "@once",
    "on_failure_callback": task_fail_slack_alert,
}

with DAG(
    dag_id="02_lasic_retraining_sagemaker_autopilot",
    default_args=args,
    render_template_as_native_obj=True,
) as dag:
    sagemaker_job_status = PythonSensor(
        task_id="sagemaker_job_status",
        python_callable=_sagemaker_job_status,
        templates_dict={
            "automl_job_name": "{{task_instance.xcom_pull(task_ids='train_model_sagemaker_autopilot')}}",  # noqa: E501
        },
    )
    end = DummyOperator(
        task_id="end",
    )
    sagemaker_job_status >> end
I created a similar setup, but this time I randomly generated the values from the JobStatus and JobSecondaryStatus enums and tried to only print the values if they are unique, and it turns out it works perfectly. Could anyone explain why this happens and what I can do to the main example to get it to work?
import airflow
import random
from airflow import DAG
from airflow.sensors.python import PythonSensor
from airflow.operators.dummy import DummyOperator
from airflow.exceptions import AirflowFailException

def _mimic_sagemaker_job_status():
    job_statuses = [status.value for status in JobStatus]
    job_secondary_statuses = [
        secondary_status.value for secondary_status in JobSecondaryStatus
    ]
    past_job_statuses = set()
    past_secondary_job_statuses = set()
    job_status = random.choice(job_statuses)
    job_secondary_status = random.choice(job_secondary_statuses)
    if (
        job_status not in past_job_statuses
        and job_secondary_status not in past_secondary_job_statuses
    ):
        message = f"""
        JobStatus : {job_status}
        JobSecondaryStatus : {job_secondary_status}
        """
        # Send alerts on every new job status update
        print(message)
        past_job_statuses.add(job_status)
        past_secondary_job_statuses.add(job_secondary_status)
    if (
        job_status == JobStatus.Failed.value
        or job_secondary_status == JobSecondaryStatus.Failed.value
    ):
        raise AirflowFailException("SageMaker Autopilot Job Failed!")
    return (
        job_secondary_status == JobSecondaryStatus.Completed.value
        and job_status == JobStatus.Completed.value
    )

with DAG(
    dag_id="04_sagemaker_sensor",
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval="@once",
    render_template_as_native_obj=True,
) as dag:
    wait_for_status = PythonSensor(
        task_id="wait_for_status",
        python_callable=_mimic_sagemaker_job_status,
        dag=dag,
    )
    end = DummyOperator(
        task_id="end",
    )
    wait_for_status >> end
Enums used in the above code:
from enum import Enum

class JobStatus(Enum):
    """
    Enum of all the potential values of a SageMaker Autopilot job status
    """
    Completed = "Completed"
    InProgress = "InProgress"
    Failed = "Failed"
    Stopped = "Stopped"
    Stopping = "Stopping"

class JobSecondaryStatus(Enum):
    """
    Enum of all the potential values of a SageMaker Autopilot job secondary
    status
    """
    Starting = "Starting"
    AnalyzingData = "AnalyzingData"
    FeatureEngineering = "FeatureEngineering"
    ModelTuning = "ModelTuning"
    MaxCandidatesReached = "MaxCandidatesReached"
    Failed = "Failed"
    Stopped = "Stopped"
    MaxAutoMLJobRuntimeReached = "MaxAutoMLJobRuntimeReached"
    Stopping = "Stopping"
    CandidateDefinitionsGenerated = "CandidateDefinitionsGenerated"
    GeneratingExplainabilityReport = "GeneratingExplainabilityReport"
    Completed = "Completed"
    ExplainabilityError = "ExplainabilityError"
    DeployingModel = "DeployingModel"
    ModelDeploymentError = "ModelDeploymentError"
EDIT:
I suppose another workaround for the main example would be to have an operator create a temporary file containing JSON of the set before the SageMaker job status sensor; then, within the sensor, I can check the job statuses saved to the file and only print them if they are unique. I just realised that I can make use of the database as well (see the sketch below).
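For the database idea, one hedged sketch (my own illustration, not part of the original post) is to persist the already-seen statuses in an Airflow Variable, which survives across sensor pokes, instead of a local set:

# Sketch: persist seen statuses across sensor pokes in an Airflow Variable.
# The variable key and JSON layout are illustrative assumptions.
import json

from airflow.models import Variable

def _record_status(job_status, secondary_job_status, key="sagemaker_seen_statuses"):
    default = '{"JobStatus": [], "JobSecondaryStatus": []}'
    seen = json.loads(Variable.get(key, default_var=default))
    is_new = (
        job_status not in seen["JobStatus"]
        or secondary_job_status not in seen["JobSecondaryStatus"]
    )
    if is_new:
        seen["JobStatus"].append(job_status)
        seen["JobSecondaryStatus"].append(secondary_job_status)
        Variable.set(key, json.dumps(seen))
    return is_new  # only send a Slack alert when something actually changed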
So I couldn't get it working as-is, and I resorted to creating a JSON file that stores the different SageMaker Autopilot job statuses, which I read and write in the PythonSensor.
This takes in the AutoMLJobName from the previous step, creates a temporary file of the job statuses, and returns the AutoMLJobName and the name of the JSON file.
import json
import tempfile

def _create_job_status_json(templates_dict, **context):
    automl_job_name = templates_dict.get("sagemaker_autopilot_data_paths")
    if not automl_job_name:
        error_message = "AutoMLJobName was not passed from upstream"
        print(error_message)
        task_fail_slack_alert(
            context=context,
            extra_message=error_message,
        )
    initial = {
        "JobStatus": [],
        "JobSecondaryStatus": [],
    }
    file = tempfile.NamedTemporaryFile(mode="w", delete=False)
    json.dump({"Status": initial}, file)
    file.flush()
    return (file.name, automl_job_name)
Next, this function reads the JSON file by name and checks the job statuses via the boto3 SageMaker client. If the main job fails, the whole run fails. It adds the job statuses to the dictionary if either of them is unique, and once that is done it writes the dictionary back to the JSON file. When the entire job finishes, it sends some details about the best model as a Slack message. It returns True when both job statuses are Completed. As a note, I also remove the JSON file when the job succeeds or when it fails.
import json
import os

import airflow
from airflow import DAG
from airflow.exceptions import AirflowFailException
import boto3

def _sagemaker_job_status(templates_dict, **context):
    """
    Checks the SageMaker AutoMLJobStatus and AutoMLJobSecondaryStatus
    for updates and when both are complete the entire process is marked as
    successful
    """
    file_name, automl_job_name = templates_dict.get("automl_job_data")
    job_status_dict = {}
    client = boto3.client("sagemaker", "us-east-1")
    if not client:
        raise AirflowFailException(
            "Unable to get access to boto3 sagemaker client",
        )
    with open(file_name, "r") as json_file:
        response = client.describe_auto_ml_job(
            AutoMLJobName=automl_job_name,
        )
        job_status = response.get("AutoMLJobStatus")
        secondary_job_status = response.get("AutoMLJobSecondaryStatus")
        job_status_dict = json.load(json_file)
        status = job_status_dict.get("Status")
        past_job_statuses = status.get("JobStatus")
        past_secondary_job_statuses = status.get("JobSecondaryStatus")
        if job_status == JobStatus.Failed.value:
            error_message = "SageMaker Autopilot Job Failed!"
            task_fail_slack_alert(
                context=context,
                extra_message=error_message,
            )
            os.remove(file_name)
            raise AirflowFailException(error_message)
        if (
            job_status not in past_job_statuses
            or secondary_job_status not in past_secondary_job_statuses
        ):
            message = f"""
            JobStatus : {job_status}
            JobSecondaryStatus : {secondary_job_status}
            """
            print(message)
            task_success_slack_alert(
                context=context,
                extra_message=message,
            )
            past_job_statuses.append(job_status)
            past_secondary_job_statuses.append(secondary_job_status)
            with open(file_name, "w") as file:
                json.dump(job_status_dict, file)
        if (
            job_status == JobStatus.Completed.value
            and secondary_job_status == JobSecondaryStatus.Completed.value
        ):
            os.remove(file_name)
            response = client.describe_auto_ml_job(
                AutoMLJobName=automl_job_name,
            )
            best_candidate = response.get("BestCandidate")
            best_candidate_id = best_candidate.get("CandidateName")
            best_metric_name = (
                best_candidate.get("FinalAutoMLJobObjectiveMetric")
                .get("MetricName")
                .split(":")[1]
                .upper()
            )
            best_metric_value = round(
                best_candidate.get("FinalAutoMLJobObjectiveMetric").get(
                    "Value",
                ),
                3,
            )
            message = f"""
            Best Candidate ID : {best_candidate_id}
            Best Candidate Metric Score : {best_metric_value}{best_metric_name}
            """  # noqa: E501
            task_success_slack_alert(
                context=context,
                extra_message=message,
            )
    return (
        job_status == JobStatus.Completed.value
        and secondary_job_status == JobSecondaryStatus.Completed.value
    )
DAG code:
import airflow
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.sensors.python import PythonSensor

args = {
    "owner": "Yudhiesh",
    "start_date": airflow.utils.dates.days_ago(1),
    "schedule_interval": "@once",
    "on_failure_callback": task_fail_slack_alert,
}

with DAG(
    dag_id="02_lasic_retraining_sagemaker_autopilot",
    default_args=args,
    render_template_as_native_obj=True,
) as dag:
    create_job_status_json = PythonOperator(
        task_id="create_job_status_json",
        python_callable=_create_job_status_json,
        templates_dict={
            "sagemaker_autopilot_data_paths": "{{task_instance.xcom_pull(task_ids='train_model_sagemaker_autopilot')}}",  # noqa: E501
        },
    )
    sagemaker_job_status = PythonSensor(
        task_id="sagemaker_job_status",
        python_callable=_sagemaker_job_status,
        templates_dict={
            "automl_job_data": "{{task_instance.xcom_pull(task_ids='create_job_status_json')}}",  # noqa: E501
        },
    )
    # train_model_sagemaker_autopilot is not included, but it initiates the training through boto3
    train_model_sagemaker_autopilot >> create_job_status_json
    create_job_status_json >> sagemaker_job_status

I want to export Google SCC findings to a BigQuery table using Cloud Functions, but I am getting the error 'client' object has no attribute 'list_findings'

Here is my Python function to list findings, which raises the error 'client' object has no attribute 'list_findings'. My requirements.txt includes google-cloud-securitycenter and google-cloud-bigquery.
import os
import json
from google.cloud import bigquery

def test_list_all_findings(request):
    if request.method != 'POST':
        return abort(405)
    request_json = request.get_json()
    # [START list_all_findings]
    from google.cloud import securitycenter

    # Create a client.
    client = securitycenter.SecurityCenterClient()
    client = bigquery.Client()
    cuid = request_json['cuid']
    organization_id = request_json['organization_id']
    # organization_id is the numeric ID of the organization. e.g.:
    organization_id = organization_id
    org_name = "organizations/{org_id}".format(org_id=organization_id)
    # The "sources/-" suffix lists findings across all sources. You
    # also use a specific source_name instead.
    all_sources = "{org_name}/sources/-".format(org_name=org_name)
    finding_result_iterator = client.list_findings(all_sources)
    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"
    destination_table_id = "gce-kubernetes.onboard_gcp.cc_data_billing_" + cuid
    blob = destination_table_id.blob("findings.json")
    f = open("/tmp/findings.json", "a+")
    for i, finding_result in enumerate(finding_result_iterator):
        s = "{}) 'name': {}, resource: {}, destination_table_id: {}".format(
            i, finding_result.finding.name, finding_result.finding.resource_name, destination_table_id)
        print(s)
        f.write(str(finding_result))
        f.write(",\n")
    f.close()
    blob.upload_from_filename('/tmp/findings.json')
    os.remove("/tmp/findings.json")
    # [END list_all_findings]
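As an aside (my own observation, not an answer from the thread): in the snippet above, client is first bound to securitycenter.SecurityCenterClient() and then immediately rebound to bigquery.Client(), so list_findings ends up being called on the BigQuery client, which matches the reported attribute error. A minimal sketch that keeps the two clients in separate variables; the organization ID is a placeholder, and depending on the library version list_findings may instead require request={"parent": ...}:

from google.cloud import bigquery
from google.cloud import securitycenter

# Keep the two clients under different names so each call hits the right API.
scc_client = securitycenter.SecurityCenterClient()
bq_client = bigquery.Client()

org_name = "organizations/{org_id}".format(org_id="0000000000")  # placeholder
all_sources = "{org_name}/sources/-".format(org_name=org_name)

# Iterate findings with the SCC client; use bq_client only for BigQuery work.
for i, finding_result in enumerate(scc_client.list_findings(all_sources)):
    print(i, finding_result.finding.name, finding_result.finding.resource_name)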

Pull AdWords reports for multiple MCC accounts

I'm looking for a way to pull reports for multiple MCC accounts in one go using the AdWords API for Python. With the code below I'm able to pass one MCC account at a time using the googleads.yaml file.
Is there a way to pass multiple MCC accounts from a CSV or text file and pull reports for them?
YAML file
# AdWordsClient configurations
adwords:
  #############################################################################
  # Required Fields                                                           #
  #############################################################################
  developer_token: XXXXXX
  #############################################################################
  # Optional Fields                                                           #
  #############################################################################
  client_customer_id: XXXX
  user_agent: XXXX
  # partial_failure: True
  # validate_only: True
  #############################################################################
  # OAuth2 Configuration                                                      #
  # Below you may provide credentials for either the installed application or #
  # service account flows. Remove or comment the lines for the flow you're    #
  # not using.                                                                #
  #############################################################################
  # The following values configure the client for the installed application
  # flow.
  client_id: XXXXX
  client_secret: XXXX
  refresh_token: XXXXX
Code
import multiprocessing
import os
from Queue import Empty
import time

import googleads.adwords
import googleads.errors

# Timeout between retries in seconds.
BACKOFF_FACTOR = 5
# Maximum number of processes to spawn.
MAX_PROCESSES = multiprocessing.cpu_count()
# Maximum number of retries for 500 errors.
MAX_RETRIES = 5
# Maximum number of items to be sent in a single API response.
PAGE_SIZE = 100
# Directory to download the reports to.
REPORT_DOWNLOAD_DIRECTORY = '.'


def _DownloadReport(process_id, report_download_directory, customer_id,
                    report_definition):
    report_downloader = (googleads.adwords.AdWordsClient.LoadFromStorage('googleads.yaml')
                         .GetReportDownloader())
    filepath = os.path.join(report_download_directory,
                            'adgroup_%d.csv' % customer_id)
    retry_count = 0

    while True:
        print ('[%d/%d] Loading report for customer ID "%s" into "%s"...'
               % (process_id, retry_count, customer_id, filepath))
        try:
            with open(filepath, 'wb') as handler:
                report_downloader.DownloadReport(
                    report_definition, output=handler,
                    client_customer_id=customer_id)
            return True, {'customerId': customer_id}
        except googleads.errors.AdWordsReportError as e:
            if e.code == 500 and retry_count < MAX_RETRIES:
                time.sleep(retry_count * BACKOFF_FACTOR)
            else:
                print ('Report failed for customer ID "%s" with code "%d" after "%d" '
                       'retries.' % (customer_id, e.code, retry_count + 1))
                return (False, {'customerId': customer_id, 'code': e.code,
                                'message': e.message})


class ReportWorker(multiprocessing.Process):
    """A worker Process used to download reports for a set of customer IDs."""

    def __init__(self, report_download_directory, report_definition,
                 input_queue, success_queue, failure_queue):
        """Initializes a ReportWorker.

        Args:
          report_download_directory: A string indicating the directory where you
            would like to download the reports.
          report_definition: A dict containing the report definition that you would
            like to run against all customer IDs in the input_queue.
          input_queue: A Queue instance containing all of the customer IDs that
            the report_definition will be run against.
          success_queue: A Queue instance that the details of successful report
            downloads will be saved to.
          failure_queue: A Queue instance that the details of failed report
            downloads will be saved to.
        """
        super(ReportWorker, self).__init__()
        self.report_download_directory = report_download_directory
        self.report_definition = report_definition
        self.input_queue = input_queue
        self.success_queue = success_queue
        self.failure_queue = failure_queue

    def run(self):
        while True:
            try:
                customer_id = self.input_queue.get(timeout=0.01)
            except Empty:
                break
            result = _DownloadReport(self.ident, self.report_download_directory,
                                     customer_id, self.report_definition)
            (self.success_queue if result[0] else self.failure_queue).put(result[1])


def GetCustomerIDs(client):
    """Retrieves all CustomerIds in the account hierarchy.

    Note that your configuration file must specify a client_customer_id belonging
    to an AdWords manager account.

    Args:
      client: an AdWordsClient instance.
    Raises:
      Exception: if no CustomerIds could be found.
    Returns:
      A Queue instance containing all CustomerIds in the account hierarchy.
    """
    # For this example, we will use ManagedCustomerService to get all IDs in
    # hierarchy that do not belong to MCC accounts.
    managed_customer_service = client.GetService('ManagedCustomerService',
                                                 version='v201809')
    offset = 0
    # Get the account hierarchy for this account.
    selector = {
        'fields': ['CustomerId'],
        'predicates': [{
            'field': 'CanManageClients',
            'operator': 'EQUALS',
            'values': [False]
        }],
        'paging': {
            'startIndex': str(offset),
            'numberResults': str(PAGE_SIZE)
        }
    }

    # Using Queue to balance load between processes.
    queue = multiprocessing.Queue()
    more_pages = True

    while more_pages:
        page = managed_customer_service.get(selector)
        if page and 'entries' in page and page['entries']:
            for entry in page['entries']:
                queue.put(entry['customerId'])
        else:
            raise Exception('Can\'t retrieve any customer ID.')
        offset += PAGE_SIZE
        selector['paging']['startIndex'] = str(offset)
        more_pages = offset < int(page['totalNumEntries'])

    return queue


def main(client, report_download_directory):
    # Determine list of customer IDs to retrieve report for.
    input_queue = GetCustomerIDs(client)
    reports_succeeded = multiprocessing.Queue()
    reports_failed = multiprocessing.Queue()

    # Create report definition.
    report_definition = {
        'reportName': 'Custom ADGROUP_PERFORMANCE_REPORT',
        'dateRangeType': 'LAST_7_DAYS',
        'reportType': 'ADGROUP_PERFORMANCE_REPORT',
        'downloadFormat': 'CSV',
        'selector': {
            'fields': ['CampaignId', 'AdGroupId', 'Impressions', 'Clicks',
                       'Cost'],
            # Predicates are optional.
            'predicates': {
                'field': 'AdGroupStatus',
                'operator': 'IN',
                'values': ['ENABLED', 'PAUSED']
            }
        },
    }

    queue_size = input_queue.qsize()
    num_processes = min(queue_size, MAX_PROCESSES)
    print 'Retrieving %d reports with %d processes:' % (queue_size, num_processes)

    # Start all the processes.
    processes = [ReportWorker(report_download_directory,
                              report_definition, input_queue, reports_succeeded,
                              reports_failed)
                 for _ in range(num_processes)]
    for process in processes:
        process.start()
    for process in processes:
        process.join()

    print 'Finished downloading reports with the following results:'
    while True:
        try:
            success = reports_succeeded.get(timeout=0.01)
        except Empty:
            break
        print '\tReport for CustomerId "%d" succeeded.' % success['customerId']

    while True:
        try:
            failure = reports_failed.get(timeout=0.01)
        except Empty:
            break
        print ('\tReport for CustomerId "%d" failed with error code "%s" and '
               'message: %s.' % (failure['customerId'], failure['code'],
                                 failure['message']))


if __name__ == '__main__':
    adwords_client = googleads.adwords.AdWordsClient.LoadFromStorage(
        'googleads.yaml')
    main(adwords_client, REPORT_DOWNLOAD_DIRECTORY)
How can I get the performance reports for multiple MCC accounts?
You need to create a separate googleads.adwords.AdWordsClient instance for each account, since one client can only work with one AdWords account (MCC or single account).
To create the AdWordsClient instances, you can automate the flow without using a YAML file for configuration and use the code below (the rest of the code remains the same):
"""Initializes a AdManagerClient without using yaml-cached credentials.
While our LoadFromStorage method provides a useful shortcut to instantiate a
client if you regularly use just one set of credentials, production applications
may need to swap out users. This example shows you how to create an OAuth2
client and a AdManagerClient without relying on a yaml file.
"""
from googleads import ad_manager
from googleads import oauth2
# OAuth2 credential information. In a real application, you'd probably be
# pulling these values from a credential storage.
CLIENT_ID = 'INSERT_CLIENT_ID_HERE'
CLIENT_SECRET = 'INSERT_CLIENT_SECRET_HERE'
REFRESH_TOKEN = 'INSERT_REFRESH_TOKEN_HERE'
# Ad Manager API information.
APPLICATION_NAME = 'INSERT_APPLICATION_NAME_HERE'
# Client customer id
CLIENT_CUSTOMER_ID = 'INSERT_CLIENT_CUSTOMER_ID_HERE'
def main(client_id, client_secret, refresh_token, application_name):
oauth2_client = oauth2.GoogleRefreshTokenClient(
client_id, client_secret, refresh_token)
ad_manager_client = ad_manager.AdManagerClient(
oauth2_client, application_name,client_customer_id=CLIENT_CUSTOMER_ID)
networks = ad_manager_client.GetService('NetworkService').getAllNetworks()
for network in networks:
print ('Network with network code "%s" and display name "%s" was found.'
% (network['networkCode'], network['displayName']))
if __name__ == '__main__':
main(CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN, APPLICATION_NAME)
Code reference
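To tie this back to the original question, here is a sketch of my own (not from the answer) under the assumption of a one-column accounts.csv of MCC client customer IDs that share the same OAuth2 credentials: it builds one AdWordsClient per MCC and reuses the existing report pipeline for each.

# Sketch: build one AdWordsClient per MCC ID read from a CSV file.
# 'accounts.csv' and the credential constants are illustrative assumptions.
import csv

from googleads import adwords
from googleads import oauth2

DEVELOPER_TOKEN = 'INSERT_DEVELOPER_TOKEN_HERE'
CLIENT_ID = 'INSERT_CLIENT_ID_HERE'
CLIENT_SECRET = 'INSERT_CLIENT_SECRET_HERE'
REFRESH_TOKEN = 'INSERT_REFRESH_TOKEN_HERE'
USER_AGENT = 'INSERT_USER_AGENT_HERE'

def load_mcc_ids(path='accounts.csv'):
    # One MCC client customer ID per row, e.g. 123-456-7890
    with open(path) as f:
        return [row[0] for row in csv.reader(f) if row]

if __name__ == '__main__':
    oauth2_client = oauth2.GoogleRefreshTokenClient(
        CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN)
    for mcc_id in load_mcc_ids():
        client = adwords.AdWordsClient(
            DEVELOPER_TOKEN, oauth2_client, USER_AGENT,
            client_customer_id=mcc_id)
        # Reuse the report pipeline from the question for this MCC's hierarchy.
        main(client, REPORT_DOWNLOAD_DIRECTORY)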

Is there a method to retrieve all tracks from a playlist using Spotipy?

I'm trying to extract all songs from one of my playlists in Spotify, using the spotipy package in Python. Below is the code I have so far, which only gets 100 tracks. I tried poking around the internet for solutions, but nothing appeared to work.
# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import spotipy
from matplotlib import style
from spotipy import util
from spotipy.oauth2 import SpotifyClientCredentials

# Set up connection
client_id = 'id'  # Need to create developer profile
client_secret = 'secret'
username = 'username'  # Store username
scope = 'user-library-read playlist-modify-public playlist-read-private'
redirect_uri = 'uri'

client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret)  # Create manager for ease
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
token = util.prompt_for_user_token(username, scope, client_id,
                                   client_secret, redirect_uri)
if token:
    sp = spotipy.Spotify(auth=token)
else:
    print("Can't get token for", username)

# Connect to the best playlist, like ever
playlist_id = 'playlist'
playlist = sp.user_playlist(username, playlist_id)

# Extract list of songs/tracks
tracks = playlist["tracks"]
songs = tracks["items"]
track_ids = []
track_names = []

for i in range(0, len(songs)):
    if songs[i]['track']['id'] != None:  # Removes local tracks, if any
        track_ids.append(songs[i]['track']['id'])
        track_names.append(songs[i]['track']['name'])
You can use offset to collect the results from multiple calls. This function returns a list of track objects.
def user_playlist_tracks_full(spotify_connection, user, playlist_id=None, fields=None, market=None):
    """ Get full details of the tracks of a playlist owned by a user.
    https://developer.spotify.com/documentation/web-api/reference/playlists/get-playlists-tracks/

    Parameters:
        - user - the id of the user
        - playlist_id - the id of the playlist
        - fields - which fields to return
        - market - an ISO 3166-1 alpha-2 country code.
    """
    # The first call also retrieves the total number of tracks in the playlist.
    response = spotify_connection.user_playlist_tracks(user, playlist_id, fields=fields, limit=100, market=market)
    results = response["items"]

    # Keep paging with an increasing offset until all tracks have been read.
    while len(results) < response["total"]:
        response = spotify_connection.user_playlist_tracks(
            user, playlist_id, fields=fields, limit=100, offset=len(results), market=market
        )
        results.extend(response["items"])

    return results
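A usage sketch tying this back to the original snippet; sp, username, and playlist_id are from the question, the rest is illustrative:

# Fetch every track in the playlist, then build the id/name lists as before.
songs = user_playlist_tracks_full(sp, username, playlist_id)

track_ids = []
track_names = []
for song in songs:
    track = song['track']
    if track and track['id'] is not None:  # skip local tracks without an id
        track_ids.append(track['id'])
        track_names.append(track['name'])

print(len(track_ids), "tracks collected")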

AWS Lambda - How do I convert my code to work in AWS?

I'm struggling to get a Lambda function working. I have a Python script that accesses the Twitter API, pulls information, and exports that information into an Excel sheet. I'm trying to move this script over to AWS Lambda and I'm having a lot of trouble.
What I've done so far: created an AWS account, set up an S3 bucket, and poked around trying to get things to work.
I think the main area I'm struggling with is how to go from a Python script that I execute via the local CLI to Lambda-capable code. I'm not sure I understand how the lambda_handler function works, what the event or context arguments actually mean (despite watching half a dozen tutorial videos), or how to integrate my existing functions into Lambda in the context of the lambda_handler. I'm just very confused and hoping someone might be able to help me get some clarity!
Code that I'm using to pull twitter data (just a sample):
import time
import datetime
import keys
import pandas as pd
from twython import Twython, TwythonError
import pymysql

def lambda_handler(event, context):
    def oauth_authenticate():
        twitter_oauth = Twython(keys.APP_KEY, keys.APP_SECRET, oauth_version=2)
        ACCESS_TOKEN = twitter_oauth.obtain_access_token()
        twitter = Twython(keys.APP_KEY, access_token=ACCESS_TOKEN)
        return twitter

    def get_username():
        """
        Prompts for the screen name of the targeted account
        """
        username = input("Enter the Twitter screenname you'd like information on. Do not include '#':")
        return username

    def get_user_followers(username):
        """
        Returns data on all accounts following the targeted user.
        WARNING: The number of followers can be huge, and the data isn't very valuable
        """
        # username = get_username()
        # import pdb; pdb.set_trace()
        twitter = oauth_authenticate()
        datestamp = str(datetime.datetime.now().strftime("%Y-%m-%d"))
        target = twitter.lookup_user(screen_name=username)
        for y in target:
            target_id = y['id_str']
        next_cursor = -1
        index = 0
        followersdata = {}
        while next_cursor:
            try:
                get_followers = twitter.get_followers_list(screen_name=username,
                                                           count=200,
                                                           cursor=next_cursor)
                for x in get_followers['users']:
                    followersdata[index] = {}
                    followersdata[index]['screen_name'] = x['screen_name']
                    followersdata[index]['id_str'] = x['id_str']
                    followersdata[index]['name'] = x['name']
                    followersdata[index]['description'] = x['description']
                    followersdata[index]['date_checked'] = datestamp
                    followersdata[index]['targeted_account_id'] = target_id
                    index = index + 1
                next_cursor = get_followers["next_cursor"]
            except TwythonError as e:
                print(e)
                remainder = (float(twitter.get_lastfunction_header(header='x-rate-limit-reset'))
                             - time.time()) + 1
                print("Rate limit exceeded. Waiting for:", remainder/60, "minutes")
                print("Current Time is:", time.strftime("%I:%M:%S"))
                del twitter
                time.sleep(remainder)
                twitter = oauth_authenticate()
                continue
        followersDF = pd.DataFrame.from_dict(followersdata, orient="index")
        followersDF.to_excel("%s-%s-follower list.xlsx" % (username, datestamp),
                             index=False, encoding='utf-8')
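As a rough illustration of the handler pattern (my own sketch, not from the original post): Lambda calls lambda_handler(event, context) on each invocation, where event is the payload from whatever triggered the function (a test event, API Gateway request, schedule, etc.) and context carries runtime metadata. A minimal sketch that reads a screen name from the event, leaves room for the existing functions, and uploads the result to S3 instead of keeping a local Excel file; the bucket name and the 'username' event key are placeholders:

# Minimal handler sketch: event supplies input, the return value is the response.
# 'my-bucket' and the event key 'username' are illustrative assumptions.
import json
import os

import boto3

s3 = boto3.client('s3')  # created once, reused across warm invocations

def lambda_handler(event, context):
    # event: dict built from the trigger payload (e.g. a test event or API call)
    username = event.get('username', 'default_account')

    # Call your existing functions here, e.g. get_user_followers(username),
    # writing any file output under /tmp (the only writable path in Lambda).
    local_path = '/tmp/{}-followers.xlsx'.format(username)

    # Upload the result to S3 instead of keeping it on the local disk.
    if os.path.exists(local_path):
        s3.upload_file(local_path, 'my-bucket', 'followers/{}.xlsx'.format(username))

    return {
        'statusCode': 200,
        'body': json.dumps({'processed': username}),
    }

Note that interactive calls like input() (used in get_username above) have no place in Lambda, since there is no terminal attached; the equivalent input should come in through the event payload instead.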
