I am using the Google Analytics API to extract data and would like to dump a year's worth of data into a CSV file. I have also implemented splitting to handle large results. When I run the code, it starts dumping data into the CSV, but it crashes with the following message, and this happens every time I run it. It should also be noted that I tried extracting data for just one day and still hit the same error.
googleapiclient.errors.HttpError: <HttpError 503 when requesting https://analyticsreporting.googleapis.com/v4/reports:batchGet?alt=json returned "The service is currently unavailable.">
The following is my code. Any help on this would be greatly appreciated.
import httplib2 as lib2
import google.oauth2.credentials
from google_auth_httplib2 import AuthorizedHttp
from datetime import datetime
import psycopg2
#Packages needed for connecting with Google API
from googleapiclient.discovery import build as google_build
#Data processing packages
import pandas
import numpy
import json
from datetime import datetime, timedelta
access_token = "***********"
refresh_token = "**********"
client_id = "***********"
client_secret = "*************"
token_uri = 'https://oauth2.googleapis.com/token'
token_expiry = datetime.now() - timedelta(days = 1)
#¯\_(ツ)_/¯
user_agent = 'my-user-agent/1.0'
credentials = google.oauth2.credentials.Credentials(access_token,
                                                    refresh_token=refresh_token,
                                                    token_uri='https://oauth2.googleapis.com/token',
                                                    client_id=client_id,
                                                    client_secret=client_secret)
#Authorize client
authorized = AuthorizedHttp(credentials=credentials)
api_name = 'analyticsreporting'
api_version = 'v4'
#Let's build the client
api_client_1dayactiveusers = google_build(serviceName=api_name, version=api_version, http=authorized)
pageToken_1dayactiveusers='firstcall'
# for user types
while pageToken_1dayactiveusers != None:
    sample_request = {
        'viewId': '**********',
        'dateRanges': {
            'startDate': datetime.strftime(datetime.now() - timedelta(days=365), '%Y-%m-%d'),
            'endDate': datetime.strftime(datetime.now(), '%Y-%m-%d')
        },
        'dimensions': [{'name': 'ga:date'}],
        'metrics': [{'expression': 'ga:1dayUsers', 'alias': 'onedayusers'}],
        'pageToken': pageToken_1dayactiveusers
    }
    response_1dayactiveusers = api_client_1dayactiveusers.reports().batchGet(
        body={
            'reportRequests': sample_request
        }).execute()
    print(response_1dayactiveusers)
    pageToken = response_1dayactiveusers.get("reports")[0].get('nextPageToken', None)
    print(pageToken)
def parse_response(report):
    """Parses and prints the Analytics Reporting API V4 response"""
    # Initialize results, in list format because two dataframes might return
    result_list = []
    # Initialize empty data container for the two dateranges (if there are two that is)
    data_csv = []
    data_csv2 = []
    # Initialize header rows
    header_row = []
    # Get column headers, metric headers, and dimension headers.
    columnHeader = report.get('columnHeader', {})
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    dimensionHeaders = columnHeader.get('dimensions', [])
    # Combine all of those headers into the header_row, which is in a list format
    for dheader in dimensionHeaders:
        header_row.append(dheader)
    for mheader in metricHeaders:
        header_row.append(mheader['name'])
    # Get data from each of the rows, and append them into a list
    rows = report.get('data', {}).get('rows', [])
    for row in rows:
        row_temp = []
        dimensions = row.get('dimensions', [])
        metrics = row.get('metrics', [])
        for d in dimensions:
            row_temp.append(d)
        for m in metrics[0]['values']:
            row_temp.append(m)
        data_csv.append(row_temp)
    # Putting those list formats into pandas dataframe, and append them into the final result
    result_df = pandas.DataFrame(data_csv, columns=header_row)
    result_list.append(result_df)
    return result_list
response_data = response_1dayactiveusers.get('reports', [])[0]
df = parse_response(response_data)[0]
df.to_csv('/Users/ga_csv_2.csv', mode='a', header=False)
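A common way to make an extract like this more resilient to transient 503 responses is to retry the batchGet call with exponential backoff. Below is a minimal sketch, not part of the original code: execute_with_backoff is a hypothetical helper, it reuses the request objects built in the loop above, and the retry counts and sleep times are arbitrary.

import time
from googleapiclient.errors import HttpError

def execute_with_backoff(request, max_attempts=5):
    """Retry a googleapiclient request on transient 5xx errors with exponential backoff."""
    for attempt in range(max_attempts):
        try:
            # num_retries lets the client library retry some transient failures on its own
            return request.execute(num_retries=2)
        except HttpError as err:
            if err.resp.status in (500, 502, 503) and attempt < max_attempts - 1:
                time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...
            else:
                raise

# Hypothetical usage with the request built in the loop above:
# response_1dayactiveusers = execute_with_backoff(
#     api_client_1dayactiveusers.reports().batchGet(body={'reportRequests': sample_request}))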
I am trying to collect data from e-conomic.dk, but the API will only let me retrieve 1000 rows per call, so I think I need to create a list with one entry for every 1000 rows in the data source. After that I need logic that skips the first 1000 rows if they have already been retrieved and then continues to retrieve the next 1000 rows, and so on. A sketch of such a loop is shown after the code below.
API doc: https://restdocs.e-conomic.com/#endpoints
This is my Python code:
#This code calls the ENTRIES API from e-conomic and sends data to SQL
#API doc = https://restdocs.e-conomic.com/
import requests
import json
import openpyxl
import pandas as pd
import sqlalchemy.engine as sqle
HEADERS = {
    "X-AgreementGrantToken": "demo",
    "X-AppSecretToken": "demo",
}

def get_api_data(endpoint):
    url = "https://restapi.e-conomic.com"
    query = {
        "pagesize": "1000",
        "skippages": str(0)
    }
    response = requests.get(f'{url}/{endpoint}', headers=HEADERS, params={**query})
    data = response.json()
    return data

def get_db_engine():
    conn_str = "DRIVER={SQL SERVER};SERVER=JAKOB-MSI;DATABASE=MightyMonday;TRUSTED_CONNECTION=yes"
    conn_url = sqle.URL.create("mssql+pyodbc", query={'odbc_connect': conn_str})
    engine = sqle.create_engine(conn_url)
    return engine
source = get_api_data("accounting-years")
collection = source["collection"]
# print(source)
entries = pd.DataFrame(collection)
res = requests.get(entries['entries'][0] + '?skippages=0&pagesize=30', headers = HEADERS)
data = res.json()
dataset = pd.DataFrame(data['collection'])
dataset['accountNumber'] = [d.get('accountNumber') for d in dataset.account]
dataset = dataset[['accountNumber', 'amountInBaseCurrency', 'date', 'text']]
#Change the target SQL table here
dataset.to_sql('Entries_demo_values', con = get_db_engine(), if_exists = 'replace', index = False)
This is fully functional M code, written in Power BI, that skips the rows as intended.
let
    function1 = (EndPoint as text, PagesToSkip as number) =>
        let
            Url = "https://restapi.e-conomic.com",
            Headers = [#"X-AgreementGrantToken"="demo", #"X-AppSecretToken"="demo"],
            Query = [pagesize = "1000", skippages = Text.From(PagesToSkip)],
            data = Json.Document(Web.Contents(Url, [Headers = Headers, Query = Query, RelativePath = EndPoint]))
        in data,
    function2 = (tal) => List.Generate(()=>0, each _ <= tal, each _ +1),
    Source = function1("accounts", 0),
    collection = Source[collection],
    #"Converted to Table" = Table.FromList(collection, Splitter.SplitByNothing(), null, null, ExtraValues.Error),
    #"Expanded Column1" = Table.ExpandRecordColumn(#"Converted to Table", "Column1", {"accountNumber", "accountType", "balance", "blockDirectEntries", "debitCredit", "name", "accountingYears", "self", "vatAccount", "totalFromAccount", "openingAccount", "accountsSummed"}, {"accountNumber", "accountType", "balance", "blockDirectEntries", "debitCredit", "name", "accountingYears", "self", "vatAccount", "totalFromAccount", "openingAccount", "accountsSummed"}),
    #"Removed Other Columns" = Table.SelectColumns(#"Expanded Column1",{"accountNumber", "accountType", "name"}),
    #"Added Conditional Column" = Table.AddColumn(#"Removed Other Columns", "KontoOverskrift", each if [accountType] = "heading" then [name] else null),
    #"Filled Down" = Table.FillDown(#"Added Conditional Column",{"KontoOverskrift"}),
    #"Changed Type" = Table.TransformColumnTypes(#"Filled Down",{{"accountNumber", Int64.Type}, {"accountType", type text}, {"name", type text}, {"KontoOverskrift", type text}})
in
    #"Changed Type"
With this Python code I am able to retrieve the data and send it to Excel, but it is missing the skip-1000-rows logic explained above.
#This code calls the account API from e-conomic and writes a Excel file with the data
import requests
import json
import openpyxl
import pandas as pd
def Function1(endpoint):
    url = "https://restapi.e-conomic.com"
    headers = {
        "X-AgreementGrantToken": "demo",
        "X-AppSecretToken": "demo",
    }
    query = {
        "pagesize": "1000",
        "skippages": str(0)
    }
    response = requests.get(f'{url}/{endpoint}', headers=headers, params={**query})
    data = response.json()
    return data

def function2(tal):
    return range(0, tal+1)
source = Function1("accounts")
# source = function1("accounts", 0)
collection = source["collection"]
dataset = pd.DataFrame(collection)
data = dataset[['accountNumber','accountType','name']]
data.to_excel('AccountDataAuto.xlsx', index = False)
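For the skip-1000-rows logic, one option is to keep incrementing `skippages` until the API returns fewer rows than the page size. The sketch below is only an assumption about how that loop could look, not a tested solution; `get_all_pages` is a hypothetical helper that reuses the `HEADERS` constant defined in the first script above.

import requests
import pandas as pd

def get_all_pages(endpoint, page_size=1000):
    """Fetch every page of an e-conomic collection by stepping the skippages parameter."""
    url = "https://restapi.e-conomic.com"
    rows, page = [], 0
    while True:
        query = {"pagesize": str(page_size), "skippages": str(page)}
        response = requests.get(f"{url}/{endpoint}", headers=HEADERS, params=query)
        response.raise_for_status()
        collection = response.json().get("collection", [])
        rows.extend(collection)
        if len(collection) < page_size:  # last (possibly partial) page reached
            break
        page += 1
    return pd.DataFrame(rows)

# Hypothetical usage:
# accounts = get_all_pages("accounts")
# accounts[['accountNumber', 'accountType', 'name']].to_excel('AccountDataAuto.xlsx', index=False)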
Why am I getting an error on the batch update line?
from googleapiclient.discovery import build
from google.oauth2 import service_account
import pandas as pd
# Authenticate into Target worksheet
# Provide the right methods from google and the right scopes (app services)
SERVICE_ACCOUNT_FILE = 'keys.json'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']
creds = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)
# The ID for the target spreadsheet.
TARGET_WORKBOOK = 'xxx'
service = build('sheets', 'v4', credentials=creds)
sheet_id = 2083229665
# Build the service object for the Google Sheets API
service = build('sheets', 'v4', credentials=creds)
# Load the DataFrame
data = [['Name', 'Age'], ['Alice', 25], ['Bob', 30],['steve','55'], ['gayle','54']]
df = pd.DataFrame(data)
datalist = df.values.tolist()
# Define the range of the data
range_ = 'ExpenseLog!A1:' + chr(ord('A') + len(df.columns) - 1) + str(len(df))
# Define the request body Execute the request
requests = []
for i, row in enumerate(datalist):
    # Format the row values into the proper structure
    values = [{'userEnteredValue': {'stringValue': cell}} for cell in row]
    # Create the updateCells request for the current row
    request = {
        'updateCells': {
            'range': {
                'sheetId': sheet_id,
                'startRowIndex': i,
                'endRowIndex': i + 1,
                'startColumnIndex': 0,
                'endColumnIndex': len(row)
            },
            'rows': [
                {
                    'values': values
                }
            ],
            'fields': 'userEnteredValue'
        }
    }
    # Append the request to the list of requests
    requests.append(request)

# Perform the batch update
result = service.spreadsheets().batchUpdate(spreadsheetId=TARGET_WORKBOOK,
                                            body={'requests': requests}).execute()
I've seen several attempts at describing this, and they all seem to assume that passing requests as a list in the body will work, but it doesn't seem to.
Any thoughts?
Thanks
Hundreds of print statements later, everything seems to have the right values, but they are not being read when used in the last line of the code.
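One detail worth checking in the snippet above is the cell type: `stringValue` expects a string, while the sample rows mix in integers (25 and 30). The sketch below shows a per-cell conversion; `to_extended_value` is a hypothetical helper, and this is only a guess at a possible cause rather than a confirmed fix.

def to_extended_value(cell):
    """Map a Python value onto the Sheets API ExtendedValue shape."""
    if isinstance(cell, bool):
        return {'userEnteredValue': {'boolValue': cell}}
    if isinstance(cell, (int, float)):
        return {'userEnteredValue': {'numberValue': cell}}
    return {'userEnteredValue': {'stringValue': str(cell)}}

# Hypothetical replacement for the list comprehension in the loop above:
# values = [to_extended_value(cell) for cell in row]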
I am trying to get reports such as geo_performance_report, keywords_performance_report, etc., but I am unable to figure out how to do this with the new version of the google-ads API.
I tried the approach below using the new google-ads library but was not successful.
Is there any other way to automate this process using Python?
def googleads_report(client, client_id, report_type, columns, start_date, end_date):
    client.SetClientCustomerId(client_id)
    report_downloader = googleads_client.GetReportDownloader(version="v201809")
    report = {
        'reportName': 'report-google-campaign-performance',
        'dateRangeType': 'CUSTOM_DATE',
        'reportType': report_type,
        'downloadFormat': 'CSV',
        'selector': {
            'fields': columns,
            'dateRange': {'min': start_date, 'max': end_date}
        }
    }
    file = io.StringIO(report_downloader.DownloadReportAsString(
        report,
        skip_report_header=True,
        skip_column_header=True,
        skip_report_summary=True,
        include_zero_impressions=False)
    )
    df = pd.read_csv(file, names=columns)
    return df
def main(client, customer_id):
    keyword_columns = [
        'Date',
        'AccountDescriptiveName',
        'AdGroupId',
        'AdGroupName',
        'AdGroupStatus',
        'CampaignId',
        'CampaignName',
        'CampaignStatus',
        'CpcBid',
        'Criteria',
        'CriteriaDestinationUrl',
        'ExternalCustomerId',
        'FirstPageCpc',
        'FirstPositionCpc',
        'Id',
        'KeywordMatchType',
        'Labels',
        'QualityScore',
        'SearchImpressionShare',
        'Status',
        'TopOfPageCpc',
        'Clicks',
        'Conversions',
        'Cost',
        'ConversionValue',
        'Impressions',
        'ViewThroughConversions'
    ]
    report_types = [
        'KEYWORDS_PERFORMANCE_REPORT'
    ]
    for report in report_types:
        base_df = pd.DataFrame()
        if report == 'CAMPAIGN_PERFORMANCE_REPORT':
            table_suffix = 'campaigns'
            #columns = campaign_columns
        elif report == 'KEYWORDS_PERFORMANCE_REPORT':
            table_suffix = 'keywords'
            columns = keyword_columns
        elif report == 'AD_PERFORMANCE_REPORT':
            table_suffix = 'ads'
            #columns = ad_columns
        start_date = '2019-01-01'
        df = googleads_report(client, customer_id, report, columns, start_date, yesterday)
        df = df.applymap(str)
        # Powershell output
        print(df.head())
        # csv output
        df.to_csv('my_path' + table_suffix + '.csv')
if __name__ == "__main__":
# GoogleAdsClient will read the google-ads.yaml configuration file in the
# home directory if none is specified.
googleads_client = GoogleAdsClient.load_from_storage(path="mypath")
today = datetime.now().date()
yesterday = today - timedelta(days=1)
thirty_days_ago = today - timedelta(days=30)
try:
main( googleads_client, "#######")
except GoogleAdsException as ex:
print(
f'Request with ID "{ex.request_id}" failed with status '
f'"{ex.error.code().name}" and includes the following errors:'
)
for error in ex.failure.errors:
print(f'\tError with message "{error.message}".')
if error.location:
for field_path_element in error.location.field_path_elements:
print(f"\t\tOn field: {field_path_element.field_name}")
sys.exit(1)
Based upon your version='v201809', you are not using the most up-to-date version of the Google Ads API. That version of the API is scheduled for deprecation in spring 2022.
The newest version of the Google Ads API uses a query language in its reporting examples.
Google Ads provides a mapping from the common reports to the fields required in that query language.
Once your client is authenticated with a newer version of the API, you can post the Google Ads query to the client.
from google.ads.googleads.client import GoogleAdsClient
client = GoogleAdsClient.load_from_storage("creds")
service = client.get_service("GoogleAdsService", version="v9")
#query below pulls all accounts that your MCC has access to
query = """
SELECT
customer_client.client_customer,
customer_client.level,
customer_client.manager,
customer_client.id
FROM customer_client
WHERE customer_client.manager != True
"""
search_request = client.get_type("SearchGoogleAdsStreamRequest")
search_request.customer_id = "1234adswordscustomer_id"
search_request.query = query
response = service.search_stream(search_request)
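The stream can then be iterated batch by batch, with each row exposing the selected fields as attributes. Below is a minimal sketch of collecting the response above into a pandas DataFrame; the field access mirrors the query, while the DataFrame step itself is an addition rather than part of the original answer.

import pandas as pd

rows = []
for batch in response:
    for row in batch.results:
        rows.append({
            "client_customer": row.customer_client.client_customer,
            "level": row.customer_client.level,
            "manager": row.customer_client.manager,
            "id": row.customer_client.id,
        })

df = pd.DataFrame(rows)
print(df.head())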
I am working on a Google Cloud Function with the intention of putting the results into a DataFrame and then loading all of that into BigQuery. The function deployed without error, but when I look into the associated BigQuery table I see no data. Below is a view of my code:
# general setup, common imports
import json, requests, time, urllib.parse
import pandas as pd
from pandas import DataFrame
import datetime
import io
import os
from google.cloud import bigquery
from google.cloud.bigquery.client import Client
def crux_data():
    # Read the URLs for auditing
    url_list = open('pagespeedlist', 'r')
    url_list.read()
    results = []
    for x in url_list:
        url = x[0]
        pagespeed_results = urllib.request.urlopen('https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={}&strategy=mobile&key=API_KEY'\
            .format(url)).read().decode('UTF-8')
        pagespeed_results_json = json.loads(pagespeed_results)
        add_date = datetime.date.today()
        largest_contentful_paint = pagespeed_results_json['lighthouseResult']['audits']['largest-contentful-paint']['displayValue'].replace(u'\xa0', u'')  # Largest Contenful Paint
        first_input_delay = str(round(pagespeed_results_json['loadingExperience']['metrics']['FIRST_INPUT_DELAY_MS']['distributions'][2]['proportion'] * 1000, 1)) + 'ms'  # First Input Delay
        cumulative_layout_shift = pagespeed_results_json['lighthouseResult']['audits']['cumulative-layout-shift']['displayValue']  # CLS
        crux_lcp = pagespeed_results_json['loadingExperience']['metrics']['LARGEST_CONTENTFUL_PAINT_MS']['category']  # Largest Contenful Paint Score
        crux_fid = pagespeed_results_json['loadingExperience']['metrics']['FIRST_INPUT_DELAY_MS']['category']  # First Input Delay Score
        crux_cls = pagespeed_results_json['loadingExperience']['metrics']['CUMULATIVE_LAYOUT_SHIFT_SCORE']['category']  # CLS Score
        result_url = [url, date, largest_contentful_paint, first_input_delay, cumulative_layout_shift, lcp_score, fid_score, cls_score]
        results.append(result_url)

    # Convert to dataframe
    results_csv = DataFrame(results, columns=['URL', 'DATE', 'LCP', 'FID', 'CLS', 'LCP_SCORE', 'FID_SCORE', 'CLS_SCORE'])

    # Construct a BigQuery client object.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'credentials.json'
    client = Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "db.datatable.dataLoc"

    job_config = bigquery.LoadJobConfig()
    job = client.load_table_from_dataframe(
        results_csv, table_id, job_config=job_config
    )  # Make an API request.
    job.result()  # Wait for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
I do see the proper schema in the BigQuery table, but no actual data. Is there something I am missing when loading a DataFrame to BigQuery?
Any help is much appreciated!
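One way to narrow this down is to check the DataFrame size and the load job's own bookkeeping right after the load, since a job that completes with zero input rows usually means the DataFrame was empty before it ever reached BigQuery. The sketch below is a hypothetical diagnostic, assuming the same `client`, `results_csv`, and `table_id` as in the function above.

# Hypothetical diagnostics to add around the load call:
print(f"DataFrame has {len(results_csv)} rows")  # 0 here means the URL loop never appended anything

job = client.load_table_from_dataframe(results_csv, table_id, job_config=job_config)
job.result()  # waits for completion and raises on a hard failure

if job.errors:
    print("Load job errors:", job.errors)
print(f"Load job reported {job.output_rows} output rows for {table_id}")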
I'm still very early in my Python journey and I'm quite sure the answer to this is obvious. I've read a good half dozen similar posts but can't quite wrap my head around what the solution might be for my use case.
What I'm trying to do:
Automatically loop through Google Analytics reports day-by-day, loading each one to an external source once each day has been put into a Pandas DataFrame.
What I'm happy with:
Most of the process! I'm pulling down simple queries, loading them up to the DataFrame, converting them to CSV and shooting them to where they need to go.
What I need help with:
In trying to make my solution a little more foolproof, I want to make sure that I step through any pagination that exists in the Google Analytics results, since queries are limited to 100k rows. I can't find a good example anywhere that shows how I might implement this kind of additional check and query, and how I might append the results to my DataFrame.
I've cut out a lot of the bloat, but essentially this is the code I'm working with. Note that I've purposely limited the page size to 200.
import time
import pandas as pd
from pandas import json_normalize
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "secrets.json"
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import json
jsonConfig = open("config.json", "r")
configRead = json.loads(jsonConfig.read())
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = configRead['keyFileLocation']
VIEW_ID = configRead['gaViewId']
startDateOrig = 7
endDateOrig = 7
startDate = 7
endDate = 7
endDateScope = 3
def request1(page_token=None):
    req1 = {
        'reportRequests':
            [
                {
                    'viewId': VIEW_ID,
                    'dateRanges':
                        [{'startDate': f'{startDate}daysAgo',
                          'endDate': f'{endDate}daysAgo'}],
                    'metrics':
                        [{'expression': 'ga:sessions'}],
                    'dimensions':
                        [{'name': 'ga:date'},
                         {'name': 'ga:sourceMedium'},
                         {'name': 'ga:landingPagePath'}],
                    'pageSize': 200,
                    'pageToken': page_token,
                    'orderBys':
                        [{'fieldName': 'ga:sessions',
                          'sortOrder': 'DESCENDING'}]
                }
            ]
    }
    return req1
def initialize_analyticsreporting():
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)
    analytics = build('analyticsreporting', 'v4', credentials=credentials)
    return analytics

def get_report(analytics):
    global startDate
    global endDate
    return analytics.reports().batchGet(
        body=request1()
    ).execute()
    # Add in page_token loop here?
def parse_data(response):
    df = pd.DataFrame
    reports = response['reports'][0]
    columnHeader = reports['columnHeader']['dimensions']
    metricHeader = reports['columnHeader']['metricHeader']['metricHeaderEntries']
    columns = columnHeader
    for metric in metricHeader:
        columns.append(metric['name'])
    data = json_normalize(reports['data']['rows'])
    data_dimensions = df(data['dimensions'].tolist())
    data_metrics = df(data['metrics'].tolist())
    data_metrics = data_metrics.applymap(lambda x: x['values'])
    data_metrics = df(data_metrics[0].tolist())
    result = pd.concat([data_dimensions, data_metrics], axis=1, ignore_index=True)
    result.to_csv('result.csv', index=False, header=False)
    return result
def resetDates():
    global startDateOrig
    global endDateOrig
    global startDate
    global endDate
    startDate = startDateOrig
    endDate = endDateOrig
    return startDate
    return endDate

def main():
    global startDate
    global endDate
    global endDateScope
    resetDates()
    analytics = initialize_analyticsreporting()
    while startDate >= endDateScope:
        print(f'Running report for {startDate} days ago')
        response = get_report(analytics)
        parse_data(response)
        startDate -= 1
        endDate -= 1
        time.sleep(0.2)
    resetDates()
    print("Done now, cheers")
You might notice that I've got a comment in the get_report() function. Would that be the best place to add in a loop to pull any additional pages of data required? I'm thinking the parse_data() function would also need to be modified and possibly the response variable turned into an array/list?
Any help would be appreciated!
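For what it's worth, here is a minimal sketch of how a pagination loop could be wired around request1() and batchGet, following nextPageToken until it runs out and concatenating each page before writing the CSV. The helper name and the pd.concat step are assumptions, not part of the original code.

def get_report_all_pages(analytics):
    """Collect every page of the report by following nextPageToken."""
    page_token = None
    responses = []
    while True:
        response = analytics.reports().batchGet(body=request1(page_token)).execute()
        responses.append(response)
        page_token = response['reports'][0].get('nextPageToken')
        if not page_token:  # no more pages for this date range
            break
    return responses

# Hypothetical usage inside main(), replacing the single get_report()/parse_data() pair:
# frames = [parse_data(resp) for resp in get_report_all_pages(analytics)]
# result = pd.concat(frames, ignore_index=True)
# result.to_csv('result.csv', index=False, header=False)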