I am working on a Google Cloud Function with the intention of putting the results in a DataFrame and then loading all of that into BigQuery. The function deployed without error, but when I look at the associated BQ table I see no data. Below is a view of my code:
# general setup, common imports
import json, requests, time, urllib.request
import pandas as pd
from pandas import DataFrame
import datetime
import io
import os
from google.cloud import bigquery
from google.cloud.bigquery.client import Client
def crux_data():
    # Read the URLs for auditing
    url_list = open('pagespeedlist', 'r')
    url_list.read()

    results = []
    for x in url_list:
        url = x[0]
        pagespeed_results = urllib.request.urlopen('https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={}&strategy=mobile&key=API_KEY'\
            .format(url)).read().decode('UTF-8')
        pagespeed_results_json = json.loads(pagespeed_results)

        add_date = datetime.date.today()
        largest_contentful_paint = pagespeed_results_json['lighthouseResult']['audits']['largest-contentful-paint']['displayValue'].replace(u'\xa0', u'')  # Largest Contentful Paint
        first_input_delay = str(round(pagespeed_results_json['loadingExperience']['metrics']['FIRST_INPUT_DELAY_MS']['distributions'][2]['proportion'] * 1000, 1)) + 'ms'  # First Input Delay
        cumulative_layout_shift = pagespeed_results_json['lighthouseResult']['audits']['cumulative-layout-shift']['displayValue']  # CLS
        crux_lcp = pagespeed_results_json['loadingExperience']['metrics']['LARGEST_CONTENTFUL_PAINT_MS']['category']  # Largest Contentful Paint score
        crux_fid = pagespeed_results_json['loadingExperience']['metrics']['FIRST_INPUT_DELAY_MS']['category']  # First Input Delay score
        crux_cls = pagespeed_results_json['loadingExperience']['metrics']['CUMULATIVE_LAYOUT_SHIFT_SCORE']['category']  # CLS score

        result_url = [url, add_date, largest_contentful_paint, first_input_delay, cumulative_layout_shift, crux_lcp, crux_fid, crux_cls]
        results.append(result_url)

    # Convert to dataframe
    results_csv = DataFrame(results, columns=['URL', 'DATE', 'LCP', 'FID', 'CLS', 'LCP_SCORE', 'FID_SCORE', 'CLS_SCORE'])

    # Construct a BigQuery client object.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'credentials.json'
    client = Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "db.datatable.dataLoc"

    job_config = bigquery.LoadJobConfig()

    job = client.load_table_from_dataframe(
        results_csv, table_id, job_config=job_config
    )  # Make an API request.
    job.result()  # Wait for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
I do see the proper schema in the bq table but no actual data. Is there something I am missing with loading a df to bigquery?
Any help is much appreciated!
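For reference, a stripped-down sanity check of just the DataFrame-to-BigQuery step is sketched below (the table ID and row values are placeholders); if a hard-coded row loads fine, the load call itself works and the issue is in how results gets populated.

import datetime
import pandas as pd
from google.cloud import bigquery

client = bigquery.Client()
table_id = "my-project.my_dataset.my_table"  # placeholder table ID

# One hard-coded row using the same column layout as above
# (note: load_table_from_dataframe needs pyarrow installed)
df = pd.DataFrame(
    [['https://example.com', datetime.date.today(), '2.5s', '16.0ms', '0.05',
      'FAST', 'FAST', 'FAST']],
    columns=['URL', 'DATE', 'LCP', 'FID', 'CLS', 'LCP_SCORE', 'FID_SCORE', 'CLS_SCORE'],
)

job = client.load_table_from_dataframe(df, table_id)
job.result()  # wait for the load job to finish
print(client.get_table(table_id).num_rows)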
Wondering if someone can help with this.
I have a Cloud Function with Python code that queries a BigQuery table and stores the query result in a GCS bucket as a CSV file.
But in the CSV file I get a strange format like:
Row(('asser',), {'user_login': 0})
Row(('godx',), {'user_login': 0})
Row(('johnw',), {'user_login': 0})
Row(('miki',), {'user_login': 0})
But the saved data is expected to look like:
asser,
godx,
johnw,
miki
When I debug in the GCP logging console I am able to get the expected format. It seems I am doing something wrong when processing the query result.
I use this code:
def main(event, context):
    from google.cloud import bigquery
    from google.cloud import storage
    import pandas as pd
    import datetime

    project_name = my_project
    destination_bucket = my_bucket
    bq_dataset_name = my_dataset
    bq_table_name = my_table
    bq_table_full_path = f"""{project_name}.{bq_dataset_name}.{bq_table_name}"""

    bq_client = bigquery.Client()
    query_string = """
        SELECT user_login
        FROM `my_table_full_path`
        WHERE DATE(insert_time) = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
        GROUP BY user_login
    """
    bq_response = bq_client.query(query_string)
    df = pd.DataFrame(bq_response)
    csv_data = df.to_csv(header=False, index=False)

    # create and upload file to Google Storage
    timestr = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%d')
    file_name = 'daily_active_users_' + timestr + '.csv'
    upload_blob(data=csv_data, destination_blob_name=file_name)

def upload_blob(data, destination_blob_name):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(destination_bucket)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_string(data, 'text/csv')
Thanks in advance!
Try using the to_dataframe method of QueryJob to return the DataFrame.
Instead of:
df = pd.DataFrame(bq_response)
Try this:
df = bq_response.to_dataframe()
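Dropped into the function above, the relevant lines would then read roughly as follows (same variable names as in the question):

bq_response = bq_client.query(query_string)
df = bq_response.to_dataframe()  # runs the query, waits for it, and returns a pandas DataFrame
csv_data = df.to_csv(header=False, index=False)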
I want to automatically create BQ table(s) from a desktop folder containing CSV files (i.e. automatically create the schema and load to a new table).
If the same file is loaded next time, just update the existing table; if a new file is loaded, create a new table. Is it possible to automate this using Python?
Current Code:
import pandas as pd
from google.cloud import bigquery
def bqDataLoad(event, context):
    bucketName = event['test_vs']
    blobName = event['gf-dev-models']
    fileName = "gs://" + bucketName + "/" + blobName

    bigqueryClient = bigquery.Client()
    tableRef = bigqueryClient.dataset("gf-dev-models-204097").table("test_vs")
    dataFrame = pd.read_csv(fileName)
    bigqueryJob = bigqueryClient.load_table_from_dataframe(dataFrame, tableRef)
    bigqueryJob.result()
#Project id = gf-dev-models
#dataset = gf-dev-models-204097
#table name = want a new table created
Here is my answer with reference to your question in the comment section:
Credentials in code:
You can create a service account with the desired BigQuery roles and download its JSON key file (example: data-lab.json).
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "data-lab.json"
Create the schema automatically & load data to BigQuery:
from google.cloud import bigquery
bigqueryClient = bigquery.Client()
jobConfig = bigquery.LoadJobConfig()
jobConfig.skip_leading_rows = 1
jobConfig.source_format = bigquery.SourceFormat.CSV
jobConfig.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
jobConfig.autodetect=True
datasetName = "dataset-name"
targetTable = "table-name"
uri = "gs://bucket-name/file-name.csv"
tableRef = bigqueryClient.dataset(datasetName).table(targetTable)
bigqueryJob = bigqueryClient.load_table_from_uri(uri, tableRef, job_config=jobConfig)
bigqueryJob.result()
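To cover the "new file, new table" part of the question, one rough way to extend this (bucket and dataset names are placeholders, and the table name is simply derived from the file name) is to loop over the bucket and reuse the same autodetect job config:

from google.cloud import bigquery, storage

bigqueryClient = bigquery.Client()
storageClient = storage.Client()

datasetName = "dataset-name"   # placeholder
bucketName = "bucket-name"     # placeholder

jobConfig = bigquery.LoadJobConfig()
jobConfig.skip_leading_rows = 1
jobConfig.source_format = bigquery.SourceFormat.CSV
jobConfig.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
jobConfig.autodetect = True

for blob in storageClient.list_blobs(bucketName):
    if not blob.name.endswith(".csv"):
        continue
    # derive a table name from the file name, e.g. "daily-report.csv" -> "daily_report"
    tableName = blob.name[:-4].replace("-", "_").replace("/", "_")
    tableRef = bigqueryClient.dataset(datasetName).table(tableName)
    uri = "gs://{}/{}".format(bucketName, blob.name)
    bigqueryClient.load_table_from_uri(uri, tableRef, job_config=jobConfig).result()

Because the write disposition is WRITE_APPEND, re-loading a file with the same name appends to its existing table, while a file with a new name gets a new autodetected table.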
My problem lies in taking simple-salesforce query results out of Salesforce and transforming them into a DataFrame for a Plotly Dashboard app.
I have tried to authenticate with Salesforce in Python (SUCCESS)
I can request data and receive it with HTTP GET status (SUCCESS)
I have an internal Dashboard App running Plotly Locally (SUCCESS)
I have a general layout for the Dashboard App Locally (SUCCESS)
Transforming Data query in Python into table/dataframe (FAILURE)
import os
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_table
import salesforce_reporting
from simple_salesforce import Salesforce
import requests
import pandas as pd
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
# Salesforce User Session and Fields
# ==================================
from simple_salesforce import SalesforceAPI
sf = SalesforceAPI('<your username>', '<your salesforce password>', '<your salesforce token>')
Fields = ['isMosAlert__c',
'Milestone_violated__c',
'First_Reply__c',
'CaseNumber',
'Environment2__r.Name',
'Owner.Name',
'Severity_Level__c',
'Status',
'Subject',
'URL_for_support__c'
]
Items = str(Fields[1:10]).strip('[]')
print(Items.replace("'", ""))
sf.query("SELECT %s from Case"% ','.join(Fields) )
#fig = go.Figure(data=[go.Table(header=dict(values=['A Scores', 'B Scores']),
# cells=dict(values=[[100, 90, 80, 90], [95, 85, 75, 95]]))
# ])
fig = go.Figure(data=[go.Table(header=dict(values=Fields),
cells=dict(values=[sf.query("SELECT %s from Case"% ','.join(Fields))]))
])
fig.show()
I actually do get a table, but all it shows is the headers, which I have defined from the fields data. Should the Salesforce query be set to a variable? I want my table of Salesforce data to look like the following picture that I took from the Plotly documentation.
The actual information the Salesforce query returns looks like this when run interactively in Python:
(u'isMosAlert__c', True), (u'Milestone_violated__c', False), (u'First_Reply__c', True), (u'CaseNumber', u'1850637'), (u'Environment2__r', OrderedDict([(u'attributes', OrderedDict([(u'type', u'Environment__c'), (u'url', u'/services/data/v27.0/sobjects/Environment__c/a2X440000024ZkzEAE')])), (u'Name', u'MCP500 Production')])), (u'Owner', OrderedDict([(u'attributes', OrderedDict([(u'type', u'Name'), (u'url', u'/services/data/v27.0/sobjects/Group/00GE0000003YOIEMA4')])), (u'Name', u'L1 Support Group')])), (u'Severity_Level__c', None), (u'Status', u'Ignored'), (u'Subject', u'elasticsearch - critical'), (u'URL_for_Support__c', u'https://mirantis.my.salesforce.com/5004400000ykxS9')])])])
So I found out how to do this with simple-salesforce.
from pandas import DataFrame
from simple_salesforce import SalesforceAPI
sf = SalesforceAPI('<insert_username>', '<insert_password>', '<insert_token>')
# Saleforce Secure Connection TBD
# ===============================
# sf_sec = Salesforce(username = username,
# password = password,
# security_token= token,
# instance_url = instance,
# sandbox = isSandbox)
# Function to Return Salesforce SOQL Queries as Dataframes
# =========================================================
def sql_query(query_str):
    qry_result = sf.query(query_str)
    print('Record Count {0}'.format(qry_result['totalSize']))
    is_done = qry_result['done']
    if is_done:
        df = DataFrame(qry_result['records'])
    # page through the remaining batches with query_more until Salesforce reports done
    while not is_done:
        try:
            if not qry_result['done']:
                df = df.append(DataFrame(qry_result['records']))
                qry_result = sf.query_more(qry_result['nextRecordsUrl'], True)
            else:
                df = df.append(DataFrame(qry_result['records']))
                is_done = True
                print('completed')
                break
        except NameError:
            # first batch: df does not exist yet
            df = DataFrame(qry_result['records'])
            sf.query_more(qry_result['nextRecordsUrl'], True)
    df = df.drop('attributes', axis=1)
    return df
soql_test = 'SELECT Milestone_violated__c, First_Reply__c, CaseNumber, Environment2__r.Name,' \
' Owner.Name, Severity_Level__c, Status, Subject, URL_for_support__c from Case'
res = sql_query(soql_test)
print(res)
# SAMPLE SOQL COMMANDS
# ====================
# SOQL('SELECT Id FROM CASE')
The data takes a while to be processed, but the returned data looks like the following, depending on what object API names your Salesforce organization has:
Record Count 307027
completed
Milestone_violated__c ... URL_for_Support__c
0 False ... https://mirantis.my.salesforce.com/500E000000Z...
1 False ... https://mirantis.my.salesforce.com/500E000000Z...
2 False ... https://mirantis.my.salesforce.com/500E000000Z...
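To tie this back to the original Plotly table, the DataFrame's columns can be fed straight into go.Table; a sketch, assuming res is the DataFrame returned by sql_query above:

import plotly.graph_objects as go

fig = go.Figure(data=[go.Table(
    header=dict(values=list(res.columns)),
    cells=dict(values=[res[col] for col in res.columns]),
)])
fig.show()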
Is it possible to import data already in Cloud Storage to a temporary table in bigquery using Python? Can I create a BigQuery temporary table in Python and insert data into it?
You can only create temporary tables as part of a BigQuery script or stored procedure.
What you can do instead is create tables with a random suffix in the name and a short expiry, one hour in my example. The example function creates the temp table and only needs a dataset as a parameter.
from google.cloud import bigquery
import datetime, pytz, random
PROJECT = "myproject"
def get_temp_table(dataset: str, table_name: str = None, project=None) -> bigquery.Table:
    prefix = "temp"
    suffix = random.randint(10000, 99999)
    if not table_name:
        table_name = "noname"
    temp_table_name = f"{dataset}.{prefix}_{table_name}_{suffix}"
    if project:
        temp_table_name = f"{project}.{temp_table_name}"
    tmp_table_def = bigquery.Table(temp_table_name)
    tmp_table_def.expires = datetime.datetime.now(pytz.utc) + datetime.timedelta(
        hours=1
    )
    return tmp_table_def
client = bigquery.Client(project=PROJECT)
tmp_table_def = get_temp_table("mydataset", "new_users", project=PROJECT)
tmp_table_def.schema = [
bigquery.SchemaField("id", "STRING", mode="REQUIRED"),
bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
]
tmp_table = client.create_table(tmp_table_def) # type: bigquery.Table
data = [
{"id": "c-1234", "full_name": "John Smith", "age": 39},
{"id": "c-1234", "full_name": "Patricia Smith", "age": 41},
]
errors = client.insert_rows(tmp_table, data)
print(f"Loaded {len(data)} rows into {tmp_table.dataset_id}:{tmp_table.table_id} with {len(errors)} errors")
(This draft doesn't use a true temporary table, but I think it can help.)
I used this with Google Cloud Functions and Python 3.7 and it works fine.
from google.cloud import storage, bigquery
import json
import os
import csv
import io
import pandas as pd

def upload_dataframe_gbq(df, table_name):
    bq_client = bigquery.Client()
    dataset_id = 'your_dataset_id'
    dataset_ref = bq_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    job = bq_client.load_table_from_dataframe(df, table_ref)
    job.result()  # Waits for table load to complete.
    assert job.state == "DONE"
    table = bq_client.get_table(table_ref)
    print(table.num_rows)

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "your_credentials.json"

client = storage.Client()
bucket = client.get_bucket('your_bucket_name')
blob = bucket.blob('sample.csv')
content = blob.download_as_string()

csv_content = io.BytesIO(content)
df = pd.read_csv(csv_content, sep=",", header=0)

table_name = "your_big_query_table_name"
upload_dataframe_gbq(df, table_name)
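If the CSV needs no pandas transformation at all, a leaner variant (same placeholder names as above) skips the DataFrame and loads straight from the bucket:

from google.cloud import bigquery

bq_client = bigquery.Client()
table_ref = bq_client.dataset('your_dataset_id').table('your_big_query_table_name')

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.autodetect = True

job = bq_client.load_table_from_uri(
    'gs://your_bucket_name/sample.csv', table_ref, job_config=job_config)
job.result()  # wait for the load job to finish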
I am having trouble writing a Python script that loads (or exports) a file from Google Cloud Storage to Google BigQuery.
#standardSQL
import json
import argparse
import time
import uuid
from google.cloud import bigquery
from google.cloud import storage
dataset = 'dataworks-356fa'
source = 'gs://dataworks-356fa-backups/pullnupload.json'
# def load_data_from_gcs(dataset, source):
# # load_data_from_gcs(dataworks-356fa, 'test10', gs://dataworks-356fa-backups/pullnupload.json):
# bigquery_client = bigquery.Client('dataworks-356fa')
# dataset = bigquery_client.dataset(FirebaseArchive)
# table = dataset.table(test10)
# job_name = str(uuid.uuid4())
#
# job = bigquery_client.load_table_from_storage(
# job_name, test10, 'gs://dataworks-356fa-backups/pullnupload.json')
#
# job.source_format = 'NEWLINE_DELIMITED_JSON'
# job.begin()
def load_data_from_gcs(dataset, test10, source):
    bigquery_client = bigquery.Client(dataset)
    dataset = bigquery_client.dataset('FirebaseArchive')
    table = dataset.table(test10)
    job_name = str(uuid.uuid4())

    job = bigquery_client.load_table_from_storage(
        job_name, table, "gs://dataworks-356fa-backups/pullnupload.json")

    job.source_format = 'NEWLINE_DELIMITED_JSON'
    job.begin()
    job.errors
So far this is my code. The file will run, but it does not load anything into BigQuery or come back with an error message. It runs and then returns me to the normal terminal view.
From your previous question, you already have the wait_for_job function. You should call it before printing the errors, like so:
def load_data_from_gcs(dataset, test10, source):
    bigquery_client = bigquery.Client(dataset)
    dataset = bigquery_client.dataset('FirebaseArchive')
    table = dataset.table(test10)
    job_name = str(uuid.uuid4())

    job = bigquery_client.load_table_from_storage(
        job_name, table, "gs://dataworks-356fa-backups/pullnupload.json")

    job.source_format = 'NEWLINE_DELIMITED_JSON'
    job.begin()

    wait_for_job(job)

    print("state of job is: " + job.state)
    print("errors: " + str(job.errors))
You can also use IPython to run each step by hand and observe the result of each line.
Notice that job.state must reach 'DONE' status before looking for errors.
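For reference, wait_for_job is not shown in this thread. With the legacy google-cloud-bigquery client used here (the one that still has job.begin()), it is typically a small polling loop along these lines (a sketch, not necessarily the exact function from the earlier question):

import time

def wait_for_job(job):
    while True:
        job.reload()  # refresh the job's state from the API
        if job.state == 'DONE':
            if job.error_result:
                raise RuntimeError(job.errors)
            return
        time.sleep(1)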