I'm making a Google Cloud Function in Python that requests data from an API, performs an ETL, and finally writes the resulting pandas DataFrame to a BigQuery table.
The deployment succeeds, but when I trigger the function (HTTP trigger) I get this error:
Internal Server Error
The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.
Any idea what I'm doing wrong? Here is a simplified version of my code:
import pandas as pd
from google.cloud import bigquery, error_reporting
from bigquery_tools import update_table
def main(request):
    if request:
        try:
            # BIGQUERY CLIENT
            BIGQUERY_CREDENTIALS = "credentials.json"
            BIGQUERY_PROJECT_ID = "my_project_id"
            BIGQUERY_DATASET_ID = "my_dataset_id"
            TABLE_ID = "my_table"
            CLIENT = bigquery.Client(project=BIGQUERY_PROJECT_ID)

            # SOME DATAFRAME
            df = pd.DataFrame({
                "debug": ["debug_a"]
            })

            # SAVE TO BIGQUERY
            try:
                dataset_ref = CLIENT.dataset(BIGQUERY_DATASET_ID)
                table_ref = dataset_ref.table(TABLE_ID)
                job_config = bigquery.LoadJobConfig()
                job_config.source_format = bigquery.SourceFormat.PARQUET
                job_config.autodetect = True
                job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
                job = CLIENT.load_table_from_dataframe(
                    df,
                    table_ref,
                    job_config=job_config
                )
                job.result()  # Waits for table load to complete.
            except Exception as e:
                pass
        except Exception as e:
            pass
[SOLVED] The problem was that I forgot the return ('some message', http_code) statement at the end of the handler; without a return the function hands back None and the framework answers with a 500. For example: return ('ok', 200). The fixed block:
try:
    dataset_ref = CLIENT.dataset(BIGQUERY_DATASET_ID)
    table_ref = dataset_ref.table(TABLE_ID)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.PARQUET
    job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job = CLIENT.load_table_from_dataframe(
        df,
        table_ref,
        job_config=job_config
    )
    job.result()  # Waits for table load to complete.
    return ("ok", 200)
except Exception as e:
    return ("error", 400)
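For easier debugging, it can also help to log the exception before returning instead of swallowing it with pass as in the original code; a minimal sketch of the final except block (print output ends up in Cloud Logging for Cloud Functions):

except Exception as e:
    # Surface the failure in the logs and in the HTTP response
    print(f"BigQuery load failed: {e}")
    return (f"error: {e}", 500)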
I am getting the below error while uploading a CSV to BigQuery using Python:
google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: Could not parse '80:00:00' as TIME for field global_time_for_first_response_goal (position 36) starting at location 11602908 with message 'Invalid time string "80:00:00"' File: gs://mybucket/mytickets/2023-02-1309:58:11:865588.csv
def upload_csv_bigquery_dataset():
    # logging.info(">>> Uploading CSV to Big Query")
    client = bigquery.Client()
    table_id = "myproject-dev.tickets.ticket"
    job_config = bigquery.LoadJobConfig(
        write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format = bigquery.SourceFormat.CSV,
        schema = [bigquery.table_schema],
        skip_leading_rows = 1,
        autodetect = True,
        allow_quoted_newlines = True
    )
    uri = "gs://mybucket/mytickets/2023-02-1309:58:11:865588.csv"
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.
    load_job.result()  # Waits for the job to complete.
    destination_table = client.get_table(table_id)
    print(">>> Loaded {} rows.".format(destination_table.num_rows))
Can someone please tell me a fix or a workaround? I'm stuck on this.
Solution/Workaround:
The following fixed it for me.
My entire CSV file of 10,000 lines had just one row with an erroneous field. It was difficult to reproduce this or fix it in the code that was generating the CSV, so I did this:
def upload_csv_bigquery_dataset(uniqueDateTime):
    logging.info(">>> Uploading CSV to Big Query")
    client = bigquery.Client()
    table_id = "my-project-dev.tickets.ticket"
    MAX_BAD_RECORDS = 1
    job_config = bigquery.LoadJobConfig(
        write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format = bigquery.SourceFormat.CSV,
        skip_leading_rows = 1,
        max_bad_records = MAX_BAD_RECORDS,  # Skip one or more bad records
        autodetect = True,
        allow_quoted_newlines = True
    )
    uri = "gs://myticket/my/" + uniqueDateTime
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.
    load_job.result()  # Waits for the job to complete.
    destination_table = client.get_table(table_id)
    print(">>> Loaded {} rows.".format(destination_table.num_rows))
Basically, I used the max_bad_records option in job_config to ignore that one error record and write all the rest of the data to BigQuery.
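If you want to see what was skipped rather than dropping it silently, the finished job object can be inspected inside the same function; a small sketch (whether per-record messages are populated can depend on the job outcome and client version):

    load_job.result()  # Waits for the job to complete.

    # The job status may carry messages for records skipped under max_bad_records
    if load_job.errors:
        for err in load_job.errors:
            print(">>> Skipped record:", err.get("message"))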
I'm not able to find any example of creating external tables from Parquet files with schema autodetection. Here is my current code:
bq_client = bigquery.Client.from_service_account_json(key_path)
table_name = "my_table"
table_id = f"{PROJECT_ID}.{DATASET}.{table_name}"
dataset_ref = bq_client.dataset(DATASET)
table_ref = bigquery.TableReference(dataset_ref, table_id)
table_schema = [bigquery.schema.SchemaField("example","STRING")] # I don't want this
table = bigquery.Table(table_ref, table_schema) # I don't want this
external_config = bigquery.ExternalConfig(source_format='PARQUET')
source_uris = [f"gs://path/to/file_name.snappy.parquet"]
external_config.source_uris = source_uris
external_config.autodetect = True
table.external_data_configuration = external_config # Not sure how to do this
bq_client.create_table(table) # and this without table schema
logger.debug("Created table '{}'.".format(table_id))
Currently I have to specify the table schema. I want to autodetect the schema instead. Kindly help. Thank you.
Check out the documentation https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet#loading_parquet_data_into_a_new_table
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set table_id to the ID of the table to create.
# table_id = "your-project.your_dataset.your_table_name"
job_config = bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.PARQUET,)
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"
load_job = client.load_table_from_uri(
uri, table_id, job_config=job_config
) # Make an API request.
load_job.result() # Waits for the job to complete.
destination_table = client.get_table(table_id)
print("Loaded {} rows.".format(destination_table.num_rows))
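If you specifically need an external (federated) table rather than loading the data, note that Parquet is self-describing, so to my understanding the schema can simply be omitted and BigQuery reads it from the files; a minimal sketch reusing key_path, PROJECT_ID, and DATASET from the question (the table name and URI are placeholders):

from google.cloud import bigquery

bq_client = bigquery.Client.from_service_account_json(key_path)
table_id = f"{PROJECT_ID}.{DATASET}.my_table"

# External table definition pointing at the Parquet file(s)
external_config = bigquery.ExternalConfig("PARQUET")
external_config.source_uris = ["gs://path/to/file_name.snappy.parquet"]

# No explicit schema: for Parquet it is taken from the files themselves
table = bigquery.Table(table_id)
table.external_data_configuration = external_config

bq_client.create_table(table)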
I want to set require_partition_filter to True on a BigQuery table, but I can only access a TableReference, not a Table. How can I do this?
UPDATE
Maybe I did not express my question clearly. I need to do this from a Python program; I don't want to set the option with commands or SQL because there are too many tables. In the program I can generate a TableReference table_ref as shown below, but how do I set require_partition_filter on it?
def table(client, dataset_name, table_name):
    dataset = client.dataset(dataset_name)
    table_ref = dataset.table(table_name)
    return table_ref

job = client.load_table_from_uri(
    glob,       # Google Cloud Storage bucket
    table_ref,  # returned by the table() function above
    job_id='123',
    job_config=config,  # originally I set `require_partition_filter` here, but that is deprecated
)
How to do this?
As mentioned in this answer, you can use an ALTER TABLE statement to alter your table as follows:
#standardSQL
ALTER TABLE IF EXISTS mydataset.newtable
SET OPTIONS(
require_partition_filter = false
)
You can change require_partition_filter back to true if needed using the same statement.
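Since the update mentions doing this programmatically over many tables, the same DDL can also be issued from Python through the client library; a rough sketch (the table names are placeholders):

from google.cloud import bigquery

client = bigquery.Client()

tables_to_update = ["mydataset.newtable", "mydataset.other_table"]
for table_id in tables_to_update:
    ddl = (
        f"ALTER TABLE IF EXISTS {table_id} "
        "SET OPTIONS (require_partition_filter = true)"
    )
    client.query(ddl).result()  # Waits for each DDL statement to finish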
Note that the preferred way would be to get a bigquery.Table object from the TableReference, change require_partition_filter, and patch the table (there is a similar example for the expiration date); a sketch of this is shown after the full code below. However, as you mention that you can only access a TableReference, you can also set TimePartitioning.require_partition_filter (deprecated according to the docs) when creating the table with a load job. For example:
job_config = bigquery.LoadJobConfig(
    schema = [
        bigquery.SchemaField("foo", "STRING"),
    ],
    time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        require_partition_filter = True
    )
)
and it works as expected:
$ bq query "SELECT * FROM test.require_filter"
Waiting on JOB_ID ... (0s) Current status: DONE
Error in query string: Error processing job 'JOB_ID': Cannot query over table 'test.require_filter' without a filter over column(s) '_PARTITION_LOAD_TIME',
'_PARTITIONDATE', '_PARTITIONTIME' that can be used for partition elimination
Full code:
import pandas as pd
from google.cloud import bigquery
PROJECT = "PROJECT_ID"
DATASET = "test"
TABLE = "require_filter"
def table(client, dataset_name, table_name):
    dataset = client.dataset(dataset_name)
    table_ref = dataset.table(table_name)
    return table_ref

client = bigquery.Client(project=PROJECT)

job_config = bigquery.LoadJobConfig(
    schema = [
        bigquery.SchemaField("foo", "STRING"),
    ],
    time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        require_partition_filter = True
    )
)

data = {"foo": ["bar"]}
df = pd.DataFrame(data)
table_ref = table(client, DATASET, TABLE)

load_job = client.load_table_from_dataframe(
    df,
    table_ref,
    job_config = job_config
)

result = load_job.result()
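For completeness, a minimal sketch of the preferred patch-based approach mentioned above, assuming the table already exists so a full Table object can be fetched from the reference:

table = client.get_table(table_ref)  # TableReference -> Table
table.require_partition_filter = True
client.update_table(table, ["require_partition_filter"])  # Patch only this property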
I have a Python client that executes a BigQuery job to run a query and write the query result into a destination BigQuery table. How can I determine whether the query result contains zero records for a given execution?
Python code:
def main(request):
    query = "select * from `myproject.mydataset.mytable`"
    client = bigquery.Client()
    job_config = bigquery.QueryJobConfig()
    dest_dataset = client.dataset(destination_dataset, destination_project)
    dest_table = dest_dataset.table(destination_table)
    job_config.destination = dest_table
    job_config.create_disposition = 'CREATE_IF_NEEDED'
    job_config.write_disposition = 'WRITE_APPEND'
    job = client.query(query, location='US', job_config=job_config)
    job.result()
If the query result has no records, it should print a message for me. Can anybody suggest how to get this done?
QueryJob.result() returns a RowIterator, which has a property called total_rows.
So, something like:
result = job.result()

if result.total_rows == 0:
    print('no results')
Documentation: RowIterator.total_rows
Updated per @Dom Zippilli's comment: total_rows is what you're looking for.
I have a Python script that runs a BigQuery load job to load a CSV file into a table. I am trying to upload the data in CSV format and I get the following error:
400 Invalid schema update. Cannot add fields (field: string_field_8)
This is my CSV:
id,first_name,username,last_name,chat_username,chat_id,forward_date,message_text
231125223|Just|koso|swissborg_bounty|-1001368946079|1517903147|tes
481895079|Emerson|EmersonEmory|swissborg_bounty|-1001368946079|1517904387|pictu
316560356|Ken Sam|ICOnomix|swissborg_bounty|-1001368946079|1517904515|Today
This is my code:
from google.cloud.bigquery import Client
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '***.json'
os.environ['GOOGLE_CLOUD_DISABLE_GRPC'] = 'True'
from google.cloud import bigquery
dataset_name = 'test_temporary_dataset'
table_name='table_telega'
bigquery_client = bigquery.Client()
dataset = bigquery_client.dataset(dataset_name)
table = dataset.table(table_name)
job_config = bigquery.LoadJobConfig()
job_config.source_format = 'text/csv'
job_config.skip_leading_rows = 1
job_config.autodetect = True
job_config.fieldDelimiter='|'
job_config.allow_jagged_rows=True
job_config.ignoreUnknownValues=True
job_config.allow_quoted_newlines=True
with open('**.csv', 'rb') as source_file:
    # job = table.upload_from_file(source_file, source_format='text/csv')
    job = bigquery_client.load_table_from_file(source_file, table, job_config=job_config)

job.result()
print(job.result())
How can I fix it? What should I change?
Just add this line to your code:
job_config._properties['load']['schemaUpdateOptions'] = ['ALLOW_FIELD_ADDITION']
This will allow column additions to your existing schema.
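Note that _properties is a private attribute; newer versions of the client library expose the same setting publicly, so the following should be equivalent (worth checking against the version you have installed):

job_config.schema_update_options = [
    bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
]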
Alternatively, instead of job_config.autodetect = True, set it to False.
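With autodetect off, the load falls back to the destination table's existing schema, or to one you supply explicitly. A sketch of an explicit schema based on the CSV header shown above; the column types are my assumptions, not something stated in the question:

job_config.autodetect = False
job_config.schema = [
    bigquery.SchemaField("id", "INTEGER"),
    bigquery.SchemaField("first_name", "STRING"),
    bigquery.SchemaField("username", "STRING"),
    bigquery.SchemaField("last_name", "STRING"),
    bigquery.SchemaField("chat_username", "STRING"),
    bigquery.SchemaField("chat_id", "INTEGER"),
    bigquery.SchemaField("forward_date", "INTEGER"),  # epoch seconds in the sample rows
    bigquery.SchemaField("message_text", "STRING"),
]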