BigQuery create external tables from Parquet files with autodetect schema using Python - python

I'm not able to find any example of creating external tables from Parquet files with schema autodetection. Here is my current code:
bq_client = bigquery.Client.from_service_account_json(key_path)
table_name = "my_table"
table_id = f"{PROJECT_ID}.{DATASET}.{table_name}"
dataset_ref = bq_client.dataset(DATASET)
table_ref = bigquery.TableReference(dataset_ref, table_id)
table_schema = [bigquery.schema.SchemaField("example","STRING")] # I don't want this
table = bigquery.Table(table_ref, table_schema) # I don't want this
external_config = bigquery.ExternalConfig(source_format='PARQUET')
source_uris = [f"gs://path/to/file_name.snappy.parquet"]
external_config.source_uris = source_uris
external_config.autodetect = True
table.external_data_configuration = external_config # Not sure how to do this
bq_client.create_table(table) # and this without table schema
logger.debug("Created table '{}'.".format(table_id))
Currently I have to specify the table schema. I want to autodetect the schema instead. Kindly help. Thank you.

Check out the documentation: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet#loading_parquet_data_into_a_new_table. When you load Parquet files into a native table, BigQuery retrieves the schema from the self-describing source files, so you don't need to specify one:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set table_id to the ID of the table to create.
# table_id = "your-project.your_dataset.your_table_name"
job_config = bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.PARQUET)
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"
load_job = client.load_table_from_uri(
    uri, table_id, job_config=job_config
)  # Make an API request.
load_job.result()  # Waits for the job to complete.
destination_table = client.get_table(table_id)
print("Loaded {} rows.".format(destination_table.num_rows))

Related

How can I fix this error while uploading csv to bigquery?

I am getting the below error while uploading a CSV to bigquery using Python:
google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: Could not parse '80:00:00' as TIME for field global_time_for_first_response_goal (position 36) starting at location 11602908 with message 'Invalid time string "80:00:00"' File: gs://mybucket/mytickets/2023-02-1309:58:11:865588.csv
def upload_csv_bigquery_dataset():
    # logging.info(">>> Uploading CSV to Big Query")
    client = bigquery.Client()
    table_id = "myproject-dev.tickets.ticket"
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format=bigquery.SourceFormat.CSV,
        schema=[bigquery.table_schema],
        skip_leading_rows=1,
        autodetect=True,
        allow_quoted_newlines=True
    )
    uri = "gs://mybucket/mytickets/2023-02-1309:58:11:865588.csv"
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.
    load_job.result()  # Waits for the job to complete.
    destination_table = client.get_table(table_id)
    print(">>> Loaded {} rows.".format(destination_table.num_rows))
Can someone please suggest a fix or a workaround? I'm stuck on this.
Solution/Workaround:
The following fixed it for me.
My entire CSV file of 10,000 lines had just one row with an erroneous field, and it was difficult to reproduce or fix this in the code that generates the CSV. So I did this:
def upload_csv_bigquery_dataset(uniqueDateTime):
    logging.info(">>> Uploading CSV to Big Query")
    client = bigquery.Client()
    table_id = "my-project-dev.tickets.ticket"
    MAX_BAD_RECORDS = 1
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        max_bad_records=MAX_BAD_RECORDS,  # Skip one or more bad records
        autodetect=True,
        allow_quoted_newlines=True
    )
    uri = "gs://myticket/my/" + uniqueDateTime
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.
    load_job.result()  # Waits for the job to complete.
    destination_table = client.get_table(table_id)
    print(">>> Loaded {} rows.".format(destination_table.num_rows))
Basically, I used the max_bad_records option in my job_config to skip that erroneous record and load the rest of my data into BigQuery.
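As a side note (not part of the original workaround), the load job object reports row-level problems after completion, so you can log what was skipped under max_bad_records. A minimal sketch of what could follow load_job.result() inside the function above:
    load_job.result()  # Waits for the job to complete.

    # Non-fatal, skipped-row errors are usually surfaced in load_job.errors
    # even when the job itself succeeds within max_bad_records.
    if load_job.errors:
        for error in load_job.errors:
            print(">>> Skipped record: {}".format(error.get("message")))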

Big Query: Create table with time partitioning and clustering fields using Python

I can successfully create a Big Query table in Python as:
from google.cloud import bigquery
bq_client = bigquery.Client()
table_name = "my_test_table"
dataset = bq_client.dataset("MY_TEST_DATASET")
table_ref = dataset.table(table_name)
table = bigquery.Table(table_ref)
table = bq_client.create_table(table)
And later I upload a local Pandas DataFrame as:
# --- Define BQ options ---
job_config = bigquery.LoadJobConfig()
job_config.write_disposition = "WRITE_APPEND"
job_config.source_format = bigquery.SourceFormat.CSV
# --- Load data ---
job = bq_client.load_table_from_dataframe(
df, f"MY_TEST_DATASET.{table_name}", job_config=job_config
)
How can I specify, while creating the table and using Python:
Partition by daily ingestion time
As clustering fields ["business_id", "software_house", "product_id"]
You can use the following Python script to create your BigQuery table with partitioning and clustering:
def create_table():
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()
    table_id = "your_project.your_dataset.table_test"
    schema = [
        bigquery.SchemaField("business_id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("software_house", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("product_id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("other_field", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("ingestion_time", "TIMESTAMP", mode="NULLABLE"),
    ]
    table = bigquery.Table(table_id, schema=schema)

    # Clustering.
    table.clustering_fields = ["business_id", "software_house", "product_id"]

    # Partitioning.
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="ingestion_time",  # name of column to use for partitioning
        expiration_ms=7776000000  # 90 days
    )

    table = client.create_table(table)
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )

if __name__ == '__main__':
    create_table()
In this case:
Partitioning was added on the ingestion_time field, per DAY
Clustering was added on the ["business_id", "software_house", "product_id"] fields
These docs show how to add partitioning and clustering on fields:
BQ table partitioning
BQ table clustering
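Note that the script above partitions on a TIMESTAMP column (ingestion_time). If you instead want partitioning by the ingestion time that BigQuery records automatically, as asked in the question, you can omit the field argument. A minimal sketch, reusing the same placeholder table_id and schema as above:
# No "field" argument: the table is partitioned on the automatic
# ingestion-time pseudo-column (_PARTITIONTIME) instead of a data column.
table = bigquery.Table(table_id, schema=schema)
table.clustering_fields = ["business_id", "software_house", "product_id"]
table.time_partitioning = bigquery.TimePartitioning(
    type_=bigquery.TimePartitioningType.DAY
)
table = client.create_table(table)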
The result in BigQuery: (screenshot of the created table omitted)

BigQuery: How to set `require_partition_filter` by TableReference

I want to set require_partition_filter to True on a BigQuery table, but I can only access a TableReference instead of a Table. How can I do this?
UPDATE
Maybe I did not express my question clearly. I need to write a Python program to do this; I would not like to set the configuration with commands or SQL, because there are too many tables. In this program I can generate a TableReference table_ref as shown below. But how do I set require_partition_filter on table_ref?
def table(client, dataset_name, table_name):
    dataset = client.dataset(dataset_name)
    table_ref = dataset.table(table_name)
    return table_ref

job = client.load_table_from_uri(
    glob,       # google cloud storage bucket
    table_ref,  # returned by table() function above
    job_id='123',
    job_config=config,  # at first I set `require_partition_filter` here, but this is deprecated
)
How to do this?
As mentioned in this answer, you can use an ALTER TABLE statement to alter your table as follows:
#standardSQL
ALTER TABLE IF EXISTS mydataset.newtable
SET OPTIONS(
  require_partition_filter = false
)
You can change require_partition_filter back to true if needed using the same command.
Note that the preferred way would be to get a bigquery.Table object from the TableReference, change require_partition_filter and patch the table (see the sketch at the end of this answer; the docs have a similar example with an expiration date). However, as you mention that you can only access a TableReference, you can also set TimePartitioning.require_partition_filter (deprecated according to the docs) when creating the table with a load job. For example:
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("foo", "STRING"),
    ],
    time_partitioning=bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        require_partition_filter=True
    )
)
and it works as expected:
$ bq query "SELECT * FROM test.require_filter"
Waiting on JOB_ID ... (0s) Current status: DONE
Error in query string: Error processing job 'JOB_ID': Cannot query over table 'test.require_filter' without a filter over column(s) '_PARTITION_LOAD_TIME',
'_PARTITIONDATE', '_PARTITIONTIME' that can be used for partition elimination
Full code:
import pandas as pd
from google.cloud import bigquery

PROJECT = "PROJECT_ID"
DATASET = "test"
TABLE = "require_filter"

def table(client, dataset_name, table_name):
    dataset = client.dataset(dataset_name)
    table_ref = dataset.table(table_name)
    return table_ref

client = bigquery.Client(project=PROJECT)

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("foo", "STRING"),
    ],
    time_partitioning=bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        require_partition_filter=True
    )
)

data = {"foo": ["bar"]}
df = pd.DataFrame(data)

table_ref = table(client, DATASET, TABLE)
load_job = client.load_table_from_dataframe(
    df,
    table_ref,
    job_config=job_config
)

result = load_job.result()
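For reference, the "preferred way" mentioned above (resolving the TableReference into a Table and patching it) would look roughly like this. It is only a sketch and assumes the table already exists:
table = client.get_table(table_ref)  # TableReference -> Table (API request)
table.require_partition_filter = True
client.update_table(table, ["require_partition_filter"])  # patch only this property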

Cannot add fields when importing CSV to a table in BigQuery

I have a Python script that executes a BigQuery load job to load a CSV file into a table in BigQuery. I am trying to upload the data in CSV format and am getting the following error:
400 Invalid schema update. Cannot add fields (field: string_field_8)
this is my csv:
id,first_name,username,last_name,chat_username,chat_id,forward_date,message_text
231125223|Just|koso|swissborg_bounty|-1001368946079|1517903147|tes
481895079|Emerson|EmersonEmory|swissborg_bounty|-1001368946079|1517904387|pictu
316560356|Ken Sam|ICOnomix|swissborg_bounty|-1001368946079|1517904515|Today
this is my code:
from google.cloud.bigquery import Client
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '***.json'
os.environ['GOOGLE_CLOUD_DISABLE_GRPC'] = 'True'

from google.cloud import bigquery

dataset_name = 'test_temporary_dataset'
table_name = 'table_telega'

bigquery_client = bigquery.Client()
dataset = bigquery_client.dataset(dataset_name)
table = dataset.table(table_name)

job_config = bigquery.LoadJobConfig()
job_config.source_format = 'text/csv'
job_config.skip_leading_rows = 1
job_config.autodetect = True
job_config.fieldDelimiter = '|'
job_config.allow_jagged_rows = True
job_config.ignoreUnknownValues = True
job_config.allow_quoted_newlines = True

with open('**.csv', 'rb') as source_file:
    # job = table.upload_from_file(source_file, source_format='text/csv')
    job = bigquery_client.load_table_from_file(source_file, table, job_config=job_config)

job.result()
print(job.result())
How do I fix it? What should I change?
Just add this line to your code:
job_config._properties['load']['schemaUpdateOptions'] = ['ALLOW_FIELD_ADDITION']
This will allow column additions to your existing schema.
Alternatively, instead of job_config.autodetect = True, set it to False.
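As a side note, newer versions of the google-cloud-bigquery library expose schemaUpdateOptions as a public attribute, so poking _properties directly shouldn't be necessary. A minimal sketch:
job_config.schema_update_options = [
    bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
]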

create a table and query the value of this table using Python API in Bigquery

I want to create a new table and populate it with the result of a query.
from google.cloud import bigquery
bigquery_client = bigquery.Client(project="myproject")
dataset = bigquery_client.dataset("mydataset")
table_ref = dataset.table('New table')
table = bigquery.Table(table_ref)
table = bigquery_client.create_table(table)
So, I created an empty table
And now I wish to have some values for this table from another table that's called "Old Table":
query = "SELECT * FROM `{Old table}`"
How can I make sure that my table is linked to this query?
I tried
table.view_query = "SELECT * FROM `{Old table}`"
but it didn't work
Thanks,
I think you should use something like this:
bigquery_client = bigquery.Client(project="myproject")
dataset = bigquery_client.dataset("mydataset")
table_ref = dataset.table('New_table')
sql_query = "SELECT * FROM `{project}.{dataset}.{table}`"
job_config = bigquery.QueryJobConfig()
# Set configuration.query.destinationTable
job_config.destination = table_ref
# Set configuration.query.createDisposition
job_config.create_disposition = 'CREATE_IF_NEEDED'
# Set configuration.query.writeDisposition
job_config.write_disposition = 'WRITE_APPEND'
# Start the query
job = bigquery_client.query(sql_query, job_config=job_config)
# Wait for the query to finish
job.result()
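Alternatively (not in the original answer), you could achieve the same thing with a single CREATE TABLE ... AS SELECT statement, letting BigQuery create and fill the destination table in one query; the table names below are placeholders:
sql = """
CREATE TABLE IF NOT EXISTS `myproject.mydataset.New_table` AS
SELECT * FROM `myproject.mydataset.Old_table`
"""
job = bigquery_client.query(sql)
job.result()  # Wait for the DDL statement to finish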
